From 28f97191a0c7652e3c253c43071d8a81f3ddb7a7 Mon Sep 17 00:00:00 2001
From: Ning Sun <sunng@protonmail.com>
Date: Thu, 12 Mar 2026 10:15:25 +0800
Subject: [PATCH 01/42] fix: make pipeline table ttl forever (#7795)

* fix: make pipeline table ttl forever

* chore: use constants when possible
---
 src/pipeline/src/manager/pipeline_operator.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/pipeline/src/manager/pipeline_operator.rs b/src/pipeline/src/manager/pipeline_operator.rs
index 77ef8ade23..6c4256db69 100644
--- a/src/pipeline/src/manager/pipeline_operator.rs
+++ b/src/pipeline/src/manager/pipeline_operator.rs
@@ -20,6 +20,7 @@ use api::v1::CreateTableExpr;
 use catalog::{CatalogManagerRef, RegisterSystemTableRequest};
 use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine};
 use common_telemetry::info;
+use common_time::FOREVER;
 use datatypes::timestamp::TimestampNanosecond;
 use futures::FutureExt;
 use operator::insert::InserterRef;
@@ -28,6 +29,7 @@ use query::QueryEngineRef;
 use session::context::QueryContextRef;
 use snafu::{OptionExt, ResultExt};
 use table::TableRef;
+use table::requests::TTL_KEY;
 
 use crate::Pipeline;
 use crate::error::{CatalogSnafu, CreateTableSnafu, PipelineTableNotFoundSnafu, Result};
@@ -59,6 +61,9 @@ impl PipelineOperator {
     fn create_table_request(&self, catalog: &str) -> RegisterSystemTableRequest {
         let (time_index, primary_keys, column_defs) = PipelineTable::build_pipeline_schema();
 
+        let mut table_options = HashMap::new();
+        table_options.insert(TTL_KEY.to_string(), FOREVER.to_string());
+
         let create_table_expr = CreateTableExpr {
             catalog_name: catalog.to_string(),
             schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(),
@@ -68,7 +73,7 @@ impl PipelineOperator {
             time_index,
             primary_keys,
             create_if_not_exists: true,
-            table_options: Default::default(),
+            table_options,
             table_id: None, // Should and will be assigned by Meta.
             engine: default_engine().to_string(),
         };

From 78661329208abf82ad731eecdcb495e62014ad5c Mon Sep 17 00:00:00 2001
From: Chengjie Jin <48244341+YZL0v3ZZ@users.noreply.github.com>
Date: Thu, 12 Mar 2026 17:33:55 +0800
Subject: [PATCH 02/42] feat(procedure): detect potential deadlock when
 parent/child procedures share lock keys (#7752)

* feat(procedure): detect potential deadlock when parent/child share lock keys

Add a deadlock pre-check in Runner::on_suspended(), run before any
subprocedure is submitted, that detects when a child procedure's
lock_key conflicts with its parent's lock_key.

When this happens, the parent holds the lock while waiting for the child
to complete (at child_notify.notified().await), but the child blocks
forever trying to acquire the same lock. This is a classic Hold-and-Wait
deadlock.

The detection:
- Logs the conflicting lock keys with error! in all builds (visible in production)
- Fails the parent without submitting any child: the state becomes
  PrepareRollback when the parent supports rollback, Failed otherwise

This partially addresses the TODO at lines 121-122 and is a follow-up
to the discussion in: https://github.com/GreptimeTeam/greptimedb/issues/7692

Signed-off-by: YZL0v3ZZ <2055877225@qq.com>

* style: fix trailing whitespace

Signed-off-by: YZL0v3ZZ <2055877225@qq.com>

* refactor(procedure): extract deadlock detection into a testable pure function

Signed-off-by: YZL0v3ZZ <2055877225@qq.com>

* fix(procedure): preserve lock mode when detecting parent/child deadlock

Signed-off-by: YZL0v3ZZ <2055877225@qq.com>

* re-run ci check

Signed-off-by: YZL0v3ZZ <2055877225@qq.com>

---------

Signed-off-by: YZL0v3ZZ <2055877225@qq.com>
---
 src/common/procedure/src/local/runner.rs | 241 +++++++++++++++++++++++
 1 file changed, 241 insertions(+)

diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs
index 454afb95b3..46dcef11d4 100644
--- a/src/common/procedure/src/local/runner.rs
+++ b/src/common/procedure/src/local/runner.rs
@@ -17,6 +17,8 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use backon::{BackoffBuilder, ExponentialBuilder};
+use common_error::ext::PlainError;
+use common_error::status_code::StatusCode;
 use common_event_recorder::EventRecorderRef;
 use common_telemetry::tracing_context::{FutureExt, TracingContext};
 use common_telemetry::{debug, error, info, tracing};
@@ -90,6 +92,45 @@ impl Drop for ProcedureGuard {
     }
 }
 
+/// Returns a list of conflicting lock keys between a parent and a child procedure.
+/// Evaluates the Read/Write lock compatibility matrix:
+/// - Share + Share => Compatible
+/// - Exclusive + Any => Conflict
+/// - Any + Exclusive => Conflict
+fn find_lock_conflicts<'a>(
+    parent_keys: impl Iterator<Item = &'a StringKey>,
+    child_keys: impl Iterator<Item = &'a StringKey>,
+) -> Vec<String> {
+    use std::collections::HashMap;
+
+    // Map from key string slice (&str) to a boolean indicating if the parent holds it EXCLUSIVELY.
+    let mut parent_map = HashMap::new();
+    for key in parent_keys {
+        match key {
+            StringKey::Exclusive(k) => {
+                parent_map.insert(k.as_str(), true);
+            }
+            StringKey::Share(k) => {
+                parent_map.entry(k.as_str()).or_insert(false);
+            }
+        }
+    }
+
+    child_keys
+        .filter_map(|child_key| match child_key {
+            StringKey::Exclusive(k) | StringKey::Share(k)
+                if parent_map.get(k.as_str()) == Some(&true) =>
+            {
+                Some(k.clone())
+            }
+            StringKey::Exclusive(k) if parent_map.get(k.as_str()) == Some(&false) => {
+                Some(k.clone())
+            }
+            _ => None,
+        })
+        .collect()
+}
+
 pub(crate) struct Runner {
     pub(crate) meta: ProcedureMetaRef,
     pub(crate) procedure: BoxedProcedure,
@@ -512,6 +553,41 @@ impl Runner {
 
     async fn on_suspended(&mut self, subprocedures: Vec<ProcedureWithId>) {
         let has_child = !subprocedures.is_empty();
+
+        // Pre-check: detect potential deadlocks BEFORE submitting any subprocedure.
+        // If a child shares conflicting lock keys with the parent, submitting it would
+        // cause a Hold-and-Wait deadlock — the child blocks on lock acquisition while
+        // the parent holds the lock and waits for the child to finish.
+        for sub in &subprocedures {
+            let conflicting = find_lock_conflicts(
+                self.meta.lock_key.keys_to_lock(),
+                sub.procedure.lock_key().keys_to_lock(),
+            );
+            if !conflicting.is_empty() {
+                let err_msg = format!(
+                    "Deadlock prevented: subprocedure {}-{} shares conflicting lock key(s) {:?} \
+                     with parent {}-{}. Parent holds these locks and would wait for child \
+                     completion, but child cannot acquire them.",
+                    sub.procedure.type_name(),
+                    sub.id,
+                    conflicting,
+                    self.procedure.type_name(),
+                    self.meta.id,
+                );
+                error!("{}", err_msg);
+                let err = Arc::new(Error::external(PlainError::new(
+                    err_msg,
+                    StatusCode::Internal,
+                )));
+                if self.procedure.rollback_supported() {
+                    self.meta.set_state(ProcedureState::prepare_rollback(err));
+                } else {
+                    self.meta.set_state(ProcedureState::failed(err));
+                }
+                return;
+            }
+        }
+
         for subprocedure in subprocedures {
             info!(
                 "Procedure {}-{} submit subprocedure {}-{}",
@@ -1939,4 +2015,169 @@ mod tests {
         join_all(tasks).await;
         assert_eq!(shared_atomic_value.load(Ordering::Relaxed), 2);
     }
+    #[tokio::test]
+    async fn test_on_suspend_deadlock_detected_no_rollback() {
+        // Parent holds Exclusive("catalog.schema.table"), child also requests Exclusive("catalog.schema.table").
+        // Since parent does NOT support rollback, state should become Failed.
+        let child_id = ProcedureId::random();
+        let exec_fn = move |_| {
+            async move {
+                let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
+                let child = ProcedureAdapter {
+                    data: "child".to_string(),
+                    lock_key: LockKey::single_exclusive("catalog.schema.table"),
+                    poison_keys: PoisonKeys::default(),
+                    exec_fn: child_exec_fn,
+                    rollback_fn: None,
+                };
+                Ok(Status::Suspended {
+                    subprocedures: vec![ProcedureWithId {
+                        id: child_id,
+                        procedure: Box::new(child),
+                    }],
+                    persist: false,
+                })
+            }
+            .boxed()
+        };
+        let parent = ProcedureAdapter {
+            data: "parent".to_string(),
+            lock_key: LockKey::single_exclusive("catalog.schema.table"),
+            poison_keys: PoisonKeys::default(),
+            exec_fn,
+            rollback_fn: None, // No rollback support
+        };
+
+        let dir = create_temp_dir("deadlock_no_rollback");
+        let meta = parent.new_meta(ROOT_ID);
+        let ctx = context_without_provider(meta.id);
+        let object_store = test_util::new_object_store(&dir);
+        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
+        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
+        runner.manager_ctx.start();
+
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(state.is_failed(), "Expected Failed, got {state:?}");
+        // Verify the error exists
+        assert!(
+            state.error().is_some(),
+            "Failed state should contain an error"
+        );
+        // Child should NOT have been submitted
+        assert!(
+            !runner.manager_ctx.contains_procedure(child_id),
+            "Child procedure should not be submitted when deadlock is detected"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_on_suspend_deadlock_detected_with_rollback() {
+        // Parent holds Exclusive("catalog.schema.table"), child also requests Exclusive("catalog.schema.table").
+        // Since parent DOES support rollback, state should become PrepareRollback.
+        let child_id = ProcedureId::random();
+        let exec_fn = move |_| {
+            async move {
+                let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
+                let child = ProcedureAdapter {
+                    data: "child".to_string(),
+                    lock_key: LockKey::single_exclusive("catalog.schema.table"),
+                    poison_keys: PoisonKeys::default(),
+                    exec_fn: child_exec_fn,
+                    rollback_fn: None,
+                };
+                Ok(Status::Suspended {
+                    subprocedures: vec![ProcedureWithId {
+                        id: child_id,
+                        procedure: Box::new(child),
+                    }],
+                    persist: false,
+                })
+            }
+            .boxed()
+        };
+        let rollback_fn = move |_| async move { Ok(()) }.boxed();
+        let parent = ProcedureAdapter {
+            data: "parent".to_string(),
+            lock_key: LockKey::single_exclusive("catalog.schema.table"),
+            poison_keys: PoisonKeys::default(),
+            exec_fn,
+            rollback_fn: Some(Box::new(rollback_fn)), // Supports rollback
+        };
+
+        let dir = create_temp_dir("deadlock_with_rollback");
+        let meta = parent.new_meta(ROOT_ID);
+        let ctx = context_without_provider(meta.id);
+        let object_store = test_util::new_object_store(&dir);
+        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
+        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
+        runner.manager_ctx.start();
+
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(
+            state.is_prepare_rollback(),
+            "Expected PrepareRollback, got {state:?}"
+        );
+        // Verify the error exists in PrepareRollback variant
+        match &state {
+            ProcedureState::PrepareRollback { error } => {
+                assert!(!error.to_string().is_empty(), "Error should not be empty");
+            }
+            _ => panic!("Expected PrepareRollback, got {state:?}"),
+        }
+        // Child should NOT have been submitted
+        assert!(
+            !runner.manager_ctx.contains_procedure(child_id),
+            "Child procedure should not be submitted when deadlock is detected"
+        );
+    }
+
+    #[test]
+    fn test_find_lock_conflicts() {
+        use crate::procedure::StringKey;
+
+        // 1. Share + Share = No conflict (Compatible)
+        let parent = [StringKey::Share("A".to_string())];
+        let child = [StringKey::Share("A".to_string())];
+        assert!(super::find_lock_conflicts(parent.iter(), child.iter()).is_empty());
+
+        // 2. Share + Exclusive = Conflict
+        let parent = [StringKey::Share("A".to_string())];
+        let child = [StringKey::Exclusive("A".to_string())];
+        assert_eq!(
+            super::find_lock_conflicts(parent.iter(), child.iter()),
+            vec!["A".to_string()]
+        );
+
+        // 3. Exclusive + Share = Conflict
+        let parent = [StringKey::Exclusive("A".to_string())];
+        let child = [StringKey::Share("A".to_string())];
+        assert_eq!(
+            super::find_lock_conflicts(parent.iter(), child.iter()),
+            vec!["A".to_string()]
+        );
+
+        // 4. Exclusive + Exclusive = Conflict
+        let parent = [StringKey::Exclusive("A".to_string())];
+        let child = [StringKey::Exclusive("A".to_string())];
+        assert_eq!(
+            super::find_lock_conflicts(parent.iter(), child.iter()),
+            vec!["A".to_string()]
+        );
+
+        // 5. Multiple keys, partial overlap
+        let parent = [
+            StringKey::Share("A".to_string()),
+            StringKey::Exclusive("B".to_string()),
+        ];
+        let child = [
+            StringKey::Exclusive("A".to_string()), // Conflict with Share("A")
+            StringKey::Share("B".to_string()),     // Conflict with Exclusive("B")
+            StringKey::Exclusive("C".to_string()), // No conflict, parent doesn't hold C
+        ];
+        let mut conflicts = super::find_lock_conflicts(parent.iter(), child.iter());
+        conflicts.sort();
+        assert_eq!(conflicts, vec!["A".to_string(), "B".to_string()]);
+    }
 }

From 3beb538aa8a505ec968d26b789db691f83b73680 Mon Sep 17 00:00:00 2001
From: discord9 <55937128+discord9@users.noreply.github.com>
Date: Thu, 12 Mar 2026 18:53:47 +0800
Subject: [PATCH 03/42] fix: rm useless analyzer (#7797)

* fix: rm useless analyzer

Signed-off-by: discord9 <discord9@163.com>

* test: rm related test

Signed-off-by: discord9 <discord9@163.com>

* test: flow tql avg

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
---
 src/flow/src/df_optimizer.rs                  | 402 +-------
 src/flow/src/transform/aggr.rs                | 895 +-----------------
 .../common/flow/flow_tql_avg.result           | 126 +++
 .../standalone/common/flow/flow_tql_avg.sql   |  63 ++
 4 files changed, 192 insertions(+), 1294 deletions(-)
 create mode 100644 tests/cases/standalone/common/flow/flow_tql_avg.result
 create mode 100644 tests/cases/standalone/common/flow/flow_tql_avg.sql

diff --git a/src/flow/src/df_optimizer.rs b/src/flow/src/df_optimizer.rs
index 1d41d09346..614b79ccf1 100644
--- a/src/flow/src/df_optimizer.rs
+++ b/src/flow/src/df_optimizer.rs
@@ -16,30 +16,19 @@
 
 #![warn(unused)]
 
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::sync::Arc;
 
 use common_error::ext::BoxedError;
 use common_telemetry::debug;
 use datafusion::config::ConfigOptions;
 use datafusion::error::DataFusionError;
-use datafusion::functions_aggregate::count::count_udaf;
-use datafusion::functions_aggregate::sum::sum_udaf;
 use datafusion::optimizer::analyzer::type_coercion::TypeCoercion;
 use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate;
 use datafusion::optimizer::optimize_projections::OptimizeProjections;
 use datafusion::optimizer::simplify_expressions::SimplifyExpressions;
-use datafusion::optimizer::utils::NamePreserver;
 use datafusion::optimizer::{Analyzer, AnalyzerRule, Optimizer, OptimizerContext};
-use datafusion_common::tree_node::{
-    Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor,
-};
-use datafusion_common::{Column, DFSchema, ScalarValue};
-use datafusion_expr::utils::merge_schema;
-use datafusion_expr::{
-    BinaryExpr, ColumnarValue, Expr, Literal, Operator, Projection, ScalarFunctionArgs,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
-};
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor};
 use query::QueryEngine;
 use query::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
 use query::parser::QueryLanguageParser;
@@ -52,7 +41,6 @@ use substrait::DFLogicalSubstraitConvertor;
 
 use crate::adapter::FlownodeContext;
 use crate::error::{DatafusionSnafu, Error, ExternalSnafu, UnexpectedSnafu};
-use crate::expr::{TUMBLE_END, TUMBLE_START};
 use crate::plan::TypedPlan;
 
 // TODO(discord9): use `Analyzer` to manage rules if more `AnalyzerRule` is needed
@@ -63,8 +51,6 @@ pub async fn apply_df_optimizer(
     let cfg = query_ctx.create_config_options();
     let analyzer = Analyzer::with_rules(vec![
         Arc::new(CountWildcardToTimeIndexRule),
-        Arc::new(AvgExpandRule),
-        Arc::new(TumbleExpandRule),
         Arc::new(CheckGroupByRule::new()),
         Arc::new(TypeCoercion::new()),
     ]);
@@ -127,390 +113,6 @@ pub async fn sql_to_flow_plan(
     Ok(flow_plan)
 }
 
-#[derive(Debug)]
-struct AvgExpandRule;
-
-impl AnalyzerRule for AvgExpandRule {
-    fn analyze(
-        &self,
-        plan: datafusion_expr::LogicalPlan,
-        _config: &ConfigOptions,
-    ) -> datafusion_common::Result<datafusion_expr::LogicalPlan> {
-        let transformed = plan
-            .transform_up_with_subqueries(expand_avg_analyzer)?
-            .data
-            .transform_down_with_subqueries(put_aggr_to_proj_analyzer)?
-            .data;
-        Ok(transformed)
-    }
-
-    fn name(&self) -> &str {
-        "avg_expand"
-    }
-}
-
-/// lift aggr's composite aggr_expr to outer proj, and leave aggr only with simple direct aggr expr
-/// i.e.
-/// ```ignore
-/// proj: avg(x)
-/// -- aggr: [sum(x)/count(x) as avg(x)]
-/// ```
-/// becomes:
-/// ```ignore
-/// proj: sum(x)/count(x) as avg(x)
-/// -- aggr: [sum(x), count(x)]
-/// ```
-fn put_aggr_to_proj_analyzer(
-    plan: datafusion_expr::LogicalPlan,
-) -> Result<Transformed<datafusion_expr::LogicalPlan>, DataFusionError> {
-    if let datafusion_expr::LogicalPlan::Projection(proj) = &plan
-        && let datafusion_expr::LogicalPlan::Aggregate(aggr) = proj.input.as_ref()
-    {
-        let mut replace_old_proj_exprs = HashMap::new();
-        let mut expanded_aggr_exprs = vec![];
-        for aggr_expr in &aggr.aggr_expr {
-            let mut is_composite = false;
-            if let Expr::AggregateFunction(_) = &aggr_expr {
-                expanded_aggr_exprs.push(aggr_expr.clone());
-            } else {
-                let old_name = aggr_expr.name_for_alias()?;
-                let new_proj_expr = aggr_expr
-                    .clone()
-                    .transform(|ch| {
-                        if let Expr::AggregateFunction(_) = &ch {
-                            is_composite = true;
-                            expanded_aggr_exprs.push(ch.clone());
-                            Ok(Transformed::yes(Expr::Column(Column::from_qualified_name(
-                                ch.name_for_alias()?,
-                            ))))
-                        } else {
-                            Ok(Transformed::no(ch))
-                        }
-                    })?
-                    .data;
-                replace_old_proj_exprs.insert(old_name, new_proj_expr);
-            }
-        }
-
-        if expanded_aggr_exprs.len() > aggr.aggr_expr.len() {
-            let mut aggr = aggr.clone();
-            aggr.aggr_expr = expanded_aggr_exprs;
-            let mut aggr_plan = datafusion_expr::LogicalPlan::Aggregate(aggr);
-            // important to recompute schema after changing aggr_expr
-            aggr_plan = aggr_plan.recompute_schema()?;
-
-            // reconstruct proj with new proj_exprs
-            let mut new_proj_exprs = proj.expr.clone();
-            for proj_expr in new_proj_exprs.iter_mut() {
-                if let Some(new_proj_expr) =
-                    replace_old_proj_exprs.get(&proj_expr.name_for_alias()?)
-                {
-                    *proj_expr = new_proj_expr.clone();
-                }
-                *proj_expr = proj_expr
-                    .clone()
-                    .transform(|expr| {
-                        if let Some(new_expr) = replace_old_proj_exprs.get(&expr.name_for_alias()?)
-                        {
-                            Ok(Transformed::yes(new_expr.clone()))
-                        } else {
-                            Ok(Transformed::no(expr))
-                        }
-                    })?
-                    .data;
-            }
-            let proj = datafusion_expr::LogicalPlan::Projection(Projection::try_new(
-                new_proj_exprs,
-                Arc::new(aggr_plan),
-            )?);
-            return Ok(Transformed::yes(proj));
-        }
-    }
-    Ok(Transformed::no(plan))
-}
-
-/// expand `avg(<expr>)` function into `cast(sum((<expr>) AS f64)/count((<expr>)`
-fn expand_avg_analyzer(
-    plan: datafusion_expr::LogicalPlan,
-) -> Result<Transformed<datafusion_expr::LogicalPlan>, DataFusionError> {
-    let mut schema = merge_schema(&plan.inputs());
-
-    if let datafusion_expr::LogicalPlan::TableScan(ts) = &plan {
-        let source_schema =
-            DFSchema::try_from_qualified_schema(ts.table_name.clone(), &ts.source.schema())?;
-        schema.merge(&source_schema);
-    }
-
-    let mut expr_rewrite = ExpandAvgRewriter::new(&schema);
-
-    let name_preserver = NamePreserver::new(&plan);
-    // apply coercion rewrite all expressions in the plan individually
-    plan.map_expressions(|expr| {
-        let original_name = name_preserver.save(&expr);
-        Ok(expr
-            .rewrite(&mut expr_rewrite)?
-            .update_data(|expr| original_name.restore(expr)))
-    })?
-    .map_data(|plan| plan.recompute_schema())
-}
-
-/// rewrite `avg(<expr>)` function into `CASE WHEN count(<expr>) !=0 THEN  cast(sum((<expr>) AS avg_return_type)/count((<expr>) ELSE 0`
-///
-/// TODO(discord9): support avg return type decimal128
-///
-/// see impl details at https://github.com/apache/datafusion/blob/4ad4f90d86c57226a4e0fb1f79dfaaf0d404c273/datafusion/expr/src/type_coercion/aggregates.rs#L457-L462
-pub(crate) struct ExpandAvgRewriter<'a> {
-    /// schema of the plan
-    #[allow(unused)]
-    pub(crate) schema: &'a DFSchema,
-}
-
-impl<'a> ExpandAvgRewriter<'a> {
-    fn new(schema: &'a DFSchema) -> Self {
-        Self { schema }
-    }
-}
-
-impl TreeNodeRewriter for ExpandAvgRewriter<'_> {
-    type Node = Expr;
-
-    fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>, DataFusionError> {
-        if let Expr::AggregateFunction(aggr_func) = &expr
-            && aggr_func.func.name() == "avg"
-        {
-            let sum_expr = {
-                let mut tmp = aggr_func.clone();
-                tmp.func = sum_udaf();
-                Expr::AggregateFunction(tmp)
-            };
-            let sum_cast = {
-                let mut tmp = sum_expr.clone();
-                tmp = Expr::Cast(datafusion_expr::Cast {
-                    expr: Box::new(tmp),
-                    data_type: arrow_schema::DataType::Float64,
-                });
-                tmp
-            };
-
-            let count_expr = {
-                let mut tmp = aggr_func.clone();
-                tmp.func = count_udaf();
-
-                Expr::AggregateFunction(tmp)
-            };
-            let count_expr_ref =
-                Expr::Column(Column::from_qualified_name(count_expr.name_for_alias()?));
-
-            let div = BinaryExpr::new(Box::new(sum_cast), Operator::Divide, Box::new(count_expr));
-            let div_expr = Box::new(Expr::BinaryExpr(div));
-
-            let zero = Box::new(0.lit());
-            let not_zero = BinaryExpr::new(Box::new(count_expr_ref), Operator::NotEq, zero.clone());
-            let not_zero = Box::new(Expr::BinaryExpr(not_zero));
-            let null = Box::new(Expr::Literal(ScalarValue::Null, None));
-
-            let case_when =
-                datafusion_expr::Case::new(None, vec![(not_zero, div_expr)], Some(null));
-            let case_when_expr = Expr::Case(case_when);
-
-            return Ok(Transformed::yes(case_when_expr));
-        }
-
-        Ok(Transformed::no(expr))
-    }
-}
-
-/// expand tumble in aggr expr to tumble_start and tumble_end with column name like `window_start`
-#[derive(Debug)]
-struct TumbleExpandRule;
-
-impl AnalyzerRule for TumbleExpandRule {
-    fn analyze(
-        &self,
-        plan: datafusion_expr::LogicalPlan,
-        _config: &ConfigOptions,
-    ) -> datafusion_common::Result<datafusion_expr::LogicalPlan> {
-        let transformed = plan
-            .transform_up_with_subqueries(expand_tumble_analyzer)?
-            .data;
-        Ok(transformed)
-    }
-
-    fn name(&self) -> &str {
-        "tumble_expand"
-    }
-}
-
-/// expand `tumble` in aggr expr to `tumble_start` and `tumble_end`, also expand related alias and column ref
-///
-/// will add `tumble_start` and `tumble_end` to outer projection if not exist before
-fn expand_tumble_analyzer(
-    plan: datafusion_expr::LogicalPlan,
-) -> Result<Transformed<datafusion_expr::LogicalPlan>, DataFusionError> {
-    if let datafusion_expr::LogicalPlan::Projection(proj) = &plan
-        && let datafusion_expr::LogicalPlan::Aggregate(aggr) = proj.input.as_ref()
-    {
-        let mut new_group_expr = vec![];
-        let mut alias_to_expand = HashMap::new();
-        let mut encountered_tumble = false;
-        for expr in aggr.group_expr.iter() {
-            match expr {
-                datafusion_expr::Expr::ScalarFunction(func) if func.name() == "tumble" => {
-                    encountered_tumble = true;
-
-                    let tumble_start = TumbleExpand::new(TUMBLE_START);
-                    let tumble_start = datafusion_expr::expr::ScalarFunction::new_udf(
-                        Arc::new(tumble_start.into()),
-                        func.args.clone(),
-                    );
-                    let tumble_start = datafusion_expr::Expr::ScalarFunction(tumble_start);
-                    let start_col_name = tumble_start.name_for_alias()?;
-                    new_group_expr.push(tumble_start);
-
-                    let tumble_end = TumbleExpand::new(TUMBLE_END);
-                    let tumble_end = datafusion_expr::expr::ScalarFunction::new_udf(
-                        Arc::new(tumble_end.into()),
-                        func.args.clone(),
-                    );
-                    let tumble_end = datafusion_expr::Expr::ScalarFunction(tumble_end);
-                    let end_col_name = tumble_end.name_for_alias()?;
-                    new_group_expr.push(tumble_end);
-
-                    alias_to_expand.insert(expr.name_for_alias()?, (start_col_name, end_col_name));
-                }
-                _ => new_group_expr.push(expr.clone()),
-            }
-        }
-        if !encountered_tumble {
-            return Ok(Transformed::no(plan));
-        }
-        let mut new_aggr = aggr.clone();
-        new_aggr.group_expr = new_group_expr;
-        let new_aggr = datafusion_expr::LogicalPlan::Aggregate(new_aggr).recompute_schema()?;
-        // replace alias in projection if needed, and add new column ref if necessary
-        let mut new_proj_expr = vec![];
-        let mut have_expanded = false;
-
-        for proj_expr in proj.expr.iter() {
-            if let Some((start_col_name, end_col_name)) =
-                alias_to_expand.get(&proj_expr.name_for_alias()?)
-            {
-                let start_col = Column::from_qualified_name(start_col_name);
-                let end_col = Column::from_qualified_name(end_col_name);
-                new_proj_expr.push(datafusion_expr::Expr::Column(start_col));
-                new_proj_expr.push(datafusion_expr::Expr::Column(end_col));
-                have_expanded = true;
-            } else {
-                new_proj_expr.push(proj_expr.clone());
-            }
-        }
-
-        // append to end of projection if not exist
-        if !have_expanded {
-            for (start_col_name, end_col_name) in alias_to_expand.values() {
-                let start_col = Column::from_qualified_name(start_col_name);
-                let end_col = Column::from_qualified_name(end_col_name);
-                new_proj_expr.push(datafusion_expr::Expr::Column(start_col).alias("window_start"));
-                new_proj_expr.push(datafusion_expr::Expr::Column(end_col).alias("window_end"));
-            }
-        }
-
-        let new_proj = datafusion_expr::LogicalPlan::Projection(Projection::try_new(
-            new_proj_expr,
-            Arc::new(new_aggr),
-        )?);
-        return Ok(Transformed::yes(new_proj));
-    }
-
-    Ok(Transformed::no(plan))
-}
-
-/// This is a placeholder for tumble_start and tumble_end function, so that datafusion can
-/// recognize them as scalar function
-#[derive(Debug, PartialEq, Eq, Hash)]
-pub struct TumbleExpand {
-    signature: Signature,
-    name: String,
-}
-
-impl TumbleExpand {
-    pub fn new(name: &str) -> Self {
-        Self {
-            signature: Signature::new(TypeSignature::UserDefined, Volatility::Immutable),
-            name: name.to_string(),
-        }
-    }
-}
-
-impl ScalarUDFImpl for TumbleExpand {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        &self.name
-    }
-
-    /// elide the signature for now
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn coerce_types(
-        &self,
-        arg_types: &[arrow_schema::DataType],
-    ) -> datafusion_common::Result<Vec<arrow_schema::DataType>> {
-        match (arg_types.first(), arg_types.get(1), arg_types.get(2)) {
-            (Some(ts), Some(window), opt) => {
-                use arrow_schema::DataType::*;
-                if !matches!(ts, Date32 | Timestamp(_, _)) {
-                    return Err(DataFusionError::Plan(
-                        format!("Expect timestamp column as first arg for tumble_start, found {:?}", ts)
-                    ));
-                }
-                if !matches!(window, Utf8 | Interval(_)) {
-                    return Err(DataFusionError::Plan(
-                        format!("Expect second arg for window size's type being interval for tumble_start, found {:?}", window),
-                    ));
-                }
-
-                if let Some(start_time) = opt
-                    && !matches!(start_time,  Utf8 | Date32 | Timestamp(_, _)){
-                        return Err(DataFusionError::Plan(
-                            format!("Expect start_time to either be date, timestamp or string, found {:?}", start_time)
-                        ));
-                    }
-
-                Ok(arg_types.to_vec())
-            }
-            _ => Err(DataFusionError::Plan(
-                "Expect tumble function have at least two arg(timestamp column and window size) and a third optional arg for starting time".to_string(),
-            )),
-        }
-    }
-
-    fn return_type(
-        &self,
-        arg_types: &[arrow_schema::DataType],
-    ) -> Result<arrow_schema::DataType, DataFusionError> {
-        arg_types.first().cloned().ok_or_else(|| {
-            DataFusionError::Plan(
-                "Expect tumble function have at least two arg(timestamp column and window size)"
-                    .to_string(),
-            )
-        })
-    }
-
-    fn invoke_with_args(
-        &self,
-        _args: ScalarFunctionArgs,
-    ) -> datafusion_common::Result<ColumnarValue> {
-        Err(DataFusionError::Plan(
-            "This function should not be executed by datafusion".to_string(),
-        ))
-    }
-}
-
 /// This rule check all group by exprs, and make sure they are also in select clause in a aggr query
 #[derive(Debug)]
 struct CheckGroupByRule {}
diff --git a/src/flow/src/transform/aggr.rs b/src/flow/src/transform/aggr.rs
index 579f0e8ee3..861ca8fe65 100644
--- a/src/flow/src/transform/aggr.rs
+++ b/src/flow/src/transform/aggr.rs
@@ -382,10 +382,9 @@ impl TypedPlan {
 
 #[cfg(test)]
 mod test {
-    use std::time::Duration;
 
     use bytes::BytesMut;
-    use common_time::{IntervalMonthDayNano, Timestamp};
+    use common_time::IntervalMonthDayNano;
     use datatypes::data_type::ConcreteDataType as CDT;
     use datatypes::prelude::ConcreteDataType;
     use datatypes::value::Value;
@@ -397,898 +396,6 @@ mod test {
     use crate::repr::{ColumnType, RelationType};
     use crate::transform::test::{create_test_ctx, create_test_query_engine, sql_to_substrait};
 
-    #[tokio::test]
-    async fn test_df_func_basic() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT sum(abs(number)) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00');";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected =
-            TypedPlan {
-                schema: RelationType::new(vec![
-                    ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                    ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                    ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                ])
-                .with_key(vec![2])
-                .with_time_index(Some(1))
-                .into_named(vec![
-                    Some("sum(abs(numbers_with_ts.number))".to_string()),
-                    Some("window_start".to_string()),
-                    Some("window_end".to_string()),
-                ]),
-                plan: Plan::Mfp {
-                    input: Box::new(
-                        Plan::Reduce {
-                            input: Box::new(
-                                Plan::Get {
-                                    id: crate::expr::Id::Global(GlobalId::User(1)),
-                                }
-                                .with_types(
-                                    RelationType::new(vec![
-                                        ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                        ColumnType::new(
-                                            ConcreteDataType::timestamp_millisecond_datatype(),
-                                            false,
-                                        ),
-                                    ])
-                                    .into_named(vec![
-                                        Some("number".to_string()),
-                                        Some("ts".to_string()),
-                                    ]),
-                                )
-                                .mfp(MapFilterProject::new(2).into_safe())
-                                .unwrap(),
-                            ),
-                            key_val_plan: KeyValPlan {
-                                key_plan: MapFilterProject::new(2)
-                                    .map(vec![
-                                        ScalarExpr::Column(1).call_unary(
-                                            UnaryFunc::TumbleWindowFloor {
-                                                window_size: Duration::from_nanos(1_000_000_000),
-                                                start_time: Some(Timestamp::new_millisecond(
-                                                    1625097600000,
-                                                )),
-                                            },
-                                        ),
-                                        ScalarExpr::Column(1).call_unary(
-                                            UnaryFunc::TumbleWindowCeiling {
-                                                window_size: Duration::from_nanos(1_000_000_000),
-                                                start_time: Some(Timestamp::new_millisecond(
-                                                    1625097600000,
-                                                )),
-                                            },
-                                        ),
-                                    ])
-                                    .unwrap()
-                                    .project(vec![2, 3])
-                                    .unwrap()
-                                    .into_safe(),
-                                val_plan: MapFilterProject::new(2)
-                                    .map(vec![ScalarExpr::CallDf {
-                                    df_scalar_fn: DfScalarFunction::try_from_raw_fn(
-                                        RawDfScalarFn {
-                                            f: BytesMut::from(
-                                                b"\x08\x02\"\x08\x1a\x06\x12\x04\n\x02\x12\0"
-                                                    .as_ref(),
-                                            ),
-                                            input_schema: RelationType::new(vec![ColumnType::new(
-                                                ConcreteDataType::uint32_datatype(),
-                                                false,
-                                            )])
-                                            .into_unnamed(),
-                                            extensions: FunctionExtensions::from_iter(
-                                                [
-                                                    (0, "tumble_start".to_string()),
-                                                    (1, "tumble_end".to_string()),
-                                                    (2, "abs".to_string()),
-                                                    (3, "sum".to_string()),
-                                                ]
-                                                .into_iter(),
-                                            ),
-                                        },
-                                    )
-                                    .await
-                                    .unwrap(),
-                                    exprs: vec![ScalarExpr::Column(0)],
-                                }
-                                .cast(CDT::uint64_datatype())])
-                                    .unwrap()
-                                    .project(vec![2])
-                                    .unwrap()
-                                    .into_safe(),
-                            },
-                            reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                                full_aggrs: vec![aggr_expr.clone()],
-                                simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                                distinct_aggrs: vec![],
-                            }),
-                        }
-                        .with_types(
-                            RelationType::new(vec![
-                                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                                ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                            ])
-                            .with_key(vec![1])
-                            .with_time_index(Some(0))
-                            .into_unnamed(),
-                        ),
-                    ),
-                    mfp: MapFilterProject::new(3)
-                        .map(vec![
-                            ScalarExpr::Column(2),
-                            ScalarExpr::Column(0),
-                            ScalarExpr::Column(1),
-                        ])
-                        .unwrap()
-                        .project(vec![3, 4, 5])
-                        .unwrap(),
-                },
-            };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_df_func_expr_tree() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT abs(sum(number)) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00');";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![2])
-            .with_time_index(Some(1))
-            .into_named(vec![
-                Some("abs(sum(numbers_with_ts.number))".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(1_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(1_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())])
-                                .unwrap()
-                                .project(vec![2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: vec![aggr_expr.clone()],
-                            simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                            ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                        ])
-                        .with_key(vec![1])
-                        .with_time_index(Some(0))
-                        .into_named(vec![None, None, None]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        ScalarExpr::CallDf {
-                            df_scalar_fn: DfScalarFunction::try_from_raw_fn(RawDfScalarFn {
-                                f: BytesMut::from(b"\"\x08\x1a\x06\x12\x04\n\x02\x12\0".as_ref()),
-                                input_schema: RelationType::new(vec![ColumnType::new(
-                                    ConcreteDataType::uint64_datatype(),
-                                    true,
-                                )])
-                                .into_unnamed(),
-                                extensions: FunctionExtensions::from_iter(
-                                    [
-                                        (0, "abs".to_string()),
-                                        (1, "tumble_start".to_string()),
-                                        (2, "tumble_end".to_string()),
-                                        (3, "sum".to_string()),
-                                    ]
-                                    .into_iter(),
-                                ),
-                            })
-                            .await
-                            .unwrap(),
-                            exprs: vec![ScalarExpr::Column(2)],
-                        },
-                        ScalarExpr::Column(0),
-                        ScalarExpr::Column(1),
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4, 5])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    /// TODO(discord9): add more illegal sql tests
-    #[tokio::test]
-    async fn test_tumble_composite() {
-        let engine = create_test_query_engine();
-        let sql =
-            "SELECT number, avg(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour'), number";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_exprs = vec![
-            AggregateExpr {
-                func: AggregateFunc::SumUInt64,
-                expr: ScalarExpr::Column(0),
-                distinct: false,
-            },
-            AggregateExpr {
-                func: AggregateFunc::Count,
-                expr: ScalarExpr::Column(1),
-                distinct: false,
-            },
-        ];
-        let avg_expr = ScalarExpr::If {
-            cond: Box::new(ScalarExpr::Column(4).call_binary(
-                ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()),
-                BinaryFunc::NotEq,
-            )),
-            then: Box::new(
-                ScalarExpr::Column(3)
-                    .cast(CDT::float64_datatype())
-                    .call_binary(
-                        ScalarExpr::Column(4).cast(CDT::float64_datatype()),
-                        BinaryFunc::DivFloat64,
-                    ),
-            ),
-            els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())),
-        };
-        let expected = TypedPlan {
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3, 4])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(0).cast(CDT::uint64_datatype()),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: aggr_exprs.clone(),
-                            simple_aggrs: vec![
-                                AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0),
-                                AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1),
-                            ],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            // keys
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start(time index)
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end(pk)
-                            ColumnType::new(CDT::uint32_datatype(), false), // number(pk)
-                            // values
-                            ColumnType::new(CDT::uint64_datatype(), true), // avg.sum(number)
-                            ColumnType::new(CDT::int64_datatype(), true),  // avg.count(number)
-                        ])
-                        .with_key(vec![1, 2])
-                        .with_time_index(Some(0))
-                        .into_named(vec![
-                            None,
-                            None,
-                            Some("number".to_string()),
-                            None,
-                            None,
-                        ]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(5)
-                    .map(vec![
-                        ScalarExpr::Column(2), // number(pk)
-                        avg_expr,
-                        ScalarExpr::Column(0), // window start
-                        ScalarExpr::Column(1), // window end
-                    ])
-                    .unwrap()
-                    .project(vec![5, 6, 7, 8])
-                    .unwrap(),
-            },
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint32_datatype(), false), // number
-                ColumnType::new(CDT::float64_datatype(), true), // avg(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![0, 3])
-            .with_time_index(Some(2))
-            .into_named(vec![
-                Some("number".to_string()),
-                Some("avg(numbers_with_ts.number)".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_tumble_parse_optional() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour')";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![2])
-            .with_time_index(Some(1))
-            .into_named(vec![
-                Some("sum(numbers_with_ts.number)".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())])
-                                .unwrap()
-                                .project(vec![2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: vec![aggr_expr.clone()],
-                            simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                            ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                        ])
-                        .with_key(vec![1])
-                        .with_time_index(Some(0))
-                        .into_named(vec![None, None, None]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        ScalarExpr::Column(2),
-                        ScalarExpr::Column(0),
-                        ScalarExpr::Column(1),
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4, 5])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_tumble_parse() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour', '2021-07-01 00:00:00')";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![2])
-            .with_time_index(Some(1))
-            .into_named(vec![
-                Some("sum(numbers_with_ts.number)".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())])
-                                .unwrap()
-                                .project(vec![2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: vec![aggr_expr.clone()],
-                            simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                            ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                        ])
-                        .with_key(vec![1])
-                        .with_time_index(Some(0))
-                        .into_unnamed(),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        ScalarExpr::Column(2),
-                        ScalarExpr::Column(0),
-                        ScalarExpr::Column(1),
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4, 5])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_avg_group_by() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT avg(number), number FROM numbers GROUP BY number";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan).await;
-
-        let aggr_exprs = vec![
-            AggregateExpr {
-                func: AggregateFunc::SumUInt64,
-                expr: ScalarExpr::Column(0),
-                distinct: false,
-            },
-            AggregateExpr {
-                func: AggregateFunc::Count,
-                expr: ScalarExpr::Column(1),
-                distinct: false,
-            },
-        ];
-        let avg_expr = ScalarExpr::If {
-            cond: Box::new(ScalarExpr::Column(2).call_binary(
-                ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()),
-                BinaryFunc::NotEq,
-            )),
-            then: Box::new(
-                ScalarExpr::Column(1)
-                    .cast(CDT::float64_datatype())
-                    .call_binary(
-                        ScalarExpr::Column(2).cast(CDT::float64_datatype()),
-                        BinaryFunc::DivFloat64,
-                    ),
-            ),
-            els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())),
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::float64_datatype(), true), // avg(number: u32) -> f64
-                ColumnType::new(CDT::uint32_datatype(), false), // number
-            ])
-            .with_key(vec![1])
-            .into_named(vec![
-                Some("avg(numbers.number)".to_string()),
-                Some("number".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(0)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![ColumnType::new(
-                                    ConcreteDataType::uint32_datatype(),
-                                    false,
-                                )])
-                                .into_named(vec![Some("number".to_string())]),
-                            )
-                            .mfp(
-                                MapFilterProject::new(1)
-                                    .project(vec![0])
-                                    .unwrap()
-                                    .into_safe(),
-                            )
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(1)
-                                .map(vec![ScalarExpr::Column(0)])
-                                .unwrap()
-                                .project(vec![1])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(1)
-                                .map(vec![
-                                    ScalarExpr::Column(0).cast(CDT::uint64_datatype()),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![1, 2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: aggr_exprs.clone(),
-                            simple_aggrs: vec![
-                                AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0),
-                                AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1),
-                            ],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(ConcreteDataType::uint32_datatype(), false), // key: number
-                            ColumnType::new(ConcreteDataType::uint64_datatype(), true),  // sum
-                            ColumnType::new(ConcreteDataType::int64_datatype(), true),   // count
-                        ])
-                        .with_key(vec![0])
-                        .into_named(vec![
-                            Some("number".to_string()),
-                            None,
-                            None,
-                        ]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        avg_expr, // col 3
-                        ScalarExpr::Column(0),
-                        // TODO(discord9): optimize mfp so to remove indirect ref
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan.unwrap(), expected);
-    }
-
-    #[tokio::test]
-    async fn test_avg() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT avg(number) FROM numbers";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_exprs = vec![
-            AggregateExpr {
-                func: AggregateFunc::SumUInt64,
-                expr: ScalarExpr::Column(0),
-                distinct: false,
-            },
-            AggregateExpr {
-                func: AggregateFunc::Count,
-                expr: ScalarExpr::Column(1),
-                distinct: false,
-            },
-        ];
-        let avg_expr = ScalarExpr::If {
-            cond: Box::new(ScalarExpr::Column(1).call_binary(
-                ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()),
-                BinaryFunc::NotEq,
-            )),
-            then: Box::new(
-                ScalarExpr::Column(0)
-                    .cast(CDT::float64_datatype())
-                    .call_binary(
-                        ScalarExpr::Column(1).cast(CDT::float64_datatype()),
-                        BinaryFunc::DivFloat64,
-                    ),
-            ),
-            els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())),
-        };
-        let input = Box::new(
-            Plan::Get {
-                id: crate::expr::Id::Global(GlobalId::User(0)),
-            }
-            .with_types(
-                RelationType::new(vec![ColumnType::new(
-                    ConcreteDataType::uint32_datatype(),
-                    false,
-                )])
-                .into_named(vec![Some("number".to_string())]),
-            ),
-        );
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![ColumnType::new(CDT::float64_datatype(), true)])
-                .into_named(vec![Some("avg(numbers.number)".to_string())]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Mfp {
-                                input: input.clone(),
-                                mfp: MapFilterProject::new(1).project(vec![0]).unwrap(),
-                            }
-                            .with_types(
-                                RelationType::new(vec![ColumnType::new(
-                                    CDT::uint32_datatype(),
-                                    false,
-                                )])
-                                .into_named(vec![Some("number".to_string())]),
-                            ),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(1)
-                                .project(vec![])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(1)
-                                .map(vec![
-                                    ScalarExpr::Column(0).cast(CDT::uint64_datatype()),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![1, 2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: aggr_exprs.clone(),
-                            simple_aggrs: vec![
-                                AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0),
-                                AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1),
-                            ],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(ConcreteDataType::uint64_datatype(), true), // sum
-                            ColumnType::new(ConcreteDataType::int64_datatype(), true),  // count
-                        ])
-                        .into_named(vec![None, None]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(2)
-                    .map(vec![
-                        avg_expr,
-                        // TODO(discord9): optimize mfp so to remove indirect ref
-                    ])
-                    .unwrap()
-                    .project(vec![2])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
     #[tokio::test]
     async fn test_sum() {
         let engine = create_test_query_engine();
diff --git a/tests/cases/standalone/common/flow/flow_tql_avg.result b/tests/cases/standalone/common/flow/flow_tql_avg.result
new file mode 100644
index 0000000000..8438f41eb6
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_tql_avg.result
@@ -0,0 +1,126 @@
+CREATE TABLE sensor_readings (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    loc STRING,
+    PRIMARY KEY (sensor, loc)
+);
+
+Affected Rows: 0
+
+CREATE TABLE sensor_readings_avg (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    PRIMARY KEY (sensor)
+);
+
+Affected Rows: 0
+
+INSERT INTO sensor_readings VALUES
+    (20, now() - '30s'::interval, 'test', 'A');
+
+Affected Rows: 1
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
++-------+--------+---------------------+
+| value | sensor | ts                  |
++-------+--------+---------------------+
+| 20.0  | test   | TS |
++-------+--------+---------------------+
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
++-------+--------+---------------------+
+| value | sensor | ts                  |
++-------+--------+---------------------+
+| 20.0  | test   | TS |
++-------+--------+---------------------+
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+Affected Rows: 0
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
++----------------------------------------------+
+| ADMIN FLUSH_FLOW('sensor_readings_avg_flow') |
++----------------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------------+
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
++-------+---------------------+--------+
+| value | ts                  | sensor |
++-------+---------------------+--------+
+| 20.0  | TS | test   |
++-------+---------------------+--------+
+
+DROP FLOW sensor_readings_avg_flow;
+
+Affected Rows: 0
+
+-- SQLNESS SLEEP 1s
+INSERT INTO sensor_readings VALUES
+    (30, now() - '40s'::interval, 'test', 'B');
+
+Affected Rows: 1
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
++-------+--------+---------------------+
+| value | sensor | ts                  |
++-------+--------+---------------------+
+| 25.0  | test   | TS |
++-------+--------+---------------------+
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
+Affected Rows: 0
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
++----------------------------------------------+
+| ADMIN FLUSH_FLOW('sensor_readings_avg_flow') |
++----------------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------------+
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
++-------+---------------------+--------+
+| value | ts                  | sensor |
++-------+---------------------+--------+
+| 25.0  | TS | test   |
++-------+---------------------+--------+
+
+DROP FLOW sensor_readings_avg_flow;
+
+Affected Rows: 0
+
+DROP TABLE sensor_readings_avg;
+
+Affected Rows: 0
+
+DROP TABLE sensor_readings;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/flow/flow_tql_avg.sql b/tests/cases/standalone/common/flow/flow_tql_avg.sql
new file mode 100644
index 0000000000..a5d6ab9d2b
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_tql_avg.sql
@@ -0,0 +1,63 @@
+CREATE TABLE sensor_readings (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    loc STRING,
+    PRIMARY KEY (sensor, loc)
+);
+
+CREATE TABLE sensor_readings_avg (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    PRIMARY KEY (sensor)
+);
+
+INSERT INTO sensor_readings VALUES
+    (20, now() - '30s'::interval, 'test', 'A');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
+DROP FLOW sensor_readings_avg_flow;
+
+-- SQLNESS SLEEP 1s
+INSERT INTO sensor_readings VALUES
+    (30, now() - '40s'::interval, 'test', 'B');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
+DROP FLOW sensor_readings_avg_flow;
+
+DROP TABLE sensor_readings_avg;
+DROP TABLE sensor_readings;

From 3cdf03d830930424125fe5f31677dadb672071b7 Mon Sep 17 00:00:00 2001
From: Ning Sun <sunng@protonmail.com>
Date: Fri, 13 Mar 2026 11:40:04 +0800
Subject: [PATCH 04/42] feat: introduce APIs for storing perses dashboard
 definition (#7791)

* feat: introduce APIs for storing perses dashboard definition

* test: ensure we can update dashboard

* refactor: construct dashboard definition directly

* refactor: don't create table on list requests
---
 src/frontend/src/instance.rs                  |   1 +
 src/frontend/src/instance/dashboard.rs        | 405 ++++++++++++++++++
 src/frontend/src/server.rs                    |   2 +
 src/servers/src/http.rs                       |  37 +-
 src/servers/src/http/dashboard.rs             | 114 ++++-
 .../src/http/result/greptime_manage_resp.rs   |  27 ++
 src/servers/src/query_handler.rs              |  18 +
 tests-integration/Cargo.toml                  |   2 +-
 tests-integration/src/test_util.rs            |   1 +
 tests-integration/tests/http.rs               | 116 +++++
 10 files changed, 717 insertions(+), 6 deletions(-)
 create mode 100644 src/frontend/src/instance/dashboard.rs

diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs
index fa8a74cad2..ce589bb677 100644
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod builder;
+mod dashboard;
 mod grpc;
 mod influxdb;
 mod jaeger;
diff --git a/src/frontend/src/instance/dashboard.rs b/src/frontend/src/instance/dashboard.rs
new file mode 100644
index 0000000000..373961dbfa
--- /dev/null
+++ b/src/frontend/src/instance/dashboard.rs
@@ -0,0 +1,405 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use api::v1::value::ValueData;
+use api::v1::{
+    ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest,
+    RowInsertRequests, Rows, SemanticType,
+};
+use async_trait::async_trait;
+use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine};
+use common_error::ext::BoxedError;
+use common_query::OutputData;
+use common_recordbatch::util as record_util;
+use common_telemetry::info;
+use common_time::FOREVER;
+use datafusion::datasource::DefaultTableSource;
+use datafusion::logical_expr::col;
+use datafusion::sql::TableReference;
+use datafusion_expr::{DmlStatement, LogicalPlan, lit};
+use datatypes::arrow::array::{Array, AsArray};
+use servers::error::{
+    CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, ExecuteQuerySnafu, NotSupportedSnafu,
+    TableNotFoundSnafu,
+};
+use servers::query_handler::DashboardDefinition;
+use session::context::{QueryContextBuilder, QueryContextRef};
+use snafu::{OptionExt, ResultExt};
+use table::TableRef;
+use table::metadata::TableInfo;
+use table::requests::TTL_KEY;
+use table::table::adapter::DfTableProviderAdapter;
+
+use crate::instance::Instance;
+
+pub const DASHBOARD_TABLE_NAME: &str = "dashboard";
+pub const DASHBOARD_TABLE_NAME_COLUMN_NAME: &str = "name";
+pub const DASHBOARD_TABLE_DEFINITION_COLUMN_NAME: &str = "definition";
+pub const DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME: &str = "created_at";
+
+impl Instance {
+    /// Build the schema for the dashboard table.
+    /// Returns the (time index, primary keys, column) definitions.
+    fn build_dashboard_schema() -> (String, Vec<String>, Vec<ColumnDef>) {
+        (
+            DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), // time index
+            vec![DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string()], // primary keys
+            vec![
+                ColumnDef {
+                    name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(),
+                    data_type: ColumnDataType::String as i32,
+                    is_nullable: false,
+                    default_constraint: vec![],
+                    semantic_type: SemanticType::Tag as i32,
+                    comment: String::new(),
+                    datatype_extension: None,
+                    options: None,
+                },
+                ColumnDef {
+                    name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(),
+                    data_type: ColumnDataType::String as i32,
+                    is_nullable: false,
+                    default_constraint: vec![],
+                    semantic_type: SemanticType::Field as i32,
+                    comment: String::new(),
+                    datatype_extension: None,
+                    options: None,
+                },
+                ColumnDef {
+                    name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(),
+                    data_type: ColumnDataType::TimestampNanosecond as i32,
+                    is_nullable: false,
+                    default_constraint: vec![],
+                    semantic_type: SemanticType::Timestamp as i32,
+                    comment: String::new(),
+                    datatype_extension: None,
+                    options: None,
+                },
+            ],
+        )
+    }
+
+    /// Build the column schemas for inserting a row into the dashboard table.
+    fn build_dashboard_insert_column_schemas() -> Vec<PbColumnSchema> {
+        vec![
+            PbColumnSchema {
+                column_name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(),
+                datatype: ColumnDataType::String.into(),
+                semantic_type: SemanticType::Tag.into(),
+                ..Default::default()
+            },
+            PbColumnSchema {
+                column_name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(),
+                datatype: ColumnDataType::String.into(),
+                semantic_type: SemanticType::Field.into(),
+                ..Default::default()
+            },
+            PbColumnSchema {
+                column_name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(),
+                datatype: ColumnDataType::TimestampNanosecond.into(),
+                semantic_type: SemanticType::Timestamp.into(),
+                ..Default::default()
+            },
+        ]
+    }
+
+    fn dashboard_query_ctx(table_info: &TableInfo) -> QueryContextRef {
+        QueryContextBuilder::default()
+            .current_catalog(table_info.catalog_name.clone())
+            .current_schema(table_info.schema_name.clone())
+            .build()
+            .into()
+    }
+
+    async fn create_dashboard_table_if_not_exists(
+        &self,
+        ctx: QueryContextRef,
+    ) -> servers::error::Result<TableRef> {
+        let catalog = ctx.current_catalog();
+
+        if let Some(table) = self
+            .catalog_manager
+            .table(
+                catalog,
+                DEFAULT_PRIVATE_SCHEMA_NAME,
+                DASHBOARD_TABLE_NAME,
+                Some(&ctx),
+            )
+            .await
+            .context(CatalogSnafu)?
+        {
+            return Ok(table);
+        }
+
+        let (time_index, primary_keys, column_defs) = Self::build_dashboard_schema();
+
+        let mut table_options = HashMap::new();
+        table_options.insert(TTL_KEY.to_string(), FOREVER.to_string());
+
+        let mut create_table_expr = api::v1::CreateTableExpr {
+            catalog_name: catalog.to_string(),
+            schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(),
+            table_name: DASHBOARD_TABLE_NAME.to_string(),
+            desc: "GreptimeDB dashboard table".to_string(),
+            column_defs,
+            time_index,
+            primary_keys,
+            create_if_not_exists: true,
+            table_options,
+            table_id: None,
+            engine: default_engine().to_string(),
+        };
+
+        self.statement_executor
+            .create_table_inner(&mut create_table_expr, None, ctx.clone())
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let table = self
+            .catalog_manager
+            .table(
+                catalog,
+                DEFAULT_PRIVATE_SCHEMA_NAME,
+                DASHBOARD_TABLE_NAME,
+                Some(&ctx),
+            )
+            .await
+            .context(CatalogSnafu)?
+            .context(TableNotFoundSnafu {
+                catalog: catalog.to_string(),
+                schema: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(),
+                table: DASHBOARD_TABLE_NAME.to_string(),
+            })?;
+
+        Ok(table)
+    }
+
+    /// Insert a dashboard row (name, definition, timestamp 0); creates the table if missing.
+    async fn insert_dashboard(
+        &self,
+        name: &str,
+        definition: &str,
+        query_ctx: QueryContextRef,
+    ) -> servers::error::Result<()> {
+        let table = self
+            .create_dashboard_table_if_not_exists(query_ctx.clone())
+            .await?;
+        let table_info = table.table_info();
+
+        let insert = RowInsertRequest {
+            table_name: DASHBOARD_TABLE_NAME.to_string(),
+            rows: Some(Rows {
+                schema: Self::build_dashboard_insert_column_schemas(),
+                rows: vec![Row {
+                    values: vec![
+                        ValueData::StringValue(name.to_string()).into(),
+                        ValueData::StringValue(definition.to_string()).into(),
+                        ValueData::TimestampNanosecondValue(0).into(),
+                    ],
+                }],
+            }),
+        };
+
+        let requests = RowInsertRequests {
+            inserts: vec![insert],
+        };
+
+        let output = self
+            .inserter
+            .handle_row_inserts(
+                requests,
+                Self::dashboard_query_ctx(&table_info),
+                &self.statement_executor,
+                false,
+                false,
+            )
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        info!(
+            "Insert dashboard success, name: {}, table: {}, output: {:?}",
+            name,
+            table_info.full_table_name(),
+            output
+        );
+
+        Ok(())
+    }
+
+    /// List all dashboards; returns an empty list if the dashboard table does not exist.
+    async fn list_dashboards(
+        &self,
+        query_ctx: QueryContextRef,
+    ) -> servers::error::Result<Vec<DashboardDefinition>> {
+        let table = if let Some(table) = self
+            .catalog_manager
+            .table(
+                query_ctx.current_catalog(),
+                DEFAULT_PRIVATE_SCHEMA_NAME,
+                DASHBOARD_TABLE_NAME,
+                Some(&query_ctx),
+            )
+            .await
+            .context(CatalogSnafu)?
+        {
+            table
+        } else {
+            return Ok(vec![]);
+        };
+
+        let table_info = table.table_info();
+
+        let dataframe = self
+            .query_engine
+            .read_table(table.clone())
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let dataframe = dataframe
+            .select_columns(&[
+                DASHBOARD_TABLE_NAME_COLUMN_NAME,
+                DASHBOARD_TABLE_DEFINITION_COLUMN_NAME,
+            ])
+            .context(DataFusionSnafu)?;
+
+        let plan = dataframe.into_parts().1;
+
+        let output = self
+            .query_engine
+            .execute(plan, Self::dashboard_query_ctx(&table_info))
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let stream = match output.data {
+            OutputData::Stream(stream) => stream,
+            OutputData::RecordBatches(record_batches) => record_batches.as_stream(),
+            _ => unreachable!(), // any other output variant panics here
+        };
+
+        let records = record_util::collect(stream)
+            .await
+            .context(CollectRecordbatchSnafu)?;
+
+        let mut dashboards = Vec::new();
+
+        for r in &records {
+            let name_column = r.column(0);
+            let definition_column = r.column(1);
+
+            let name = name_column
+                .as_string_opt::<i32>()
+                .context(NotSupportedSnafu {
+                    feat: "Invalid data type for greptime_private.dashboard.name",
+                })?;
+
+            let definition =
+                definition_column
+                    .as_string_opt::<i32>()
+                    .context(NotSupportedSnafu {
+                        feat: "Invalid data type for greptime_private.dashboard.definition",
+                    })?;
+
+            for i in 0..name.len() {
+                dashboards.push(DashboardDefinition {
+                    name: name.value(i).to_string(),
+                    definition: definition.value(i).to_string(),
+                });
+            }
+        }
+
+        Ok(dashboards)
+    }
+
+    /// Delete a dashboard by name; the dashboard table is created first if it does not exist.
+    async fn delete_dashboard(
+        &self,
+        name: &str,
+        query_ctx: QueryContextRef,
+    ) -> servers::error::Result<()> {
+        let table = self
+            .create_dashboard_table_if_not_exists(query_ctx.clone())
+            .await?;
+        let table_info = table.table_info();
+
+        let dataframe = self
+            .query_engine
+            .read_table(table.clone())
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let name_condition = col(DASHBOARD_TABLE_NAME_COLUMN_NAME).eq(lit(name));
+
+        let dataframe = dataframe.filter(name_condition).context(DataFusionSnafu)?;
+
+        let table_name = TableReference::full(
+            table_info.catalog_name.clone(),
+            table_info.schema_name.clone(),
+            table_info.name.clone(),
+        );
+
+        let table_provider = Arc::new(DfTableProviderAdapter::new(table.clone()));
+        let table_source = Arc::new(DefaultTableSource::new(table_provider));
+
+        let stmt = DmlStatement::new(
+            table_name,
+            table_source,
+            datafusion_expr::WriteOp::Delete,
+            Arc::new(dataframe.into_parts().1),
+        );
+
+        let plan = LogicalPlan::Dml(stmt);
+
+        let output = self
+            .query_engine
+            .execute(plan, Self::dashboard_query_ctx(&table_info))
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        info!(
+            "Delete dashboard success, name: {}, table: {}, output: {:?}",
+            name,
+            table_info.full_table_name(),
+            output
+        );
+
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl servers::query_handler::DashboardHandler for Instance {
+    async fn save(
+        &self,
+        name: &str,
+        definition: &str,
+        ctx: QueryContextRef,
+    ) -> servers::error::Result<()> {
+        self.insert_dashboard(name, definition, ctx).await
+    }
+
+    async fn list(&self, ctx: QueryContextRef) -> servers::error::Result<Vec<DashboardDefinition>> {
+        self.list_dashboards(ctx).await
+    }
+
+    async fn delete(&self, name: &str, ctx: QueryContextRef) -> servers::error::Result<()> {
+        self.delete_dashboard(name, ctx).await
+    }
+}
diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs
index 45c3ec3649..4b51efbd33 100644
--- a/src/frontend/src/server.rs
+++ b/src/frontend/src/server.rs
@@ -143,6 +143,8 @@ where
             builder = builder.with_jaeger_handler(self.instance.clone());
         }
 
+        builder = builder.with_dashboard_handler(self.instance.clone());
+
         if let Some(configurator) = self.plugins.get::<RouterConfigurator>() {
             info!("Adding extra router from plugins");
             builder = builder.with_extra_router(configurator.router());
diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs
index ca6a77a077..ffd0745041 100644
--- a/src/servers/src/http.rs
+++ b/src/servers/src/http.rs
@@ -78,7 +78,7 @@ use crate::metrics_handler::MetricsHandler;
 use crate::prometheus_handler::PrometheusHandlerRef;
 use crate::query_handler::sql::ServerSqlQueryHandlerRef;
 use crate::query_handler::{
-    InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef,
+    DashboardHandlerRef, InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef,
     OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef, PipelineHandlerRef,
     PromStoreProtocolHandlerRef,
 };
@@ -507,6 +507,11 @@ pub struct GreptimeOptionsConfigState {
     pub greptime_config_options: String,
 }
 
+#[derive(Clone)]
+pub struct DashboardState {
+    pub handler: DashboardHandlerRef,
+}
+
 pub struct HttpServerBuilder {
     options: HttpOptions,
     plugins: Plugins,
@@ -703,6 +708,16 @@ impl HttpServerBuilder {
         }
     }
 
+    /// Mounts the dashboard routes under `/{HTTP_API_VERSION}/dashboards`.
+    pub fn with_dashboard_handler(self, handler: DashboardHandlerRef) -> Self {
+        let prefix = format!("/{HTTP_API_VERSION}/dashboards");
+        let dashboard_router = HttpServer::route_dashboard(handler);
+        Self {
+            router: self.router.nest(&prefix, dashboard_router),
+            ..self
+        }
+    }
+
     pub fn with_extra_router(self, router: Router) -> Self {
         Self {
             router: self.router.merge(router),
@@ -1169,6 +1184,26 @@ impl HttpServer {
             )
             .with_state(handler)
     }
+
+    /// Dashboard routes: `GET /` lists, `POST`/`DELETE /{dashboard_name}` save and delete.
+    #[cfg(feature = "dashboard")]
+    fn route_dashboard<S>(handler: DashboardHandlerRef) -> Router<S> {
+        use crate::http::dashboard::{add_dashboard, delete_dashboard, list_dashboards};
+
+        let decompression = RequestDecompressionLayer::new().pass_through_unaccepted(true);
+
+        Router::new()
+            .route("/", routing::get(list_dashboards))
+            .route("/{dashboard_name}", routing::post(add_dashboard))
+            .route("/{dashboard_name}", routing::delete(delete_dashboard))
+            .layer(ServiceBuilder::new().layer(decompression))
+            .with_state(DashboardState { handler })
+    }
+
+    #[cfg(not(feature = "dashboard"))]
+    fn route_dashboard<S>(handler: DashboardHandlerRef) -> Router<S> {
+        Router::new().with_state(DashboardState { handler })
+    }
 }
 
 pub const HTTP_SERVER: &str = "HTTP_SERVER";
diff --git a/src/servers/src/http/dashboard.rs b/src/servers/src/http/dashboard.rs
index bdb98490f0..ea894ca7d0 100644
--- a/src/servers/src/http/dashboard.rs
+++ b/src/servers/src/http/dashboard.rs
@@ -12,14 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use axum::body::Body;
+use std::sync::Arc;
+use std::time::Instant;
+
+use axum::body::{Body, Bytes};
+use axum::extract::{Extension, Path, State};
 use axum::http::{StatusCode, Uri, header};
 use axum::response::Response;
-use common_telemetry::debug;
+use common_telemetry::{debug, error};
 use rust_embed::RustEmbed;
-use snafu::ResultExt;
+use session::context::{Channel, QueryContext};
+use snafu::{ResultExt, ensure};
 
-use crate::error::{BuildHttpResponseSnafu, Result};
+use crate::error::{BuildHttpResponseSnafu, InvalidParameterSnafu, Result};
+use crate::http::DashboardState;
+use crate::http::result::greptime_manage_resp::{DashboardOutput, GreptimedbManageResponse};
 
 #[derive(RustEmbed)]
 #[folder = "dashboard/dist/"]
@@ -61,3 +68,102 @@ fn get_assets(path: &str) -> Result<Response> {
     }
     .context(BuildHttpResponseSnafu)
 }
+
+/// Saves the request body as dashboard `dashboard_name`; a non-UTF-8 body is rejected.
+#[axum_macros::debug_handler]
+pub async fn add_dashboard(
+    State(state): State<DashboardState>,
+    Path(dashboard_name): Path<String>,
+    Extension(mut query_ctx): Extension<QueryContext>,
+    payload: Bytes,
+) -> Result<GreptimedbManageResponse> {
+    let start = Instant::now();
+    ensure!(
+        !dashboard_name.is_empty(),
+        InvalidParameterSnafu {
+            reason: "dashboard_name is required in path",
+        }
+    );
+    // `from_utf8_lossy` would silently store U+FFFD in place of bad bytes.
+    let Ok(definition) = std::str::from_utf8(&payload) else {
+        let reason = "dashboard definition must be valid UTF-8";
+        return InvalidParameterSnafu { reason }.fail();
+    };
+
+    query_ctx.set_channel(Channel::HttpSql);
+    let query_ctx = Arc::new(query_ctx);
+
+    match state.handler.save(&dashboard_name, definition, query_ctx).await {
+        Ok(()) => {
+            let elapsed_ms = start.elapsed().as_millis() as u64;
+            Ok(GreptimedbManageResponse::from_dashboard(dashboard_name, elapsed_ms))
+        }
+        Err(e) => {
+            error!(e; "failed to save dashboard");
+            Err(e)
+        }
+    }
+}
+
+/// Lists the dashboards returned by the handler, including their definitions.
+#[axum_macros::debug_handler]
+pub async fn list_dashboards(
+    State(state): State<DashboardState>,
+    Extension(mut query_ctx): Extension<QueryContext>,
+) -> Result<GreptimedbManageResponse> {
+    let start = Instant::now();
+
+    query_ctx.set_channel(Channel::HttpSql);
+    let query_ctx = Arc::new(query_ctx);
+
+    let dashboards = match state.handler.list(query_ctx).await {
+        Ok(dashboards) => dashboards,
+        Err(e) => {
+            error!(e; "failed to list dashboards");
+            return Err(e);
+        }
+    };
+    // Map handler-level definitions onto the HTTP response type.
+    let outputs: Vec<DashboardOutput> = dashboards
+        .into_iter()
+        .map(|d| DashboardOutput {
+            name: d.name,
+            definition: d.definition,
+        })
+        .collect();
+    let elapsed_ms = start.elapsed().as_millis() as u64;
+    Ok(GreptimedbManageResponse::from_dashboards(outputs, elapsed_ms))
+}
+
+/// Deletes the dashboard named by the `dashboard_name` path segment.
+///
+/// On success the response names the deleted dashboard; a handler failure is
+/// logged and returned unchanged.
+#[axum_macros::debug_handler]
+pub async fn delete_dashboard(
+    State(state): State<DashboardState>,
+    Extension(mut query_ctx): Extension<QueryContext>,
+    Path(dashboard_name): Path<String>,
+) -> Result<GreptimedbManageResponse> {
+    let start = Instant::now();
+    ensure!(
+        !dashboard_name.is_empty(),
+        InvalidParameterSnafu {
+            reason: "dashboard_name is required",
+        }
+    );
+
+    query_ctx.set_channel(Channel::HttpSql);
+    let query_ctx = Arc::new(query_ctx);
+
+    match state.handler.delete(&dashboard_name, query_ctx).await {
+        Ok(()) => {
+            let elapsed_ms = start.elapsed().as_millis() as u64;
+            Ok(GreptimedbManageResponse::from_dashboard(dashboard_name, elapsed_ms))
+        }
+        Err(e) => {
+            error!(e; "failed to delete dashboard");
+            Err(e)
+        }
+    }
+}
diff --git a/src/servers/src/http/result/greptime_manage_resp.rs b/src/servers/src/http/result/greptime_manage_resp.rs
index 3f7f3c6eec..2b3a5d455c 100644
--- a/src/servers/src/http/result/greptime_manage_resp.rs
+++ b/src/servers/src/http/result/greptime_manage_resp.rs
@@ -62,6 +62,25 @@ impl GreptimedbManageResponse {
         }
     }
 
+    /// Builds a response naming a single dashboard; its empty definition is
+    /// skipped on serialization.
+    pub fn from_dashboard(name: String, execution_time_ms: u64) -> Self {
+        let dashboard = DashboardOutput {
+            name,
+            definition: String::new(),
+        };
+        Self::from_dashboards(vec![dashboard], execution_time_ms)
+    }
+
+    /// Builds a response listing `dashboards` as the manage result.
+    pub fn from_dashboards(dashboards: Vec<DashboardOutput>, execution_time_ms: u64) -> Self {
+        let manage_result = ManageResult::Dashboards { dashboards };
+        Self {
+            manage_result,
+            execution_time_ms,
+        }
+    }
+
     pub fn with_execution_time(mut self, execution_time: u64) -> Self {
         self.execution_time_ms = execution_time;
         self
@@ -77,6 +96,7 @@ impl GreptimedbManageResponse {
 pub enum ManageResult {
     Pipelines { pipelines: Vec<PipelineOutput> },
     Sql { sql: SqlOutput },
+    Dashboards { dashboards: Vec<DashboardOutput> },
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -87,6 +107,13 @@ pub struct PipelineOutput {
     pipeline: Option<String>,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct DashboardOutput {
+    pub name: String,
+    #[serde(skip_serializing_if = "String::is_empty")]
+    pub definition: String,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct SqlOutput {
     pub(crate) sql: String,
diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs
index 60efe69faa..21c7646560 100644
--- a/src/servers/src/query_handler.rs
+++ b/src/servers/src/query_handler.rs
@@ -44,6 +44,12 @@ use pipeline::{GreptimePipelineParams, Pipeline, PipelineInfo, PipelineVersion,
 use serde_json::Value;
 use session::context::{QueryContext, QueryContextRef};
 
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct DashboardDefinition {
+    pub name: String,
+    pub definition: String,
+}
+
 use crate::error::Result;
 use crate::http::jaeger::QueryTraceParams;
 use crate::influxdb::InfluxdbRequest;
@@ -176,6 +182,18 @@ pub trait PipelineHandler {
     ) -> Result<(String, TimestampNanosecond)>;
 }
 
+/// Shared, thread-safe handle to a [`DashboardHandler`] (dashboard-as-code CRUD).
+pub type DashboardHandlerRef = Arc<dyn DashboardHandler + Send + Sync>;
+
+#[async_trait]
+pub trait DashboardHandler {
+    async fn save(&self, name: &str, definition: &str, ctx: QueryContextRef) -> Result<()>;
+
+    async fn list(&self, ctx: QueryContextRef) -> Result<Vec<DashboardDefinition>>;
+
+    async fn delete(&self, name: &str, ctx: QueryContextRef) -> Result<()>;
+}
+
 /// Handle log query requests.
 #[async_trait]
 pub trait LogQueryHandler {
diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml
index 0c6b965fd3..ec35205a55 100644
--- a/tests-integration/Cargo.toml
+++ b/tests-integration/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 license.workspace = true
 
 [features]
-dashboard = []
+dashboard = ["servers/dashboard"]
 vector_index = []
 
 [lints]
diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs
index fd0d1ef3c4..2bf6e812c7 100644
--- a/tests-integration/src/test_util.rs
+++ b/tests-integration/src/test_util.rs
@@ -534,6 +534,7 @@ pub async fn setup_test_http_app_with_frontend_and_custom_options(
         .with_influxdb_handler(instance.fe_instance().clone())
         .with_otlp_handler(instance.fe_instance().clone(), true)
         .with_jaeger_handler(instance.fe_instance().clone())
+        .with_dashboard_handler(instance.fe_instance().clone())
         .with_greptime_config_options(instance.opts.to_toml().unwrap());
 
     if let Some(user_provider) = user_provider {
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index 68fa2a228d..c259d3ff24 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -106,6 +106,7 @@ macro_rules! http_tests {
                 test_config_api,
                 test_dynamic_tracer_toggle,
                 test_dashboard_path,
+                test_dashboard_api,
                 test_prometheus_remote_write,
                 test_prometheus_remote_special_labels,
                 test_prometheus_remote_schema_labels,
@@ -1720,6 +1721,121 @@ pub async fn test_dashboard_path(store_type: StorageType) {
 #[cfg(not(feature = "dashboard"))]
 pub async fn test_dashboard_path(_: StorageType) {}
 
+#[cfg(feature = "dashboard")]
+pub async fn test_dashboard_api(store_type: StorageType) {
+    common_telemetry::init_default_ut_logging();
+    let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "dashboard_api").await;
+    let client = TestClient::new(app).await;
+
+    // 1. List dashboards - should be empty initially
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert!(dashboards.is_empty());
+
+    // 2. Save a dashboard
+    let dashboard_definition = r#"{"title": "My Dashboard", "panels": []}"#;
+    let res = client
+        .post("/v1/dashboards/test_dashboard")
+        .body(dashboard_definition)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard");
+
+    // 3. Save another dashboard
+    let res = client
+        .post("/v1/dashboards/another_dashboard")
+        .body(r#"{"title": "Another Dashboard"}"#)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    // 4. List dashboards - should have 2
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 2);
+
+    let names: Vec<&str> = dashboards
+        .iter()
+        .map(|d| d.get("name").unwrap().as_str().unwrap())
+        .collect();
+    assert!(names.contains(&"test_dashboard"));
+    assert!(names.contains(&"another_dashboard"));
+
+    // 5. Update a dashboard by posting again with new definition
+    let updated_definition = r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"#;
+    let res = client
+        .post("/v1/dashboards/test_dashboard")
+        .body(updated_definition)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard");
+
+    // Verify the definition was updated by listing again
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 2);
+
+    // Find test_dashboard and verify it has updated definition
+    let test_db = dashboards
+        .iter()
+        .find(|d| d.get("name").unwrap() == "test_dashboard")
+        .unwrap();
+    assert_eq!(
+        test_db.get("definition").unwrap(),
+        r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"#
+    );
+
+    // 6. Delete one dashboard
+    let res = client.delete("/v1/dashboards/test_dashboard").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard");
+
+    // 7. List dashboards - should have 1
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "another_dashboard");
+
+    // 8. Delete the remaining dashboard
+    let res = client
+        .delete("/v1/dashboards/another_dashboard")
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    // 9. List dashboards - should be empty
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert!(dashboards.is_empty());
+
+    guard.remove_all().await;
+}
+
+#[cfg(not(feature = "dashboard"))]
+pub async fn test_dashboard_api(_: StorageType) {}
+
 pub async fn test_prometheus_remote_write(store_type: StorageType) {
     common_telemetry::init_default_ut_logging();
     let (app, mut guard) =

From 0572a680af48c1e0fad55a3eea0087852940a273 Mon Sep 17 00:00:00 2001
From: dennis zhuang <killme2008@gmail.com>
Date: Fri, 13 Mar 2026 11:57:08 +0800
Subject: [PATCH 05/42] fix: allow empty string for env values (#7803)

* fix: allow empty string for env values

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: strip suffix

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
---
 src/common/config/src/config.rs | 29 +++++++++++++++++-
 src/common/query/src/prelude.rs | 53 ++++++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/src/common/config/src/config.rs b/src/common/config/src/config.rs
index e25c46a0c0..85ce3d206f 100644
--- a/src/common/config/src/config.rs
+++ b/src/common/config/src/config.rs
@@ -53,7 +53,7 @@ pub trait Configurable: Serialize + DeserializeOwned + Default + Sized {
 
             env.try_parsing(true)
                 .separator(ENV_VAR_SEP)
-                .ignore_empty(true)
+                .ignore_empty(false)
         };
 
         // Workaround: Replacement for `Config::try_from(&default_opts)` due to
@@ -237,4 +237,31 @@ mod tests {
             },
         );
     }
+
+    #[derive(Debug, Serialize, Deserialize, Default)]
+    struct SimpleConfig {
+        name: Option<String>,
+        prefix: Option<String>,
+    }
+
+    impl Configurable for SimpleConfig {}
+
+    #[test]
+    fn test_empty_env_var_is_not_ignored() {
+        let env_prefix = "SIMPLE_CFG_UT";
+        temp_env::with_vars(
+            [(
+                [env_prefix.to_string(), "PREFIX".to_string()].join(ENV_VAR_SEP),
+                Some(""),
+            )],
+            || {
+                let opts = SimpleConfig::load_layered_options(None, env_prefix).unwrap();
+                // With ignore_empty(false), an empty env var should yield Some("")
+                // rather than None (which was the previous behavior with ignore_empty(true)).
+                assert_eq!(opts.prefix, Some("".to_string()));
+                // Unset env var should remain None.
+                assert_eq!(opts.name, None);
+            },
+        );
+    }
 }
diff --git a/src/common/query/src/prelude.rs b/src/common/query/src/prelude.rs
index c27b94294e..50668bbbb1 100644
--- a/src/common/query/src/prelude.rs
+++ b/src/common/query/src/prelude.rs
@@ -27,7 +27,16 @@ static GREPTIME_TIMESTAMP_CELL: OnceCell<String> = OnceCell::new();
 static GREPTIME_VALUE_CELL: OnceCell<String> = OnceCell::new();
 
 pub fn set_default_prefix(prefix: Option<&str>) -> Result<()> {
-    match prefix {
+    // Strip surrounding double quotes as a defensive measure against upstream
+    // sources (scripts, CI, template engines, incorrect shell escaping) that may
+    // pass literal `""` as the value instead of an empty string.
+    let stripped = prefix.map(|s| {
+        s.strip_prefix('"')
+            .and_then(|s| s.strip_suffix('"'))
+            .unwrap_or(s)
+    });
+
+    match stripped {
         None => {
             // use default greptime prefix
             GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string());
@@ -70,3 +79,45 @@ const GREPTIME_VALUE: &str = "greptime_value";
 pub const GREPTIME_COUNT: &str = "greptime_count";
 /// Default physical table name
 pub const GREPTIME_PHYSICAL_TABLE: &str = "greptime_physical_table";
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // These tests rely on `cargo nextest` running each test in its own process;
+    // in a shared process the `OnceCell` state set by one test leaks into the next.
+
+    #[test]
+    fn test_set_default_prefix_none() {
+        set_default_prefix(None).unwrap();
+        assert_eq!(greptime_timestamp(), "greptime_timestamp");
+        assert_eq!(greptime_value(), "greptime_value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_empty_string() {
+        set_default_prefix(Some("")).unwrap();
+        assert_eq!(greptime_timestamp(), "timestamp");
+        assert_eq!(greptime_value(), "value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_quoted_empty() {
+        // Handles upstream sources that pass literal `""` instead of an empty string
+        set_default_prefix(Some("\"\"")).unwrap();
+        assert_eq!(greptime_timestamp(), "timestamp");
+        assert_eq!(greptime_value(), "value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_custom() {
+        set_default_prefix(Some("mydb")).unwrap();
+        assert_eq!(greptime_timestamp(), "mydb_timestamp");
+        assert_eq!(greptime_value(), "mydb_value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_invalid() {
+        assert!(set_default_prefix(Some("invalid prefix!")).is_err());
+    }
+}

From 37105c8354c0fa86f941f563af1f8db8399e9e14 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 13 Mar 2026 14:28:58 +0800
Subject: [PATCH 06/42] chore(deps): bump quinn-proto from 0.11.12 to 0.11.14
 (#7805)

Bumps [quinn-proto](https://github.com/quinn-rs/quinn) from 0.11.12 to 0.11.14.
- [Release notes](https://github.com/quinn-rs/quinn/releases)
- [Commits](https://github.com/quinn-rs/quinn/compare/quinn-proto-0.11.12...quinn-proto-0.11.14)

---
updated-dependencies:
- dependency-name: quinn-proto
  dependency-version: 0.11.14
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 85c2b1ed2d..94f7a3eca1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -10771,9 +10771,9 @@ dependencies = [
 
 [[package]]
 name = "quinn-proto"
-version = "0.11.12"
+version = "0.11.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
 dependencies = [
  "bytes",
  "getrandom 0.3.3",

From 20f38d8a6aabeb905e2f4a0c21743ad98fa7aee2 Mon Sep 17 00:00:00 2001
From: Weny Xu <wenymedia@gmail.com>
Date: Fri, 13 Mar 2026 16:00:09 +0800
Subject: [PATCH 07/42] test(fuzz): add metric table repartition fuzz target
 (#7754)

* test: add fuzz_repartition_metric_table target scaffold

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add metric logical lifecycle in repartition fuzz target

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: support partitioned metric tables in repartition fuzz

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add repartition loop and partition assertions for metric target

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: use shared timestamp clock in metric repartition writes

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: unify string value and bound generation for fuzzing

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: use fixed physical table name in metric repartition fuzz

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fmt

Signed-off-by: WenyXu <wenymedia@gmail.com>

* ci: update ci config

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor: use btreemap

Signed-off-by: WenyXu <wenymedia@gmail.com>

* print count result

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add csv translator for insert expr

Introduce a dedicated top-level csv translator so fuzz insert expressions can be converted into writer-ready records through a structured path instead of ad-hoc formatting in targets.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add csv dump session utilities

Introduce CSV dump env helpers and a session writer that creates run directories, emits seed metadata, and flushes staged CSV records for fuzz workflows.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: bound csv dump buffer with auto flush

Parse readable buffer sizes from env and flush staged CSV records automatically when the in-memory threshold is reached to prevent unbounded growth during long fuzz runs.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: flush csv dump before repartition validation

Wire csv dump session into the metric repartition fuzz flow so successful inserts are translated from insert expressions into CSV records during write loops and flushed to disk right before row validation.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: keep csv dumps on failure and cleanup on pass

Capture run outcomes in metric repartition fuzz, remove dump directories only after successful validation, and retain dump paths on failures so CI and local investigations can use the same artifacts.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: align partial csv records with table headers

Keep append payload compact by storing partial insert-expression columns, then expand to full table-context headers at flush time and fill missing values with empty strings.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add logs

Signed-off-by: WenyXu <wenymedia@gmail.com>

* dump csv

Signed-off-by: WenyXu <wenymedia@gmail.com>

* ci: dump csv

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: add table-scoped sql dump writer primitives

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: capture table-scoped sql traces after execution

Record insert and repartition SQL only after successful execution, include started_at_ms and elapsed_ms in trace comments, and broadcast repartition events into every logical-table trace file for consistent debugging context.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: harden sql trace comments and include create sql

Normalize multiline trace comments into valid SQL comment lines and append logical-table CREATE SQL to per-table traces for better timeline reconstruction during repartition debugging.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: dump physical create and repartition SQL traces

Signed-off-by: WenyXu <wenymedia@gmail.com>

* dump repartition sql

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: scaffold writer control channel for barrier flow

Add Barrier/Resume/Stop control skeleton and channel wiring in write_loop to prepare per-repartition validation barriers. Also align SQL dump tests with broadcast SQL payload behavior.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: implement writer barrier pause and resume control

Make writer control messages effective by pausing writes on barrier, resuming on resume, and stopping via channel signaling so the next commit can enforce deterministic per-repartition validation boundaries.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: validate rows after each repartition barrier

Add per-action barrier/ack synchronization with timeout, run immediate logical-table row validation after each repartition, and resume writer only after validation completes to improve minimal failure localization.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: flush dump sessions before per-epoch validation

Extract a shared flush-and-snapshot helper and call it before each immediate row validation so CSV/SQL artifacts are persisted at the same epoch boundary being validated.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add retry

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
---
 .github/workflows/develop.yml                 |  16 +-
 tests-fuzz/Cargo.toml                         |   7 +
 tests-fuzz/README.md                          |  20 +
 tests-fuzz/src/fake.rs                        |  20 +
 tests-fuzz/src/generator/create_expr.rs       | 129 +++-
 tests-fuzz/src/ir.rs                          |  37 +-
 tests-fuzz/src/ir/partition_expr.rs           |   6 +-
 tests-fuzz/src/ir/string_value.rs             | 162 +++++
 tests-fuzz/src/translator.rs                  |   2 +
 tests-fuzz/src/translator/csv.rs              | 121 ++++
 tests-fuzz/src/utils.rs                       |  42 ++
 tests-fuzz/src/utils/csv_dump_writer.rs       | 383 ++++++++++
 tests-fuzz/src/utils/retry.rs                 |  49 ++
 tests-fuzz/src/utils/sql_dump_writer.rs       | 267 +++++++
 .../ddl/fuzz_repartition_metric_table.rs      | 684 ++++++++++++++++++
 15 files changed, 1892 insertions(+), 53 deletions(-)
 create mode 100644 tests-fuzz/src/ir/string_value.rs
 create mode 100644 tests-fuzz/src/translator/csv.rs
 create mode 100644 tests-fuzz/src/utils/csv_dump_writer.rs
 create mode 100644 tests-fuzz/src/utils/retry.rs
 create mode 100644 tests-fuzz/src/utils/sql_dump_writer.rs
 create mode 100644 tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs

diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml
index 0238e92c8d..b6ab0f8926 100644
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -319,7 +319,13 @@ jobs:
         include:
           - target: "fuzz_repartition_table"
             mode:
-              name: "Local WAL Repartition GC"
+              name: "Local WAL mito table repartition"
+              minio: true
+              kafka: false
+              values: "with-minio-repartition-gc.yaml"
+          - target: "fuzz_repartition_metric_table"
+            mode:
+              name: "Local WAL metric table repartition"
               minio: true
               kafka: false
               values: "with-minio-repartition-gc.yaml"
@@ -455,6 +461,14 @@ jobs:
           path: /tmp/fuzz-monitor-dumps
           if-no-files-found: warn
           retention-days: 3
+      - name: Upload CSV dumps
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzz-tests-csv-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
+          path: /tmp/greptime-fuzz-dumps
+          if-no-files-found: warn
+          retention-days: 3
       - name: Delete cluster
         if: success()
         shell: bash
diff --git a/tests-fuzz/Cargo.toml b/tests-fuzz/Cargo.toml
index a537ca0687..bc687092c0 100644
--- a/tests-fuzz/Cargo.toml
+++ b/tests-fuzz/Cargo.toml
@@ -100,6 +100,13 @@ test = false
 bench = false
 doc = false
 
+[[bin]]
+name = "fuzz_repartition_metric_table"
+path = "targets/ddl/fuzz_repartition_metric_table.rs"
+test = false
+bench = false
+doc = false
+
 [[bin]]
 name = "fuzz_alter_table"
 path = "targets/ddl/fuzz_alter_table.rs"
diff --git a/tests-fuzz/README.md b/tests-fuzz/README.md
index 6807e19a1c..cc9d7eb84e 100644
--- a/tests-fuzz/README.md
+++ b/tests-fuzz/README.md
@@ -66,3 +66,23 @@ GT_FUZZ_OVERRIDE_SEED=6666 GT_FUZZ_OVERRIDE_ACTIONS=175 cargo fuzz run fuzz_targ
 ```
 
 For more details, visit [cargo fuzz](https://rust-fuzz.github.io/book/cargo-fuzz/tutorial.html) or run the command `cargo fuzz --help`.
+
+## Repartition Metric Dump Artifacts
+
+For `fuzz_repartition_metric_table`, dump artifacts are written under one run directory.
+
+- Table data snapshots: `<logical_table>.table-data.csv`
+- SQL traces per logical table: `<logical_table>.trace.sql`
+- Seed metadata: `seed.meta`
+
+SQL trace behavior:
+
+- Insert SQL is appended after successful execution with comment fields including
+  `started_at_ms` and `elapsed_ms`.
+- Repartition events are broadcast to all logical table trace files with comment fields including
+  `action_idx`, `started_at_ms`, `elapsed_ms`, and SQL text.
+
+Run directory lifecycle:
+
+- On success, the run directory is cleaned up.
+- On failure, the run directory is retained for CI/local diffing.
diff --git a/tests-fuzz/src/fake.rs b/tests-fuzz/src/fake.rs
index aa92e0293a..8910a39206 100644
--- a/tests-fuzz/src/fake.rs
+++ b/tests-fuzz/src/fake.rs
@@ -65,6 +65,26 @@ where
     _v: PhantomData<V>,
 }
 
+pub struct ConstGenerator<V> {
+    value: V,
+}
+
+impl<V> ConstGenerator<V> {
+    pub fn new(value: V) -> Self {
+        Self { value }
+    }
+}
+
+impl<R, V> Random<V, R> for ConstGenerator<V>
+where
+    R: Rng,
+    V: Clone,
+{
+    fn choose(&self, _rng: &mut R, amount: usize) -> Vec<V> {
+        vec![self.value.clone(); amount]
+    }
+}
+
 pub fn random_capitalize_map<R: Rng + 'static>(rng: &mut R, s: Ident) -> Ident {
     let mut v = s.value.chars().collect::<Vec<_>>();
 
diff --git a/tests-fuzz/src/generator/create_expr.rs b/tests-fuzz/src/generator/create_expr.rs
index fae6a95eda..261a310db2 100644
--- a/tests-fuzz/src/generator/create_expr.rs
+++ b/tests-fuzz/src/generator/create_expr.rs
@@ -193,6 +193,26 @@ fn generate_partition_def(
     }
 }
 
+fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> {
+    if partitions <= 1 {
+        return None;
+    }
+
+    let partition_column = Column {
+        name: Ident::new("host"),
+        column_type: ConcreteDataType::string_datatype(),
+        options: vec![ColumnOption::PrimaryKey],
+    };
+    let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1);
+    let partitions = SimplePartitions::new(partition_column.name.clone(), bounds);
+    let partition_def = PartitionDef {
+        columns: vec![partitions.column_name.clone()],
+        exprs: partitions.generate().unwrap(),
+    };
+
+    Some((partition_column, partition_def))
+}
+
 /// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type.
 #[derive(Builder)]
 #[builder(pattern = "owned")]
@@ -201,6 +221,8 @@ pub struct CreatePhysicalTableExprGenerator<R: Rng + 'static> {
     name_generator: Box<dyn Random<Ident, R>>,
     #[builder(default = "false")]
     if_not_exists: bool,
+    #[builder(default = "0")]
+    partition: usize,
     #[builder(default, setter(into))]
     with_clause: HashMap<String, String>,
 }
@@ -215,25 +237,35 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreatePhysicalTableExpr
             options.insert(key.clone(), Value::from(value.clone()));
         }
 
+        let mut columns = vec![
+            Column {
+                name: Ident::new("ts"),
+                column_type: ConcreteDataType::timestamp_millisecond_datatype(),
+                options: vec![ColumnOption::TimeIndex],
+            },
+            Column {
+                name: Ident::new("val"),
+                column_type: ConcreteDataType::float64_datatype(),
+                options: vec![],
+            },
+        ];
+
+        let mut partition = None;
+        let mut primary_keys = vec![];
+        if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) {
+            columns.push(partition_column);
+            partition = Some(partition_def);
+            primary_keys.push(columns.len() - 1);
+        }
+
         Ok(CreateTableExpr {
             table_name: self.name_generator.generate(rng),
-            columns: vec![
-                Column {
-                    name: Ident::new("ts"),
-                    column_type: ConcreteDataType::timestamp_millisecond_datatype(),
-                    options: vec![ColumnOption::TimeIndex],
-                },
-                Column {
-                    name: Ident::new("val"),
-                    column_type: ConcreteDataType::float64_datatype(),
-                    options: vec![],
-                },
-            ],
+            columns,
             if_not_exists: self.if_not_exists,
-            partition: None,
+            partition,
             engine: "metric".to_string(),
             options,
-            primary_keys: vec![],
+            primary_keys,
         })
     }
 }
@@ -245,6 +277,8 @@ pub struct CreateLogicalTableExprGenerator<R: Rng + 'static> {
     physical_table_ctx: TableContextRef,
     labels: usize,
     if_not_exists: bool,
+    #[builder(default = "true")]
+    include_partition_column: bool,
     #[builder(default = "Box::new(WordGenerator)")]
     name_generator: Box<dyn Random<Ident, R>>,
 }
@@ -253,11 +287,11 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateLogicalTableExprG
     type Error = Error;
 
     fn generate(&self, rng: &mut R) -> Result<CreateTableExpr> {
-        // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical table must have two columns.
+        // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical table must have ts and val.
         ensure!(
-            self.physical_table_ctx.columns.len() == 2,
+            self.physical_table_ctx.columns.len() >= 2,
             error::UnexpectedSnafu {
-                violated: "The physical table must have two columns"
+                violated: "The physical table must have at least two columns"
             }
         );
 
@@ -265,9 +299,16 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateLogicalTableExprG
         let logical_table_name = self
             .physical_table_ctx
             .generate_unique_table_name(rng, self.name_generator.as_ref());
+        let mut physical_columns = self.physical_table_ctx.columns.clone();
+        if !self.include_partition_column
+            && let Some(partition_def) = &self.physical_table_ctx.partition
+        {
+            physical_columns.retain(|column| !partition_def.columns.contains(&column.name));
+        }
+
         let mut logical_table = CreateTableExpr {
             table_name: logical_table_name,
-            columns: self.physical_table_ctx.columns.clone(),
+            columns: physical_columns,
             if_not_exists: self.if_not_exists,
             partition: None,
             engine: "metric".to_string(),
@@ -459,6 +500,58 @@ mod tests {
         }));
     }
 
+    #[test]
+    fn test_create_physical_table_expr_generator_with_partition() {
+        let mut rng = rand::rng();
+        let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default()
+            .partition(3)
+            .if_not_exists(false)
+            .build()
+            .unwrap()
+            .generate(&mut rng)
+            .unwrap();
+
+        assert_eq!(physical_table_expr.engine, "metric");
+        assert!(physical_table_expr.partition.is_some());
+        assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3);
+    }
+
+    #[test]
+    fn test_create_logical_table_expr_generator_without_partition_column() {
+        let mut rng = rand::rng();
+        let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default()
+            .partition(3)
+            .if_not_exists(false)
+            .build()
+            .unwrap()
+            .generate(&mut rng)
+            .unwrap();
+        let partition_columns = physical_table_expr
+            .partition
+            .as_ref()
+            .unwrap()
+            .columns
+            .clone();
+        let physical_table_ctx = Arc::new(TableContext::from(&physical_table_expr));
+
+        let logical_table_expr = CreateLogicalTableExprGeneratorBuilder::default()
+            .physical_table_ctx(physical_table_ctx)
+            .labels(3)
+            .include_partition_column(false)
+            .if_not_exists(false)
+            .build()
+            .unwrap()
+            .generate(&mut rng)
+            .unwrap();
+
+        assert!(
+            logical_table_expr
+                .columns
+                .iter()
+                .all(|column| !partition_columns.contains(&column.name))
+        );
+    }
+
     #[test]
     fn test_create_logical_table_expr_generator_deterministic() {
         let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0);
diff --git a/tests-fuzz/src/ir.rs b/tests-fuzz/src/ir.rs
index e8c15dcf95..ce1628cd61 100644
--- a/tests-fuzz/src/ir.rs
+++ b/tests-fuzz/src/ir.rs
@@ -20,6 +20,7 @@ pub(crate) mod insert_expr;
 pub(crate) mod partition_expr;
 pub(crate) mod repartition_expr;
 pub(crate) mod select_expr;
+pub(crate) mod string_value;
 
 use core::fmt;
 use std::collections::HashMap;
@@ -126,20 +127,7 @@ pub fn generate_partition_bounds(datatype: &ConcreteDataType, bounds: usize) ->
         ConcreteDataType::Int64(_) => generate_values!(i64, bounds),
         ConcreteDataType::Float32(_) => generate_values!(f32, bounds),
         ConcreteDataType::Float64(_) => generate_values!(f64, bounds),
-        ConcreteDataType::String(_) => {
-            let base = b'A';
-            let range = b'z' - b'A';
-            let step = range / (bounds as u8 + 1);
-            (1..=bounds)
-                .map(|i| {
-                    Value::from(
-                        char::from(base + step * i as u8)
-                            .escape_default()
-                            .to_string(),
-                    )
-                })
-                .collect()
-        }
+        ConcreteDataType::String(_) => string_value::generate_partition_bounds(bounds),
         _ => unimplemented!("unsupported type: {datatype}"),
     }
 }
@@ -157,10 +145,7 @@ pub fn generate_random_value<R: Rng>(
         ConcreteDataType::Int64(_) => Value::from(rng.random::<i64>()),
         ConcreteDataType::Float32(_) => Value::from(rng.random::<f32>()),
         ConcreteDataType::Float64(_) => Value::from(rng.random::<f64>()),
-        ConcreteDataType::String(_) => match random_str {
-            Some(random) => Value::from(random.generate(rng).value),
-            None => Value::from(rng.random::<char>().to_string()),
-        },
+        ConcreteDataType::String(_) => string_value::generate_data_string_value(rng, random_str),
         ConcreteDataType::Date(_) => generate_random_date(rng),
 
         _ => unimplemented!("unsupported type: {datatype}"),
@@ -341,21 +326,7 @@ pub fn generate_partition_value<R: Rng + 'static>(
             }
         }
         datatypes::data_type::ConcreteDataType::String(_) => {
-            let upper = match first {
-                datatypes::value::Value::String(v) => v.as_utf8(),
-                _ => "",
-            };
-            if bound_idx == 0 {
-                if upper <= "A" {
-                    datatypes::value::Value::from("")
-                } else {
-                    datatypes::value::Value::from("A")
-                }
-            } else if bound_idx < bounds.len() {
-                bounds[bound_idx - 1].clone()
-            } else {
-                last.clone()
-            }
+            string_value::generate_partition_value(bounds, bound_idx)
         }
         _ => unimplemented!("unsupported partition column type: {column_type}"),
     }
diff --git a/tests-fuzz/src/ir/partition_expr.rs b/tests-fuzz/src/ir/partition_expr.rs
index c91dd487ae..908223366c 100644
--- a/tests-fuzz/src/ir/partition_expr.rs
+++ b/tests-fuzz/src/ir/partition_expr.rs
@@ -20,7 +20,7 @@ use snafu::ensure;
 
 use crate::context::TableContext;
 use crate::error::{self, Result};
-use crate::ir::{Ident, generate_random_value};
+use crate::ir::{Ident, generate_random_value, string_value};
 
 /// A partitioning scheme that divides a single column into multiple ranges based on provided bounds.
 ///
@@ -245,6 +245,10 @@ pub fn generate_unique_bound<R: Rng + 'static>(
     datatype: &ConcreteDataType,
     bounds: &[Value],
 ) -> Result<Value> {
+    if matches!(datatype, ConcreteDataType::String(_)) {
+        return string_value::generate_unique_partition_bound(rng, bounds);
+    }
+
     for _ in 0..16 {
         let candidate = generate_random_value(rng, datatype, None);
         if !bounds.contains(&candidate) {
diff --git a/tests-fuzz/src/ir/string_value.rs b/tests-fuzz/src/ir/string_value.rs
new file mode 100644
index 0000000000..6a53aa69de
--- /dev/null
+++ b/tests-fuzz/src/ir/string_value.rs
@@ -0,0 +1,162 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use datatypes::value::Value;
+use rand::Rng;
+
+use crate::error::{self, Result};
+use crate::generator::Random;
+use crate::ir::Ident;
+
+const READABLE_CHARSET: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+fn readable_token(index: usize) -> String {
+    let base = READABLE_CHARSET.len();
+    let mut n = index + 1;
+    let mut buf = Vec::new();
+
+    while n > 0 {
+        let rem = (n - 1) % base;
+        buf.push(READABLE_CHARSET[rem] as char);
+        n = (n - 1) / base;
+    }
+
+    buf.iter().rev().collect()
+}
+
+pub fn generate_data_string_value<R: Rng>(
+    rng: &mut R,
+    random_str: Option<&dyn Random<Ident, R>>,
+) -> Value {
+    match random_str {
+        Some(random) => Value::from(random.generate(rng).value),
+        None => {
+            let idx = rng.random_range(0..(READABLE_CHARSET.len() * READABLE_CHARSET.len() * 4));
+            Value::from(readable_token(idx))
+        }
+    }
+}
+
+/// Generates readable string bounds; they ascend lexicographically only when `bounds <= 15`.
+pub fn generate_partition_bounds(bounds: usize) -> Vec<Value> {
+    let token_space = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024;
+    (1..=bounds)
+        .map(|i| {
+            let idx = i * token_space / (bounds + 1);
+            Value::from(readable_token(idx))
+        })
+        .collect()
+}
+
+/// Picks a representative string value for partition `bound_idx`; panics when `bounds` is empty.
+pub fn generate_partition_value(bounds: &[Value], bound_idx: usize) -> Value {
+    let first = bounds.first().unwrap();
+    let last = bounds.last().unwrap();
+    let upper = match first {
+        Value::String(v) => v.as_utf8(),
+        _ => "",
+    };
+
+    if bound_idx == 0 {
+        if upper <= "0" {
+            Value::from("")
+        } else {
+            Value::from("0")
+        }
+    } else if bound_idx < bounds.len() {
+        bounds[bound_idx - 1].clone()
+    } else {
+        last.clone()
+    }
+}
+
+/// Returns the first readable token absent from `bounds`, scanning upward from a random start index.
+pub fn generate_unique_partition_bound<R: Rng>(rng: &mut R, bounds: &[Value]) -> Result<Value> {
+    let search_space = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024;
+    let start = rng.random_range(0..search_space);
+    for offset in 0..search_space {
+        let idx = start + offset;
+        let candidate = Value::from(readable_token(idx));
+        if !bounds.contains(&candidate) {
+            return Ok(candidate);
+        }
+    }
+
+    error::UnexpectedSnafu {
+        violated: "unable to generate unique string partition bound".to_string(),
+    }
+    .fail()
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::SeedableRng;
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+
+    #[test]
+    fn test_readable_token_grows_length() {
+        assert_eq!("0", readable_token(0));
+        assert_eq!("9", readable_token(9));
+        assert_eq!("A", readable_token(10));
+        assert_eq!("z", readable_token(61));
+        assert_eq!("00", readable_token(62));
+    }
+
+    #[test]
+    fn test_generate_partition_bounds_are_readable_and_unique() {
+        let bounds = generate_partition_bounds(8);
+        assert_eq!(8, bounds.len());
+
+        let mut values = bounds
+            .iter()
+            .map(|v| match v {
+                Value::String(s) => s.as_utf8().to_string(),
+                _ => panic!("expected string value"),
+            })
+            .collect::<Vec<_>>();
+        let mut dedup = values.clone();
+        dedup.sort();
+        dedup.dedup();
+        assert_eq!(values.len(), dedup.len());
+
+        for s in values.drain(..) {
+            assert!(s.chars().all(|c| c.is_ascii_alphanumeric()));
+        }
+    }
+
+    #[test]
+    fn test_generate_partition_value_for_string_bounds() {
+        let bounds = vec![Value::from("A"), Value::from("M")];
+        assert_eq!(Value::from("0"), generate_partition_value(&bounds, 0));
+        assert_eq!(Value::from("A"), generate_partition_value(&bounds, 1));
+        assert_eq!(Value::from("M"), generate_partition_value(&bounds, 2));
+    }
+
+    #[test]
+    fn test_generate_unique_partition_bound_not_in_existing() {
+        let mut rng = ChaCha8Rng::seed_from_u64(42);
+        let bounds = vec![Value::from("0"), Value::from("1"), Value::from("2")];
+        let candidate = generate_unique_partition_bound(&mut rng, &bounds).unwrap();
+        assert!(!bounds.contains(&candidate));
+        match candidate {
+            Value::String(s) => {
+                assert!(!s.as_utf8().is_empty());
+                assert!(s.as_utf8().chars().all(|c| c.is_ascii_alphanumeric()));
+            }
+            _ => panic!("expected string value"),
+        }
+    }
+}
diff --git a/tests-fuzz/src/translator.rs b/tests-fuzz/src/translator.rs
index 673b543f2c..4c5e0bb6a4 100644
--- a/tests-fuzz/src/translator.rs
+++ b/tests-fuzz/src/translator.rs
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 mod common;
+/// Translator that converts insert expressions into CSV records.
+pub mod csv;
 pub mod mysql;
 pub mod postgres;
 
diff --git a/tests-fuzz/src/translator/csv.rs b/tests-fuzz/src/translator/csv.rs
new file mode 100644
index 0000000000..e95956862c
--- /dev/null
+++ b/tests-fuzz/src/translator/csv.rs
@@ -0,0 +1,121 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::error::Error;
+use crate::ir::insert_expr::{InsertIntoExpr, RowValue};
+use crate::translator::DslTranslator;
+
+/// One CSV record converted from an insert row.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CsvRecord {
+    /// Cell values in column order.
+    pub values: Vec<String>,
+}
+
+/// CSV records converted from an insert expression.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CsvRecords {
+    /// Target table name from insert expression.
+    pub table_name: String,
+    /// Header values from insert columns.
+    pub headers: Vec<String>,
+    /// Converted row records.
+    pub records: Vec<CsvRecord>,
+}
+
+/// Translates `InsertIntoExpr` into CSV-writer-ready records.
+pub struct InsertExprToCsvRecordsTranslator;
+
+impl DslTranslator<InsertIntoExpr, CsvRecords> for InsertExprToCsvRecordsTranslator {
+    type Error = Error;
+
+    fn translate(&self, input: &InsertIntoExpr) -> Result<CsvRecords, Self::Error> {
+        let headers = input
+            .columns
+            .iter()
+            .map(|column| column.name.to_string())
+            .collect::<Vec<_>>();
+        let records = input
+            .values_list
+            .iter()
+            .map(|row| CsvRecord {
+                values: row.iter().map(Self::format_row_value).collect(),
+            })
+            .collect::<Vec<_>>();
+
+        Ok(CsvRecords {
+            table_name: input.table_name.to_string(),
+            headers,
+            records,
+        })
+    }
+}
+
+impl InsertExprToCsvRecordsTranslator {
+    fn format_row_value(value: &RowValue) -> String {
+        match value {
+            RowValue::Value(datatypes::value::Value::Null) => String::new(),
+            RowValue::Value(v) => v.to_string(),
+            RowValue::Default => "DEFAULT".to_string(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datatypes::data_type::ConcreteDataType;
+
+    use super::InsertExprToCsvRecordsTranslator;
+    use crate::ir::create_expr::ColumnOption;
+    use crate::ir::insert_expr::{InsertIntoExpr, RowValue};
+    use crate::ir::{Column, Ident};
+    use crate::translator::DslTranslator;
+
+    #[test]
+    fn test_translate_insert_expr_to_csv_records() {
+        let input = InsertIntoExpr {
+            table_name: Ident::new("metric_a"),
+            omit_column_list: false,
+            columns: vec![
+                Column {
+                    name: "host".into(),
+                    column_type: ConcreteDataType::string_datatype(),
+                    options: vec![ColumnOption::PrimaryKey],
+                },
+                Column {
+                    name: "value".into(),
+                    column_type: ConcreteDataType::float64_datatype(),
+                    options: vec![],
+                },
+            ],
+            values_list: vec![
+                vec![
+                    RowValue::Value(datatypes::value::Value::String("web-1".into())),
+                    RowValue::Value(datatypes::value::Value::Int32(15)),
+                ],
+                vec![
+                    RowValue::Value(datatypes::value::Value::Null),
+                    RowValue::Default,
+                ],
+            ],
+        };
+
+        let output = InsertExprToCsvRecordsTranslator.translate(&input).unwrap();
+        assert_eq!(output.table_name, "metric_a");
+        assert_eq!(output.headers, vec!["host", "value"]);
+        assert_eq!(output.records.len(), 2);
+        assert_eq!(output.records[0].values, vec!["web-1", "15"]);
+        assert_eq!(output.records[1].values, vec!["", "DEFAULT"]);
+    }
+}
diff --git a/tests-fuzz/src/utils.rs b/tests-fuzz/src/utils.rs
index 0780f6c93d..d55abab3c2 100644
--- a/tests-fuzz/src/utils.rs
+++ b/tests-fuzz/src/utils.rs
@@ -15,6 +15,8 @@
 pub mod cluster_info;
 pub mod config;
 pub mod crd;
+/// CSV dump writer utilities for fuzz tests.
+pub mod csv_dump_writer;
 pub mod health;
 pub mod migration;
 pub mod partition;
@@ -22,10 +24,15 @@ pub mod pod_failure;
 pub mod procedure;
 #[cfg(feature = "unstable")]
 pub mod process;
+pub mod retry;
+/// SQL dump writer utilities for fuzz tests.
+pub mod sql_dump_writer;
 pub mod wait;
 
 use std::env;
+use std::str::FromStr;
 
+use common_base::readable_size::ReadableSize;
 use common_telemetry::info;
 use common_telemetry::tracing::log::LevelFilter;
 use paste::paste;
@@ -126,6 +133,14 @@ pub const GT_FUZZ_INPUT_MAX_COLUMNS: &str = "GT_FUZZ_INPUT_MAX_COLUMNS";
 pub const GT_FUZZ_INPUT_MAX_ALTER_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_ALTER_ACTIONS";
 pub const GT_FUZZ_INPUT_MAX_INSERT_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_INSERT_ACTIONS";
 pub const FUZZ_OVERRIDE_PREFIX: &str = "GT_FUZZ_OVERRIDE_";
+/// Enables CSV dump generation for fuzz runs.
+pub const GT_FUZZ_DUMP_TABLE_CSV: &str = "GT_FUZZ_DUMP_TABLE_CSV";
+/// Base directory for CSV dump sessions.
+pub const GT_FUZZ_DUMP_DIR: &str = "GT_FUZZ_DUMP_DIR";
+/// Directory suffix used by one CSV dump session.
+pub const GT_FUZZ_DUMP_SUFFIX: &str = "GT_FUZZ_DUMP_SUFFIX";
+/// Max in-memory CSV buffer size before auto flush.
+pub const GT_FUZZ_DUMP_BUFFER_MAX_BYTES: &str = "GT_FUZZ_DUMP_BUFFER_MAX_BYTES";
 
 /// Reads an override value for a fuzz parameter from env `GT_FUZZ_OVERRIDE_<NAME>`.
 pub fn get_fuzz_override<T>(name: &str) -> Option<T>
@@ -137,6 +152,33 @@ where
     env::var(&key).ok().and_then(|v| v.parse().ok())
 }
 
+/// Returns CSV dump base directory.
+pub fn get_gt_fuzz_dump_dir() -> String {
+    let _ = dotenv::dotenv();
+    env::var(GT_FUZZ_DUMP_DIR).unwrap_or_else(|_| "/tmp/greptime-fuzz-dumps".to_string())
+}
+
+/// Returns CSV dump directory suffix.
+pub fn get_gt_fuzz_dump_suffix() -> String {
+    let _ = dotenv::dotenv();
+    env::var(GT_FUZZ_DUMP_SUFFIX).unwrap_or_else(|_| ".repartition-metric-csv".to_string())
+}
+
+/// Returns max CSV buffer size in bytes (plain integer or `ReadableSize` string); defaults to 8 MiB.
+pub fn get_gt_fuzz_dump_buffer_max_bytes() -> usize {
+    let _ = dotenv::dotenv();
+    env::var(GT_FUZZ_DUMP_BUFFER_MAX_BYTES)
+        .ok()
+        .and_then(|value| {
+            value.parse::<usize>().ok().or_else(|| {
+                ReadableSize::from_str(&value)
+                    .ok()
+                    .map(|size| size.as_bytes() as usize)
+            })
+        })
+        .unwrap_or(8 * 1024 * 1024)
+}
+
 macro_rules! make_get_from_env_helper {
     ($key:expr, $default: expr) => {
         paste! {
diff --git a/tests-fuzz/src/utils/csv_dump_writer.rs b/tests-fuzz/src/utils/csv_dump_writer.rs
new file mode 100644
index 0000000000..de16a23c24
--- /dev/null
+++ b/tests-fuzz/src/utils/csv_dump_writer.rs
@@ -0,0 +1,383 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::{HashMap, HashSet};
+use std::fs::{File, OpenOptions, create_dir_all, remove_dir_all};
+use std::io::Write;
+use std::path::{Path, PathBuf};
+
+use common_telemetry::{info, warn};
+use common_time::util::current_time_millis;
+use snafu::ResultExt;
+
+use crate::error::{self, Result};
+use crate::translator::csv::CsvRecords;
+use crate::utils::{
+    get_gt_fuzz_dump_buffer_max_bytes, get_gt_fuzz_dump_dir, get_gt_fuzz_dump_suffix,
+};
+
+/// Metadata for one CSV dump session.
+#[derive(Debug, Clone)]
+pub struct CsvDumpMetadata {
+    /// Fuzz target name.
+    pub target: String,
+    /// Seed used by current fuzz input.
+    pub seed: u64,
+    /// Repartition action count.
+    pub actions: usize,
+    /// Initial partition count.
+    pub partitions: usize,
+    /// Logical table count.
+    pub tables: usize,
+    /// Session start time in unix milliseconds.
+    pub started_at_unix_ms: i64,
+}
+
+impl CsvDumpMetadata {
+    /// Builds dump metadata with current timestamp.
+    pub fn new(
+        target: impl Into<String>,
+        seed: u64,
+        actions: usize,
+        partitions: usize,
+        tables: usize,
+    ) -> Self {
+        Self {
+            target: target.into(),
+            seed,
+            actions,
+            partitions,
+            tables,
+            started_at_unix_ms: current_time_millis(),
+        }
+    }
+}
+
+/// Session writer for staged CSV dump records.
+#[derive(Debug)]
+pub struct CsvDumpSession {
+    /// Session metadata.
+    pub metadata: CsvDumpMetadata,
+    /// Session directory path.
+    pub run_dir: PathBuf,
+    /// Max in-memory buffer size before auto flush.
+    pub max_buffer_bytes: usize,
+    records: Vec<CsvRecords>,
+    buffered_bytes: usize,
+    written_tables: HashSet<String>,
+    full_headers_by_table: HashMap<String, Vec<String>>,
+}
+
+impl CsvDumpSession {
+    /// Creates session directory and writes seed metadata file.
+    pub fn new(metadata: CsvDumpMetadata) -> Result<Self> {
+        Self::new_with_buffer_limit(metadata, get_gt_fuzz_dump_buffer_max_bytes())
+    }
+
+    /// Creates session with a custom in-memory buffer limit.
+    pub fn new_with_buffer_limit(
+        metadata: CsvDumpMetadata,
+        max_buffer_bytes: usize,
+    ) -> Result<Self> {
+        let run_dir = build_run_dir(&metadata);
+        create_dir_all(&run_dir).context(error::CreateFileSnafu {
+            path: run_dir.to_string_lossy().to_string(),
+        })?;
+        write_seed_meta(&run_dir, &metadata)?;
+        info!(
+            "Create csv dump session, target: {}, run_dir: {}, max_buffer_bytes: {}",
+            metadata.target,
+            run_dir.display(),
+            max_buffer_bytes
+        );
+
+        Ok(Self {
+            metadata,
+            run_dir,
+            max_buffer_bytes,
+            records: Vec::new(),
+            buffered_bytes: 0,
+            written_tables: HashSet::new(),
+            full_headers_by_table: HashMap::new(),
+        })
+    }
+
+    /// Buffers one batch; keeps the first `full_headers` seen per table and flushes once the limit is hit.
+    pub fn append(&mut self, records: CsvRecords, full_headers: Vec<String>) -> Result<()> {
+        self.full_headers_by_table
+            .entry(records.table_name.clone())
+            .or_insert(full_headers);
+        self.buffered_bytes += estimate_csv_records_size(&records);
+        self.records.push(records);
+        if self.buffered_bytes >= self.max_buffer_bytes {
+            self.flush_buffered_records()?;
+        }
+        Ok(())
+    }
+
+    /// Flushes all appended batches to CSV files.
+    pub fn flush_all(&mut self) -> Result<()> {
+        self.flush_buffered_records()
+    }
+
+    /// Removes session directory after successful validation.
+    pub fn cleanup_on_success(&self) -> std::io::Result<()> {
+        match remove_dir_all(&self.run_dir) {
+            Ok(_) => {
+                info!(
+                    "Cleanup csv dump directory on success: {}",
+                    self.run_dir.display()
+                );
+                Ok(())
+            }
+            Err(err) => {
+                warn!(
+                    "Cleanup csv dump directory failed: {}, error: {:?}",
+                    self.run_dir.display(),
+                    err
+                );
+                Err(err)
+            }
+        }
+    }
+
+    fn flush_buffered_records(&mut self) -> Result<()> {
+        if self.records.is_empty() {
+            return Ok(());
+        }
+        for batch in &self.records {
+            write_batch_csv(
+                &self.run_dir,
+                batch,
+                &mut self.written_tables,
+                &self.full_headers_by_table,
+            )?;
+        }
+        self.records.clear();
+        self.buffered_bytes = 0;
+        Ok(())
+    }
+}
+
+fn write_seed_meta(run_dir: &Path, metadata: &CsvDumpMetadata) -> Result<()> {
+    let path = run_dir.join("seed.meta");
+    let mut file = File::create(&path).context(error::CreateFileSnafu {
+        path: path.to_string_lossy().to_string(),
+    })?;
+
+    let content = format!(
+        "target={}\nseed={}\nactions={}\npartitions={}\ntables={}\nstarted_at_unix_ms={}\n",
+        metadata.target,
+        metadata.seed,
+        metadata.actions,
+        metadata.partitions,
+        metadata.tables,
+        metadata.started_at_unix_ms,
+    );
+    file.write_all(content.as_bytes())
+        .context(error::WriteFileSnafu {
+            path: path.to_string_lossy().to_string(),
+        })
+}
+
+/// Appends one batch of records to `<run_dir>/<sanitized table name>.table-data.csv`.
+///
+/// The output columns are the table's entry in `full_headers_by_table` when
+/// present, otherwise the batch's own headers. Each record is aligned to the
+/// output columns by header name; columns the batch does not carry are written
+/// as empty cells.
+///
+/// The header line is emitted the first time a table is written, tracked through
+/// `written_tables`. The table is only recorded there after the write succeeded,
+/// so a failed write does not leave the file permanently without a header.
+///
+/// Returns an error if the file cannot be opened or written.
+fn write_batch_csv(
+    run_dir: &Path,
+    batch: &CsvRecords,
+    written_tables: &mut HashSet<String>,
+    full_headers_by_table: &HashMap<String, Vec<String>>,
+) -> Result<()> {
+    let output_headers: &[String] = full_headers_by_table
+        .get(&batch.table_name)
+        .map(Vec::as_slice)
+        .unwrap_or(batch.headers.as_slice());
+    let file_name = format!("{}.table-data.csv", sanitize_file_name(&batch.table_name));
+    let path = run_dir.join(file_name);
+    let mut file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&path)
+        .context(error::CreateFileSnafu {
+            path: path.to_string_lossy().to_string(),
+        })?;
+
+    // Assemble the whole batch in memory so the file receives a single write
+    // instead of two small writes per row.
+    let mut payload = String::new();
+    let needs_header = !written_tables.contains(&batch.table_name);
+    if needs_header {
+        payload.push_str(&join_line(output_headers));
+        payload.push('\n');
+    }
+
+    // Position of each header within the batch's own column order.
+    let header_index = batch
+        .headers
+        .iter()
+        .enumerate()
+        .map(|(idx, header)| (header.as_str(), idx))
+        .collect::<HashMap<_, _>>();
+
+    for record in &batch.records {
+        let aligned_values = output_headers
+            .iter()
+            .map(|header| {
+                header_index
+                    .get(header.as_str())
+                    .and_then(|idx| record.values.get(*idx))
+                    .cloned()
+                    .unwrap_or_default()
+            })
+            .collect::<Vec<_>>();
+        payload.push_str(&join_line(&aligned_values));
+        payload.push('\n');
+    }
+
+    file.write_all(payload.as_bytes())
+        .context(error::WriteFileSnafu {
+            path: path.to_string_lossy().to_string(),
+        })?;
+
+    if needs_header {
+        written_tables.insert(batch.table_name.clone());
+    }
+
+    Ok(())
+}
+
+/// Estimates the in-memory size of `records` as the summed byte length of all
+/// header strings and all cell values.
+fn estimate_csv_records_size(records: &CsvRecords) -> usize {
+    let header_bytes: usize = records.headers.iter().map(|header| header.len()).sum();
+    let cell_bytes: usize = records
+        .records
+        .iter()
+        .map(|record| record.values.iter().map(|value| value.len()).sum::<usize>())
+        .sum();
+    header_bytes + cell_bytes
+}
+
+/// Renders `cells` as one CSV line (without a trailing newline): every cell is
+/// escaped and the cells are separated by commas.
+fn join_line(cells: &[String]) -> String {
+    let mut line = String::new();
+    for (idx, cell) in cells.iter().enumerate() {
+        if idx > 0 {
+            line.push(',');
+        }
+        line.push_str(&escape_csv_cell(cell));
+    }
+    line
+}
+
+/// Escapes a single CSV cell.
+///
+/// A value containing a comma, double quote, line feed or carriage return is
+/// wrapped in double quotes with embedded quotes doubled; any other value is
+/// returned unchanged.
+fn escape_csv_cell(value: &str) -> String {
+    let needs_quoting = value
+        .chars()
+        .any(|ch| matches!(ch, ',' | '"' | '\n' | '\r'));
+    if !needs_quoting {
+        return value.to_owned();
+    }
+
+    let mut quoted = String::with_capacity(value.len() + 2);
+    quoted.push('"');
+    for ch in value.chars() {
+        if ch == '"' {
+            quoted.push('"');
+        }
+        quoted.push(ch);
+    }
+    quoted.push('"');
+    quoted
+}
+
+/// Produces a file-name-safe version of `raw`: ASCII letters, ASCII digits, `_`
+/// and `-` are kept, every other character is replaced by `_`.
+fn sanitize_file_name(raw: &str) -> String {
+    let mut sanitized = String::with_capacity(raw.len());
+    for ch in raw.chars() {
+        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-');
+        sanitized.push(if keep { ch } else { '_' });
+    }
+    sanitized
+}
+
+/// Builds the directory path for one dump run.
+///
+/// The directory lives under the configured dump directory and is named from
+/// the run's target, seed, action count and start timestamp, followed by the
+/// configured suffix.
+fn build_run_dir(metadata: &CsvDumpMetadata) -> PathBuf {
+    let mut run_dir = PathBuf::from(get_gt_fuzz_dump_dir());
+    let suffix = get_gt_fuzz_dump_suffix();
+    run_dir.push(format!(
+        "{}_seed_{}_actions_{}_ts_{}{}",
+        metadata.target, metadata.seed, metadata.actions, metadata.started_at_unix_ms, suffix
+    ));
+    run_dir
+}
+
+// Unit tests for the CSV dump session: explicit flush, automatic flush on the
+// buffer limit, and header alignment against the full column list.
+#[cfg(test)]
+mod tests {
+    use super::{CsvDumpMetadata, CsvDumpSession};
+    use crate::translator::csv::{CsvRecord, CsvRecords};
+
+    // An explicit flush must leave the run directory, `seed.meta` and the
+    // table's CSV file on disk.
+    #[test]
+    fn test_create_session_and_flush() {
+        let mut session = CsvDumpSession::new_with_buffer_limit(
+            CsvDumpMetadata::new("fuzz_case", 1, 2, 3, 4),
+            1024,
+        )
+        .unwrap();
+        session
+            .append(
+                CsvRecords {
+                    table_name: "metric-a".to_string(),
+                    headers: vec!["host".to_string(), "value".to_string()],
+                    records: vec![CsvRecord {
+                        values: vec!["web-1".to_string(), "10".to_string()],
+                    }],
+                },
+                vec!["host".to_string(), "value".to_string()],
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        assert!(session.run_dir.exists());
+        assert!(session.run_dir.join("seed.meta").exists());
+        assert!(session.run_dir.join("metric-a.table-data.csv").exists());
+    }
+
+    // With a 1-byte buffer limit, a single append must already write the CSV
+    // file and reset the buffered byte counter, without calling `flush_all`.
+    #[test]
+    fn test_auto_flush_on_buffer_limit() {
+        let mut session =
+            CsvDumpSession::new_with_buffer_limit(CsvDumpMetadata::new("fuzz_case", 5, 2, 3, 4), 1)
+                .unwrap();
+        session
+            .append(
+                CsvRecords {
+                    table_name: "metric-b".to_string(),
+                    headers: vec!["host".to_string()],
+                    records: vec![CsvRecord {
+                        values: vec!["web-2".to_string()],
+                    }],
+                },
+                vec!["host".to_string()],
+            )
+            .unwrap();
+
+        assert!(session.run_dir.join("metric-b.table-data.csv").exists());
+        assert_eq!(session.buffered_bytes, 0);
+    }
+
+    // The batch only carries `host` and `value`, while the full header list
+    // also has `idc`: the file must use the full headers and leave the missing
+    // column empty.
+    #[test]
+    fn test_flush_with_partial_headers_uses_full_headers() {
+        let mut session = CsvDumpSession::new_with_buffer_limit(
+            CsvDumpMetadata::new("fuzz_case", 7, 2, 3, 4),
+            1024,
+        )
+        .unwrap();
+        session
+            .append(
+                CsvRecords {
+                    table_name: "metric-c".to_string(),
+                    headers: vec!["host".to_string(), "value".to_string()],
+                    records: vec![CsvRecord {
+                        values: vec!["web-3".to_string(), "12".to_string()],
+                    }],
+                },
+                vec!["host".to_string(), "idc".to_string(), "value".to_string()],
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let file =
+            std::fs::read_to_string(session.run_dir.join("metric-c.table-data.csv")).unwrap();
+        let mut lines = file.lines();
+        assert_eq!(lines.next().unwrap(), "host,idc,value");
+        assert_eq!(lines.next().unwrap(), "web-3,,12");
+    }
+}
diff --git a/tests-fuzz/src/utils/retry.rs b/tests-fuzz/src/utils/retry.rs
new file mode 100644
index 0000000000..06d1ede54f
--- /dev/null
+++ b/tests-fuzz/src/utils/retry.rs
@@ -0,0 +1,49 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::future::Future;
+use std::time::Duration;
+
+use common_telemetry::warn;
+
+/// Runs `operation` until it succeeds or `max_attempts` attempts have failed.
+///
+/// After each failed attempt except the last, a warning is logged and the task
+/// sleeps for the current backoff. The backoff starts at `init_backoff` and is
+/// doubled after every sleep, capped at `max_backoff`.
+///
+/// `max_attempts` values below 1 are treated as 1: the operation is always
+/// executed at least once, because without an attempt there is neither a value
+/// nor an error to return.
+///
+/// # Errors
+///
+/// Returns the error of the final attempt when every attempt failed.
+pub async fn retry_with_backoff<T, E, Fut, F>(
+    mut operation: F,
+    max_attempts: usize,
+    init_backoff: Duration,
+    max_backoff: Duration,
+) -> Result<T, E>
+where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = Result<T, E>>,
+    E: std::fmt::Debug,
+{
+    let max_attempts = max_attempts.max(1);
+    let mut backoff = init_backoff;
+    let mut attempt = 1;
+    loop {
+        match operation().await {
+            Ok(result) => return Ok(result),
+            Err(err) if attempt >= max_attempts => return Err(err),
+            Err(err) => {
+                warn!(
+                    "Retryable operation failed, attempt: {}, max_attempts: {}, backoff: {:?}, error: {:?}",
+                    attempt, max_attempts, backoff, err
+                );
+                tokio::time::sleep(backoff).await;
+                // `Duration * 2` panics on overflow; saturate instead, the
+                // result is capped by `max_backoff` anyway.
+                backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff);
+            }
+        }
+        attempt += 1;
+    }
+}
diff --git a/tests-fuzz/src/utils/sql_dump_writer.rs b/tests-fuzz/src/utils/sql_dump_writer.rs
new file mode 100644
index 0000000000..6f098d9584
--- /dev/null
+++ b/tests-fuzz/src/utils/sql_dump_writer.rs
@@ -0,0 +1,267 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::fs::{OpenOptions, create_dir_all};
+use std::io::Write;
+use std::path::PathBuf;
+
+use snafu::ResultExt;
+
+use crate::error::{self, Result};
+use crate::utils::get_gt_fuzz_dump_buffer_max_bytes;
+
+/// Session writer for table-scoped SQL trace files.
+///
+/// Entries are staged in memory per table and appended to
+/// `<run_dir>/<sanitized table name>.trace.sql` on flush.
+#[derive(Debug)]
+pub struct SqlDumpSession {
+    /// Session directory path.
+    pub run_dir: PathBuf,
+    /// Max in-memory buffer size before auto flush.
+    pub max_buffer_bytes: usize,
+    /// Total byte length of the entries staged since the last flush.
+    buffered_bytes: usize,
+    /// Staged entries, keyed by table name, in the order they were appended.
+    entries_by_table: HashMap<String, Vec<String>>,
+}
+
+impl SqlDumpSession {
+    /// Creates SQL dump session with default buffer limit.
+    pub fn new(run_dir: PathBuf) -> Result<Self> {
+        let max_buffer_bytes = get_gt_fuzz_dump_buffer_max_bytes();
+        Self::new_with_buffer_limit(run_dir, max_buffer_bytes)
+    }
+
+    /// Creates SQL dump session with custom buffer limit.
+    ///
+    /// The session directory (and any missing parent) is created eagerly.
+    pub fn new_with_buffer_limit(run_dir: PathBuf, max_buffer_bytes: usize) -> Result<Self> {
+        create_dir_all(&run_dir).context(error::CreateFileSnafu {
+            path: run_dir.to_string_lossy().to_string(),
+        })?;
+
+        let session = Self {
+            run_dir,
+            max_buffer_bytes,
+            buffered_bytes: 0,
+            entries_by_table: HashMap::new(),
+        };
+        Ok(session)
+    }
+
+    /// Appends one SQL statement, optionally preceded by a comment, for a logical table.
+    pub fn append_sql(&mut self, table: &str, sql: &str, comment: Option<&str>) -> Result<()> {
+        self.push_entry(table, format_sql_entry(sql, comment))
+    }
+
+    /// Broadcasts one commented SQL event to the trace of every table in `tables`.
+    pub fn broadcast_event<I, T>(&mut self, tables: I, event: &str, sql: &str) -> Result<()>
+    where
+        I: IntoIterator<Item = T>,
+        T: AsRef<str>,
+    {
+        let entry = format_sql_entry(sql, Some(event));
+        tables
+            .into_iter()
+            .try_for_each(|table| self.push_entry(table.as_ref(), entry.clone()))
+    }
+
+    /// Flushes all staged SQL traces to table-scoped files.
+    pub fn flush_all(&mut self) -> Result<()> {
+        self.flush_buffered_entries()
+    }
+
+    /// Stages `entry` for `table` and flushes once the staged bytes reach the limit.
+    fn push_entry(&mut self, table: &str, entry: String) -> Result<()> {
+        self.buffered_bytes += entry.len();
+        self.entries_by_table
+            .entry(table.to_string())
+            .or_default()
+            .push(entry);
+
+        if self.buffered_bytes < self.max_buffer_bytes {
+            return Ok(());
+        }
+        self.flush_buffered_entries()
+    }
+
+    /// Appends every staged entry to its table's trace file, one entry per
+    /// line group, then clears the staging area.
+    fn flush_buffered_entries(&mut self) -> Result<()> {
+        if self.entries_by_table.is_empty() {
+            return Ok(());
+        }
+
+        for (table, entries) in &self.entries_by_table {
+            let file_name = format!("{}.trace.sql", sanitize_file_name(table));
+            let path = self.run_dir.join(file_name);
+            let mut file = OpenOptions::new()
+                .create(true)
+                .append(true)
+                .open(&path)
+                .context(error::CreateFileSnafu {
+                    path: path.to_string_lossy().to_string(),
+                })?;
+
+            let write_failed = || error::WriteFileSnafu {
+                path: path.to_string_lossy().to_string(),
+            };
+            for entry in entries {
+                file.write_all(entry.as_bytes()).context(write_failed())?;
+                file.write_all(b"\n").context(write_failed())?;
+            }
+        }
+
+        self.entries_by_table.clear();
+        self.buffered_bytes = 0;
+        Ok(())
+    }
+}
+
+/// Builds one trace entry: the `;`-terminated statement, preceded by the
+/// rendered comment lines when a comment is given.
+fn format_sql_entry(sql: &str, comment: Option<&str>) -> String {
+    let normalized_sql = normalize_sql(sql);
+    match comment {
+        Some(text) => format!("{}\n{normalized_sql}", format_comment(text)),
+        None => normalized_sql,
+    }
+}
+
+/// Renders `comment` as SQL line comments: every line is prefixed with `-- `
+/// and the lines are joined with `\n`.
+fn format_comment(comment: &str) -> String {
+    let mut rendered = String::new();
+    for (idx, line) in comment.lines().enumerate() {
+        if idx > 0 {
+            rendered.push('\n');
+        }
+        rendered.push_str("-- ");
+        rendered.push_str(line);
+    }
+    rendered
+}
+
+/// Strips trailing whitespace from `sql` and makes sure the statement ends
+/// with a semicolon.
+fn normalize_sql(sql: &str) -> String {
+    let mut statement = sql.trim_end().to_owned();
+    if !statement.ends_with(';') {
+        statement.push(';');
+    }
+    statement
+}
+
+/// Maps `raw` to a file-name-safe string: ASCII alphanumerics, `_` and `-`
+/// pass through, any other character becomes `_`.
+fn sanitize_file_name(raw: &str) -> String {
+    raw.chars()
+        .map(|ch| match ch {
+            '_' | '-' => ch,
+            other if other.is_ascii_alphanumeric() => other,
+            _ => '_',
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::{Path, PathBuf};
+    use std::time::{SystemTime, UNIX_EPOCH};
+
+    use super::SqlDumpSession;
+
+    /// Returns a run directory path under the system temp dir, made unique per
+    /// test by `prefix` and the current time in milliseconds.
+    fn unique_run_dir(prefix: &str) -> PathBuf {
+        let millis = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+        std::env::temp_dir().join(format!("{prefix}-{millis}"))
+    }
+
+    /// Best-effort removal of a test's run directory so passing tests do not
+    /// leave files behind in the temp dir.
+    fn remove_run_dir(run_dir: &Path) {
+        let _ = std::fs::remove_dir_all(run_dir);
+    }
+
+    /// An appended statement ends up, with its comment, in the table's trace file.
+    #[test]
+    fn test_append_sql_writes_table_trace_file() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-dump");
+
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap();
+        session
+            .append_sql(
+                "metric-a",
+                "INSERT INTO t VALUES(1)",
+                Some("kind=insert elapsed_ms=10"),
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let content = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap();
+        assert!(content.contains("-- kind=insert elapsed_ms=10"));
+        assert!(content.contains("INSERT INTO t VALUES(1);"));
+
+        remove_run_dir(&run_dir);
+    }
+
+    /// A broadcast event is written to the trace file of every listed table.
+    #[test]
+    fn test_broadcast_event_writes_to_all_tables() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-broadcast");
+
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap();
+        session
+            .broadcast_event(
+                ["metric-a", "metric-b"],
+                "repartition action_idx=3",
+                "ALTER TABLE t REPARTITION",
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let content_a = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap();
+        let content_b = std::fs::read_to_string(run_dir.join("metric-b.trace.sql")).unwrap();
+        assert!(content_a.contains("-- repartition action_idx=3"));
+        assert!(content_a.contains("ALTER TABLE t REPARTITION;"));
+        assert!(content_b.contains("-- repartition action_idx=3"));
+        assert!(content_b.contains("ALTER TABLE t REPARTITION;"));
+
+        remove_run_dir(&run_dir);
+    }
+
+    /// Every line of a multi-line comment gets its own `-- ` prefix.
+    #[test]
+    fn test_multiline_comment_is_prefixed_per_line() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-dump-comment");
+
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap();
+        session
+            .append_sql(
+                "metric-a",
+                "INSERT INTO t VALUES(1)",
+                Some("kind=insert\nstarted_at_ms=1 elapsed_ms=2"),
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let content = std::fs::read_to_string(run_dir.join("metric-a.trace.sql")).unwrap();
+        assert!(content.contains("-- kind=insert\n-- started_at_ms=1 elapsed_ms=2"));
+
+        remove_run_dir(&run_dir);
+    }
+
+    /// With a 1-byte limit a single append flushes without an explicit `flush_all`.
+    #[test]
+    fn test_auto_flush_on_buffer_limit() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-dump-limit");
+
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1).unwrap();
+        session
+            .append_sql("metric-a", "INSERT INTO t VALUES(1)", None)
+            .unwrap();
+
+        assert!(run_dir.join("metric-a.trace.sql").exists());
+        assert_eq!(session.buffered_bytes, 0);
+
+        remove_run_dir(&run_dir);
+    }
+}
diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs
new file mode 100644
index 0000000000..7932bc7759
--- /dev/null
+++ b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs
@@ -0,0 +1,684 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#![no_main]
+
+use std::collections::{BTreeMap, HashMap};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+use arbitrary::{Arbitrary, Unstructured};
+use common_telemetry::{info, warn};
+use common_time::Timestamp;
+use common_time::util::current_time_millis;
+use libfuzzer_sys::fuzz_target;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaChaRng;
+use snafu::{ResultExt, ensure};
+use sqlx::{MySql, Pool};
+use tests_fuzz::context::{TableContext, TableContextRef};
+use tests_fuzz::error::{self, Result};
+use tests_fuzz::fake::{
+    ConstGenerator, MappedGenerator, WordGenerator, merge_two_word_map_fn, random_capitalize_map,
+    uppercase_and_keyword_backtick_map,
+};
+use tests_fuzz::generator::Generator;
+use tests_fuzz::generator::create_expr::{
+    CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder,
+};
+use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
+use tests_fuzz::generator::repartition_expr::{
+    MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
+};
+use tests_fuzz::ir::{
+    CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value,
+    generate_unique_timestamp_for_mysql_with_clock,
+};
+use tests_fuzz::translator::DslTranslator;
+use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator;
+use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
+use tests_fuzz::translator::mysql::insert_expr::InsertIntoExprTranslator;
+use tests_fuzz::translator::mysql::repartition_expr::RepartitionExprTranslator;
+use tests_fuzz::utils::csv_dump_writer::{CsvDumpMetadata, CsvDumpSession};
+use tests_fuzz::utils::retry::retry_with_backoff;
+use tests_fuzz::utils::sql_dump_writer::SqlDumpSession;
+use tests_fuzz::utils::{
+    Connections, get_fuzz_override, get_gt_fuzz_input_max_alter_actions,
+    get_gt_fuzz_input_max_tables, init_greptime_connections_via_env,
+};
+use tests_fuzz::validator::row::count_values;
+use tokio::sync::{mpsc, oneshot};
+
+/// Barrier acknowledgement timeout in seconds.
+// NOTE(review): the use site is outside this section; presumably this bounds the
+// wait for the writer's barrier ack — confirm.
+const BARRIER_ACK_TIMEOUT_SECS: u64 = 10;
+/// Maximum number of attempts for each validation count query.
+const VALIDATE_QUERY_MAX_ATTEMPTS: usize = 6;
+/// Backoff before the first retry of a validation count query.
+const VALIDATE_QUERY_INIT_BACKOFF: Duration = Duration::from_millis(50);
+/// Upper bound of the backoff between validation count query retries.
+const VALIDATE_QUERY_MAX_BACKOFF: Duration = Duration::from_millis(800);
+
+/// Connection context shared by the steps of this fuzz target.
+#[derive(Clone)]
+struct FuzzContext {
+    /// MySQL connection pool used for every statement issued by this target.
+    greptime: Pool<MySql>,
+}
+
+impl FuzzContext {
+    /// Consumes the context and closes its connection pool.
+    async fn close(self) {
+        self.greptime.close().await;
+    }
+}
+
+/// Parameters of one fuzz run, produced by the `Arbitrary` implementation.
+#[derive(Clone, Debug)]
+struct FuzzInput {
+    /// Seed for the `ChaChaRng` that drives all random choices of the run.
+    seed: u64,
+    /// Number of actions to perform.
+    // NOTE(review): consumed outside this section; presumably the number of
+    // repartition actions — confirm.
+    actions: usize,
+    /// Partition count passed to the physical table generator.
+    partitions: usize,
+    /// Number of logical tables to create.
+    tables: usize,
+}
+
+/// Generates the `CREATE TABLE` expression for the physical metric table.
+///
+/// The table is always named `fuzz_repartition_metric_physical`, has
+/// `partitions` partitions and randomly carries `IF NOT EXISTS`.
+fn generate_create_physical_table_expr<R: Rng + 'static>(
+    partitions: usize,
+    rng: &mut R,
+) -> Result<CreateTableExpr> {
+    let if_not_exists = rng.random_bool(0.5);
+    let generator = CreatePhysicalTableExprGeneratorBuilder::default()
+        .name_generator(Box::new(ConstGenerator::new(Ident::new(
+            "fuzz_repartition_metric_physical",
+        ))))
+        .if_not_exists(if_not_exists)
+        .partition(partitions)
+        .build()
+        .unwrap();
+    generator.generate(rng)
+}
+
+/// Generates the `CREATE TABLE` expression for one logical table on top of the
+/// physical table described by `physical_table_ctx`.
+///
+/// The table gets a random word-based name, 1 to 5 labels, a random
+/// `IF NOT EXISTS` flag, and includes the partition column when
+/// `include_partition_column` is set.
+fn generate_create_logical_table_expr<R: Rng + 'static>(
+    physical_table_ctx: TableContextRef,
+    include_partition_column: bool,
+    rng: &mut R,
+) -> Result<CreateTableExpr> {
+    // Draw order matters for seed reproducibility: labels first, then the flag.
+    let labels = rng.random_range(1..=5);
+    let if_not_exists = rng.random_bool(0.5);
+    let generator = CreateLogicalTableExprGeneratorBuilder::default()
+        .name_generator(Box::new(MappedGenerator::new(
+            WordGenerator,
+            merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map),
+        )))
+        .physical_table_ctx(physical_table_ctx)
+        .labels(labels)
+        .if_not_exists(if_not_exists)
+        .include_partition_column(include_partition_column)
+        .build()
+        .unwrap();
+    generator.generate(rng)
+}
+
+/// Generates an `INSERT` expression with `rows` rows for the table described
+/// by `table_ctx`.
+///
+/// The column list is always spelled out, values are random, and timestamps
+/// come from the generator built over the shared `clock`.
+fn generate_insert_expr<R: Rng + 'static>(
+    rows: usize,
+    rng: &mut R,
+    table_ctx: TableContextRef,
+    clock: Arc<Mutex<Timestamp>>,
+) -> Result<InsertIntoExpr> {
+    let generator = InsertExprGeneratorBuilder::default()
+        .omit_column_list(false)
+        .table_ctx(table_ctx)
+        .rows(rows)
+        .value_generator(Box::new(generate_random_value))
+        .ts_value_generator(generate_unique_timestamp_for_mysql_with_clock(clock))
+        .build()
+        .unwrap();
+    generator.generate(rng)
+}
+
+/// Creates the physical metric table and up to `table_count` logical tables on it.
+///
+/// Returns, in order: the physical table context, the logical table contexts
+/// keyed by table name, the `CREATE TABLE` SQL of each logical table keyed by
+/// table name, and the `CREATE TABLE` SQL of the physical table.
+///
+/// Fails when a statement cannot be generated, translated or executed, when
+/// the physical table has no partition, or when no logical table was created.
+async fn create_metric_tables<R: Rng + 'static>(
+    ctx: &FuzzContext,
+    rng: &mut R,
+    partitions: usize,
+    table_count: usize,
+) -> Result<(
+    TableContextRef,
+    BTreeMap<Ident, TableContextRef>,
+    HashMap<String, String>,
+    String,
+)> {
+    let create_physical_expr = generate_create_physical_table_expr(partitions, rng)?;
+    let translator = CreateTableExprTranslator;
+    let create_physical_sql = translator.translate(&create_physical_expr)?;
+    let result = sqlx::query(&create_physical_sql)
+        .execute(&ctx.greptime)
+        .await
+        .context(error::ExecuteQuerySnafu {
+            sql: &create_physical_sql,
+        })?;
+    info!("Create physical table: {create_physical_sql}, result: {result:?}");
+    let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr));
+    ensure!(
+        physical_table_ctx.partition.is_some(),
+        error::AssertSnafu {
+            reason: "Physical metric table must have partition".to_string()
+        }
+    );
+
+    let mut logical_tables = BTreeMap::new();
+    let mut create_logical_sqls = HashMap::new();
+    // Generated names can repeat, so allow a bounded number of extra attempts
+    // to reach `table_count` distinct tables.
+    let max_attempts = table_count * 3;
+    for _ in 0..max_attempts {
+        if logical_tables.len() >= table_count {
+            break;
+        }
+
+        let include_partition_column = rng.random_bool(0.5);
+        let create_logical_expr = generate_create_logical_table_expr(
+            physical_table_ctx.clone(),
+            include_partition_column,
+            rng,
+        )?;
+        // Skip a name that was already created; the attempt is still consumed.
+        if logical_tables.contains_key(&create_logical_expr.table_name) {
+            continue;
+        }
+
+        let create_logical_sql = translator.translate(&create_logical_expr)?;
+        let result = sqlx::query(&create_logical_sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu {
+                sql: &create_logical_sql,
+            })?;
+        info!("Create logical table: {create_logical_sql}, result: {result:?}");
+        let logical_ctx = Arc::new(TableContext::from(&create_logical_expr));
+        create_logical_sqls.insert(logical_ctx.name.to_string(), create_logical_sql);
+        logical_tables.insert(logical_ctx.name.clone(), logical_ctx);
+    }
+
+    ensure!(
+        !logical_tables.is_empty(),
+        error::AssertSnafu {
+            reason: "No logical table created".to_string()
+        }
+    );
+
+    Ok((
+        physical_table_ctx,
+        logical_tables,
+        create_logical_sqls,
+        create_physical_sql,
+    ))
+}
+
+/// Executes `sql` as a non-persistent statement, retrying on failure.
+///
+/// Up to 10 attempts are made. Every failure is logged as a warning. Between
+/// attempts the task sleeps for a backoff that starts at 100ms and doubles up
+/// to 1s. No sleep happens after the final attempt: its error is returned
+/// immediately.
+///
+/// # Errors
+///
+/// Returns the error of the last attempt, wrapped with the failing SQL.
+async fn execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> {
+    const MAX_ATTEMPTS: usize = 10;
+    const INIT_DELAY: Duration = Duration::from_millis(100);
+    const MAX_DELAY: Duration = Duration::from_secs(1);
+
+    let mut delay = INIT_DELAY;
+    let mut attempt = 0;
+    loop {
+        let err = match sqlx::query(sql)
+            .persistent(false)
+            .execute(&ctx.greptime)
+            .await
+        {
+            Ok(_) => return Ok(()),
+            Err(err) => err,
+        };
+
+        attempt += 1;
+        warn!("Execute insert with retry: {sql}, attempt: {attempt}, error: {err:?}");
+        // Give up before sleeping: waiting after the last attempt only delays
+        // the error report.
+        if attempt >= MAX_ATTEMPTS {
+            return Err(err).context(error::ExecuteQuerySnafu { sql });
+        }
+
+        tokio::time::sleep(delay).await;
+        delay = std::cmp::min(delay * 2, MAX_DELAY);
+    }
+}
+
+/// State shared, behind a mutex, between the write loop and the rest of the run.
+struct SharedState {
+    /// Clock handed to the insert generator to produce timestamp values.
+    clock: Arc<Mutex<Timestamp>>,
+    /// Number of rows inserted so far, keyed by logical table name.
+    inserted_rows: HashMap<String, u64>,
+    /// CSV dump of the inserted rows; `None` when no CSV dump session is set.
+    csv_dump_session: Option<CsvDumpSession>,
+    /// SQL trace of the executed statements; `None` when no SQL dump session is set.
+    sql_dump_session: Option<SqlDumpSession>,
+    /// The write loop exits once this is `false`.
+    running: bool,
+}
+
+/// Control messages sent to the write loop.
+enum WriterControl {
+    /// Pauses the writer; the writer confirms through `ack` once it has paused.
+    Barrier {
+        /// Epoch number, only used for logging by the writer.
+        epoch: usize,
+        /// Channel on which the writer acknowledges the barrier.
+        ack: oneshot::Sender<()>,
+    },
+    /// Resumes a paused writer.
+    Resume {
+        /// Epoch number, only used for logging by the writer.
+        epoch: usize,
+    },
+    /// Terminates the write loop.
+    Stop,
+}
+
+/// Applies one control message to the writer's `paused` flag.
+///
+/// Returns `true` when the writer must stop, `false` otherwise. A barrier sets
+/// `paused` and acknowledges it (a dropped receiver is ignored); a resume
+/// clears `paused`.
+fn handle_writer_control(control: WriterControl, paused: &mut bool) -> bool {
+    match control {
+        WriterControl::Stop => {
+            info!("Writer received stop control");
+            return true;
+        }
+        WriterControl::Barrier { epoch, ack } => {
+            info!("Writer received barrier control, epoch: {epoch}");
+            *paused = true;
+            let _ = ack.send(());
+        }
+        WriterControl::Resume { epoch } => {
+            info!("Writer received resume control, epoch: {epoch}");
+            *paused = false;
+        }
+    }
+    false
+}
+
+/// Background writer: repeatedly inserts 1 to 3 random rows into every logical
+/// table until it is stopped.
+///
+/// The loop ends when a `Stop` control arrives, when the control channel is
+/// closed while paused, or when `SharedState::running` is `false`. While
+/// paused (after a `Barrier`) no insert is issued until a `Resume` arrives.
+///
+/// After each successful insert the rows are recorded in the CSV and SQL dump
+/// sessions (when present) and added to `SharedState::inserted_rows`.
+///
+/// Returns an error when generating, translating, executing or dumping an
+/// insert fails.
+async fn write_loop<R: Rng + 'static>(
+    mut rng: R,
+    ctx: FuzzContext,
+    logical_tables: BTreeMap<Ident, TableContextRef>,
+    shared_state: Arc<Mutex<SharedState>>,
+    mut control_rx: mpsc::UnboundedReceiver<WriterControl>,
+) -> Result<()> {
+    info!("Start write loop");
+    let mut paused = false;
+    loop {
+        // Drain every control message that is already queued without blocking.
+        while let Ok(control) = control_rx.try_recv() {
+            if handle_writer_control(control, &mut paused) {
+                return Ok(());
+            }
+        }
+
+        // While paused, block on the channel instead of spinning; a closed
+        // channel ends the loop.
+        if paused {
+            match control_rx.recv().await {
+                Some(control) => {
+                    if handle_writer_control(control, &mut paused) {
+                        return Ok(());
+                    }
+                }
+                None => return Ok(()),
+            }
+            continue;
+        }
+
+        // Copy what is needed out of the shared state so the lock is released
+        // before any await below.
+        let (running, clock) = {
+            let state = shared_state.lock().unwrap();
+            (state.running, state.clock.clone())
+        };
+        if !running {
+            break;
+        }
+
+        for table_ctx in logical_tables.values() {
+            let rows = rng.random_range(1..=3);
+            let insert_expr =
+                generate_insert_expr(rows, &mut rng, table_ctx.clone(), clock.clone())?;
+            let translator = InsertIntoExprTranslator;
+            let sql = translator.translate(&insert_expr)?;
+            let inserted = insert_expr.values_list.len() as u64;
+            let csv_records = InsertExprToCsvRecordsTranslator.translate(&insert_expr)?;
+            let table_name = table_ctx.name.to_string();
+            let full_headers = table_ctx
+                .columns
+                .iter()
+                .map(|column| column.name.value.clone())
+                .collect::<Vec<_>>();
+
+            let started_at_ms = current_time_millis();
+            let now = Instant::now();
+            execute_insert_with_retry(&ctx, &sql).await?;
+            let elapsed = now.elapsed();
+            info!("Execute insert sql: {sql}, elapsed: {elapsed:?}");
+
+            // The lock is taken only after the insert completed and is dropped
+            // at the end of this iteration, before the next await.
+            let mut state = shared_state.lock().unwrap();
+            if let Some(csv_dump_session) = state.csv_dump_session.as_mut() {
+                csv_dump_session.append(csv_records, full_headers)?;
+            }
+            if let Some(sql_dump_session) = state.sql_dump_session.as_mut() {
+                let comment = format!(
+                    "kind=insert table={} started_at_ms={} elapsed_ms={}",
+                    table_name,
+                    started_at_ms,
+                    elapsed.as_millis()
+                );
+                sql_dump_session.append_sql(&table_name, &sql, Some(&comment))?;
+            }
+            *state.inserted_rows.entry(table_name).or_insert(0) += inserted;
+        }
+
+        tokio::time::sleep(Duration::from_millis(100)).await;
+    }
+    info!("Write loop ended");
+
+    Ok(())
+}
+
+/// Checks that every logical table holds exactly the rows recorded in `inserted_rows`.
+///
+/// For each table both the total row count and the count of distinct
+/// timestamp values must equal the expected number (0 when the table has no
+/// entry). Each count query is retried with backoff.
+///
+/// # Panics
+///
+/// Panics when a count differs from the expected value, or when a table has
+/// no timestamp column.
+async fn validate_rows(
+    ctx: &FuzzContext,
+    logical_tables: &BTreeMap<Ident, TableContextRef>,
+    inserted_rows: &HashMap<String, u64>,
+) -> Result<()> {
+    for table_ctx in logical_tables.values() {
+        let table_name = table_ctx.name.to_string();
+        let expected = inserted_rows.get(&table_name).copied().unwrap_or(0) as usize;
+
+        let count_sql = format!("SELECT COUNT(1) AS count FROM {}", table_ctx.name);
+        let count = retry_with_backoff(
+            || count_values(&ctx.greptime, &count_sql),
+            VALIDATE_QUERY_MAX_ATTEMPTS,
+            VALIDATE_QUERY_INIT_BACKOFF,
+            VALIDATE_QUERY_MAX_BACKOFF,
+        )
+        .await?;
+
+        let distinct_count_sql = format!(
+            "SELECT COUNT(DISTINCT {}) AS count FROM {}",
+            table_ctx.timestamp_column().unwrap().name,
+            table_ctx.name
+        );
+        let distinct_count = retry_with_backoff(
+            || count_values(&ctx.greptime, &distinct_count_sql),
+            VALIDATE_QUERY_MAX_ATTEMPTS,
+            VALIDATE_QUERY_INIT_BACKOFF,
+            VALIDATE_QUERY_MAX_BACKOFF,
+        )
+        .await?;
+
+        let total = count.count as usize;
+        let distinct = distinct_count.count as usize;
+        info!(
+            "Validate rows for table: {}, expected: {}, count: {}, distinct_count: {}",
+            table_ctx.name, expected, total, distinct
+        );
+        assert_eq!(total, expected);
+
+        assert_eq!(distinct, expected);
+    }
+    Ok(())
+}
+
+/// Flushes the CSV and SQL dump sessions (when present) and returns a copy of
+/// the per-table inserted row counts, all under a single lock acquisition.
+fn flush_dump_sessions_and_snapshot(
+    shared_state: &Arc<Mutex<SharedState>>,
+) -> Result<HashMap<String, u64>> {
+    let mut guard = shared_state.lock().unwrap();
+    let state = &mut *guard;
+
+    state
+        .csv_dump_session
+        .as_mut()
+        .map(|session| session.flush_all())
+        .transpose()?;
+    state
+        .sql_dump_session
+        .as_mut()
+        .map(|session| session.flush_all())
+        .transpose()?;
+
+    Ok(state.inserted_rows.clone())
+}
+
+/// Drops every logical table and then the physical table.
+///
+/// Returns the first statement failure; tables after the failing one are not
+/// dropped.
+async fn cleanup_tables(
+    ctx: &FuzzContext,
+    physical_table_ctx: &TableContextRef,
+    logical_tables: &BTreeMap<Ident, TableContextRef>,
+) -> Result<()> {
+    // Logical tables go first; the physical table is dropped only afterwards.
+    for table_ctx in logical_tables.values() {
+        let drop_logical_sql = format!("DROP TABLE {}", table_ctx.name);
+        let result = sqlx::query(&drop_logical_sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu {
+                sql: &drop_logical_sql,
+            })?;
+        info!("Drop logical table: {drop_logical_sql}, result: {result:?}");
+    }
+
+    let drop_physical_sql = format!("DROP TABLE {}", physical_table_ctx.name);
+    let result = sqlx::query(&drop_physical_sql)
+        .execute(&ctx.greptime)
+        .await
+        .context(error::ExecuteQuerySnafu {
+            sql: &drop_physical_sql,
+        })?;
+    info!("Drop physical table: {drop_physical_sql}, result: {result:?}");
+    Ok(())
+}
+
+/// Picks and generates the next repartition operation for `table_ctx`.
+///
+/// A merge is generated only when the coin flip did not pick a split and the
+/// table has more than two partition expressions; otherwise a split is
+/// generated.
+///
+/// # Panics
+///
+/// Panics when `table_ctx` has no partition definition.
+fn repartition_operation<R: Rng + 'static>(
+    table_ctx: &TableContextRef,
+    rng: &mut R,
+) -> Result<RepartitionExpr> {
+    let split = rng.random_bool(0.5);
+    let partition_count = table_ctx.partition.as_ref().unwrap().exprs.len();
+
+    if !split && partition_count > 2 {
+        let generator = MergePartitionExprGeneratorBuilder::default()
+            .table_ctx(table_ctx.clone())
+            .build()
+            .unwrap();
+        return generator.generate(rng).map(RepartitionExpr::Merge);
+    }
+
+    let generator = SplitPartitionExprGeneratorBuilder::default()
+        .table_ctx(table_ctx.clone())
+        .build()
+        .unwrap();
+    generator.generate(rng).map(RepartitionExpr::Split)
+}
+
+impl Arbitrary<'_> for FuzzInput {
+    fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
+        let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
+        let mut rng = ChaChaRng::seed_from_u64(seed);
+        let partitions =
+            get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
+        let max_tables = get_gt_fuzz_input_max_tables();
+        let tables = get_fuzz_override::<usize>("TABLES")
+            .unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables)));
+        // Keep the exclusive bound >= 2; `random_range` panics on an empty range like `1..1`.
+        let max_actions = std::cmp::max(2, get_gt_fuzz_input_max_alter_actions());
+        let actions = get_fuzz_override::<usize>("ACTIONS")
+            .unwrap_or_else(|| rng.random_range(1..max_actions));
+        Ok(FuzzInput {
+            seed,
+            actions,
+            partitions,
+            tables,
+        })
+    }
+}
+
+/// Runs one fuzz case: creates the metric tables, repartitions the physical table
+/// `input.actions` times under a concurrent writer, validating row counts after each action.
+async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) -> Result<()> {
+    info!("input: {input:?}");
+    let mut rng = ChaChaRng::seed_from_u64(input.seed);
+    let clock = Arc::new(Mutex::new(Timestamp::current_millis()));
+
+    let (mut physical_table_ctx, logical_tables, create_logical_sqls, create_physical_sql) =
+        create_metric_tables(&ctx, &mut rng, input.partitions, input.tables).await?;
+
+    let mut inserted_rows = HashMap::with_capacity(logical_tables.len());
+    for table_ctx in logical_tables.values() {
+        inserted_rows.insert(table_ctx.name.to_string(), 0);
+    }
+    let csv_dump_session = CsvDumpSession::new(CsvDumpMetadata::new(
+        "fuzz_repartition_metric_table",
+        input.seed,
+        input.actions,
+        input.partitions,
+        input.tables,
+    ))?;
+    let mut sql_dump_session = SqlDumpSession::new(csv_dump_session.run_dir.clone())?;
+    let logical_table_names = logical_tables
+        .values()
+        .map(|table_ctx| table_ctx.name.to_string())
+        .collect::<Vec<_>>();
+
+    sql_dump_session.append_sql(
+        &physical_table_ctx.name.to_string(),
+        &create_physical_sql,
+        Some("kind=create_physical_table"),
+    )?;
+    for table_name in &logical_table_names {
+        if let Some(create_sql) = create_logical_sqls.get(table_name) {
+            sql_dump_session.append_sql(
+                table_name,
+                create_sql,
+                Some("kind=create_logical_table"),
+            )?;
+        }
+    }
+
+    let shared_state = Arc::new(Mutex::new(SharedState {
+        clock,
+        inserted_rows,
+        csv_dump_session: Some(csv_dump_session),
+        sql_dump_session: Some(sql_dump_session),
+        running: true,
+    }));
+    let writer_rng = ChaChaRng::seed_from_u64(input.seed ^ 0xA5A5_A5A5_A5A5_A5A5);
+    let (control_tx, control_rx) = mpsc::unbounded_channel::<WriterControl>();
+    let writer_task = tokio::spawn(write_loop(
+        writer_rng,
+        ctx.clone(),
+        logical_tables.clone(),
+        shared_state.clone(),
+        control_rx,
+    ));
+    tokio::time::sleep(Duration::from_millis(100)).await;
+
+    for i in 0..input.actions {
+        let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len();
+        info!(
+            "partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}",
+            i + 1,
+            input.actions,
+            physical_table_ctx.name,
+            logical_tables.len()
+        );
+
+        let repartition_expr = repartition_operation(&physical_table_ctx, &mut rng)?;
+        let sql = RepartitionExprTranslator.translate(&repartition_expr)?;
+        info!("Repartition sql: {sql}");
+        let started_at_ms = current_time_millis();
+        let now = Instant::now();
+        let result = sqlx::query(&sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu { sql: &sql })?;
+        let elapsed = now.elapsed();
+        info!("Repartition result: {result:?}, elapsed: {elapsed:?}");
+
+        physical_table_ctx = Arc::new(
+            Arc::unwrap_or_clone(physical_table_ctx)
+                .repartition(repartition_expr)
+                .unwrap(),
+        );
+
+        let partition_entries = tests_fuzz::validator::partition::fetch_partitions_info_schema(
+            &ctx.greptime,
+            "public".into(),
+            &physical_table_ctx.name,
+        )
+        .await?;
+        tests_fuzz::validator::partition::assert_partitions(
+            physical_table_ctx.partition.as_ref().unwrap(),
+            &partition_entries,
+        )?;
+
+        {
+            let mut state = shared_state.lock().unwrap();
+            if let Some(sql_dump_session) = state.sql_dump_session.as_mut() {
+                let repartition_comment = format!(
+                    "kind=repartition table={} action_idx={} started_at_ms={} elapsed_ms={}",
+                    physical_table_ctx.name,
+                    i + 1,
+                    started_at_ms,
+                    elapsed.as_millis()
+                );
+                sql_dump_session.append_sql(
+                    &physical_table_ctx.name.to_string(),
+                    &sql,
+                    Some(&repartition_comment),
+                )?;
+                let event = format!(
+                    "repartition action_idx={} started_at_ms={} elapsed_ms={} sql={}",
+                    i + 1,
+                    started_at_ms,
+                    elapsed.as_millis(),
+                    sql
+                );
+                sql_dump_session.broadcast_event(logical_table_names.iter(), &event, &sql)?;
+            }
+        }
+
+        let (ack_tx, ack_rx) = oneshot::channel();
+        control_tx
+            .send(WriterControl::Barrier {
+                epoch: i + 1,
+                ack: ack_tx,
+            })
+            .expect("barrier control send must succeed");
+        tokio::time::timeout(Duration::from_secs(BARRIER_ACK_TIMEOUT_SECS), ack_rx)
+            .await
+            .expect("barrier ack timeout")
+            .expect("barrier ack dropped");
+
+        let inserted_rows_snapshot = flush_dump_sessions_and_snapshot(&shared_state)?;
+        info!("validate rows, epoch: {}", i + 1);
+        validate_rows(&ctx, &logical_tables, &inserted_rows_snapshot).await?;
+
+        control_tx
+            .send(WriterControl::Resume { epoch: i + 1 })
+            .expect("resume control send must succeed");
+    }
+
+    let _ = control_tx.send(WriterControl::Stop);
+    shared_state.lock().unwrap().running = false;
+    writer_task.await.unwrap().unwrap();
+    let inserted_rows = flush_dump_sessions_and_snapshot(&shared_state)?;
+    let (csv_dump_session, sql_dump_session) = {
+        let mut state = shared_state.lock().unwrap();
+        (state.csv_dump_session.take(), state.sql_dump_session.take())
+    };
+
+    let run_result = async {
+        validate_rows(&ctx, &logical_tables, &inserted_rows).await?;
+        cleanup_tables(&ctx, &physical_table_ctx, &logical_tables).await?;
+        Ok(())
+    }
+    .await;
+
+    if let Some(csv_dump_session) = csv_dump_session {
+        match &run_result {
+            Ok(_) => {
+                if let Err(err) = csv_dump_session.cleanup_on_success() {
+                    warn!(
+                        "Cleanup csv dump directory failed, path: {}, error: {:?}",
+                        csv_dump_session.run_dir.display(),
+                        err
+                    );
+                }
+            }
+            Err(_) => {
+                warn!(
+                    "Keep csv dump directory for failure analysis, path: {}",
+                    csv_dump_session.run_dir.display()
+                );
+            }
+        }
+    }
+    if let Some(sql_dump_session) = sql_dump_session
+        && run_result.is_err()
+    {
+        warn!(
+            "Keep sql dump directory for failure analysis, path: {}",
+            sql_dump_session.run_dir.display()
+        );
+    }
+
+    ctx.close().await;
+    run_result
+}
+
+fuzz_target!(|input: FuzzInput| {
+    common_telemetry::init_default_ut_logging();
+    common_runtime::block_on_global(async {
+        let Connections { mysql } = init_greptime_connections_via_env().await;
+        let greptime = mysql.expect("mysql connection init must be succeed");
+        let ctx = FuzzContext { greptime };
+        // Any error returned by the scenario is escalated to a panic with its debug output.
+        if let Err(err) = execute_repartition_metric_table(ctx, input).await {
+            panic!("fuzz test must be succeed: {err:?}");
+        }
+    })
+});

From 74ff5c37eaf45484d10f702b25e2aded92aa6eba Mon Sep 17 00:00:00 2001
From: LFC <990479+MichaelScofield@users.noreply.github.com>
Date: Fri, 13 Mar 2026 17:25:21 +0800
Subject: [PATCH 08/42] refactor: customize standalone instance build (#7807)

* refactor: customize standalone instance build

Signed-off-by: luofucong <luofc@foxmail.com>

* resolve PR comments

Signed-off-by: luofucong <luofc@foxmail.com>

---------

Signed-off-by: luofucong <luofc@foxmail.com>
---
 src/cmd/src/standalone.rs                | 186 ++++++++++++++++++++---
 tests/conf/datanode-test.toml.template   |   2 +-
 tests/conf/frontend-test.toml.template   |   4 +-
 tests/conf/standalone-test.toml.template |   6 +-
 tests/runner/src/server_mode.rs          |  63 +++-----
 5 files changed, 187 insertions(+), 74 deletions(-)

diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs
index 92638d3c4a..215bea0ec5 100644
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -32,14 +32,15 @@ use common_meta::cache::LayeredCacheRegistryBuilder;
 use common_meta::ddl::flow_meta::FlowMetadataAllocator;
 use common_meta::ddl::table_meta::TableMetadataAllocator;
 use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl};
-use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef};
+use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef, DdlManagerRef};
 use common_meta::key::flow::FlowMetadataManager;
 use common_meta::key::{TableMetadataManager, TableMetadataManagerRef};
 use common_meta::kv_backend::KvBackendRef;
-use common_meta::procedure_executor::LocalProcedureExecutor;
+use common_meta::node_manager::{FlownodeRef, NodeManagerRef};
+use common_meta::procedure_executor::{LocalProcedureExecutor, ProcedureExecutorRef};
 use common_meta::region_keeper::MemoryRegionKeeper;
 use common_meta::region_registry::LeaderRegionRegistry;
-use common_meta::sequence::SequenceBuilder;
+use common_meta::sequence::{Sequence, SequenceBuilder};
 use common_meta::wal_provider::{WalProviderRef, build_wal_provider};
 use common_procedure::ProcedureManagerRef;
 use common_query::prelude::set_default_prefix;
@@ -49,6 +50,7 @@ use common_time::timezone::set_default_timezone;
 use common_version::{short_version, verbose_version};
 use datanode::config::DatanodeOptions;
 use datanode::datanode::{Datanode, DatanodeBuilder};
+use datanode::region_server::RegionServer;
 use flow::{
     FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker,
     GrpcQueryHandlerWithBoxedError,
@@ -58,6 +60,7 @@ use frontend::instance::StandaloneDatanodeManager;
 use frontend::instance::builder::FrontendBuilder;
 use frontend::server::Services;
 use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ};
+use plugins::PluginOptions;
 use plugins::frontend::context::{
     CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext,
 };
@@ -130,6 +133,18 @@ impl Instance {
     pub fn server_addr(&self, name: &str) -> Option<SocketAddr> {
         self.frontend.server_handlers().addr(name)
     }
+
+    /// Returns a mutable reference to the Frontend component of this Standalone instance, for
+    /// modification by external code (possibly outside this code base, so don't delete it).
+    pub fn mut_frontend(&mut self) -> &mut Frontend {
+        &mut self.frontend
+    }
+
+    /// Returns the Datanode component of this Standalone instance, for use by external code
+    /// (possibly outside this code base, so don't delete this function).
+    pub fn datanode(&self) -> &Datanode {
+        &self.datanode
+    }
 }
 
 #[async_trait]
@@ -342,9 +357,18 @@ impl StartCommand {
         info!("Standalone start command: {:#?}", self);
         info!("Standalone options: {opts:#?}");
 
+        let (mut instance, _) =
+            Self::build_with(opts.component, opts.plugins, InstanceCreator::default()).await?;
+        instance._guard.extend(guard);
+        Ok(instance)
+    }
+
+    pub async fn build_with(
+        mut opts: StandaloneOptions,
+        plugin_opts: Vec<PluginOptions>,
+        creator: InstanceCreator,
+    ) -> Result<(Instance, InstanceCreatorResult)> {
         let mut plugins = Plugins::new();
-        let plugin_opts = opts.plugins;
-        let mut opts = opts.component;
         set_default_prefix(opts.default_column_prefix.as_deref())
             .map_err(BoxedError::new)
             .context(error::BuildCliSnafu)?;
@@ -462,17 +486,16 @@ impl StartCommand {
                 .await;
         }
 
-        let node_manager = Arc::new(StandaloneDatanodeManager {
-            region_server: datanode.region_server(),
-            flow_server: flownode.flow_engine(),
-        });
+        let node_manager = creator
+            .node_manager_creator
+            .create(
+                &kv_backend,
+                datanode.region_server(),
+                flownode.flow_engine(),
+            )
+            .await?;
 
-        let table_id_allocator = Arc::new(
-            SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone())
-                .initial(MIN_USER_TABLE_ID as u64)
-                .step(10)
-                .build(),
-        );
+        let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend);
         let flow_id_sequence = Arc::new(
             SequenceBuilder::new(FLOW_ID_SEQ, kv_backend.clone())
                 .initial(MIN_USER_FLOW_ID as u64)
@@ -489,7 +512,7 @@ impl StartCommand {
             .context(error::BuildWalProviderSnafu)?;
         let wal_provider = Arc::new(wal_provider);
         let table_metadata_allocator = Arc::new(TableMetadataAllocator::new(
-            table_id_allocator,
+            table_id_allocator.clone(),
             wal_provider.clone(),
         ));
         let flow_metadata_allocator = Arc::new(FlowMetadataAllocator::with_noop_peer_allocator(
@@ -532,10 +555,10 @@ impl StartCommand {
             ddl_manager
         };
 
-        let procedure_executor = Arc::new(LocalProcedureExecutor::new(
-            Arc::new(ddl_manager),
-            procedure_manager.clone(),
-        ));
+        let procedure_executor = creator
+            .procedure_executor_creator
+            .create(Arc::new(ddl_manager), procedure_manager.clone())
+            .await?;
 
         let fe_instance = FrontendBuilder::new(
             fe_opts.clone(),
@@ -568,7 +591,7 @@ impl StartCommand {
             kv_backend.clone(),
             layered_cache_registry.clone(),
             procedure_executor,
-            node_manager,
+            node_manager.clone(),
         )
         .await
         .context(StartFlownodeSnafu)?;
@@ -584,14 +607,20 @@ impl StartCommand {
             heartbeat_task: None,
         };
 
-        Ok(Instance {
+        let instance = Instance {
             datanode,
             frontend,
             flownode,
             procedure_manager,
             wal_provider,
-            _guard: guard,
-        })
+            _guard: vec![],
+        };
+        let result = InstanceCreatorResult {
+            kv_backend,
+            node_manager,
+            table_id_allocator,
+        };
+        Ok((instance, result))
     }
 
     pub async fn create_table_metadata_manager(
@@ -608,6 +637,131 @@ impl StartCommand {
     }
 }
 
+/// Creates the [`NodeManagerRef`] used while building the Standalone instance.
+#[async_trait]
+pub trait NodeManagerCreator {
+    /// Builds the node manager from the kv backend and the region and flow servers passed in.
+    async fn create(
+        &self,
+        kv_backend: &KvBackendRef,
+        region_server: RegionServer,
+        flow_server: FlownodeRef,
+    ) -> Result<NodeManagerRef>;
+}
+
+/// The default [`NodeManagerCreator`]: wraps the two servers in a [`StandaloneDatanodeManager`]
+/// and ignores the kv backend.
+pub struct DefaultNodeManagerCreator;
+
+#[async_trait]
+impl NodeManagerCreator for DefaultNodeManagerCreator {
+    async fn create(
+        &self,
+        _: &KvBackendRef,
+        region_server: RegionServer,
+        flow_server: FlownodeRef,
+    ) -> Result<NodeManagerRef> {
+        Ok(Arc::new(StandaloneDatanodeManager {
+            region_server,
+            flow_server,
+        }))
+    }
+}
+
+/// Creates the table id allocator used while building the Standalone instance.
+pub trait TableIdAllocatorCreator {
+    /// Builds the table id sequence on top of `kv_backend`.
+    fn create(&self, kv_backend: &KvBackendRef) -> Arc<Sequence>;
+}
+
+/// The default [`TableIdAllocatorCreator`]: a `TABLE_ID_SEQ` sequence that starts at
+/// `MIN_USER_TABLE_ID` and uses a step of 10.
+///
+/// It is public, like the other default creators, so that callers of [`InstanceCreator::new`]
+/// can keep this allocator while swapping out other components.
+pub struct DefaultTableIdAllocatorCreator;
+
+impl TableIdAllocatorCreator for DefaultTableIdAllocatorCreator {
+    fn create(&self, kv_backend: &KvBackendRef) -> Arc<Sequence> {
+        Arc::new(
+            SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone())
+                .initial(MIN_USER_TABLE_ID as u64)
+                .step(10)
+                .build(),
+        )
+    }
+}
+
+/// Creates the [`ProcedureExecutorRef`] used while building the Standalone instance.
+#[async_trait]
+pub trait ProcedureExecutorCreator {
+    /// Builds the procedure executor from the DDL manager and the procedure manager.
+    async fn create(
+        &self,
+        ddl_manager: DdlManagerRef,
+        procedure_manager: ProcedureManagerRef,
+    ) -> Result<ProcedureExecutorRef>;
+}
+
+/// The default [`ProcedureExecutorCreator`]: returns a [`LocalProcedureExecutor`].
+pub struct DefaultProcedureExecutorCreator;
+
+#[async_trait]
+impl ProcedureExecutorCreator for DefaultProcedureExecutorCreator {
+    async fn create(
+        &self,
+        ddl_manager: DdlManagerRef,
+        procedure_manager: ProcedureManagerRef,
+    ) -> Result<ProcedureExecutorRef> {
+        Ok(Arc::new(LocalProcedureExecutor::new(
+            ddl_manager,
+            procedure_manager,
+        )))
+    }
+}
+
+/// `InstanceCreator` groups the component creators used for building the Standalone
+/// instance, so that callers can customize how the instance is built.
+pub struct InstanceCreator {
+    node_manager_creator: Box<dyn NodeManagerCreator>,
+    table_id_allocator_creator: Box<dyn TableIdAllocatorCreator>,
+    procedure_executor_creator: Box<dyn ProcedureExecutorCreator>,
+}
+
+impl InstanceCreator {
+    /// Groups the three given creators; use [`InstanceCreator::default`] for the stock ones.
+    pub fn new(
+        node_manager_creator: Box<dyn NodeManagerCreator>,
+        table_id_allocator_creator: Box<dyn TableIdAllocatorCreator>,
+        procedure_executor_creator: Box<dyn ProcedureExecutorCreator>,
+    ) -> Self {
+        Self {
+            node_manager_creator,
+            table_id_allocator_creator,
+            procedure_executor_creator,
+        }
+    }
+}
+
+impl Default for InstanceCreator {
+    /// Uses the `Default*Creator` implementation for every component.
+    fn default() -> Self {
+        Self {
+            node_manager_creator: Box::new(DefaultNodeManagerCreator),
+            table_id_allocator_creator: Box::new(DefaultTableIdAllocatorCreator),
+            procedure_executor_creator: Box::new(DefaultProcedureExecutorCreator),
+        }
+    }
+}
+
+/// `InstanceCreatorResult` is meant to be used together with [InstanceCreator].
+/// It holds the created components, and other important ones, so that callers can reuse them.
+pub struct InstanceCreatorResult {
+    pub kv_backend: KvBackendRef,
+    pub node_manager: NodeManagerRef,
+    pub table_id_allocator: Arc<Sequence>,
+}
+
 #[cfg(test)]
 mod tests {
     use std::default::Default;
diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template
index 4cb0423c72..3ec8a2f695 100644
--- a/tests/conf/datanode-test.toml.template
+++ b/tests/conf/datanode-test.toml.template
@@ -28,7 +28,7 @@ type = 'File'
 data_home = '{data_home}'
 
 [meta_client_options]
-metasrv_addrs = ['{metasrv_addr}']
+metasrv_addrs = ['{addrs.metasrv_addr}']
 timeout_millis = 3000
 connect_timeout_millis = 5000
 tcp_nodelay = false
diff --git a/tests/conf/frontend-test.toml.template b/tests/conf/frontend-test.toml.template
index de4ce86adc..25d44ff6e4 100644
--- a/tests/conf/frontend-test.toml.template
+++ b/tests/conf/frontend-test.toml.template
@@ -1,3 +1,3 @@
 [grpc]
-bind_addr = "{grpc_addr}"
-server_addr = "{grpc_addr}"
+bind_addr = "{addrs.grpc_addr}"
+server_addr = "{addrs.grpc_addr}"
diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template
index 509eac7ca6..50c014e991 100644
--- a/tests/conf/standalone-test.toml.template
+++ b/tests/conf/standalone-test.toml.template
@@ -26,12 +26,12 @@ type = 'File'
 data_home = '{data_home}'
 
 [grpc]
-bind_addr = '{grpc_addr}'
+bind_addr = '{addrs.grpc_addr}'
 runtime_size = 8
 
 [mysql]
 enable = true
-addr = "{mysql_addr}"
+addr = "{addrs.mysql_addr}"
 runtime_size = 2
 prepared_stmt_cache_size= 10000
 
@@ -40,7 +40,7 @@ mode = "disable"
 
 [postgres]
 enable = true
-addr = "{postgres_addr}"
+addr = "{addrs.postgres_addr}"
 runtime_size = 2
 
 [procedure]
diff --git a/tests/runner/src/server_mode.rs b/tests/runner/src/server_mode.rs
index 172baf32ff..1f7cb72bf4 100644
--- a/tests/runner/src/server_mode.rs
+++ b/tests/runner/src/server_mode.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::path::Path;
 use std::sync::{Mutex, OnceLock};
 
@@ -96,15 +96,7 @@ struct ConfigContext {
     use_etcd: bool,
     store_addrs: String,
     instance_id: usize,
-    // for following addrs, leave it empty if not needed
-    // required for datanode
-    metasrv_addr: String,
-    // for frontend and standalone
-    grpc_addr: String,
-    // for standalone
-    mysql_addr: String,
-    // for standalone
-    postgres_addr: String,
+    addrs: HashMap<String, String>,
     // enable flat format for storage engine
     enable_flat_format: bool,
 }
@@ -275,40 +267,26 @@ impl ServerMode {
         let procedure_dir = data_home.join("procedure").display().to_string();
 
         // Get the required addresses based on server mode
-        let (metasrv_addr, grpc_addr, mysql_addr, postgres_addr) = match self {
+        let addrs: HashMap<String, String> = match self {
             ServerMode::Standalone {
                 rpc_bind_addr,
                 mysql_addr,
                 postgres_addr,
-                ..
-            } => (
-                String::new(),
-                rpc_bind_addr.clone(),
-                mysql_addr.clone(),
-                postgres_addr.clone(),
-            ),
-            ServerMode::Frontend {
-                rpc_bind_addr,
-                mysql_addr,
-                postgres_addr,
-                ..
-            } => (
-                String::new(),
-                rpc_bind_addr.clone(),
-                mysql_addr.clone(),
-                postgres_addr.clone(),
-            ),
-            ServerMode::Datanode {
-                rpc_bind_addr,
-                metasrv_addr,
-                ..
-            } => (
-                metasrv_addr.clone(),
-                rpc_bind_addr.clone(),
-                String::new(),
-                String::new(),
-            ),
-            _ => (String::new(), String::new(), String::new(), String::new()),
+                http_addr,
+            } => [
+                ("http_addr".to_string(), http_addr.clone()),
+                ("grpc_addr".to_string(), rpc_bind_addr.clone()),
+                ("mysql_addr".to_string(), mysql_addr.clone()),
+                ("postgres_addr".to_string(), postgres_addr.clone()),
+            ]
+            .into(),
+            ServerMode::Frontend { rpc_bind_addr, .. } => {
+                [("grpc_addr".to_string(), rpc_bind_addr.clone())].into()
+            }
+            ServerMode::Datanode { metasrv_addr, .. } => {
+                [("metasrv_addr".to_string(), metasrv_addr.clone())].into()
+            }
+            _ => HashMap::new(),
         };
 
         let ctx = ConfigContext {
@@ -326,10 +304,7 @@ impl ServerMode {
                 .collect::<Vec<_>>()
                 .join(","),
             instance_id: id,
-            metasrv_addr,
-            grpc_addr,
-            mysql_addr,
-            postgres_addr,
+            addrs,
             enable_flat_format: db_ctx.store_config().enable_flat_format,
         };
 

From e215851c8a6487f8ba7e4e385b57f3ceaae11d86 Mon Sep 17 00:00:00 2001
From: Yingwen <realevenyag@gmail.com>
Date: Fri, 13 Mar 2026 17:44:13 +0800
Subject: [PATCH 09/42] refactor: unify flush and compaction to always use
 FlatSource (#7799)

* feat: support write flat as primary key format

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: migrate flush to always use FlatSource

Add FormatType propagation in SstWriteRequest and use it to choose
Flat vs PrimaryKey write paths (write_all_flat vs
write_all_flat_as_primary_key) in AccessLayer and WriteCache. Make
compactor and flush derive the sst_write_format from region options or
engine config. Simplify flush logic and remove the old memtable_source
helper. Update tests to set default sst_write_format.

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: compaction use flat source

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: read parquet sequentially as flat batches

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove new_batch_with_binary in favor of new_record_batch_with_binary

Replace PrimaryKeyWriteFormat with FlatWriteFormat in test_read_large_binary
test and use new_record_batch_with_binary directly, removing the now-unused
new_batch_with_binary function and its BinaryArray import.

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add tests for PrimaryKeyWriteFormat::convert_flat_batch

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove Either from SstWriteRequest

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: handle index build mode

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: consider sparse encoding and last non null in flush

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add unit tests for field_column_start edge cases

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
---
 src/cmd/src/datanode/objbench.rs         |  13 +-
 src/mito2/src/access_layer.rs            |  22 +-
 src/mito2/src/cache/write_cache.rs       |  50 +++--
 src/mito2/src/compaction.rs              |  19 +-
 src/mito2/src/compaction/compactor.rs    |  16 +-
 src/mito2/src/flush.rs                   | 166 +++++----------
 src/mito2/src/memtable/bulk.rs           |   9 +-
 src/mito2/src/read/prune.rs              |   5 -
 src/mito2/src/read/seq_scan.rs           |  56 -----
 src/mito2/src/sst.rs                     | 183 +---------------
 src/mito2/src/sst/index.rs               |  23 +-
 src/mito2/src/sst/parquet.rs             | 256 ++++++++++++-----------
 src/mito2/src/sst/parquet/flat_format.rs | 105 +++++++++-
 src/mito2/src/sst/parquet/format.rs      | 211 ++++++++++---------
 src/mito2/src/sst/parquet/reader.rs      | 124 +++--------
 src/mito2/src/sst/parquet/writer.rs      | 223 +++++++-------------
 src/mito2/src/test_util/sst_util.rs      |  91 +++++---
 17 files changed, 668 insertions(+), 904 deletions(-)

diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs
index d8f53b9d71..f6d8674d4c 100644
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -20,13 +20,14 @@ use clap::Parser;
 use colored::Colorize;
 use datanode::config::RegionEngineConfig;
 use datanode::store;
-use either::Either;
+use futures::stream;
 use mito2::access_layer::{
     AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType,
 };
 use mito2::cache::{CacheManager, CacheManagerRef};
 use mito2::config::{FulltextIndexConfig, MitoConfig, Mode};
-use mito2::read::Source;
+use mito2::read::FlatSource;
+use mito2::sst::FormatType;
 use mito2::sst::file::{FileHandle, FileMeta};
 use mito2::sst::file_purger::{FilePurger, FilePurgerRef};
 use mito2::sst::index::intermediate::IntermediateManager;
@@ -210,6 +211,7 @@ impl ObjbenchCommand {
             object_store.clone(),
         )
         .expected_metadata(Some(region_meta.clone()))
+        .flat_format(true)
         .build()
         .await
         .map_err(|e| {
@@ -231,6 +233,10 @@ impl ObjbenchCommand {
         let reader_build_elapsed = reader_build_start.elapsed();
         let total_rows = reader.parquet_metadata().file_metadata().num_rows();
         println!("{} Reader built in {:?}", "✓".green(), reader_build_elapsed);
+        let reader_stream = Box::pin(stream::try_unfold(reader, |mut reader| async move {
+            let batch = reader.next_record_batch().await?;
+            Ok(batch.map(|batch| (batch, reader)))
+        }));
 
         // Build write request
         let fulltext_index_config = FulltextIndexConfig {
@@ -241,10 +247,11 @@ impl ObjbenchCommand {
         let write_req = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata: region_meta,
-            source: Either::Left(Source::Reader(Box::new(reader))),
+            source: FlatSource::Stream(reader_stream),
             cache_manager,
             storage: None,
             max_sequence: None,
+            sst_write_format: FormatType::PrimaryKey,
             index_options: Default::default(),
             index_config: mito_engine_config.index.clone(),
             inverted_index_config: MitoConfig::default().inverted_index,
diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs
index 92c8a3bc36..231285215e 100644
--- a/src/mito2/src/access_layer.rs
+++ b/src/mito2/src/access_layer.rs
@@ -17,7 +17,6 @@ use std::time::{Duration, Instant};
 
 use async_stream::try_stream;
 use common_time::Timestamp;
-use either::Either;
 use futures::{Stream, TryStreamExt};
 use object_store::services::Fs;
 use object_store::util::{join_dir, with_instrument_layers};
@@ -37,7 +36,7 @@ use crate::error::{
     CleanDirSnafu, DeleteIndexSnafu, DeleteIndexesSnafu, DeleteSstsSnafu, OpenDalSnafu, Result,
 };
 use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED};
-use crate::read::{FlatSource, Source};
+use crate::read::FlatSource;
 use crate::region::options::IndexOptions;
 use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId};
 use crate::sst::index::IndexerBuilderImpl;
@@ -47,7 +46,7 @@ use crate::sst::location::{self, region_dir_from_table_dir};
 use crate::sst::parquet::reader::ParquetReaderBuilder;
 use crate::sst::parquet::writer::ParquetWriter;
 use crate::sst::parquet::{SstInfo, WriteOptions};
-use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY};
+use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FormatType};
 
 pub type AccessLayerRef = Arc<AccessLayer>;
 /// SST write results.
@@ -391,15 +390,19 @@ impl AccessLayer {
             )
             .await
             .with_file_cleaner(cleaner);
-            match request.source {
-                Either::Left(source) => {
+            match request.sst_write_format {
+                FormatType::PrimaryKey => {
                     writer
-                        .write_all(source, request.max_sequence, write_opts)
+                        .write_all_flat_as_primary_key(
+                            request.source,
+                            request.max_sequence,
+                            write_opts,
+                        )
                         .await?
                 }
-                Either::Right(flat_source) => {
+                FormatType::Flat => {
                     writer
-                        .write_all_flat(flat_source, request.max_sequence, write_opts)
+                        .write_all_flat(request.source, request.max_sequence, write_opts)
                         .await?
                 }
             }
@@ -520,11 +523,12 @@ pub enum OperationType {
 pub struct SstWriteRequest {
     pub op_type: OperationType,
     pub metadata: RegionMetadataRef,
-    pub source: Either<Source, FlatSource>,
+    pub source: FlatSource,
     pub cache_manager: CacheManagerRef,
     #[allow(dead_code)]
     pub storage: Option<String>,
     pub max_sequence: Option<SequenceNumber>,
+    pub sst_write_format: FormatType,
 
     /// Configs for index
     pub index_options: IndexOptions,
diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs
index a28df3f54c..3d373efe91 100644
--- a/src/mito2/src/cache/write_cache.rs
+++ b/src/mito2/src/cache/write_cache.rs
@@ -244,15 +244,19 @@ impl WriteCache {
         .await
         .with_file_cleaner(cleaner);
 
-        let sst_info = match write_request.source {
-            either::Left(source) => {
+        let sst_info = match write_request.sst_write_format {
+            crate::sst::FormatType::PrimaryKey => {
                 writer
-                    .write_all(source, write_request.max_sequence, write_opts)
+                    .write_all_flat_as_primary_key(
+                        write_request.source,
+                        write_request.max_sequence,
+                        write_opts,
+                    )
                     .await?
             }
-            either::Right(flat_source) => {
+            crate::sst::FormatType::Flat => {
                 writer
-                    .write_all_flat(flat_source, write_request.max_sequence, write_opts)
+                    .write_all_flat(write_request.source, write_request.max_sequence, write_opts)
                     .await?
             }
         };
@@ -509,12 +513,13 @@ mod tests {
     use crate::cache::test_util::{assert_parquet_metadata_equal, new_fs_store};
     use crate::cache::{CacheManager, CacheStrategy};
     use crate::error::InvalidBatchSnafu;
-    use crate::read::Source;
+    use crate::read::FlatSource;
     use crate::region::options::IndexOptions;
     use crate::sst::parquet::reader::ParquetReaderBuilder;
     use crate::test_util::TestEnv;
     use crate::test_util::sst_util::{
-        new_batch_by_range, new_source, sst_file_handle_with_file_id, sst_region_metadata,
+        new_flat_source_from_record_batches, new_record_batch_by_range,
+        sst_file_handle_with_file_id, sst_region_metadata,
     };
 
     #[tokio::test]
@@ -532,21 +537,22 @@ mod tests {
             .create_write_cache(local_store.clone(), ReadableSize::mb(10))
             .await;
 
-        // Create Source
+        // Create source.
         let metadata = Arc::new(sst_region_metadata());
         let region_id = metadata.region_id;
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
 
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata,
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: Default::default(),
             index_options: IndexOptions::default(),
             index_config: Default::default(),
@@ -636,19 +642,20 @@ mod tests {
         // Create source
         let metadata = Arc::new(sst_region_metadata());
 
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
 
         // Write to local cache and upload sst to mock remote store
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata,
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: cache_manager.clone(),
             index_options: IndexOptions::default(),
             index_config: Default::default(),
@@ -715,9 +722,9 @@ mod tests {
         let metadata = Arc::new(sst_region_metadata());
 
         // Creates a source that can return an error to abort the writer.
-        let source = Source::Iter(Box::new(
+        let source = FlatSource::Iter(Box::new(
             [
-                Ok(new_batch_by_range(&["a", "d"], 0, 60)),
+                Ok(new_record_batch_by_range(&["a", "d"], 0, 60)),
                 InvalidBatchSnafu {
                     reason: "Abort the writer",
                 }
@@ -730,9 +737,10 @@ mod tests {
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata,
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: cache_manager.clone(),
             index_options: IndexOptions::default(),
             index_config: Default::default(),
diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs
index 6d51d1dd59..ba6957fdae 100644
--- a/src/mito2/src/compaction.rs
+++ b/src/mito2/src/compaction.rs
@@ -58,10 +58,10 @@ use crate::error::{
     TimeRangePredicateOverflowSnafu, TimeoutSnafu,
 };
 use crate::metrics::{COMPACTION_STAGE_ELAPSED, INFLIGHT_COMPACTION_COUNT};
+use crate::read::BoxedRecordBatchStream;
 use crate::read::projection::ProjectionMapper;
 use crate::read::scan_region::{PredicateGroup, ScanInput};
 use crate::read::seq_scan::SeqScan;
-use crate::read::{BoxedBatchReader, BoxedRecordBatchStream};
 use crate::region::options::{MergeMode, RegionOptions};
 use crate::region::version::VersionControlRef;
 use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState};
@@ -828,7 +828,7 @@ pub struct SerializedCompactionOutput {
     output_time_range: Option<TimestampRange>,
 }
 
-/// Builders to create [BoxedBatchReader] for compaction.
+/// Builders to create [BoxedRecordBatchStream] for compaction.
 struct CompactionSstReaderBuilder<'a> {
     metadata: RegionMetadataRef,
     sst_layer: AccessLayerRef,
@@ -841,24 +841,17 @@ struct CompactionSstReaderBuilder<'a> {
 }
 
 impl CompactionSstReaderBuilder<'_> {
-    /// Builds [BoxedBatchReader] that reads all SST files and yields batches in primary key order.
-    async fn build_sst_reader(self) -> Result<BoxedBatchReader> {
-        let scan_input = self.build_scan_input(false)?.with_compaction(true);
-
-        SeqScan::new(scan_input).build_reader_for_compaction().await
-    }
-
     /// Builds [BoxedRecordBatchStream] that reads all SST files and yields batches in flat format for compaction.
     async fn build_flat_sst_reader(self) -> Result<BoxedRecordBatchStream> {
-        let scan_input = self.build_scan_input(true)?.with_compaction(true);
+        let scan_input = self.build_scan_input()?.with_compaction(true);
 
         SeqScan::new(scan_input)
             .build_flat_reader_for_compaction()
             .await
     }
 
-    fn build_scan_input(self, flat_format: bool) -> Result<ScanInput> {
-        let mapper = ProjectionMapper::all(&self.metadata, flat_format)?;
+    fn build_scan_input(self) -> Result<ScanInput> {
+        let mapper = ProjectionMapper::all(&self.metadata, true)?;
         let mut scan_input = ScanInput::new(self.sst_layer, mapper)
             .with_files(self.inputs.to_vec())
             .with_append_mode(self.append_mode)
@@ -868,7 +861,7 @@ impl CompactionSstReaderBuilder<'_> {
             // We ignore file not found error during compaction.
             .with_ignore_file_not_found(true)
             .with_merge_mode(self.merge_mode)
-            .with_flat_format(flat_format);
+            .with_flat_format(true);
 
         // This serves as a workaround of https://github.com/GreptimeTeam/greptimedb/issues/3944
         // by converting time ranges into predicate.
diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs
index 1876972b0d..b03e6415e8 100644
--- a/src/mito2/src/compaction/compactor.rs
+++ b/src/mito2/src/compaction/compactor.rs
@@ -43,7 +43,7 @@ use crate::error::{
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
 use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
 use crate::metrics;
-use crate::read::{FlatSource, Source};
+use crate::read::FlatSource;
 use crate::region::options::RegionOptions;
 use crate::region::version::VersionRef;
 use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState};
@@ -356,13 +356,8 @@ impl DefaultCompactor {
             time_range: output.output_time_range,
             merge_mode,
         };
-        let source = if flat_format {
-            let reader = builder.build_flat_sst_reader().await?;
-            Either::Right(FlatSource::Stream(reader))
-        } else {
-            let reader = builder.build_sst_reader().await?;
-            Either::Left(Source::Reader(reader))
-        };
+        let reader = builder.build_flat_sst_reader().await?;
+        let source = FlatSource::Stream(reader);
         let mut metrics = Metrics::new(WriteType::Compaction);
         let region_metadata = compaction_region.region_metadata.clone();
         let sst_infos = compaction_region
@@ -375,6 +370,11 @@ impl DefaultCompactor {
                     cache_manager: compaction_region.cache_manager.clone(),
                     storage,
                     max_sequence: max_sequence.map(NonZero::get),
+                    sst_write_format: if flat_format {
+                        FormatType::Flat
+                    } else {
+                        FormatType::PrimaryKey
+                    },
                     index_options,
                     index_config,
                     inverted_index_config,
diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs
index 0c16544b6e..fedac95d27 100644
--- a/src/mito2/src/flush.rs
+++ b/src/mito2/src/flush.rs
@@ -22,7 +22,6 @@ use std::time::Instant;
 
 use common_telemetry::{debug, error, info};
 use datatypes::arrow::datatypes::SchemaRef;
-use either::Either;
 use partition::expr::PartitionExpr;
 use smallvec::{SmallVec, smallvec};
 use snafu::ResultExt;
@@ -41,18 +40,14 @@ use crate::error::{
 };
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
 use crate::memtable::bulk::ENCODE_ROW_THRESHOLD;
-use crate::memtable::{
-    BoxedRecordBatchIterator, EncodedRange, IterBuilder, MemtableRanges, RangesOptions,
-};
+use crate::memtable::{BoxedRecordBatchIterator, EncodedRange, MemtableRanges, RangesOptions};
 use crate::metrics::{
     FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_FAILURE_TOTAL, FLUSH_FILE_TOTAL, FLUSH_REQUESTS_TOTAL,
     INFLIGHT_FLUSH_COUNT,
 };
-use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
+use crate::read::FlatSource;
 use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow};
 use crate::read::flat_merge::FlatMergeIterator;
-use crate::read::merge::MergeReaderBuilder;
-use crate::read::{FlatSource, Source};
 use crate::region::options::{IndexOptions, MergeMode, RegionOptions};
 use crate::region::version::{VersionControlData, VersionControlRef, VersionRef};
 use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState, parse_partition_expr};
@@ -62,8 +57,10 @@ use crate::request::{
 };
 use crate::schedule::scheduler::{Job, SchedulerRef};
 use crate::sst::file::FileMeta;
-use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions};
-use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
+use crate::sst::parquet::{
+    DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions, flat_format,
+};
+use crate::sst::{FlatSchemaOptions, FormatType, to_flat_sst_arrow_schema};
 use crate::worker::WorkerListener;
 
 /// Global write buffer (memtable) manager.
@@ -480,78 +477,29 @@ impl RegionFlushTask {
             // the counter may have more series than the actual series count.
             series_count += memtable_series_count;
 
-            if mem_ranges.is_record_batch() {
-                let flush_start = Instant::now();
-                let FlushFlatMemResult {
-                    num_encoded,
-                    num_sources,
-                    results,
-                } = self
-                    .flush_flat_mem_ranges(version, &write_opts, mem_ranges)
-                    .await?;
-                encoded_part_count += num_encoded;
-                for (source_idx, result) in results.into_iter().enumerate() {
-                    let (max_sequence, ssts_written, metrics) = result?;
-                    if ssts_written.is_empty() {
-                        // No data written.
-                        continue;
-                    }
-
-                    common_telemetry::debug!(
-                        "Region {} flush one memtable {} {}/{}, metrics: {:?}",
-                        self.region_id,
-                        memtable_id,
-                        source_idx,
-                        num_sources,
-                        metrics
-                    );
-
-                    flush_metrics = flush_metrics.merge(metrics);
-
-                    file_metas.extend(ssts_written.into_iter().map(|sst_info| {
-                        flushed_bytes += sst_info.file_size;
-                        Self::new_file_meta(
-                            self.region_id,
-                            max_sequence,
-                            sst_info,
-                            partition_expr.clone(),
-                        )
-                    }));
-                }
-
-                common_telemetry::debug!(
-                    "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}",
-                    self.region_id,
-                    num_sources,
-                    memtable_id,
-                    num_mem_ranges,
-                    num_encoded,
-                    num_mem_rows,
-                    flush_start.elapsed(),
-                    compact_cost,
-                );
-            } else {
-                let max_sequence = mem_ranges.max_sequence();
-                let source = memtable_source(mem_ranges, &version.options).await?;
-
-                // Flush to level 0.
-                let source = Either::Left(source);
-                let write_request = self.new_write_request(version, max_sequence, source);
-
-                let mut metrics = Metrics::new(WriteType::Flush);
-                let ssts_written = self
-                    .access_layer
-                    .write_sst(write_request, &write_opts, &mut metrics)
-                    .await?;
-                FLUSH_FILE_TOTAL.inc_by(ssts_written.len() as u64);
+            let flush_start = Instant::now();
+            let FlushFlatMemResult {
+                num_encoded,
+                num_sources,
+                results,
+            } = self
+                .flush_flat_mem_ranges(version, &write_opts, mem_ranges)
+                .await?;
+            encoded_part_count += num_encoded;
+            for (source_idx, result) in results.into_iter().enumerate() {
+                let (max_sequence, ssts_written, metrics) = result?;
                 if ssts_written.is_empty() {
                     // No data written.
                     continue;
                 }
 
-                debug!(
-                    "Region {} flush one memtable, num_mem_ranges: {}, num_rows: {}, metrics: {:?}",
-                    self.region_id, num_mem_ranges, num_mem_rows, metrics
+                common_telemetry::debug!(
+                    "Region {} flush one memtable {} {}/{}, metrics: {:?}",
+                    self.region_id,
+                    memtable_id,
+                    source_idx,
+                    num_sources,
+                    metrics
                 );
 
                 flush_metrics = flush_metrics.merge(metrics);
@@ -565,7 +513,19 @@ impl RegionFlushTask {
                         partition_expr.clone(),
                     )
                 }));
-            };
+            }
+
+            common_telemetry::debug!(
+                "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}",
+                self.region_id,
+                num_sources,
+                memtable_id,
+                num_mem_ranges,
+                num_encoded,
+                num_mem_rows,
+                flush_start.elapsed(),
+                compact_cost,
+            );
         }
 
         Ok(DoFlushMemtablesResult {
@@ -587,16 +547,17 @@ impl RegionFlushTask {
             &version.metadata,
             &FlatSchemaOptions::from_encoding(version.metadata.primary_key_encoding),
         );
+        let field_column_start =
+            flat_format::field_column_start(&version.metadata, batch_schema.fields().len());
         let flat_sources = memtable_flat_sources(
             batch_schema,
             mem_ranges,
             &version.options,
-            version.metadata.primary_key.len(),
+            field_column_start,
         )?;
         let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len());
         let num_encoded = flat_sources.encoded.len();
         for (source, max_sequence) in flat_sources.sources {
-            let source = Either::Right(source);
             let write_request = self.new_write_request(version, max_sequence, source);
             let access_layer = self.access_layer.clone();
             let write_opts = write_opts.clone();
@@ -667,8 +628,13 @@ impl RegionFlushTask {
         &self,
         version: &VersionRef,
         max_sequence: u64,
-        source: Either<Source, FlatSource>,
+        source: FlatSource,
     ) -> SstWriteRequest {
+        let flat_format = version
+            .options
+            .sst_format
+            .map(|f| f == FormatType::Flat)
+            .unwrap_or(self.engine_config.default_experimental_flat_format);
         SstWriteRequest {
             op_type: OperationType::Flush,
             metadata: version.metadata.clone(),
@@ -676,6 +642,11 @@ impl RegionFlushTask {
             cache_manager: self.cache_manager.clone(),
             storage: version.options.storage.clone(),
             max_sequence: Some(max_sequence),
+            sst_write_format: if flat_format {
+                FormatType::Flat
+            } else {
+                FormatType::PrimaryKey
+            },
             index_options: self.index_options.clone(),
             index_config: self.engine_config.index.clone(),
             inverted_index_config: self.engine_config.inverted_index.clone(),
@@ -722,41 +693,6 @@ struct DoFlushMemtablesResult {
     flush_metrics: Metrics,
 }
 
-/// Returns a [Source] for the given memtable.
-async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) -> Result<Source> {
-    let source = if mem_ranges.ranges.len() == 1 {
-        let only_range = mem_ranges.ranges.into_values().next().unwrap();
-        let iter = only_range.build_iter()?;
-        Source::Iter(iter)
-    } else {
-        // todo(hl): a workaround since sync version of MergeReader is wip.
-        let sources = mem_ranges
-            .ranges
-            .into_values()
-            .map(|r| r.build_iter().map(Source::Iter))
-            .collect::<Result<Vec<_>>>()?;
-        let merge_reader = MergeReaderBuilder::from_sources(sources).build().await?;
-        let maybe_dedup = if options.append_mode {
-            // no dedup in append mode
-            Box::new(merge_reader) as _
-        } else {
-            // dedup according to merge mode
-            match options.merge_mode.unwrap_or(MergeMode::LastRow) {
-                MergeMode::LastRow => {
-                    Box::new(DedupReader::new(merge_reader, LastRow::new(false), None)) as _
-                }
-                MergeMode::LastNonNull => Box::new(DedupReader::new(
-                    merge_reader,
-                    LastNonNull::new(false),
-                    None,
-                )) as _,
-            }
-        };
-        Source::Reader(maybe_dedup)
-    };
-    Ok(source)
-}
-
 struct FlatSources {
     sources: SmallVec<[(FlatSource, SequenceNumber); 4]>,
     encoded: SmallVec<[(EncodedRange, SequenceNumber); 4]>,
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs
index cf2ced06fe..6056a42013 100644
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -57,7 +57,7 @@ use crate::memtable::{
 use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow};
 use crate::read::flat_merge::FlatMergeIterator;
 use crate::region::options::MergeMode;
-use crate::sst::parquet::format::FIXED_POS_COLUMN_NUM;
+use crate::sst::parquet::flat_format::field_column_start;
 use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE};
 use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
 
@@ -1186,13 +1186,8 @@ impl MemtableCompactor {
                     Box::new(dedup_iter)
                 }
                 MergeMode::LastNonNull => {
-                    // Calculates field column start: total columns - fixed columns - field columns
-                    // Field column count = total metadata columns - time index column - primary key columns
-                    let field_column_count =
-                        metadata.column_metadatas.len() - 1 - metadata.primary_key.len();
-                    let total_columns = arrow_schema.fields().len();
                     let field_column_start =
-                        total_columns - FIXED_POS_COLUMN_NUM - field_column_count;
+                        field_column_start(metadata, arrow_schema.fields().len());
 
                     let dedup_iter = FlatDedupIterator::new(
                         merged_iter,
diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs
index 29ded3d49a..2f9fa002d4 100644
--- a/src/mito2/src/read/prune.rs
+++ b/src/mito2/src/read/prune.rs
@@ -80,11 +80,6 @@ impl PruneReader {
         }
     }
 
-    pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) {
-        self.source = source;
-        self.skip_fields = skip_fields;
-    }
-
     /// Merge metrics with the inner reader and return the merged metrics.
     pub(crate) fn metrics(&self) -> ReaderMetrics {
         let mut metrics = self.metrics.clone();
diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs
index c13b40d111..d2be17cc83 100644
--- a/src/mito2/src/read/seq_scan.rs
+++ b/src/mito2/src/read/seq_scan.rs
@@ -128,28 +128,6 @@ impl SeqScan {
         Ok(Box::pin(futures::stream::iter(streams).flatten()))
     }
 
-    /// Builds a [BoxedBatchReader] from sequential scan for compaction.
-    ///
-    /// # Panics
-    /// Panics if the compaction flag is not set.
-    pub async fn build_reader_for_compaction(&self) -> Result<BoxedBatchReader> {
-        assert!(self.stream_ctx.input.compaction);
-
-        let metrics_set = ExecutionPlanMetricsSet::new();
-        let part_metrics = self.new_partition_metrics(false, &metrics_set, 0);
-        debug_assert_eq!(1, self.properties.partitions.len());
-        let partition_ranges = &self.properties.partitions[0];
-
-        let reader = Self::merge_all_ranges_for_compaction(
-            &self.stream_ctx,
-            partition_ranges,
-            &part_metrics,
-            self.pruner.clone(),
-        )
-        .await?;
-        Ok(Box::new(reader))
-    }
-
     /// Builds a [BoxedRecordBatchStream] from sequential scan for flat format compaction.
     ///
     /// # Panics
@@ -172,40 +150,6 @@ impl SeqScan {
         Ok(reader)
     }
 
-    /// Builds a merge reader that reads all ranges.
-    /// Callers MUST not split ranges before calling this method.
-    async fn merge_all_ranges_for_compaction(
-        stream_ctx: &Arc<StreamContext>,
-        partition_ranges: &[PartitionRange],
-        part_metrics: &PartitionMetrics,
-        pruner: Arc<Pruner>,
-    ) -> Result<BoxedBatchReader> {
-        pruner.add_partition_ranges(partition_ranges);
-        let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges));
-
-        let mut sources = Vec::new();
-        for part_range in partition_ranges {
-            build_sources(
-                stream_ctx,
-                part_range,
-                true,
-                part_metrics,
-                partition_pruner.clone(),
-                &mut sources,
-                None,
-            )
-            .await?;
-        }
-
-        common_telemetry::debug!(
-            "Build reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}",
-            stream_ctx.input.mapper.metadata().region_id,
-            partition_ranges.len(),
-            sources.len()
-        );
-        Self::build_reader_from_sources(stream_ctx, sources, None, None).await
-    }
-
     /// Builds a merge reader that reads all flat ranges.
     /// Callers MUST not split ranges before calling this method.
     async fn merge_all_flat_ranges_for_compaction(
diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs
index 78e4c563b1..94bc1feea8 100644
--- a/src/mito2/src/sst.rs
+++ b/src/mito2/src/sst.rs
@@ -31,7 +31,6 @@ use store_api::storage::consts::{
     OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME,
 };
 
-use crate::read::Batch;
 use crate::sst::parquet::flat_format::time_index_column_index;
 
 pub mod file;
@@ -260,33 +259,6 @@ pub(crate) struct SeriesEstimator {
 }
 
 impl SeriesEstimator {
-    /// Updates the estimator with a new Batch.
-    ///
-    /// Since each Batch contains only one series, this increments the series count
-    /// and updates the last timestamp.
-    pub(crate) fn update(&mut self, batch: &Batch) {
-        let Some(last_ts) = batch.last_timestamp() else {
-            return;
-        };
-
-        // Checks if there's a boundary between the last batch and this batch
-        if let Some(prev_last_ts) = self.last_timestamp {
-            // If the first timestamp of this batch is less than the last timestamp
-            // we've seen, it indicates a new series
-            if let Some(first_ts) = batch.first_timestamp()
-                && first_ts.value() <= prev_last_ts
-            {
-                self.series_count += 1;
-            }
-        } else {
-            // First batch, counts as first series
-            self.series_count = 1;
-        }
-
-        // Updates the last timestamp
-        self.last_timestamp = Some(last_ts.value());
-    }
-
     /// Updates the estimator with a new record batch in flat format.
     ///
     /// This method examines the time index column to detect series boundaries.
@@ -340,43 +312,14 @@ impl SeriesEstimator {
 mod tests {
     use std::sync::Arc;
 
-    use api::v1::OpType;
     use datatypes::arrow::array::{
-        BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder,
-        UInt32Array, UInt64Array,
+        BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
+        UInt64Array,
     };
     use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
     use datatypes::arrow::record_batch::RecordBatch;
 
     use super::*;
-    use crate::read::{Batch, BatchBuilder};
-
-    fn new_batch(
-        primary_key: &[u8],
-        timestamps: &[i64],
-        sequences: &[u64],
-        op_types: &[OpType],
-    ) -> Batch {
-        let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
-        let sequences = Arc::new(UInt64Array::from(sequences.to_vec()));
-        let mut op_type_builder = UInt8Builder::with_capacity(op_types.len());
-        for op_type in op_types {
-            op_type_builder.append_value(*op_type as u8);
-        }
-        let op_types = Arc::new(UInt8Array::from(
-            op_types.iter().map(|op| *op as u8).collect::<Vec<_>>(),
-        ));
-
-        let mut builder = BatchBuilder::new(primary_key.to_vec());
-        builder
-            .timestamps_array(timestamps)
-            .unwrap()
-            .sequences_array(sequences)
-            .unwrap()
-            .op_types_array(op_types)
-            .unwrap();
-        builder.build().unwrap()
-    }
 
     fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch {
         // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type]
@@ -411,128 +354,6 @@ mod tests {
         RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap()
     }
 
-    #[test]
-    fn test_series_estimator_empty_batch() {
-        let mut estimator = SeriesEstimator::default();
-        let batch = new_batch(b"test", &[], &[], &[]);
-        estimator.update(&batch);
-        assert_eq!(0, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_single_batch() {
-        let mut estimator = SeriesEstimator::default();
-        let batch = new_batch(
-            b"test",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch);
-        assert_eq!(1, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_multiple_batches_same_series() {
-        let mut estimator = SeriesEstimator::default();
-
-        // First batch with timestamps 1, 2, 3
-        let batch1 = new_batch(
-            b"test",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        // Second batch with timestamps 4, 5, 6 (continuation)
-        let batch2 = new_batch(
-            b"test",
-            &[4, 5, 6],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(1, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_new_series_detected() {
-        let mut estimator = SeriesEstimator::default();
-
-        // First batch with timestamps 1, 2, 3
-        let batch1 = new_batch(
-            b"pk0",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        // Second batch with timestamps 2, 3, 4 (timestamp goes back, new series)
-        let batch2 = new_batch(
-            b"pk1",
-            &[2, 3, 4],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(2, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_equal_timestamp_boundary() {
-        let mut estimator = SeriesEstimator::default();
-
-        // First batch ending at timestamp 5
-        let batch1 = new_batch(
-            b"test",
-            &[1, 2, 5],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        // Second batch starting at timestamp 5 (equal, indicates new series)
-        let batch2 = new_batch(
-            b"test",
-            &[5, 6, 7],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(2, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_finish_resets_state() {
-        let mut estimator = SeriesEstimator::default();
-
-        let batch1 = new_batch(
-            b"test",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        assert_eq!(1, estimator.finish());
-
-        // After finish, state should be reset
-        let batch2 = new_batch(
-            b"test",
-            &[4, 5, 6],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(1, estimator.finish());
-    }
-
     #[test]
     fn test_series_estimator_flat_empty_batch() {
         let mut estimator = SeriesEstimator::default();
diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs
index 0df3229e9c..88aebfc001 100644
--- a/src/mito2/src/sst/index.rs
+++ b/src/mito2/src/sst/index.rs
@@ -58,7 +58,7 @@ use crate::error::{
 };
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
 use crate::metrics::INDEX_CREATE_MEMORY_USAGE;
-use crate::read::{Batch, BatchReader};
+use crate::read::Batch;
 use crate::region::options::IndexOptions;
 use crate::region::version::VersionControlRef;
 use crate::region::{ManifestContextRef, RegionLeaderState};
@@ -802,9 +802,9 @@ impl IndexBuildTask {
         if let Some(mut parquet_reader) = parquet_reader {
             // TODO(SNC123): optimize index batch
             loop {
-                match parquet_reader.next_batch().await {
-                    Ok(Some(mut batch)) => {
-                        indexer.update(&mut batch).await;
+                match parquet_reader.next_record_batch().await {
+                    Ok(Some(batch)) => {
+                        indexer.update_flat(&batch).await;
                     }
                     Ok(None) => break,
                     Err(e) => {
@@ -1227,7 +1227,9 @@ mod tests {
     use crate::sst::parquet::WriteOptions;
     use crate::test_util::memtable_util::EmptyMemtableBuilder;
     use crate::test_util::scheduler_util::SchedulerEnv;
-    use crate::test_util::sst_util::{new_batch_by_range, new_source, sst_region_metadata};
+    use crate::test_util::sst_util::{
+        new_flat_source_from_record_batches, new_record_batch_by_range, sst_region_metadata,
+    };
 
     struct MetaConfig {
         with_inverted: bool,
@@ -1358,19 +1360,20 @@ mod tests {
         env: &SchedulerEnv,
         build_mode: IndexBuildMode,
     ) -> SstInfo {
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         let mut index_config = MitoConfig::default().index;
         index_config.build_mode = build_mode;
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata: metadata.clone(),
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: Default::default(),
             index_options: IndexOptions::default(),
             index_config,
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index aa98b69176..1c5bfd9db0 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -110,6 +110,7 @@ mod tests {
         TimestampMillisecondArray, UInt8Array, UInt64Array,
     };
     use datatypes::arrow::datatypes::{DataType, Field, Schema, UInt32Type};
+    use datatypes::arrow::util::pretty::pretty_format_batches;
     use datatypes::prelude::ConcreteDataType;
     use datatypes::schema::{FulltextAnalyzer, FulltextBackend, FulltextOptions};
     use object_store::ObjectStore;
@@ -129,7 +130,7 @@ mod tests {
     use crate::cache::test_util::assert_parquet_metadata_equal;
     use crate::cache::{CacheManager, CacheStrategy, PageKey};
     use crate::config::IndexConfig;
-    use crate::read::{BatchBuilder, BatchReader, FlatSource};
+    use crate::read::FlatSource;
     use crate::region::options::{IndexOptions, InvertedIndexOptions};
     use crate::sst::file::{FileHandle, FileMeta, RegionFileId, RegionIndexId};
     use crate::sst::file_purger::NoopFilePurger;
@@ -137,19 +138,19 @@ mod tests {
     use crate::sst::index::fulltext_index::applier::builder::FulltextIndexApplierBuilder;
     use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
     use crate::sst::index::{IndexBuildType, Indexer, IndexerBuilder, IndexerBuilderImpl};
-    use crate::sst::parquet::format::PrimaryKeyWriteFormat;
+    use crate::sst::parquet::flat_format::FlatWriteFormat;
     use crate::sst::parquet::reader::{ParquetReader, ParquetReaderBuilder, ReaderMetrics};
     use crate::sst::parquet::writer::ParquetWriter;
     use crate::sst::{
         DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, location, to_flat_sst_arrow_schema,
     };
+    use crate::test_util::TestEnv;
     use crate::test_util::sst_util::{
-        build_test_binary_test_region_metadata, new_batch_by_range, new_batch_with_binary,
-        new_batch_with_custom_sequence, new_primary_key, new_source, new_sparse_primary_key,
-        sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata,
+        build_test_binary_test_region_metadata, new_flat_source_from_record_batches,
+        new_primary_key, new_record_batch_by_range, new_record_batch_with_custom_sequence,
+        new_sparse_primary_key, sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata,
         sst_region_metadata_with_encoding,
     };
-    use crate::test_util::{TestEnv, check_reader_result};
 
     const FILE_DIR: &str = "/";
     const REGION_ID: RegionId = RegionId::new(0, 0);
@@ -191,10 +192,10 @@ mod tests {
             region_file_id: handle.file_id(),
         };
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -214,7 +215,7 @@ mod tests {
         .await;
 
         let info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -235,14 +236,14 @@ mod tests {
             object_store,
         );
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_by_range(&["a", "d"], 0, 50),
-                new_batch_by_range(&["a", "d"], 50, 60),
-                new_batch_by_range(&["b", "f"], 0, 40),
-                new_batch_by_range(&["b", "h"], 100, 150),
-                new_batch_by_range(&["b", "h"], 150, 200),
+                new_record_batch_by_range(&["a", "d"], 0, 50),
+                new_record_batch_by_range(&["a", "d"], 50, 60),
+                new_record_batch_by_range(&["b", "f"], 0, 40),
+                new_record_batch_by_range(&["b", "h"], 100, 150),
+                new_record_batch_by_range(&["b", "h"], 150, 200),
             ],
         )
         .await;
@@ -254,10 +255,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -279,7 +280,7 @@ mod tests {
         .await;
 
         let sst_info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -299,14 +300,14 @@ mod tests {
         .cache(cache.clone());
         for _ in 0..3 {
             let mut reader = builder.build().await.unwrap().unwrap();
-            check_reader_result(
+            check_record_batch_reader_result(
                 &mut reader,
                 &[
-                    new_batch_by_range(&["a", "d"], 0, 50),
-                    new_batch_by_range(&["a", "d"], 50, 60),
-                    new_batch_by_range(&["b", "f"], 0, 40),
-                    new_batch_by_range(&["b", "h"], 100, 150),
-                    new_batch_by_range(&["b", "h"], 150, 200),
+                    new_record_batch_by_range(&["a", "d"], 0, 50),
+                    new_record_batch_by_range(&["a", "d"], 50, 60),
+                    new_record_batch_by_range(&["b", "f"], 0, 40),
+                    new_record_batch_by_range(&["b", "h"], 100, 150),
+                    new_record_batch_by_range(&["b", "h"], 150, 200),
                 ],
             )
             .await;
@@ -340,10 +341,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         let write_opts = WriteOptions {
             row_group_size: 50,
@@ -366,7 +367,7 @@ mod tests {
         .await;
 
         let sst_info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -392,10 +393,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -416,7 +417,7 @@ mod tests {
         )
         .await;
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -436,11 +437,11 @@ mod tests {
         )
         .predicate(predicate);
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_by_range(&["a", "d"], 0, 50),
-                new_batch_by_range(&["a", "d"], 50, 60),
+                new_record_batch_by_range(&["a", "d"], 0, 50),
+                new_record_batch_by_range(&["a", "d"], 50, 60),
             ],
         )
         .await;
@@ -452,10 +453,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "z"], 0, 0),
-            new_batch_by_range(&["a", "z"], 100, 100),
-            new_batch_by_range(&["a", "z"], 200, 230),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "z"], 0, 0),
+            new_record_batch_by_range(&["a", "z"], 100, 100),
+            new_record_batch_by_range(&["a", "z"], 200, 230),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -476,7 +477,7 @@ mod tests {
         )
         .await;
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -488,7 +489,11 @@ mod tests {
             object_store,
         );
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(&mut reader, &[new_batch_by_range(&["a", "z"], 200, 230)]).await;
+        check_record_batch_reader_result(
+            &mut reader,
+            &[new_record_batch_by_range(&["a", "z"], 200, 230)],
+        )
+        .await;
     }
 
     #[tokio::test]
@@ -497,10 +502,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -522,7 +527,7 @@ mod tests {
         .await;
 
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -542,7 +547,11 @@ mod tests {
         )
         .predicate(predicate);
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(&mut reader, &[new_batch_by_range(&["b", "h"], 150, 200)]).await;
+        check_record_batch_reader_result(
+            &mut reader,
+            &[new_record_batch_by_range(&["b", "h"], 150, 200)],
+        )
+        .await;
     }
 
     #[tokio::test]
@@ -569,7 +578,7 @@ mod tests {
 
         let writer_props = props_builder.build();
 
-        let write_format = PrimaryKeyWriteFormat::new(metadata);
+        let write_format = FlatWriteFormat::new(metadata, &FlatSchemaOptions::default());
         let fields: Vec<_> = write_format
             .arrow_schema()
             .fields()
@@ -603,9 +612,8 @@ mod tests {
         )
         .unwrap();
 
-        let batch = new_batch_with_binary(&["a"], 0, 60);
-        let arrow_batch = write_format.convert_batch(&batch).unwrap();
-        let arrays: Vec<_> = arrow_batch
+        let batch = new_record_batch_with_binary(&["a"], 0, 60);
+        let arrays: Vec<_> = batch
             .columns()
             .iter()
             .map(|array| {
@@ -629,11 +637,11 @@ mod tests {
             object_store,
         );
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_with_binary(&["a"], 0, 50),
-                new_batch_with_binary(&["a"], 50, 60),
+                new_record_batch_with_binary(&["a"], 0, 50),
+                new_record_batch_with_binary(&["a"], 50, 60),
             ],
         )
         .await;
@@ -646,17 +654,17 @@ mod tests {
         let mut env = TestEnv::new().await;
         let object_store = env.init_object_store_manager();
         let metadata = Arc::new(sst_region_metadata());
-        let batches = &[
-            new_batch_by_range(&["a", "d"], 0, 1000),
-            new_batch_by_range(&["b", "f"], 0, 1000),
-            new_batch_by_range(&["c", "g"], 0, 1000),
-            new_batch_by_range(&["b", "h"], 100, 200),
-            new_batch_by_range(&["b", "h"], 200, 300),
-            new_batch_by_range(&["b", "h"], 300, 1000),
+        let batches = vec![
+            new_record_batch_by_range(&["a", "d"], 0, 1000),
+            new_record_batch_by_range(&["b", "f"], 0, 1000),
+            new_record_batch_by_range(&["c", "g"], 0, 1000),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
+            new_record_batch_by_range(&["b", "h"], 200, 300),
+            new_record_batch_by_range(&["b", "h"], 300, 1000),
         ];
         let total_rows: usize = batches.iter().map(|batch| batch.num_rows()).sum();
 
-        let source = new_source(batches);
+        let source = new_flat_source_from_record_batches(batches);
         let write_opts = WriteOptions {
             row_group_size: 50,
             max_file_size: Some(1024 * 16),
@@ -678,7 +686,10 @@ mod tests {
         )
         .await;
 
-        let files = writer.write_all(source, None, &write_opts).await.unwrap();
+        let files = writer
+            .write_all_flat_as_primary_key(source, None, &write_opts)
+            .await
+            .unwrap();
         assert_eq!(2, files.len());
 
         let mut rows_read = 0;
@@ -695,7 +706,7 @@ mod tests {
                 object_store.clone(),
             );
             let mut reader = builder.build().await.unwrap().unwrap();
-            while let Some(batch) = reader.next_batch().await.unwrap() {
+            while let Some(batch) = reader.next_record_batch().await.unwrap() {
                 rows_read += batch.num_rows();
             }
         }
@@ -710,12 +721,12 @@ mod tests {
         let metadata = Arc::new(sst_region_metadata());
         let row_group_size = 50;
 
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 20),
-            new_batch_by_range(&["b", "d"], 0, 20),
-            new_batch_by_range(&["c", "d"], 0, 20),
-            new_batch_by_range(&["c", "f"], 0, 40),
-            new_batch_by_range(&["c", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 20),
+            new_record_batch_by_range(&["b", "d"], 0, 20),
+            new_record_batch_by_range(&["c", "d"], 0, 20),
+            new_record_batch_by_range(&["c", "f"], 0, 40),
+            new_record_batch_by_range(&["c", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -760,7 +771,7 @@ mod tests {
         .await;
 
         let info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -877,6 +888,7 @@ mod tests {
             handle.clone(),
             object_store.clone(),
         )
+        .flat_format(true)
         .predicate(Some(Predicate::new(preds)))
         .inverted_index_appliers([inverted_index_applier.clone(), None])
         .bloom_filter_index_appliers([bloom_filter_applier.clone(), None])
@@ -891,7 +903,11 @@ mod tests {
         let mut reader = ParquetReader::new(Arc::new(context), selection)
             .await
             .unwrap();
-        check_reader_result(&mut reader, &[new_batch_by_range(&["b", "d"], 0, 20)]).await;
+        check_record_batch_reader_result(
+            &mut reader,
+            &[new_record_batch_by_range(&["b", "d"], 0, 20)],
+        )
+        .await;
 
         assert_eq!(metrics.filter_metrics.rg_total, 4);
         assert_eq!(metrics.filter_metrics.rg_minmax_filtered, 3);
@@ -937,6 +953,7 @@ mod tests {
             handle.clone(),
             object_store.clone(),
         )
+        .flat_format(true)
         .predicate(Some(Predicate::new(preds)))
         .inverted_index_appliers([inverted_index_applier.clone(), None])
         .bloom_filter_index_appliers([bloom_filter_applier.clone(), None])
@@ -991,6 +1008,7 @@ mod tests {
             handle.clone(),
             object_store.clone(),
         )
+        .flat_format(true)
         .predicate(Some(Predicate::new(preds)))
         .inverted_index_appliers([inverted_index_applier.clone(), None])
         .bloom_filter_index_appliers([bloom_filter_applier.clone(), None])
@@ -1005,13 +1023,13 @@ mod tests {
         let mut reader = ParquetReader::new(Arc::new(context), selection)
             .await
             .unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_by_range(&["a", "d"], 0, 20),
-                new_batch_by_range(&["b", "d"], 0, 20),
-                new_batch_by_range(&["c", "d"], 0, 10),
-                new_batch_by_range(&["c", "d"], 10, 20),
+                new_record_batch_by_range(&["a", "d"], 0, 20),
+                new_record_batch_by_range(&["b", "d"], 0, 20),
+                new_record_batch_by_range(&["c", "d"], 0, 10),
+                new_record_batch_by_range(&["c", "d"], 10, 20),
             ],
         )
         .await;
@@ -1032,37 +1050,32 @@ mod tests {
         assert!(cached.contains_row_group(3));
     }
 
-    /// Creates a flat format RecordBatch for testing.
-    /// Similar to `new_batch_by_range` but returns a RecordBatch in flat format.
-    fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch {
+    fn new_record_batch_with_binary(tags: &[&str], start: usize, end: usize) -> RecordBatch {
         assert!(end >= start);
-        let metadata = Arc::new(sst_region_metadata());
+        let metadata = build_test_binary_test_region_metadata();
         let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
 
         let num_rows = end - start;
         let mut columns = Vec::new();
 
-        // Add primary key columns (tag_0, tag_1) as dictionary arrays
         let mut tag_0_builder = StringDictionaryBuilder::<UInt32Type>::new();
-        let mut tag_1_builder = StringDictionaryBuilder::<UInt32Type>::new();
-
         for _ in 0..num_rows {
             tag_0_builder.append_value(tags[0]);
-            tag_1_builder.append_value(tags[1]);
         }
-
         columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef);
-        columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef);
 
-        // Add field column (field_0)
-        let field_values: Vec<u64> = (start..end).map(|v| v as u64).collect();
-        columns.push(Arc::new(UInt64Array::from(field_values)));
+        let values = (0..num_rows)
+            .map(|_| "some data".as_bytes())
+            .collect::<Vec<_>>();
+        columns.push(
+            Arc::new(datatypes::arrow::array::BinaryArray::from_iter_values(
+                values,
+            )) as ArrayRef,
+        );
 
-        // Add time index column (ts)
         let timestamps: Vec<i64> = (start..end).map(|v| v as i64).collect();
         columns.push(Arc::new(TimestampMillisecondArray::from(timestamps)));
 
-        // Add encoded primary key column
         let pk = new_primary_key(tags);
         let mut pk_builder = BinaryDictionaryBuilder::<UInt32Type>::new();
         for _ in 0..num_rows {
@@ -1070,10 +1083,7 @@ mod tests {
         }
         columns.push(Arc::new(pk_builder.finish()));
 
-        // Add sequence column
         columns.push(Arc::new(UInt64Array::from_value(1000, num_rows)));
-
-        // Add op_type column
         columns.push(Arc::new(UInt8Array::from_value(
             OpType::Put as u8,
             num_rows,
@@ -1082,9 +1092,19 @@ mod tests {
         RecordBatch::try_new(flat_schema, columns).unwrap()
     }
 
-    /// Creates a FlatSource from flat format RecordBatches.
-    fn new_flat_source_from_record_batches(batches: Vec<RecordBatch>) -> FlatSource {
-        FlatSource::Iter(Box::new(batches.into_iter().map(Ok)))
+    async fn check_record_batch_reader_result(
+        reader: &mut ParquetReader,
+        expected: &[RecordBatch],
+    ) {
+        let mut actual = Vec::new();
+        while let Some(batch) = reader.next_record_batch().await.unwrap() {
+            actual.push(batch);
+        }
+        assert_eq!(
+            pretty_format_batches(expected).unwrap().to_string(),
+            pretty_format_batches(&actual).unwrap().to_string()
+        );
+        assert!(reader.next_record_batch().await.unwrap().is_none());
     }
 
     /// Creates a flat format RecordBatch for testing with sparse primary key encoding.
@@ -1333,10 +1353,11 @@ mod tests {
         };
         let metadata = Arc::new(sst_region_metadata());
 
-        // Create batches with sequence 0 to trigger override functionality
-        let batch1 = new_batch_with_custom_sequence(&["a", "d"], 0, 60, 0);
-        let batch2 = new_batch_with_custom_sequence(&["b", "f"], 0, 40, 0);
-        let source = new_source(&[batch1, batch2]);
+        // Create batches with sequence 0 to trigger override functionality.
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_with_custom_sequence(&["a", "d"], 0, 60, 0),
+            new_record_batch_with_custom_sequence(&["b", "f"], 0, 40, 0),
+        ]);
 
         let write_opts = WriteOptions {
             row_group_size: 50,
@@ -1355,7 +1376,7 @@ mod tests {
         .await;
 
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -1369,7 +1390,7 @@ mod tests {
         );
         let mut reader = builder.build().await.unwrap().unwrap();
         let mut normal_batches = Vec::new();
-        while let Some(batch) = reader.next_batch().await.unwrap() {
+        while let Some(batch) = reader.next_record_batch().await.unwrap() {
             normal_batches.push(batch);
         }
 
@@ -1391,22 +1412,19 @@ mod tests {
         );
         let mut reader = builder.build().await.unwrap().unwrap();
         let mut override_batches = Vec::new();
-        while let Some(batch) = reader.next_batch().await.unwrap() {
+        while let Some(batch) = reader.next_record_batch().await.unwrap() {
             override_batches.push(batch);
         }
 
         // Compare the results
         assert_eq!(normal_batches.len(), override_batches.len());
         for (normal, override_batch) in normal_batches.into_iter().zip(override_batches.iter()) {
-            // Create expected batch with override sequence
             let expected_batch = {
-                let num_rows = normal.num_rows();
-                let mut builder = BatchBuilder::from(normal);
-                builder
-                    .sequences_array(Arc::new(UInt64Array::from_value(custom_sequence, num_rows)))
-                    .unwrap();
-
-                builder.build().unwrap()
+                let mut columns = normal.columns().to_vec();
+                let num_cols = columns.len();
+                columns[num_cols - 2] =
+                    Arc::new(UInt64Array::from_value(custom_sequence, normal.num_rows()));
+                RecordBatch::try_new(normal.schema(), columns).unwrap()
             };
 
             // Override batch should match expected batch
diff --git a/src/mito2/src/sst/parquet/flat_format.rs b/src/mito2/src/sst/parquet/flat_format.rs
index d6b061e468..8a59e9a97d 100644
--- a/src/mito2/src/sst/parquet/flat_format.rs
+++ b/src/mito2/src/sst/parquet/flat_format.rs
@@ -52,8 +52,8 @@ use crate::error::{
     NewRecordBatchSnafu, Result,
 };
 use crate::sst::parquet::format::{
-    FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray, PrimaryKeyReadFormat, ReadFormat,
-    StatValues,
+    FIXED_POS_COLUMN_NUM, FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray,
+    PrimaryKeyReadFormat, ReadFormat, StatValues,
 };
 use crate::sst::{
     FlatSchemaOptions, flat_sst_arrow_schema_column_num, tag_maybe_to_dictionary_field,
@@ -127,6 +127,21 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize {
     num_columns - 1
 }
 
+/// Returns the index of the first field column in a flat batch.
+///
+/// `num_columns` is the total number of columns in the flat batch schema,
+/// including tag columns (if present), field columns, and fixed position columns
+/// (time index, primary key, sequence, op type).
+///
+/// - Dense encoding (raw PK columns included): returns `metadata.primary_key.len()`.
+/// - Sparse encoding (no raw PK columns): returns 0.
+pub(crate) fn field_column_start(metadata: &RegionMetadata, num_columns: usize) -> usize {
+    // Field column count = total metadata columns - time index column - primary key columns.
+    let field_column_count = metadata.column_metadatas.len() - 1 - metadata.primary_key.len();
+    // Start = total - fixed - fields; underflows if `num_columns` is too small for `metadata`.
+    num_columns - FIXED_POS_COLUMN_NUM - field_column_count
+}
+
 // TODO(yingwen): Add an option to skip reading internal columns if the region is
 // append only and doesn't use sparse encoding (We need to check the table id under
 // sparse encoding).
@@ -765,3 +780,89 @@ impl FlatReadFormat {
         .unwrap()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use api::v1::SemanticType;
+    use datatypes::prelude::ConcreteDataType;
+    use datatypes::schema::ColumnSchema;
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
+    use store_api::storage::RegionId;
+
+    use super::field_column_start;
+    use crate::sst::{FlatSchemaOptions, flat_sst_arrow_schema_column_num};
+
+    /// Builds metadata with `num_tags` primary key tags, `num_fields` fields and one time index.
+    fn build_metadata(
+        num_tags: usize,
+        num_fields: usize,
+        encoding: PrimaryKeyEncoding,
+    ) -> RegionMetadata {
+        let mut builder = RegionMetadataBuilder::new(RegionId::new(0, 0));
+        let mut col_id = 0u32;
+
+        for i in 0..num_tags {
+            builder.push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    format!("tag_{i}"),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: col_id,
+            });
+            col_id += 1;
+        }
+
+        for i in 0..num_fields {
+            builder.push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    format!("field_{i}"),
+                    ConcreteDataType::uint64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: col_id,
+            });
+            col_id += 1;
+        }
+
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(
+                "ts".to_string(),
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            ),
+            semantic_type: SemanticType::Timestamp,
+            column_id: col_id,
+        });
+
+        let primary_key: Vec<u32> = (0..num_tags as u32).collect();
+        builder.primary_key(primary_key);
+        builder.primary_key_encoding(encoding);
+        builder.build().unwrap()
+    }
+
+    #[test]
+    fn test_field_column_start() {
+        // (num_tags, num_fields, encoding, expected field column start)
+        let cases = [
+            (1, 1, PrimaryKeyEncoding::Dense, 1),
+            (2, 2, PrimaryKeyEncoding::Dense, 2),
+            (0, 2, PrimaryKeyEncoding::Dense, 0),
+            (2, 2, PrimaryKeyEncoding::Sparse, 0),
+        ];
+
+        for (num_tags, num_fields, encoding, expected) in cases {
+            let metadata = build_metadata(num_tags, num_fields, encoding);
+            let options = FlatSchemaOptions::from_encoding(encoding);
+            let num_columns = flat_sst_arrow_schema_column_num(&metadata, &options);
+            let result = field_column_start(&metadata, num_columns);
+            assert_eq!(
+                result, expected,
+                "num_tags={num_tags}, num_fields={num_fields}, encoding={encoding:?}"
+            );
+        }
+    }
+}
diff --git a/src/mito2/src/sst/parquet/format.rs b/src/mito2/src/sst/parquet/format.rs
index 70d026e6db..ba64eac78b 100644
--- a/src/mito2/src/sst/parquet/format.rs
+++ b/src/mito2/src/sst/parquet/format.rs
@@ -34,12 +34,12 @@ use api::v1::SemanticType;
 use common_time::Timestamp;
 use datafusion_common::ScalarValue;
 use datatypes::arrow::array::{
-    ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt32Array, UInt64Array,
+    ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt64Array,
 };
 use datatypes::arrow::datatypes::{SchemaRef, UInt32Type};
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::prelude::DataType;
-use datatypes::vectors::{Helper, Vector};
+use datatypes::vectors::Helper;
 use mito_codec::row_converter::{
     CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec,
     build_primary_key_codec_with_fields,
@@ -51,8 +51,7 @@ use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
 use store_api::storage::{ColumnId, SequenceNumber};
 
 use crate::error::{
-    ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu,
-    NewRecordBatchSnafu, Result,
+    ConvertVectorSnafu, DecodeSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result,
 };
 use crate::read::{Batch, BatchBuilder, BatchColumn};
 use crate::sst::file::{FileMeta, FileTimeRange};
@@ -73,7 +72,6 @@ pub(crate) const INTERNAL_COLUMN_NUM: usize = 3;
 
 /// Helper for writing the SST format with primary key.
 pub(crate) struct PrimaryKeyWriteFormat {
-    metadata: RegionMetadataRef,
     /// SST file schema.
     arrow_schema: SchemaRef,
     override_sequence: Option<SequenceNumber>,
@@ -84,7 +82,6 @@ impl PrimaryKeyWriteFormat {
     pub(crate) fn new(metadata: RegionMetadataRef) -> PrimaryKeyWriteFormat {
         let arrow_schema = to_sst_arrow_schema(&metadata);
         PrimaryKeyWriteFormat {
-            metadata,
             arrow_schema,
             override_sequence: None,
         }
@@ -104,40 +101,25 @@ impl PrimaryKeyWriteFormat {
         &self.arrow_schema
     }
 
-    /// Convert `batch` to a arrow record batch to store in parquet.
-    pub(crate) fn convert_batch(&self, batch: &Batch) -> Result<RecordBatch> {
-        debug_assert_eq!(
-            batch.fields().len() + FIXED_POS_COLUMN_NUM,
-            self.arrow_schema.fields().len()
-        );
-        let mut columns = Vec::with_capacity(batch.fields().len() + FIXED_POS_COLUMN_NUM);
-        // Store all fields first.
-        for (column, column_metadata) in batch.fields().iter().zip(self.metadata.field_columns()) {
-            ensure!(
-                column.column_id == column_metadata.column_id,
-                InvalidBatchSnafu {
-                    reason: format!(
-                        "Batch has column {} but metadata has column {}",
-                        column.column_id, column_metadata.column_id
-                    ),
-                }
-            );
-
-            columns.push(column.data.to_arrow_array());
-        }
-        // Add time index column.
-        columns.push(batch.timestamps().to_arrow_array());
-        // Add internal columns: primary key, sequences, op types.
-        columns.push(new_primary_key_array(batch.primary_key(), batch.num_rows()));
+    /// Converts a flat `RecordBatch` to primary-key format: drops the leading tag columns and
+    /// keeps the `num_fields` field columns, time index, and internal columns.
+    /// A batch with too few columns is not sliced; `RecordBatch::try_new` validates the result.
+    pub(crate) fn convert_flat_batch(
+        &self,
+        batch: &RecordBatch,
+        num_fields: usize,
+    ) -> Result<RecordBatch> {
+        let num_tags = batch.num_columns().saturating_sub(num_fields + FIXED_POS_COLUMN_NUM);
+        let mut columns: Vec<ArrayRef> = batch.columns()[num_tags..].to_vec();
 
         if let Some(override_sequence) = self.override_sequence {
-            let sequence_array =
+            let sequence_array: ArrayRef =
                 Arc::new(UInt64Array::from(vec![override_sequence; batch.num_rows()]));
-            columns.push(sequence_array);
-        } else {
-            columns.push(batch.sequences().to_arrow_array());
+            // The sequence column is second to last (right before op_type).
+            if let Some(sequence) = columns.iter_mut().nth_back(1) {
+                *sequence = sequence_array;
+            }
         }
-        columns.push(batch.op_types().to_arrow_array());
 
         RecordBatch::try_new(self.arrow_schema.clone(), columns).context(NewRecordBatchSnafu)
     }
@@ -926,15 +908,6 @@ pub(crate) fn primary_key_offsets(pk_dict_array: &PrimaryKeyArray) -> Result<Vec
     Ok(offsets)
 }
 
-/// Creates a new array for specific `primary_key`.
-fn new_primary_key_array(primary_key: &[u8], num_rows: usize) -> ArrayRef {
-    let values = Arc::new(BinaryArray::from_iter_values([primary_key]));
-    let keys = UInt32Array::from_value(0, num_rows);
-
-    // Safety: The key index is valid.
-    Arc::new(DictionaryArray::new(keys, values))
-}
-
 /// Gets the min/max time index of the row group from the parquet meta.
 /// It assumes the parquet is created by the mito engine.
 pub(crate) fn parquet_row_group_time_range(
@@ -1017,7 +990,7 @@ mod tests {
 
     use api::v1::OpType;
     use datatypes::arrow::array::{
-        Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt64Array,
+        Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array,
     };
     use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
     use datatypes::prelude::ConcreteDataType;
@@ -1145,13 +1118,6 @@ mod tests {
         assert_eq!(&build_test_arrow_schema(), write_format.arrow_schema());
     }
 
-    #[test]
-    fn test_new_primary_key_array() {
-        let array = new_primary_key_array(b"test", 3);
-        let expect = build_test_pk_array(&[(b"test".to_vec(), 3)]) as ArrayRef;
-        assert_eq!(&expect, &array);
-    }
-
     fn build_test_pk_array(pk_row_nums: &[(Vec<u8>, usize)]) -> Arc<PrimaryKeyArray> {
         let values = Arc::new(BinaryArray::from_iter_values(
             pk_row_nums.iter().map(|v| &v.0),
@@ -1164,49 +1130,6 @@ mod tests {
         Arc::new(DictionaryArray::new(keys, values))
     }
 
-    #[test]
-    fn test_convert_batch() {
-        let metadata = build_test_region_metadata();
-        let write_format = PrimaryKeyWriteFormat::new(metadata);
-
-        let num_rows = 4;
-        let batch = new_batch(b"test", 1, 2, num_rows);
-        let columns: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
-            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
-            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
-            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key
-            Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // sequence
-            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type
-        ];
-        let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap();
-
-        let actual = write_format.convert_batch(&batch).unwrap();
-        assert_eq!(expect_record, actual);
-    }
-
-    #[test]
-    fn test_convert_batch_with_override_sequence() {
-        let metadata = build_test_region_metadata();
-        let write_format =
-            PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(415411));
-
-        let num_rows = 4;
-        let batch = new_batch(b"test", 1, 2, num_rows);
-        let columns: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
-            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
-            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
-            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key
-            Arc::new(UInt64Array::from(vec![415411; num_rows])), // sequence
-            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type
-        ];
-        let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap();
-
-        let actual = write_format.convert_batch(&batch).unwrap();
-        assert_eq!(expect_record, actual);
-    }
-
     #[test]
     fn test_projection_indices() {
         let metadata = build_test_region_metadata();
@@ -1867,4 +1790,100 @@ mod tests {
         let result = format.convert_batch(record_batch.clone(), None).unwrap();
         assert_eq!(record_batch, result);
     }
+
+    #[test]
+    fn test_convert_flat_batch() {
+        let num_rows = 4;
+        let format = PrimaryKeyWriteFormat::new(build_test_region_metadata());
+
+        // Flat input layout: tag0, tag1, field1, field0, ts, __primary_key, __sequence, __op_type
+        let input_columns: Vec<ArrayRef> = input_columns_for_flat_batch(num_rows);
+        let input = RecordBatch::try_new(build_test_flat_sst_schema(), input_columns).unwrap();
+
+        // The test region has two field columns: field1 and field0.
+        let num_fields = 2;
+        let actual = format.convert_flat_batch(&input, num_fields).unwrap();
+
+        // Tag columns are stripped; the remaining columns keep their order.
+        let kept_columns: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
+            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
+            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
+            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key
+            Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // __sequence
+            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type
+        ];
+        let expected = RecordBatch::try_new(build_test_arrow_schema(), kept_columns).unwrap();
+
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn test_convert_flat_batch_with_override_sequence() {
+        let num_rows = 4;
+        let override_sequence = 999;
+        let format = PrimaryKeyWriteFormat::new(build_test_region_metadata())
+            .with_override_sequence(Some(override_sequence));
+
+        let input_columns: Vec<ArrayRef> = input_columns_for_flat_batch(num_rows);
+        let input = RecordBatch::try_new(build_test_flat_sst_schema(), input_columns).unwrap();
+        let actual = format.convert_flat_batch(&input, 2).unwrap();
+
+        let kept_columns: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
+            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
+            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
+            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key
+            Arc::new(UInt64Array::from(vec![override_sequence; num_rows])), // overridden __sequence
+            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type
+        ];
+        let expected = RecordBatch::try_new(build_test_arrow_schema(), kept_columns).unwrap();
+
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn test_convert_flat_batch_no_tags() {
+        // A region without primary key columns has no tag columns to strip.
+        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
+        builder
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "field0",
+                    ConcreteDataType::int64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: 1,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "ts",
+                    ConcreteDataType::timestamp_millisecond_datatype(),
+                    false,
+                ),
+                semantic_type: SemanticType::Timestamp,
+                column_id: 2,
+            });
+        let format = PrimaryKeyWriteFormat::new(Arc::new(builder.build().unwrap()));
+
+        let num_rows = 3;
+        let num_fields = 1;
+        // Without tag columns the flat layout is: field0, ts, __primary_key, __sequence, __op_type
+        let schema = format.arrow_schema().clone();
+        let input_columns: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![10; num_rows])), // field0
+            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), // ts
+            build_test_pk_array(&[(b"".to_vec(), num_rows)]), // __primary_key
+            Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // __sequence
+            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type
+        ];
+        let input = RecordBatch::try_new(schema.clone(), input_columns.clone()).unwrap();
+
+        // num_tag_columns = 5 - num_fields - 4 = 0, so the batch passes through unchanged
+        let actual = format.convert_flat_batch(&input, num_fields).unwrap();
+        let expected = RecordBatch::try_new(schema, input_columns).unwrap();
+
+        assert_eq!(expected, actual);
+    }
 }
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 500f32ae91..4d7122ccc6 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -21,9 +21,8 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use api::v1::SemanticType;
-use async_trait::async_trait;
 use common_recordbatch::filter::SimpleFilterEvaluator;
-use common_telemetry::{debug, tracing, warn};
+use common_telemetry::{tracing, warn};
 use datafusion_expr::Expr;
 use datatypes::arrow::array::ArrayRef;
 use datatypes::arrow::datatypes::Field;
@@ -57,7 +56,7 @@ use crate::metrics::{
     READ_ROWS_TOTAL, READ_STAGE_ELAPSED,
 };
 use crate::read::flat_projection::CompactionProjectionMapper;
-use crate::read::prune::{PruneReader, Source};
+use crate::read::prune::FlatPruneReader;
 use crate::read::{Batch, BatchReader};
 use crate::sst::file::FileHandle;
 use crate::sst::index::bloom_filter::applier::{
@@ -303,7 +302,8 @@ impl ParquetReaderBuilder {
     pub async fn build(&self) -> Result<Option<ParquetReader>> {
         let mut metrics = ReaderMetrics::default();
 
-        let Some((context, selection)) = self.build_reader_input(&mut metrics).await? else {
+        let Some((context, selection)) = self.build_reader_input_inner(&mut metrics, true).await?
+        else {
             return Ok(None);
         };
         ParquetReader::new(Arc::new(context), selection)
@@ -325,12 +325,14 @@ impl ParquetReaderBuilder {
         &self,
         metrics: &mut ReaderMetrics,
     ) -> Result<Option<(FileRangeContext, RowGroupSelection)>> {
-        self.build_reader_input_inner(metrics).await
+        self.build_reader_input_inner(metrics, self.flat_format)
+            .await
     }
 
     async fn build_reader_input_inner(
         &self,
         metrics: &mut ReaderMetrics,
+        flat_format: bool,
     ) -> Result<Option<(FileRangeContext, RowGroupSelection)>> {
         let start = Instant::now();
 
@@ -373,7 +375,7 @@ impl ParquetReaderBuilder {
         // before compat handling.
         let compaction_projection_mapper = if self.compaction
             && !is_same_region_partition
-            && self.flat_format
+            && flat_format
             && region_meta.primary_key_encoding == PrimaryKeyEncoding::Sparse
         {
             Some(CompactionProjectionMapper::try_new(&region_meta)?)
@@ -385,7 +387,7 @@ impl ParquetReaderBuilder {
             ReadFormat::new(
                 region_meta.clone(),
                 Some(column_ids),
-                self.flat_format,
+                flat_format,
                 Some(parquet_meta.file_metadata().schema_descr().num_columns()),
                 &file_path,
                 skip_auto_convert,
@@ -401,7 +403,7 @@ impl ParquetReaderBuilder {
             ReadFormat::new(
                 region_meta.clone(),
                 Some(&column_ids),
-                self.flat_format,
+                flat_format,
                 Some(parquet_meta.file_metadata().schema_descr().num_columns()),
                 &file_path,
                 skip_auto_convert,
@@ -1751,24 +1753,6 @@ impl RowGroupReaderBuilder {
     }
 }
 
-/// The state of a [ParquetReader].
-enum ReaderState {
-    /// The reader is reading a row group.
-    Readable(PruneReader),
-    /// The reader is exhausted.
-    Exhausted(ReaderMetrics),
-}
-
-impl ReaderState {
-    /// Returns the metrics of the reader.
-    fn metrics(&self) -> ReaderMetrics {
-        match self {
-            ReaderState::Readable(reader) => reader.metrics(),
-            ReaderState::Exhausted(m) => m.clone(),
-        }
-    }
-}
-
 /// The filter to evaluate or the prune result of the default value.
 pub(crate) enum MaybeFilter {
     /// The filter to evaluate.
@@ -1879,13 +1863,12 @@ pub struct ParquetReader {
     /// Row group selection to read.
     selection: RowGroupSelection,
     /// Reader of current row group.
-    reader_state: ReaderState,
+    reader: Option<FlatPruneReader>,
     /// Metrics for tracking row group fetch operations.
     fetch_metrics: ParquetFetchMetrics,
 }
 
-#[async_trait]
-impl BatchReader for ParquetReader {
+impl ParquetReader {
     #[tracing::instrument(
         skip_all,
         fields(
@@ -1893,18 +1876,20 @@ impl BatchReader for ParquetReader {
             file_id = %self.context.reader_builder().file_handle.file_id()
         )
     )]
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        let ReaderState::Readable(reader) = &mut self.reader_state else {
-            return Ok(None);
-        };
+    pub async fn next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
+        loop {
+            if let Some(reader) = &mut self.reader {
+                if let Some(batch) = reader.next_batch()? {
+                    return Ok(Some(batch));
+                }
+                self.reader = None;
+                continue;
+            }
 
-        // We don't collect the elapsed time if the reader returns an error.
-        if let Some(batch) = reader.next_batch().await? {
-            return Ok(Some(batch));
-        }
+            let Some((row_group_idx, row_selection)) = self.selection.pop_first() else {
+                return Ok(None);
+            };
 
-        // No more items in current row group, reads next row group.
-        while let Some((row_group_idx, row_selection)) = self.selection.pop_first() {
             let parquet_reader = self
                 .context
                 .reader_builder()
@@ -1915,54 +1900,14 @@ impl BatchReader for ParquetReader {
                 )
                 .await?;
 
-            // Resets the parquet reader.
-            // Compute skip_fields for this row group
             let skip_fields = self.context.should_skip_fields(row_group_idx);
-            reader.reset_source(
-                Source::RowGroup(RowGroupReader::new(self.context.clone(), parquet_reader)),
+            self.reader = Some(FlatPruneReader::new_with_row_group_reader(
+                self.context.clone(),
+                FlatRowGroupReader::new(self.context.clone(), parquet_reader),
                 skip_fields,
-            );
-            if let Some(batch) = reader.next_batch().await? {
-                return Ok(Some(batch));
-            }
+            ));
         }
-
-        // The reader is exhausted.
-        self.reader_state = ReaderState::Exhausted(reader.metrics().clone());
-        Ok(None)
     }
-}
-
-impl Drop for ParquetReader {
-    fn drop(&mut self) {
-        let metrics = self.reader_state.metrics();
-        debug!(
-            "Read parquet {} {}, range: {:?}, {}/{} row groups, metrics: {:?}",
-            self.context.reader_builder().file_handle.region_id(),
-            self.context.reader_builder().file_handle.file_id(),
-            self.context.reader_builder().file_handle.time_range(),
-            metrics.filter_metrics.rg_total
-                - metrics.filter_metrics.rg_inverted_filtered
-                - metrics.filter_metrics.rg_minmax_filtered
-                - metrics.filter_metrics.rg_fulltext_filtered
-                - metrics.filter_metrics.rg_bloom_filtered,
-            metrics.filter_metrics.rg_total,
-            metrics
-        );
-
-        // Report metrics.
-        READ_STAGE_ELAPSED
-            .with_label_values(&["build_parquet_reader"])
-            .observe(metrics.build_cost.as_secs_f64());
-        READ_STAGE_ELAPSED
-            .with_label_values(&["scan_row_groups"])
-            .observe(metrics.scan_cost.as_secs_f64());
-        metrics.observe_rows("parquet_reader");
-        metrics.filter_metrics.observe();
-    }
-}
-
-impl ParquetReader {
     /// Creates a new reader.
     #[tracing::instrument(
         skip_all,
@@ -1975,28 +1920,27 @@ impl ParquetReader {
         context: FileRangeContextRef,
         mut selection: RowGroupSelection,
     ) -> Result<Self> {
+        debug_assert!(context.read_format().as_flat().is_some());
         let fetch_metrics = ParquetFetchMetrics::default();
-        // No more items in current row group, reads next row group.
-        let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() {
+        let reader = if let Some((row_group_idx, row_selection)) = selection.pop_first() {
             let parquet_reader = context
                 .reader_builder()
                 .build(row_group_idx, Some(row_selection), Some(&fetch_metrics))
                 .await?;
-            // Compute skip_fields once for this row group
             let skip_fields = context.should_skip_fields(row_group_idx);
-            ReaderState::Readable(PruneReader::new_with_row_group_reader(
+            Some(FlatPruneReader::new_with_row_group_reader(
                 context.clone(),
-                RowGroupReader::new(context.clone(), parquet_reader),
+                FlatRowGroupReader::new(context.clone(), parquet_reader),
                 skip_fields,
             ))
         } else {
-            ReaderState::Exhausted(ReaderMetrics::default())
+            None
         };
 
         Ok(ParquetReader {
             context,
             selection,
-            reader_state,
+            reader,
             fetch_metrics,
         })
     }
diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs
index b207f11ef8..4e75073e26 100644
--- a/src/mito2/src/sst/parquet/writer.rs
+++ b/src/mito2/src/sst/parquet/writer.rs
@@ -50,7 +50,7 @@ use crate::config::{IndexBuildMode, IndexConfig};
 use crate::error::{
     InvalidMetadataSnafu, OpenDalSnafu, Result, UnexpectedSnafu, WriteParquetSnafu,
 };
-use crate::read::{Batch, FlatSource, Source};
+use crate::read::FlatSource;
 use crate::sst::file::RegionFileId;
 use crate::sst::index::{IndexOutput, Indexer, IndexerBuilder};
 use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index};
@@ -60,6 +60,35 @@ use crate::sst::{
     DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator,
 };
 
+/// Strategy for converting a flat RecordBatch into the batch layout written to parquet.
+enum FlatBatchConverter {
+    /// Keep the flat layout; conversion is delegated to [FlatWriteFormat].
+    Flat(FlatWriteFormat),
+    /// Strip tag columns to produce the primary-key layout; `num_fields` counts field columns.
+    PrimaryKey {
+        format: PrimaryKeyWriteFormat,
+        num_fields: usize,
+    },
+}
+
+impl FlatBatchConverter {
+    fn arrow_schema(&self) -> &SchemaRef {
+        match self {
+            Self::Flat(flat) => flat.arrow_schema(),
+            Self::PrimaryKey { format, .. } => format.arrow_schema(),
+        }
+    }
+
+    fn convert_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
+        match self {
+            Self::Flat(flat) => flat.convert_batch(batch),
+            Self::PrimaryKey { format, num_fields } => {
+                format.convert_flat_batch(batch, *num_fields)
+            }
+        }
+    }
+}
+
 /// Parquet SST writer.
 pub struct ParquetWriter<'a, F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> {
     /// Path provider that creates SST and index file paths according to file id.
@@ -240,81 +269,6 @@ where
         Ok(())
     }
 
-    /// Iterates source and writes all rows to Parquet file.
-    ///
-    /// Returns the [SstInfo] if the SST is written.
-    pub async fn write_all(
-        &mut self,
-        source: Source,
-        override_sequence: Option<SequenceNumber>, // override the `sequence` field from `Source`
-        opts: &WriteOptions,
-    ) -> Result<SstInfoArray> {
-        let res = self
-            .write_all_without_cleaning(source, override_sequence, opts)
-            .await;
-        if res.is_err() {
-            // Clean tmp files explicitly on failure.
-            let file_id = self.current_file;
-            if let Some(cleaner) = &self.file_cleaner {
-                cleaner.clean_by_file_id(file_id).await;
-            }
-        }
-        res
-    }
-
-    async fn write_all_without_cleaning(
-        &mut self,
-        mut source: Source,
-        override_sequence: Option<SequenceNumber>, // override the `sequence` field from `Source`
-        opts: &WriteOptions,
-    ) -> Result<SstInfoArray> {
-        let mut results = smallvec![];
-        let write_format = PrimaryKeyWriteFormat::new(self.metadata.clone())
-            .with_override_sequence(override_sequence);
-        let mut stats = SourceStats::default();
-
-        while let Some(res) = self
-            .write_next_batch(&mut source, &write_format, opts)
-            .await
-            .transpose()
-        {
-            match res {
-                Ok(mut batch) => {
-                    stats.update(&batch);
-                    let start = Instant::now();
-                    // safety: self.current_indexer must be set when first batch has been written.
-                    match self.index_config.build_mode {
-                        IndexBuildMode::Sync => {
-                            self.current_indexer
-                                .as_mut()
-                                .unwrap()
-                                .update(&mut batch)
-                                .await;
-                        }
-                        IndexBuildMode::Async => {}
-                    }
-                    self.metrics.update_index += start.elapsed();
-                    if let Some(max_file_size) = opts.max_file_size
-                        && self.bytes_written.load(Ordering::Relaxed) > max_file_size
-                    {
-                        self.finish_current_file(&mut results, &mut stats).await?;
-                    }
-                }
-                Err(e) => {
-                    if let Some(indexer) = &mut self.current_indexer {
-                        indexer.abort().await;
-                    }
-                    return Err(e);
-                }
-            }
-        }
-
-        self.finish_current_file(&mut results, &mut stats).await?;
-
-        // object_store.write will make sure all bytes are written or an error is raised.
-        Ok(results)
-    }
-
     /// Iterates FlatSource and writes all RecordBatch in flat format to Parquet file.
     ///
     /// Returns the [SstInfo] if the SST is written.
@@ -324,11 +278,15 @@ where
         override_sequence: Option<SequenceNumber>,
         opts: &WriteOptions,
     ) -> Result<SstInfoArray> {
-        let res = self
-            .write_all_flat_without_cleaning(source, override_sequence, opts)
-            .await;
+        let converter = FlatBatchConverter::Flat(
+            FlatWriteFormat::new(
+                self.metadata.clone(),
+                &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
+            )
+            .with_override_sequence(override_sequence),
+        );
+        let res = self.write_all_flat_inner(source, &converter, opts).await;
         if res.is_err() {
-            // Clean tmp files explicitly on failure.
             let file_id = self.current_file;
             if let Some(cleaner) = &self.file_cleaner {
                 cleaner.clean_by_file_id(file_id).await;
@@ -337,36 +295,58 @@ where
         res
     }
 
-    async fn write_all_flat_without_cleaning(
+    /// Converts each RecordBatch from the FlatSource to primary-key format and writes it to Parquet.
+    ///
+    /// Returns the [SstInfo] if the SST is written; on error the file cleaner, if set, cleans up.
+    pub async fn write_all_flat_as_primary_key(
         &mut self,
-        mut source: FlatSource,
+        source: FlatSource,
         override_sequence: Option<SequenceNumber>,
         opts: &WriteOptions,
+    ) -> Result<SstInfoArray> {
+        let num_fields = self.metadata.field_columns().count();
+        let converter = FlatBatchConverter::PrimaryKey {
+            format: PrimaryKeyWriteFormat::new(self.metadata.clone())
+                .with_override_sequence(override_sequence),
+            num_fields,
+        };
+        let res = self.write_all_flat_inner(source, &converter, opts).await;
+        if res.is_err() {
+            let file_id = self.current_file;
+            if let Some(cleaner) = &self.file_cleaner {
+                cleaner.clean_by_file_id(file_id).await;
+            }
+        }
+        res
+    }
+
+    async fn write_all_flat_inner(
+        &mut self,
+        mut source: FlatSource,
+        converter: &FlatBatchConverter,
+        opts: &WriteOptions,
     ) -> Result<SstInfoArray> {
         let mut results = smallvec![];
-        let flat_format = FlatWriteFormat::new(
-            self.metadata.clone(),
-            &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
-        )
-        .with_override_sequence(override_sequence);
         let mut stats = SourceStats::default();
 
         while let Some(record_batch) = self
-            .write_next_flat_batch(&mut source, &flat_format, opts)
+            .write_next_flat_batch(&mut source, converter, opts)
             .await
             .transpose()
         {
             match record_batch {
                 Ok(batch) => {
                     stats.update_flat(&batch)?;
-                    let start = Instant::now();
-                    // safety: self.current_indexer must be set when first batch has been written.
-                    self.current_indexer
-                        .as_mut()
-                        .unwrap()
-                        .update_flat(&batch)
-                        .await;
-                    self.metrics.update_index += start.elapsed();
+                    if matches!(self.index_config.build_mode, IndexBuildMode::Sync) {
+                        let start = Instant::now();
+                        // safety: self.current_indexer must be set when first batch has been written.
+                        self.current_indexer
+                            .as_mut()
+                            .unwrap()
+                            .update_flat(&batch)
+                            .await;
+                        self.metrics.update_index += start.elapsed();
+                    }
                     if let Some(max_file_size) = opts.max_file_size
                         && self.bytes_written.load(Ordering::Relaxed) > max_file_size
                     {
@@ -411,34 +391,10 @@ where
             .set_column_compression(op_type_col, Compression::UNCOMPRESSED)
     }
 
-    async fn write_next_batch(
-        &mut self,
-        source: &mut Source,
-        write_format: &PrimaryKeyWriteFormat,
-        opts: &WriteOptions,
-    ) -> Result<Option<Batch>> {
-        let start = Instant::now();
-        let Some(batch) = source.next_batch().await? else {
-            return Ok(None);
-        };
-        self.metrics.iter_source += start.elapsed();
-
-        let arrow_batch = write_format.convert_batch(&batch)?;
-
-        let start = Instant::now();
-        self.maybe_init_writer(write_format.arrow_schema(), opts)
-            .await?
-            .write(&arrow_batch)
-            .await
-            .context(WriteParquetSnafu)?;
-        self.metrics.write_batch += start.elapsed();
-        Ok(Some(batch))
-    }
-
     async fn write_next_flat_batch(
         &mut self,
         source: &mut FlatSource,
-        flat_format: &FlatWriteFormat,
+        converter: &FlatBatchConverter,
         opts: &WriteOptions,
     ) -> Result<Option<RecordBatch>> {
         let start = Instant::now();
@@ -447,15 +403,16 @@ where
         };
         self.metrics.iter_source += start.elapsed();
 
-        let arrow_batch = flat_format.convert_batch(&record_batch)?;
+        let arrow_batch = converter.convert_batch(&record_batch)?;
 
         let start = Instant::now();
-        self.maybe_init_writer(flat_format.arrow_schema(), opts)
+        self.maybe_init_writer(converter.arrow_schema(), opts)
             .await?
             .write(&arrow_batch)
             .await
             .context(WriteParquetSnafu)?;
         self.metrics.write_batch += start.elapsed();
+        // Return original flat batch for stats/indexer which use flat layout.
         Ok(Some(record_batch))
     }
 
@@ -515,26 +472,6 @@ struct SourceStats {
 }
 
 impl SourceStats {
-    fn update(&mut self, batch: &Batch) {
-        if batch.is_empty() {
-            return;
-        }
-
-        self.num_rows += batch.num_rows();
-        self.series_estimator.update(batch);
-        // Safety: batch is not empty.
-        let (min_in_batch, max_in_batch) = (
-            batch.first_timestamp().unwrap(),
-            batch.last_timestamp().unwrap(),
-        );
-        if let Some(time_range) = &mut self.time_range {
-            time_range.0 = time_range.0.min(min_in_batch);
-            time_range.1 = time_range.1.max(max_in_batch);
-        } else {
-            self.time_range = Some((min_in_batch, max_in_batch));
-        }
-    }
-
     fn update_flat(&mut self, record_batch: &RecordBatch) -> Result<()> {
         if record_batch.num_rows() == 0 {
             return Ok(());
diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs
index 389d9bf107..e9515030c0 100644
--- a/src/mito2/src/test_util/sst_util.rs
+++ b/src/mito2/src/test_util/sst_util.rs
@@ -18,7 +18,11 @@ use std::sync::Arc;
 
 use api::v1::{OpType, SemanticType};
 use common_time::Timestamp;
-use datatypes::arrow::array::{BinaryArray, TimestampMillisecondArray, UInt8Array, UInt64Array};
+use datatypes::arrow::array::{
+    ArrayRef, BinaryDictionaryBuilder, RecordBatch, StringDictionaryBuilder,
+    TimestampMillisecondArray, UInt8Array, UInt64Array,
+};
+use datatypes::arrow::datatypes::UInt32Type;
 use datatypes::prelude::ConcreteDataType;
 use datatypes::schema::{ColumnSchema, SkippingIndexOptions};
 use datatypes::value::ValueRef;
@@ -32,8 +36,9 @@ use store_api::metric_engine_consts::{
 use store_api::storage::consts::ReservedColumnId;
 use store_api::storage::{FileId, RegionId};
 
-use crate::read::{Batch, BatchBuilder, Source};
+use crate::read::{Batch, FlatSource, Source};
 use crate::sst::file::{FileHandle, FileMeta};
+use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
 use crate::test_util::{VecBatchReader, new_batch_builder, new_noop_file_purger};
 
 /// Test region id.
@@ -246,34 +251,68 @@ pub fn new_batch_by_range(tags: &[&str], start: usize, end: usize) -> Batch {
     new_batch_with_custom_sequence(tags, start, end, 1000)
 }
 
-pub fn new_batch_with_binary(tags: &[&str], start: usize, end: usize) -> Batch {
+/// Creates a flat format RecordBatch for testing.
+/// Similar to `new_batch_by_range` but returns a RecordBatch in flat format.
+pub fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch {
+    new_record_batch_with_custom_sequence(tags, start, end, 1000)
+}
+
+/// Creates a flat format RecordBatch for testing with a custom sequence.
+pub fn new_record_batch_with_custom_sequence(
+    tags: &[&str],
+    start: usize,
+    end: usize,
+    sequence: u64,
+) -> RecordBatch {
     assert!(end >= start);
+    let metadata = Arc::new(sst_region_metadata());
+    let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
+
+    let num_rows = end - start;
+    let mut columns = Vec::new();
+
+    // Add primary key columns (tag_0, tag_1) as dictionary arrays
+    let mut tag_0_builder = StringDictionaryBuilder::<UInt32Type>::new();
+    let mut tag_1_builder = StringDictionaryBuilder::<UInt32Type>::new();
+
+    for _ in 0..num_rows {
+        tag_0_builder.append_value(tags[0]);
+        tag_1_builder.append_value(tags[1]);
+    }
+
+    columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef);
+    columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef);
+
+    // Add field column (field_0)
+    let field_values: Vec<u64> = (start..end).map(|v| v as u64).collect();
+    columns.push(Arc::new(UInt64Array::from(field_values)));
+
+    // Add time index column (ts)
+    let timestamps: Vec<i64> = (start..end).map(|v| v as i64).collect();
+    columns.push(Arc::new(TimestampMillisecondArray::from(timestamps)));
+
+    // Add encoded primary key column
     let pk = new_primary_key(tags);
-    let timestamps: Vec<_> = (start..end).map(|v| v as i64).collect();
-    let sequences = vec![1000; end - start];
-    let op_types = vec![OpType::Put; end - start];
+    let mut pk_builder = BinaryDictionaryBuilder::<UInt32Type>::new();
+    for _ in 0..num_rows {
+        pk_builder.append(&pk).unwrap();
+    }
+    columns.push(Arc::new(pk_builder.finish()));
 
-    let field: Vec<_> = (start..end)
-        .map(|_v| "some data".as_bytes().to_vec())
-        .collect();
+    // Add sequence column
+    columns.push(Arc::new(UInt64Array::from_value(sequence, num_rows)));
 
-    let mut builder = BatchBuilder::new(pk);
-    builder
-        .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
-            timestamps.iter().copied(),
-        )))
-        .unwrap()
-        .sequences_array(Arc::new(UInt64Array::from_iter_values(
-            sequences.iter().copied(),
-        )))
-        .unwrap()
-        .op_types_array(Arc::new(UInt8Array::from_iter_values(
-            op_types.iter().map(|v| *v as u8),
-        )))
-        .unwrap()
-        .push_field_array(1, Arc::new(BinaryArray::from_iter_values(field)))
-        .unwrap();
-    builder.build().unwrap()
+    // Add op_type column
+    columns.push(Arc::new(UInt8Array::from_value(
+        OpType::Put as u8,
+        num_rows,
+    )));
+    RecordBatch::try_new(flat_schema, columns).unwrap()
+}
+
+/// Creates a FlatSource from flat format RecordBatches.
+pub fn new_flat_source_from_record_batches(batches: Vec<RecordBatch>) -> FlatSource {
+    FlatSource::Iter(Box::new(batches.into_iter().map(Ok)))
 }
 
 /// Creates a new region metadata for testing SSTs with binary datatype.

From 0dfbba0b3f4333a20c9a861c6339d085b988be7b Mon Sep 17 00:00:00 2001
From: liyang <daviderli614@gmail.com>
Date: Fri, 13 Mar 2026 20:42:15 +0800
Subject: [PATCH 10/42] ci: upload artifacts use s3 proxy (#7800)

* ci: upload artifacts use s3 proxy

Signed-off-by: liyang <daviderli614@gmail.com>

* update echo context

Signed-off-by: liyang <daviderli614@gmail.com>

---------

Signed-off-by: liyang <daviderli614@gmail.com>
---
 .../actions/release-cn-artifacts/action.yaml  | 32 +++++-----------
 .github/scripts/upload-artifacts-to-s3.sh     | 38 ++++++++++++-------
 .github/workflows/dev-build.yml               |  7 ++--
 .github/workflows/nightly-build.yml           |  7 ++--
 .github/workflows/release.yml                 |  7 ++--
 5 files changed, 44 insertions(+), 47 deletions(-)

diff --git a/.github/actions/release-cn-artifacts/action.yaml b/.github/actions/release-cn-artifacts/action.yaml
index 2825d3f5d0..fe78d5a760 100644
--- a/.github/actions/release-cn-artifacts/action.yaml
+++ b/.github/actions/release-cn-artifacts/action.yaml
@@ -37,17 +37,14 @@ inputs:
     description: Whether to push the latest tag of the image
     required: false
     default: 'true'
-  aws-cn-s3-bucket:
-    description: S3 bucket to store released artifacts in CN region
+  proxy-url:
+    description: The url of the S3 proxy server
     required: true
-  aws-cn-access-key-id:
-    description: AWS access key id in CN region
+  proxy-username:
+    description: The username of the S3 proxy
     required: true
-  aws-cn-secret-access-key:
-    description: AWS secret access key in CN region
-    required: true
-  aws-cn-region:
-    description: AWS region in CN
+  proxy-password:
+    description: The password of the S3 proxy
     required: true
   upload-to-s3:
     description: Upload to S3
@@ -77,21 +74,13 @@ runs:
       with:
         path: ${{ inputs.artifacts-dir }}
 
-    - name: Install s5cmd
-      shell: bash
-      run: |
-        wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz
-        tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz
-        sudo mv s5cmd /usr/local/bin/
-        sudo chmod +x /usr/local/bin/s5cmd
-
     - name: Release artifacts to cn region
       uses: nick-invision/retry@v2
       if: ${{ inputs.upload-to-s3 == 'true' }}
       env:
-        AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }}
-        AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }}
-        AWS_REGION: ${{ inputs.aws-cn-region }}
+        PROXY_URL: ${{ inputs.proxy-url }}
+        PROXY_USERNAME: ${{ inputs.proxy-username }}
+        PROXY_PASSWORD: ${{ inputs.proxy-password }}
         UPDATE_VERSION_INFO: ${{ inputs.update-version-info }}
       with:
         max_attempts: ${{ inputs.upload-max-retry-times }}
@@ -99,8 +88,7 @@ runs:
         command: |
           ./.github/scripts/upload-artifacts-to-s3.sh \
             ${{ inputs.artifacts-dir }} \
-            ${{ inputs.version }} \
-            ${{ inputs.aws-cn-s3-bucket }}
+            ${{ inputs.version }}
 
     - name: Push greptimedb image from Dockerhub to ACR
       shell: bash
diff --git a/.github/scripts/upload-artifacts-to-s3.sh b/.github/scripts/upload-artifacts-to-s3.sh
index 75c8f8d932..310575c069 100755
--- a/.github/scripts/upload-artifacts-to-s3.sh
+++ b/.github/scripts/upload-artifacts-to-s3.sh
@@ -5,16 +5,15 @@ set -o pipefail
 
 ARTIFACTS_DIR=$1
 VERSION=$2
-AWS_S3_BUCKET=$3
 RELEASE_DIRS="releases/greptimedb"
 GREPTIMEDB_REPO="GreptimeTeam/greptimedb"
 
 # Check if necessary variables are set.
 function check_vars() {
-  for var in AWS_S3_BUCKET VERSION ARTIFACTS_DIR; do
+  for var in VERSION ARTIFACTS_DIR; do
     if [ -z "${!var}" ]; then
       echo "$var is not set or empty."
-      echo "Usage: $0 <artifacts-dir> <version> <aws-s3-bucket>"
+      echo "Usage: $0 <artifacts-dir> <version>"
       exit 1
     fi
   done
@@ -33,8 +32,13 @@ function upload_artifacts() {
   #    ├── greptime-darwin-amd64-v0.2.0.sha256sum
   #    └── greptime-darwin-amd64-v0.2.0.tar.gz
   find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
-    s5cmd cp \
-      "$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")"
+    filename=$(basename "$file")
+    TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION/$filename"
+
+    curl -X PUT \
+      -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
+      -F "file=@$file" \
+      "$TARGET_URL"
   done
 }
 
@@ -45,16 +49,24 @@ function update_version_info() {
     if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
       echo "Updating latest-version.txt"
       echo "$VERSION" > latest-version.txt
-      s5cmd cp \
-        latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt"
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-version.txt"
+
+      curl -X PUT \
+        -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
+        -F "file=@latest-version.txt" \
+        "$TARGET_URL"
     fi
 
     # If it's the nightly release, update latest-nightly-version.txt.
     if [[ "$VERSION" == *"nightly"* ]]; then
       echo "Updating latest-nightly-version.txt"
       echo "$VERSION" > latest-nightly-version.txt
-      s5cmd cp \
-        latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt"
+
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-nightly-version.txt"
+      curl -X PUT \
+        -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
+        -F "file=@latest-nightly-version.txt" \
+        "$TARGET_URL"
     fi
   fi
 }
@@ -93,10 +105,10 @@ function main() {
 }
 
 # Usage example:
-#   AWS_ACCESS_KEY_ID=<your_access_key_id> \
-#   AWS_SECRET_ACCESS_KEY=<your_secret_access_key> \
-#   AWS_DEFAULT_REGION=<your_region> \
+#   PROXY_URL=<proxy_url> \
+#   PROXY_USERNAME=<proxy_username> \
+#   PROXY_PASSWORD=<proxy_password> \
 #   UPDATE_VERSION_INFO=true \
 #   DOWNLOAD_ARTIFACTS_FROM_GITHUB=false \
-#     ./upload-artifacts-to-s3.sh <artifacts-dir> <version> <aws-s3-bucket>
+#     ./upload-artifacts-to-s3.sh <artifacts-dir> <version>
 main
diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml
index 021867e4ed..d03fbeff14 100644
--- a/.github/workflows/dev-build.yml
+++ b/.github/workflows/dev-build.yml
@@ -285,10 +285,9 @@ jobs:
           dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
           dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
           version: ${{ needs.allocate-runners.outputs.version }}
-          aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
-          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
-          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
-          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          proxy-url: ${{ secrets.PROXY_URL }}
+          proxy-username: ${{ secrets.PROXY_USERNAME }}
+          proxy-password: ${{ secrets.PROXY_PASSWORD }}
           upload-to-s3: ${{ inputs.upload_artifacts_to_s3 }}
           dev-mode: true                     # Only build the standard images(exclude centos images).
           push-latest-tag: false             # Don't push the latest tag to registry.
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
index 9eaa38c789..14ebb6e715 100644
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -236,10 +236,9 @@ jobs:
           dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
           dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
           version: ${{ needs.allocate-runners.outputs.version }}
-          aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
-          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
-          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
-          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          proxy-url: ${{ secrets.PROXY_URL }}
+          proxy-username: ${{ secrets.PROXY_USERNAME }}
+          proxy-password: ${{ secrets.PROXY_PASSWORD }}
           upload-to-s3: false
           dev-mode: false
           update-version-info: false  # Don't update version info in S3.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3b0eb2d68c..9f8f2d9703 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -358,10 +358,9 @@ jobs:
           dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
           dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
           version: ${{ needs.allocate-runners.outputs.version }}
-          aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
-          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
-          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
-          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          proxy-url: ${{ secrets.PROXY_URL }}
+          proxy-username: ${{ secrets.PROXY_USERNAME }}
+          proxy-password: ${{ secrets.PROXY_PASSWORD }}
           dev-mode: false
           upload-to-s3: true
           update-version-info: true

From 306e8398cf441ab9041da8297144000eca4657b6 Mon Sep 17 00:00:00 2001
From: Ning Sun <sunng@protonmail.com>
Date: Mon, 16 Mar 2026 11:01:02 +0800
Subject: [PATCH 11/42] fix: correct unicode representation for jsonb_to_string
 (#7810)

* fix: correct unicode representation for jsonb_to_string

* refactor: correct function name and behavior

* fix: fix json_to_string and provide tests
---
 .../src/scalars/json/json_to_string.rs        |   3 +-
 src/datatypes/src/types/json_type.rs          | 146 ++++++++----------
 .../standalone/common/types/json/json.result  |  64 ++++----
 .../standalone/common/types/json/json.sql     |  30 ++--
 4 files changed, 119 insertions(+), 124 deletions(-)

diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs
index 6c0cc260b2..6364dff4de 100644
--- a/src/common/function/src/scalars/json/json_to_string.rs
+++ b/src/common/function/src/scalars/json/json_to_string.rs
@@ -19,6 +19,7 @@ use datafusion_common::DataFusionError;
 use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder};
 use datafusion_common::arrow::datatypes::DataType;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+use datatypes::types::jsonb_to_string;
 
 use crate::function::{Function, extract_args};
 
@@ -74,7 +75,7 @@ impl Function for JsonToStringFunction {
         for i in 0..size {
             let json = jsons.is_valid(i).then(|| jsons.value(i));
             let result = json
-                .map(|json| jsonb::from_slice(json).map(|x| x.to_string()))
+                .map(jsonb_to_string)
                 .transpose()
                 .map_err(|e| DataFusionError::Execution(format!("invalid json binary: {e}")))?;
 
diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs
index 61586fc460..912bbfca54 100644
--- a/src/datatypes/src/types/json_type.rs
+++ b/src/datatypes/src/types/json_type.rs
@@ -396,7 +396,7 @@ pub fn jsonb_to_string(val: &[u8]) -> Result<String> {
     match jsonb::from_slice(val) {
         Ok(jsonb_value) => {
             let serialized = jsonb_value.to_string();
-            Ok(serialized)
+            fix_unicode_point(&serialized)
         }
         Err(e) => InvalidJsonbSnafu { error: e }.fail(),
     }
@@ -405,18 +405,12 @@ pub fn jsonb_to_string(val: &[u8]) -> Result<String> {
 /// Converts a json type value to serde_json::Value
 pub fn jsonb_to_serde_json(val: &[u8]) -> Result<serde_json::Value> {
     let json_string = jsonb_to_string(val)?;
-    jsonb_string_to_serde_value(&json_string)
+    serde_json::Value::from_str(&json_string).context(DeserializeSnafu { json: json_string })
 }
 
-/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort
-/// fallback for Rust-style Unicode escape sequences.
+/// Normalizes a JSON string by converting Rust-style Unicode escape sequences to JSON-compatible format.
 ///
-/// This function is intended to be used on JSON strings produced from the internal
-/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls
-/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is
-/// returned as-is.
-///
-/// If the initial parse fails, the input is scanned for Rust-style Unicode code
+/// The input is scanned for Rust-style Unicode code
 /// point escapes of the form `\\u{H...}` (a backslash, `u`, an opening brace,
 /// followed by 1–6 hexadecimal digits, and a closing brace). Each such escape is
 /// converted into JSON-compatible UTF‑16 escape sequences:
@@ -427,59 +421,44 @@ pub fn jsonb_to_serde_json(val: &[u8]) -> Result<serde_json::Value> {
 ///   the code point is encoded as a UTF‑16 surrogate pair and emitted as two consecutive
 ///   `\\uXXXX` sequences (as JSON format required).
 ///
-/// After this normalization, the function retries parsing the resulting string as
-/// JSON and returns the deserialized value or a `DeserializeSnafu` error if it
-/// still cannot be parsed.
-fn jsonb_string_to_serde_value(json: &str) -> Result<serde_json::Value> {
-    match serde_json::Value::from_str(json) {
-        Ok(v) => Ok(v),
-        Err(e) => {
-            // If above deserialization is failed, the JSON string might contain some Rust chars
-            // that are somehow incorrectly represented as Unicode code point literal. For example,
-            // "\u{fe0f}". We have to convert them to JSON compatible format, like "\uFE0F", then
-            // try to deserialize the JSON string again.
-            if !e.is_syntax() || !e.to_string().contains("invalid escape") {
-                return Err(e).context(DeserializeSnafu { json });
-            }
+/// After this normalization, the function returns the normalized string
+fn fix_unicode_point(json: &str) -> Result<String> {
+    static UNICODE_CODE_POINT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
+        // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits
+        // inside braces.
+        Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e))
+    });
 
-            static UNICODE_CODE_POINT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
-                // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits
-                // inside braces.
-                Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e))
-            });
+    let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| {
+        // Extract the hex payload (without braces) and parse to a code point.
+        let hex = &caps[1];
+        let Ok(code) = u32::from_str_radix(hex, 16) else {
+            // On parse failure, leave the original escape sequence unchanged.
+            return caps[0].to_string();
+        };
 
-            let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| {
-                // Extract the hex payload (without braces) and parse to a code point.
-                let hex = &caps[1];
-                let Ok(code) = u32::from_str_radix(hex, 16) else {
-                    // On parse failure, leave the original escape sequence unchanged.
-                    return caps[0].to_string();
-                };
+        if code <= 0xFFFF {
+            // Basic Multilingual Plane: JSON can represent this directly as \uXXXX.
+            format!("\\u{:04X}", code)
+        } else if code > 0x10FFFF {
+            // Beyond max Unicode code point
+            caps[0].to_string()
+        } else {
+            // Supplementary planes: JSON needs UTF-16 surrogate pairs.
+            // Convert the code point to a 20-bit value.
+            let code = code - 0x10000;
 
-                if code <= 0xFFFF {
-                    // Basic Multilingual Plane: JSON can represent this directly as \uXXXX.
-                    format!("\\u{:04X}", code)
-                } else if code > 0x10FFFF {
-                    // Beyond max Unicode code point
-                    caps[0].to_string()
-                } else {
-                    // Supplementary planes: JSON needs UTF-16 surrogate pairs.
-                    // Convert the code point to a 20-bit value.
-                    let code = code - 0x10000;
+            // High surrogate: top 10 bits, offset by 0xD800.
+            let high = 0xD800 + ((code >> 10) & 0x3FF);
 
-                    // High surrogate: top 10 bits, offset by 0xD800.
-                    let high = 0xD800 + ((code >> 10) & 0x3FF);
+            // Low surrogate: bottom 10 bits, offset by 0xDC00.
+            let low = 0xDC00 + (code & 0x3FF);
 
-                    // Low surrogate: bottom 10 bits, offset by 0xDC00.
-                    let low = 0xDC00 + (code & 0x3FF);
-
-                    // Emit two \uXXXX escapes in sequence.
-                    format!("\\u{:04X}\\u{:04X}", high, low)
-                }
-            });
-            serde_json::Value::from_str(&v).context(DeserializeSnafu { json })
+            // Emit two \uXXXX escapes in sequence.
+            format!("\\u{:04X}\\u{:04X}", high, low)
         }
-    }
+    });
+    Ok(v.to_string())
 }
 
 /// Parses a string to a json type value
@@ -495,45 +474,54 @@ mod tests {
     use crate::json::JsonStructureSettings;
 
     #[test]
-    fn test_jsonb_string_to_serde_value() -> Result<()> {
+    fn test_fix_unicode_point() -> Result<()> {
         let valid_cases = vec![
-            (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#),
+            (r#"{"data": "simple ascii"}"#, r#"{"data": "simple ascii"}"#),
             (
-                r#"{"data": "Greek sigma: \u{03a3}"}"#,
-                r#"{"data":"Greek sigma: Σ"}"#,
+                r#"{"data":"Greek sigma: \u{03a3}"}"#,
+                r#"{"data":"Greek sigma: \u03A3"}"#,
             ),
             (
-                r#"{"data": "Joker card: \u{1f0df}"}"#,
-                r#"{"data":"Joker card: 🃟"}"#,
+                r#"{"data":"Joker card: \u{1f0df}"}"#,
+                r#"{"data":"Joker card: \uD83C\uDCDF"}"#,
             ),
             (
-                r#"{"data": "BMP boundary: \u{ffff}"}"#,
-                r#"{"data":"BMP boundary: ￿"}"#,
+                r#"{"data":"BMP boundary: \u{ffff}"}"#,
+                r#"{"data":"BMP boundary: \uFFFF"}"#,
             ),
             (
-                r#"{"data": "Supplementary min: \u{10000}"}"#,
-                r#"{"data":"Supplementary min: 𐀀"}"#,
+                r#"{"data":"Supplementary min: \u{10000}"}"#,
+                r#"{"data":"Supplementary min: \uD800\uDC00"}"#,
             ),
             (
-                r#"{"data": "Supplementary max: \u{10ffff}"}"#,
-                r#"{"data":"Supplementary max: 􏿿"}"#,
+                r#"{"data":"Supplementary max: \u{10ffff}"}"#,
+                r#"{"data":"Supplementary max: \uDBFF\uDFFF"}"#,
             ),
         ];
         for (input, expect) in valid_cases {
-            let v = jsonb_string_to_serde_value(input)?;
-            assert_eq!(v.to_string(), expect);
+            let v = fix_unicode_point(input)?;
+            assert_eq!(v, expect);
         }
 
-        let invalid_cases = vec![
-            r#"{"data": "Invalid hex: \u{gggg}"}"#,
-            r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#,
-            r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit
-            r#"{"data": "Empty braces: \u{}"}"#,
+        let invalid_escape_cases = vec![
+            (
+                r#"{"data": "Invalid hex: \u{gggg}"}"#,
+                r#"{"data": "Invalid hex: \u{gggg}"}"#,
+            ),
+            (
+                r#"{"data": "Empty braces: \u{}"}"#,
+                r#"{"data": "Empty braces: \u{}"}"#,
+            ),
+            (
+                r#"{"data": "Out of range: \u{1100000}"}"#,
+                r#"{"data": "Out of range: \u{1100000}"}"#,
+            ),
         ];
-        for input in invalid_cases {
-            let result = jsonb_string_to_serde_value(input);
-            assert!(result.is_err());
+        for (input, expect) in invalid_escape_cases {
+            let v = fix_unicode_point(input)?;
+            assert_eq!(v, expect);
         }
+
         Ok(())
     }
 
diff --git a/tests/cases/standalone/common/types/json/json.result b/tests/cases/standalone/common/types/json/json.result
index 8c4755f4ae..8fad9632b1 100644
--- a/tests/cases/standalone/common/types/json/json.result
+++ b/tests/cases/standalone/common/types/json/json.result
@@ -37,22 +37,23 @@ INSERT INTO jsons VALUES('[null]', 0),
             }
         ]
     }
-}}', 11);
+}}', 11),
+('{"a":"abc\u2028tom"}', 12);
 
-Affected Rows: 12
+Affected Rows: 13
 
-INSERT INTO jsons VALUES(parse_json('[null]'), 12),
-(parse_json('[true]'), 13),
-(parse_json('[false]'), 14),
-(parse_json('[0]'), 15),
-(parse_json('["foo"]'), 16),
-(parse_json('[]'), 17),
-(parse_json('{}'), 18),
-(parse_json('[0,1]'), 19),
-(parse_json('{"foo":"bar"}'), 20),
-(parse_json('{"a":null,"foo":"bar"}'), 21),
-(parse_json('[-1]'), 22),
-(parse_json('[-2147483648]'), 23),
+INSERT INTO jsons VALUES(parse_json('[null]'), 1000),
+(parse_json('[true]'), 1001),
+(parse_json('[false]'), 1002),
+(parse_json('[0]'), 1003),
+(parse_json('["foo"]'), 1004),
+(parse_json('[]'), 1005),
+(parse_json('{}'), 1006),
+(parse_json('[0,1]'), 1007),
+(parse_json('{"foo":"bar"}'), 1008),
+(parse_json('{"a":null,"foo":"bar"}'), 1009),
+(parse_json('[-1]'), 1010),
+(parse_json('[-2147483648]'), 1011),
 (parse_json('{"entities": {
             "description": {
                 "urls": [
@@ -76,9 +77,10 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12),
                     }
                 ]
             }
-        }}'), 24);
+        }}'), 1012),
+(parse_json('{"a":"abc\u2028tom"}'), 1013);
 
-Affected Rows: 13
+Affected Rows: 14
 
 SELECT json_to_string(j), t FROM jsons;
 
@@ -97,25 +99,27 @@ SELECT json_to_string(j), t FROM jsons;
 | {"a":null,"foo":"bar"}                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.009 |
 | [-1]                                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:00.010 |
 | {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.011 |
-| [null]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.012 |
-| [true]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.013 |
-| [false]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:00.014 |
-| [0]                                                                                                                                                                                                                                                                                                                       | 1970-01-01T00:00:00.015 |
-| ["foo"]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:00.016 |
-| []                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:00.017 |
-| {}                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:00.018 |
-| [0,1]                                                                                                                                                                                                                                                                                                                     | 1970-01-01T00:00:00.019 |
-| {"foo":"bar"}                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:00.020 |
-| {"a":null,"foo":"bar"}                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.021 |
-| [-1]                                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:00.022 |
-| [-2147483648]                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:00.023 |
-| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.024 |
+| {"a":"abc\u2028tom"}                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:00.012 |
+| [null]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:01     |
+| [true]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:01.001 |
+| [false]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:01.002 |
+| [0]                                                                                                                                                                                                                                                                                                                       | 1970-01-01T00:00:01.003 |
+| ["foo"]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:01.004 |
+| []                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:01.005 |
+| {}                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:01.006 |
+| [0,1]                                                                                                                                                                                                                                                                                                                     | 1970-01-01T00:00:01.007 |
+| {"foo":"bar"}                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:01.008 |
+| {"a":null,"foo":"bar"}                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:01.009 |
+| [-1]                                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:01.010 |
+| [-2147483648]                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:01.011 |
+| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:01.012 |
+| {"a":"abc\u2028tom"}                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:01.013 |
 +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
 
 --Insert invalid json strings--
 DELETE FROM jsons;
 
-Affected Rows: 25
+Affected Rows: 27
 
 INSERT INTO jsons VALUES(parse_json('{"a":1, "b":2, "c":3'), 4);
 
diff --git a/tests/cases/standalone/common/types/json/json.sql b/tests/cases/standalone/common/types/json/json.sql
index 868edc59e8..5a521ee1c6 100644
--- a/tests/cases/standalone/common/types/json/json.sql
+++ b/tests/cases/standalone/common/types/json/json.sql
@@ -35,20 +35,21 @@ INSERT INTO jsons VALUES('[null]', 0),
             }
         ]
     }
-}}', 11);
+}}', 11),
+('{"a":"abc\u2028tom"}', 12);
 
-INSERT INTO jsons VALUES(parse_json('[null]'), 12),
-(parse_json('[true]'), 13),
-(parse_json('[false]'), 14),
-(parse_json('[0]'), 15),
-(parse_json('["foo"]'), 16),
-(parse_json('[]'), 17),
-(parse_json('{}'), 18),
-(parse_json('[0,1]'), 19),
-(parse_json('{"foo":"bar"}'), 20),
-(parse_json('{"a":null,"foo":"bar"}'), 21),
-(parse_json('[-1]'), 22),
-(parse_json('[-2147483648]'), 23),
+INSERT INTO jsons VALUES(parse_json('[null]'), 1000),
+(parse_json('[true]'), 1001),
+(parse_json('[false]'), 1002),
+(parse_json('[0]'), 1003),
+(parse_json('["foo"]'), 1004),
+(parse_json('[]'), 1005),
+(parse_json('{}'), 1006),
+(parse_json('[0,1]'), 1007),
+(parse_json('{"foo":"bar"}'), 1008),
+(parse_json('{"a":null,"foo":"bar"}'), 1009),
+(parse_json('[-1]'), 1010),
+(parse_json('[-2147483648]'), 1011),
 (parse_json('{"entities": {
             "description": {
                 "urls": [
@@ -72,7 +73,8 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12),
                     }
                 ]
             }
-        }}'), 24);
+        }}'), 1012),
+(parse_json('{"a":"abc\u2028tom"}'), 1013);
 
 SELECT json_to_string(j), t FROM jsons;
 

From c6f1ef8aecfd78044fbadb88d1de70a7f1a94b39 Mon Sep 17 00:00:00 2001
From: jeremyhi <jiachun_feng@proton.me>
Date: Sun, 15 Mar 2026 20:52:27 -0700
Subject: [PATCH 12/42] feat: track unlimited usage in memory manager (#7811)

* feat: track unlimited usage in memory manager

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: address gemini review comment

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: remove unused import

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

---------

Signed-off-by: jeremyhi <fengjiachun@gmail.com>
---
 Cargo.lock                               |   1 -
 src/common/memory-manager/Cargo.toml     |   1 -
 src/common/memory-manager/src/guard.rs   | 131 +++++++++++++++--------
 src/common/memory-manager/src/manager.rs | 113 +++++++++++++++----
 src/common/memory-manager/src/tests.rs   |  18 ++--
 5 files changed, 192 insertions(+), 72 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 94f7a3eca1..1f65f1289c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2488,7 +2488,6 @@ version = "1.0.0-rc.2"
 dependencies = [
  "common-error",
  "common-macro",
- "common-telemetry",
  "humantime",
  "serde",
  "snafu 0.8.6",
diff --git a/src/common/memory-manager/Cargo.toml b/src/common/memory-manager/Cargo.toml
index a6be50f774..6686c98167 100644
--- a/src/common/memory-manager/Cargo.toml
+++ b/src/common/memory-manager/Cargo.toml
@@ -10,7 +10,6 @@ workspace = true
 [dependencies]
 common-error = { workspace = true }
 common-macro = { workspace = true }
-common-telemetry = { workspace = true }
 humantime = { workspace = true }
 serde = { workspace = true }
 snafu = { workspace = true }
diff --git a/src/common/memory-manager/src/guard.rs b/src/common/memory-manager/src/guard.rs
index 770b6dec24..ad3111581b 100644
--- a/src/common/memory-manager/src/guard.rs
+++ b/src/common/memory-manager/src/guard.rs
@@ -14,14 +14,13 @@
 
 use std::{fmt, mem};
 
-use common_telemetry::debug;
 use snafu::ensure;
 use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
 
 use crate::error::{
     MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
 };
-use crate::manager::{MemoryMetrics, MemoryQuota};
+use crate::manager::{MemoryMetrics, MemoryQuota, UnlimitedMemoryQuota};
 use crate::policy::OnExhaustedPolicy;
 
 /// Guard representing a slice of reserved memory.
@@ -30,31 +29,57 @@ pub struct MemoryGuard<M: MemoryMetrics> {
 }
 
 pub(crate) enum GuardState<M: MemoryMetrics> {
-    Unlimited,
+    Released,
+    Unlimited {
+        quota: UnlimitedMemoryQuota<M>,
+        granted_bytes: u64,
+    },
     Limited {
-        permit: OwnedSemaphorePermit,
         quota: MemoryQuota<M>,
+        permit: OwnedSemaphorePermit,
     },
 }
 
+impl<M: MemoryMetrics> GuardState<M> {
+    fn release(self) {
+        match self {
+            GuardState::Released => {}
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                quota.sub_in_use(granted_bytes);
+            }
+            GuardState::Limited { quota, permit } => {
+                quota.release_permit(permit);
+            }
+        }
+    }
+}
+
 impl<M: MemoryMetrics> MemoryGuard<M> {
-    pub(crate) fn unlimited() -> Self {
+    pub(crate) fn unlimited(quota: UnlimitedMemoryQuota<M>, bytes: u64) -> Self {
+        quota.add_in_use(bytes);
         Self {
-            state: GuardState::Unlimited,
+            state: GuardState::Unlimited {
+                quota,
+                granted_bytes: bytes,
+            },
         }
     }
 
-    pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota<M>) -> Self {
+    pub(crate) fn limited(quota: MemoryQuota<M>, permit: OwnedSemaphorePermit) -> Self {
         Self {
-            state: GuardState::Limited { permit, quota },
+            state: GuardState::Limited { quota, permit },
         }
     }
 
     /// Returns granted quota in bytes.
     pub fn granted_bytes(&self) -> u64 {
         match &self.state {
-            GuardState::Unlimited => 0,
-            GuardState::Limited { permit, quota } => {
+            GuardState::Released => 0,
+            GuardState::Unlimited { granted_bytes, .. } => *granted_bytes,
+            GuardState::Limited { quota, permit } => {
                 quota.permits_to_bytes(permit.num_permits() as u32)
             }
         }
@@ -68,13 +93,24 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
     /// - Returns error if requested bytes would exceed the manager's total limit
     /// - Returns error if the semaphore is unexpectedly closed
     pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> {
-        match &mut self.state {
-            GuardState::Unlimited => Ok(()),
-            GuardState::Limited { permit, quota } => {
-                if bytes == 0 {
-                    return Ok(());
-                }
+        if bytes == 0 {
+            return Ok(());
+        }
 
+        match &mut self.state {
+            GuardState::Released => {
+                debug_assert!(false, "released memory guard state should not be reused");
+                Ok(())
+            }
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                quota.add_in_use(bytes);
+                *granted_bytes = granted_bytes.saturating_add(bytes);
+                Ok(())
+            }
+            GuardState::Limited { quota, permit } => {
                 let additional_permits = quota.bytes_to_permits(bytes);
                 let current_permits = permit.num_permits() as u32;
 
@@ -95,7 +131,6 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
 
                 permit.merge(additional_permit);
                 quota.update_in_use_metric();
-                debug!("Acquired additional {} bytes", bytes);
                 Ok(())
             }
         }
@@ -106,13 +141,24 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
     /// On success, merges the new memory into this guard and returns true.
     /// On failure, returns false and leaves this guard unchanged.
     pub fn try_acquire_additional(&mut self, bytes: u64) -> bool {
-        match &mut self.state {
-            GuardState::Unlimited => true,
-            GuardState::Limited { permit, quota } => {
-                if bytes == 0 {
-                    return true;
-                }
+        if bytes == 0 {
+            return true;
+        }
 
+        match &mut self.state {
+            GuardState::Released => {
+                debug_assert!(false, "released memory guard state should not be reused");
+                false
+            }
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                quota.add_in_use(bytes);
+                *granted_bytes = granted_bytes.saturating_add(bytes);
+                true
+            }
+            GuardState::Limited { quota, permit } => {
                 let additional_permits = quota.bytes_to_permits(bytes);
 
                 match quota
@@ -123,7 +169,6 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
                     Ok(additional_permit) => {
                         permit.merge(additional_permit);
                         quota.update_in_use_metric();
-                        debug!("Acquired additional {} bytes", bytes);
                         true
                     }
                     Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
@@ -168,7 +213,8 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
                     MemoryLimitExceededSnafu {
                         requested_bytes: bytes,
                         limit_bytes: match &self.state {
-                            GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds
+                            GuardState::Released => 0,
+                            GuardState::Unlimited { .. } => 0, // unreachable: unlimited mode always succeeds
                             GuardState::Limited { quota, .. } => {
                                 quota.permits_to_bytes(quota.limit_permits)
                             }
@@ -184,22 +230,30 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
     ///
     /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted.
     pub fn release_partial(&mut self, bytes: u64) -> bool {
+        if bytes == 0 {
+            return true;
+        }
+
         match &mut self.state {
-            GuardState::Unlimited => true,
-            GuardState::Limited { permit, quota } => {
-                if bytes == 0 {
-                    return true;
+            GuardState::Released => true,
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                if bytes > *granted_bytes {
+                    return false;
                 }
 
+                quota.sub_in_use(bytes);
+                *granted_bytes = granted_bytes.saturating_sub(bytes);
+                true
+            }
+            GuardState::Limited { quota, permit } => {
                 let release_permits = quota.bytes_to_permits(bytes);
 
                 match permit.split(release_permits as usize) {
                     Some(released_permit) => {
-                        let released_bytes =
-                            quota.permits_to_bytes(released_permit.num_permits() as u32);
-                        drop(released_permit);
-                        quota.update_in_use_metric();
-                        debug!("Released {} bytes from memory guard", released_bytes);
+                        quota.release_permit(released_permit);
                         true
                     }
                     None => false,
@@ -211,14 +265,7 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
 
 impl<M: MemoryMetrics> Drop for MemoryGuard<M> {
     fn drop(&mut self) {
-        if let GuardState::Limited { permit, quota } =
-            mem::replace(&mut self.state, GuardState::Unlimited)
-        {
-            let bytes = quota.permits_to_bytes(permit.num_permits() as u32);
-            drop(permit);
-            quota.update_in_use_metric();
-            debug!("Released memory: {} bytes", bytes);
-        }
+        mem::replace(&mut self.state, GuardState::Released).release();
     }
 }
 
diff --git a/src/common/memory-manager/src/manager.rs b/src/common/memory-manager/src/manager.rs
index 50360d2a31..8cca5f220c 100644
--- a/src/common/memory-manager/src/manager.rs
+++ b/src/common/memory-manager/src/manager.rs
@@ -13,9 +13,10 @@
 // limitations under the License.
 
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 
 use snafu::ensure;
-use tokio::sync::{Semaphore, TryAcquireError};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
 
 use crate::error::{
     MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
@@ -34,7 +35,7 @@ pub trait MemoryMetrics: Clone + Send + Sync + 'static {
 /// Generic memory manager for quota-controlled operations.
 #[derive(Clone)]
 pub struct MemoryManager<M: MemoryMetrics> {
-    quota: Option<MemoryQuota<M>>,
+    quota: MemoryQuotaState<M>,
 }
 
 impl<M: MemoryMetrics + Default> Default for MemoryManager<M> {
@@ -51,6 +52,18 @@ pub(crate) struct MemoryQuota<M: MemoryMetrics> {
     pub(crate) metrics: M,
 }
 
+#[derive(Clone)]
+pub(crate) struct UnlimitedMemoryQuota<M: MemoryMetrics> {
+    pub(crate) current_bytes: Arc<AtomicU64>,
+    pub(crate) metrics: M,
+}
+
+#[derive(Clone)]
+pub(crate) enum MemoryQuotaState<M: MemoryMetrics> {
+    Unlimited(UnlimitedMemoryQuota<M>),
+    Limited(MemoryQuota<M>),
+}
+
 impl<M: MemoryMetrics> MemoryManager<M> {
     /// Creates a new memory manager with the given limit in bytes.
     /// `limit_bytes = 0` disables the limit.
@@ -62,7 +75,12 @@ impl<M: MemoryMetrics> MemoryManager<M> {
     pub fn with_granularity(limit_bytes: u64, granularity: PermitGranularity, metrics: M) -> Self {
         if limit_bytes == 0 {
             metrics.set_limit(0);
-            return Self { quota: None };
+            return Self {
+                quota: MemoryQuotaState::Unlimited(UnlimitedMemoryQuota {
+                    current_bytes: Arc::new(AtomicU64::new(0)),
+                    metrics,
+                }),
+            };
         }
 
         let limit_permits = granularity.bytes_to_permits(limit_bytes);
@@ -70,7 +88,7 @@ impl<M: MemoryMetrics> MemoryManager<M> {
         metrics.set_limit(limit_aligned_bytes as i64);
 
         Self {
-            quota: Some(MemoryQuota {
+            quota: MemoryQuotaState::Limited(MemoryQuota {
                 semaphore: Arc::new(Semaphore::new(limit_permits as usize)),
                 limit_permits,
                 granularity,
@@ -81,26 +99,30 @@ impl<M: MemoryMetrics> MemoryManager<M> {
 
     /// Returns the configured limit in bytes (0 if unlimited).
     pub fn limit_bytes(&self) -> u64 {
-        self.quota
-            .as_ref()
-            .map(|quota| quota.permits_to_bytes(quota.limit_permits))
-            .unwrap_or(0)
+        match &self.quota {
+            MemoryQuotaState::Unlimited(_) => 0,
+            MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.limit_permits),
+        }
     }
 
     /// Returns currently used bytes.
     pub fn used_bytes(&self) -> u64 {
-        self.quota
-            .as_ref()
-            .map(|quota| quota.permits_to_bytes(quota.used_permits()))
-            .unwrap_or(0)
+        match &self.quota {
+            MemoryQuotaState::Unlimited(quota) => quota.current_bytes.load(Ordering::Acquire),
+            MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.used_permits()),
+        }
     }
 
     /// Returns available bytes.
+    ///
+    /// Unlimited managers report `u64::MAX`.
     pub fn available_bytes(&self) -> u64 {
-        self.quota
-            .as_ref()
-            .map(|quota| quota.permits_to_bytes(quota.available_permits_clamped()))
-            .unwrap_or(0)
+        match &self.quota {
+            MemoryQuotaState::Unlimited(_) => u64::MAX,
+            MemoryQuotaState::Limited(quota) => {
+                quota.permits_to_bytes(quota.available_permits_clamped())
+            }
+        }
     }
 
     /// Acquires memory, waiting if necessary until enough is available.
@@ -110,8 +132,8 @@ impl<M: MemoryMetrics> MemoryManager<M> {
     /// - Returns error if the semaphore is unexpectedly closed
     pub async fn acquire(&self, bytes: u64) -> Result<MemoryGuard<M>> {
         match &self.quota {
-            None => Ok(MemoryGuard::unlimited()),
-            Some(quota) => {
+            MemoryQuotaState::Unlimited(quota) => Ok(MemoryGuard::unlimited(quota.clone(), bytes)),
+            MemoryQuotaState::Limited(quota) => {
                 let permits = quota.bytes_to_permits(bytes);
 
                 ensure!(
@@ -129,7 +151,7 @@ impl<M: MemoryMetrics> MemoryManager<M> {
                     .await
                     .map_err(|_| MemorySemaphoreClosedSnafu.build())?;
                 quota.update_in_use_metric();
-                Ok(MemoryGuard::limited(permit, quota.clone()))
+                Ok(MemoryGuard::limited(quota.clone(), permit))
             }
         }
     }
@@ -137,14 +159,16 @@ impl<M: MemoryMetrics> MemoryManager<M> {
     /// Tries to acquire memory. Returns Some(guard) on success, None if insufficient.
     pub fn try_acquire(&self, bytes: u64) -> Option<MemoryGuard<M>> {
         match &self.quota {
-            None => Some(MemoryGuard::unlimited()),
-            Some(quota) => {
+            MemoryQuotaState::Unlimited(quota) => {
+                Some(MemoryGuard::unlimited(quota.clone(), bytes))
+            }
+            MemoryQuotaState::Limited(quota) => {
                 let permits = quota.bytes_to_permits(bytes);
 
                 match quota.semaphore.clone().try_acquire_many_owned(permits) {
                     Ok(permit) => {
                         quota.update_in_use_metric();
-                        Some(MemoryGuard::limited(permit, quota.clone()))
+                        Some(MemoryGuard::limited(quota.clone(), permit))
                     }
                     Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
                         quota.metrics.inc_rejected("try_acquire");
@@ -219,4 +243,49 @@ impl<M: MemoryMetrics> MemoryQuota<M> {
         let bytes = self.permits_to_bytes(self.used_permits());
         self.metrics.set_in_use(bytes as i64);
     }
+
+    pub(crate) fn release_permit(&self, permit: OwnedSemaphorePermit) {
+        drop(permit);
+        self.update_in_use_metric();
+    }
+}
+
+impl<M: MemoryMetrics> UnlimitedMemoryQuota<M> {
+    pub(crate) fn add_in_use(&self, bytes: u64) {
+        if bytes == 0 {
+            return;
+        }
+
+        let previous = self
+            .current_bytes
+            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
+                Some(current.saturating_add(bytes))
+            })
+            .unwrap();
+        let new_total = previous.saturating_add(bytes);
+        debug_assert!(
+            new_total >= previous,
+            "unlimited memory usage counter overflowed"
+        );
+        self.metrics.set_in_use(new_total as i64);
+    }
+
+    pub(crate) fn sub_in_use(&self, bytes: u64) {
+        if bytes == 0 {
+            return;
+        }
+
+        let previous = self
+            .current_bytes
+            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
+                Some(current.saturating_sub(bytes))
+            })
+            .unwrap();
+        debug_assert!(
+            previous >= bytes,
+            "unlimited memory usage counter underflowed: current={previous}, release={bytes}"
+        );
+        let new_total = previous.saturating_sub(bytes);
+        self.metrics.set_in_use(new_total as i64);
+    }
 }
diff --git a/src/common/memory-manager/src/tests.rs b/src/common/memory-manager/src/tests.rs
index 886eef9dac..fe02703f0b 100644
--- a/src/common/memory-manager/src/tests.rs
+++ b/src/common/memory-manager/src/tests.rs
@@ -24,7 +24,9 @@ fn test_try_acquire_unlimited() {
     let manager = MemoryManager::new(0, NoOpMetrics);
     let guard = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
     assert_eq!(manager.limit_bytes(), 0);
-    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(manager.available_bytes(), u64::MAX);
+    assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
 }
 
 #[test]
@@ -136,7 +138,10 @@ fn test_request_additional_unlimited() {
 
     // Should always succeed with unlimited manager
     assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES));
-    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 105 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 105 * PERMIT_GRANULARITY_BYTES);
+
+    drop(guard);
     assert_eq!(manager.used_bytes(), 0);
 }
 
@@ -187,9 +192,10 @@ fn test_early_release_partial_unlimited() {
     let manager = MemoryManager::new(0, NoOpMetrics);
     let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();
 
-    // Unlimited guard - release should succeed (no-op)
+    // Unlimited guard should track and release exact bytes.
     assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES));
-    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 50 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 50 * PERMIT_GRANULARITY_BYTES);
 }
 
 #[test]
@@ -406,6 +412,6 @@ async fn test_acquire_additional_unlimited() {
         .acquire_additional(1000 * PERMIT_GRANULARITY_BYTES)
         .await
         .unwrap();
-    assert_eq!(guard.granted_bytes(), 0);
-    assert_eq!(manager.used_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 1000 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 1000 * PERMIT_GRANULARITY_BYTES);
 }

From b007f8598633e2ad6a5ec9b226e1dbe02a6b083d Mon Sep 17 00:00:00 2001
From: maximk777 <maximkirienkov777@gmail.com>
Date: Mon, 16 Mar 2026 12:10:33 +0500
Subject: [PATCH 13/42] feat(http): improve error logging with client IP
 (#7503)

* feat(http): improve error logging with client IP

- Add logging to ErrorResponse::from_error_message()
- Add middleware to log HTTP errors with client IP

Closes #7328

Signed-off-by: maximk777 <maximkirienkov777@gmail.com>

* fix(http): address review comments for error logging

Restore rich Debug logging in from_error(), add URI/method/matched path
to client IP middleware, and only log when client address is available.

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: maximk777 <maximkirienkov777@gmail.com>
Signed-off-by: evenyag <realevenyag@gmail.com>
Co-authored-by: evenyag <realevenyag@gmail.com>
---
 src/servers/src/http.rs                     |   8 +-
 src/servers/src/http/client_ip.rs           | 109 ++++++++++++++++++++
 src/servers/src/http/result/error_result.rs |  13 ++-
 3 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 src/servers/src/http/client_ip.rs

diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs
index ffd0745041..506a240cac 100644
--- a/src/servers/src/http.rs
+++ b/src/servers/src/http.rs
@@ -112,8 +112,8 @@ pub mod utils;
 use result::HttpOutputWriter;
 pub(crate) use timeout::DynamicTimeoutLayer;
 
+mod client_ip;
 use crate::prom_remote_write::validation::PromValidationMode;
-
 mod hints;
 mod read_preference;
 #[cfg(any(test, feature = "testing"))]
@@ -883,6 +883,7 @@ impl HttpServer {
                         authorize::check_http_auth,
                     ))
                     .layer(middleware::from_fn(hints::extract_hints))
+                    .layer(middleware::from_fn(client_ip::log_error_with_client_ip))
                     .layer(middleware::from_fn(
                         read_preference::extract_read_preference,
                     )),
@@ -1247,7 +1248,10 @@ impl Server for HttpServer {
                         error!(e; "Failed to set TCP_NODELAY on incoming connection");
                     }
                 });
-            let serve = axum::serve(listener, app.into_make_service());
+            let serve = axum::serve(
+                listener,
+                app.into_make_service_with_connect_info::<SocketAddr>(),
+            );
 
             // FIXME(yingwen): Support keepalive.
             // See:
diff --git a/src/servers/src/http/client_ip.rs b/src/servers/src/http/client_ip.rs
new file mode 100644
index 0000000000..70df554ebb
--- /dev/null
+++ b/src/servers/src/http/client_ip.rs
@@ -0,0 +1,109 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::net::SocketAddr;
+
+use axum::body::Body;
+use axum::extract::{ConnectInfo, MatchedPath};
+use axum::http::Request;
+use axum::middleware::Next;
+use axum::response::Response;
+use common_telemetry::warn;
+
+/// Axum middleware that warns about every HTTP error response (4xx/5xx).
+///
+/// The log line carries the response status, request method, URI, matched
+/// route and the client address read from the [`ConnectInfo`] extension.
+/// When that extension is absent nothing is logged.
+///
+/// The response produced by `next` is always returned unchanged.
+pub async fn log_error_with_client_ip(req: Request<Body>, next: Next) -> Response {
+    // Capture the request details up front because `next.run` consumes `req`.
+    let request_info = req
+        .extensions()
+        .get::<ConnectInfo<SocketAddr>>()
+        .map(|&ConnectInfo(addr)| {
+            let matched_path = req.extensions().get::<MatchedPath>().cloned();
+            (addr, req.method().clone(), req.uri().clone(), matched_path)
+        });
+
+    // Hand the request to the rest of the stack.
+    let response = next.run(req).await;
+    let status = response.status();
+
+    // Only error statuses with a known client address produce a warning.
+    if (status.is_client_error() || status.is_server_error())
+        && let Some((addr, method, uri, matched_path)) = request_info
+    {
+        // Fall back to a placeholder when no `MatchedPath` extension was present.
+        let matched = matched_path.as_ref().map_or("<unknown>", |p| p.as_str());
+        warn!(
+            "HTTP error response {} for {} {} (matched: {}) from client {}",
+            status, method, uri, matched, addr
+        );
+    }
+
+    response
+}
+
+#[cfg(test)]
+mod tests {
+    use axum::Router;
+    use axum::routing::get;
+    use http::StatusCode;
+    use tower::ServiceExt;
+
+    use super::*;
+
+    /// Builds a bodiless request for `uri`, optionally tagged with a client address.
+    fn request(uri: &str, client: Option<SocketAddr>) -> Request<Body> {
+        let builder = Request::builder().uri(uri);
+        let builder = match client {
+            Some(addr) => builder.extension(ConnectInfo(addr)),
+            None => builder,
+        };
+        builder.body(Body::empty()).unwrap()
+    }
+
+    /// An error status reaches the caller with and without a client address.
+    #[tokio::test]
+    async fn test_middleware_passes_error_response() {
+        let app = Router::new()
+            .route("/not-found", get(|| async { StatusCode::NOT_FOUND }))
+            .layer(axum::middleware::from_fn(log_error_with_client_ip));
+
+        // The warning branch only runs when `ConnectInfo` is attached, so cover both.
+        let addr = SocketAddr::from(([127, 0, 0, 1], 4000));
+        for client in [None, Some(addr)] {
+            let response = app
+                .clone()
+                .oneshot(request("/not-found", client))
+                .await
+                .unwrap();
+            assert_eq!(response.status(), StatusCode::NOT_FOUND);
+        }
+    }
+
+    /// A success status reaches the caller unchanged.
+    #[tokio::test]
+    async fn test_middleware_passes_success_response() {
+        let app = Router::new()
+            .route("/ok", get(|| async { StatusCode::OK }))
+            .layer(axum::middleware::from_fn(log_error_with_client_ip));
+
+        let response = app.oneshot(request("/ok", None)).await.unwrap();
+
+        assert_eq!(response.status(), StatusCode::OK);
+    }
+}
diff --git a/src/servers/src/http/result/error_result.rs b/src/servers/src/http/result/error_result.rs
index 7b70066b68..9bd6e1a7a3 100644
--- a/src/servers/src/http/result/error_result.rs
+++ b/src/servers/src/http/result/error_result.rs
@@ -32,17 +32,24 @@ pub struct ErrorResponse {
 impl ErrorResponse {
     pub fn from_error(error: impl ErrorExt) -> Self {
         let code = error.status_code();
-
         if code.should_log_error() {
             error!(error; "Failed to handle HTTP request");
         } else {
             debug!("Failed to handle HTTP request, err: {:?}", error);
         }
-
-        Self::from_error_message(code, error.output_msg())
+        ErrorResponse {
+            code: code as u32,
+            error: error.output_msg(),
+            execution_time_ms: 0,
+        }
     }
 
     pub fn from_error_message(code: StatusCode, msg: String) -> Self {
+        if code.should_log_error() {
+            error!("Failed to handle HTTP request: {}", msg);
+        } else {
+            debug!("Failed to handle HTTP request: {}", msg);
+        }
         ErrorResponse {
             code: code as u32,
             error: msg,

From be4a7a6d371f29377bcc8acfa2c1f1a24b31d7e1 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Mon, 16 Mar 2026 15:49:31 +0800
Subject: [PATCH 14/42] refactor: remove Memtable::iter (#7809)

* refactor: remove Memtable::iter

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix: review comments

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
---
 src/mito2/benches/memtable_bench.rs           | 20 ++++-
 src/mito2/benches/simple_bulk_memtable.rs     |  8 +-
 src/mito2/src/memtable.rs                     | 30 +++----
 src/mito2/src/memtable/bulk.rs                | 10 ---
 src/mito2/src/memtable/partition_tree.rs      | 85 +++++++-----------
 .../src/memtable/simple_bulk_memtable.rs      | 68 +++++++++-----
 .../simple_bulk_memtable/test_only.rs         | 88 +------------------
 src/mito2/src/memtable/time_partition.rs      | 62 +++++++++++--
 src/mito2/src/memtable/time_series.rs         | 55 ++++--------
 src/mito2/src/test_util/memtable_util.rs      | 10 ---
 10 files changed, 184 insertions(+), 252 deletions(-)

diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs
index ebe994f861..df991f6f92 100644
--- a/src/mito2/benches/memtable_bench.rs
+++ b/src/mito2/benches/memtable_bench.rs
@@ -28,7 +28,7 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter;
 use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig};
 use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
 use mito2::memtable::time_series::TimeSeriesMemtable;
-use mito2::memtable::{KeyValues, Memtable, RangesOptions};
+use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions};
 use mito2::read::flat_merge::FlatMergeIterator;
 use mito2::read::scan_region::PredicateGroup;
 use mito2::region::options::MergeMode;
@@ -105,7 +105,11 @@ fn full_scan(c: &mut Criterion) {
         }
 
         b.iter(|| {
-            let iter = memtable.iter(None, None, None).unwrap();
+            let iter = memtable
+                .ranges(None, RangesOptions::default())
+                .unwrap()
+                .build(None)
+                .unwrap();
             for batch in iter {
                 let _batch = batch.unwrap();
             }
@@ -145,7 +149,17 @@ fn filter_1_host(c: &mut Criterion) {
         let predicate = generator.random_host_filter();
 
         b.iter(|| {
-            let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap();
+            let iter = memtable
+                .ranges(
+                    None,
+                    RangesOptions {
+                        predicate: PredicateGroup::new(&metadata, predicate.exprs()).unwrap(),
+                        ..Default::default()
+                    },
+                )
+                .unwrap()
+                .build(None)
+                .unwrap();
             for batch in iter {
                 let _batch = batch.unwrap();
             }
diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs
index 0277397768..05035734de 100644
--- a/src/mito2/benches/simple_bulk_memtable.rs
+++ b/src/mito2/benches/simple_bulk_memtable.rs
@@ -21,7 +21,7 @@ use criterion::{Criterion, criterion_group, criterion_main};
 use datatypes::data_type::ConcreteDataType;
 use datatypes::schema::ColumnSchema;
 use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable;
-use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions};
+use mito2::memtable::{IterBuilder, KeyValues, Memtable, MemtableRanges, RangesOptions};
 use mito2::read;
 use mito2::read::Source;
 use mito2::read::dedup::DedupReader;
@@ -156,7 +156,11 @@ async fn flush(mem: &SimpleBulkMemtable) {
 }
 
 async fn flush_original(mem: &SimpleBulkMemtable) {
-    let iter = mem.iter(None, None, None).unwrap();
+    let iter = mem
+        .ranges(None, RangesOptions::default())
+        .unwrap()
+        .build(None)
+        .unwrap();
     for b in iter {
         black_box(b.unwrap());
     }
diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs
index c39bbfa346..7494ec68ed 100644
--- a/src/mito2/src/memtable.rs
+++ b/src/mito2/src/memtable.rs
@@ -28,6 +28,7 @@ use mito_codec::key_values::KeyValue;
 pub use mito_codec::key_values::KeyValues;
 use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
 use serde::{Deserialize, Serialize};
+use snafu::ensure;
 use store_api::metadata::RegionMetadataRef;
 use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
 
@@ -231,10 +232,17 @@ impl MemtableRanges {
 
 impl IterBuilder for MemtableRanges {
     fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
-        UnsupportedOperationSnafu {
-            err_msg: "MemtableRanges does not support build iterator",
-        }
-        .fail()
+        ensure!(
+            self.ranges.len() == 1,
+            UnsupportedOperationSnafu {
+                err_msg: format!(
+                    "Building an iterator from MemtableRanges expects 1 range, but got {}",
+                    self.ranges.len()
+                ),
+            }
+        );
+        // `ensure!` above guarantees exactly one range, so `unwrap` cannot panic.
+        self.ranges.values().next().unwrap().build_iter()
     }
 
     fn is_record_batch(&self) -> bool {
@@ -256,20 +264,6 @@ pub trait Memtable: Send + Sync + fmt::Debug {
     /// Writes an encoded batch of into memtable.
     fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>;
 
-    /// Scans the memtable.
-    /// `projection` selects columns to read, `None` means reading all columns.
-    /// `filters` are the predicates to be pushed down to memtable.
-    ///
-    /// # Note
-    /// This method should only be used for tests.
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        predicate: Option<table::predicate::Predicate>,
-        sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator>;
-
     /// Returns the ranges in the memtable.
     ///
     /// The returned map contains the range id and the range after applying the predicate.
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs
index 6056a42013..4dad4fb885 100644
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -462,16 +462,6 @@ impl Memtable for BulkMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        _projection: Option<&[ColumnId]>,
-        _predicate: Option<table::predicate::Predicate>,
-        _sequence: Option<SequenceRange>,
-    ) -> Result<crate::memtable::BoxedBatchIterator> {
-        todo!()
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs
index febae46784..662bfd99f6 100644
--- a/src/mito2/src/memtable/partition_tree.rs
+++ b/src/mito2/src/memtable/partition_tree.rs
@@ -177,16 +177,6 @@ impl Memtable for PartitionTreeMemtable {
         .fail()
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        predicate: Option<Predicate>,
-        sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator> {
-        self.tree.read(projection, predicate, sequence, None)
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -396,8 +386,6 @@ mod tests {
     use api::v1::{Mutation, OpType, Rows, SemanticType};
     use common_query::prelude::{greptime_timestamp, greptime_value};
     use common_time::Timestamp;
-    use datafusion_common::Column;
-    use datafusion_expr::{BinaryExpr, Expr, Literal, Operator};
     use datatypes::data_type::ConcreteDataType;
     use datatypes::prelude::Vector;
     use datatypes::scalars::ScalarVector;
@@ -548,7 +536,10 @@ mod tests {
         let expect = (0..100).collect::<Vec<_>>();
         let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
         memtable.write(&kvs).unwrap();
-        let iter = memtable.iter(Some(&[3]), None, None).unwrap();
+        let ranges = memtable
+            .ranges(Some(&[3]), RangesOptions::default())
+            .unwrap();
+        let iter = ranges.build(None).unwrap();
 
         let mut v0_all = vec![];
         for res in iter {
@@ -625,41 +616,6 @@ mod tests {
         assert_eq!(expect, read);
     }
 
-    #[test]
-    fn test_memtable_filter() {
-        let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![0, 1], false));
-        // Try to build a memtable via the builder.
-        let memtable = PartitionTreeMemtableBuilder::new(
-            PartitionTreeConfig {
-                index_max_keys_per_shard: 40,
-                ..Default::default()
-            },
-            None,
-        )
-        .build(1, &metadata);
-
-        for i in 0..100 {
-            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
-            let kvs =
-                memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
-            memtable.write(&kvs).unwrap();
-        }
-
-        for i in 0..100 {
-            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
-            let expr = Expr::BinaryExpr(BinaryExpr {
-                left: Box::new(Expr::Column(Column::from_name("k1"))),
-                op: Operator::Eq,
-                right: Box::new((i as u32).lit()),
-            });
-            let iter = memtable
-                .iter(None, Some(Predicate::new(vec![expr])), None)
-                .unwrap();
-            let read = collect_iter_timestamps(iter);
-            assert_eq!(timestamps, read);
-        }
-    }
-
     #[test]
     fn test_deserialize_config() {
         let config = PartitionTreeConfig {
@@ -811,7 +767,11 @@ mod tests {
             ))
             .unwrap();
 
-        let mut reader = new_memtable.iter(None, None, None).unwrap();
+        let mut reader = new_memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = reader.next().unwrap().unwrap();
         let pk = codec.decode(batch.primary_key()).unwrap().into_dense();
         if let Value::String(s) = &pk[2] {
@@ -916,7 +876,14 @@ mod tests {
             .unwrap();
         memtable.freeze().unwrap();
         assert_eq!(
-            collect_kvs(memtable.iter(None, None, None).unwrap(), &metadata),
+            collect_kvs(
+                memtable
+                    .ranges(None, RangesOptions::default())
+                    .unwrap()
+                    .build(None)
+                    .unwrap(),
+                &metadata
+            ),
             ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect()
         );
         let forked = memtable.fork(2, &metadata);
@@ -925,7 +892,14 @@ mod tests {
         forked.write(&key_values(&metadata, keys.iter())).unwrap();
         forked.freeze().unwrap();
         assert_eq!(
-            collect_kvs(forked.iter(None, None, None).unwrap(), &metadata),
+            collect_kvs(
+                forked
+                    .ranges(None, RangesOptions::default())
+                    .unwrap()
+                    .build(None)
+                    .unwrap(),
+                &metadata
+            ),
             keys.iter()
                 .map(|c| (c.to_string(), c.to_string()))
                 .collect()
@@ -936,7 +910,14 @@ mod tests {
         let keys = ["g", "e", "a", "f", "b", "c", "h"];
         forked2.write(&key_values(&metadata, keys.iter())).unwrap();
 
-        let kvs = collect_kvs(forked2.iter(None, None, None).unwrap(), &metadata);
+        let kvs = collect_kvs(
+            forked2
+                .ranges(None, RangesOptions::default())
+                .unwrap()
+                .build(None)
+                .unwrap(),
+            &metadata,
+        );
         let expected = keys
             .iter()
             .map(|c| (c.to_string(), c.to_string()))
diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs
index 4dcaa2bac0..6d91f00361 100644
--- a/src/mito2/src/memtable/simple_bulk_memtable.rs
+++ b/src/mito2/src/memtable/simple_bulk_memtable.rs
@@ -213,22 +213,6 @@ impl Memtable for SimpleBulkMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        _predicate: Option<table::predicate::Predicate>,
-        sequence: Option<store_api::storage::SequenceRange>,
-    ) -> error::Result<BoxedBatchIterator> {
-        let iter = self.create_iter(projection, sequence)?.build(None)?;
-        if self.merge_mode == MergeMode::LastNonNull {
-            let iter = LastNonNullIter::new(iter);
-            Ok(Box::new(iter))
-        } else {
-            Ok(Box::new(iter))
-        }
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -526,7 +510,11 @@ mod tests {
             ))
             .unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(2, batch.num_rows());
         assert_eq!(2, batch.fields().len());
@@ -551,7 +539,11 @@ mod tests {
             ))
             .unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(1, batch.num_rows());
         assert_eq!(2, batch.fields().len());
@@ -565,7 +557,11 @@ mod tests {
 
         // Only project column 2 (f1)
         let projection = vec![2];
-        let mut iter = memtable.iter(Some(&projection), None, None).unwrap();
+        let mut iter = memtable
+            .ranges(Some(&projection), RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
 
         assert_eq!(1, batch.num_rows());
@@ -592,7 +588,11 @@ mod tests {
                 OpType::Put,
             ))
             .unwrap();
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
 
         assert_eq!(1, batch.num_rows()); // deduped to 1 row
@@ -611,7 +611,11 @@ mod tests {
         let kv = kvs.iter().next().unwrap();
         memtable.write_one(kv).unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(1, batch.num_rows());
     }
@@ -745,7 +749,11 @@ mod tests {
         };
         memtable.write_bulk(part).unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(2, batch.num_rows());
 
@@ -764,7 +772,11 @@ mod tests {
             OpType::Put,
         );
         memtable.write(&kvs).unwrap();
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(3, batch.num_rows());
         assert_eq!(
@@ -854,7 +866,15 @@ mod tests {
 
         // Filter with sequence 0 should only return first write
         let mut iter = memtable
-            .iter(None, None, Some(SequenceRange::LtEq { max: 0 }))
+            .ranges(
+                None,
+                RangesOptions {
+                    sequence: Some(SequenceRange::LtEq { max: 0 }),
+                    ..Default::default()
+                },
+            )
+            .unwrap()
+            .build(None)
             .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(1, batch.num_rows());
diff --git a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs
index b71a86c554..08edebdbb2 100644
--- a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs
+++ b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs
@@ -12,98 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::HashSet;
-use std::time::Instant;
-
 use store_api::metadata::RegionMetadataRef;
-use store_api::storage::{ColumnId, SequenceRange};
 
-use crate::error;
-use crate::memtable::simple_bulk_memtable::{Iter, SimpleBulkMemtable};
-use crate::memtable::time_series::Values;
-use crate::memtable::{BoxedBatchIterator, IterBuilder, MemScanMetrics};
-use crate::read::dedup::LastNonNullIter;
-use crate::region::options::MergeMode;
+use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable;
 
 impl SimpleBulkMemtable {
     pub fn region_metadata(&self) -> RegionMetadataRef {
         self.region_metadata.clone()
     }
-
-    pub(crate) fn create_iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        sequence: Option<SequenceRange>,
-    ) -> error::Result<BatchIterBuilderDeprecated> {
-        let mut series = self.series.write().unwrap();
-
-        let values = if series.is_empty() {
-            None
-        } else {
-            Some(series.compact(&self.region_metadata)?.clone())
-        };
-        let projection = self.build_projection(projection);
-        Ok(BatchIterBuilderDeprecated {
-            region_metadata: self.region_metadata.clone(),
-            values,
-            projection,
-            dedup: self.dedup,
-            sequence,
-            merge_mode: self.merge_mode,
-        })
-    }
-}
-
-#[derive(Clone)]
-pub(crate) struct BatchIterBuilderDeprecated {
-    region_metadata: RegionMetadataRef,
-    values: Option<Values>,
-    projection: HashSet<ColumnId>,
-    sequence: Option<SequenceRange>,
-    dedup: bool,
-    merge_mode: MergeMode,
-}
-
-impl IterBuilder for BatchIterBuilderDeprecated {
-    fn build(&self, metrics: Option<MemScanMetrics>) -> error::Result<BoxedBatchIterator> {
-        let start_time = Instant::now();
-        let Some(values) = self.values.clone() else {
-            return Ok(Box::new(Iter { batch: None }));
-        };
-
-        let maybe_batch = values
-            .to_batch(
-                &[],
-                &self.region_metadata,
-                &self.projection,
-                self.sequence,
-                self.dedup,
-                self.merge_mode,
-            )
-            .map(Some)
-            .transpose();
-
-        // Collect metrics from the batch
-        if let Some(metrics) = metrics {
-            let (num_rows, num_batches) = match &maybe_batch {
-                Some(Ok(batch)) => (batch.num_rows(), 1),
-                _ => (0, 0),
-            };
-            let inner = crate::memtable::MemScanMetricsData {
-                total_series: 1,
-                num_rows,
-                num_batches,
-                scan_cost: start_time.elapsed(),
-            };
-            metrics.merge_inner(&inner);
-        }
-
-        let iter = Iter { batch: maybe_batch };
-
-        if self.merge_mode == MergeMode::LastNonNull {
-            Ok(Box::new(LastNonNullIter::new(iter)))
-        } else {
-            Ok(Box::new(iter))
-        }
-    }
 }
diff --git a/src/mito2/src/memtable/time_partition.rs b/src/mito2/src/memtable/time_partition.rs
index 6f11c813cb..ee695aceb8 100644
--- a/src/mito2/src/memtable/time_partition.rs
+++ b/src/mito2/src/memtable/time_partition.rs
@@ -827,6 +827,7 @@ mod tests {
     use super::*;
     use crate::memtable::partition_tree::PartitionTreeMemtableBuilder;
     use crate::memtable::time_series::TimeSeriesMemtableBuilder;
+    use crate::memtable::{IterBuilder, RangesOptions};
     use crate::test_util::memtable_util::{self, collect_iter_timestamps};
 
     #[test]
@@ -852,7 +853,11 @@ mod tests {
         partitions.list_memtables(&mut memtables);
         assert_eq!(0, memtables[0].id());
 
-        let iter = memtables[0].iter(None, None, None).unwrap();
+        let iter = memtables[0]
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 3000, 5000, 6000, 7000], &timestamps[..]);
     }
@@ -890,7 +895,11 @@ mod tests {
 
         let mut memtables = Vec::new();
         partitions.list_memtables(&mut memtables);
-        let iter = memtables[0].iter(None, None, None).unwrap();
+        let iter = memtables[0]
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[0, 2000, 3000, 4000, 5000, 7000], &timestamps[..]);
         let parts = partitions.list_partitions();
@@ -943,7 +952,12 @@ mod tests {
         let partitions = new_multi_partitions(&metadata);
 
         let parts = partitions.list_partitions();
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(0, parts[0].memtable.id());
         assert_eq!(
@@ -955,7 +969,12 @@ mod tests {
             parts[0].time_range.max_timestamp
         );
         assert_eq!(&[0, 2000, 3000, 4000], &timestamps[..]);
-        let iter = parts[1].memtable.iter(None, None, None).unwrap();
+        let iter = parts[1]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         assert_eq!(1, parts[1].memtable.id());
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[5000, 7000], &timestamps[..]);
@@ -1273,7 +1292,12 @@ mod tests {
 
         let parts = partitions.list_partitions();
         assert_eq!(1, parts.len());
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 2000, 3000], &timestamps[..]);
 
@@ -1284,11 +1308,21 @@ mod tests {
         let parts = partitions.list_partitions();
         assert_eq!(2, parts.len());
         // Check first partition [0, 5000)
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 2000, 3000, 4000], &timestamps[..]);
         // Check second partition [5000, 10000)
-        let iter = parts[1].memtable.iter(None, None, None).unwrap();
+        let iter = parts[1]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[5000, 6000], &timestamps[..]);
 
@@ -1301,7 +1335,12 @@ mod tests {
         assert_eq!(3, parts.len());
 
         // Check new partition [10000, 15000)
-        let iter = parts[2].memtable.iter(None, None, None).unwrap();
+        let iter = parts[2]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[11000, 12000], &timestamps[..]);
 
@@ -1314,7 +1353,12 @@ mod tests {
 
         let parts = partitions.list_partitions();
         assert_eq!(1, parts.len());
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 5000, 9000], &timestamps[..]);
     }
diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs
index 271a9343eb..97f5f3c9ce 100644
--- a/src/mito2/src/memtable/time_series.rs
+++ b/src/mito2/src/memtable/time_series.rs
@@ -267,39 +267,6 @@ impl Memtable for TimeSeriesMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        filters: Option<Predicate>,
-        sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator> {
-        let projection = if let Some(projection) = projection {
-            projection.iter().copied().collect()
-        } else {
-            self.region_metadata
-                .field_columns()
-                .map(|c| c.column_id)
-                .collect()
-        };
-
-        let iter = self.series_set.iter_series(
-            projection,
-            filters,
-            self.dedup,
-            self.merge_mode,
-            sequence,
-            None,
-        )?;
-
-        if self.merge_mode == MergeMode::LastNonNull {
-            let iter = LastNonNullIter::new(iter);
-            Ok(Box::new(iter))
-        } else {
-            Ok(Box::new(iter))
-        }
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -1798,7 +1765,9 @@ mod tests {
             *expected_ts.entry(ts).or_default() += if dedup { 1 } else { 2 };
         }
 
-        let iter = memtable.iter(None, None, None).unwrap();
+        let ranges = memtable.ranges(None, RangesOptions::default()).unwrap();
+        let range = ranges.ranges.into_values().next().unwrap();
+        let iter = range.build_iter().unwrap();
         let mut read = HashMap::new();
 
         for ts in iter
@@ -1838,7 +1807,11 @@ mod tests {
         let memtable = TimeSeriesMemtable::new(schema, 42, None, true, MergeMode::LastRow);
         memtable.write(&kvs).unwrap();
 
-        let iter = memtable.iter(Some(&[3]), None, None).unwrap();
+        let iter = memtable
+            .ranges(Some(&[3]), RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
 
         let mut v0_all = vec![];
 
@@ -1917,7 +1890,11 @@ mod tests {
                 barrier.wait();
 
                 for _ in 0..10 {
-                    let iter = memtable.iter(None, None, None).unwrap();
+                    let iter = memtable
+                        .ranges(None, RangesOptions::default())
+                        .unwrap()
+                        .build(None)
+                        .unwrap();
                     for batch_result in iter {
                         let _ = batch_result.unwrap();
                     }
@@ -1936,7 +1913,11 @@ mod tests {
             handle.join().unwrap();
         }
 
-        let iter = memtable.iter(None, None, None).unwrap();
+        let iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let mut series_count = 0;
         let mut row_count = 0;
 
diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs
index 7ddac4ee0d..58ea49fa41 100644
--- a/src/mito2/src/test_util/memtable_util.rs
+++ b/src/mito2/src/test_util/memtable_util.rs
@@ -83,16 +83,6 @@ impl Memtable for EmptyMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        _projection: Option<&[ColumnId]>,
-        _filters: Option<Predicate>,
-        _sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator> {
-        Ok(Box::new(std::iter::empty()))
-    }
-
     fn ranges(
         &self,
         _projection: Option<&[ColumnId]>,

From dd82fcac00856a6dc3317fa4920b490bba959b84 Mon Sep 17 00:00:00 2001
From: Ning Sun <sunng@protonmail.com>
Date: Mon, 16 Mar 2026 17:56:34 +0800
Subject: [PATCH 15/42] chore: update visibility of
 BatchToRecordBatchAdapter::new (#7817)

---
 src/mito2/src/read/batch_adapter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mito2/src/read/batch_adapter.rs b/src/mito2/src/read/batch_adapter.rs
index 461dbeba69..4698229c5b 100644
--- a/src/mito2/src/read/batch_adapter.rs
+++ b/src/mito2/src/read/batch_adapter.rs
@@ -59,7 +59,7 @@ impl BatchToRecordBatchAdapter {
     /// - `metadata`: region metadata describing the schema.
     /// - `codec`: codec for decoding the encoded primary key bytes.
     /// - `read_column_ids`: projected column ids to read.
-    pub(crate) fn new(
+    pub fn new(
         iter: BoxedBatchIterator,
         metadata: RegionMetadataRef,
         codec: Arc<dyn PrimaryKeyCodec>,

From 5a37e58b4f4c4475e251d15e57436bb78acfe167 Mon Sep 17 00:00:00 2001
From: Yingwen <realevenyag@gmail.com>
Date: Tue, 17 Mar 2026 11:53:20 +0800
Subject: [PATCH 16/42] feat(mito2): add partition range cache infrastructure
 (#7798)

* feat: add partition range cache infra

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: optimize scan request fingerprint cloning

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: merge loops

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: more docs

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: update estimated size method and comment

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix clippy

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: only cache when we scan files

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: address PR review comments for partition range cache

- Remove TimeSeriesDistribution from fingerprint as it only affects yield order
- Disable range cache when dyn filters are present since they change at runtime

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
---
 src/mito2/src/cache.rs            | 134 ++++++++++++++++
 src/mito2/src/read.rs             |   1 +
 src/mito2/src/read/range_cache.rs | 252 ++++++++++++++++++++++++++++++
 src/mito2/src/read/scan_region.rs | 243 +++++++++++++++++++++++++++-
 src/mito2/src/region/options.rs   |   2 +-
 5 files changed, 629 insertions(+), 3 deletions(-)
 create mode 100644 src/mito2/src/read/range_cache.rs

diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs
index 3ad71d2a61..e232489768 100644
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -49,6 +49,7 @@ use crate::cache::write_cache::WriteCacheRef;
 use crate::memtable::record_batch_estimated_size;
 use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
 use crate::read::Batch;
+use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue};
 use crate::sst::file::{RegionFileId, RegionIndexId};
 use crate::sst::parquet::reader::MetadataCacheMetrics;
 
@@ -64,6 +65,8 @@ const FILE_TYPE: &str = "file";
 const INDEX_TYPE: &str = "index";
 /// Metrics type key for selector result cache.
 const SELECTOR_RESULT_TYPE: &str = "selector_result";
+/// Metrics type key for range scan result cache.
+const RANGE_RESULT_TYPE: &str = "range_result";
 
 /// Cache strategies that may only enable a subset of caches.
 #[derive(Clone)]
@@ -223,6 +226,32 @@ impl CacheStrategy {
         }
     }
 
+    /// Calls [CacheManager::get_range_result()].
+    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn get_range_result(
+        &self,
+        key: &RangeScanCacheKey,
+    ) -> Option<Arc<RangeScanCacheValue>> {
+        match self {
+            CacheStrategy::EnableAll(cache_manager) => cache_manager.get_range_result(key),
+            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
+        }
+    }
+
+    /// Calls [CacheManager::put_range_result()].
+    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn put_range_result(
+        &self,
+        key: RangeScanCacheKey,
+        result: Arc<RangeScanCacheValue>,
+    ) {
+        if let CacheStrategy::EnableAll(cache_manager) = self {
+            cache_manager.put_range_result(key, result);
+        }
+    }
+
     /// Calls [CacheManager::write_cache()].
     /// It returns None if the strategy is [CacheStrategy::Disabled].
     pub fn write_cache(&self) -> Option<&WriteCacheRef> {
@@ -324,6 +353,9 @@ pub struct CacheManager {
     puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
     /// Cache for time series selectors.
     selector_result_cache: Option<SelectorResultCache>,
+    /// Cache for range scan outputs in flat format.
+    #[cfg_attr(not(test), allow(dead_code))]
+    range_result_cache: Option<RangeResultCache>,
     /// Cache for index result.
     index_result_cache: Option<IndexResultCache>,
 }
@@ -512,6 +544,32 @@ impl CacheManager {
         }
     }
 
+    /// Gets cached result for range scan.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn get_range_result(
+        &self,
+        key: &RangeScanCacheKey,
+    ) -> Option<Arc<RangeScanCacheValue>> {
+        self.range_result_cache
+            .as_ref()
+            .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE))
+    }
+
+    /// Puts range scan result into the cache.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn put_range_result(
+        &self,
+        key: RangeScanCacheKey,
+        result: Arc<RangeScanCacheValue>,
+    ) {
+        if let Some(cache) = &self.range_result_cache {
+            CACHE_BYTES
+                .with_label_values(&[RANGE_RESULT_TYPE])
+                .add(range_result_cache_weight(&key, &result).into());
+            cache.insert(key, result);
+        }
+    }
+
     /// Gets the write cache.
     pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
         self.write_cache.as_ref()
@@ -562,6 +620,7 @@ pub struct CacheManagerBuilder {
     puffin_metadata_size: u64,
     write_cache: Option<WriteCacheRef>,
     selector_result_cache_size: u64,
+    range_result_cache_size: u64,
 }
 
 impl CacheManagerBuilder {
@@ -625,6 +684,12 @@ impl CacheManagerBuilder {
         self
     }
 
+    /// Sets range result cache size.
+    pub fn range_result_cache_size(mut self, bytes: u64) -> Self {
+        self.range_result_cache_size = bytes;
+        self
+    }
+
     /// Builds the [CacheManager].
     pub fn build(self) -> CacheManager {
         fn to_str(cause: RemovalCause) -> &'static str {
@@ -712,6 +777,21 @@ impl CacheManagerBuilder {
                 })
                 .build()
         });
+        let range_result_cache = (self.range_result_cache_size != 0).then(|| {
+            Cache::builder()
+                .max_capacity(self.range_result_cache_size)
+                .weigher(range_result_cache_weight)
+                .eviction_listener(|k, v, cause| {
+                    let size = range_result_cache_weight(&k, &v);
+                    CACHE_BYTES
+                        .with_label_values(&[RANGE_RESULT_TYPE])
+                        .sub(size.into());
+                    CACHE_EVICTION
+                        .with_label_values(&[RANGE_RESULT_TYPE, to_str(cause)])
+                        .inc();
+                })
+                .build()
+        });
         CacheManager {
             sst_meta_cache,
             vector_cache,
@@ -723,6 +803,7 @@ impl CacheManagerBuilder {
             vector_index_cache,
             puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)),
             selector_result_cache,
+            range_result_cache,
             index_result_cache,
         }
     }
@@ -746,6 +827,10 @@ fn selector_result_cache_weight(k: &SelectorResultKey, v: &Arc<SelectorResultVal
     (mem::size_of_val(k) + v.estimated_size()) as u32
 }
 
+fn range_result_cache_weight(k: &RangeScanCacheKey, v: &Arc<RangeScanCacheValue>) -> u32 {
+    u32::try_from(k.estimated_size() + v.estimated_size()).unwrap_or(u32::MAX)
+}
+
 /// Updates cache hit/miss metrics.
 fn update_hit_miss<T>(value: Option<T>, cache_type: &str) -> Option<T> {
     if value.is_some() {
@@ -902,6 +987,8 @@ type VectorCache = Cache<(ConcreteDataType, Value), VectorRef>;
 type PageCache = Cache<PageKey, Arc<PageValue>>;
 /// Maps (file id, row group id, time series row selector) to [SelectorResultValue].
 type SelectorResultCache = Cache<SelectorResultKey, Arc<SelectorResultValue>>;
+/// Maps partition-range scan key to cached flat batches.
+type RangeResultCache = Cache<RangeScanCacheKey, Arc<RangeScanCacheValue>>;
 
 #[cfg(test)]
 mod tests {
@@ -916,6 +1003,9 @@ mod tests {
     use crate::cache::index::bloom_filter_index::Tag;
     use crate::cache::index::result_cache::PredicateKey;
     use crate::cache::test_util::parquet_meta;
+    use crate::read::range_cache::{
+        RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder,
+    };
     use crate::sst::parquet::row_selection::RowGroupSelection;
 
     #[tokio::test]
@@ -1028,6 +1118,50 @@ mod tests {
         assert!(cache.get_selector_result(&key).is_some());
     }
 
+    #[test]
+    fn test_range_result_cache() {
+        let cache = Arc::new(
+            CacheManager::builder()
+                .range_result_cache_size(1024 * 1024)
+                .build(),
+        );
+
+        let key = RangeScanCacheKey {
+            region_id: RegionId::new(1, 1),
+            row_groups: vec![(FileId::random(), 0)],
+            scan: ScanRequestFingerprintBuilder {
+                read_column_ids: vec![],
+                read_column_types: vec![],
+                filters: vec!["tag_0 = 1".to_string()],
+                time_filters: vec![],
+                series_row_selector: None,
+                append_mode: false,
+                filter_deleted: true,
+                merge_mode: crate::region::options::MergeMode::LastRow,
+                partition_expr_version: 0,
+            }
+            .build(),
+        };
+        let value = Arc::new(RangeScanCacheValue::new(Vec::new()));
+
+        assert!(cache.get_range_result(&key).is_none());
+
+        // Puts through strategies other than `EnableAll` must not populate the cache.
+        let compaction = CacheStrategy::Compaction(cache.clone());
+        compaction.put_range_result(key.clone(), value.clone());
+        let disabled = CacheStrategy::Disabled;
+        disabled.put_range_result(key.clone(), value.clone());
+        assert!(cache.get_range_result(&key).is_none());
+
+        let enable_all = CacheStrategy::EnableAll(cache.clone());
+        enable_all.put_range_result(key.clone(), value);
+        assert!(cache.get_range_result(&key).is_some());
+        assert!(enable_all.get_range_result(&key).is_some());
+        // Only `EnableAll` exposes the cached entry on reads.
+        assert!(compaction.get_range_result(&key).is_none());
+        assert!(disabled.get_range_result(&key).is_none());
+    }
+
     #[tokio::test]
     async fn test_evict_puffin_cache_clears_all_entries() {
         use std::collections::{BTreeMap, HashMap};
diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs
index 5fbd63ce8b..240a99c247 100644
--- a/src/mito2/src/read.rs
+++ b/src/mito2/src/read.rs
@@ -27,6 +27,7 @@ pub mod projection;
 pub(crate) mod prune;
 pub(crate) mod pruner;
 pub mod range;
+pub(crate) mod range_cache;
 pub mod scan_region;
 pub mod scan_util;
 pub(crate) mod seq_scan;
diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs
new file mode 100644
index 0000000000..5b90e68bae
--- /dev/null
+++ b/src/mito2/src/read/range_cache.rs
@@ -0,0 +1,252 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Utilities for the partition range scan result cache.
+
+use std::mem;
+use std::sync::Arc;
+
+use datatypes::arrow::record_batch::RecordBatch;
+use datatypes::prelude::ConcreteDataType;
+use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector};
+
+use crate::memtable::record_batch_estimated_size;
+use crate::region::options::MergeMode;
+
+/// Fingerprint of the scan request fields that affect partition range cache reuse.
+///
+/// It records a normalized view of the projected columns and filters, plus
+/// scan options that can change the returned rows. Schema-dependent metadata
+/// and the partition expression version are included so cached results are not
+/// reused across incompatible schema or partitioning changes.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct ScanRequestFingerprint {
+    /// Projection and filters without the time index and partition exprs.
+    inner: Arc<SharedScanRequestFingerprint>,
+    /// Filters with the time index column.
+    time_filters: Option<Arc<Vec<String>>>,
+    series_row_selector: Option<TimeSeriesRowSelector>,
+    append_mode: bool,
+    filter_deleted: bool,
+    merge_mode: MergeMode,
+    /// We keep the partition expr version to ensure we won't reuse the fingerprint after we change the partition expr.
+    /// We store the version instead of the whole partition expr or partition expr filters.
+    partition_expr_version: u64,
+}
+
+#[derive(Debug)]
+pub(crate) struct ScanRequestFingerprintBuilder {
+    pub(crate) read_column_ids: Vec<ColumnId>,
+    pub(crate) read_column_types: Vec<Option<ConcreteDataType>>,
+    pub(crate) filters: Vec<String>,
+    pub(crate) time_filters: Vec<String>,
+    pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
+    pub(crate) append_mode: bool,
+    pub(crate) filter_deleted: bool,
+    pub(crate) merge_mode: MergeMode,
+    pub(crate) partition_expr_version: u64,
+}
+
+impl ScanRequestFingerprintBuilder {
+    pub(crate) fn build(self) -> ScanRequestFingerprint {
+        let Self {
+            read_column_ids,
+            read_column_types,
+            filters,
+            time_filters,
+            series_row_selector,
+            append_mode,
+            filter_deleted,
+            merge_mode,
+            partition_expr_version,
+        } = self;
+
+        ScanRequestFingerprint {
+            inner: Arc::new(SharedScanRequestFingerprint {
+                read_column_ids,
+                read_column_types,
+                filters,
+            }),
+            time_filters: (!time_filters.is_empty()).then(|| Arc::new(time_filters)),
+            series_row_selector,
+            append_mode,
+            filter_deleted,
+            merge_mode,
+            partition_expr_version,
+        }
+    }
+}
+
+/// Non-`Clone` part of the fingerprint, shared behind the [Arc] in [ScanRequestFingerprint].
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct SharedScanRequestFingerprint {
+    /// Column ids of the projection.
+    read_column_ids: Vec<ColumnId>,
+    /// Column types of the projection.
+    /// We keep this to ensure we won't reuse the fingerprint after a schema change.
+    read_column_types: Vec<Option<ConcreteDataType>>,
+    /// Filters without the time index column and region partition exprs.
+    filters: Vec<String>,
+}
+
+impl ScanRequestFingerprint {
+    #[cfg(test)]
+    pub(crate) fn read_column_ids(&self) -> &[ColumnId] {
+        &self.inner.read_column_ids
+    }
+
+    #[cfg(test)]
+    pub(crate) fn read_column_types(&self) -> &[Option<ConcreteDataType>] {
+        &self.inner.read_column_types
+    }
+
+    #[cfg(test)]
+    pub(crate) fn filters(&self) -> &[String] {
+        &self.inner.filters
+    }
+
+    #[cfg(test)]
+    pub(crate) fn time_filters(&self) -> &[String] {
+        self.time_filters
+            .as_deref()
+            .map(Vec::as_slice)
+            .unwrap_or(&[])
+    }
+
+    #[cfg(test)]
+    pub(crate) fn without_time_filters(&self) -> Self {
+        Self {
+            inner: Arc::clone(&self.inner),
+            time_filters: None,
+            series_row_selector: self.series_row_selector,
+            append_mode: self.append_mode,
+            filter_deleted: self.filter_deleted,
+            merge_mode: self.merge_mode,
+            partition_expr_version: self.partition_expr_version,
+        }
+    }
+
+    pub(crate) fn estimated_size(&self) -> usize {
+        mem::size_of::<SharedScanRequestFingerprint>()
+            + self.inner.read_column_ids.capacity() * mem::size_of::<ColumnId>()
+            + self.inner.read_column_types.capacity() * mem::size_of::<Option<ConcreteDataType>>()
+            + self.inner.filters.capacity() * mem::size_of::<String>()
+            + self
+                .inner
+                .filters
+                .iter()
+                .map(|filter| filter.capacity())
+                .sum::<usize>()
+            + self.time_filters.as_ref().map_or(0, |filters| {
+                mem::size_of::<Vec<String>>()
+                    + filters.capacity() * mem::size_of::<String>()
+                    + filters
+                        .iter()
+                        .map(|filter| filter.capacity())
+                        .sum::<usize>()
+            })
+    }
+}
+
+/// Cache key for range scan outputs.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct RangeScanCacheKey {
+    pub(crate) region_id: RegionId,
+    /// Sorted (file_id, row_group_index) pairs that uniquely identify the covered data.
+    pub(crate) row_groups: Vec<(FileId, i64)>,
+    pub(crate) scan: ScanRequestFingerprint,
+}
+
+impl RangeScanCacheKey {
+    pub(crate) fn estimated_size(&self) -> usize {
+        mem::size_of::<Self>()
+            + self.row_groups.capacity() * mem::size_of::<(FileId, i64)>()
+            + self.scan.estimated_size()
+    }
+}
+
+/// Cached result for one range scan.
+pub(crate) struct RangeScanCacheValue {
+    pub(crate) batches: Vec<RecordBatch>,
+}
+
+impl RangeScanCacheValue {
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn new(batches: Vec<RecordBatch>) -> Self {
+        Self { batches }
+    }
+
+    pub(crate) fn estimated_size(&self) -> usize {
+        mem::size_of::<Self>()
+            + self.batches.capacity() * mem::size_of::<RecordBatch>()
+            + self
+                .batches
+                .iter()
+                .map(record_batch_estimated_size)
+                .sum::<usize>()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use store_api::storage::TimeSeriesRowSelector;
+
+    use super::*;
+
+    #[test]
+    fn normalizes_and_clears_time_filters() {
+        let normalized = ScanRequestFingerprintBuilder {
+            read_column_ids: vec![1, 2],
+            read_column_types: vec![None, None],
+            filters: vec!["k0 = 'foo'".to_string()],
+            time_filters: vec![],
+            series_row_selector: None,
+            append_mode: false,
+            filter_deleted: true,
+            merge_mode: MergeMode::LastRow,
+            partition_expr_version: 0,
+        }
+        .build();
+
+        assert!(normalized.time_filters().is_empty());
+
+        let fingerprint = ScanRequestFingerprintBuilder {
+            read_column_ids: vec![1, 2],
+            read_column_types: vec![None, None],
+            filters: vec!["k0 = 'foo'".to_string()],
+            time_filters: vec!["ts >= 1000".to_string()],
+            series_row_selector: Some(TimeSeriesRowSelector::LastRow),
+            append_mode: false,
+            filter_deleted: true,
+            merge_mode: MergeMode::LastRow,
+            partition_expr_version: 7,
+        }
+        .build();
+
+        let reset = fingerprint.without_time_filters();
+
+        assert_eq!(reset.read_column_ids(), fingerprint.read_column_ids());
+        assert_eq!(reset.read_column_types(), fingerprint.read_column_types());
+        assert_eq!(reset.filters(), fingerprint.filters());
+        assert!(reset.time_filters().is_empty());
+        assert_eq!(reset.series_row_selector, fingerprint.series_row_selector);
+        assert_eq!(reset.append_mode, fingerprint.append_mode);
+        assert_eq!(reset.filter_deleted, fingerprint.filter_deleted);
+        assert_eq!(reset.merge_mode, fingerprint.merge_mode);
+        assert_eq!(
+            reset.partition_expr_version,
+            fingerprint.partition_expr_version
+        );
+    }
+}
diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs
index 5d934afd2d..5cb2d75e25 100644
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -55,6 +55,7 @@ use crate::metrics::READ_SST_COUNT;
 use crate::read::compat::{self, CompatBatch, FlatCompatBatch, PrimaryKeyCompatBatch};
 use crate::read::projection::ProjectionMapper;
 use crate::read::range::{FileRangeBuilder, MemRangeBuilder, RangeMeta, RowGroupIndex};
+use crate::read::range_cache::ScanRequestFingerprint;
 use crate::read::seq_scan::SeqScan;
 use crate::read::series_scan::SeriesScan;
 use crate::read::stream::ScanBatchStream;
@@ -815,7 +816,7 @@ pub struct ScanInput {
     /// But this read columns might also include non-projected columns needed for filtering.
     pub(crate) read_column_ids: Vec<ColumnId>,
     /// Time range filter for time index.
-    time_range: Option<TimestampRange>,
+    pub(crate) time_range: Option<TimestampRange>,
     /// Predicate to push down.
     pub(crate) predicate: PredicateGroup,
     /// Region partition expr applied at read time.
@@ -1417,6 +1418,92 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
     }
 }
 
+/// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible
+/// for partition range caching.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
+    let eligible = input.flat_format
+        && !input.compaction
+        && !input.files.is_empty()
+        && matches!(input.cache_strategy, CacheStrategy::EnableAll(_));
+
+    if !eligible {
+        return None;
+    }
+
+    let metadata = input.region_metadata();
+    let tag_names: HashSet<&str> = metadata
+        .column_metadatas
+        .iter()
+        .filter(|col| col.semantic_type == SemanticType::Tag)
+        .map(|col| col.column_schema.name.as_str())
+        .collect();
+
+    let time_index_name = metadata.time_index_column().column_schema.name.clone();
+
+    let exprs = input
+        .predicate_group()
+        .predicate_without_region()
+        .map(|predicate| predicate.exprs())
+        .unwrap_or_default();
+
+    let mut filters = Vec::new();
+    let mut time_filters = Vec::new();
+    let mut has_tag_filter = false;
+    let mut columns = HashSet::new();
+
+    for expr in exprs {
+        columns.clear();
+        let is_time_only = match expr_to_columns(expr, &mut columns) {
+            Ok(()) if !columns.is_empty() => {
+                has_tag_filter |= columns
+                    .iter()
+                    .any(|col| tag_names.contains(col.name.as_str()));
+                columns.iter().all(|col| col.name == time_index_name)
+            }
+            _ => false,
+        };
+
+        if is_time_only {
+            time_filters.push(expr.to_string());
+        } else {
+            filters.push(expr.to_string());
+        }
+    }
+
+    if !has_tag_filter {
+        // We only cache requests that have tag filters to avoid caching all series.
+        return None;
+    }
+
+    // Ensure the filters are sorted for consistent fingerprinting.
+    filters.sort_unstable();
+    time_filters.sort_unstable();
+
+    Some(
+        crate::read::range_cache::ScanRequestFingerprintBuilder {
+            read_column_ids: input.read_column_ids.clone(),
+            read_column_types: input
+                .read_column_ids
+                .iter()
+                .map(|id| {
+                    metadata
+                        .column_by_id(*id)
+                        .map(|col| col.column_schema.data_type.clone())
+                })
+                .collect(),
+            filters,
+            time_filters,
+            series_row_selector: input.series_row_selector,
+            append_mode: input.append_mode,
+            filter_deleted: input.filter_deleted,
+            merge_mode: input.merge_mode,
+            partition_expr_version: metadata.partition_expr_version,
+        }
+        .build(),
+    )
+}
+
 /// Context shared by different streams from a scanner.
 /// It contains the input and ranges to scan.
 pub struct StreamContext {
@@ -1763,10 +1850,15 @@ mod tests {
 
     use datafusion::physical_plan::expressions::lit as physical_lit;
     use datafusion_expr::{col, lit};
-    use store_api::storage::ScanRequest;
+    use datatypes::value::Value;
+    use partition::expr::col as partition_col;
+    use store_api::metadata::RegionMetadataBuilder;
+    use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
 
     use super::*;
+    use crate::cache::CacheManager;
     use crate::memtable::time_partition::TimePartitions;
+    use crate::read::range_cache::ScanRequestFingerprintBuilder;
     use crate::region::options::RegionOptions;
     use crate::region::version::VersionBuilder;
     use crate::sst::FormatType;
@@ -1804,6 +1896,26 @@ mod tests {
         )
     }
 
+    /// Builds a flat-format `ScanInput` over `metadata` with `filters` as its predicate.
+    /// The input uses the projection `[0, 2, 3]`, a `CacheManager` with a range result
+    /// cache size of 1024, and a single file handle created from a default `FileMeta`.
+    async fn new_scan_input(metadata: RegionMetadataRef, filters: Vec<Expr>) -> ScanInput {
+        let env = SchedulerEnv::new().await;
+        let cache_manager = CacheManager::builder()
+            .range_result_cache_size(1024)
+            .build();
+        let purger = Arc::new(crate::sst::file_purger::NoopFilePurger);
+        let file = FileHandle::new(crate::sst::file::FileMeta::default(), purger);
+        let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();
+        let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap();
+
+        ScanInput::new(env.access_layer.clone(), mapper)
+            .with_predicate(predicate)
+            .with_cache(CacheStrategy::EnableAll(Arc::new(cache_manager)))
+            .with_flat_format(true)
+            .with_files(vec![file])
+    }
+
     #[tokio::test]
     async fn test_build_read_column_ids_includes_filters() {
         let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
@@ -1923,6 +2035,133 @@ mod tests {
         assert!(scan_region.use_flat_format());
     }
 
+    /// Checks each component of the fingerprint built for a scan that gets one.
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_for_eligible_scan() {
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let ts_filter = col("ts").gt_eq(lit(1000));
+        let k0_filter = col("k0").eq(lit("foo"));
+        let v0_filter = col("v0").gt(lit(1));
+        let filters = vec![ts_filter.clone(), k0_filter.clone(), v0_filter.clone()];
+
+        // Override the selector, merge mode and delete filtering so that the expected
+        // fingerprint below has to carry the values set here.
+        let input = new_scan_input(metadata.clone(), filters)
+            .await
+            .with_distribution(Some(TimeSeriesDistribution::PerSeries))
+            .with_series_row_selector(Some(TimeSeriesRowSelector::LastRow))
+            .with_merge_mode(MergeMode::LastNonNull)
+            .with_filter_deleted(false);
+
+        let fingerprint = build_scan_fingerprint(&input).unwrap();
+
+        // Data types of columns 0, 2 and 3, which is the projection that
+        // `new_scan_input` passes to its `ProjectionMapper`.
+        let read_column_types = [0, 2, 3]
+            .into_iter()
+            .map(|id| {
+                metadata
+                    .column_by_id(id)
+                    .map(|column| column.column_schema.data_type.clone())
+            })
+            .collect();
+        let expected = ScanRequestFingerprintBuilder {
+            read_column_ids: input.read_column_ids.clone(),
+            read_column_types,
+            // The `ts` filter is expected under `time_filters`; the `k0` and `v0`
+            // filters are expected under `filters`.
+            filters: vec![k0_filter.to_string(), v0_filter.to_string()],
+            time_filters: vec![ts_filter.to_string()],
+            series_row_selector: Some(TimeSeriesRowSelector::LastRow),
+            append_mode: false,
+            filter_deleted: false,
+            merge_mode: MergeMode::LastNonNull,
+            partition_expr_version: 0,
+        }
+        .build();
+        assert_eq!(expected, fingerprint);
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_requires_tag_filter() {
+        // Filters on `ts` and `v0` only; there is no filter on `k0`.
+        let filters = vec![col("ts").gt_eq(lit(1000)), col("v0").gt(lit(1))];
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let input = new_scan_input(metadata, filters).await;
+
+        // No fingerprint is expected for this input.
+        let fingerprint = build_scan_fingerprint(&input);
+        assert!(fingerprint.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_respects_scan_eligibility() {
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let filters = vec![col("k0").eq(lit("foo"))];
+
+        let disabled = ScanInput::new(
+            SchedulerEnv::new().await.access_layer.clone(),
+            ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(),
+        )
+        .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap())
+        .with_flat_format(true);
+        assert!(build_scan_fingerprint(&disabled).is_none());
+
+        let non_flat = new_scan_input(metadata.clone(), filters.clone())
+            .await
+            .with_flat_format(false);
+        assert!(build_scan_fingerprint(&non_flat).is_none());
+
+        let compaction = new_scan_input(metadata.clone(), filters.clone())
+            .await
+            .with_compaction(true);
+        assert!(build_scan_fingerprint(&compaction).is_none());
+
+        // Replacing the file list with an empty one must also yield no fingerprint.
+        let no_files = new_scan_input(metadata, filters).await.with_files(vec![]);
+        assert!(build_scan_fingerprint(&no_files).is_none());
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_tracks_schema_and_partition_expr_changes() {
+        let partition_expr = partition_col("k0")
+            .gt_eq(Value::String("foo".into()))
+            .as_json_str()
+            .unwrap();
+        let base = metadata_with_primary_key(vec![0, 1], false);
+        let mut builder = RegionMetadataBuilder::from_existing(base);
+        builder.partition_expr_json(Some(partition_expr));
+        let metadata = Arc::new(builder.build_without_validation().unwrap());
+
+        let k0_filter = col("k0").eq(lit("foo"));
+        let input = new_scan_input(metadata.clone(), vec![k0_filter.clone()]).await;
+        let fingerprint = build_scan_fingerprint(&input).unwrap();
+
+        let read_column_types = [0, 2, 3]
+            .into_iter()
+            .map(|id| {
+                metadata
+                    .column_by_id(id)
+                    .map(|column| column.column_schema.data_type.clone())
+            })
+            .collect();
+        let expected = ScanRequestFingerprintBuilder {
+            read_column_ids: input.read_column_ids.clone(),
+            read_column_types,
+            filters: vec![k0_filter.to_string()],
+            time_filters: vec![],
+            series_row_selector: None,
+            append_mode: false,
+            filter_deleted: true,
+            merge_mode: MergeMode::LastRow,
+            partition_expr_version: metadata.partition_expr_version,
+        }
+        .build();
+        assert_eq!(expected, fingerprint);
+        // The version must be non-zero, so the comparison above is not just `0 == 0`.
+        assert_ne!(0, metadata.partition_expr_version);
+    }
+
     #[test]
     fn test_update_dyn_filters_with_empty_base_predicates() {
         let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
diff --git a/src/mito2/src/region/options.rs b/src/mito2/src/region/options.rs
index 0fe0a8f12a..fcf68a9216 100644
--- a/src/mito2/src/region/options.rs
+++ b/src/mito2/src/region/options.rs
@@ -50,7 +50,7 @@ pub(crate) fn parse_wal_options(
 }
 
 /// Mode to handle duplicate rows while merging.
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, EnumString)]
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, EnumString)]
 #[serde(rename_all = "snake_case")]
 #[strum(serialize_all = "snake_case")]
 pub enum MergeMode {

From e0aadffb911cece5988bf981a126b2a744337490 Mon Sep 17 00:00:00 2001
From: Yingwen <realevenyag@gmail.com>
Date: Tue, 17 Mar 2026 15:55:48 +0800
Subject: [PATCH 17/42] feat: add flat last row reader to the final stream
 (#7818)

Signed-off-by: evenyag <realevenyag@gmail.com>
---
 src/mito2/src/engine/row_selector_test.rs | 25 +++++++++++----
 src/mito2/src/read/last_row.rs            | 38 ++++++++++++++++++++++-
 src/mito2/src/read/seq_scan.rs            |  9 +++++-
 3 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/src/mito2/src/engine/row_selector_test.rs b/src/mito2/src/engine/row_selector_test.rs
index 317ede5a97..d79152e57f 100644
--- a/src/mito2/src/engine/row_selector_test.rs
+++ b/src/mito2/src/engine/row_selector_test.rs
@@ -24,7 +24,7 @@ use crate::test_util::{
     CreateRequestBuilder, TestEnv, build_rows_for_key, flush_region, put_rows, rows_schema,
 };
 
-async fn test_last_row(append_mode: bool) {
+async fn test_last_row(append_mode: bool, flat_format: bool) {
     let mut env = TestEnv::new().await;
     let engine = env.create_engine(MitoConfig::default()).await;
     let region_id = RegionId::new(1, 1);
@@ -39,9 +39,12 @@ async fn test_last_row(append_mode: bool) {
             env.get_kv_backend(),
         )
         .await;
-    let request = CreateRequestBuilder::new()
-        .insert_option("append_mode", &append_mode.to_string())
-        .build();
+    let mut request_builder =
+        CreateRequestBuilder::new().insert_option("append_mode", &append_mode.to_string());
+    if flat_format {
+        request_builder = request_builder.insert_option("sst_format", "flat");
+    }
+    let request = request_builder.build();
     let column_schemas = rows_schema(&request);
     engine
         .handle_request(region_id, RegionRequest::Create(request))
@@ -106,10 +109,20 @@ async fn test_last_row(append_mode: bool) {
 
 #[tokio::test]
 async fn test_last_row_append_mode_disabled() {
-    test_last_row(false).await;
+    test_last_row(false, false).await;
 }
 
 #[tokio::test]
 async fn test_last_row_append_mode_enabled() {
-    test_last_row(true).await;
+    test_last_row(true, false).await;
+}
+
+#[tokio::test]
+async fn test_last_row_flat_format_append_mode_disabled() {
+    test_last_row(false, true).await;
+}
+
+#[tokio::test]
+async fn test_last_row_flat_format_append_mode_enabled() {
+    test_last_row(true, true).await;
 }
diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs
index c2336f218d..0c13c120a0 100644
--- a/src/mito2/src/read/last_row.rs
+++ b/src/mito2/src/read/last_row.rs
@@ -21,6 +21,7 @@ use datatypes::arrow::array::{Array, BinaryArray};
 use datatypes::arrow::compute::concat_batches;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::vectors::UInt32Vector;
+use futures::{Stream, TryStreamExt};
 use snafu::ResultExt;
 use store_api::storage::{FileId, TimeSeriesRowSelector};
 
@@ -30,7 +31,7 @@ use crate::cache::{
 };
 use crate::error::{ComputeArrowSnafu, Result};
 use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice;
-use crate::read::{Batch, BatchReader, BoxedBatchReader};
+use crate::read::{Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream};
 use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
 use crate::sst::parquet::flat_format::{primary_key_column_index, time_index_column_index};
 use crate::sst::parquet::format::{PrimaryKeyArray, primary_key_offsets};
@@ -610,6 +611,41 @@ impl FlatLastTimestampSelector {
     }
 }
 
+/// Adapts a flat [`RecordBatch`] stream so that only the last row of each time series is kept.
+/// The input must already be sorted and deduplicated, and must not contain delete operations.
+pub(crate) struct FlatLastRowReader {
+    stream: BoxedRecordBatchStream,
+    selector: FlatLastTimestampSelector,
+    pending: BatchBuffer,
+}
+
+impl FlatLastRowReader {
+    /// Creates a reader over `stream` with a default selector and an empty pending buffer.
+    pub(crate) fn new(stream: BoxedRecordBatchStream) -> Self {
+        Self {
+            stream,
+            selector: FlatLastTimestampSelector::default(),
+            pending: BatchBuffer::new(),
+        }
+    }
+
+    /// Consumes the reader; the stream yields `pending` whenever it fills and at end of input.
+    pub(crate) fn into_stream(mut self) -> impl Stream<Item = Result<RecordBatch>> {
+        async_stream::try_stream! {
+            while let Some(batch) = self.stream.try_next().await? {
+                self.selector.on_next(batch, &mut self.pending)?;
+                if self.pending.is_full() {
+                    yield self.pending.concat()?;
+                }
+            }
+            self.selector.finish(&mut self.pending)?;
+            if !self.pending.is_empty() {
+                yield self.pending.concat()?;
+            }
+        }
+    }
+}
+
 /// Gets the primary key bytes at `index` from the primary key dictionary column.
 fn primary_key_bytes_at(batch: &RecordBatch, pk_col_idx: usize, index: usize) -> &[u8] {
     let pk_dict = batch
diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs
index d2be17cc83..a1b3b8f350 100644
--- a/src/mito2/src/read/seq_scan.rs
+++ b/src/mito2/src/read/seq_scan.rs
@@ -39,7 +39,7 @@ use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, Un
 use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
 use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow};
 use crate::read::flat_merge::FlatMergeReader;
-use crate::read::last_row::LastRowReader;
+use crate::read::last_row::{FlatLastRowReader, LastRowReader};
 use crate::read::merge::MergeReaderBuilder;
 use crate::read::pruner::{PartitionPruner, Pruner};
 use crate::read::range::RangeMeta;
@@ -289,6 +289,13 @@ impl SeqScan {
             Box::pin(reader.into_stream()) as _
         };
 
+        let reader = match &stream_ctx.input.series_row_selector {
+            Some(TimeSeriesRowSelector::LastRow) => {
+                Box::pin(FlatLastRowReader::new(reader).into_stream()) as _
+            }
+            None => reader,
+        };
+
         Ok(reader)
     }
 

From dc98e0215bd19312f136dfecd5f3d64fc26023b7 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Tue, 17 Mar 2026 19:28:06 +0800
Subject: [PATCH 18/42] feat(metric-engine): support bulk inserts with put
 fallback (#7792)

* feat(metric-engine): support bulk inserts

Implement `RegionRequest::BulkInserts` to support efficient columnar data
ingestion in the metric engine.

Key changes:
- Implement `bulk_insert_region` to handle logical-to-physical region mapping
  and dispatch writes.
- Add `batch_modifier` for `RecordBatch` transformations, specifically for
  `__tsid` generation and sparse primary key encoding.
- Integrate `BulkInserts` into the `MetricEngine` request handling logic.
- Provide a row-based fallback mechanism if the underlying storage doesn't
  support bulk writes.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 ### Update `bulk_insert.rs` to Support Partition Expression Version

 - **Enhancements**:
   - Added support for `partition_expr_version` in `RegionBulkInsertsRequest` and `RegionPutRequest`.
   - Modified the handling of `partition_expr_version` to be dynamically set from the `request` object.

 Files affected:
 - `src/metric-engine/src/engine/bulk_insert.rs`

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* fix: cargo lock revert

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* add doc for conversions

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* chore: simplify test

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 ### Refactor `bulk_insert.rs` in `metric-engine`

 - **Refactor Functionality**:
   - Replaced `resolve_tag_columns` with `resolve_tag_columns_from_metadata` to streamline tag column resolution.
   - Moved logic for resolving tag columns directly into `resolve_tag_columns_from_metadata`, removing the need for an external function call.
 - **Enhancements**:
   - Improved error handling and context provision for missing physical regions and columns.
   - Optimized tag column sorting and index management within the batch processing logic.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 ### Refactor `record_batch_to_rows` Function in `bulk_insert.rs`

 - Simplified the `record_batch_to_rows` function by removing the `logical_metadata` parameter and directly validating column types within the function.
 - Enhanced error handling for timestamp, value, and tag columns by checking their data types and providing detailed error messages.
 - Replaced the use of `Helper::try_into_vector` with direct downcasting to `TimestampMillisecondArray`, `Float64Array`, and `StringArray` for improved type safety and clarity.
 - Updated the construction of `api::v1::Rows` to directly handle null values and construct `api::v1::Value` objects accordingly.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 ## Commit Message

 Refactor `bulk_insert.rs` to optimize state access

 - Moved the state read operation inside a new block to limit its scope and improve code clarity.
 - Adjusted logic for processing `tag_columns` and `non_tag_indices` to work within the new block structure.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 ### Refactor `compute_tsid_array` Function

 - **Refactored `compute_tsid_array` function**: Modified the function signature to accept `tag_arrays` as a parameter instead of building it internally. This change affects the following files:
   - `src/metric-engine/src/batch_modifier.rs`

 - **Updated test cases**: Adjusted test cases to accommodate the new `compute_tsid_array` function signature by passing `tag_arrays` explicitly.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* docs: add doc for bulk_insert_region

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 ### Commit Message

 Refactor `bulk_insert.rs` in `metric-engine`:

 - Removed error handling for unsupported status codes in `write_data` method.
 - Eliminated `record_batch_to_rows` function, simplifying the data insertion process.
 - Streamlined the `write_data` method by removing fallback logic for unsupported operations.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 - **Optimize Primary Key Construction**: Refactored `modify_batch_sparse` in `batch_modifier.rs` to use `BinaryBuilder` for more efficient primary key construction.
 - **Add Fallback for Unsupported Bulk Inserts**: Updated `bulk_insert.rs` to handle unsupported bulk inserts by converting record batches to rows and using `RegionPutRequest`.
 - **Implement Record Batch to Rows Conversion**: Added `record_batch_to_rows` function in `bulk_insert.rs` to convert `RecordBatch` to `api::v1::Rows` for fallback operations.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 Add test for handling null values in `record_batch_to_rows`

 - Added a new test `test_record_batch_to_rows_with_null_values` in `bulk_insert.rs` to verify the handling of null values in the `record_batch_to_rows` function.
 - The test checks the conversion of a `RecordBatch` with null values in various fields to ensure correct row creation and schema handling.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/metric-engine-bulk-insert:
 Add fallback path for unsupported status and improve error context handling

 - **`bulk_insert.rs`**:
   - Added a fallback path for `PartitionTreeMemtable` in case of unsupported status code.
   - Enhanced error handling by using `with_context` for better error messages when timestamp and value columns are not found in `RecordBatch`.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
---
 Cargo.lock                                  |   1 +
 src/metric-engine/Cargo.toml                |   1 +
 src/metric-engine/src/batch_modifier.rs     | 426 +++++++++++
 src/metric-engine/src/engine.rs             |   6 +-
 src/metric-engine/src/engine/bulk_insert.rs | 783 ++++++++++++++++++++
 src/metric-engine/src/engine/put.rs         |   2 +-
 src/metric-engine/src/lib.rs                |   1 +
 7 files changed, 1216 insertions(+), 4 deletions(-)
 create mode 100644 src/metric-engine/src/batch_modifier.rs
 create mode 100644 src/metric-engine/src/engine/bulk_insert.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1f65f1289c..605b037fc9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7886,6 +7886,7 @@ dependencies = [
  "common-base",
  "common-error",
  "common-function",
+ "common-grpc",
  "common-macro",
  "common-meta",
  "common-query",
diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml
index 567210b952..5b561997ab 100644
--- a/src/metric-engine/Cargo.toml
+++ b/src/metric-engine/Cargo.toml
@@ -17,6 +17,7 @@ bytes.workspace = true
 fxhash = "0.2"
 common-base.workspace = true
 common-error.workspace = true
+common-grpc.workspace = true
 common-macro.workspace = true
 common-query.workspace = true
 common-recordbatch.workspace = true
diff --git a/src/metric-engine/src/batch_modifier.rs b/src/metric-engine/src/batch_modifier.rs
new file mode 100644
index 0000000000..8a5774889b
--- /dev/null
+++ b/src/metric-engine/src/batch_modifier.rs
@@ -0,0 +1,426 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::hash::Hasher;
+use std::sync::Arc;
+
+use datatypes::arrow::array::{Array, BinaryBuilder, StringArray, UInt64Array};
+use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+use datatypes::arrow::record_batch::RecordBatch;
+use datatypes::value::ValueRef;
+use fxhash::FxHasher;
+use mito_codec::row_converter::SparsePrimaryKeyCodec;
+use snafu::ResultExt;
+use store_api::storage::ColumnId;
+use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId};
+
+use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu};
+
+/// Locates one tag column of a `RecordBatch` for TSID computation and sparse key encoding.
+#[allow(dead_code)]
+pub(crate) struct TagColumnInfo {
+    /// Column name; its bytes are hashed into the label-name part of the TSID.
+    pub name: String,
+    /// Index of the column in the `RecordBatch`; the column is read as a `StringArray`.
+    pub index: usize,
+    /// Column ID in the physical region; keys the tag value in the sparse primary key.
+    pub column_id: ColumnId,
+}
+
+/// Computes the `__tsid` value of every row in `batch`.
+///
+/// A TSID is an `FxHasher` digest over two parts: first the hash of the names of the
+/// tags that are non-null in the row, then the non-null tag values themselves. Every
+/// name and value is hashed as its bytes followed by a `0xff` byte.
+///
+/// `sorted_tag_columns` and `tag_arrays` are read in lockstep, so `tag_arrays[i]` must
+/// hold the values of `sorted_tag_columns[i]`. `batch` only supplies the row count.
+///
+/// NOTE(review): the parameter name suggests callers pass the tag columns in sorted
+/// order, which would make the result independent of the batch's column order; confirm
+/// against the callers.
+#[allow(dead_code)]
+pub(crate) fn compute_tsid_array(
+    batch: &RecordBatch,
+    sorted_tag_columns: &[TagColumnInfo],
+    tag_arrays: &[&StringArray],
+) -> UInt64Array {
+    // Hashes every item followed by a `0xff` terminator and returns the digest.
+    fn digest<'a>(mut hasher: FxHasher, items: impl Iterator<Item = &'a str>) -> u64 {
+        for item in items {
+            hasher.write(item.as_bytes());
+            hasher.write_u8(0xff);
+        }
+        hasher.finish()
+    }
+
+    // Rows without null tags all share the hash of the complete tag name list.
+    let all_names = sorted_tag_columns.iter().map(|tc| tc.name.as_str());
+    let label_name_hash = digest(FxHasher::default(), all_names);
+
+    let tsids = (0..batch.num_rows()).map(|row| {
+        let names_hash = if tag_arrays.iter().any(|arr| arr.is_null(row)) {
+            // A row with null tags hashes only the names of the tags it has.
+            let present = sorted_tag_columns
+                .iter()
+                .zip(tag_arrays)
+                .filter(|(_, arr)| !arr.is_null(row))
+                .map(|(tc, _)| tc.name.as_str());
+            digest(FxHasher::default(), present)
+        } else {
+            label_name_hash
+        };
+
+        // Null values are skipped, so a row without nulls hashes every value.
+        let mut hasher = FxHasher::default();
+        hasher.write_u64(names_hash);
+        let values = tag_arrays
+            .iter()
+            .filter(|arr| !arr.is_null(row))
+            .map(|arr| arr.value(row));
+        digest(hasher, values)
+    });
+
+    UInt64Array::from(tsids.collect::<Vec<_>>())
+}
+
+/// Collects the `StringArray` behind every entry of `sorted_tag_columns`, in the same order.
+///
+/// Panics if a column index is out of bounds or the column is not a `StringArray`.
+fn build_tag_arrays<'a>(
+    batch: &'a RecordBatch,
+    sorted_tag_columns: &[TagColumnInfo],
+) -> Vec<&'a StringArray> {
+    let mut tag_arrays = Vec::with_capacity(sorted_tag_columns.len());
+    for tc in sorted_tag_columns {
+        let column = batch.column(tc.index).as_any();
+        let utf8 = column.downcast_ref::<StringArray>();
+        tag_arrays.push(utf8.expect("tag column must be utf8"));
+    }
+    tag_arrays
+}
+
+/// Rewrites `batch` into the sparse primary key layout.
+///
+/// The result starts with a non-nullable binary `PRIMARY_KEY_COLUMN_NAME` column whose
+/// values encode `table_id`, the row's `__tsid` and the row's non-null tags. The input
+/// columns at `non_tag_column_indices` follow in the given order. Fails if a key cannot
+/// be encoded or the new `RecordBatch` cannot be built.
+#[allow(dead_code)]
+pub(crate) fn modify_batch_sparse(
+    batch: RecordBatch,
+    table_id: u32,
+    sorted_tag_columns: &[TagColumnInfo],
+    non_tag_column_indices: &[usize],
+) -> Result<RecordBatch> {
+    let row_count = batch.num_rows();
+    let codec = SparsePrimaryKeyCodec::schemaless();
+    let tag_arrays = build_tag_arrays(&batch, sorted_tag_columns);
+    let tsids = compute_tsid_array(&batch, sorted_tag_columns, &tag_arrays);
+
+    let mut pk_builder = BinaryBuilder::with_capacity(row_count, 0);
+    // Scratch buffer for one encoded key, reused across rows.
+    let mut key = Vec::new();
+    for row in 0..row_count {
+        key.clear();
+        let tsid = ValueRef::UInt64(tsids.value(row));
+        let internal = [
+            (ReservedColumnId::table_id(), ValueRef::UInt32(table_id)),
+            (ReservedColumnId::tsid(), tsid),
+        ];
+        codec
+            .encode_to_vec(internal.into_iter(), &mut key)
+            .context(EncodePrimaryKeySnafu)?;
+
+        // Tags that are null in this row are left out of the key.
+        let tags = sorted_tag_columns
+            .iter()
+            .zip(&tag_arrays)
+            .filter(|(_, arr)| !arr.is_null(row))
+            .map(|(tc, arr)| (tc.column_id, ValueRef::String(arr.value(row))));
+        codec
+            .encode_to_vec(tags, &mut key)
+            .context(EncodePrimaryKeySnafu)?;
+
+        pk_builder.append_value(&key);
+    }
+
+    let input_schema = batch.schema();
+    let pk_field = Field::new(PRIMARY_KEY_COLUMN_NAME, DataType::Binary, false);
+    let mut fields = vec![Arc::new(pk_field)];
+    let mut columns: Vec<Arc<dyn Array>> = vec![Arc::new(pk_builder.finish())];
+    for &idx in non_tag_column_indices {
+        fields.push(input_schema.fields()[idx].clone());
+        columns.push(batch.column(idx).clone());
+    }
+
+    let new_schema = Arc::new(ArrowSchema::new(fields));
+    RecordBatch::try_new(new_schema, columns).map_err(|e| {
+        UnexpectedRequestSnafu {
+            reason: format!("Failed to build modified sparse RecordBatch: {e}"),
+        }
+        .build()
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::sync::Arc;
+
+    use api::v1::value::ValueData;
+    use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
+    use datatypes::arrow::array::{BinaryArray, Int64Array, StringArray};
+    use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME;
+
+    use super::*;
+    use crate::row_modifier::{RowModifier, RowsIter, TableIdInput};
+
+    fn build_sparse_test_batch() -> RecordBatch {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("greptime_timestamp", DataType::Int64, false),
+            Field::new("greptime_value", DataType::Float64, true),
+            Field::new("namespace", DataType::Utf8, true),
+            Field::new("host", DataType::Utf8, true),
+        ]));
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int64Array::from(vec![1000])),
+                Arc::new(datatypes::arrow::array::Float64Array::from(vec![42.0])),
+                Arc::new(StringArray::from(vec!["greptimedb"])),
+                Arc::new(StringArray::from(vec!["127.0.0.1"])),
+            ],
+        )
+        .unwrap()
+    }
+
+    fn sparse_tag_columns() -> Vec<TagColumnInfo> {
+        vec![
+            TagColumnInfo {
+                name: "host".to_string(),
+                index: 3,
+                column_id: 3,
+            },
+            TagColumnInfo {
+                name: "namespace".to_string(),
+                index: 2,
+                column_id: 2,
+            },
+        ]
+    }
+
+    #[test]
+    fn test_compute_tsid_basic() {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("namespace", DataType::Utf8, true),
+            Field::new("host", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(StringArray::from(vec!["greptimedb"])),
+                Arc::new(StringArray::from(vec!["127.0.0.1"])),
+            ],
+        )
+        .unwrap();
+
+        let tag_columns: Vec<TagColumnInfo> = vec![
+            TagColumnInfo {
+                name: "host".to_string(),
+                index: 1,
+                column_id: 2,
+            },
+            TagColumnInfo {
+                name: "namespace".to_string(),
+                index: 0,
+                column_id: 1,
+            },
+        ];
+        let tag_arrays = build_tag_arrays(&batch, &tag_columns);
+        let tsid_array = compute_tsid_array(&batch, &tag_columns, &tag_arrays);
+
+        assert_eq!(tsid_array.value(0), 2721566936019240841);
+    }
+
+    #[test]
+    fn test_compute_tsid_with_nulls() {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("a", DataType::Utf8, true),
+            Field::new("b", DataType::Utf8, true),
+        ]));
+        let batch_no_null = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(StringArray::from(vec!["A"])),
+                Arc::new(StringArray::from(vec!["B"])),
+            ],
+        )
+        .unwrap();
+        let tag_cols_2: Vec<TagColumnInfo> = vec![
+            TagColumnInfo {
+                name: "a".to_string(),
+                index: 0,
+                column_id: 1,
+            },
+            TagColumnInfo {
+                name: "b".to_string(),
+                index: 1,
+                column_id: 2,
+            },
+        ];
+        let tag_arrays_2 = build_tag_arrays(&batch_no_null, &tag_cols_2);
+        let tsid_no_null = compute_tsid_array(&batch_no_null, &tag_cols_2, &tag_arrays_2);
+
+        let schema3 = Arc::new(ArrowSchema::new(vec![
+            Field::new("a", DataType::Utf8, true),
+            Field::new("b", DataType::Utf8, true),
+            Field::new("c", DataType::Utf8, true),
+        ]));
+        let batch_with_null = RecordBatch::try_new(
+            schema3,
+            vec![
+                Arc::new(StringArray::from(vec!["A"])),
+                Arc::new(StringArray::from(vec!["B"])),
+                Arc::new(StringArray::from(vec![None as Option<&str>])),
+            ],
+        )
+        .unwrap();
+        let tag_cols_3: Vec<TagColumnInfo> = vec![
+            TagColumnInfo {
+                name: "a".to_string(),
+                index: 0,
+                column_id: 1,
+            },
+            TagColumnInfo {
+                name: "b".to_string(),
+                index: 1,
+                column_id: 2,
+            },
+            TagColumnInfo {
+                name: "c".to_string(),
+                index: 2,
+                column_id: 3,
+            },
+        ];
+        let tag_arrays_3 = build_tag_arrays(&batch_with_null, &tag_cols_3);
+        let tsid_with_null = compute_tsid_array(&batch_with_null, &tag_cols_3, &tag_arrays_3);
+
+        assert_eq!(tsid_no_null.value(0), tsid_with_null.value(0));
+    }
+
+    #[test]
+    fn test_modify_batch_sparse() {
+        let batch = build_sparse_test_batch();
+        let tag_columns = sparse_tag_columns();
+        let non_tag_indices = vec![0, 1];
+        let table_id: u32 = 1025;
+
+        let modified =
+            modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap();
+
+        assert_eq!(modified.num_columns(), 3);
+        assert_eq!(modified.schema().field(0).name(), PRIMARY_KEY_COLUMN_NAME);
+        assert_eq!(modified.schema().field(1).name(), "greptime_timestamp");
+        assert_eq!(modified.schema().field(2).name(), "greptime_value");
+    }
+
+    #[test]
+    fn test_modify_batch_sparse_matches_row_modifier() {
+        let batch = build_sparse_test_batch();
+        let tag_columns = sparse_tag_columns();
+        let non_tag_indices = vec![0, 1];
+        let table_id: u32 = 1025;
+        let modified =
+            modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap();
+
+        let name_to_column_id: HashMap<String, ColumnId> = [
+            ("greptime_timestamp".to_string(), 0),
+            ("greptime_value".to_string(), 1),
+            ("namespace".to_string(), 2),
+            ("host".to_string(), 3),
+        ]
+        .into_iter()
+        .collect();
+
+        let rows = Rows {
+            schema: vec![
+                ColumnSchema {
+                    column_name: "greptime_timestamp".to_string(),
+                    datatype: ColumnDataType::TimestampMillisecond as i32,
+                    semantic_type: SemanticType::Timestamp as i32,
+                    ..Default::default()
+                },
+                ColumnSchema {
+                    column_name: "greptime_value".to_string(),
+                    datatype: ColumnDataType::Float64 as i32,
+                    semantic_type: SemanticType::Field as i32,
+                    ..Default::default()
+                },
+                ColumnSchema {
+                    column_name: "namespace".to_string(),
+                    datatype: ColumnDataType::String as i32,
+                    semantic_type: SemanticType::Tag as i32,
+                    ..Default::default()
+                },
+                ColumnSchema {
+                    column_name: "host".to_string(),
+                    datatype: ColumnDataType::String as i32,
+                    semantic_type: SemanticType::Tag as i32,
+                    ..Default::default()
+                },
+            ],
+            rows: vec![Row {
+                values: vec![
+                    Value {
+                        value_data: Some(ValueData::TimestampMillisecondValue(1000)),
+                    },
+                    Value {
+                        value_data: Some(ValueData::F64Value(42.0)),
+                    },
+                    Value {
+                        value_data: Some(ValueData::StringValue("greptimedb".to_string())),
+                    },
+                    Value {
+                        value_data: Some(ValueData::StringValue("127.0.0.1".to_string())),
+                    },
+                ],
+            }],
+        };
+
+        let row_iter = RowsIter::new(rows, &name_to_column_id);
+        let rows = RowModifier::default()
+            .modify_rows(
+                row_iter,
+                TableIdInput::Single(table_id),
+                PrimaryKeyEncoding::Sparse,
+            )
+            .unwrap();
+        let ValueData::BinaryValue(expected_pk) =
+            rows.rows[0].values[0].value_data.clone().unwrap()
+        else {
+            panic!("expected binary primary key");
+        };
+
+        let actual_array = modified
+            .column(0)
+            .as_any()
+            .downcast_ref::<BinaryArray>()
+            .unwrap();
+        assert_eq!(actual_array.value(0), expected_pk.as_slice());
+    }
+}
diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs
index 7a1efedac4..ba90ca960d 100644
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 mod alter;
+mod bulk_insert;
 mod catchup;
 mod close;
 mod create;
@@ -288,9 +289,8 @@ impl RegionEngine for MetricEngine {
                 debug_assert_eq!(region_id, resp_region_id);
                 return response;
             }
-            RegionRequest::BulkInserts(_) => {
-                // todo(hl): find a way to support bulk inserts in metric engine.
-                UnsupportedRegionRequestSnafu { request }.fail()
+            RegionRequest::BulkInserts(bulk) => {
+                self.inner.bulk_insert_region(region_id, bulk).await
             }
         };
 
diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs
new file mode 100644
index 0000000000..2a3c26c80c
--- /dev/null
+++ b/src/metric-engine/src/engine/bulk_insert.rs
@@ -0,0 +1,783 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+
+use api::v1::{ArrowIpc, ColumnDataType, SemanticType};
+use bytes::Bytes;
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_grpc::flight::{FlightEncoder, FlightMessage};
+use common_query::prelude::{greptime_timestamp, greptime_value};
+use datatypes::arrow::array::{Array, Float64Array, StringArray, TimestampMillisecondArray};
+use datatypes::arrow::record_batch::RecordBatch;
+use snafu::{OptionExt, ensure};
+use store_api::codec::PrimaryKeyEncoding;
+use store_api::metadata::RegionMetadataRef;
+use store_api::region_request::{
+    AffectedRows, RegionBulkInsertsRequest, RegionPutRequest, RegionRequest,
+};
+use store_api::storage::RegionId;
+
+use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse};
+use crate::engine::MetricEngineInner;
+use crate::error;
+use crate::error::Result;
+
+impl MetricEngineInner {
+    /// Bulk-inserts the logical rows in `request.payload` into logical region `region_id`.
+    ///
+    /// Fails with `UnsupportedRegionRequest` when `region_id` is a physical region or
+    /// when the region's primary key encoding is not `PrimaryKeyEncoding::Sparse`.
+    ///
+    /// The batch is rewritten by `modify_batch_sparse`, encoded to Arrow IPC and sent to
+    /// the data region as a `BulkInserts` request with the same `partition_expr_version`.
+    /// If that write fails with `StatusCode::Unsupported`, the original batch is converted
+    /// to `api::v1::Rows` and written through `put_region` instead; that path requires
+    /// the timestamp, value and Utf8 tag columns checked by `record_batch_to_rows`.
+    ///
+    /// Returns the number of affected rows, or `0` without writing if the batch is empty.
+    pub async fn bulk_insert_region(
+        &self,
+        region_id: RegionId,
+        request: RegionBulkInsertsRequest,
+    ) -> Result<AffectedRows> {
+        ensure!(
+            !self.is_physical_region(region_id),
+            error::UnsupportedRegionRequestSnafu {
+                request: RegionRequest::BulkInserts(request),
+            }
+        );
+
+        let (physical_region_id, data_region_id, primary_key_encoding) =
+            self.find_data_region_meta(region_id)?;
+
+        if primary_key_encoding != PrimaryKeyEncoding::Sparse {
+            return error::UnsupportedRegionRequestSnafu {
+                request: RegionRequest::BulkInserts(request),
+            }
+            .fail();
+        }
+
+        let batch = request.payload;
+        if batch.num_rows() == 0 {
+            return Ok(0);
+        }
+
+        let logical_metadata = self
+            .logical_region_metadata(physical_region_id, region_id)
+            .await?;
+        let (tag_columns, non_tag_indices) = self.resolve_tag_columns_from_metadata(
+            region_id,
+            data_region_id,
+            &batch,
+            &logical_metadata,
+        )?;
+        let modified_batch = modify_batch_sparse(
+            batch.clone(),
+            region_id.table_id(),
+            &tag_columns,
+            &non_tag_indices,
+        )?;
+        let (schema, data_header, payload) = record_batch_to_ipc(&modified_batch)?;
+
+        let partition_expr_version = request.partition_expr_version;
+        let request = RegionBulkInsertsRequest {
+            region_id: data_region_id,
+            payload: modified_batch,
+            raw_data: ArrowIpc {
+                schema,
+                data_header,
+                payload,
+            },
+            partition_expr_version,
+        };
+        match self
+            .data_region
+            .write_data(data_region_id, RegionRequest::BulkInserts(request))
+            .await
+        {
+            Ok(affected_rows) => Ok(affected_rows),
+            Err(err) if err.status_code() == StatusCode::Unsupported => {
+                // todo(hl): Put fallback for PartitionTreeMemtable, remove it with that memtable
+                let rows = record_batch_to_rows(&batch, region_id)?;
+                self.put_region(
+                    region_id,
+                    RegionPutRequest {
+                        rows,
+                        hint: None,
+                        partition_expr_version,
+                    },
+                )
+                .await
+            }
+            Err(err) => Err(err),
+        }
+    }
+
+    /// Splits the columns of `batch` into tag columns and non-tag column indices.
+    ///
+    /// Tag names come from `logical_metadata`, column ids from the physical columns of
+    /// `data_region_id`. Fails if that region has no state or a batch column has no
+    /// column id. The returned tag columns are sorted by name.
+    fn resolve_tag_columns_from_metadata(
+        &self,
+        logical_region_id: RegionId,
+        data_region_id: RegionId,
+        batch: &RecordBatch,
+        logical_metadata: &RegionMetadataRef,
+    ) -> Result<(Vec<TagColumnInfo>, Vec<usize>)> {
+        let tag_names: HashSet<&str> = logical_metadata
+            .column_metadatas
+            .iter()
+            .filter(|column| column.semantic_type == SemanticType::Tag)
+            .map(|column| column.column_schema.name.as_str())
+            .collect();
+
+        let mut tag_columns = Vec::new();
+        let mut non_tag_indices = Vec::new();
+        {
+            // Scope the state read guard to the column lookup below.
+            let state = self.state.read().unwrap();
+            let physical_region = state.physical_region_states().get(&data_region_id);
+            let physical_columns = physical_region
+                .context(error::PhysicalRegionNotFoundSnafu {
+                    region_id: data_region_id,
+                })?
+                .physical_columns();
+
+            for (index, field) in batch.schema().fields().iter().enumerate() {
+                let name = field.name();
+                let Some(&column_id) = physical_columns.get(name) else {
+                    return error::ColumnNotFoundSnafu {
+                        name: name.clone(),
+                        region_id: logical_region_id,
+                    }
+                    .fail();
+                };
+                if !tag_names.contains(name.as_str()) {
+                    non_tag_indices.push(index);
+                    continue;
+                }
+                tag_columns.push(TagColumnInfo {
+                    name: name.clone(),
+                    index,
+                    column_id,
+                });
+            }
+        }
+
+        tag_columns.sort_by(|left, right| left.name.cmp(&right.name));
+        Ok((tag_columns, non_tag_indices))
+    }
+}
+
+/// Converts a logical `RecordBatch` into `api::v1::Rows`.
+///
+/// `bulk_insert_region` uses this to retry a bulk insert as a `Put` request.
+///
+/// The batch must contain a `Timestamp(Millisecond, _)` column named
+/// `greptime_timestamp()` and a `Float64` column named `greptime_value()`. Every other
+/// column is treated as a tag and must be `Utf8`.
+///
+/// The returned schema lists the timestamp column first, the value column second and
+/// then the tag columns in batch order; each row holds its values in that same order.
+/// Arrow nulls are emitted as `Value { value_data: None }`.
+///
+/// # Errors
+///
+/// Returns `UnexpectedRequest` when a column has an unsupported data type or when the
+/// timestamp or value column is missing. `logical_region_id` only appears in these
+/// error messages.
+///
+/// # Panics
+///
+/// Panics if a column's array cannot be downcast to the type its data type implies.
+fn record_batch_to_rows(batch: &RecordBatch, logical_region_id: RegionId) -> Result<api::v1::Rows> {
+    use api::v1::value::ValueData;
+    use datatypes::arrow::datatypes::{DataType, TimeUnit};
+
+    let arrow_schema = batch.schema();
+    let fields = arrow_schema.fields();
+
+    // Locate the timestamp and value columns and validate every column's data type.
+    let mut ts_idx = None;
+    let mut val_idx = None;
+    let mut tag_indices = Vec::new();
+    for (idx, field) in fields.iter().enumerate() {
+        let (name, data_type) = (field.name(), field.data_type());
+        if name == greptime_timestamp() {
+            ts_idx = Some(idx);
+            ensure!(
+                matches!(data_type, DataType::Timestamp(TimeUnit::Millisecond, _)),
+                error::UnexpectedRequestSnafu {
+                    reason: format!(
+                        "Timestamp column '{}' in region {:?} has incompatible type: {:?}",
+                        name,
+                        logical_region_id,
+                        data_type
+                    ),
+                }
+            );
+        } else if name == greptime_value() {
+            val_idx = Some(idx);
+            ensure!(
+                matches!(data_type, DataType::Float64),
+                error::UnexpectedRequestSnafu {
+                    reason: format!(
+                        "Value column '{}' in region {:?} has incompatible type: {:?}",
+                        name,
+                        logical_region_id,
+                        data_type
+                    ),
+                }
+            );
+        } else {
+            // Any other column is treated as a tag.
+            ensure!(
+                matches!(data_type, DataType::Utf8),
+                error::UnexpectedRequestSnafu {
+                    reason: format!(
+                        "Tag column '{}' in region {:?} must be Utf8, found: {:?}",
+                        name,
+                        logical_region_id,
+                        data_type
+                    ),
+                }
+            );
+            tag_indices.push(idx);
+        }
+    }
+
+    // Both well-known columns are mandatory.
+    let Some(ts_idx) = ts_idx else {
+        return error::UnexpectedRequestSnafu {
+            reason: format!(
+                "Timestamp column '{}' not found in RecordBatch for region {:?}",
+                greptime_timestamp(),
+                logical_region_id
+            ),
+        }
+        .fail();
+    };
+    let Some(val_idx) = val_idx else {
+        return error::UnexpectedRequestSnafu {
+            reason: format!(
+                "Value column '{}' not found in RecordBatch for region {:?}",
+                greptime_value(),
+                logical_region_id
+            ),
+        }
+        .fail();
+    };
+
+    // Output schema: timestamp, value, then the tags in batch order.
+    let column_schema = |name: &str, datatype: ColumnDataType, semantic: SemanticType| {
+        api::v1::ColumnSchema {
+            column_name: name.to_string(),
+            datatype: datatype as i32,
+            semantic_type: semantic as i32,
+            datatype_extension: None,
+            options: None,
+        }
+    };
+    let mut schema = Vec::with_capacity(2 + tag_indices.len());
+    schema.push(column_schema(
+        greptime_timestamp(),
+        ColumnDataType::TimestampMillisecond,
+        SemanticType::Timestamp,
+    ));
+    schema.push(column_schema(
+        greptime_value(),
+        ColumnDataType::Float64,
+        SemanticType::Field,
+    ));
+    for &idx in &tag_indices {
+        let tag_name = fields[idx].name().as_str();
+        schema.push(column_schema(tag_name, ColumnDataType::String, SemanticType::Tag));
+    }
+
+    // The data types were validated above, so these downcasts are expected to succeed.
+    let ts_array = batch
+        .column(ts_idx)
+        .as_any()
+        .downcast_ref::<TimestampMillisecondArray>()
+        .expect("validated as TimestampMillisecond");
+    let val_array = batch
+        .column(val_idx)
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .expect("validated as Float64");
+    let mut tag_arrays: Vec<&StringArray> = Vec::with_capacity(tag_indices.len());
+    for &idx in &tag_indices {
+        let tag_array = batch
+            .column(idx)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .expect("validated as Utf8");
+        tag_arrays.push(tag_array);
+    }
+
+    // Build one `Row` per batch row; a null Arrow slot maps to `value_data: None`.
+    let rows = (0..batch.num_rows())
+        .map(|row_idx| {
+            let timestamp = (!ts_array.is_null(row_idx))
+                .then(|| ValueData::TimestampMillisecondValue(ts_array.value(row_idx)));
+            let value = (!val_array.is_null(row_idx))
+                .then(|| ValueData::F64Value(val_array.value(row_idx)));
+            let tags = tag_arrays.iter().map(|tag_array| {
+                (!tag_array.is_null(row_idx))
+                    .then(|| ValueData::StringValue(tag_array.value(row_idx).to_string()))
+            });
+            let mut values = Vec::with_capacity(2 + tag_arrays.len());
+            values.push(api::v1::Value { value_data: timestamp });
+            values.push(api::v1::Value { value_data: value });
+            values.extend(tags.map(|value_data| api::v1::Value { value_data }));
+            api::v1::Row { values }
+        })
+        .collect();
+
+    Ok(api::v1::Rows { schema, rows })
+}
+
+/// Encodes `record_batch` as Arrow IPC using `FlightEncoder`.
+///
+/// Returns `(schema header, batch data header, batch data body)`. Fails with
+/// `UnexpectedRequest` unless encoding the batch yields exactly one message.
+fn record_batch_to_ipc(record_batch: &RecordBatch) -> Result<(Bytes, Bytes, Bytes)> {
+    let mut flight_encoder = FlightEncoder::default();
+    let schema_message = flight_encoder.encode_schema(record_batch.schema().as_ref());
+    let batch_message = FlightMessage::RecordBatch(record_batch.clone());
+    let mut encoded = flight_encoder.encode(batch_message).into_iter();
+
+    // The header and body of the first encoded message become the IPC payload.
+    let batch_data = encoded.next().context(error::UnexpectedRequestSnafu {
+        reason: "Failed to encode empty flight data",
+    })?;
+    // Anything after the first message is rejected rather than silently dropped.
+    if encoded.next().is_some() {
+        return error::UnexpectedRequestSnafu {
+            reason: "Bulk insert RecordBatch with dictionary arrays is unsupported".to_string(),
+        }
+        .fail();
+    }
+
+    let schema_header = schema_message.data_header;
+    let (data_header, payload) = (batch_data.data_header, batch_data.data_body);
+    Ok((schema_header, data_header, payload))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::assert_matches::assert_matches;
+    use std::sync::Arc;
+
+    use api::v1::ArrowIpc;
+    use common_error::ext::ErrorExt;
+    use common_query::prelude::{greptime_timestamp, greptime_value};
+    use common_recordbatch::RecordBatches;
+    use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray};
+    use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use store_api::metric_engine_consts::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING;
+    use store_api::path_utils::table_dir;
+    use store_api::region_engine::RegionEngine;
+    use store_api::region_request::{RegionBulkInsertsRequest, RegionPutRequest, RegionRequest};
+    use store_api::storage::{RegionId, ScanRequest};
+
+    use super::record_batch_to_ipc;
+    use crate::error::Error;
+    use crate::test_util::{self, TestEnv};
+
+    fn build_logical_batch(start: usize, rows: usize) -> RecordBatch {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(
+                greptime_timestamp(),
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("job", DataType::Utf8, true),
+        ]));
+
+        let mut ts = Vec::with_capacity(rows);
+        let mut values = Vec::with_capacity(rows);
+        let mut tags = Vec::with_capacity(rows);
+        for i in start..start + rows {
+            ts.push(i as i64); // timestamp and value both equal the row number
+            values.push(i as f64);
+            tags.push("tag_0".to_string()); // every row carries the same tag value
+        }
+
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(ts)),
+                Arc::new(Float64Array::from(values)),
+                Arc::new(StringArray::from(tags)),
+            ],
+        )
+        .unwrap()
+    }
+
+    fn build_bulk_request(logical_region_id: RegionId, batch: RecordBatch) -> RegionRequest {
+        let (schema, data_header, payload) = record_batch_to_ipc(&batch).unwrap();
+        RegionRequest::BulkInserts(RegionBulkInsertsRequest {
+            region_id: logical_region_id,
+            payload: batch,
+            raw_data: ArrowIpc {
+                schema,
+                data_header,
+                payload,
+            },
+            partition_expr_version: None,
+        })
+    }
+
+    async fn init_dense_metric_region(env: &TestEnv) -> RegionId {
+        let physical_region_id = env.default_physical_region_id();
+        env.create_physical_region(
+            physical_region_id,
+            &TestEnv::default_table_dir(),
+            vec![(
+                MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING.to_string(),
+                "dense".to_string(), // expected to select a non-sparse encoding
+            )],
+        )
+        .await;
+
+        let logical_region_id = env.default_logical_region_id();
+        let request = test_util::create_logical_region_request(
+            &["job"],
+            physical_region_id,
+            &table_dir("test", logical_region_id.table_id()),
+        );
+        env.metric()
+            .handle_request(logical_region_id, RegionRequest::Create(request))
+            .await
+            .unwrap();
+        logical_region_id
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_empty_batch_returns_zero() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+
+        let batch = build_logical_batch(0, 0);
+        let request = RegionRequest::BulkInserts(RegionBulkInsertsRequest {
+            region_id: logical_region_id,
+            payload: batch,
+            raw_data: ArrowIpc::default(), // `bulk_insert_region` never reads `raw_data`
+            partition_expr_version: None,
+        });
+        let response = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        assert_eq!(response.affected_rows, 0);
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_physical_region_rejected() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+
+        let physical_region_id = env.default_physical_region_id();
+        let batch = build_logical_batch(0, 2);
+        let request = build_bulk_request(physical_region_id, batch);
+
+        let err = env
+            .metric()
+            .handle_request(physical_region_id, request)
+            .await
+            .unwrap_err();
+        let Some(err) = err.as_any().downcast_ref::<Error>() else {
+            panic!("unexpected error type");
+        };
+        assert_matches!(err, Error::UnsupportedRegionRequest { .. });
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_unknown_column_errors() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(
+                greptime_timestamp(),
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("nonexistent_column", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(vec![0i64])),
+                Arc::new(Float64Array::from(vec![1.0])),
+                Arc::new(StringArray::from(vec!["val"])),
+            ],
+        )
+        .unwrap();
+
+        let request = build_bulk_request(logical_region_id, batch);
+        let err = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap_err();
+        let Some(err) = err.as_any().downcast_ref::<Error>() else {
+            panic!("unexpected error type");
+        };
+        assert_matches!(err, Error::ColumnNotFound { .. });
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_multiple_tag_columns() {
+        let env = TestEnv::new().await;
+        let physical_region_id = env.default_physical_region_id();
+        env.create_physical_region(physical_region_id, &TestEnv::default_table_dir(), vec![])
+            .await;
+        let logical_region_id = env.default_logical_region_id();
+        let request = test_util::create_logical_region_request(
+            &["host", "region"],
+            physical_region_id,
+            &table_dir("test", logical_region_id.table_id()),
+        );
+        env.metric()
+            .handle_request(logical_region_id, RegionRequest::Create(request))
+            .await
+            .unwrap();
+
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(
+                greptime_timestamp(),
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("host", DataType::Utf8, true),
+            Field::new("region", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(vec![0i64, 1, 2])),
+                Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0])),
+                Arc::new(StringArray::from(vec!["h1", "h2", "h1"])),
+                Arc::new(StringArray::from(vec!["us-east", "us-west", "eu-west"])),
+            ],
+        )
+        .unwrap();
+
+        let request = build_bulk_request(logical_region_id, batch);
+        let response = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        assert_eq!(response.affected_rows, 3);
+
+        let stream = env
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let batches = RecordBatches::try_collect(stream).await.unwrap();
+        assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 3);
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_accumulates_rows() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+
+        let request = build_bulk_request(logical_region_id, build_logical_batch(0, 3));
+        let response = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        assert_eq!(response.affected_rows, 3);
+
+        let request = build_bulk_request(logical_region_id, build_logical_batch(3, 5));
+        let response = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        assert_eq!(response.affected_rows, 5);
+
+        let stream = env
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let batches = RecordBatches::try_collect(stream).await.unwrap();
+        assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 8);
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_sparse_encoding() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+
+        let request = build_bulk_request(logical_region_id, build_logical_batch(0, 4));
+        let response = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        assert_eq!(response.affected_rows, 4);
+
+        let stream = env
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let batches = RecordBatches::try_collect(stream).await.unwrap();
+        assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 4);
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_dense_encoding_rejected() {
+        let env = TestEnv::new().await;
+        let logical_region_id = init_dense_metric_region(&env).await;
+
+        let request = build_bulk_request(logical_region_id, build_logical_batch(0, 2));
+        let err = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap_err();
+        let Some(err) = err.as_any().downcast_ref::<Error>() else {
+            panic!("unexpected error type");
+        };
+        assert_matches!(err, Error::UnsupportedRegionRequest { .. });
+    }
+
+    #[tokio::test]
+    async fn test_bulk_insert_matches_put() {
+        let env_put = TestEnv::new().await;
+        env_put.init_metric_region().await;
+        let logical_region_id = env_put.default_logical_region_id();
+        let schema = test_util::row_schema_with_tags(&["job"]);
+        let rows = test_util::build_rows(1, 5); // assumed equal to build_logical_batch(0, 5)
+        env_put
+            .metric()
+            .handle_request(
+                logical_region_id,
+                RegionRequest::Put(RegionPutRequest {
+                    rows: api::v1::Rows { schema, rows },
+                    hint: None,
+                    partition_expr_version: None,
+                }),
+            )
+            .await
+            .unwrap();
+        let put_stream = env_put
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let put_batches = RecordBatches::try_collect(put_stream).await.unwrap();
+        let put_output = put_batches.pretty_print().unwrap();
+
+        let env_bulk = TestEnv::new().await;
+        env_bulk.init_metric_region().await;
+        let request = build_bulk_request(logical_region_id, build_logical_batch(0, 5));
+        env_bulk
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        let bulk_stream = env_bulk
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let bulk_batches = RecordBatches::try_collect(bulk_stream).await.unwrap();
+        let bulk_output = bulk_batches.pretty_print().unwrap();
+
+        assert_eq!(put_output, bulk_output);
+    }
+
+    #[test]
+    fn test_record_batch_to_rows_with_null_values() {
+        use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray};
+        use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit};
+        use datatypes::arrow::record_batch::RecordBatch;
+        use store_api::storage::RegionId;
+
+        use crate::engine::bulk_insert::record_batch_to_rows;
+
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(
+                greptime_timestamp(),
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                true,
+            ),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("job", DataType::Utf8, true),
+            Field::new("host", DataType::Utf8, true),
+        ]));
+
+        let ts_array = TimestampMillisecondArray::from(vec![Some(1000), None, Some(3000)]);
+        let val_array = Float64Array::from(vec![Some(1.0), Some(2.0), None]);
+        let job_array = StringArray::from(vec![Some("job1"), None, Some("job3")]);
+        let host_array = StringArray::from(vec![None, Some("host2"), Some("host3")]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(ts_array),
+                Arc::new(val_array),
+                Arc::new(job_array),
+                Arc::new(host_array),
+            ],
+        )
+        .unwrap();
+
+        let region_id = RegionId::new(1, 1);
+        let rows = record_batch_to_rows(&batch, region_id).unwrap();
+
+        assert_eq!(rows.rows.len(), 3);
+        assert_eq!(rows.schema.len(), 4);
+
+        // Row 0 (values are [timestamp, value, job, host]): only host is null
+        assert!(rows.rows[0].values[0].value_data.is_some());
+        assert!(rows.rows[0].values[1].value_data.is_some());
+        assert!(rows.rows[0].values[2].value_data.is_some());
+        assert!(rows.rows[0].values[3].value_data.is_none());
+
+        // Row 1: timestamp and job are null; value and host are set
+        assert!(rows.rows[1].values[0].value_data.is_none());
+        assert!(rows.rows[1].values[1].value_data.is_some());
+        assert!(rows.rows[1].values[2].value_data.is_none());
+        assert!(rows.rows[1].values[3].value_data.is_some());
+
+        // Row 2: only the value is null
+        assert!(rows.rows[2].values[0].value_data.is_some());
+        assert!(rows.rows[2].values[1].value_data.is_none());
+        assert!(rows.rows[2].values[2].value_data.is_some());
+        assert!(rows.rows[2].values[3].value_data.is_some());
+    }
+}
diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs
index 9251605aea..edae0d2bb4 100644
--- a/src/metric-engine/src/engine/put.rs
+++ b/src/metric-engine/src/engine/put.rs
@@ -460,7 +460,7 @@ impl MetricEngineInner {
             .await
     }
 
-    fn find_data_region_meta(
+    pub(crate) fn find_data_region_meta(
         &self,
         logical_region_id: RegionId,
     ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> {
diff --git a/src/metric-engine/src/lib.rs b/src/metric-engine/src/lib.rs
index 30daa80b91..b93029f2f4 100644
--- a/src/metric-engine/src/lib.rs
+++ b/src/metric-engine/src/lib.rs
@@ -52,6 +52,7 @@
 
 #![feature(assert_matches)]
 
+mod batch_modifier;
 pub mod config;
 mod data_region;
 pub mod engine;

From cc441b564238562b25767be31c5d93d86c3fdc00 Mon Sep 17 00:00:00 2001
From: ZonaHe <zonahe@qq.com>
Date: Wed, 18 Mar 2026 02:25:14 +0800
Subject: [PATCH 19/42] feat: update dashboard to v0.12.0 (#7823)

Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
---
 src/servers/dashboard/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION
index 03ee1a5314..87a1cf595a 100644
--- a/src/servers/dashboard/VERSION
+++ b/src/servers/dashboard/VERSION
@@ -1 +1 @@
-v0.11.13
+v0.12.0

From f2bccbd96adadff6d1e07f62f5e4c467c4b7d8ae Mon Sep 17 00:00:00 2001
From: discord9 <55937128+discord9@users.noreply.github.com>
Date: Thu, 19 Mar 2026 08:37:40 +0800
Subject: [PATCH 20/42] docs: flow inc query rfc (#7816)

* docs: flow inc query rfc

Signed-off-by: discord9 <discord9@163.com>

* chore: typo

Signed-off-by: discord9 <discord9@163.com>

* chore

Signed-off-by: discord9 <discord9@163.com>

* docs: clarify flow incremental stale recovery

Clarify that flush-boundary invalidation is part of IncrementalQueryStale, and document the in-memory checkpoint plus cold-start full snapshot recovery model.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 docs/rfcs/2026-03-16-flow-inc-query.md | 190 +++++++++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 docs/rfcs/2026-03-16-flow-inc-query.md

diff --git a/docs/rfcs/2026-03-16-flow-inc-query.md b/docs/rfcs/2026-03-16-flow-inc-query.md
new file mode 100644
index 0000000000..8041d37d2b
--- /dev/null
+++ b/docs/rfcs/2026-03-16-flow-inc-query.md
@@ -0,0 +1,190 @@
+---
+Feature Name: Flow Batching Sequence-Based Incremental Query Plan (Lite)
+Tracking Issue: TBD
+Date: 2026-03-16
+Author: @discord9
+---
+
+# Summary
+
+This RFC proposes a correctness-first incremental query mode for Flow batching.
+Flow queries can read only rows with `seq > checkpoint` and advance checkpoints using per-region correctness watermarks.
+When incremental reads are stale or correctness cannot be proven, Flow falls back to full recomputation.
+
+# Motivation
+
+Flow batching currently recomputes data it has already processed each time a time window is re-evaluated. Reading only rows newer than the last checkpoint avoids this repeated work and improves Flow performance.
+
+# Goals
+
+1. Add opt-in incremental reads (`seq > given_seq`) for Flow.
+2. Return per-region correctness watermarks for checkpoint advancement.
+3. Keep existing query behavior unchanged unless explicitly enabled.
+4. Define deterministic fallback for stale or unprovable incremental reads.
+
+# Non-Goals
+
+1. No business-schema changes (no synthetic watermark columns in result rows).
+2. No global throughput optimization in v1 (correctness first).
+3. No observational watermark output when correctness is unprovable.
+
+# Proposal
+
+## 1) Query options
+
+Introduce three `QueryContext` extension keys:
+
+- `flow.incremental_after_seqs`
+- `flow.incremental_mode`
+- `flow.return_region_seq`
+
+These options are opt-in and only affect Flow incremental execution paths.
+
+## 2) Scan mapping
+
+When incremental mode is enabled:
+
+- map `after_seq` to `memtable_min_sequence` (exclusive lower bound)
+- keep existing snapshot upper-bound behavior (`memtable_max_sequence`)
+
+Important limitation in v1:
+
+- incremental filtering is correctness-proven only for memtable rows
+- SST files do not preserve detailed row-level sequence metadata; they only expose coarser file-level sequence information
+- therefore `seq > checkpoint` must not assume precise incremental pruning across memtable->SST flush boundaries
+
+If required incremental parameters are missing or invalid, return an argument error.
+
+## 3) Stale protection
+
+Add a dedicated stale error:
+
+- `IncrementalQueryStale { region_id, given_seq, min_readable_seq }`
+
+Behavior:
+
+- if `given_seq < min_readable_seq`, return stale error
+- if `given_seq == min_readable_seq`, query is valid and reads `seq > given_seq`
+- if `given_seq > min_readable_seq`, query is also valid and reads `seq > given_seq`
+
+`IncrementalQueryStale` also covers the case where rows newer than the checkpoint have crossed a memtable->SST flush boundary and sequence-precise incremental exclusion can no longer be proven.
+In other words, the flush-boundary case is not a separate fallback category in v1; it is one concrete way an incremental cursor becomes stale.
+
+## 4) Watermark return
+
+Extend query metrics with an optional per-region watermark map:
+
+- `region_latest_sequences: Vec<(region_id: u64, latest_sequence: u64)>`
+
+Rules:
+
+- only the terminal metrics of a successful query can advance checkpoints
+- for a multi-region query, the watermark map must be either complete (covering every participating region) or absent
+- if correctness is unprovable, business rows may still be returned, but the watermark is absent
+
+## 5) Flow state machine
+
+Checkpoint and watermark state are kept only in flownode memory in v1; they are not persisted as durable flow metadata.
+Cold start or flownode restart therefore always re-enters through a full snapshot read.
+Only after that full query succeeds with a complete correctness watermark may Flow switch back to incremental mode.
+
+Flow starts in full mode, then transitions:
+
+1. Full query succeeds with correctness watermark -> enter incremental mode
+2. Incremental query succeeds with correctness watermark -> advance checkpoint
+3. Incremental query is stale, fails, or succeeds without a complete correctness watermark -> fall back to full mode
+4. Full query without correctness watermark -> remain in full mode
+
+```mermaid
+stateDiagram-v2
+    [*] --> FullSnapshot: Flow starts
+
+    state FullSnapshot {
+        [*] --> RunFull
+        RunFull --> RunFull: Full query succeeds but watermark is unprovable<br/>no region_latest_sequences returned
+    }
+
+    FullSnapshot --> Incremental: Full query succeeds and correctness watermark is returned<br/>(checkpoint updated)
+
+    state Incremental {
+        [*] --> RunInc
+        RunInc --> RunInc: Incremental succeeds<br/>(checkpoint advances)
+    }
+
+    Incremental --> FullSnapshot: IncrementalQueryStale<br/>(cursor too old, fallback required)
+    Incremental --> FullSnapshot: Incremental fails<br/> and fallback policy is triggered
+
+    FullSnapshot --> [*]: Flow stops
+    Incremental --> [*]: Flow stops
+```
+
+### Fallback Policy
+
+Fallback to full mode is deterministic and is triggered by any of the following:
+
+1. `IncrementalQueryStale` is returned.
+2. Incremental query fails with execution errors.
+3. Incremental query succeeds but watermark is absent or incomplete for participating regions.
+
+Policy behavior:
+
+1. Do not advance any checkpoint in the failed/incomplete round.
+2. Switch to full mode for the affected flow/window in the next round.
+3. Return to incremental mode only after a full query succeeds with a complete correctness watermark map.
+
+### Persistence and recovery model
+
+The v1 design is intentionally correctness-first and keeps the progress cursor lightweight:
+
+1. Watermarks/checkpoints live only in flownode memory; v1 does not persist them separately.
+2. On cold start, the flow re-establishes progress by running a successful full-query snapshot read, then resumes incremental mode only after that round returns a complete correctness watermark map.
+3. Sequence-precise incremental correctness is currently limited to rows still visible in memtables.
+4. Once relevant rows have been flushed into SST, the system cannot use `seq > checkpoint` alone to prove precise incremental exclusion, because SST lacks detailed row-level sequence metadata.
+5. In that case the correct behavior is to fall back to full recomputation, not to continue a best-effort incremental scan.
+
+# Distributed and Compatibility Requirements
+
+1. Distributed path must preserve region-level snapshot/read-bound semantics end-to-end.
+2. `snapshot_seqs` transport and `flow.*` options must both be carried correctly.
+   - `snapshot_seqs` means the per-region snapshot upper-bound map: `region_id -> sequence`.
+3. New metrics fields must be backward-compatible (old clients ignore unknown fields).
+
+# Rollout Plan
+
+## Phase 1 (MVP, correctness first)
+
+1. Add extension constants and parsing.
+2. Add incremental scan mapping and stale detection.
+3. Add watermark metrics field and terminal-watermark checkpoint update path.
+4. Complete standalone and distributed passthrough.
+
+## Phase 2 (performance and observability)
+
+1. Improve batching key strategy with sequence/watermark context.
+2. Optimize watermark serialization overhead.
+3. Add metrics: incremental hit rate, fallback rate, fallback window size.
+
+# Testing Plan
+
+1. Unit tests for incremental bounds and stale detection.
+2. Query-path tests for extension mapping and watermark semantics.
+3. Flow integration tests for full->incremental->fallback transitions.
+4. Distributed tests for end-to-end snapshot/watermark propagation.
+5. Compatibility tests for old/new client-server combinations.
+
+# Risks
+
+1. Boundary semantic mismatch (`<` vs `<=`) may cause correctness bugs.
+2. Incomplete distributed propagation can silently invalidate watermark safety.
+3. Frequent fallback can reduce throughput before phase-2 optimizations.
+4. Memtable->SST flushes may force more full recomputation than expected until finer-grained SST sequence tracking exists.
+
+# Alternatives
+
+1. Put watermark into business rows (rejected: schema pollution).
+2. Add new dedicated Flight message type in v1 (deferred to reduce scope).
+
+# Conclusion
+
+This plan enables a practical, correctness-first incremental path for Flow batching.
+It reuses existing sequence scan capability, adds strict stale handling, and advances checkpoints only from correctness-proven per-region watermarks.

From 2af39519445d7a8ac1169c42fc190b036ea44c75 Mon Sep 17 00:00:00 2001
From: Ruihang Xia <waynestxia@gmail.com>
Date: Thu, 19 Mar 2026 11:09:47 +0800
Subject: [PATCH 21/42] feat: cache decoded region metadata along with parquet
 metadata (#7813)

* cache decoded region metadata

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: account for decoded sst metadata cache weight

* take optional pre-exist metadata

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
---
 src/datatypes/src/schema.rs               |  33 +-
 src/datatypes/src/schema/column_schema.rs |  30 +-
 src/mito2/src/access_layer.rs             |   2 +
 src/mito2/src/cache.rs                    | 382 ++++++++++++++++++----
 src/mito2/src/cache/file_cache.rs         |  30 +-
 src/mito2/src/cache/test_util.rs          |  39 ++-
 src/mito2/src/cache/write_cache.rs        |   8 +-
 src/mito2/src/region/opener.rs            |  34 +-
 src/mito2/src/sst/parquet.rs              |   6 +-
 src/mito2/src/sst/parquet/reader.rs       |  55 +---
 src/store-api/src/metadata.rs             |  27 +-
 11 files changed, 526 insertions(+), 120 deletions(-)

diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs
index 9070e2babe..50f2dba270 100644
--- a/src/datatypes/src/schema.rs
+++ b/src/datatypes/src/schema.rs
@@ -16,8 +16,8 @@ mod column_schema;
 pub mod constraint;
 
 use std::collections::HashMap;
-use std::fmt;
 use std::sync::Arc;
+use std::{fmt, mem};
 
 use arrow::datatypes::{Field, Schema as ArrowSchema};
 use datafusion_common::DFSchemaRef;
@@ -177,6 +177,26 @@ impl Schema {
         &self.arrow_schema.metadata
     }
 
+    /// Returns the estimated memory footprint of this schema.
+    pub fn estimated_size(&self) -> usize {
+        mem::size_of_val(self)
+            + mem::size_of::<ColumnSchema>() * self.column_schemas.capacity()
+            + self
+                .column_schemas
+                .iter()
+                .map(|column_schema| {
+                    column_schema.estimated_size() - mem::size_of::<ColumnSchema>()
+                })
+                .sum::<usize>()
+            + mem::size_of::<(String, usize)>() * self.name_to_index.capacity()
+            + self
+                .name_to_index
+                .keys()
+                .map(|name| name.capacity())
+                .sum::<usize>()
+            + arrow_schema_size(self.arrow_schema.as_ref())
+    }
+
     /// Generate a new projected schema
     ///
     /// # Panic
@@ -213,6 +233,17 @@ impl Schema {
     }
 }
 
+fn arrow_schema_size(schema: &ArrowSchema) -> usize {
+    mem::size_of_val(schema)
+        + schema.fields.size()
+        + mem::size_of::<(String, String)>() * schema.metadata.capacity()
+        + schema
+            .metadata
+            .iter()
+            .map(|(key, value)| key.capacity() + value.capacity())
+            .sum::<usize>()
+}
+
 #[derive(Default)]
 pub struct SchemaBuilder {
     column_schemas: Vec<ColumnSchema>,
diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs
index 183cf05da8..2479f4fc41 100644
--- a/src/datatypes/src/schema/column_schema.rs
+++ b/src/datatypes/src/schema/column_schema.rs
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 use std::collections::HashMap;
-use std::fmt;
 use std::str::FromStr;
+use std::{fmt, mem};
 
 use arrow::datatypes::Field;
 use arrow_schema::extension::{
@@ -178,6 +178,19 @@ impl ColumnSchema {
         self
     }
 
+    /// Returns the estimated memory footprint of this column schema.
+    pub fn estimated_size(&self) -> usize {
+        mem::size_of_val(self) - mem::size_of_val(&self.data_type)
+            + self.data_type.as_arrow_type().size()
+            + self.name.capacity()
+            + self
+                .default_constraint
+                .as_ref()
+                .map(column_default_constraint_size)
+                .unwrap_or_default()
+            + metadata_size(&self.metadata)
+    }
+
     /// Set the inverted index for the column.
     /// Similar to [with_inverted_index] but don't take the ownership.
     ///
@@ -493,6 +506,21 @@ impl ColumnSchema {
     }
 }
 
+fn metadata_size(metadata: &Metadata) -> usize {
+    mem::size_of::<(String, String)>() * metadata.capacity()
+        + metadata
+            .iter()
+            .map(|(key, value)| key.capacity() + value.capacity())
+            .sum::<usize>()
+}
+
+fn column_default_constraint_size(default_constraint: &ColumnDefaultConstraint) -> usize {
+    match default_constraint {
+        ColumnDefaultConstraint::Function(expr) => expr.capacity(),
+        ColumnDefaultConstraint::Value(value) => value.as_value_ref().data_size(),
+    }
+}
+
 /// Column extended type set in column schema's metadata.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ColumnExtType {
diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs
index 231285215e..33180ebf46 100644
--- a/src/mito2/src/access_layer.rs
+++ b/src/mito2/src/access_layer.rs
@@ -338,6 +338,7 @@ impl AccessLayer {
         metrics: &mut Metrics,
     ) -> Result<SstInfoArray> {
         let region_id = request.metadata.region_id;
+        let region_metadata = request.metadata.clone();
         let cache_manager = request.cache_manager.clone();
 
         let sst_info = if let Some(write_cache) = cache_manager.write_cache() {
@@ -415,6 +416,7 @@ impl AccessLayer {
                     cache_manager.put_parquet_meta_data(
                         RegionFileId::new(region_id, sst.file_id),
                         parquet_metadata.clone(),
+                        Some(region_metadata.clone()),
                     )
                 }
             }
diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs
index e232489768..c9a8b99166 100644
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -28,6 +28,7 @@ use std::ops::Range;
 use std::sync::Arc;
 
 use bytes::Bytes;
+use common_telemetry::warn;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::value::Value;
 use datatypes::vectors::VectorRef;
@@ -36,8 +37,10 @@ use index::result_cache::IndexResultCache;
 use moka::notification::RemovalCause;
 use moka::sync::Cache;
 use object_store::ObjectStore;
-use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
+use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData};
 use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
+use snafu::{OptionExt, ResultExt};
+use store_api::metadata::RegionMetadataRef;
 use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
 
 use crate::cache::cache_size::parquet_meta_size;
@@ -46,11 +49,13 @@ use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCache
 #[cfg(feature = "vector_index")]
 use crate::cache::index::vector_index::{VectorIndexCache, VectorIndexCacheRef};
 use crate::cache::write_cache::WriteCacheRef;
+use crate::error::{InvalidMetadataSnafu, InvalidParquetSnafu, Result};
 use crate::memtable::record_batch_estimated_size;
 use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
 use crate::read::Batch;
 use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue};
 use crate::sst::file::{RegionFileId, RegionIndexId};
+use crate::sst::parquet::PARQUET_METADATA_KEY;
 use crate::sst::parquet::reader::MetadataCacheMetrics;
 
 /// Metrics type key for sst meta.
@@ -68,6 +73,106 @@ const SELECTOR_RESULT_TYPE: &str = "selector_result";
 /// Metrics type key for range scan result cache.
 const RANGE_RESULT_TYPE: &str = "range_result";
 
+/// Cached SST metadata combines the parquet footer with the decoded region metadata.
+///
+/// The cached parquet footer strips the `greptime:metadata` JSON payload and stores the decoded
+/// [RegionMetadata] separately so readers can skip repeated deserialization work.
+#[derive(Debug)]
+pub(crate) struct CachedSstMeta {
+    parquet_metadata: Arc<ParquetMetaData>,
+    region_metadata: RegionMetadataRef,
+    region_metadata_weight: usize,
+}
+
+impl CachedSstMeta {
+    pub(crate) fn try_new(file_path: &str, parquet_metadata: ParquetMetaData) -> Result<Self> {
+        Self::try_new_with_region_metadata(file_path, parquet_metadata, None)
+    }
+
+    pub(crate) fn try_new_with_region_metadata(
+        file_path: &str,
+        parquet_metadata: ParquetMetaData,
+        region_metadata: Option<RegionMetadataRef>,
+    ) -> Result<Self> {
+        let file_metadata = parquet_metadata.file_metadata();
+        let key_values = file_metadata
+            .key_value_metadata()
+            .context(InvalidParquetSnafu {
+                file: file_path,
+                reason: "missing key value meta",
+            })?;
+        let meta_value = key_values
+            .iter()
+            .find(|kv| kv.key == PARQUET_METADATA_KEY)
+            .with_context(|| InvalidParquetSnafu {
+                file: file_path,
+                reason: format!("key {} not found", PARQUET_METADATA_KEY),
+            })?;
+        let json = meta_value
+            .value
+            .as_ref()
+            .with_context(|| InvalidParquetSnafu {
+                file: file_path,
+                reason: format!("No value for key {}", PARQUET_METADATA_KEY),
+            })?;
+        let region_metadata = match region_metadata {
+            Some(region_metadata) => region_metadata,
+            None => Arc::new(
+                store_api::metadata::RegionMetadata::from_json(json)
+                    .context(InvalidMetadataSnafu)?,
+            ),
+        };
+        // Keep the previous JSON-byte floor and charge the decoded structures as well.
+        let region_metadata_weight = region_metadata.estimated_size().max(json.len());
+        let parquet_metadata = Arc::new(strip_region_metadata_from_parquet(parquet_metadata));
+
+        Ok(Self {
+            parquet_metadata,
+            region_metadata,
+            region_metadata_weight,
+        })
+    }
+
+    pub(crate) fn parquet_metadata(&self) -> Arc<ParquetMetaData> {
+        self.parquet_metadata.clone()
+    }
+
+    pub(crate) fn region_metadata(&self) -> RegionMetadataRef {
+        self.region_metadata.clone()
+    }
+}
+
+fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> ParquetMetaData {
+    let file_metadata = parquet_metadata.file_metadata();
+    let filtered_key_values = file_metadata.key_value_metadata().and_then(|key_values| {
+        let filtered = key_values
+            .iter()
+            .filter(|kv| kv.key != PARQUET_METADATA_KEY)
+            .cloned()
+            .collect::<Vec<_>>();
+        (!filtered.is_empty()).then_some(filtered)
+    });
+    let stripped_file_metadata = FileMetaData::new(
+        file_metadata.version(),
+        file_metadata.num_rows(),
+        file_metadata.created_by().map(ToString::to_string),
+        filtered_key_values,
+        file_metadata.schema_descr_ptr(),
+        file_metadata.column_orders().cloned(),
+    );
+
+    let mut builder = parquet_metadata.into_builder();
+    let row_groups = builder.take_row_groups();
+    let column_index = builder.take_column_index();
+    let offset_index = builder.take_offset_index();
+
+    parquet::file::metadata::ParquetMetaDataBuilder::new(stripped_file_metadata)
+        .set_row_groups(row_groups)
+        .set_column_index(column_index)
+        .set_offset_index(offset_index)
+        .build()
+}
+
 /// Cache strategies that may only enable a subset of caches.
 #[derive(Clone)]
 pub enum CacheStrategy {
@@ -84,18 +189,17 @@ pub enum CacheStrategy {
 }
 
 impl CacheStrategy {
-    /// Gets parquet metadata with cache metrics tracking.
-    /// Returns the metadata and updates the provided metrics.
-    pub(crate) async fn get_parquet_meta_data(
+    /// Gets fused SST metadata with cache metrics tracking.
+    pub(crate) async fn get_sst_meta_data(
         &self,
         file_id: RegionFileId,
         metrics: &mut MetadataCacheMetrics,
         page_index_policy: PageIndexPolicy,
-    ) -> Option<Arc<ParquetMetaData>> {
+    ) -> Option<Arc<CachedSstMeta>> {
         match self {
             CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
                 cache_manager
-                    .get_parquet_meta_data(file_id, metrics, page_index_policy)
+                    .get_sst_meta_data(file_id, metrics, page_index_policy)
                     .await
             }
             CacheStrategy::Disabled => {
@@ -105,30 +209,48 @@ impl CacheStrategy {
         }
     }
 
-    /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()].
-    pub fn get_parquet_meta_data_from_mem_cache(
+    /// Calls [CacheManager::get_sst_meta_data_from_mem_cache()].
+    pub(crate) fn get_sst_meta_data_from_mem_cache(
         &self,
         file_id: RegionFileId,
-    ) -> Option<Arc<ParquetMetaData>> {
+    ) -> Option<Arc<CachedSstMeta>> {
         match self {
-            CacheStrategy::EnableAll(cache_manager) => {
-                cache_manager.get_parquet_meta_data_from_mem_cache(file_id)
-            }
-            CacheStrategy::Compaction(cache_manager) => {
-                cache_manager.get_parquet_meta_data_from_mem_cache(file_id)
+            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
+                cache_manager.get_sst_meta_data_from_mem_cache(file_id)
             }
             CacheStrategy::Disabled => None,
         }
     }
 
-    /// Calls [CacheManager::put_parquet_meta_data()].
-    pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc<ParquetMetaData>) {
+    /// Returns the parquet part of [Self::get_sst_meta_data_from_mem_cache()].
+    pub fn get_parquet_meta_data_from_mem_cache(
+        &self,
+        file_id: RegionFileId,
+    ) -> Option<Arc<ParquetMetaData>> {
+        self.get_sst_meta_data_from_mem_cache(file_id)
+            .map(|metadata| metadata.parquet_metadata())
+    }
+
+    /// Calls [CacheManager::put_sst_meta_data()].
+    pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc<CachedSstMeta>) {
         match self {
-            CacheStrategy::EnableAll(cache_manager) => {
-                cache_manager.put_parquet_meta_data(file_id, metadata);
+            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
+                cache_manager.put_sst_meta_data(file_id, metadata);
             }
-            CacheStrategy::Compaction(cache_manager) => {
-                cache_manager.put_parquet_meta_data(file_id, metadata);
+            CacheStrategy::Disabled => {}
+        }
+    }
+
+    /// Calls [CacheManager::put_parquet_meta_data()].
+    pub fn put_parquet_meta_data(
+        &self,
+        file_id: RegionFileId,
+        metadata: Arc<ParquetMetaData>,
+        region_metadata: Option<RegionMetadataRef>,
+    ) {
+        match self {
+            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
+                cache_manager.put_parquet_meta_data(file_id, metadata, region_metadata);
             }
             CacheStrategy::Disabled => {}
         }
@@ -368,6 +490,35 @@ impl CacheManager {
         CacheManagerBuilder::default()
     }
 
+    /// Gets fused SST metadata with metrics tracking.
+    /// Tries in-memory cache first, then file cache, updating metrics accordingly.
+    pub(crate) async fn get_sst_meta_data(
+        &self,
+        file_id: RegionFileId,
+        metrics: &mut MetadataCacheMetrics,
+        page_index_policy: PageIndexPolicy,
+    ) -> Option<Arc<CachedSstMeta>> {
+        if let Some(metadata) = self.get_sst_meta_data_from_mem_cache(file_id) {
+            metrics.mem_cache_hit += 1;
+            return Some(metadata);
+        }
+
+        let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet);
+        if let Some(write_cache) = &self.write_cache
+            && let Some(metadata) = write_cache
+                .file_cache()
+                .get_sst_meta_data(key, metrics, page_index_policy)
+                .await
+        {
+            metrics.file_cache_hit += 1;
+            self.put_sst_meta_data(file_id, metadata.clone());
+            return Some(metadata);
+        }
+
+        metrics.cache_miss += 1;
+        None
+    }
+
     /// Gets cached [ParquetMetaData] with metrics tracking.
     /// Tries in-memory cache first, then file cache, updating metrics accordingly.
     pub(crate) async fn get_parquet_meta_data(
@@ -376,29 +527,21 @@ impl CacheManager {
         metrics: &mut MetadataCacheMetrics,
         page_index_policy: PageIndexPolicy,
     ) -> Option<Arc<ParquetMetaData>> {
-        // Try to get metadata from sst meta cache
-        if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) {
-            metrics.mem_cache_hit += 1;
-            return Some(metadata);
-        }
+        self.get_sst_meta_data(file_id, metrics, page_index_policy)
+            .await
+            .map(|metadata| metadata.parquet_metadata())
+    }
 
-        // Try to get metadata from write cache
-        let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet);
-        if let Some(write_cache) = &self.write_cache
-            && let Some(metadata) = write_cache
-                .file_cache()
-                .get_parquet_meta_data(key, metrics, page_index_policy)
-                .await
-        {
-            metrics.file_cache_hit += 1;
-            let metadata = Arc::new(metadata);
-            // Put metadata into sst meta cache
-            self.put_parquet_meta_data(file_id, metadata.clone());
-            return Some(metadata);
-        };
-        metrics.cache_miss += 1;
-
-        None
+    /// Gets cached fused SST metadata from in-memory cache.
+    /// This method does not perform I/O.
+    pub(crate) fn get_sst_meta_data_from_mem_cache(
+        &self,
+        file_id: RegionFileId,
+    ) -> Option<Arc<CachedSstMeta>> {
+        self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
+            let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id()));
+            update_hit_miss(value, SST_META_TYPE)
+        })
     }
 
     /// Gets cached [ParquetMetaData] from in-memory cache.
@@ -407,15 +550,12 @@ impl CacheManager {
         &self,
         file_id: RegionFileId,
     ) -> Option<Arc<ParquetMetaData>> {
-        // Try to get metadata from sst meta cache
-        self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
-            let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id()));
-            update_hit_miss(value, SST_META_TYPE)
-        })
+        self.get_sst_meta_data_from_mem_cache(file_id)
+            .map(|metadata| metadata.parquet_metadata())
     }
 
-    /// Puts [ParquetMetaData] into the cache.
-    pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc<ParquetMetaData>) {
+    /// Puts fused SST metadata into the cache.
+    pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc<CachedSstMeta>) {
         if let Some(cache) = &self.sst_meta_cache {
             let key = SstMetaKey(file_id.region_id(), file_id.file_id());
             CACHE_BYTES
@@ -425,6 +565,34 @@ impl CacheManager {
         }
     }
 
+    /// Puts [ParquetMetaData] into the cache.
+    pub fn put_parquet_meta_data(
+        &self,
+        file_id: RegionFileId,
+        metadata: Arc<ParquetMetaData>,
+        region_metadata: Option<RegionMetadataRef>,
+    ) {
+        if self.sst_meta_cache.is_some() {
+            let file_path = format!(
+                "region_id={}, file_id={}",
+                file_id.region_id(),
+                file_id.file_id()
+            );
+            match CachedSstMeta::try_new_with_region_metadata(
+                &file_path,
+                Arc::unwrap_or_clone(metadata),
+                region_metadata,
+            ) {
+                Ok(metadata) => self.put_sst_meta_data(file_id, Arc::new(metadata)),
+                Err(err) => warn!(
+                    err; "Failed to decode region metadata while caching parquet metadata, region_id: {}, file_id: {}",
+                    file_id.region_id(),
+                    file_id.file_id()
+                ),
+            }
+        }
+    }
+
     /// Removes [ParquetMetaData] from the cache.
     pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) {
         if let Some(cache) = &self.sst_meta_cache {
@@ -809,9 +977,9 @@ impl CacheManagerBuilder {
     }
 }
 
-fn meta_cache_weight(k: &SstMetaKey, v: &Arc<ParquetMetaData>) -> u32 {
+fn meta_cache_weight(k: &SstMetaKey, v: &Arc<CachedSstMeta>) -> u32 {
     // We ignore the size of `Arc`.
-    (k.estimated_size() + parquet_meta_size(v)) as u32
+    (k.estimated_size() + parquet_meta_size(&v.parquet_metadata) + v.region_metadata_weight) as u32
 }
 
 fn vector_cache_weight(_k: &(ConcreteDataType, Value), v: &VectorRef) -> u32 {
@@ -977,8 +1145,8 @@ impl SelectorResultValue {
     }
 }
 
-/// Maps (region id, file id) to [ParquetMetaData].
-type SstMetaCache = Cache<SstMetaKey, Arc<ParquetMetaData>>;
+/// Maps (region id, file id) to fused SST metadata.
+type SstMetaCache = Cache<SstMetaKey, Arc<CachedSstMeta>>;
 /// Maps [Value] to a vector that holds this value repeatedly.
 ///
 /// e.g. `"hello" => ["hello", "hello", "hello"]`
@@ -994,15 +1162,20 @@ type RangeResultCache = Cache<RangeScanCacheKey, Arc<RangeScanCacheValue>>;
 mod tests {
     use std::sync::Arc;
 
+    use api::v1::SemanticType;
     use api::v1::index::{BloomFilterMeta, InvertedIndexMetas};
+    use datatypes::schema::ColumnSchema;
     use datatypes::vectors::Int64Vector;
     use puffin::file_metadata::FileMetadata;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
     use store_api::storage::ColumnId;
 
     use super::*;
     use crate::cache::index::bloom_filter_index::Tag;
     use crate::cache::index::result_cache::PredicateKey;
-    use crate::cache::test_util::parquet_meta;
+    use crate::cache::test_util::{
+        parquet_meta, sst_parquet_meta, sst_parquet_meta_with_region_metadata,
+    };
     use crate::read::range_cache::{
         RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder,
     };
@@ -1019,7 +1192,7 @@ mod tests {
         let file_id = RegionFileId::new(region_id, FileId::random());
         let metadata = parquet_meta();
         let mut metrics = MetadataCacheMetrics::default();
-        cache.put_parquet_meta_data(file_id, metadata);
+        cache.put_parquet_meta_data(file_id, metadata, None);
         assert!(
             cache
                 .get_parquet_meta_data(file_id, &mut metrics, Default::default())
@@ -1056,13 +1229,23 @@ mod tests {
                 .await
                 .is_none()
         );
-        let metadata = parquet_meta();
-        cache.put_parquet_meta_data(file_id, metadata);
+        let (metadata, region_metadata) = sst_parquet_meta();
+        cache.put_parquet_meta_data(file_id, metadata, None);
+        let cached = cache
+            .get_sst_meta_data(file_id, &mut metrics, Default::default())
+            .await
+            .unwrap();
+        assert_eq!(region_metadata, cached.region_metadata());
         assert!(
-            cache
-                .get_parquet_meta_data(file_id, &mut metrics, Default::default())
-                .await
-                .is_some()
+            cached
+                .parquet_metadata()
+                .file_metadata()
+                .key_value_metadata()
+                .is_none_or(|key_values| {
+                    key_values
+                        .iter()
+                        .all(|key_value| key_value.key != PARQUET_METADATA_KEY)
+                })
         );
         cache.remove_parquet_meta_data(file_id);
         assert!(
@@ -1073,6 +1256,42 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_parquet_meta_cache_with_provided_region_metadata() {
+        let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
+        let mut metrics = MetadataCacheMetrics::default();
+        let region_id = RegionId::new(1, 1);
+        let file_id = RegionFileId::new(region_id, FileId::random());
+        let (metadata, region_metadata) = sst_parquet_meta();
+
+        cache.put_parquet_meta_data(file_id, metadata, Some(region_metadata.clone()));
+
+        let cached = cache
+            .get_sst_meta_data(file_id, &mut metrics, Default::default())
+            .await
+            .unwrap();
+        assert!(Arc::ptr_eq(&region_metadata, &cached.region_metadata()));
+    }
+
+    #[test]
+    fn test_meta_cache_weight_accounts_for_decoded_region_metadata() {
+        let region_metadata = Arc::new(wide_region_metadata(128));
+        let json_len = region_metadata.to_json().unwrap().len();
+        let metadata = sst_parquet_meta_with_region_metadata(region_metadata.clone());
+        let cached = Arc::new(
+            CachedSstMeta::try_new("test.parquet", Arc::unwrap_or_clone(metadata)).unwrap(),
+        );
+        let key = SstMetaKey(region_metadata.region_id, FileId::random());
+
+        assert!(cached.region_metadata_weight > json_len);
+        assert_eq!(
+            meta_cache_weight(&key, &cached) as usize,
+            key.estimated_size()
+                + parquet_meta_size(&cached.parquet_metadata)
+                + cached.region_metadata_weight
+        );
+    }
+
     #[test]
     fn test_repeated_vector_cache() {
         let cache = CacheManager::builder().vector_cache_size(4096).build();
@@ -1256,4 +1475,45 @@ mod tests {
         assert!(result_cache.get(&predicate, index_id.file_id()).is_none());
         assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none());
     }
+
+    fn wide_region_metadata(column_count: u32) -> RegionMetadata {
+        let region_id = RegionId::new(1024, 7);
+        let mut builder = RegionMetadataBuilder::new(region_id);
+        let mut primary_key = Vec::new();
+
+        for column_id in 0..column_count {
+            let semantic_type = if column_id < 32 {
+                primary_key.push(column_id);
+                SemanticType::Tag
+            } else {
+                SemanticType::Field
+            };
+            let mut column_schema = ColumnSchema::new(
+                format!("wide_column_{column_id}"),
+                ConcreteDataType::string_datatype(),
+                true,
+            );
+            column_schema
+                .mut_metadata()
+                .insert(format!("cache_key_{column_id}"), "cache_value".repeat(4));
+            builder.push_column_metadata(ColumnMetadata {
+                column_schema,
+                semantic_type,
+                column_id,
+            });
+        }
+
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            ),
+            semantic_type: SemanticType::Timestamp,
+            column_id: column_count,
+        });
+        builder.primary_key(primary_key);
+
+        builder.build().unwrap()
+    }
 }
diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs
index 32a276d0e4..278838b369 100644
--- a/src/mito2/src/cache/file_cache.rs
+++ b/src/mito2/src/cache/file_cache.rs
@@ -34,7 +34,7 @@ use store_api::storage::{FileId, RegionId};
 use tokio::sync::mpsc::{Sender, UnboundedReceiver};
 
 use crate::access_layer::TempFileCleaner;
-use crate::cache::{FILE_TYPE, INDEX_TYPE};
+use crate::cache::{CachedSstMeta, FILE_TYPE, INDEX_TYPE};
 use crate::error::{self, OpenDalSnafu, Result};
 use crate::metrics::{
     CACHE_BYTES, CACHE_HIT, CACHE_MISS, WRITE_CACHE_DOWNLOAD_BYTES_TOTAL,
@@ -612,6 +612,34 @@ impl FileCache {
         }
     }
 
+    /// Get fused SST metadata from the file cache.
+    /// If the file is not in the cache, or metadata loading/decoding fails, return None.
+    pub(crate) async fn get_sst_meta_data(
+        &self,
+        key: IndexKey,
+        cache_metrics: &mut MetadataCacheMetrics,
+        page_index_policy: PageIndexPolicy,
+    ) -> Option<Arc<CachedSstMeta>> {
+        let file_path = self.inner.cache_file_path(key);
+        self.get_parquet_meta_data(key, cache_metrics, page_index_policy)
+            .await
+            .and_then(
+                |metadata| match CachedSstMeta::try_new(&file_path, metadata) {
+                    Ok(metadata) => Some(Arc::new(metadata)),
+                    Err(err) => {
+                        CACHE_MISS
+                            .with_label_values(&[key.file_type.metric_label()])
+                            .inc();
+                        warn!(
+                            err; "Failed to decode cached parquet metadata for key {:?}",
+                            key
+                        );
+                        None
+                    }
+                },
+            )
+    }
+
     async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
         if self.inner.local_store.exists(file_path).await? {
             Ok(Some(self.inner.local_store.reader(file_path).await?))
diff --git a/src/mito2/src/cache/test_util.rs b/src/mito2/src/cache/test_util.rs
index 65ad9d87eb..ef3d8e9315 100644
--- a/src/mito2/src/cache/test_util.rs
+++ b/src/mito2/src/cache/test_util.rs
@@ -23,8 +23,13 @@ use object_store::ObjectStore;
 use object_store::services::Fs;
 use parquet::arrow::ArrowWriter;
 use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::file::metadata::ParquetMetaData;
+use parquet::file::metadata::{KeyValue, ParquetMetaData};
+use parquet::file::properties::WriterProperties;
 use parquet::file::statistics::Statistics;
+use store_api::metadata::RegionMetadataRef;
+
+use crate::sst::parquet::PARQUET_METADATA_KEY;
+use crate::test_util::sst_util::sst_region_metadata;
 
 /// Returns a parquet meta data.
 pub(crate) fn parquet_meta() -> Arc<ParquetMetaData> {
@@ -33,13 +38,43 @@ pub(crate) fn parquet_meta() -> Arc<ParquetMetaData> {
     builder.metadata().clone()
 }
 
+/// Returns parquet metadata for an SST parquet file and its decoded region metadata.
+pub(crate) fn sst_parquet_meta() -> (Arc<ParquetMetaData>, RegionMetadataRef) {
+    let region_metadata = Arc::new(sst_region_metadata());
+    let file_data = parquet_file_data_with_region_metadata(&region_metadata);
+    let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap();
+    (builder.metadata().clone(), region_metadata)
+}
+
+/// Returns parquet metadata for an SST parquet file with custom region metadata.
+pub(crate) fn sst_parquet_meta_with_region_metadata(
+    region_metadata: RegionMetadataRef,
+) -> Arc<ParquetMetaData> {
+    let file_data = parquet_file_data_with_region_metadata(&region_metadata);
+    let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap();
+    builder.metadata().clone()
+}
+
 /// Write a test parquet file to a buffer
 fn parquet_file_data() -> Vec<u8> {
+    parquet_file_data_inner(None)
+}
+
+fn parquet_file_data_with_region_metadata(region_metadata: &RegionMetadataRef) -> Vec<u8> {
+    let json = region_metadata.to_json().unwrap();
+    let key_value = KeyValue::new(PARQUET_METADATA_KEY.to_string(), json);
+    parquet_file_data_inner(Some(vec![key_value]))
+}
+
+fn parquet_file_data_inner(key_value_metadata: Option<Vec<KeyValue>>) -> Vec<u8> {
     let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
     let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap();
 
     let mut buffer = Vec::new();
-    let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), None).unwrap();
+    let props = WriterProperties::builder()
+        .set_key_value_metadata(key_value_metadata)
+        .build();
+    let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), Some(props)).unwrap();
     writer.write(&to_write).unwrap();
     writer.close().unwrap();
 
diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs
index 3d373efe91..e2483ed4e4 100644
--- a/src/mito2/src/cache/write_cache.rs
+++ b/src/mito2/src/cache/write_cache.rs
@@ -693,9 +693,15 @@ mod tests {
         .cache(CacheStrategy::EnableAll(cache_manager.clone()))
         .page_index_policy(PageIndexPolicy::Optional);
         let reader = builder.build().await.unwrap().unwrap();
+        let cached_write_parquet_metadata = crate::cache::CachedSstMeta::try_new(
+            "test.sst",
+            Arc::unwrap_or_clone(write_parquet_metadata),
+        )
+        .unwrap()
+        .parquet_metadata();
 
         // Check parquet metadata
-        assert_parquet_metadata_equal(write_parquet_metadata, reader.parquet_metadata());
+        assert_parquet_metadata_equal(cached_write_parquet_metadata, reader.parquet_metadata());
     }
 
     #[tokio::test]
diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs
index 014c50820f..d089493f81 100644
--- a/src/mito2/src/region/opener.rs
+++ b/src/mito2/src/region/opener.rs
@@ -1043,7 +1043,7 @@ async fn preload_parquet_meta_cache_for_files(
         let loader = MetadataLoader::new(object_store.clone(), &file_path, file_size);
         match loader.load(&mut cache_metrics).await {
             Ok(metadata) => {
-                cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata));
+                cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata), None);
                 loaded += 1;
             }
             Err(err) => {
@@ -1153,6 +1153,8 @@ mod tests {
     use object_store::ObjectStore;
     use object_store::services::{Fs, Memory};
     use parquet::arrow::ArrowWriter;
+    use parquet::file::metadata::KeyValue;
+    use parquet::file::properties::WriterProperties;
     use store_api::region_request::PathType;
     use store_api::storage::{FileId, RegionId};
 
@@ -1161,7 +1163,27 @@ mod tests {
     use crate::cache::file_cache::{FileType, IndexKey};
     use crate::sst::file::{FileHandle, FileMeta};
     use crate::sst::file_purger::NoopFilePurger;
+    use crate::sst::parquet::PARQUET_METADATA_KEY;
     use crate::test_util::TestEnv;
+    use crate::test_util::sst_util::sst_region_metadata;
+
+    fn sst_parquet_bytes(batch: &RecordBatch) -> Vec<u8> {
+        let key_value_meta = KeyValue::new(
+            PARQUET_METADATA_KEY.to_string(),
+            sst_region_metadata().to_json().unwrap(),
+        );
+        let props = WriterProperties::builder()
+            .set_key_value_metadata(Some(vec![key_value_meta]))
+            .build();
+
+        let mut parquet_bytes = Vec::new();
+        let mut writer =
+            ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), Some(props)).unwrap();
+        writer.write(batch).unwrap();
+        writer.close().unwrap();
+
+        parquet_bytes
+    }
 
     #[tokio::test]
     async fn test_preload_parquet_meta_cache_uses_file_cache() {
@@ -1183,10 +1205,7 @@ mod tests {
 
         let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
         let batch = RecordBatch::try_from_iter([("col", col)]).unwrap();
-        let mut parquet_bytes = Vec::new();
-        let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap();
-        writer.write(&batch).unwrap();
-        writer.close().unwrap();
+        let parquet_bytes = sst_parquet_bytes(&batch);
         let file_size = parquet_bytes.len() as u64;
 
         let file_meta = FileMeta {
@@ -1334,10 +1353,7 @@ mod tests {
 
         let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
         let batch = RecordBatch::try_from_iter([("col", col)]).unwrap();
-        let mut parquet_bytes = Vec::new();
-        let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap();
-        writer.write(&batch).unwrap();
-        writer.close().unwrap();
+        let parquet_bytes = sst_parquet_bytes(&batch);
 
         // file_size is 0 when it's missing/defaulted in manifests; MetadataLoader::load will stat
         // the local filesystem to retrieve it.
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index 1c5bfd9db0..26bed76fd6 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -383,8 +383,12 @@ mod tests {
         .page_index_policy(PageIndexPolicy::Optional);
         let reader = builder.build().await.unwrap().unwrap();
         let reader_metadata = reader.parquet_metadata();
+        let cached_writer_metadata =
+            crate::cache::CachedSstMeta::try_new("test.sst", Arc::unwrap_or_clone(writer_metadata))
+                .unwrap()
+                .parquet_metadata();
 
-        assert_parquet_metadata_equal(writer_metadata, reader_metadata);
+        assert_parquet_metadata_equal(cached_writer_metadata, reader_metadata);
     }
 
     #[tokio::test]
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 4d7122ccc6..855204b80e 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -34,22 +34,21 @@ use mito_codec::row_converter::build_primary_key_codec;
 use object_store::ObjectStore;
 use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection};
 use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels};
-use parquet::file::metadata::{KeyValue, PageIndexPolicy, ParquetMetaData};
+use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
 use partition::expr::PartitionExpr;
-use snafu::{OptionExt, ResultExt};
+use snafu::ResultExt;
 use store_api::codec::PrimaryKeyEncoding;
 use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef};
 use store_api::region_request::PathType;
 use store_api::storage::{ColumnId, FileId};
 use table::predicate::Predicate;
 
-use crate::cache::CacheStrategy;
 use crate::cache::index::result_cache::PredicateKey;
+use crate::cache::{CacheStrategy, CachedSstMeta};
 #[cfg(feature = "vector_index")]
 use crate::error::ApplyVectorIndexSnafu;
 use crate::error::{
-    ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadDataPartSnafu,
-    ReadParquetSnafu, Result, SerializePartitionExprSnafu,
+    ArrowReaderSnafu, ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu,
 };
 use crate::metrics::{
     PRECISE_FILTER_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL,
@@ -70,6 +69,7 @@ use crate::sst::index::inverted_index::applier::{
 };
 #[cfg(feature = "vector_index")]
 use crate::sst::index::vector_index::applier::VectorIndexApplierRef;
+use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
 use crate::sst::parquet::file_range::{
     FileRangeContext, FileRangeContextRef, PartitionFilterContext, PreFilterMode, RangeBase,
     row_group_contains_delete,
@@ -79,7 +79,6 @@ use crate::sst::parquet::metadata::MetadataLoader;
 use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics};
 use crate::sst::parquet::row_selection::RowGroupSelection;
 use crate::sst::parquet::stats::RowGroupPruningStats;
-use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY};
 use crate::sst::tag_maybe_to_dictionary_field;
 
 const INDEX_TYPE_FULLTEXT: &str = "fulltext";
@@ -340,7 +339,7 @@ impl ParquetReaderBuilder {
         let file_size = self.file_handle.meta_ref().file_size;
 
         // Loads parquet metadata of the file.
-        let (parquet_meta, cache_miss) = self
+        let (sst_meta, cache_miss) = self
             .read_parquet_metadata(
                 &file_path,
                 file_size,
@@ -348,9 +347,8 @@ impl ParquetReaderBuilder {
                 self.page_index_policy,
             )
             .await?;
-        // Decodes region metadata.
-        let key_value_meta = parquet_meta.file_metadata().key_value_metadata();
-        let region_meta = Arc::new(Self::get_region_metadata(&file_path, key_value_meta)?);
+        let parquet_meta = sst_meta.parquet_metadata();
+        let region_meta = sst_meta.region_metadata();
         let region_partition_expr_str = self
             .expected_metadata
             .as_ref()
@@ -601,42 +599,15 @@ impl ParquetReaderBuilder {
         }))
     }
 
-    /// Decodes region metadata from key value.
-    fn get_region_metadata(
-        file_path: &str,
-        key_value_meta: Option<&Vec<KeyValue>>,
-    ) -> Result<RegionMetadata> {
-        let key_values = key_value_meta.context(InvalidParquetSnafu {
-            file: file_path,
-            reason: "missing key value meta",
-        })?;
-        let meta_value = key_values
-            .iter()
-            .find(|kv| kv.key == PARQUET_METADATA_KEY)
-            .with_context(|| InvalidParquetSnafu {
-                file: file_path,
-                reason: format!("key {} not found", PARQUET_METADATA_KEY),
-            })?;
-        let json = meta_value
-            .value
-            .as_ref()
-            .with_context(|| InvalidParquetSnafu {
-                file: file_path,
-                reason: format!("No value for key {}", PARQUET_METADATA_KEY),
-            })?;
-
-        RegionMetadata::from_json(json).context(InvalidMetadataSnafu)
-    }
-
     /// Reads parquet metadata of specific file.
-    /// Returns (metadata, cache_miss_flag).
+    /// Returns (fused metadata, cache_miss_flag).
     async fn read_parquet_metadata(
         &self,
         file_path: &str,
         file_size: u64,
         cache_metrics: &mut MetadataCacheMetrics,
         page_index_policy: PageIndexPolicy,
-    ) -> Result<(Arc<ParquetMetaData>, bool)> {
+    ) -> Result<(Arc<CachedSstMeta>, bool)> {
         let start = Instant::now();
         let _t = READ_STAGE_ELAPSED
             .with_label_values(&["read_parquet_metadata"])
@@ -646,7 +617,7 @@ impl ParquetReaderBuilder {
         // Tries to get from cache with metrics tracking.
         if let Some(metadata) = self
             .cache_strategy
-            .get_parquet_meta_data(file_id, cache_metrics, page_index_policy)
+            .get_sst_meta_data(file_id, cache_metrics, page_index_policy)
             .await
         {
             cache_metrics.metadata_load_cost += start.elapsed();
@@ -659,10 +630,10 @@ impl ParquetReaderBuilder {
         metadata_loader.with_page_index_policy(page_index_policy);
         let metadata = metadata_loader.load(cache_metrics).await?;
 
-        let metadata = Arc::new(metadata);
+        let metadata = Arc::new(CachedSstMeta::try_new(file_path, metadata)?);
         // Cache the metadata.
         self.cache_strategy
-            .put_parquet_meta_data(file_id, metadata.clone());
+            .put_sst_meta_data(file_id, metadata.clone());
 
         cache_metrics.metadata_load_cost += start.elapsed();
         Ok((metadata, true))
diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs
index d571a5392f..0c663bccc0 100644
--- a/src/store-api/src/metadata.rs
+++ b/src/store-api/src/metadata.rs
@@ -18,8 +18,8 @@
 
 use std::any::Any;
 use std::collections::{HashMap, HashSet};
-use std::fmt;
 use std::sync::Arc;
+use std::{fmt, mem};
 
 use api::v1::SemanticType;
 use api::v1::column_def::try_as_column_schema;
@@ -99,6 +99,12 @@ impl ColumnMetadata {
     pub fn is_same_datatype(&self, other: &Self) -> bool {
         self.column_schema.data_type == other.column_schema.data_type
     }
+
+    /// Returns the estimated memory footprint of this metadata.
+    pub fn estimated_size(&self) -> usize {
+        mem::size_of_val(self) - mem::size_of_val(&self.column_schema)
+            + self.column_schema.estimated_size()
+    }
 }
 
 #[cfg_attr(doc, aquamarine::aquamarine)]
@@ -226,6 +232,25 @@ impl RegionMetadata {
         serde_json::from_str(s).context(SerdeJsonSnafu)
     }
 
+    /// Returns the estimated memory footprint of this metadata.
+    pub fn estimated_size(&self) -> usize {
+        mem::size_of_val(self)
+            + mem::size_of::<ColumnMetadata>() * self.column_metadatas.capacity()
+            + self
+                .column_metadatas
+                .iter()
+                .map(|column| column.estimated_size() - mem::size_of::<ColumnMetadata>())
+                .sum::<usize>()
+            + mem::size_of::<ColumnId>() * self.primary_key.capacity()
+            + mem::size_of::<(ColumnId, usize)>() * self.id_to_index.capacity()
+            + self.schema.estimated_size()
+            + self
+                .partition_expr
+                .as_ref()
+                .map(|expr| expr.capacity())
+                .unwrap_or_default()
+    }
+
     /// Encode the metadata to a JSON string.
     pub fn to_json(&self) -> Result<String> {
         serde_json::to_string(&self).context(SerdeJsonSnafu)

From 16fcbb27298f106c0142ac3c2ba7c4865da6f0f5 Mon Sep 17 00:00:00 2001
From: jeremyhi <jiachun_feng@proton.me>
Date: Thu, 19 Mar 2026 14:26:41 -0700
Subject: [PATCH 22/42] feat: export import v2 pr1 (#7785)

* feat: v2 schema handling

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* feat: impl m1.5 ddl export/import and schema tests

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: git ignore update

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: add license header

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: make fmt-check happy

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: Run imported DDL against the intended schema

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: Canonicalize schema names after case-insensitive check

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: escape sql funcs

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: Fixed by carrying explicit execution_schema in DdlStatement instead of parsing schema from SQL

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix: Fixed by encoding schema names as safe path segments in shared DDL path helpers

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* refactor(cli): make export/import v2 schema recovery DDL-only

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: by clippy

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: follow our styling

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix(cli): reject remote snapshot URIs with empty root

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix(cli): dedupe schema filters after canonicalization

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* fix(cli): schema-scoped detection to cover external tables

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

---------

Signed-off-by: jeremyhi <fengjiachun@gmail.com>
---
 .gitignore                               |   3 +
 Cargo.lock                               |   1 +
 docs/rfcs/2025-12-30-export-import-v2.md |  11 +-
 src/cli/Cargo.toml                       |   3 +-
 src/cli/src/data.rs                      |  16 +
 src/cli/src/data/export.rs               |  10 +-
 src/cli/src/data/export_v2.rs            |  49 ++
 src/cli/src/data/export_v2/command.rs    | 496 +++++++++++++++++
 src/cli/src/data/export_v2/error.rs      | 181 +++++++
 src/cli/src/data/export_v2/extractor.rs  | 254 +++++++++
 src/cli/src/data/export_v2/manifest.rs   | 381 +++++++++++++
 src/cli/src/data/export_v2/schema.rs     |  98 ++++
 src/cli/src/data/export_v2/tests.rs      | 341 ++++++++++++
 src/cli/src/data/import.rs               |  11 +-
 src/cli/src/data/import_v2.rs            |  41 ++
 src/cli/src/data/import_v2/command.rs    | 542 +++++++++++++++++++
 src/cli/src/data/import_v2/error.rs      |  82 +++
 src/cli/src/data/import_v2/executor.rs   | 122 +++++
 src/cli/src/data/path.rs                 |  76 +++
 src/cli/src/data/snapshot_storage.rs     | 649 +++++++++++++++++++++++
 src/cli/src/data/sql.rs                  |  40 ++
 src/cli/src/database.rs                  |  21 +-
 src/cli/src/lib.rs                       |   2 +-
 23 files changed, 3412 insertions(+), 18 deletions(-)
 create mode 100644 src/cli/src/data/export_v2.rs
 create mode 100644 src/cli/src/data/export_v2/command.rs
 create mode 100644 src/cli/src/data/export_v2/error.rs
 create mode 100644 src/cli/src/data/export_v2/extractor.rs
 create mode 100644 src/cli/src/data/export_v2/manifest.rs
 create mode 100644 src/cli/src/data/export_v2/schema.rs
 create mode 100644 src/cli/src/data/export_v2/tests.rs
 create mode 100644 src/cli/src/data/import_v2.rs
 create mode 100644 src/cli/src/data/import_v2/command.rs
 create mode 100644 src/cli/src/data/import_v2/error.rs
 create mode 100644 src/cli/src/data/import_v2/executor.rs
 create mode 100644 src/cli/src/data/path.rs
 create mode 100644 src/cli/src/data/snapshot_storage.rs
 create mode 100644 src/cli/src/data/sql.rs

diff --git a/.gitignore b/.gitignore
index 862eb8c5b4..87412d570c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,3 +70,6 @@ CLAUDE.md
 
 # AGENTS.md
 AGENTS.md
+
+# local design docs
+docs/specs/
diff --git a/Cargo.lock b/Cargo.lock
index 605b037fc9..1b2a44d0e4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1946,6 +1946,7 @@ dependencies = [
  "tokio",
  "tracing-appender",
  "url",
+ "uuid",
 ]
 
 [[package]]
diff --git a/docs/rfcs/2025-12-30-export-import-v2.md b/docs/rfcs/2025-12-30-export-import-v2.md
index 197eb7cc9d..6bc8428300 100644
--- a/docs/rfcs/2025-12-30-export-import-v2.md
+++ b/docs/rfcs/2025-12-30-export-import-v2.md
@@ -67,6 +67,7 @@ snapshot-20250101/
 - Self-contained (all information needed for restore)
 - Immutable (content never changes after creation)
 - Verifiable (checksums at file, chunk, and snapshot levels)
+- Schema-only snapshots contain only `manifest.json` and `schema/`; `data/` is absent, `chunks` is empty, and later data append is rejected (use `--force` to recreate)
 
 ### Chunk
 
@@ -116,6 +117,8 @@ greptime export create \
   --schema-only \
   --to s3://my-bucket/snapshots/prod-schema-only
 
+# Note: schema-only snapshots cannot be resumed with data; use `--force` to recreate.
+
 # Export with specific format (default: parquet)
 greptime export create \
   --format csv \
@@ -173,7 +176,9 @@ The manifest is a JSON file containing snapshot metadata and chunk index:
 - `snapshot_id`: Unique identifier (UUID)
 - `catalog`, `schemas`: Catalog and schema list
 - `time_range`: Overall time range covered
+- `schema_only`: Whether the snapshot contains schema only
 - `chunks[]`: Array of chunk metadata
+- `format`: Data format for exported files
 - `checksum`: Snapshot-level SHA256 checksum
 
 **Chunk metadata structure**:
@@ -182,7 +187,7 @@ Each chunk entry in the manifest contains:
 
 - `id`: Chunk identifier (sequential number)
 - `time_range`: Start and end timestamps
-- `status`: Export status (Pending, Completed, Failed)
+- `status`: Export status (Pending, InProgress, Completed, Failed)
 - `files`: List of data files in the chunk directory
 - `checksum`: Chunk-level checksum for integrity verification
 
@@ -292,9 +297,9 @@ Checksums are verified during import before data is written to the database.
 
 **Resume capability**:
 
-- Manifest tracks chunk status (Pending, Completed, Failed)
+- Manifest tracks chunk status (Pending, InProgress, Completed, Failed)
 - Export/import automatically resumes when executed on existing snapshot
-- Skips completed chunks, retries failed chunks, processes pending chunks
+- Skips completed chunks, retries failed/in-progress chunks, processes pending chunks
 - Works across process restarts
 - Use `--force` (export only) to delete existing snapshot and start over
 
diff --git a/src/cli/Cargo.toml b/src/cli/Cargo.toml
index 46e79efd00..1eb2736007 100644
--- a/src/cli/Cargo.toml
+++ b/src/cli/Cargo.toml
@@ -65,6 +65,8 @@ store-api.workspace = true
 table.workspace = true
 tokio.workspace = true
 tracing-appender.workspace = true
+url.workspace = true
+uuid.workspace = true
 
 [dev-dependencies]
 common-meta = { workspace = true, features = ["testing"] }
@@ -72,4 +74,3 @@ common-test-util.workspace = true
 common-version.workspace = true
 serde.workspace = true
 tempfile.workspace = true
-url.workspace = true
diff --git a/src/cli/src/data.rs b/src/cli/src/data.rs
index 5966040a3b..114886542e 100644
--- a/src/cli/src/data.rs
+++ b/src/cli/src/data.rs
@@ -13,7 +13,12 @@
 // limitations under the License.
 
 mod export;
+pub mod export_v2;
 mod import;
+pub mod import_v2;
+pub(crate) mod path;
+pub mod snapshot_storage;
+pub(crate) mod sql;
 mod storage_export;
 
 use clap::Subcommand;
@@ -22,15 +27,24 @@ use common_error::ext::BoxedError;
 
 use crate::Tool;
 use crate::data::export::ExportCommand;
+use crate::data::export_v2::ExportV2Command;
 use crate::data::import::ImportCommand;
+use crate::data::import_v2::ImportV2Command;
 
 pub(crate) const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";
 
 /// Command for data operations including exporting data from and importing data into GreptimeDB.
 #[derive(Subcommand)]
 pub enum DataCommand {
+    /// Export data (V1 - legacy).
     Export(ExportCommand),
+    /// Import data (V1 - legacy).
     Import(ImportCommand),
+    /// Export V2 - JSON-based schema export with manifest support.
+    #[clap(subcommand)]
+    ExportV2(ExportV2Command),
+    /// Import V2 - Import from V2 snapshot.
+    ImportV2(ImportV2Command),
 }
 
 impl DataCommand {
@@ -38,6 +52,8 @@ impl DataCommand {
         match self {
             DataCommand::Export(cmd) => cmd.build().await,
             DataCommand::Import(cmd) => cmd.build().await,
+            DataCommand::ExportV2(cmd) => cmd.build().await,
+            DataCommand::ImportV2(cmd) => cmd.build().await,
         }
     }
 }
diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs
index 1cdb159336..b5d547d4f3 100644
--- a/src/cli/src/data/export.rs
+++ b/src/cli/src/data/export.rs
@@ -107,13 +107,16 @@ pub struct ExportCommand {
     #[clap(long, value_parser = humantime::parse_duration)]
     timeout: Option<Duration>,
 
-    /// The proxy server address to connect, if set, will override the system proxy.
+    /// The proxy server address to connect.
     ///
-    /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
     #[clap(long)]
     proxy: Option<String>,
 
-    /// Disable proxy server, if set, will not use any proxy.
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
     #[clap(long)]
     no_proxy: bool,
 
@@ -173,6 +176,7 @@ impl ExportCommand {
             // Treats `None` as `0s` to disable server-side default timeout.
             self.timeout.unwrap_or_default(),
             proxy,
+            self.no_proxy,
         );
 
         Ok(Box::new(Export {
diff --git a/src/cli/src/data/export_v2.rs b/src/cli/src/data/export_v2.rs
new file mode 100644
index 0000000000..91020d2f2e
--- /dev/null
+++ b/src/cli/src/data/export_v2.rs
@@ -0,0 +1,49 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Export V2 module.
+//!
+//! This module provides the V2 implementation of database export functionality,
+//! featuring:
+//! - JSON-based schema export (version-agnostic)
+//! - Manifest-based snapshot management
+//! - Support for multiple storage backends (S3, OSS, GCS, Azure Blob, local FS)
+//! - Resume capability for interrupted exports
+//!
+//! # Example
+//!
+//! ```bash
+//! # Export schema only
+//! greptime cli data export-v2 create \
+//!   --addr 127.0.0.1:4000 \
+//!   --to file:///tmp/snapshot \
+//!   --schema-only
+//!
+//! # Export with time range (M2; not implemented yet, currently rejected)
+//! greptime cli data export-v2 create \
+//!   --addr 127.0.0.1:4000 \
+//!   --to s3://bucket/snapshots/prod-20250101 \
+//!   --start-time 2025-01-01T00:00:00Z \
+//!   --end-time 2025-01-31T23:59:59Z
+//! ```
+
+mod command;
+pub mod error;
+pub mod extractor;
+pub mod manifest;
+pub mod schema;
+pub use command::ExportV2Command;
+
+#[cfg(test)]
+mod tests;
diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs
new file mode 100644
index 0000000000..341436fe0f
--- /dev/null
+++ b/src/cli/src/data/export_v2/command.rs
@@ -0,0 +1,496 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Export V2 CLI commands.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use clap::{Parser, Subcommand};
+use common_error::ext::BoxedError;
+use common_telemetry::info;
+use serde_json::Value;
+use snafu::{OptionExt, ResultExt};
+
+use crate::Tool;
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::error::{
+    CannotResumeSchemaOnlySnafu, DataExportNotImplementedSnafu, DatabaseSnafu, EmptyResultSnafu,
+    ManifestVersionMismatchSnafu, Result, UnexpectedValueTypeSnafu,
+};
+use crate::data::export_v2::extractor::SchemaExtractor;
+use crate::data::export_v2::manifest::{DataFormat, MANIFEST_VERSION, Manifest};
+use crate::data::path::ddl_path_for_schema;
+use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
+use crate::database::{DatabaseClient, parse_proxy_opts};
+
+/// Export V2 commands.
+#[derive(Debug, Subcommand)]
+pub enum ExportV2Command {
+    /// Create a new snapshot.
+    Create(ExportCreateCommand),
+}
+
+impl ExportV2Command {
+    /// Builds the [`Tool`] that runs the selected subcommand.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        let Self::Create(create) = self;
+        create.build().await
+    }
+}
+
+/// Create a new snapshot.
+#[derive(Debug, Parser)]
+pub struct ExportCreateCommand {
+    /// Server address to connect (e.g., 127.0.0.1:4000).
+    #[clap(long)]
+    addr: String,
+
+    /// Target storage location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    to: String,
+
+    /// Catalog name.
+    #[clap(long, default_value = "greptime")]
+    catalog: String,
+
+    /// Schema list to export (default: all non-system schemas).
+    /// Can be specified multiple times or comma-separated.
+    #[clap(long, value_delimiter = ',')]
+    schemas: Vec<String>,
+
+    /// Export schema only, no data.
+    #[clap(long)]
+    schema_only: bool,
+
+    /// Time range start (ISO 8601 format, e.g., 2024-01-01T00:00:00Z).
+    #[clap(long)]
+    start_time: Option<String>,
+
+    /// Time range end (ISO 8601 format, e.g., 2024-12-31T23:59:59Z).
+    #[clap(long)]
+    end_time: Option<String>,
+
+    /// Data format: parquet, csv, json.
+    #[clap(long, value_enum, default_value = "parquet")]
+    format: DataFormat,
+
+    /// Delete existing snapshot and recreate.
+    #[clap(long)]
+    force: bool,
+
+    /// Concurrency level (for future use).
+    #[clap(long, default_value = "1")]
+    parallelism: usize,
+
+    /// Basic authentication (user:password).
+    #[clap(long)]
+    auth_basic: Option<String>,
+
+    /// Request timeout.
+    #[clap(long, value_parser = humantime::parse_duration)]
+    timeout: Option<Duration>,
+
+    /// Proxy server address.
+    ///
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
+    #[clap(long)]
+    proxy: Option<String>,
+
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
+    #[clap(long)]
+    no_proxy: bool,
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ExportCreateCommand {
+    /// Checks the CLI arguments and assembles the [`ExportCreate`] tool.
+    ///
+    /// Fails when `validate_uri` rejects `--to`, when `--schema-only` is absent
+    /// (data export is not implemented yet), or when the storage backend or
+    /// the proxy options cannot be built.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        validate_uri(&self.to).map_err(BoxedError::new)?;
+
+        // Only schema snapshots are supported for now.
+        if !self.schema_only {
+            return DataExportNotImplementedSnafu
+                .fail()
+                .map_err(BoxedError::new);
+        }
+
+        // An empty `--schemas` list is passed on as `None` (all schemas).
+        let schemas = (!self.schemas.is_empty()).then(|| self.schemas.clone());
+        let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?;
+        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
+        // A missing `--timeout` falls back to 60 seconds.
+        let timeout = self.timeout.unwrap_or(Duration::from_secs(60));
+        let database_client = DatabaseClient::new(
+            self.addr.clone(),
+            self.catalog.clone(),
+            self.auth_basic.clone(),
+            timeout,
+            proxy,
+            self.no_proxy,
+        );
+
+        let tool = ExportCreate {
+            catalog: self.catalog.clone(),
+            schemas,
+            schema_only: self.schema_only,
+            _format: self.format,
+            force: self.force,
+            _parallelism: self.parallelism,
+            storage: Box::new(storage),
+            database_client,
+        };
+        Ok(Box::new(tool))
+    }
+}
+
+/// Tool that writes a V2 snapshot (schema index, DDL files and manifest) to storage.
+pub struct ExportCreate {
+    catalog: String,
+    schemas: Option<Vec<String>>,
+    schema_only: bool,
+    _format: DataFormat,
+    force: bool,
+    _parallelism: usize,
+    storage: Box<dyn SnapshotStorage>,
+    database_client: DatabaseClient,
+}
+
+#[async_trait]
+impl Tool for ExportCreate {
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        self.run().await.map_err(BoxedError::new)
+    }
+}
+
+impl ExportCreate {
+    /// Creates a schema-only snapshot at the target storage location.
+    ///
+    /// An existing snapshot is deleted first when `--force` is set; without it
+    /// the snapshot is only checked and nothing is written.
+    async fn run(&self) -> Result<()> {
+        // 1. Check if snapshot exists
+        if self.storage.exists().await? {
+            if !self.force {
+                return self.check_existing_snapshot().await;
+            }
+            info!("Deleting existing snapshot (--force)");
+            self.storage.delete_snapshot().await?;
+        }
+
+        // 2. Get schema list
+        let extractor = SchemaExtractor::new(&self.database_client, &self.catalog);
+        let schema_snapshot = extractor.extract(self.schemas.as_deref()).await?;
+        let schema_count = schema_snapshot.schemas.len();
+        let mut schema_names = Vec::with_capacity(schema_count);
+        for schema in schema_snapshot.schemas.iter() {
+            schema_names.push(schema.name.clone());
+        }
+        info!("Exporting schemas: {:?}", schema_names);
+
+        // 3. Create manifest
+        let manifest = Manifest::new_schema_only(self.catalog.clone(), schema_names.clone());
+
+        // 4. Write schema files
+        self.storage.write_schema(&schema_snapshot).await?;
+        info!("Exported {} schemas", schema_count);
+
+        // 5. Export DDL files for import recovery.
+        for (schema, ddl) in self.build_ddl_by_schema(&schema_names).await? {
+            let ddl_path = ddl_path_for_schema(&schema);
+            self.storage.write_text(&ddl_path, &ddl).await?;
+            info!("Exported DDL for schema {} to {}", schema, ddl_path);
+        }
+
+        // 6. Write manifest last.
+        //
+        // The manifest is the snapshot commit point: only write it after the schema
+        // index and all DDL files are durable, so a crash cannot leave a "valid"
+        // snapshot that is missing required schema artifacts.
+        self.storage.write_manifest(&manifest).await?;
+        info!("Snapshot created: {}", manifest.snapshot_id);
+
+        Ok(())
+    }
+
+    /// Validates the manifest of an existing snapshot and logs its progress;
+    /// resuming is not implemented yet, so nothing is written.
+    async fn check_existing_snapshot(&self) -> Result<()> {
+        let manifest = self.storage.read_manifest().await?;
+
+        // Check version compatibility
+        if manifest.version != MANIFEST_VERSION {
+            return ManifestVersionMismatchSnafu {
+                expected: MANIFEST_VERSION,
+                found: manifest.version,
+            }
+            .fail();
+        }
+
+        // Cannot resume schema-only with data export
+        if manifest.schema_only && !self.schema_only {
+            return CannotResumeSchemaOnlySnafu.fail();
+        }
+
+        info!(
+            "Resuming existing snapshot: {} (completed: {}/{} chunks)",
+            manifest.snapshot_id,
+            manifest.completed_count(),
+            manifest.chunks.len()
+        );
+        if manifest.is_complete() {
+            info!("Snapshot is already complete");
+        } else {
+            // TODO: Resume data export in M2
+            info!("Data export resume not yet implemented (M2)");
+        }
+        Ok(())
+    }
+
+    /// Builds one DDL script per schema, ordered by schema name: the database
+    /// itself, then physical tables, other tables and views (each sorted by name).
+    async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result<Vec<(String, String)>> {
+        let mut sorted_schemas = schema_names.to_vec();
+        sorted_schemas.sort();
+
+        let mut scripts = Vec::with_capacity(sorted_schemas.len());
+        for schema in sorted_schemas {
+            let create_database = self.show_create("DATABASE", &schema, None).await?;
+            let (physical, tables, views) = self.get_schema_objects(&schema).await?;
+            let physical_ddls = self.show_create_sorted("TABLE", &schema, physical).await?;
+            let table_ddls = self.show_create_sorted("TABLE", &schema, tables).await?;
+            let view_ddls = self.show_create_sorted("VIEW", &schema, views).await?;
+
+            let ddl = build_schema_ddl(
+                &schema,
+                create_database,
+                physical_ddls,
+                table_ddls,
+                view_ddls,
+            );
+            scripts.push((schema, ddl));
+        }
+        Ok(scripts)
+    }
+
+    /// Sorts `names` and runs `SHOW CREATE <show_type>` for each, in that order.
+    async fn show_create_sorted(
+        &self,
+        show_type: &str,
+        schema: &str,
+        mut names: Vec<String>,
+    ) -> Result<Vec<String>> {
+        names.sort();
+        let mut ddls = Vec::with_capacity(names.len());
+        for name in &names {
+            ddls.push(self.show_create(show_type, schema, Some(name.as_str())).await?);
+        }
+        Ok(ddls)
+    }
+
+    /// Lists the objects of `schema` as `(physical tables, other tables, views)`.
+    ///
+    /// Base tables and views come from `information_schema.tables`; names that
+    /// `get_metric_physical_tables` reports are kept out of the other two lists.
+    async fn get_schema_objects(
+        &self,
+        schema: &str,
+    ) -> Result<(Vec<String>, Vec<String>, Vec<String>)> {
+        let physical_tables = self.get_metric_physical_tables(schema).await?;
+        let sql = format!(
+            "SELECT table_name, table_type FROM information_schema.tables \
+             WHERE table_catalog = '{}' AND table_schema = '{}' \
+             AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')",
+            escape_sql_literal(&self.catalog),
+            escape_sql_literal(schema)
+        );
+        let rows: Vec<Vec<Value>> = self
+            .database_client
+            .sql_in_public(&sql)
+            .await
+            .context(DatabaseSnafu)?
+            .unwrap_or_default();
+
+        let physical_set: HashSet<&str> = physical_tables.iter().map(String::as_str).collect();
+        let (mut tables, mut views) = (Vec::new(), Vec::new());
+        for row in &rows {
+            let (Some(Value::String(name)), Some(Value::String(table_type))) =
+                (row.first(), row.get(1))
+            else {
+                return UnexpectedValueTypeSnafu.fail();
+            };
+            if physical_set.contains(name.as_str()) {
+                continue;
+            }
+            // Anything that is not a view is a base table, per the query filter.
+            if table_type.as_str() == "VIEW" {
+                views.push(name.clone());
+            } else {
+                tables.push(name.clone());
+            }
+        }
+        Ok((physical_tables, tables, views))
+    }
+
+    /// Returns the distinct names of tables in `schema` that have a `__tsid` column.
+    ///
+    /// The names are collected through a `HashSet`, so their order is unspecified.
+    async fn get_metric_physical_tables(&self, schema: &str) -> Result<Vec<String>> {
+        let sql = format!(
+            "SELECT DISTINCT table_name FROM information_schema.columns \
+             WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'",
+            escape_sql_literal(&self.catalog),
+            escape_sql_literal(schema)
+        );
+        let rows: Vec<Vec<Value>> = self
+            .database_client
+            .sql_in_public(&sql)
+            .await
+            .context(DatabaseSnafu)?
+            .unwrap_or_default();
+
+        let mut unique_names = HashSet::new();
+        for row in &rows {
+            let Some(Value::String(name)) = row.first() else {
+                return UnexpectedValueTypeSnafu.fail();
+            };
+            unique_names.insert(name.clone());
+        }
+        Ok(unique_names.into_iter().collect())
+    }
+
+    /// Runs `SHOW CREATE <show_type>` and returns the statement followed by `;\n`.
+    ///
+    /// The target is `"catalog"."schema"`, extended with `."table"` when `table`
+    /// is given. The statement is taken from the second column of the first row.
+    async fn show_create(
+        &self,
+        show_type: &str,
+        schema: &str,
+        table: Option<&str>,
+    ) -> Result<String> {
+        let mut sql = format!(
+            r#"SHOW CREATE {} "{}"."{}""#,
+            show_type,
+            escape_sql_identifier(&self.catalog),
+            escape_sql_identifier(schema)
+        );
+        // Tables and views are addressed by their fully qualified name.
+        if let Some(table) = table {
+            sql.push_str(&format!(r#"."{}""#, escape_sql_identifier(table)));
+        }
+
+        let rows = self
+            .database_client
+            .sql_in_public(&sql)
+            .await
+            .context(DatabaseSnafu)?
+            .context(EmptyResultSnafu)?;
+        // Both a missing result set and an empty one count as an empty result.
+        let first_row = rows.first().context(EmptyResultSnafu)?;
+
+        match first_row.get(1) {
+            Some(Value::String(create)) => Ok(format!("{};\n", create)),
+            _ => UnexpectedValueTypeSnafu.fail(),
+        }
+    }
+}
+
+/// Assembles the DDL script of one schema.
+///
+/// The script starts with a `-- Schema: <name>` comment line, followed by
+/// `create_database`, then every statement of `physical_tables`, `tables` and
+/// `views` in that order, and ends with one extra newline. Statements are
+/// appended verbatim, so each must carry its own terminator.
+fn build_schema_ddl(
+    schema: &str,
+    create_database: String,
+    physical_tables: Vec<String>,
+    tables: Vec<String>,
+    views: Vec<String>,
+) -> String {
+    let mut ddl = format!("-- Schema: {}\n", schema);
+    ddl.push_str(&create_database);
+
+    let statements = physical_tables.into_iter().chain(tables).chain(views);
+    ddl.extend(statements);
+
+    ddl.push('\n');
+    ddl
+}
+
+#[cfg(test)]
+mod tests {
+    use clap::Parser;
+
+    use super::*;
+    use crate::data::path::ddl_path_for_schema;
+
+    #[test]
+    fn test_ddl_path_for_schema() {
+        assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql");
+        assert_eq!(
+            ddl_path_for_schema("../evil"),
+            "schema/ddl/%2E%2E%2Fevil.sql"
+        );
+    }
+
+    #[test]
+    fn test_build_schema_ddl_order() {
+        let ddl = build_schema_ddl(
+            "public",
+            "CREATE DATABASE public;\n".to_string(),
+            vec!["PHYSICAL;\n".to_string()],
+            vec!["TABLE;\n".to_string()],
+            vec!["VIEW;\n".to_string()],
+        );
+
+        let db_pos = ddl.find("CREATE DATABASE").unwrap();
+        let physical_pos = ddl.find("PHYSICAL;").unwrap();
+        let table_pos = ddl.find("TABLE;").unwrap();
+        let view_pos = ddl.find("VIEW;").unwrap();
+        assert!(db_pos < physical_pos);
+        assert!(physical_pos < table_pos);
+        assert!(table_pos < view_pos);
+    }
+
+    #[tokio::test]
+    async fn test_build_rejects_non_schema_only_export() {
+        let cmd = ExportCreateCommand::parse_from([
+            "export-v2-create",
+            "--addr",
+            "127.0.0.1:4000",
+            "--to",
+            "file:///tmp/export-v2-test",
+        ]);
+
+        let result = cmd.build().await;
+        assert!(result.is_err());
+        let error = result.err().unwrap().to_string();
+
+        assert!(error.contains("Data export is not implemented yet"));
+    }
+}
diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs
new file mode 100644
index 0000000000..2db71d5326
--- /dev/null
+++ b/src/cli/src/data/export_v2/error.rs
@@ -0,0 +1,181 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::{Location, Snafu};
+
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    #[snafu(display("Invalid URI '{}': {}", uri, reason))]
+    InvalidUri {
+        uri: String,
+        reason: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Unsupported storage scheme: {}", scheme))]
+    UnsupportedScheme {
+        scheme: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Storage operation '{}' failed", operation))]
+    StorageOperation {
+        operation: String,
+        #[snafu(source)]
+        error: object_store::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to parse manifest"))]
+    ManifestParse {
+        #[snafu(source)]
+        error: serde_json::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to serialize manifest"))]
+    ManifestSerialize {
+        #[snafu(source)]
+        error: serde_json::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to decode text file as UTF-8"))]
+    TextDecode {
+        #[snafu(source)]
+        error: std::string::FromUtf8Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display(
+        "Cannot resume schema-only snapshot with data export. Use --force to recreate."
+    ))]
+    CannotResumeSchemaOnly {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display(
+        "Data export is not implemented yet. Use --schema-only to create a schema snapshot."
+    ))]
+    DataExportNotImplemented {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Empty result from query"))]
+    EmptyResult {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Unexpected value type in query result"))]
+    UnexpectedValueType {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Database error"))]
+    Database {
+        #[snafu(source)]
+        error: crate::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Snapshot not found at '{}'", uri))]
+    SnapshotNotFound {
+        uri: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Schema '{}' not found in catalog '{}'", schema, catalog))]
+    SchemaNotFound {
+        catalog: String,
+        schema: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to parse URL"))]
+    UrlParse {
+        #[snafu(source)]
+        error: url::ParseError,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to build object store"))]
+    BuildObjectStore {
+        #[snafu(source)]
+        error: object_store::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
+    ManifestVersionMismatch {
+        expected: u32,
+        found: u32,
+        #[snafu(implicit)]
+        location: Location,
+    },
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl ErrorExt for Error {
+    fn status_code(&self) -> StatusCode {
+        match self {
+            Self::InvalidUri { .. }
+            | Self::UnsupportedScheme { .. }
+            | Self::CannotResumeSchemaOnly { .. }
+            | Self::DataExportNotImplemented { .. }
+            | Self::ManifestVersionMismatch { .. }
+            | Self::SnapshotNotFound { .. } => StatusCode::InvalidArguments,
+
+            Self::StorageOperation { .. }
+            | Self::ManifestParse { .. }
+            | Self::ManifestSerialize { .. }
+            | Self::TextDecode { .. }
+            | Self::BuildObjectStore { .. } => StatusCode::StorageUnavailable,
+
+            Self::EmptyResult { .. }
+            | Self::UnexpectedValueType { .. }
+            | Self::UrlParse { .. } => StatusCode::Internal,
+
+            Self::SchemaNotFound { .. } => StatusCode::DatabaseNotFound,
+            // Database failures keep the status code of the underlying error.
+            Self::Database { error, .. } => error.status_code(),
+        }
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
diff --git a/src/cli/src/data/export_v2/extractor.rs b/src/cli/src/data/export_v2/extractor.rs
new file mode 100644
index 0000000000..ae15b199af
--- /dev/null
+++ b/src/cli/src/data/export_v2/extractor.rs
@@ -0,0 +1,254 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Schema extraction from information_schema.
+//!
+//! For V2 DDL-only snapshots, extractor only persists the schema index.
+
+use std::collections::{HashMap, HashSet};
+
+use serde_json::Value;
+use snafu::ResultExt;
+
+use crate::data::export_v2::error::{
+    DatabaseSnafu, EmptyResultSnafu, Result, SchemaNotFoundSnafu, UnexpectedValueTypeSnafu,
+};
+use crate::data::export_v2::schema::{SchemaDefinition, SchemaSnapshot};
+use crate::data::sql::escape_sql_literal;
+use crate::database::DatabaseClient;
+
+/// System schemas that should be excluded from export.
+const SYSTEM_SCHEMAS: &[&str] = &["information_schema", "pg_catalog"];
+
+/// Reads schema (database) definitions of one catalog via `information_schema` queries.
+pub struct SchemaExtractor<'a> {
+    client: &'a DatabaseClient,
+    catalog: &'a str,
+}
+
+impl<'a> SchemaExtractor<'a> {
+    /// Creates an extractor that queries `catalog` through `client`.
+    pub fn new(client: &'a DatabaseClient, catalog: &'a str) -> Self {
+        Self { client, catalog }
+    }
+
+    /// Extracts the schema index for the given schemas.
+    ///
+    /// With `None`, every non-system schema of the catalog is extracted.
+    /// Requested names are matched ASCII case-insensitively and de-duplicated.
+    pub async fn extract(&self, schemas: Option<&[String]>) -> Result<SchemaSnapshot> {
+        let schema_names = match schemas {
+            Some(requested) => self.validate_schemas(requested).await?,
+            None => self.get_all_schemas().await?,
+        };
+        let mut snapshot = SchemaSnapshot::new();
+        for schema_name in &schema_names {
+            snapshot.add_schema(self.extract_schema_definition(schema_name).await?);
+        }
+        Ok(snapshot)
+    }
+
+    /// Gets all non-system schemas in the catalog.
+    async fn get_all_schemas(&self) -> Result<Vec<String>> {
+        let sql = format!(
+            "SELECT schema_name FROM information_schema.schemata \
+             WHERE catalog_name = '{}'",
+            escape_sql_literal(self.catalog)
+        );
+
+        let mut schemas = Vec::new();
+        for row in self.query(&sql).await? {
+            let name = extract_string(&row, 0)?;
+            if SYSTEM_SCHEMAS.contains(&name.as_str()) {
+                continue;
+            }
+            schemas.push(name);
+        }
+        Ok(schemas)
+    }
+
+    /// Resolves the requested schemas against the non-system schemas of the
+    /// catalog; fails with `SchemaNotFound` for a name without a match.
+    async fn validate_schemas(&self, schemas: &[String]) -> Result<Vec<String>> {
+        let available = self.get_all_schemas().await?;
+        dedupe_canonicalized_schemas(schemas, &available, self.catalog)
+    }
+
+    /// Extracts schema (database) definition.
+    ///
+    /// A missing, non-string or empty `options` column yields an empty map.
+    async fn extract_schema_definition(&self, schema: &str) -> Result<SchemaDefinition> {
+        let sql = format!(
+            "SELECT schema_name, options FROM information_schema.schemata \
+             WHERE catalog_name = '{}' AND schema_name = '{}'",
+            escape_sql_literal(self.catalog),
+            escape_sql_literal(schema)
+        );
+
+        let records = self.query(&sql).await?;
+        let Some(row) = records.first() else {
+            return SchemaNotFoundSnafu {
+                catalog: self.catalog,
+                schema,
+            }
+            .fail();
+        };
+
+        let name = extract_string(row, 0)?;
+        let options = match extract_optional_string(row, 1) {
+            Some(raw) => parse_options(&raw),
+            None => HashMap::new(),
+        };
+
+        Ok(SchemaDefinition {
+            catalog: self.catalog.to_string(),
+            name,
+            options,
+        })
+    }
+
+    /// Executes a SQL query and returns its rows; a response without a
+    /// result set is reported as `EmptyResult`.
+    async fn query(&self, sql: &str) -> Result<Vec<Vec<Value>>> {
+        let response = self.client.sql_in_public(sql).await;
+        match response.context(DatabaseSnafu)? {
+            Some(rows) => Ok(rows),
+            None => EmptyResultSnafu.fail(),
+        }
+    }
+}
+
+/// Extracts a string value from a row.
+fn extract_string(row: &[Value], index: usize) -> Result<String> {
+    match row.get(index) {
+        Some(Value::String(s)) => Ok(s.clone()),
+        // NULL, non-string values and out-of-range indexes are all rejected.
+        _ => UnexpectedValueTypeSnafu.fail(),
+    }
+}
+
+/// Returns the string at `index`, or `None` if it is missing, not a string, or empty.
+fn extract_optional_string(row: &[Value], index: usize) -> Option<String> {
+    let Value::String(text) = row.get(index)? else {
+        return None;
+    };
+    (!text.is_empty()).then(|| text.clone())
+}
+
+/// Parses an options string into a map.
+///
+/// A JSON object of strings is used as-is. Otherwise every non-empty line is
+/// read either as one `'key'='value'` pair or as whitespace-separated
+/// `key=value` tokens; tokens without `=` are ignored.
+fn parse_options(options_str: &str) -> HashMap<String, String> {
+    if let Ok(map) = serde_json::from_str::<HashMap<String, String>>(options_str) {
+        return map;
+    }
+
+    // Not JSON: fall back to the line-oriented formats.
+    let mut options = HashMap::new();
+    for line in options_str.lines().map(str::trim).filter(|l| !l.is_empty()) {
+        match parse_quoted_option_line(line) {
+            Some((key, value)) => {
+                options.insert(key, value);
+            }
+            None => options.extend(
+                line.split_whitespace()
+                    .filter_map(|part| part.split_once('='))
+                    .map(|(key, value)| (key.to_string(), value.to_string())),
+            ),
+        }
+    }
+    options
+}
+
+/// Parses a `'key'='value'` line; returns `None` when the line has another shape.
+fn parse_quoted_option_line(line: &str) -> Option<(String, String)> {
+    let (key, value) = line.strip_prefix('\'')?.split_once("'='")?;
+    let value = value.strip_suffix('\'')?;
+    Some((key.to_owned(), value.to_owned()))
+}
+
+/// Resolves requested schemas to their names in `available` (ASCII case-insensitive),
+/// in request order without repeats; an unmatched name fails with `SchemaNotFound`.
+fn dedupe_canonicalized_schemas(
+    requested: &[String],
+    available: &[String],
+    catalog: &str,
+) -> Result<Vec<String>> {
+    let mut seen = HashSet::new();
+    let mut resolved = Vec::new();
+    for schema in requested {
+        let matched = available.iter().find(|name| name.eq_ignore_ascii_case(schema));
+        let Some(canonical) = matched else {
+            return SchemaNotFoundSnafu { catalog, schema }.fail();
+        };
+        if seen.insert(canonical.to_ascii_lowercase()) {
+            resolved.push(canonical.clone());
+        }
+    }
+    Ok(resolved)
+}
+
+#[cfg(test)]
+mod tests {
+    use serde_json::Value;
+
+    use super::*;
+
+    #[test]
+    fn test_parse_options_json() {
+        let opts = r#"{"ttl": "30d", "custom": "value"}"#;
+        let parsed = parse_options(opts);
+        assert_eq!(parsed.get("ttl"), Some(&"30d".to_string()));
+        assert_eq!(parsed.get("custom"), Some(&"value".to_string()));
+    }
+
+    #[test]
+    fn test_parse_options_key_value() {
+        let opts = "ttl=30d custom=value";
+        let parsed = parse_options(opts);
+        assert_eq!(parsed.get("ttl"), Some(&"30d".to_string()));
+        assert_eq!(parsed.get("custom"), Some(&"value".to_string()));
+    }
+
+    #[test]
+    fn test_parse_options_schema_display_format() {
+        let opts = "'ttl'='30d'\n'custom'='value with spaces'\n";
+        let parsed = parse_options(opts);
+        assert_eq!(parsed.get("ttl"), Some(&"30d".to_string()));
+        assert_eq!(parsed.get("custom"), Some(&"value with spaces".to_string()));
+    }
+
+    #[test]
+    fn test_extract_string_rejects_null() {
+        let row = vec![Value::Null];
+        assert!(extract_string(&row, 0).is_err());
+    }
+
+    #[test]
+    fn test_dedupe_canonicalized_schemas() {
+        let available = vec!["public".to_string(), "test_db".to_string()];
+        let requested = vec![
+            "PUBLIC".to_string(),
+            "public".to_string(),
+            "Test_Db".to_string(),
+        ];
+
+        let canonicalized = dedupe_canonicalized_schemas(&requested, &available, "greptime")
+            .expect("schemas should be canonicalized");
+
+        assert_eq!(canonicalized, vec!["public", "test_db"]);
+    }
+}
diff --git a/src/cli/src/data/export_v2/manifest.rs b/src/cli/src/data/export_v2/manifest.rs
new file mode 100644
index 0000000000..0ebf753fa4
--- /dev/null
+++ b/src/cli/src/data/export_v2/manifest.rs
@@ -0,0 +1,381 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Manifest data structures for Export/Import V2.
+
+use std::{fmt, str};
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+/// Current manifest format version.
+pub const MANIFEST_VERSION: u32 = 1;
+
+/// Manifest file name within snapshot directory.
+pub const MANIFEST_FILE: &str = "manifest.json";
+
+/// Time range for data export (half-open interval: `[start, end)`).
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct TimeRange {
+    /// Start time (inclusive). `None` means earliest available data; omitted when serialized.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub start: Option<DateTime<Utc>>,
+    /// End time (exclusive). `None` means current time; omitted when serialized.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub end: Option<DateTime<Utc>>,
+}
+
+impl TimeRange {
+    /// Creates a time range from the given bounds; `None` leaves that side open.
+    pub fn new(start: Option<DateTime<Utc>>, end: Option<DateTime<Utc>>) -> Self {
+        Self { start, end }
+    }
+
+    /// Creates a time range with neither bound set (all data).
+    pub fn unbounded() -> Self {
+        Self {
+            start: None,
+            end: None,
+        }
+    }
+
+    /// Returns true only if both `start` and `end` are `None`.
+    pub fn is_unbounded(&self) -> bool {
+        self.start.is_none() && self.end.is_none()
+    }
+}
+
+impl Default for TimeRange {
+    fn default() -> Self {
+        Self::unbounded()
+    }
+}
+
+/// Status of a chunk during export/import; serialized in snake_case (e.g. `in_progress`).
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "snake_case")]
+pub enum ChunkStatus {
+    /// Chunk is pending export (the default status).
+    #[default]
+    Pending,
+    /// Chunk export is in progress.
+    InProgress,
+    /// Chunk export completed successfully.
+    Completed,
+    /// Chunk export failed.
+    Failed,
+}
+
+/// Metadata for a single chunk of exported data.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChunkMeta {
+    /// Chunk identifier (sequential number starting from 1).
+    pub id: u32,
+    /// Time range covered by this chunk.
+    pub time_range: TimeRange,
+    /// Export status.
+    pub status: ChunkStatus,
+    /// Data files in this chunk (paths relative to snapshot root); defaults to empty if missing.
+    #[serde(default)]
+    pub files: Vec<String>,
+    /// SHA256 checksum of all files in this chunk (aggregated); omitted from output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub checksum: Option<String>,
+    /// Error message if status is `Failed`; omitted from output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub error: Option<String>,
+}
+
+impl ChunkMeta {
+    /// Creates a `Pending` chunk with the given id and range: no files, no checksum, no error.
+    pub fn new(id: u32, time_range: TimeRange) -> Self {
+        Self {
+            id,
+            time_range,
+            status: ChunkStatus::Pending,
+            files: vec![],
+            checksum: None,
+            error: None,
+        }
+    }
+
+    /// Marks this chunk as in progress and clears any previous error.
+    pub fn mark_in_progress(&mut self) {
+        self.status = ChunkStatus::InProgress;
+        self.error = None;
+    }
+
+    /// Marks this chunk as completed, storing the files and checksum and clearing any error.
+    pub fn mark_completed(&mut self, files: Vec<String>, checksum: Option<String>) {
+        self.status = ChunkStatus::Completed;
+        self.files = files;
+        self.checksum = checksum;
+        self.error = None;
+    }
+
+    /// Marks this chunk as failed with the given message; `files` and `checksum` are left as is.
+    pub fn mark_failed(&mut self, error: String) {
+        self.status = ChunkStatus::Failed;
+        self.error = Some(error);
+    }
+}
+
+/// Supported data formats for export.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default, clap::ValueEnum)]
+#[serde(rename_all = "lowercase")]
+#[value(rename_all = "lowercase")]
+pub enum DataFormat {
+    /// Apache Parquet format (default, recommended for production).
+    #[default]
+    Parquet,
+    /// CSV format (human-readable).
+    Csv,
+    /// JSON format (structured text).
+    Json,
+}
+
+impl fmt::Display for DataFormat {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            DataFormat::Parquet => write!(f, "parquet"),
+            DataFormat::Csv => write!(f, "csv"),
+            DataFormat::Json => write!(f, "json"),
+        }
+    }
+}
+
+impl str::FromStr for DataFormat {
+    type Err = String;
+    /// Parses a format name case-insensitively; unknown names yield a descriptive error string.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "parquet" => Ok(DataFormat::Parquet),
+            "csv" => Ok(DataFormat::Csv),
+            "json" => Ok(DataFormat::Json),
+            _ => Err(format!(
+                "invalid format '{}': expected one of parquet, csv, json",
+                s
+            )),
+        }
+    }
+}
+
+/// Snapshot manifest containing all metadata.
+///
+/// The manifest is stored as `manifest.json` (see `MANIFEST_FILE`) in the snapshot root directory.
+/// It contains:
+/// - Snapshot identification (UUID, timestamps)
+/// - Scope (catalog, schemas, time range)
+/// - Export configuration (format, schema_only)
+/// - Chunk metadata for resume support
+/// - Integrity checksums
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Manifest {
+    /// Manifest format version for compatibility checking (see `MANIFEST_VERSION`).
+    pub version: u32,
+    /// Unique snapshot identifier.
+    pub snapshot_id: Uuid,
+    /// Catalog name.
+    pub catalog: String,
+    /// List of schemas included in this snapshot.
+    pub schemas: Vec<String>,
+    /// Overall time range covered by this snapshot.
+    pub time_range: TimeRange,
+    /// Whether this is a schema-only snapshot (no data).
+    pub schema_only: bool,
+    /// Data format used for export.
+    pub format: DataFormat,
+    /// Chunk metadata (empty for schema-only snapshots); defaults to empty if missing.
+    #[serde(default)]
+    pub chunks: Vec<ChunkMeta>,
+    /// Snapshot-level SHA256 checksum (aggregated from all chunks); omitted when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub checksum: Option<String>,
+    /// Creation timestamp.
+    pub created_at: DateTime<Utc>,
+    /// Last updated timestamp; refreshed by `touch`.
+    pub updated_at: DateTime<Utc>,
+}
+
+impl Manifest {
+    /// Creates a schema-only manifest: new v4 UUID, unbounded range, Parquet format, no chunks.
+    pub fn new_schema_only(catalog: String, schemas: Vec<String>) -> Self {
+        let now = Utc::now();
+        Self {
+            version: MANIFEST_VERSION,
+            snapshot_id: Uuid::new_v4(),
+            catalog,
+            schemas,
+            time_range: TimeRange::unbounded(),
+            schema_only: true,
+            format: DataFormat::Parquet,
+            chunks: vec![],
+            checksum: None,
+            created_at: now,
+            updated_at: now,
+        }
+    }
+
+    /// Creates a manifest for a full export (`schema_only = false`) with no chunks recorded yet.
+    pub fn new_full(
+        catalog: String,
+        schemas: Vec<String>,
+        time_range: TimeRange,
+        format: DataFormat,
+    ) -> Self {
+        let now = Utc::now();
+        Self {
+            version: MANIFEST_VERSION,
+            snapshot_id: Uuid::new_v4(),
+            catalog,
+            schemas,
+            time_range,
+            schema_only: false,
+            format,
+            chunks: vec![],
+            checksum: None,
+            created_at: now,
+            updated_at: now,
+        }
+    }
+
+    /// Returns true if schema-only, or if at least one chunk exists and all are `Completed`.
+    pub fn is_complete(&self) -> bool {
+        self.schema_only
+            || (!self.chunks.is_empty()
+                && self
+                    .chunks
+                    .iter()
+                    .all(|c| c.status == ChunkStatus::Completed))
+    }
+
+    /// Returns the number of chunks whose status is `Pending`.
+    pub fn pending_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|c| c.status == ChunkStatus::Pending)
+            .count()
+    }
+
+    /// Returns the number of chunks whose status is `InProgress`.
+    pub fn in_progress_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|c| c.status == ChunkStatus::InProgress)
+            .count()
+    }
+
+    /// Returns the number of chunks whose status is `Completed`.
+    pub fn completed_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|c| c.status == ChunkStatus::Completed)
+            .count()
+    }
+
+    /// Returns the number of chunks whose status is `Failed`.
+    pub fn failed_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|c| c.status == ChunkStatus::Failed)
+            .count()
+    }
+
+    /// Updates the `updated_at` timestamp to now.
+    pub fn touch(&mut self) {
+        self.updated_at = Utc::now();
+    }
+
+    /// Appends a chunk to the manifest and refreshes `updated_at`.
+    pub fn add_chunk(&mut self, chunk: ChunkMeta) {
+        self.chunks.push(chunk);
+        self.touch();
+    }
+
+    /// Applies `updater` to the first chunk with `id` and touches the manifest; no-op if absent.
+    pub fn update_chunk(&mut self, id: u32, updater: impl FnOnce(&mut ChunkMeta)) {
+        if let Some(chunk) = self.chunks.iter_mut().find(|c| c.id == id) {
+            updater(chunk);
+            self.touch();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_time_range_serialization() {
+        let range = TimeRange::unbounded();
+        let json = serde_json::to_string(&range).unwrap();
+        assert_eq!(json, "{}");
+        // An empty object must deserialize back to an unbounded range.
+        let range: TimeRange = serde_json::from_str("{}").unwrap();
+        assert!(range.is_unbounded());
+    }
+
+    #[test]
+    fn test_manifest_schema_only() {
+        let manifest =
+            Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]);
+        // Schema-only manifests carry no chunks yet still count as complete.
+        assert_eq!(manifest.version, MANIFEST_VERSION);
+        assert!(manifest.schema_only);
+        assert!(manifest.chunks.is_empty());
+        assert!(manifest.is_complete());
+    }
+
+    #[test]
+    fn test_manifest_full() {
+        let manifest = Manifest::new_full(
+            "greptime".to_string(),
+            vec!["public".to_string()],
+            TimeRange::unbounded(),
+            DataFormat::Parquet,
+        );
+        // A full export with no chunks recorded is not complete.
+        assert!(!manifest.schema_only);
+        assert!(manifest.chunks.is_empty());
+        assert!(!manifest.is_complete());
+    }
+
+    #[test]
+    fn test_data_format_parsing() {
+        assert_eq!(
+            "parquet".parse::<DataFormat>().unwrap(),
+            DataFormat::Parquet
+        );
+        assert_eq!("CSV".parse::<DataFormat>().unwrap(), DataFormat::Csv);
+        assert_eq!("JSON".parse::<DataFormat>().unwrap(), DataFormat::Json);
+        assert!("invalid".parse::<DataFormat>().is_err());
+    }
+
+    #[test]
+    fn test_chunk_status_transitions() {
+        let mut chunk = ChunkMeta::new(1, TimeRange::unbounded());
+        assert_eq!(chunk.status, ChunkStatus::Pending);
+        // Pending -> InProgress.
+        chunk.mark_in_progress();
+        assert_eq!(chunk.status, ChunkStatus::InProgress);
+        // InProgress -> Completed, recording one file and a checksum.
+        chunk.mark_completed(
+            vec!["file1.parquet".to_string()],
+            Some("abc123".to_string()),
+        );
+        assert_eq!(chunk.status, ChunkStatus::Completed);
+        assert_eq!(chunk.files.len(), 1);
+    }
+}
diff --git a/src/cli/src/data/export_v2/schema.rs b/src/cli/src/data/export_v2/schema.rs
new file mode 100644
index 0000000000..1aab6ac900
--- /dev/null
+++ b/src/cli/src/data/export_v2/schema.rs
@@ -0,0 +1,98 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Minimal schema index structures for Export/Import V2.
+//!
+//! The canonical schema representation is the per-schema DDL file under
+//! `schema/ddl/`. `schemas.json` only records which schemas exist in a snapshot.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// Schema directory name within snapshot.
+pub const SCHEMA_DIR: &str = "schema";
+
+/// DDL directory name within schema directory.
+pub const DDL_DIR: &str = "ddl";
+
+/// Schema index file name (records which schemas exist in a snapshot).
+pub const SCHEMAS_FILE: &str = "schemas.json";
+
+/// Schema (database) definition.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SchemaDefinition {
+    /// Catalog name.
+    pub catalog: String,
+    /// Schema (database) name.
+    pub name: String,
+    /// Schema options (if any); not serialized when empty, defaults to empty when missing.
+    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
+    pub options: HashMap<String, String>,
+}
+
+/// Minimal schema index stored in a snapshot.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SchemaSnapshot {
+    /// Schema (database) definitions, in the order they were added.
+    pub schemas: Vec<SchemaDefinition>,
+}
+
+impl SchemaSnapshot {
+    /// Creates an empty schema snapshot (same as `Default`).
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Appends a schema definition; no de-duplication is performed.
+    pub fn add_schema(&mut self, schema: SchemaDefinition) {
+        self.schemas.push(schema);
+    }
+
+    /// Returns a copy containing only schemas whose name exactly matches an entry in `schemas`.
+    pub fn filter_schemas(&self, schemas: &[String]) -> Self {
+        Self {
+            schemas: self
+                .schemas
+                .iter()
+                .filter(|s| schemas.contains(&s.name))
+                .cloned()
+                .collect(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_schema_snapshot_filter() {
+        let mut snapshot = SchemaSnapshot::new();
+        snapshot.add_schema(SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "public".to_string(),
+            options: HashMap::new(),
+        });
+        snapshot.add_schema(SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "private".to_string(),
+            options: HashMap::new(),
+        });
+        // Only the schema whose name appears in the filter list should remain.
+        let filtered = snapshot.filter_schemas(&["public".to_string()]);
+        assert_eq!(filtered.schemas.len(), 1);
+        assert_eq!(filtered.schemas[0].name, "public");
+    }
+}
diff --git a/src/cli/src/data/export_v2/tests.rs b/src/cli/src/data/export_v2/tests.rs
new file mode 100644
index 0000000000..bd28801a0d
--- /dev/null
+++ b/src/cli/src/data/export_v2/tests.rs
@@ -0,0 +1,341 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::env;
+use std::time::Duration;
+
+use clap::Parser;
+use common_error::ext::BoxedError;
+use snafu::ResultExt;
+use tempfile::tempdir;
+use url::Url;
+
+use super::command::ExportCreateCommand;
+use crate::common::ObjectStoreConfig;
+use crate::data::import_v2::ImportV2Command;
+use crate::data::snapshot_storage::OpenDalStorage;
+use crate::database::DatabaseClient;
+use crate::error::{FileIoSnafu, InvalidArgumentsSnafu, OtherSnafu, Result};
+
+#[tokio::test]
+#[ignore] // NOTE(review): presumably ignored because it needs a live server at GREPTIME_ADDR
+async fn export_import_v2_schema_parity_e2e() -> Result<()> {
+    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
+    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
+    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
+    let schema = "test_db_schema_parity";
+
+    let database_client = DatabaseClient::new(
+        addr.clone(),
+        catalog.clone(),
+        auth_basic.clone(),
+        Duration::from_secs(60),
+        None,
+        false,
+    );
+    // Recreate the scratch database and populate it with four tables and a view.
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+    database_client
+        .sql_in_public(&format!("CREATE DATABASE {schema}"))
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING PRIMARY KEY, \
+                cpu DOUBLE DEFAULT 0.0, \
+                region_name STRING \
+            ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE logs (\
+                ts TIMESTAMP TIME INDEX, \
+                app STRING PRIMARY KEY, \
+                msg STRING NOT NULL COMMENT 'log message' \
+            ) ENGINE = mito",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics_physical (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING, \
+                region_name STRING, \
+                cpu DOUBLE DEFAULT 0.0, \
+                PRIMARY KEY (host, region_name) \
+            ) ENGINE = metric WITH (physical_metric_table='true')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics_logical (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING, \
+                region_name STRING, \
+                cpu DOUBLE DEFAULT 0.0, \
+                PRIMARY KEY (host, region_name) \
+            ) ENGINE = metric WITH (on_physical_table='metrics_physical')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE VIEW metrics_view AS SELECT * FROM metrics WHERE cpu > 0.5",
+            schema,
+        )
+        .await?;
+    // First export: schema only, into a temporary directory.
+    let src_dir = tempdir().context(FileIoSnafu)?;
+    let src_uri = Url::from_directory_path(src_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+    // Drop the database before importing the snapshot back.
+    database_client
+        .sql_in_public(&format!("DROP DATABASE {schema}"))
+        .await?;
+    // Import the first snapshot into the same server and catalog.
+    let mut import_args = vec![
+        "import-v2",
+        "--addr",
+        &addr,
+        "--from",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+    ];
+    if let Some(auth) = &auth_basic {
+        import_args.push("--auth-basic");
+        import_args.push(auth);
+    }
+    let import_cmd = ImportV2Command::parse_from(import_args);
+    import_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+    // Second export: same arguments, written to a second temporary directory.
+    let dst_dir = tempdir().context(FileIoSnafu)?;
+    let dst_uri = Url::from_directory_path(dst_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &dst_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+    // Read the schema snapshot from both directories and require them to be equal.
+    let storage_config = ObjectStoreConfig::default();
+    let src_storage = OpenDalStorage::from_uri(&src_uri, &storage_config)
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    let dst_storage = OpenDalStorage::from_uri(&dst_uri, &storage_config)
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+
+    let src_schema_snapshot = src_storage
+        .read_schema()
+        .await
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    let dst_schema_snapshot = dst_storage
+        .read_schema()
+        .await
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    assert_eq!(src_schema_snapshot, dst_schema_snapshot);
+    // Clean up the scratch database (not reached if an earlier step fails).
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore] // NOTE(review): presumably ignored because it needs a live server at GREPTIME_ADDR
+async fn import_v2_ddl_dry_run_e2e() -> Result<()> {
+    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
+    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
+    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok();
+    let schema = "test_db_ddl_dry_run";
+
+    let database_client = DatabaseClient::new(
+        addr.clone(),
+        catalog.clone(),
+        auth_basic.clone(),
+        Duration::from_secs(60),
+        None,
+        false,
+    );
+    // Recreate the scratch database with two tables.
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+    database_client
+        .sql_in_public(&format!("CREATE DATABASE {schema}"))
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING PRIMARY KEY, \
+                cpu DOUBLE DEFAULT 0.0, \
+                region_name STRING \
+            ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE logs (\
+                ts TIMESTAMP TIME INDEX, \
+                app STRING PRIMARY KEY, \
+                msg STRING NOT NULL COMMENT 'log message' \
+            ) ENGINE = mito",
+            schema,
+        )
+        .await?;
+    // Export the schema only into a temporary directory.
+    let src_dir = tempdir().context(FileIoSnafu)?;
+    let src_uri = Url::from_directory_path(src_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+    // Run the import with `--dry-run`; the test only requires that the command succeeds.
+    let mut import_args = vec![
+        "import-v2",
+        "--addr",
+        &addr,
+        "--from",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--dry-run",
+    ];
+    if let Some(auth) = &auth_basic {
+        import_args.push("--auth-basic");
+        import_args.push(auth);
+    }
+    let import_cmd = ImportV2Command::parse_from(import_args);
+    import_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+    // Clean up the scratch database (not reached if an earlier step fails).
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}"))
+        .await?;
+
+    Ok(())
+}
diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs
index ffe8b62c7e..f5c234f1a7 100644
--- a/src/cli/src/data/import.rs
+++ b/src/cli/src/data/import.rs
@@ -81,13 +81,16 @@ pub struct ImportCommand {
     #[clap(long, value_parser = humantime::parse_duration)]
     timeout: Option<Duration>,
 
-    /// The proxy server address to connect, if set, will override the system proxy.
+    /// The proxy server address to connect.
     ///
-    /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
     #[clap(long)]
     proxy: Option<String>,
 
-    /// Disable proxy server, if set, will not use any proxy.
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
     #[clap(long, default_value = "false")]
     no_proxy: bool,
 }
@@ -104,6 +107,7 @@ impl ImportCommand {
             // Treats `None` as `0s` to disable server-side default timeout.
             self.timeout.unwrap_or_default(),
             proxy,
+            self.no_proxy,
         );
 
         Ok(Box::new(Import {
@@ -314,6 +318,7 @@ mod tests {
                 None,
                 Duration::from_secs(0),
                 None,
+                false,
             ),
             input_dir: input_dir.to_string(),
             parallelism: 1,
diff --git a/src/cli/src/data/import_v2.rs b/src/cli/src/data/import_v2.rs
new file mode 100644
index 0000000000..772e18cc93
--- /dev/null
+++ b/src/cli/src/data/import_v2.rs
@@ -0,0 +1,41 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Import V2 module.
+//!
+//! This module provides the V2 implementation of database import functionality,
+//! featuring:
+//! - DDL-based schema import
+//! - Dry-run mode for verification
+//!
+//! # Example
+//!
+//! ```bash
+//! # Dry-run import (verify without executing)
+//! greptime cli data import-v2 \
+//!   --addr 127.0.0.1:4000 \
+//!   --from file:///tmp/snapshot \
+//!   --dry-run
+//!
+//! # Actual import
+//! greptime cli data import-v2 \
+//!   --addr 127.0.0.1:4000 \
+//!   --from s3://bucket/snapshots/prod-20250101
+//! ```
+
+mod command;
+pub mod error;
+pub mod executor;
+
+pub use command::ImportV2Command;
diff --git a/src/cli/src/data/import_v2/command.rs b/src/cli/src/data/import_v2/command.rs
new file mode 100644
index 0000000000..544763d92b
--- /dev/null
+++ b/src/cli/src/data/import_v2/command.rs
@@ -0,0 +1,542 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Import V2 CLI command.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use clap::Parser;
+use common_error::ext::BoxedError;
+use common_telemetry::info;
+use snafu::ResultExt;
+
+use crate::Tool;
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::manifest::MANIFEST_VERSION;
+use crate::data::import_v2::error::{
+    ManifestVersionMismatchSnafu, Result, SchemaNotInSnapshotSnafu, SnapshotStorageSnafu,
+};
+use crate::data::import_v2::executor::{DdlExecutor, DdlStatement};
+use crate::data::path::ddl_path_for_schema;
+use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::database::{DatabaseClient, parse_proxy_opts};
+
+/// Import from a snapshot.
+#[derive(Debug, Parser)]
+pub struct ImportV2Command { // clap turns the `///` comments here into `--help` text
+    /// Server address to connect (e.g., 127.0.0.1:4000).
+    #[clap(long)]
+    addr: String,
+
+    /// Source snapshot location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    from: String, // checked with `validate_uri` at the start of `build()`
+
+    /// Target catalog name.
+    #[clap(long, default_value = "greptime")]
+    catalog: String,
+
+    /// Schema list to import (default: all in snapshot).
+    /// Can be specified multiple times or comma-separated.
+    #[clap(long, value_delimiter = ',')]
+    schemas: Vec<String>, // empty => `build()` passes `None`, i.e. no schema filter
+
+    /// Verify without importing (dry-run).
+    #[clap(long)]
+    dry_run: bool,
+
+    /// Concurrency level (for future use).
+    #[clap(long, default_value = "1")]
+    parallelism: usize, // only stored as `Import::_parallelism`; `Import::run` never reads it
+
+    /// Basic authentication (user:password).
+    #[clap(long)]
+    auth_basic: Option<String>,
+
+    /// Request timeout.
+    #[clap(long, value_parser = humantime::parse_duration)]
+    timeout: Option<Duration>, // `build()` falls back to 60 seconds when omitted
+
+    /// Proxy server address.
+    ///
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
+    #[clap(long)]
+    proxy: Option<String>,
+
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
+    #[clap(long)]
+    no_proxy: bool,
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ImportV2Command {
+    /// Validates the arguments and assembles the [`Import`] tool.
+    ///
+    /// Fails when `--from` is rejected by `validate_uri`, when the snapshot
+    /// storage cannot be built, or when `parse_proxy_opts` returns an error.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        validate_uri(&self.from)
+            .context(SnapshotStorageSnafu)
+            .map_err(BoxedError::new)?;
+
+        // An empty `--schemas` list means "no filter": import every schema.
+        let schemas = (!self.schemas.is_empty()).then(|| self.schemas.clone());
+
+        let storage = OpenDalStorage::from_uri(&self.from, &self.storage)
+            .context(SnapshotStorageSnafu)
+            .map_err(BoxedError::new)?;
+
+        // Requests time out after 60 seconds unless `--timeout` says otherwise.
+        let timeout = self.timeout.unwrap_or(Duration::from_secs(60));
+        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
+        let database_client = DatabaseClient::new(
+            self.addr.clone(),
+            self.catalog.clone(),
+            self.auth_basic.clone(),
+            timeout,
+            proxy,
+            self.no_proxy,
+        );
+
+        let import = Import {
+            schemas,
+            dry_run: self.dry_run,
+            _parallelism: self.parallelism,
+            storage: Box::new(storage),
+            database_client,
+        };
+        Ok(Box::new(import))
+    }
+}
+
+/// Import tool: replays the DDL stored in a snapshot against a database.
+pub struct Import {
+    schemas: Option<Vec<String>>, // `None` imports every schema listed in the manifest
+    dry_run: bool, // print the DDL statements instead of executing them
+    _parallelism: usize, // not read by `run`
+    storage: Box<dyn SnapshotStorage>, // source of the manifest and the DDL files
+    database_client: DatabaseClient, // target the DDL statements are executed against
+}
+
+#[async_trait]
+impl Tool for Import {
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        self.run().await.map_err(BoxedError::new) // box the import error for the `Tool` API
+    }
+}
+
+impl Import {
+    async fn run(&self) -> Result<()> {
+        // 1. Load the snapshot manifest from storage.
+        let manifest = self
+            .storage
+            .read_manifest()
+            .await
+            .context(SnapshotStorageSnafu)?;
+
+        info!(
+            "Loading snapshot: {} (version: {}, schema_only: {})",
+            manifest.snapshot_id, manifest.version, manifest.schema_only
+        );
+
+        // Only manifests written with exactly MANIFEST_VERSION are accepted.
+        if manifest.version != MANIFEST_VERSION {
+            return ManifestVersionMismatchSnafu {
+                expected: MANIFEST_VERSION,
+                found: manifest.version,
+            }
+            .fail();
+        }
+
+        info!("Snapshot contains {} schema(s)", manifest.schemas.len());
+
+        // 2. Resolve the schema filter against the manifest (no filter = all schemas).
+        let schemas_to_import = match &self.schemas {
+            Some(filter) => canonicalize_schema_filter(filter, &manifest.schemas)?,
+            None => manifest.schemas.clone(),
+        };
+
+        info!("Importing schemas: {:?}", schemas_to_import);
+
+        // 3. Read the DDL file of every selected schema and split it into statements.
+        let ddl_statements = self.read_ddl_statements(&schemas_to_import).await?;
+
+        info!("Generated {} DDL statements", ddl_statements.len());
+
+        // 4. Dry-run: print the statements to stdout and return without executing them.
+        if self.dry_run {
+            info!("Dry-run mode - DDL statements to execute:");
+            println!();
+            for (i, stmt) in ddl_statements.iter().enumerate() {
+                println!("-- Statement {}", i + 1);
+                println!("{};", stmt.sql); // the parser strips the terminating `;`, so re-add it
+                println!();
+            }
+            return Ok(());
+        }
+
+        // 5. Execute the statements in order; the first failure aborts the import.
+        let executor = DdlExecutor::new(&self.database_client);
+        executor.execute_strict(&ddl_statements).await?;
+
+        info!(
+            "Import completed: {} DDL statements executed",
+            ddl_statements.len()
+        );
+
+        // 6. Data chunks are only reported for now; nothing is imported from them.
+        if !manifest.schema_only && !manifest.chunks.is_empty() {
+            info!(
+                "Data import not yet implemented (M3). {} chunks pending.",
+                manifest.chunks.len()
+            );
+        }
+
+        Ok(())
+    }
+
+    async fn read_ddl_statements(&self, schemas: &[String]) -> Result<Vec<DdlStatement>> {
+        let mut statements = Vec::new(); // ordered by schema, then by position in the DDL file
+        for schema in schemas {
+            let path = ddl_path_for_schema(schema);
+            let content = self
+                .storage
+                .read_text(&path)
+                .await
+                .context(SnapshotStorageSnafu)?;
+            statements.extend(
+                parse_ddl_statements(&content)
+                    .into_iter()
+                    .map(|sql| ddl_statement_for_schema(schema, sql)),
+            );
+        }
+
+        Ok(statements)
+    }
+}
+
+/// Splits the text of a DDL file into individual SQL statements.
+///
+/// Statements are separated by `;`. The scanner knows just enough SQL lexing
+/// not to split in the wrong place:
+/// - `-- line comments` are dropped up to the end of the line (the newline is kept);
+/// - `/* block comments */` are replaced by one space so that the tokens on
+///   either side stay separated;
+/// - text inside `'...'`, `"..."` or `` `...` `` is copied verbatim, and a doubled
+///   closing quote (such as `''`) counts as an escaped quote.
+///
+/// Statements are trimmed and returned without `;`; empty ones are skipped, and
+/// text after the last `;` becomes a final statement.
+fn parse_ddl_statements(content: &str) -> Vec<String> {
+    let mut statements = Vec::new();
+    let mut current = String::new();
+    let mut chars = content.chars().peekable();
+    // `Some(q)` while inside a quoted region that was opened by `q`.
+    let mut quote: Option<char> = None;
+    let mut in_line_comment = false;
+    let mut in_block_comment = false;
+
+    while let Some(ch) = chars.next() {
+        if in_line_comment {
+            if ch == '\n' {
+                in_line_comment = false;
+                current.push('\n');
+            }
+            continue;
+        }
+
+        if in_block_comment {
+            if ch == '*' && chars.peek() == Some(&'/') {
+                chars.next();
+                in_block_comment = false;
+                // A comment separates tokens the same way whitespace does.
+                current.push(' ');
+            }
+            continue;
+        }
+
+        if let Some(q) = quote {
+            current.push(ch);
+            if ch == q {
+                if chars.peek() == Some(&q) {
+                    // Doubled quote: an escaped quote character, stay inside.
+                    current.push(q);
+                    chars.next();
+                } else {
+                    quote = None;
+                }
+            }
+            continue;
+        }
+
+        match ch {
+            '-' if chars.peek() == Some(&'-') => {
+                chars.next();
+                in_line_comment = true;
+            }
+            '/' if chars.peek() == Some(&'*') => {
+                chars.next();
+                in_block_comment = true;
+            }
+            '\'' | '"' | '`' => {
+                quote = Some(ch);
+                current.push(ch);
+            }
+            ';' => {
+                let statement = current.trim();
+                if !statement.is_empty() {
+                    statements.push(statement.to_string());
+                }
+                current.clear();
+            }
+            _ => current.push(ch),
+        }
+    }
+
+    let statement = current.trim();
+    if !statement.is_empty() {
+        statements.push(statement.to_string());
+    }
+
+    statements
+}
+
+fn ddl_statement_for_schema(schema: &str, sql: String) -> DdlStatement {
+    // Only table and view creation is tied to `schema`; anything else carries no schema.
+    if !is_schema_scoped_statement(&sql) {
+        return DdlStatement::new(sql);
+    }
+    DdlStatement::with_execution_schema(sql, schema.to_string())
+}
+
+/// Returns `true` for `CREATE [OR REPLACE] [EXTERNAL] TABLE|VIEW` statements.
+///
+/// Keywords are matched ASCII case-insensitively via `starts_with_keyword`.
+fn is_schema_scoped_statement(sql: &str) -> bool {
+    // Strips `keyword` and the whitespace after it from the front of `input`.
+    fn eat<'a>(input: &'a str, keyword: &str) -> Option<&'a str> {
+        if !starts_with_keyword(input, keyword) {
+            return None;
+        }
+        input.get(keyword.len()..).map(str::trim_start)
+    }
+
+    let Some(mut rest) = eat(sql.trim_start(), "CREATE") else {
+        return false;
+    };
+
+    // `OR` is only accepted as part of `OR REPLACE`.
+    if let Some(after_or) = eat(rest, "OR") {
+        let Some(after_replace) = eat(after_or, "REPLACE") else {
+            return false;
+        };
+        rest = after_replace;
+    }
+
+    // `EXTERNAL` is optional.
+    if let Some(after_external) = eat(rest, "EXTERNAL") {
+        rest = after_external;
+    }
+
+    ["TABLE", "VIEW"]
+        .iter()
+        .any(|keyword| starts_with_keyword(rest, keyword))
+}
+
+/// Reports whether `input` starts with `keyword`, compared ASCII case-insensitively
+/// and ending at a word boundary.
+fn starts_with_keyword(input: &str, keyword: &str) -> bool {
+    let Some(head) = input.get(..keyword.len()) else {
+        return false;
+    };
+    // A following ASCII letter, digit or `_` means the word continues past `keyword`.
+    let next = input.as_bytes().get(keyword.len());
+    let continues_word = next.is_some_and(|b| b.is_ascii_alphanumeric() || *b == b'_');
+    head.eq_ignore_ascii_case(keyword) && !continues_word
+}
+
+/// Maps every requested schema onto the spelling recorded in the manifest.
+///
+/// An exact match wins; otherwise the first ASCII case-insensitive match is used,
+/// so manifest schemas that differ only by case stay individually selectable.
+/// Repeated selections are dropped. Unknown names yield `SchemaNotInSnapshot`.
+fn canonicalize_schema_filter(
+    filter: &[String],
+    manifest_schemas: &[String],
+) -> Result<Vec<String>> {
+    let mut canonicalized = Vec::new();
+    let mut seen = HashSet::new();
+
+    for schema in filter {
+        let canonical = manifest_schemas
+            .iter()
+            .find(|candidate| *candidate == schema)
+            .or_else(|| manifest_schemas.iter().find(|c| c.eq_ignore_ascii_case(schema)))
+            .ok_or_else(|| SchemaNotInSnapshotSnafu { schema: schema.clone() }.build())?;
+
+        if seen.insert(canonical) {
+            canonicalized.push(canonical.clone());
+        }
+    }
+
+    Ok(canonicalized)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_ddl_statements() {
+        let content = r#"
+-- Schema: public
+CREATE DATABASE public;
+CREATE TABLE t (ts TIMESTAMP TIME INDEX, host STRING, PRIMARY KEY (host)) ENGINE=mito;
+
+-- comment
+CREATE VIEW v AS SELECT * FROM t;
+"#;
+        let statements = parse_ddl_statements(content);
+        assert_eq!(statements.len(), 3); // line comments are dropped, one entry per statement
+        assert!(statements[0].starts_with("CREATE DATABASE public"));
+        assert!(statements[1].starts_with("CREATE TABLE t"));
+        assert!(statements[2].starts_with("CREATE VIEW v"));
+    }
+
+    #[test]
+    fn test_parse_ddl_statements_preserves_semicolons_in_string_literals() {
+        let content = r#"
+CREATE TABLE t (
+    host STRING DEFAULT 'a;b'
+);
+CREATE VIEW v AS SELECT ';' AS marker;
+"#;
+
+        let statements = parse_ddl_statements(content);
+
+        assert_eq!(statements.len(), 2); // a `;` inside a string literal must not split
+        assert!(statements[0].contains("'a;b'"));
+        assert!(statements[1].contains("';' AS marker"));
+    }
+
+    #[test]
+    fn test_parse_ddl_statements_handles_comments_without_splitting() {
+        let content = r#"
+-- leading comment
+CREATE TABLE t (ts TIMESTAMP TIME INDEX); /* block; comment */
+CREATE VIEW v AS SELECT 1;
+"#;
+
+        let statements = parse_ddl_statements(content);
+
+        assert_eq!(statements.len(), 2); // the `;` inside the block comment must not split
+        assert!(statements[0].starts_with("CREATE TABLE t"));
+        assert!(statements[1].starts_with("CREATE VIEW v"));
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_uses_manifest_casing() {
+        let filter = vec!["TEST_DB".to_string(), "PUBLIC".to_string()];
+        let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
+
+        let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
+
+        assert_eq!(canonicalized, vec!["test_db", "public"]);
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_dedupes_case_insensitive_matches() {
+        let filter = vec![
+            "TEST_DB".to_string(),
+            "test_db".to_string(),
+            "PUBLIC".to_string(),
+            "public".to_string(),
+        ];
+        let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
+
+        let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
+
+        assert_eq!(canonicalized, vec!["test_db", "public"]); // first-seen order, no repeats
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_rejects_missing_schema() {
+        let filter = vec!["missing".to_string()];
+        let manifest_schemas = vec!["test_db".to_string()];
+
+        let error = canonicalize_schema_filter(&filter, &manifest_schemas)
+            .expect_err("missing schema should fail")
+            .to_string();
+
+        assert!(error.contains("missing")); // the message names the rejected schema
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_table_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX) ENGINE=mito".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_view_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_or_replace_view_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE OR REPLACE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_external_table_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE EXTERNAL TABLE IF NOT EXISTS ext_metrics (ts TIMESTAMP TIME INDEX) ENGINE=file"
+                .to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_database_uses_public_context() {
+        let stmt = ddl_statement_for_schema("test_db", "CREATE DATABASE test_db".to_string());
+        assert_eq!(stmt.execution_schema, None); // CREATE DATABASE is not schema-scoped
+    }
+
+    #[test]
+    fn test_starts_with_keyword_requires_word_boundary() {
+        assert!(starts_with_keyword("CREATE TABLE t", "CREATE"));
+        assert!(!starts_with_keyword("CREATED TABLE t", "CREATE")); // prefix of a longer word
+        assert!(!starts_with_keyword("TABLESPACE foo", "TABLE"));
+    }
+}
diff --git a/src/cli/src/data/import_v2/error.rs b/src/cli/src/data/import_v2/error.rs
new file mode 100644
index 0000000000..5ae3db1583
--- /dev/null
+++ b/src/cli/src/data/import_v2/error.rs
@@ -0,0 +1,82 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::{Location, Snafu};
+
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error { // errors reported by the import-v2 command
+    #[snafu(display("Snapshot not found at '{}'", uri))]
+    SnapshotNotFound {
+        uri: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
+    ManifestVersionMismatch {
+        expected: u32, // the MANIFEST_VERSION the importer compares against
+        found: u32, // the version stored in the snapshot manifest
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Schema '{}' not found in snapshot", schema))]
+    SchemaNotInSnapshot {
+        schema: String, // the requested name that matched no manifest schema
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Snapshot storage error"))]
+    SnapshotStorage { // wraps errors from the snapshot storage layer
+        #[snafu(source)]
+        error: crate::data::export_v2::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Database error"))]
+    Database { // wraps errors from the database client
+        #[snafu(source)]
+        error: crate::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+}
+
+pub type Result<T> = std::result::Result<T, Error>; // `Result` fixed to this module's `Error`
+
+impl ErrorExt for Error {
+    /// Invalid input maps to `InvalidArguments`; wrapped errors keep their own code.
+    fn status_code(&self) -> StatusCode {
+        match self {
+            Error::SnapshotNotFound { .. }
+            | Error::SchemaNotInSnapshot { .. }
+            | Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments,
+            Error::SnapshotStorage { error, .. } => error.status_code(),
+            Error::Database { error, .. } => error.status_code(),
+        }
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
diff --git a/src/cli/src/data/import_v2/executor.rs b/src/cli/src/data/import_v2/executor.rs
new file mode 100644
index 0000000000..3f2bf66ae6
--- /dev/null
+++ b/src/cli/src/data/import_v2/executor.rs
@@ -0,0 +1,122 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! DDL execution for import.
+
+use common_telemetry::info;
+use snafu::ResultExt;
+
+use crate::data::import_v2::error::{DatabaseSnafu, Result};
+use crate::database::DatabaseClient;
+
+/// A DDL statement plus the schema it has to be executed in, if any.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DdlStatement {
+    pub sql: String, // SQL text passed to the database client
+    pub execution_schema: Option<String>, // `None`: executed through `sql_in_public`
+}
+
+impl DdlStatement {
+    /// Creates a statement that carries no execution schema.
+    pub fn new(sql: String) -> Self {
+        Self {
+            execution_schema: None,
+            sql,
+        }
+    }
+
+    pub fn with_execution_schema(sql: String, schema: String) -> Self {
+        let mut statement = Self::new(sql);
+        statement.execution_schema = Some(schema); // schema the statement must run in
+        statement
+    }
+}
+
+/// Executes DDL statements against the database.
+pub struct DdlExecutor<'a> {
+    client: &'a DatabaseClient, // borrowed from the caller for the executor's lifetime
+}
+
+impl<'a> DdlExecutor<'a> {
+    /// Creates an executor that borrows `client`.
+    pub fn new(client: &'a DatabaseClient) -> Self {
+        Self { client }
+    }
+
+    /// Runs `statements` in order and stops at the first database error.
+    ///
+    /// Statements that follow a failing one are not executed. Every statement is
+    /// logged with a truncated preview before it is sent.
+    pub async fn execute_strict(&self, statements: &[DdlStatement]) -> Result<()> {
+        let total = statements.len();
+
+        for (index, statement) in statements.iter().enumerate() {
+            // 1-based position for the progress log.
+            let position = index + 1;
+            let preview = preview_sql(&statement.sql);
+            info!("Executing DDL ({}/{}): {}", position, total, preview);
+
+            // With an execution schema the statement runs in that schema,
+            // otherwise it goes through `sql_in_public`.
+            let outcome = match statement.execution_schema.as_deref() {
+                Some(schema) => self.client.sql(&statement.sql, schema).await.map(drop),
+                None => self.client.sql_in_public(&statement.sql).await.map(drop),
+            };
+            outcome.context(DatabaseSnafu)?;
+        }
+
+        Ok(())
+    }
+}
+
+/// Returns `sql` shortened to its first 80 characters, with `...` appended when
+/// anything was cut off.
+fn preview_sql(sql: &str) -> String {
+    match sql.char_indices().nth(80) {
+        // Byte offset of the 81st character: keep everything before it.
+        Some((cut, _)) => format!("{}...", &sql[..cut]),
+        None => sql.to_string(),
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_statement_without_execution_schema_uses_public() {
+        let stmt = DdlStatement::new("CREATE DATABASE IF NOT EXISTS test_db".to_string());
+        assert_eq!(stmt.execution_schema, None); // `None` makes the executor call `sql_in_public`
+    }
+
+    #[test]
+    fn test_statement_with_execution_schema_preserves_context() {
+        let stmt = DdlStatement::with_execution_schema(
+            r#"CREATE TABLE IF NOT EXISTS "my""schema"."metrics" (ts TIMESTAMP TIME INDEX)"#
+                .to_string(),
+            r#"my"schema"#.to_string(), // stored as given, not SQL-escaped
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some(r#"my"schema"#));
+    }
+
+    #[test]
+    fn test_preview_sql_truncates_at_char_boundary() {
+        let sql = format!(
+            "CREATE TABLE {} (ts TIMESTAMP TIME INDEX)",
+            "测".repeat(100) // multi-byte characters: 80 chars is not 80 bytes
+        );
+        let preview = preview_sql(&sql);
+        assert!(preview.ends_with("..."));
+        assert!(preview.is_char_boundary(preview.len())); // holds for any `String`
+    }
+}
diff --git a/src/cli/src/data/path.rs b/src/cli/src/data/path.rs
new file mode 100644
index 0000000000..2e0f5d3f1a
--- /dev/null
+++ b/src/cli/src/data/path.rs
@@ -0,0 +1,76 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared path helpers for export/import data files.
+
+use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR};
+
+/// Returns the path, relative to the snapshot root, of the DDL file for `schema`.
+///
+/// The schema name is passed through `encode_path_segment`, so characters such as
+/// `/` or `.` in the name cannot introduce extra path components.
+pub(crate) fn ddl_path_for_schema(schema: &str) -> String {
+    let file_stem = encode_path_segment(schema);
+    format!("{SCHEMA_DIR}/{DDL_DIR}/{file_stem}.sql")
+}
+
+/// Percent-encodes `value` so it can be used as a single path segment.
+///
+/// ASCII letters, digits, `-` and `_` are kept as they are; every other byte of
+/// the UTF-8 encoding is written as `%` followed by two uppercase hex digits.
+pub(crate) fn encode_path_segment(value: &str) -> String {
+    value
+        .bytes()
+        .fold(String::with_capacity(value.len()), |mut encoded, byte| {
+            if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_') {
+                encoded.push(char::from(byte));
+            } else {
+                encoded.extend(['%', hex_char(byte >> 4), hex_char(byte & 0x0F)]);
+            }
+            encoded
+        })
+}
+
+/// Uppercase hexadecimal digit for a nibble; panics if `nibble > 15`.
+fn hex_char(nibble: u8) -> char {
+    match b"0123456789ABCDEF".get(usize::from(nibble)) {
+        Some(&digit) => char::from(digit),
+        None => unreachable!("nibble must be in 0..=15"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_encode_path_segment_preserves_safe_ascii() {
+        assert_eq!(encode_path_segment("test_db"), "test_db");
+    }
+
+    #[test]
+    fn test_encode_path_segment_escapes_path_traversal_chars() {
+        assert_eq!(encode_path_segment("../evil"), "%2E%2E%2Fevil"); // `.` and `/` are escaped
+        assert_eq!(encode_path_segment(r"..\\evil"), "%2E%2E%5C%5Cevil"); // raw string: two `\`
+    }
+
+    #[test]
+    fn test_ddl_path_for_schema_encodes_schema_segment() {
+        assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql");
+        assert_eq!(
+            ddl_path_for_schema("../evil"),
+            "schema/ddl/%2E%2E%2Fevil.sql" // still a single file name under schema/ddl
+        );
+    }
+}
diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs
new file mode 100644
index 0000000000..b6ff1c9222
--- /dev/null
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -0,0 +1,649 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Storage abstraction for Export/Import V2.
+//!
+//! This module provides a unified interface for reading and writing snapshot data
+//! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).
+
+use async_trait::async_trait;
+use object_store::services::{Azblob, Fs, Gcs, Oss, S3};
+use object_store::util::{with_instrument_layers, with_retry_layers};
+use object_store::{AzblobConnection, GcsConnection, ObjectStore, OssConnection, S3Connection};
+use snafu::ResultExt;
+use url::Url;
+
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::error::{
+    BuildObjectStoreSnafu, InvalidUriSnafu, ManifestParseSnafu, ManifestSerializeSnafu, Result,
+    SnapshotNotFoundSnafu, StorageOperationSnafu, TextDecodeSnafu, UnsupportedSchemeSnafu,
+    UrlParseSnafu,
+};
+use crate::data::export_v2::manifest::{MANIFEST_FILE, Manifest};
+#[cfg(test)]
+use crate::data::export_v2::schema::SchemaDefinition;
+use crate::data::export_v2::schema::{SCHEMA_DIR, SCHEMAS_FILE, SchemaSnapshot};
+
+struct RemoteLocation {
+    bucket_or_container: String, // host part of the URI
+    root: String, // path part of the URI without leading `/`
+}
+
+/// URI schemes supported for snapshot storage.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum StorageScheme {
+    /// Amazon S3 (`s3://`).
+    S3,
+    /// Alibaba Cloud OSS (`oss://`).
+    Oss,
+    /// Google Cloud Storage (`gs://` or `gcs://`).
+    Gcs,
+    /// Azure Blob Storage (`azblob://`).
+    Azblob,
+    /// Local filesystem (`file://`).
+    File,
+}
+
+impl StorageScheme {
+    /// Maps the scheme of `uri` to a backend; unknown schemes are an error.
+    pub fn from_uri(uri: &str) -> Result<Self> {
+        let url = Url::parse(uri).context(UrlParseSnafu)?;
+        let backend = match url.scheme() {
+            "s3" => Self::S3,
+            "oss" => Self::Oss,
+            "gs" | "gcs" => Self::Gcs,
+            "azblob" => Self::Azblob,
+            "file" => Self::File,
+            scheme => return UnsupportedSchemeSnafu { scheme }.fail(),
+        };
+        Ok(backend)
+    }
+}
+
+/// Extracts bucket/container (URI host) and root path from a remote URI.
+/// Both parts must be non-empty; otherwise an `InvalidUri` error is returned.
+fn extract_remote_location(uri: &str) -> Result<RemoteLocation> {
+    let url = Url::parse(uri).context(UrlParseSnafu)?;
+    let Some(host) = url.host_str().filter(|host| !host.is_empty()) else {
+        return InvalidUriSnafu {
+            uri,
+            reason: "URI must include bucket/container in host",
+        }
+        .fail();
+    };
+
+    let root = url.path().trim_start_matches('/');
+    if root.is_empty() {
+        return InvalidUriSnafu {
+            uri,
+            reason: "snapshot URI must include a non-empty path after the bucket/container",
+        }
+        .fail();
+    }
+
+    Ok(RemoteLocation {
+        bucket_or_container: host.to_string(),
+        root: root.to_string(),
+    })
+}
+
+/// Checks that `uri` carries a supported scheme and returns it.
+///
+/// Bare paths (e.g., `/tmp/backup`, `./backup`) are rejected: schema export (CLI)
+/// and data export (server) run in different processes, so a bare path would
+/// split the snapshot across machines.
+///
+/// Supported URI schemes:
+/// - `s3://bucket/path` - Amazon S3
+/// - `oss://bucket/path` - Alibaba Cloud OSS
+/// - `gs://bucket/path` or `gcs://bucket/path` - Google Cloud Storage
+/// - `azblob://container/path` - Azure Blob Storage
+/// - `file:///absolute/path` - Local filesystem
+pub fn validate_uri(uri: &str) -> Result<StorageScheme> {
+    if uri.contains("://") {
+        return StorageScheme::from_uri(uri);
+    }
+
+    // Without a `://` separator the input is treated as a bare path.
+    InvalidUriSnafu {
+        uri,
+        reason: "URI must have a scheme (e.g., s3://, file://). Bare paths are not supported.",
+    }
+    .fail()
+}
+
+fn schema_index_path() -> String {
+    format!("{SCHEMA_DIR}/{SCHEMAS_FILE}") // SCHEMAS_FILE inside SCHEMA_DIR
+}
+
+/// Returns the path of a `file://` URI whose host is absent, empty or `localhost`.
+fn extract_file_path_from_uri(uri: &str) -> Result<String> {
+    let url = Url::parse(uri).context(UrlParseSnafu)?;
+    let host = url.host_str().unwrap_or_default();
+    if host.is_empty() || host == "localhost" {
+        return Ok(url.path().to_string());
+    }
+    InvalidUriSnafu {
+        uri,
+        reason: "file:// URI must use an absolute path like file:///tmp/backup",
+    }
+    .fail()
+}
+
+/// Fails with `SnapshotNotFound` unless `storage` reports an existing snapshot.
+async fn ensure_snapshot_exists(storage: &OpenDalStorage) -> Result<()> {
+    if !storage.exists().await? {
+        return SnapshotNotFoundSnafu {
+            uri: storage.target_uri.as_str(),
+        }
+        .fail();
+    }
+    Ok(())
+}
+
+/// Snapshot storage abstraction.
+///
+/// Provides operations for reading and writing snapshot data to various storage backends.
+#[async_trait]
+pub trait SnapshotStorage: Send + Sync { // the import tool holds it as `Box<dyn SnapshotStorage>`
+    /// Checks if a snapshot exists at this location (manifest.json exists).
+    async fn exists(&self) -> Result<bool>;
+
+    /// Reads the snapshot manifest.
+    async fn read_manifest(&self) -> Result<Manifest>;
+
+    /// Writes the snapshot manifest.
+    async fn write_manifest(&self, manifest: &Manifest) -> Result<()>;
+
+    /// Writes the schema index to schema/schemas.json.
+    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()>;
+
+    /// Writes a text file to a path relative to the snapshot root.
+    async fn write_text(&self, path: &str, content: &str) -> Result<()>;
+
+    /// Reads a text file (e.g. a per-schema DDL file) from a path relative to the snapshot root.
+    async fn read_text(&self, path: &str) -> Result<String>;
+
+    /// Deletes the entire snapshot (for --force).
+    async fn delete_snapshot(&self) -> Result<()>;
+}
+
+/// OpenDAL-based implementation of SnapshotStorage.
+pub struct OpenDalStorage {
+    object_store: ObjectStore,
+    target_uri: String,
+}
+
+impl OpenDalStorage {
+    fn new_operator_rooted(object_store: ObjectStore, target_uri: &str) -> Self {
+        Self {
+            object_store,
+            target_uri: target_uri.to_string(),
+        }
+    }
+
+    fn finish_local_store(object_store: ObjectStore) -> ObjectStore {
+        with_instrument_layers(object_store, false)
+    }
+
+    fn finish_remote_store(object_store: ObjectStore) -> ObjectStore {
+        with_instrument_layers(with_retry_layers(object_store), false)
+    }
+
+    fn ensure_backend_enabled(uri: &str, enabled: bool, reason: &'static str) -> Result<()> {
+        if !enabled {
+            return InvalidUriSnafu { uri, reason }.fail();
+        }
+        // The backend flag is set, so the URI is accepted as-is.
+        Ok(())
+    }
+
+    fn validate_remote_config<E: std::fmt::Display>(
+        uri: &str,
+        backend: &str,
+        result: std::result::Result<(), E>,
+    ) -> Result<()> {
+        let Err(cause) = result else { return Ok(()) };
+        // Report the backend's own validation message inside an invalid-URI error.
+        InvalidUriSnafu {
+            uri,
+            reason: format!("invalid {} config: {}", backend, cause),
+        }
+        .fail()
+    }
+
+    /// Creates a new storage from a file:// URI.
+    pub fn from_file_uri(uri: &str) -> Result<Self> {
+        let path = extract_file_path_from_uri(uri)?;
+
+        let builder = Fs::default().root(&path);
+        let object_store = ObjectStore::new(builder)
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_local_store(object_store),
+            uri,
+        ))
+    }
+
+    fn from_file_uri_with_config(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        if storage.enable_s3 || storage.enable_oss || storage.enable_gcs || storage.enable_azblob {
+            return InvalidUriSnafu {
+                uri,
+                reason: "file:// cannot be used with remote storage flags",
+            }
+            .fail();
+        }
+
+        Self::from_file_uri(uri)
+    }
+
+    fn from_s3_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_s3,
+            "s3:// requires --s3 and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.s3.clone();
+        config.s3_bucket = location.bucket_or_container;
+        config.s3_root = location.root;
+        Self::validate_remote_config(uri, "s3", config.validate())?;
+
+        let conn: S3Connection = config.into();
+        let object_store = ObjectStore::new(S3::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    fn from_oss_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_oss,
+            "oss:// requires --oss and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.oss.clone();
+        config.oss_bucket = location.bucket_or_container;
+        config.oss_root = location.root;
+        Self::validate_remote_config(uri, "oss", config.validate())?;
+
+        let conn: OssConnection = config.into();
+        let object_store = ObjectStore::new(Oss::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    fn from_gcs_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_gcs,
+            "gs:// or gcs:// requires --gcs and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.gcs.clone();
+        config.gcs_bucket = location.bucket_or_container;
+        config.gcs_root = location.root;
+        Self::validate_remote_config(uri, "gcs", config.validate())?;
+
+        let conn: GcsConnection = config.into();
+        let object_store = ObjectStore::new(Gcs::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    fn from_azblob_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        Self::ensure_backend_enabled(
+            uri,
+            storage.enable_azblob,
+            "azblob:// requires --azblob and related options",
+        )?;
+
+        let location = extract_remote_location(uri)?;
+        let mut config = storage.azblob.clone();
+        config.azblob_container = location.bucket_or_container;
+        config.azblob_root = location.root;
+        Self::validate_remote_config(uri, "azblob", config.validate())?;
+
+        let conn: AzblobConnection = config.into();
+        let object_store = ObjectStore::new(Azblob::from(&conn))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        Ok(Self::new_operator_rooted(
+            Self::finish_remote_store(object_store),
+            uri,
+        ))
+    }
+
+    /// Creates a new storage from a URI and object store config.
+    pub fn from_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        match StorageScheme::from_uri(uri)? {
+            StorageScheme::File => Self::from_file_uri_with_config(uri, storage),
+            StorageScheme::S3 => Self::from_s3_uri(uri, storage),
+            StorageScheme::Oss => Self::from_oss_uri(uri, storage),
+            StorageScheme::Gcs => Self::from_gcs_uri(uri, storage),
+            StorageScheme::Azblob => Self::from_azblob_uri(uri, storage),
+        }
+    }
+
+    /// Reads a file as bytes.
+    async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
+        let data = self
+            .object_store
+            .read(path)
+            .await
+            .context(StorageOperationSnafu {
+                operation: format!("read {}", path),
+            })?;
+        Ok(data.to_vec())
+    }
+
+    /// Writes bytes to a file.
+    async fn write_file(&self, path: &str, data: Vec<u8>) -> Result<()> {
+        self.object_store
+            .write(path, data)
+            .await
+            .map(|_| ())
+            .context(StorageOperationSnafu {
+                operation: format!("write {}", path),
+            })
+    }
+
+    /// Checks if a file exists using stat.
+    async fn file_exists(&self, path: &str) -> Result<bool> {
+        match self.object_store.stat(path).await {
+            Ok(_) => Ok(true),
+            Err(e) if e.kind() == object_store::ErrorKind::NotFound => Ok(false),
+            Err(e) => Err(e).context(StorageOperationSnafu {
+                operation: format!("check exists {}", path),
+            }),
+        }
+    }
+
+    #[cfg(test)]
+    pub async fn read_schema(&self) -> Result<SchemaSnapshot> {
+        let schemas_path = schema_index_path();
+        let schemas: Vec<SchemaDefinition> = if self.file_exists(&schemas_path).await? {
+            let data = self.read_file(&schemas_path).await?;
+            serde_json::from_slice(&data).context(ManifestParseSnafu)?
+        } else {
+            vec![]
+        };
+
+        Ok(SchemaSnapshot { schemas })
+    }
+}
+
+#[async_trait]
+impl SnapshotStorage for OpenDalStorage {
+    async fn exists(&self) -> Result<bool> {
+        self.file_exists(MANIFEST_FILE).await
+    }
+
+    async fn read_manifest(&self) -> Result<Manifest> {
+        ensure_snapshot_exists(self).await?;
+
+        let data = self.read_file(MANIFEST_FILE).await?;
+        serde_json::from_slice(&data).context(ManifestParseSnafu)
+    }
+
+    async fn write_manifest(&self, manifest: &Manifest) -> Result<()> {
+        let data = serde_json::to_vec_pretty(manifest).context(ManifestSerializeSnafu)?;
+        self.write_file(MANIFEST_FILE, data).await
+    }
+
+    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()> {
+        let schemas_path = schema_index_path();
+        let schemas_data =
+            serde_json::to_vec_pretty(&schema.schemas).context(ManifestSerializeSnafu)?;
+        self.write_file(&schemas_path, schemas_data).await
+    }
+
+    async fn write_text(&self, path: &str, content: &str) -> Result<()> {
+        self.write_file(path, content.as_bytes().to_vec()).await
+    }
+
+    async fn read_text(&self, path: &str) -> Result<String> {
+        let data = self.read_file(path).await?;
+        String::from_utf8(data).context(TextDecodeSnafu)
+    }
+
+    async fn delete_snapshot(&self) -> Result<()> {
+        self.object_store
+            .remove_all("/")
+            .await
+            .context(StorageOperationSnafu {
+                operation: "delete snapshot",
+            })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use object_store::ObjectStore;
+    use object_store::services::Fs;
+    use tempfile::tempdir;
+    use url::Url;
+
+    use super::*;
+    use crate::data::export_v2::manifest::{DataFormat, TimeRange};
+    use crate::data::export_v2::schema::SchemaDefinition;
+
+    fn make_storage_with_rooted_fs(dir: &std::path::Path) -> OpenDalStorage {
+        let object_store = ObjectStore::new(Fs::default().root(dir.to_str().unwrap()))
+            .unwrap()
+            .finish();
+        OpenDalStorage::new_operator_rooted(
+            OpenDalStorage::finish_local_store(object_store),
+            Url::from_directory_path(dir).unwrap().as_ref(),
+        )
+    }
+
+    #[test]
+    fn test_validate_uri_valid() {
+        assert_eq!(validate_uri("s3://bucket/path").unwrap(), StorageScheme::S3);
+        assert_eq!(
+            validate_uri("oss://bucket/path").unwrap(),
+            StorageScheme::Oss
+        );
+        assert_eq!(
+            validate_uri("gs://bucket/path").unwrap(),
+            StorageScheme::Gcs
+        );
+        assert_eq!(
+            validate_uri("gcs://bucket/path").unwrap(),
+            StorageScheme::Gcs
+        );
+        assert_eq!(
+            validate_uri("azblob://container/path").unwrap(),
+            StorageScheme::Azblob
+        );
+        assert_eq!(
+            validate_uri("file:///tmp/backup").unwrap(),
+            StorageScheme::File
+        );
+    }
+
+    #[test]
+    fn test_validate_uri_invalid() {
+        // Bare paths should be rejected
+        assert!(validate_uri("/tmp/backup").is_err());
+        assert!(validate_uri("./backup").is_err());
+        assert!(validate_uri("backup").is_err());
+
+        // Unknown schemes
+        assert!(validate_uri("ftp://server/path").is_err());
+    }
+
+    #[test]
+    fn test_extract_remote_location_requires_non_empty_root() {
+        assert!(extract_remote_location("s3://bucket").is_err());
+        assert!(extract_remote_location("s3://bucket/").is_err());
+        assert!(extract_remote_location("oss://bucket").is_err());
+        assert!(extract_remote_location("gs://bucket").is_err());
+        assert!(extract_remote_location("azblob://container").is_err());
+    }
+
+    #[test]
+    fn test_extract_path_from_uri() {
+        assert_eq!(
+            extract_file_path_from_uri("file:///tmp/backup").unwrap(),
+            "/tmp/backup"
+        );
+        assert_eq!(
+            extract_file_path_from_uri("file://localhost/tmp/backup").unwrap(),
+            "/tmp/backup"
+        );
+    }
+
+    #[test]
+    fn test_extract_file_path_from_uri_rejects_file_host() {
+        assert!(extract_file_path_from_uri("file://tmp/backup").is_err());
+    }
+
+    #[tokio::test]
+    async fn test_read_manifest_reports_requested_uri() {
+        let dir = tempdir().unwrap();
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let storage = OpenDalStorage::from_file_uri(&uri).unwrap();
+
+        let error = storage.read_manifest().await.unwrap_err().to_string();
+
+        assert!(error.contains(uri.as_str()));
+    }
+
+    #[tokio::test]
+    async fn test_manifest_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        let manifest = Manifest::new_full(
+            "greptime".to_string(),
+            vec!["public".to_string()],
+            TimeRange::unbounded(),
+            DataFormat::Parquet,
+        );
+
+        storage.write_manifest(&manifest).await.unwrap();
+        let loaded = storage.read_manifest().await.unwrap();
+
+        assert_eq!(loaded.catalog, manifest.catalog);
+        assert_eq!(loaded.schemas, manifest.schemas);
+        assert_eq!(loaded.schema_only, manifest.schema_only);
+        assert_eq!(loaded.format, manifest.format);
+        assert_eq!(loaded.snapshot_id, manifest.snapshot_id);
+    }
+
+    #[tokio::test]
+    async fn test_schema_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        let mut snapshot = SchemaSnapshot::new();
+        snapshot.add_schema(SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "test_db".to_string(),
+            options: HashMap::from([("ttl".to_string(), "7d".to_string())]),
+        });
+
+        storage.write_schema(&snapshot).await.unwrap();
+        let loaded = storage.read_schema().await.unwrap();
+
+        assert_eq!(loaded, snapshot);
+    }
+
+    #[tokio::test]
+    async fn test_text_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+        let content = "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX);";
+
+        storage
+            .write_text("schema/ddl/public.sql", content)
+            .await
+            .unwrap();
+        let loaded = storage.read_text("schema/ddl/public.sql").await.unwrap();
+
+        assert_eq!(loaded, content);
+    }
+
+    #[tokio::test]
+    async fn test_read_text_rejects_invalid_utf8() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        storage
+            .write_file("schema/ddl/public.sql", vec![0xff, 0xfe, 0xfd])
+            .await
+            .unwrap();
+
+        let error = storage
+            .read_text("schema/ddl/public.sql")
+            .await
+            .unwrap_err();
+        assert!(error.to_string().contains("UTF-8"));
+    }
+
+    #[tokio::test]
+    async fn test_exists_follows_manifest_presence() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        assert!(!storage.exists().await.unwrap());
+
+        storage
+            .write_manifest(&Manifest::new_schema_only(
+                "greptime".to_string(),
+                vec!["public".to_string()],
+            ))
+            .await
+            .unwrap();
+
+        assert!(storage.exists().await.unwrap());
+    }
+
+    #[tokio::test]
+    async fn test_delete_snapshot_only_removes_rooted_contents() {
+        let parent = tempdir().unwrap();
+        let snapshot_root = parent.path().join("snapshot");
+        let sibling = parent.path().join("sibling");
+        std::fs::create_dir_all(&snapshot_root).unwrap();
+        std::fs::create_dir_all(&sibling).unwrap();
+        std::fs::write(snapshot_root.join("manifest.json"), b"{}").unwrap();
+        std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
+
+        let storage = make_storage_with_rooted_fs(&snapshot_root);
+        storage.delete_snapshot().await.unwrap();
+
+        assert!(!snapshot_root.join("manifest.json").exists());
+        assert!(sibling.join("keep.txt").exists());
+    }
+}
diff --git a/src/cli/src/data/sql.rs b/src/cli/src/data/sql.rs
new file mode 100644
index 0000000000..7de4206b26
--- /dev/null
+++ b/src/cli/src/data/sql.rs
@@ -0,0 +1,40 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared SQL escaping helpers for CLI-generated statements.
+
+pub(crate) fn escape_sql_literal(value: &str) -> String {
+    value.split('\'').collect::<Vec<_>>().join("''")
+}
+
+pub(crate) fn escape_sql_identifier(value: &str) -> String {
+    value.split('"').collect::<Vec<_>>().join("\"\"")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_escape_sql_literal_escapes_single_quotes() {
+        assert_eq!(escape_sql_literal("test_db"), "test_db");
+        assert_eq!(escape_sql_literal("te'st"), "te''st");
+    }
+
+    #[test]
+    fn test_escape_sql_identifier_escapes_double_quotes() {
+        assert_eq!(escape_sql_identifier("test_db"), "test_db");
+        assert_eq!(escape_sql_identifier(r#"te"st"#), r#"te""st"#);
+    }
+}
diff --git a/src/cli/src/database.rs b/src/cli/src/database.rs
index db98c38e38..fa3f6faefb 100644
--- a/src/cli/src/database.rs
+++ b/src/cli/src/database.rs
@@ -36,6 +36,7 @@ pub struct DatabaseClient {
     auth_header: Option<String>,
     timeout: Duration,
     proxy: Option<reqwest::Proxy>,
+    no_proxy: bool,
 }
 
 pub fn parse_proxy_opts(
@@ -61,6 +62,7 @@ impl DatabaseClient {
         auth_basic: Option<String>,
         timeout: Duration,
         proxy: Option<reqwest::Proxy>,
+        no_proxy: bool,
     ) -> Self {
         let auth_header = if let Some(basic) = auth_basic {
             let encoded = general_purpose::STANDARD.encode(basic);
@@ -69,7 +71,9 @@ impl DatabaseClient {
             None
         };
 
-        if let Some(ref proxy) = proxy {
+        if no_proxy {
+            common_telemetry::info!("Proxy disabled");
+        } else if let Some(ref proxy) = proxy {
             common_telemetry::info!("Using proxy: {:?}", proxy);
         } else {
             common_telemetry::info!("Using system proxy(if any)");
@@ -81,6 +85,7 @@ impl DatabaseClient {
             auth_header,
             timeout,
             proxy,
+            no_proxy,
         }
     }
 
@@ -95,12 +100,14 @@ impl DatabaseClient {
             ("db", format!("{}-{}", self.catalog, schema)),
             ("sql", sql.to_string()),
         ];
-        let client = self
-            .proxy
-            .clone()
-            .map(|proxy| reqwest::Client::builder().proxy(proxy).build())
-            .unwrap_or_else(|| Ok(reqwest::Client::new()))
-            .context(BuildClientSnafu)?;
+        let mut builder = reqwest::Client::builder();
+        if let Some(proxy) = self.proxy.clone() {
+            builder = builder.proxy(proxy);
+        }
+        if self.no_proxy {
+            builder = builder.no_proxy();
+        }
+        let client = builder.build().context(BuildClientSnafu)?;
         let mut request = client
             .post(&url)
             .form(&params)
diff --git a/src/cli/src/lib.rs b/src/cli/src/lib.rs
index acf5df4086..4305da9c8f 100644
--- a/src/cli/src/lib.rs
+++ b/src/cli/src/lib.rs
@@ -29,7 +29,7 @@ pub use database::DatabaseClient;
 use error::Result;
 
 pub use crate::bench::BenchTableMetadataCommand;
-pub use crate::data::DataCommand;
+pub use crate::data::{DataCommand, export_v2, import_v2};
 pub use crate::metadata::MetadataCommand;
 
 #[async_trait]

From f034255fe6d7ce9d3b81e08c7a91e7f960dda96c Mon Sep 17 00:00:00 2001
From: Ruihang Xia <waynestxia@gmail.com>
Date: Fri, 20 Mar 2026 06:40:52 +0800
Subject: [PATCH 23/42] perf: support group accumulators for state wrapper
 (#7826)

* perf: support group accumulators for state wrapper

* new tests and avoid clone

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
---
 src/common/function/src/aggrs/aggr_wrapper.rs | 153 +++++++++++++++++-
 .../function/src/aggrs/aggr_wrapper/tests.rs  | 126 ++++++++++++++-
 2 files changed, 270 insertions(+), 9 deletions(-)

diff --git a/src/common/function/src/aggrs/aggr_wrapper.rs b/src/common/function/src/aggrs/aggr_wrapper.rs
index 3780d39582..6242ab9454 100644
--- a/src/common/function/src/aggrs/aggr_wrapper.rs
+++ b/src/common/function/src/aggrs/aggr_wrapper.rs
@@ -25,7 +25,7 @@
 use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
-use arrow::array::StructArray;
+use arrow::array::{ArrayRef, BooleanArray, StructArray};
 use arrow_schema::{FieldRef, Fields};
 use common_telemetry::debug;
 use datafusion::functions_aggregate::all_default_aggregate_functions;
@@ -38,8 +38,8 @@ use datafusion_common::{Column, ScalarValue};
 use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams};
 use datafusion_expr::function::StateFieldsArgs;
 use datafusion_expr::{
-    Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, Expr, ExprSchemable, LogicalPlan,
-    Signature,
+    Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, EmitTo, Expr, ExprSchemable,
+    GroupsAccumulator, LogicalPlan, Signature,
 };
 use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
 use datatypes::arrow::datatypes::{DataType, Field};
@@ -322,6 +322,14 @@ impl StateWrapper {
             );
         })
     }
+
+    fn fix_inner_acc_args<'b>(
+        &self,
+        mut acc_args: datafusion_expr::function::AccumulatorArgs<'b>,
+    ) -> datafusion_common::Result<datafusion_expr::function::AccumulatorArgs<'b>> {
+        acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?;
+        Ok(acc_args)
+    }
 }
 
 impl AggregateUDFImpl for StateWrapper {
@@ -331,15 +339,32 @@ impl AggregateUDFImpl for StateWrapper {
     ) -> datafusion_common::Result<Box<dyn Accumulator>> {
         // fix and recover proper acc args for the original aggregate function.
         let state_type = acc_args.return_type().clone();
-        let inner = {
-            let mut new_acc_args = acc_args.clone();
-            new_acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?;
-            self.inner.accumulator(new_acc_args)?
-        };
+        let inner = self.inner.accumulator(self.fix_inner_acc_args(acc_args)?)?;
 
         Ok(Box::new(StateAccum::new(inner, state_type)?))
     }
 
+    fn groups_accumulator_supported(
+        &self,
+        acc_args: datafusion_expr::function::AccumulatorArgs,
+    ) -> bool {
+        self.fix_inner_acc_args(acc_args)
+            .map(|args| self.inner.inner().groups_accumulator_supported(args))
+            .unwrap_or(false)
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        acc_args: datafusion_expr::function::AccumulatorArgs,
+    ) -> datafusion_common::Result<Box<dyn GroupsAccumulator>> {
+        let state_type = acc_args.return_type().clone();
+        let inner = self
+            .inner
+            .inner()
+            .create_groups_accumulator(self.fix_inner_acc_args(acc_args)?)?;
+        Ok(Box::new(StateGroupsAccum::new(inner, state_type)?))
+    }
+
     fn as_any(&self) -> &dyn std::any::Any {
         self
     }
@@ -462,6 +487,118 @@ pub struct StateAccum {
     state_fields: Fields,
 }
 
+pub struct StateGroupsAccum {
+    inner: Box<dyn GroupsAccumulator>,
+    state_fields: Fields,
+}
+
+impl StateGroupsAccum {
+    /// Wraps `inner`, remembering the fields of the struct-typed state.
+    fn new(
+        inner: Box<dyn GroupsAccumulator>,
+        state_type: DataType,
+    ) -> datafusion_common::Result<Self> {
+        match state_type {
+            // These fields become the schema of the struct built in `wrap_state_arrays`.
+            DataType::Struct(state_fields) => Ok(Self { inner, state_fields }),
+            other => Err(datafusion_common::DataFusionError::Internal(format!(
+                "Expected a struct type for state, got: {:?}",
+                other
+            ))),
+        }
+    }
+
+    fn wrap_state_arrays(&self, arrays: Vec<ArrayRef>) -> datafusion_common::Result<ArrayRef> {
+        let array_type = arrays
+            .iter()
+            .map(|array| array.data_type().clone())
+            .collect::<Vec<_>>();
+        let expected_type = self
+            .state_fields
+            .iter()
+            .map(|field| field.data_type().clone())
+            .collect::<Vec<_>>();
+        if array_type != expected_type {
+            debug!(
+                "State mismatch, expected: {}, got: {} for expected fields: {:?} and given array types: {:?}",
+                self.state_fields.len(),
+                arrays.len(),
+                self.state_fields,
+                array_type,
+            );
+            let guess_schema = arrays
+                .iter()
+                .enumerate()
+                .map(|(index, array)| {
+                    Field::new(
+                        format!("col_{index}[mismatch_state]").as_str(),
+                        array.data_type().clone(),
+                        true,
+                    )
+                })
+                .collect::<Fields>();
+            let array = StructArray::try_new(guess_schema, arrays, None)?;
+            return Ok(Arc::new(array));
+        }
+
+        Ok(Arc::new(StructArray::try_new(
+            self.state_fields.clone(),
+            arrays,
+            None,
+        )?))
+    }
+}
+
+impl GroupsAccumulator for StateGroupsAccum {
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> datafusion_common::Result<()> {
+        self.inner
+            .update_batch(values, group_indices, opt_filter, total_num_groups)
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> datafusion_common::Result<()> {
+        self.inner
+            .merge_batch(values, group_indices, opt_filter, total_num_groups)
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> datafusion_common::Result<ArrayRef> {
+        let state = self.inner.state(emit_to)?;
+        self.wrap_state_arrays(state)
+    }
+
+    fn state(&mut self, emit_to: EmitTo) -> datafusion_common::Result<Vec<ArrayRef>> {
+        self.inner.state(emit_to)
+    }
+
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> datafusion_common::Result<Vec<ArrayRef>> {
+        self.inner.convert_to_state(values, opt_filter)
+    }
+
+    fn supports_convert_to_state(&self) -> bool {
+        self.inner.supports_convert_to_state()
+    }
+
+    fn size(&self) -> usize {
+        self.inner.size()
+    }
+}
+
 impl StateAccum {
     pub fn new(
         inner: Box<dyn Accumulator>,
diff --git a/src/common/function/src/aggrs/aggr_wrapper/tests.rs b/src/common/function/src/aggrs/aggr_wrapper/tests.rs
index 8821b9fd24..de3a77df6b 100644
--- a/src/common/function/src/aggrs/aggr_wrapper/tests.rs
+++ b/src/common/function/src/aggrs/aggr_wrapper/tests.rs
@@ -40,10 +40,13 @@ use datafusion_common::arrow::array::AsArray;
 use datafusion_common::arrow::datatypes::{Float64Type, UInt64Type};
 use datafusion_common::{Column, TableReference};
 use datafusion_expr::expr::{AggregateFunction, NullTreatment};
+use datafusion_expr::function::AccumulatorArgs;
 use datafusion_expr::{
-    Aggregate, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, TableScan, lit,
+    Aggregate, AggregateUDFImpl, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr,
+    TableScan, lit,
 };
 use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+use datafusion_physical_expr::expressions::col;
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
 use datatypes::arrow_array::StringArray;
 use futures::{Stream, StreamExt as _};
@@ -256,6 +259,38 @@ fn dummy_table_scan_with_ts() -> LogicalPlan {
     )
 }
 
+fn create_avg_state_groups_accumulator() -> Box<dyn GroupsAccumulator> {
+    let state_wrapper = StateWrapper::new((*avg_udaf()).clone()).unwrap();
+    let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
+        "number",
+        DataType::Float64,
+        true,
+    )]));
+    let expr = col("number", &schema).unwrap();
+    let expr_field = expr.return_field(&schema).unwrap();
+    let return_field = Arc::new(Field::new(
+        "__avg_state(number)",
+        state_wrapper.return_type(&[DataType::Float64]).unwrap(),
+        true,
+    ));
+    let exprs = [expr];
+    let expr_fields = [expr_field];
+    let acc_args = AccumulatorArgs {
+        return_field,
+        schema: &schema,
+        ignore_nulls: false,
+        order_bys: &[],
+        is_reversed: false,
+        name: "__avg_state(number)",
+        is_distinct: false,
+        exprs: &exprs,
+        expr_fields: &expr_fields,
+    };
+
+    assert!(state_wrapper.groups_accumulator_supported(acc_args.clone()));
+    state_wrapper.create_groups_accumulator(acc_args).unwrap()
+}
+
 #[tokio::test]
 async fn test_sum_udaf() {
     let ctx = SessionContext::new();
@@ -796,6 +831,95 @@ async fn test_last_value_order_by_udaf() {
     assert_eq!(merge_eval_res, ScalarValue::Int64(Some(4)));
 }
 
+#[test]
+fn test_avg_state_groups_accumulator_evaluate() {
+    let mut state_accum = create_avg_state_groups_accumulator();
+    let values = vec![Arc::new(Float64Array::from(vec![
+        Some(1.0),
+        Some(2.0),
+        None,
+        Some(3.0),
+        Some(4.0),
+        Some(5.0),
+    ])) as ArrayRef];
+    let group_indices = vec![0, 1, 0, 0, 1, 2];
+
+    state_accum
+        .update_batch(&values, &group_indices, None, 3)
+        .unwrap();
+
+    let result = state_accum.evaluate(EmitTo::All).unwrap();
+    let result = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+    assert_eq!(
+        result
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap(),
+        &UInt64Array::from(vec![2, 2, 1])
+    );
+    assert_eq!(
+        result
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap(),
+        &Float64Array::from(vec![4.0, 6.0, 5.0])
+    );
+}
+
+#[test]
+fn test_avg_state_groups_accumulator_state_merge_evaluate() {
+    let mut source_accum = create_avg_state_groups_accumulator();
+    let source_values = vec![Arc::new(Float64Array::from(vec![
+        Some(1.0),
+        Some(2.0),
+        None,
+        Some(3.0),
+        Some(4.0),
+        Some(5.0),
+    ])) as ArrayRef];
+    let source_group_indices = vec![0, 1, 0, 0, 1, 2];
+
+    source_accum
+        .update_batch(&source_values, &source_group_indices, None, 3)
+        .unwrap();
+    let source_state = source_accum.state(EmitTo::All).unwrap();
+
+    let mut merged_accum = create_avg_state_groups_accumulator();
+    let merged_values =
+        vec![Arc::new(Float64Array::from(vec![Some(10.0), Some(20.0), Some(30.0)])) as ArrayRef];
+    let merged_group_indices = vec![0, 1, 2];
+
+    merged_accum
+        .update_batch(&merged_values, &merged_group_indices, None, 3)
+        .unwrap();
+    merged_accum
+        .merge_batch(&source_state, &[1, 2, 0], None, 3)
+        .unwrap();
+
+    let result = merged_accum.evaluate(EmitTo::All).unwrap();
+    let result = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+    assert_eq!(
+        result
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap(),
+        &UInt64Array::from(vec![2, 3, 3])
+    );
+    assert_eq!(
+        result
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap(),
+        &Float64Array::from(vec![15.0, 24.0, 36.0])
+    );
+}
+
 /// For testing whether the UDAF state fields are correctly implemented.
 /// esp. for our own custom UDAF's state fields.
 /// By compare eval results before and after split to state/merge functions.

From d14817bfa6ecf3a7f6a4cf98817c1afd42a2a8c5 Mon Sep 17 00:00:00 2001
From: Ning Sun <sunng@protonmail.com>
Date: Fri, 20 Mar 2026 11:58:39 +0800
Subject: [PATCH 24/42] fix: resolve optimization issue for extended query
 (#7824)

* fix: resolve optimization issue for extended query

* fix: type cast from subquery

* chore: update error information in sqlness

* chore: switch to released pgwire

* refactor: remove optimize function completely

* chore: add more tests

* test: attempt to fix the fuzz issue

* fix: try to resolve the test issue
---
 Cargo.lock                                    |  6 +-
 .../information_schema/region_peers.rs        |  2 +-
 src/query/src/datafusion.rs                   | 56 ++++---------------
 src/query/src/planner.rs                      | 42 +++++++++++++-
 src/servers/Cargo.toml                        |  2 +-
 tests-fuzz/src/utils/partition.rs             |  2 +-
 .../migration/fuzz_migrate_mito_regions.rs    | 11 +++-
 .../common/prepare/mysql_prepare.result       | 21 ++++++-
 .../common/prepare/mysql_prepare.sql          |  9 +++
 9 files changed, 94 insertions(+), 57 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1b2a44d0e4..073ae03525 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
  "cfg-if",
- "windows-targets 0.48.5",
+ "windows-targets 0.52.6",
 ]
 
 [[package]]
@@ -9620,9 +9620,9 @@ dependencies = [
 
 [[package]]
 name = "pgwire"
-version = "0.38.0"
+version = "0.38.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89d5e5a60d3f6e40c91f6a2a7f8d09665e636272bd5611977253559b6651aabb"
+checksum = "f2a798d130b8975a566c2cf6d8955746e1f09a9ee2c3ff2e6020a2c6528c5bd1"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
diff --git a/src/catalog/src/system_schema/information_schema/region_peers.rs b/src/catalog/src/system_schema/information_schema/region_peers.rs
index 5bc91d207e..b1438ef53d 100644
--- a/src/catalog/src/system_schema/information_schema/region_peers.rs
+++ b/src/catalog/src/system_schema/information_schema/region_peers.rs
@@ -267,7 +267,7 @@ impl InformationSchemaRegionPeersBuilder {
             ];
 
             if !predicates.eval(&row) {
-                return;
+                continue;
             }
 
             self.table_catalogs.push(Some(table_catalog));
diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs
index dc84c4afac..e2e577debf 100644
--- a/src/query/src/datafusion.rs
+++ b/src/query/src/datafusion.rs
@@ -354,25 +354,6 @@ impl DatafusionQueryEngine {
         Ok(physical_plan)
     }
 
-    #[tracing::instrument(skip_all)]
-    pub fn optimize(
-        &self,
-        context: &QueryEngineContext,
-        plan: &LogicalPlan,
-    ) -> Result<LogicalPlan> {
-        let _timer = metrics::OPTIMIZE_LOGICAL_ELAPSED.start_timer();
-
-        // Optimized by extension rules
-        let optimized_plan = self
-            .state
-            .optimize_by_extension_rules(plan.clone(), context)?;
-
-        // Optimized by datafusion optimizer
-        let optimized_plan = self.state.session_state().optimize(&optimized_plan)?;
-
-        Ok(optimized_plan)
-    }
-
     #[tracing::instrument(skip_all)]
     fn optimize_physical_plan(
         &self,
@@ -444,32 +425,17 @@ impl QueryEngine for DatafusionQueryEngine {
     async fn describe(
         &self,
         plan: LogicalPlan,
-        query_ctx: QueryContextRef,
+        _query_ctx: QueryContextRef,
     ) -> Result<DescribeResult> {
-        let ctx = self.engine_context(query_ctx);
-        if let Ok(optimised_plan) = self.optimize(&ctx, &plan) {
-            let schema = optimised_plan
-                .schema()
-                .clone()
-                .try_into()
-                .context(ConvertSchemaSnafu)?;
-            Ok(DescribeResult {
-                schema,
-                logical_plan: optimised_plan,
-            })
-        } else {
-            // Table's like those in information_schema cannot be optimized when
-            // it contains parameters. So we fallback to original plans.
-            let schema = plan
-                .schema()
-                .clone()
-                .try_into()
-                .context(ConvertSchemaSnafu)?;
-            Ok(DescribeResult {
-                schema,
-                logical_plan: plan,
-            })
-        }
+        let schema = plan
+            .schema()
+            .clone()
+            .try_into()
+            .context(ConvertSchemaSnafu)?;
+        Ok(DescribeResult {
+            schema,
+            logical_plan: plan,
+        })
     }
 
     async fn execute(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result<Output> {
@@ -924,7 +890,7 @@ mod tests {
             )
         );
         assert_eq!(
-            "Limit: skip=0, fetch=20\n  Aggregate: groupBy=[[]], aggr=[[sum(CAST(numbers.number AS UInt64))]]\n    TableScan: numbers projection=[number]",
+            "Limit: skip=0, fetch=20\n  Projection: sum(numbers.number)\n    Aggregate: groupBy=[[]], aggr=[[sum(numbers.number)]]\n      TableScan: numbers",
             format!("{}", logical_plan.display_indent())
         );
     }
diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs
index 44c9bc3956..f522dc567a 100644
--- a/src/query/src/planner.rs
+++ b/src/query/src/planner.rs
@@ -28,6 +28,7 @@ use datafusion::execution::context::SessionState;
 use datafusion::sql::planner::PlannerContext;
 use datafusion_common::ToDFSchema;
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_expr::expr::{Exists, InSubquery};
 use datafusion_expr::{
     Analyze, Explain, ExplainFormat, Expr as DfExpr, LogicalPlan, LogicalPlanBuilder, PlanType,
     ToStringifiedPlan, col,
@@ -424,9 +425,20 @@ impl DfLogicalPlanner {
         let mut placeholder_types = HashMap::new();
         let mut casted_placeholders = HashSet::new();
 
+        Self::extract_from_plan(plan, &mut placeholder_types, &mut casted_placeholders)?;
+
+        Ok(placeholder_types)
+    }
+
+    fn extract_from_plan(
+        plan: &LogicalPlan,
+        placeholder_types: &mut HashMap<String, Option<DataType>>,
+        casted_placeholders: &mut HashSet<String>,
+    ) -> Result<()> {
         plan.apply(|node| {
             for expr in node.expressions() {
                 let _ = expr.apply(|e| {
+                    // Handle casted placeholders
                     if let DfExpr::Cast(cast) = e
                         && let DfExpr::Placeholder(ph) = &*cast.expr
                     {
@@ -434,6 +446,7 @@ impl DfLogicalPlanner {
                         casted_placeholders.insert(ph.id.clone());
                     }
 
+                    // Handle bare (non-casted) placeholders
                     if let DfExpr::Placeholder(ph) = e
                         && !casted_placeholders.contains(&ph.id)
                         && !placeholder_types.contains_key(&ph.id)
@@ -441,13 +454,26 @@ impl DfLogicalPlanner {
                         placeholder_types.insert(ph.id.clone(), None);
                     }
 
+                    // Recurse into subquery plans embedded in expressions
+                    match e {
+                        DfExpr::Exists(Exists { subquery, .. })
+                        | DfExpr::InSubquery(InSubquery { subquery, .. })
+                        | DfExpr::ScalarSubquery(subquery) => {
+                            Self::extract_from_plan(
+                                &subquery.subquery,
+                                placeholder_types,
+                                casted_placeholders,
+                            )?;
+                        }
+                        _ => {}
+                    }
+
                     Ok(TreeNodeRecursion::Continue)
                 });
             }
             Ok(TreeNodeRecursion::Continue)
         })?;
-
-        Ok(placeholder_types)
+        Ok(())
     }
 
     /// Gets inferred parameter types from a logical plan.
@@ -619,4 +645,16 @@ mod tests {
         assert_eq!(type_2, &Some(DataType::Utf8));
         assert_eq!(type_3, &Some(DataType::Int32));
     }
+
+    #[tokio::test]
+    async fn test_get_inferred_parameter_types_subquery() {
+        let plan = parse_sql_to_plan(
+            r#"SELECT * FROM test WHERE id = (SELECT id FROM test CROSS JOIN (SELECT parse_ident($1::TEXT) AS parts) p LIMIT 1)"#,
+        ).await;
+        let types = DfLogicalPlanner::get_inferred_parameter_types(&plan).unwrap();
+
+        assert_eq!(types.len(), 1);
+        let type_1 = types.get("$1").unwrap();
+        assert_eq!(type_1, &Some(DataType::Utf8));
+    }
 }
diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml
index e75192c9ba..8b64a256e7 100644
--- a/src/servers/Cargo.toml
+++ b/src/servers/Cargo.toml
@@ -89,7 +89,7 @@ operator.workspace = true
 otel-arrow-rust.workspace = true
 parking_lot.workspace = true
 pg_interval = { version = "0.5.2", package = "pg_interval_2" }
-pgwire = { version = "0.38", default-features = false, features = [
+pgwire = { version = "0.38.1", default-features = false, features = [
     "server-api-ring",
     "pg-ext-types",
 ] }
diff --git a/tests-fuzz/src/utils/partition.rs b/tests-fuzz/src/utils/partition.rs
index d3dc30061d..89a684326b 100644
--- a/tests-fuzz/src/utils/partition.rs
+++ b/tests-fuzz/src/utils/partition.rs
@@ -36,7 +36,7 @@ pub struct PartitionCount {
 }
 
 pub async fn count_partitions(db: &MySqlPool, datanode_id: u64) -> Result<PartitionCount> {
-    let sql = "select count(1) as count from information_schema.region_peers where peer_id == ?";
+    let sql = "select count(1) as count from information_schema.region_peers where peer_id = ?";
     sqlx::query_as::<_, PartitionCount>(sql)
         .bind(datanode_id)
         .fetch_one(db)
diff --git a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs
index c8ebbb54af..17cbfb9251 100644
--- a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs
+++ b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs
@@ -261,13 +261,18 @@ async fn migrate_regions(ctx: &FuzzContext, migrations: &[Migration]) -> Result<
                     {
                         let output = procedure_state(&greptime, &procedure_id).await;
                         info!("Checking procedure: {procedure_id}, output: {output}");
-                        (fetch_partition(&greptime, region_id).await.unwrap(), output)
+                        (fetch_partition(&greptime, region_id).await.ok(), output)
                     }
                 })
             },
             |(partition, output)| {
-                info!("Region: {region_id},  datanode: {}", partition.datanode_id);
-                partition.datanode_id == migration.to_peer && output.contains("Done")
+                if let Some(partition) = partition {
+                    info!("Region: {region_id},  datanode: {}", partition.datanode_id);
+                    partition.datanode_id == migration.to_peer && output.contains("Done")
+                } else {
+                    info!("Region: {region_id}, partition not found yet");
+                    false
+                }
             },
             Duration::from_secs(5),
         )
diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.result b/tests/cases/standalone/common/prepare/mysql_prepare.result
index abc267b50e..5ef242a891 100644
--- a/tests/cases/standalone/common/prepare/mysql_prepare.result
+++ b/tests/cases/standalone/common/prepare/mysql_prepare.result
@@ -42,7 +42,7 @@ affected_rows: 0
 -- SQLNESS PROTOCOL MYSQL
 EXECUTE stmt USING 'a';
 
-Failed to execute query, err: MySqlError { ERROR 1815 (HY000): (EngineExecuteQuery): Cast error: Cannot cast string 'a' to value of Int32 type }
+Failed to execute query, err: MySqlError { ERROR 1210 (HY000): (InvalidArguments): Invalid request parameter: Unable to convert a to datatype Int32(Int32Type) }
 
 -- SQLNESS PROTOCOL MYSQL
 DEALLOCATE stmt;
@@ -124,6 +124,25 @@ DEALLOCATE stmt;
 
 affected_rows: 0
 
+-- SQLNESS PROTOCOL MYSQL
+PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?';
+
+affected_rows: 0
+
+-- SQLNESS PROTOCOL MYSQL
+EXECUTE stmt USING 'cake';
+
++------------+--------------+
+| table_name | table_schema |
++------------+--------------+
+| cake       | public       |
++------------+--------------+
+
+-- SQLNESS PROTOCOL MYSQL
+DEALLOCATE stmt;
+
+affected_rows: 0
+
 -- SQLNESS PROTOCOL MYSQL
 DROP TABLE cake;
 
diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.sql b/tests/cases/standalone/common/prepare/mysql_prepare.sql
index 8e80a0a867..e96e945f88 100644
--- a/tests/cases/standalone/common/prepare/mysql_prepare.sql
+++ b/tests/cases/standalone/common/prepare/mysql_prepare.sql
@@ -72,5 +72,14 @@ EXECUTE stmt USING 'happy', 42, 0;
 -- SQLNESS PROTOCOL MYSQL
 DEALLOCATE stmt;
 
+-- SQLNESS PROTOCOL MYSQL
+PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?';
+
+-- SQLNESS PROTOCOL MYSQL
+EXECUTE stmt USING 'cake';
+
+-- SQLNESS PROTOCOL MYSQL
+DEALLOCATE stmt;
+
 -- SQLNESS PROTOCOL MYSQL
 DROP TABLE cake;

From 805536aed1fc17ba9ea83f522a9413030972ae46 Mon Sep 17 00:00:00 2001
From: jeremyhi <jiachun_feng@proton.me>
Date: Fri, 20 Mar 2026 01:19:41 -0700
Subject: [PATCH 25/42] fix: windows file path (#7839)

Signed-off-by: jeremyhi <fengjiachun@gmail.com>
---
 src/cli/src/data/snapshot_storage.rs | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs
index b6ff1c9222..50c8734a67 100644
--- a/src/cli/src/data/snapshot_storage.rs
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -137,7 +137,16 @@ fn extract_file_path_from_uri(uri: &str) -> Result<String> {
             reason: "file:// URI must use an absolute path like file:///tmp/backup",
         }
         .fail(),
-        _ => Ok(url.path().to_string()),
+        _ => url
+            .to_file_path()
+            .map(|path| path.to_string_lossy().into_owned())
+            .map_err(|_| {
+                InvalidUriSnafu {
+                    uri,
+                    reason: "file:// URI must use a valid absolute filesystem path",
+                }
+                .build()
+            }),
     }
 }
 
@@ -447,6 +456,7 @@ impl SnapshotStorage for OpenDalStorage {
 #[cfg(test)]
 mod tests {
     use std::collections::HashMap;
+    use std::path::Path;
 
     use object_store::ObjectStore;
     use object_store::services::Fs;
@@ -512,8 +522,9 @@ mod tests {
         assert!(extract_remote_location("azblob://container").is_err());
     }
 
+    #[cfg(not(windows))]
     #[test]
-    fn test_extract_path_from_uri() {
+    fn test_extract_path_from_uri_unix_examples() {
         assert_eq!(
             extract_file_path_from_uri("file:///tmp/backup").unwrap(),
             "/tmp/backup"
@@ -529,6 +540,15 @@ mod tests {
         assert!(extract_file_path_from_uri("file://tmp/backup").is_err());
     }
 
+    #[test]
+    fn test_extract_file_path_from_uri_round_trips_directory_url() {
+        let dir = tempdir().unwrap();
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let path = extract_file_path_from_uri(&uri).unwrap();
+
+        assert_eq!(Path::new(&path), dir.path());
+    }
+
     #[tokio::test]
     async fn test_read_manifest_reports_requested_uri() {
         let dir = tempdir().unwrap();

From 72f289df503d9c4496d383362f35de152775e489 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Mon, 23 Mar 2026 15:12:39 +0800
Subject: [PATCH 26/42] chore: remove GrpcQueryHandler::put_record_batch
 (#7844)

chore: remove GrpcQueryHandler::put_record_batch; use GrpcQueryHandler::handle_put_record_batch_stream instead

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
---
 src/frontend/src/instance/grpc.rs     | 57 ---------------------------
 src/servers/src/query_handler/grpc.rs | 11 +-----
 src/servers/tests/mod.rs              | 10 -----
 3 files changed, 1 insertion(+), 77 deletions(-)

diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs
index c4191145f8..70ff50fadc 100644
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -27,7 +27,6 @@ use api::v1::{
 use async_stream::try_stream;
 use async_trait::async_trait;
 use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
-use common_base::AffectedRows;
 use common_error::ext::BoxedError;
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
@@ -260,62 +259,6 @@ impl GrpcQueryHandler for Instance {
             .context(server_error::ExecuteGrpcQuerySnafu)
     }
 
-    async fn put_record_batch(
-        &self,
-        request: servers::grpc::flight::PutRecordBatchRequest,
-        table_ref: &mut Option<TableRef>,
-        ctx: QueryContextRef,
-    ) -> server_error::Result<AffectedRows> {
-        let result: Result<AffectedRows> = async {
-            let table = if let Some(table) = table_ref {
-                table.clone()
-            } else {
-                let table = self
-                    .catalog_manager()
-                    .table(
-                        &request.table_name.catalog_name,
-                        &request.table_name.schema_name,
-                        &request.table_name.table_name,
-                        None,
-                    )
-                    .await
-                    .context(CatalogSnafu)?
-                    .with_context(|| TableNotFoundSnafu {
-                        table_name: request.table_name.to_string(),
-                    })?;
-                *table_ref = Some(table.clone());
-                table
-            };
-
-            let interceptor_ref = self.plugins.get::<GrpcQueryInterceptorRef<Error>>();
-            let interceptor = interceptor_ref.as_ref();
-            interceptor.pre_bulk_insert(table.clone(), ctx.clone())?;
-
-            self.plugins
-                .get::<PermissionCheckerRef>()
-                .as_ref()
-                .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
-                .context(PermissionSnafu)?;
-
-            // do we check limit for bulk insert?
-
-            self.inserter
-                .handle_bulk_insert(
-                    table,
-                    request.flight_data,
-                    request.record_batch,
-                    request.schema_bytes,
-                )
-                .await
-                .context(TableOperationSnafu)
-        }
-        .await;
-
-        result
-            .map_err(BoxedError::new)
-            .context(server_error::ExecuteGrpcRequestSnafu)
-    }
-
     fn handle_put_record_batch_stream(
         &self,
         stream: servers::grpc::flight::PutRecordBatchRequestStream,
diff --git a/src/servers/src/query_handler/grpc.rs b/src/servers/src/query_handler/grpc.rs
index 67d8b3890e..d66a76464e 100644
--- a/src/servers/src/query_handler/grpc.rs
+++ b/src/servers/src/query_handler/grpc.rs
@@ -17,15 +17,13 @@ use std::sync::Arc;
 
 use api::v1::greptime_request::Request;
 use async_trait::async_trait;
-use common_base::AffectedRows;
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
 use futures::Stream;
 use session::context::QueryContextRef;
-use table::TableRef;
 
 use crate::error::Result;
-use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream};
+use crate::grpc::flight::PutRecordBatchRequestStream;
 
 pub type ServerGrpcQueryHandlerRef = Arc<dyn GrpcQueryHandler + Send + Sync>;
 
@@ -35,13 +33,6 @@ pub type RawRecordBatch = bytes::Bytes;
 pub trait GrpcQueryHandler {
     async fn do_query(&self, query: Request, ctx: QueryContextRef) -> Result<Output>;
 
-    async fn put_record_batch(
-        &self,
-        request: PutRecordBatchRequest,
-        table_ref: &mut Option<TableRef>,
-        ctx: QueryContextRef,
-    ) -> Result<AffectedRows>;
-
     fn handle_put_record_batch_stream(
         &self,
         stream: PutRecordBatchRequestStream,
diff --git a/src/servers/tests/mod.rs b/src/servers/tests/mod.rs
index e3f8f8fc79..c4f83c5e6c 100644
--- a/src/servers/tests/mod.rs
+++ b/src/servers/tests/mod.rs
@@ -18,7 +18,6 @@ use api::v1::greptime_request::Request;
 use api::v1::query_request::Query;
 use async_trait::async_trait;
 use catalog::memory::MemoryCatalogManager;
-use common_base::AffectedRows;
 use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
@@ -149,15 +148,6 @@ impl GrpcQueryHandler for DummyInstance {
         Ok(output)
     }
 
-    async fn put_record_batch(
-        &self,
-        _request: servers::grpc::flight::PutRecordBatchRequest,
-        _table_ref: &mut Option<TableRef>,
-        _ctx: QueryContextRef,
-    ) -> Result<AffectedRows> {
-        unimplemented!()
-    }
-
     fn handle_put_record_batch_stream(
         &self,
         _stream: servers::grpc::flight::PutRecordBatchRequestStream,

From 78742820891c245e277260fd3f62bd478d6fdc34 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Tue, 24 Mar 2026 03:39:57 +0800
Subject: [PATCH 27/42] feat(mito): flat scan for time series memtable (#7814)

* feat/flat-for-time-series:
 ### Commit Message

 Enhance `TimeSeriesMemtable` with Record Batch Support

 - **`time_series.rs`**:
   - Introduced `BatchToRecordBatchContext` to facilitate conversion of batch iterators to record batch iterators.
   - Added `build_record_batch` method in `TimeSeriesIterBuilder` to support record batch creation.
   - Implemented multiple test cases to validate the functionality of record batch creation, including tests for projections,
     deduplication, sequence filtering, and data correctness.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/flat-for-time-series:
 Refactor `TimeSeriesMemtable` and `TimeSeriesIterBuilder`

 - Renamed `adapter_context` to `batch_to_record_batch` in `TimeSeriesMemtable` for clarity.
 - Simplified `MemtableRangeContext` initialization by removing the `batch_to_record_batch` parameter.
 - Added `is_record_batch` method to `TimeSeriesIterBuilder` to indicate record batch status.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

* feat/flat-for-time-series:
 ### Add Time Range Filtering and Predicate Group Enhancements

 - **`memtable.rs`**: Updated `IterBuilder` to include a `time_range` parameter in the `build_record_batch` method, enhancing record batch iteration with time range filtering.
 - **`time_series.rs`**: Modified `TimeSeriesIterBuilder` to use `PredicateGroup` instead of `Predicate`, and integrated `PruneTimeIterator` for time-based filtering.
 - **`memtable_util.rs`**: Removed unused `Predicate` import, reflecting changes in predicate handling.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>

---------

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
---
 src/mito2/src/memtable.rs                |   6 +-
 src/mito2/src/memtable/bulk.rs           |   4 +
 src/mito2/src/memtable/time_series.rs    | 323 +++++++++++++++++++++--
 src/mito2/src/test_util/memtable_util.rs |   1 -
 4 files changed, 310 insertions(+), 24 deletions(-)

diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs
index 7494ec68ed..3ebfdd3628 100644
--- a/src/mito2/src/memtable.rs
+++ b/src/mito2/src/memtable.rs
@@ -537,11 +537,15 @@ pub trait IterBuilder: Send + Sync {
     }
 
     /// Returns the record batch iterator to read the range.
+    /// ## Note
+    /// Implementations should ensure the iterator yields data within given time range.
     fn build_record_batch(
         &self,
+        time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         let _metrics = metrics;
+        let _ = time_range;
         UnsupportedOperationSnafu {
             err_msg: "Record batch iterator is not supported by this memtable",
         }
@@ -700,7 +704,7 @@ impl MemtableRange {
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         if self.context.builder.is_record_batch() {
-            return self.context.builder.build_record_batch(metrics);
+            return self.context.builder.build_record_batch(time_range, metrics);
         }
 
         if let Some(context) = self.context.batch_to_record_batch.as_ref() {
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs
index 4dad4fb885..e649681b76 100644
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -34,6 +34,7 @@ fn env_usize(name: &str, default: usize) -> usize {
         .unwrap_or(default)
 }
 
+use common_time::Timestamp;
 use datatypes::arrow::datatypes::SchemaRef;
 use mito_codec::key_values::KeyValue;
 use rayon::prelude::*;
@@ -792,6 +793,7 @@ impl IterBuilder for BulkRangeIterBuilder {
 
     fn build_record_batch(
         &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         let series_count = self.part.estimated_series_count();
@@ -825,6 +827,7 @@ impl IterBuilder for MultiBulkRangeIterBuilder {
 
     fn build_record_batch(
         &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         self.part
@@ -864,6 +867,7 @@ impl IterBuilder for EncodedBulkRangeIterBuilder {
 
     fn build_record_batch(
         &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         if let Some(iter) = self
diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs
index 97f5f3c9ce..d3d00d0703 100644
--- a/src/mito2/src/memtable/time_series.rs
+++ b/src/mito2/src/memtable/time_series.rs
@@ -51,15 +51,18 @@ use crate::memtable::bulk::part::BulkPart;
 use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable;
 use crate::memtable::stats::WriteMetrics;
 use crate::memtable::{
-    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
-    MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
-    MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
+    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, BoxedRecordBatchIterator,
+    IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange,
+    MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions,
+    read_column_ids_from_projection,
 };
 use crate::metrics::{
     MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL,
     READ_STAGE_ELAPSED,
 };
 use crate::read::dedup::LastNonNullIter;
+use crate::read::prune::PruneTimeIterator;
+use crate::read::scan_region::PredicateGroup;
 use crate::read::{Batch, BatchBuilder, BatchColumn};
 use crate::region::options::MergeMode;
 
@@ -283,25 +286,20 @@ impl Memtable for TimeSeriesMemtable {
                 .map(|c| c.column_id)
                 .collect()
         };
-        let builder = Box::new(TimeSeriesIterBuilder {
-            series_set: self.series_set.clone(),
-            projection,
-            predicate: predicate.predicate().cloned(),
-            dedup: self.dedup,
-            merge_mode: self.merge_mode,
-            sequence,
-        });
-        let adapter_context = Arc::new(BatchToRecordBatchContext::new(
+        let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
             self.region_metadata.clone(),
             read_column_ids,
         ));
-        let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
-            self.id,
-            builder,
-            predicate,
-            Some(adapter_context),
-        ));
-
+        let builder = Box::new(TimeSeriesIterBuilder {
+            series_set: self.series_set.clone(),
+            projection,
+            predicate: predicate.clone(),
+            dedup: self.dedup,
+            merge_mode: self.merge_mode,
+            sequence,
+            batch_to_record_batch,
+        });
+        let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
         let range_stats = self.stats();
         let range = MemtableRange::new(context, range_stats);
         Ok(MemtableRanges {
@@ -443,7 +441,7 @@ impl SeriesSet {
     fn iter_series(
         &self,
         projection: HashSet<ColumnId>,
-        predicate: Option<Predicate>,
+        predicate: PredicateGroup,
         dedup: bool,
         merge_mode: MergeMode,
         sequence: Option<SequenceRange>,
@@ -460,7 +458,7 @@ impl SeriesSet {
             self.region_metadata.clone(),
             self.series.clone(),
             projection,
-            predicate,
+            predicate.predicate().cloned(),
             primary_key_schema,
             primary_key_datatypes,
             self.codec.clone(),
@@ -1245,10 +1243,11 @@ impl From<ValueBuilder> for Values {
 struct TimeSeriesIterBuilder {
     series_set: SeriesSet,
     projection: HashSet<ColumnId>,
-    predicate: Option<Predicate>,
+    predicate: PredicateGroup,
     dedup: bool,
     sequence: Option<SequenceRange>,
     merge_mode: MergeMode,
+    batch_to_record_batch: Arc<BatchToRecordBatchContext>,
 }
 
 impl IterBuilder for TimeSeriesIterBuilder {
@@ -1268,6 +1267,25 @@ impl IterBuilder for TimeSeriesIterBuilder {
             Ok(Box::new(iter))
         }
     }
+
+    fn is_record_batch(&self) -> bool {
+        true
+    }
+
+    fn build_record_batch(
+        &self,
+        time_range: Option<(Timestamp, Timestamp)>,
+        metrics: Option<MemScanMetrics>,
+    ) -> Result<BoxedRecordBatchIterator> {
+        let iter = self.build(metrics)?;
+        let iter: BoxedBatchIterator = if let Some(time_range) = time_range {
+            let time_filters = self.predicate.time_filters();
+            Box::new(PruneTimeIterator::new(iter, time_range, time_filters))
+        } else {
+            iter
+        };
+        Ok(self.batch_to_record_batch.adapt_iter(iter))
+    }
 }
 
 #[cfg(test)]
@@ -2014,4 +2032,265 @@ mod tests {
         all_timestamps.sort();
         assert_eq!(vec![3, 4, 5, 6, 7], all_timestamps);
     }
+
+    /// Helper to create a TimeSeriesIterBuilder from a memtable and schema.
+    fn build_iter_builder(
+        schema: &RegionMetadataRef,
+        memtable: &TimeSeriesMemtable,
+        projection: Option<&[ColumnId]>,
+        dedup: bool,
+        merge_mode: MergeMode,
+        sequence: Option<SequenceRange>,
+    ) -> TimeSeriesIterBuilder {
+        let read_column_ids = read_column_ids_from_projection(schema, projection);
+        let field_projection = if let Some(projection) = projection {
+            projection.iter().copied().collect()
+        } else {
+            schema.field_columns().map(|c| c.column_id).collect()
+        };
+        let adapter_context = Arc::new(BatchToRecordBatchContext::new(
+            schema.clone(),
+            read_column_ids,
+        ));
+        TimeSeriesIterBuilder {
+            series_set: memtable.series_set.clone(),
+            projection: field_projection,
+            predicate: PredicateGroup::default(),
+            dedup,
+            merge_mode,
+            sequence,
+            batch_to_record_batch: adapter_context,
+        }
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_basic() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let kvs = build_key_values(&schema, "hello".to_string(), 42, 10);
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let mut iter = builder.build_record_batch(None, None).unwrap();
+        let rb = iter.next().transpose().unwrap().unwrap();
+        assert_eq!(10, rb.num_rows());
+
+        let rb_schema = rb.schema();
+        let col_names: Vec<_> = rb_schema
+            .fields()
+            .iter()
+            .map(|f| f.name().as_str())
+            .collect();
+        assert_eq!(
+            col_names,
+            vec![
+                "k0",
+                "k1",
+                "v0",
+                "v1",
+                "ts",
+                "__primary_key",
+                "__sequence",
+                "__op_type",
+            ]
+        );
+
+        assert!(iter.next().is_none());
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_with_projection() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let kvs = build_key_values(&schema, "test".to_string(), 1, 5);
+        memtable.write(&kvs).unwrap();
+
+        // Project only field v0 (column_id=3) and ts (column_id=2).
+        let projection = vec![2, 3];
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            Some(&projection),
+            true,
+            MergeMode::LastRow,
+            None,
+        );
+
+        let mut iter = builder.build_record_batch(None, None).unwrap();
+        let rb = iter.next().transpose().unwrap().unwrap();
+        assert_eq!(5, rb.num_rows());
+
+        let rb_schema = rb.schema();
+        let col_names: Vec<_> = rb_schema
+            .fields()
+            .iter()
+            .map(|f| f.name().as_str())
+            .collect();
+        // Only projected columns + internal columns.
+        assert_eq!(
+            col_names,
+            vec!["v0", "ts", "__primary_key", "__sequence", "__op_type",]
+        );
+
+        assert!(iter.next().is_none());
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_multiple_series() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let kvs_a = build_key_values(&schema, "aaa".to_string(), 1, 3);
+        let kvs_b = build_key_values(&schema, "bbb".to_string(), 2, 4);
+        memtable.write(&kvs_a).unwrap();
+        memtable.write(&kvs_b).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let mut total_rows = 0;
+        for rb in iter {
+            let rb = rb.unwrap();
+            total_rows += rb.num_rows();
+            assert_eq!(8, rb.num_columns());
+        }
+        assert_eq!(7, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_dedup() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        // Write same data twice — dedup should keep only one copy per timestamp.
+        let kvs = build_key_values(&schema, "dup".to_string(), 10, 5);
+        memtable.write(&kvs).unwrap();
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(5, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_no_dedup() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, false, MergeMode::LastRow);
+
+        let kvs = build_key_values(&schema, "dup".to_string(), 10, 5);
+        memtable.write(&kvs).unwrap();
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, false, MergeMode::LastRow, None);
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(10, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_with_sequence_filter() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        // build_key_values creates a mutation with base sequence=0.
+        // Each row gets sequence = base + row_index, so 5 rows get sequences 0,1,2,3,4.
+        let kvs = build_key_values(&schema, "seq".to_string(), 1, 5);
+        memtable.write(&kvs).unwrap();
+
+        // Filter to sequence > 4 — should yield no rows.
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            None,
+            true,
+            MergeMode::LastRow,
+            Some(SequenceRange::Gt { min: 4 }),
+        );
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(0, total_rows);
+
+        // Filter to sequence <= 2 — should yield 3 rows (sequences 0, 1, 2).
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            None,
+            true,
+            MergeMode::LastRow,
+            Some(SequenceRange::LtEq { max: 2 }),
+        );
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(3, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_data_correctness() {
+        use datatypes::arrow::array::{
+            Float64Array, Int64Array, TimestampMillisecondArray, UInt8Array,
+        };
+
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let kvs = build_key_values(&schema, "check".to_string(), 7, 3);
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+
+        let mut iter = builder.build_record_batch(None, None).unwrap();
+        let rb = iter.next().transpose().unwrap().unwrap();
+        assert_eq!(3, rb.num_rows());
+
+        // Verify timestamp values.
+        let ts_col = rb
+            .column_by_name("ts")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<TimestampMillisecondArray>()
+            .unwrap();
+        let timestamps: Vec<_> = (0..ts_col.len()).map(|i| ts_col.value(i)).collect();
+        assert_eq!(vec![0, 1, 2], timestamps);
+
+        // Verify field v0 values.
+        let v0_col = rb
+            .column_by_name("v0")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        let v0_values: Vec<_> = (0..v0_col.len()).map(|i| v0_col.value(i)).collect();
+        assert_eq!(vec![0, 1, 2], v0_values);
+
+        // Verify field v1 values.
+        let v1_col = rb
+            .column_by_name("v1")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+        let v1_values: Vec<_> = (0..v1_col.len()).map(|i| v1_col.value(i)).collect();
+        assert_eq!(vec![0.0, 1.0, 2.0], v1_values);
+
+        // Verify op_type is all Put (1).
+        let op_col = rb
+            .column_by_name("__op_type")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<UInt8Array>()
+            .unwrap();
+        for i in 0..op_col.len() {
+            assert_eq!(OpType::Put as u8, op_col.value(i));
+        }
+
+        assert!(iter.next().is_none());
+    }
 }
diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs
index 58ea49fa41..8917875250 100644
--- a/src/mito2/src/test_util/memtable_util.rs
+++ b/src/mito2/src/test_util/memtable_util.rs
@@ -31,7 +31,6 @@ use store_api::metadata::{
     ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
 };
 use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange};
-use table::predicate::Predicate;
 
 use crate::error::Result;
 use crate::memtable::bulk::part::BulkPart;

From f999d5e70e3076d7c45223613a7d6465bfa07c3e Mon Sep 17 00:00:00 2001
From: Ruihang Xia <waynestxia@gmail.com>
Date: Tue, 24 Mar 2026 08:11:37 +0800
Subject: [PATCH 28/42] feat: avoid some vector-array conversions on flat
 projection (#7804)

* perf(mito2): optimize flat projection conversion

* shrink the diff size

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* apply gemini's sugg

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* nit

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
---
 src/mito2/src/read/flat_projection.rs | 78 +++++++++++++++++++++++++--
 src/mito2/src/read/projection.rs      | 23 ++++----
 src/mito2/src/read/stream.rs          |  5 +-
 3 files changed, 89 insertions(+), 17 deletions(-)

diff --git a/src/mito2/src/read/flat_projection.rs b/src/mito2/src/read/flat_projection.rs
index 3e0f1169df..02b4c6b3c1 100644
--- a/src/mito2/src/read/flat_projection.rs
+++ b/src/mito2/src/read/flat_projection.rs
@@ -18,18 +18,21 @@ use std::sync::Arc;
 
 use api::v1::SemanticType;
 use common_error::ext::BoxedError;
-use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu};
+use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu};
 use common_recordbatch::{DfRecordBatch, RecordBatch};
-use datatypes::arrow::datatypes::Field;
+use datatypes::arrow::array::Array;
+use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
 use datatypes::prelude::{ConcreteDataType, DataType};
 use datatypes::schema::{Schema, SchemaRef};
+use datatypes::value::Value;
 use datatypes::vectors::Helper;
 use snafu::{OptionExt, ResultExt};
 use store_api::metadata::{RegionMetadata, RegionMetadataRef};
 use store_api::storage::ColumnId;
 
+use crate::cache::CacheStrategy;
 use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
-use crate::read::projection::read_column_ids_from_projection;
+use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
 use crate::sst::parquet::flat_format::sst_column_id_indices;
 use crate::sst::parquet::format::FormatProjection;
 use crate::sst::{
@@ -248,12 +251,55 @@ impl FlatProjectionMapper {
     pub(crate) fn convert(
         &self,
         batch: &datatypes::arrow::record_batch::RecordBatch,
+        cache_strategy: &CacheStrategy,
     ) -> common_recordbatch::error::Result<RecordBatch> {
         if self.is_empty_projection {
             return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
         }
-        let columns = self.project_vectors(batch)?;
-        RecordBatch::new(self.output_schema.clone(), columns)
+        // Construct output record batch directly from Arrow arrays to avoid
+        // Arrow -> Vector -> Arrow roundtrips in the hot path.
+        let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
+        for (output_idx, index) in self.batch_indices.iter().enumerate() {
+            let mut array = batch.column(*index).clone();
+            // Cast dictionary-encoded columns to the dictionary's value type.
+            if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
+                // When a string dictionary column contains only a single value, reuse a cached
+                // repeated vector to avoid repeatedly expanding the dictionary.
+                if let Some(dict_array) = single_value_string_dictionary(
+                    &array,
+                    &self.output_schema.column_schemas()[output_idx].data_type,
+                    value_type.as_ref(),
+                ) {
+                    let dict_values = dict_array.values();
+                    let value = if dict_values.is_null(0) {
+                        Value::Null
+                    } else {
+                        Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
+                    };
+
+                    let repeated = repeated_vector_with_cache(
+                        &self.output_schema.column_schemas()[output_idx].data_type,
+                        &value,
+                        batch.num_rows(),
+                        cache_strategy,
+                    )?;
+                    array = repeated.to_arrow_array();
+                } else {
+                    let casted = datatypes::arrow::compute::cast(&array, value_type)
+                        .context(ArrowComputeSnafu)?;
+                    array = casted;
+                }
+            }
+            arrays.push(array);
+        }
+
+        let df_record_batch =
+            DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
+                .context(NewDfRecordBatchSnafu)?;
+        Ok(RecordBatch::from_df_record_batch(
+            self.output_schema.clone(),
+            df_record_batch,
+        ))
     }
 
     /// Projects columns from the input batch and converts them into vectors.
@@ -281,6 +327,28 @@ impl FlatProjectionMapper {
     }
 }
 
+fn single_value_string_dictionary<'a>(
+    array: &'a Arc<dyn Array>,
+    output_type: &ConcreteDataType,
+    value_type: &ArrowDataType,
+) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
+    if !matches!(
+        value_type,
+        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
+    ) || !output_type.is_string()
+    {
+        return None;
+    }
+
+    let dict_array = array
+        .as_any()
+        .downcast_ref::<datatypes::arrow::array::DictionaryArray<
+            datatypes::arrow::datatypes::UInt32Type,
+        >>()?;
+
+    (dict_array.values().len() == 1 && dict_array.null_count() == 0).then_some(dict_array)
+}
+
 /// Returns ids and datatypes of columns of the output batch after applying the `projection`.
 ///
 /// It adds the time index column if it doesn't present in the projection.
diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs
index 2c000e7bdc..b5b6904521 100644
--- a/src/mito2/src/read/projection.rs
+++ b/src/mito2/src/read/projection.rs
@@ -21,7 +21,7 @@ use std::sync::Arc;
 use api::v1::SemanticType;
 use common_error::ext::BoxedError;
 use common_recordbatch::RecordBatch;
-use common_recordbatch::error::ExternalSnafu;
+use common_recordbatch::error::{DataTypesSnafu, ExternalSnafu};
 use datatypes::prelude::{ConcreteDataType, DataType};
 use datatypes::schema::{Schema, SchemaRef};
 use datatypes::value::Value;
@@ -37,7 +37,7 @@ use crate::read::Batch;
 use crate::read::flat_projection::FlatProjectionMapper;
 
 /// Only cache vector when its length `<=` this value.
-const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
+pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
 
 /// Wrapper enum for different projection mapper implementations.
 pub enum ProjectionMapper {
@@ -423,7 +423,7 @@ enum BatchIndex {
 }
 
 /// Gets a vector with repeated values from specific cache or creates a new one.
-fn repeated_vector_with_cache(
+pub(crate) fn repeated_vector_with_cache(
     data_type: &ConcreteDataType,
     value: &Value,
     num_rows: usize,
@@ -450,7 +450,7 @@ fn repeated_vector_with_cache(
 }
 
 /// Returns a vector with repeated values.
-fn new_repeated_vector(
+pub(crate) fn new_repeated_vector(
     data_type: &ConcreteDataType,
     value: &Value,
     num_rows: usize,
@@ -458,8 +458,7 @@ fn new_repeated_vector(
     let mut mutable_vector = data_type.create_mutable_vector(1);
     mutable_vector
         .try_push_value_ref(&value.as_value_ref())
-        .map_err(BoxedError::new)
-        .context(ExternalSnafu)?;
+        .context(DataTypesSnafu)?;
     // This requires an additional allocation.
     let base_vector = mutable_vector.to_vector();
     Ok(base_vector.replicate(&[num_rows]))
@@ -809,6 +808,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         let mapper = ProjectionMapper::all(&metadata, true).unwrap();
         assert_eq!([0, 1, 2, 3, 4], mapper.column_ids());
         assert_eq!(
@@ -823,7 +823,7 @@ mod tests {
         );
 
         let batch = new_flat_batch(Some(0), &[(1, 1), (2, 2)], &[(3, 3), (4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
         let expect = "\
 +---------------------+----+----+----+----+
 | ts                  | k0 | k1 | v0 | v1 |
@@ -843,6 +843,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         // Columns v1, k0
         let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap();
         assert_eq!([4, 1], mapper.column_ids());
@@ -856,7 +857,7 @@ mod tests {
         );
 
         let batch = new_flat_batch(None, &[(1, 1)], &[(4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
         let expect = "\
 +----+----+
 | v1 | k0 |
@@ -876,6 +877,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         // Output columns v1, k0. Read also includes v0.
         let mapper = ProjectionMapper::new_with_read_columns(
             &metadata,
@@ -887,7 +889,7 @@ mod tests {
         assert_eq!([4, 1, 3], mapper.column_ids());
 
         let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
         let expect = "\
 +----+----+
 | v1 | k0 |
@@ -907,6 +909,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         // Empty projection
         let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap();
         assert_eq!([0], mapper.column_ids()); // Should still read the time index column
@@ -918,7 +921,7 @@ mod tests {
         );
 
         let batch = new_flat_batch(Some(0), &[], &[], 3);
-        let record_batch = flat_mapper.convert(&batch).unwrap();
+        let record_batch = flat_mapper.convert(&batch, &cache).unwrap();
         assert_eq!(3, record_batch.num_rows());
         assert_eq!(0, record_batch.num_columns());
         assert!(record_batch.schema.is_empty());
diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs
index dd85616241..80002147ea 100644
--- a/src/mito2/src/read/stream.rs
+++ b/src/mito2/src/read/stream.rs
@@ -99,7 +99,8 @@ impl ConvertBatchStream {
                         let mapper = self.projection_mapper.as_flat().unwrap();
 
                         for batch in flat_batch.batches {
-                            self.pending.push_back(mapper.convert(&batch)?);
+                            self.pending
+                                .push_back(mapper.convert(&batch, &self.cache_strategy)?);
                         }
                     }
                 }
@@ -114,7 +115,7 @@ impl ConvertBatchStream {
                 // Safety: Only flat format returns this batch.
                 let mapper = self.projection_mapper.as_flat().unwrap();
 
-                mapper.convert(&df_record_batch)
+                mapper.convert(&df_record_batch, &self.cache_strategy)
             }
         }
     }

From 223f6cfdf727f9b7622126d24a98ac19bec61353 Mon Sep 17 00:00:00 2001
From: dennis zhuang <killme2008@gmail.com>
Date: Tue, 24 Mar 2026 10:05:16 +0800
Subject: [PATCH 29/42] feat: supports sst_format for x-greptime-hints and
 database options (#7843)

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
---
 src/table/src/requests.rs                     |  9 ++-
 tests-integration/tests/http.rs               | 38 +++++++++
 .../common/alter/alter_database.result        | 79 +++++++++++++++++++
 .../common/alter/alter_database.sql           | 22 +++++-
 4 files changed, 144 insertions(+), 4 deletions(-)

diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs
index 43fc36644b..15b4278f51 100644
--- a/src/table/src/requests.rs
+++ b/src/table/src/requests.rs
@@ -36,8 +36,9 @@ use store_api::metric_engine_consts::{
     LOGICAL_TABLE_METADATA_KEY, PHYSICAL_TABLE_METADATA_KEY, is_metric_engine_option_key,
 };
 use store_api::mito_engine_options::{
-    APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, TWCS_FALLBACK_TO_LOCAL,
-    TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, is_mito_engine_option_key,
+    APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, SST_FORMAT_KEY,
+    TWCS_FALLBACK_TO_LOCAL, TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM,
+    is_mito_engine_option_key,
 };
 use store_api::region_request::{SetRegionOption, UnsetRegionOption};
 
@@ -56,13 +57,14 @@ pub const TABLE_DATA_MODEL_TRACE_V1: &str = "greptime_trace_v1";
 pub const OTLP_METRIC_COMPAT_KEY: &str = "otlp_metric_compat";
 pub const OTLP_METRIC_COMPAT_PROM: &str = "prom";
 
-pub const VALID_TABLE_OPTION_KEYS: [&str; 12] = [
+pub const VALID_TABLE_OPTION_KEYS: [&str; 13] = [
     // common keys:
     WRITE_BUFFER_SIZE_KEY,
     TTL_KEY,
     STORAGE_KEY,
     COMMENT_KEY,
     SKIP_WAL_KEY,
+    SST_FORMAT_KEY,
     // file engine keys:
     FILE_TABLE_LOCATION_KEY,
     FILE_TABLE_FORMAT_KEY,
@@ -94,6 +96,7 @@ static VALID_DB_OPT_KEYS: Lazy<HashSet<&str>> = Lazy::new(|| {
     set.insert(TWCS_TIME_WINDOW);
     set.insert(TWCS_TRIGGER_FILE_NUM);
     set.insert(TWCS_MAX_OUTPUT_FILE_SIZE);
+    set.insert(SST_FORMAT_KEY);
     set
 });
 
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index c259d3ff24..65e56fa15e 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -148,6 +148,7 @@ macro_rules! http_tests {
                 test_jaeger_query_api_for_trace_v1,
 
                 test_influxdb_write,
+                test_influxdb_write_with_hints,
                 test_http_memory_limit,
             );
         )*
@@ -3638,6 +3639,43 @@ transform:
     guard.remove_all().await;
 }
 
+pub async fn test_influxdb_write_with_hints(storage_type: StorageType) {
+    common_telemetry::init_default_ut_logging();
+    let (app, mut guard) =
+        setup_test_http_app_with_frontend(storage_type, "test_influxdb_write_with_hints").await;
+
+    let client = TestClient::new(app).await;
+
+    let result = client
+        .post("/v1/influxdb/write?db=public")
+        .header("x-greptime-hints", "sst_format=flat,ttl=30d,skip_wal=true")
+        .body("sst_fmt_table,host=host1 cpu=1.2 1664370459457010101")
+        .send()
+        .await;
+    assert_eq!(result.status(), 204);
+
+    let res = client
+        .get("/v1/sql?sql=show create table sst_fmt_table")
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let resp = res.text().await;
+    assert!(
+        resp.contains("sst_format = 'flat'"),
+        "expected sst_format = 'flat' in SHOW CREATE TABLE output, got: {resp}"
+    );
+    assert!(
+        resp.contains("ttl = '30days'"),
+        "expected ttl = '30days' in SHOW CREATE TABLE output, got: {resp}"
+    );
+    assert!(
+        resp.contains("skip_wal = 'true'"),
+        "expected skip_wal = 'true' in SHOW CREATE TABLE output, got: {resp}"
+    );
+
+    guard.remove_all().await;
+}
+
 /// Test one-to-many VRL pipeline expansion.
 /// This test verifies that a VRL processor can return an array, which results in
 /// multiple output rows from a single input row.
diff --git a/tests/cases/standalone/common/alter/alter_database.result b/tests/cases/standalone/common/alter/alter_database.result
index 911ef5ddfc..2fccce10de 100644
--- a/tests/cases/standalone/common/alter/alter_database.result
+++ b/tests/cases/standalone/common/alter/alter_database.result
@@ -314,6 +314,85 @@ SHOW CREATE DATABASE alter_database;
 |                | )                                            |
 +----------------+----------------------------------------------+
 
+-- Test sst_format option
+ALTER DATABASE alter_database SET 'sst_format'='flat';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
++----------------+----------------------------------------------+
+| Database       | Create Database                              |
++----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs',                |
+|                |   sst_format = 'flat'                        |
+|                | )                                            |
++----------------+----------------------------------------------+
+
+USE alter_database;
+
+Affected Rows: 0
+
+CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
+
+Affected Rows: 0
+
+SHOW CREATE TABLE monitor;
+
++---------+----------------------------------------+
+| Table   | Create Table                           |
++---------+----------------------------------------+
+| monitor | CREATE TABLE IF NOT EXISTS "monitor" ( |
+|         |   "ts" TIMESTAMP(3) NOT NULL,          |
+|         |   TIME INDEX ("ts")                    |
+|         | )                                      |
+|         |                                        |
+|         | ENGINE=mito                            |
+|         | WITH(                                  |
+|         |   sst_format = 'flat'                  |
+|         | )                                      |
++---------+----------------------------------------+
+
+USE public;
+
+Affected Rows: 0
+
+ALTER DATABASE alter_database SET 'sst_format'='primary_key';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
++----------------+----------------------------------------------+
+| Database       | Create Database                              |
++----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs',                |
+|                |   sst_format = 'primary_key'                 |
+|                | )                                            |
++----------------+----------------------------------------------+
+
+ALTER DATABASE alter_database UNSET 'sst_format';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
++----------------+----------------------------------------------+
+| Database       | Create Database                              |
++----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs'                 |
+|                | )                                            |
++----------------+----------------------------------------------+
+
 DROP DATABASE alter_database;
 
 Affected Rows: 0
diff --git a/tests/cases/standalone/common/alter/alter_database.sql b/tests/cases/standalone/common/alter/alter_database.sql
index 1b2f75637a..33b309153e 100644
--- a/tests/cases/standalone/common/alter/alter_database.sql
+++ b/tests/cases/standalone/common/alter/alter_database.sql
@@ -90,5 +90,25 @@ ALTER DATABASE alter_database UNSET 'ttl';
 
 SHOW CREATE DATABASE alter_database;
 
-DROP DATABASE alter_database;
+-- Test sst_format option
+ALTER DATABASE alter_database SET 'sst_format'='flat';
 
+SHOW CREATE DATABASE alter_database;
+
+USE alter_database;
+
+CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
+
+SHOW CREATE TABLE monitor;
+
+USE public;
+
+ALTER DATABASE alter_database SET 'sst_format'='primary_key';
+
+SHOW CREATE DATABASE alter_database;
+
+ALTER DATABASE alter_database UNSET 'sst_format';
+
+SHOW CREATE DATABASE alter_database;
+
+DROP DATABASE alter_database;

From 7afe16ddf75d8857ad75d98108e4740c87eac966 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 24 Mar 2026 10:15:06 +0800
Subject: [PATCH 30/42] chore(deps): bump rustls-webpki from 0.103.3 to
 0.103.10 (#7847)

Bumps [rustls-webpki](https://github.com/rustls/webpki) from 0.103.3 to 0.103.10.
- [Release notes](https://github.com/rustls/webpki/releases)
- [Commits](https://github.com/rustls/webpki/compare/v/0.103.3...v/0.103.10)

---
updated-dependencies:
- dependency-name: rustls-webpki
  dependency-version: 0.103.10
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 073ae03525..2e419019c7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -11635,9 +11635,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-webpki"
-version = "0.103.3"
+version = "0.103.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
 dependencies = [
  "ring",
  "rustls-pki-types",

From 6bebf93caf18022e985da867be3d703e67bb002c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 24 Mar 2026 10:15:27 +0800
Subject: [PATCH 31/42] chore(deps): bump tar from 0.4.44 to 0.4.45 (#7846)

Bumps [tar](https://github.com/alexcrichton/tar-rs) from 0.4.44 to 0.4.45.
- [Commits](https://github.com/alexcrichton/tar-rs/compare/0.4.44...0.4.45)

---
updated-dependencies:
- dependency-name: tar
  dependency-version: 0.4.45
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2e419019c7..32f9aa27d4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -13404,9 +13404,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
 [[package]]
 name = "tar"
-version = "0.4.44"
+version = "0.4.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a"
+checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
 dependencies = [
  "filetime",
  "libc",

From 5231ee40c8666561732a63cb043c3a4c08cd50c9 Mon Sep 17 00:00:00 2001
From: Yingwen <realevenyag@gmail.com>
Date: Tue, 24 Mar 2026 11:57:18 +0800
Subject: [PATCH 32/42] feat: add parquet pk prefilter helpers (#7850)

* feat: extract parquet pk prefilter helpers

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fix warnings

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: update todo

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
---
 src/mito2/src/sst/parquet.rs           |   1 +
 src/mito2/src/sst/parquet/prefilter.rs | 528 +++++++++++++++++++++++++
 2 files changed, 529 insertions(+)
 create mode 100644 src/mito2/src/sst/parquet/prefilter.rs

diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index 26bed76fd6..fb8e1d1fc2 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -29,6 +29,7 @@ pub mod flat_format;
 pub mod format;
 pub(crate) mod helper;
 pub(crate) mod metadata;
+pub mod prefilter;
 pub mod reader;
 pub mod row_group;
 pub mod row_selection;
diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs
new file mode 100644
index 0000000000..5de2e3512f
--- /dev/null
+++ b/src/mito2/src/sst/parquet/prefilter.rs
@@ -0,0 +1,528 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Helpers for parquet prefiltering.
+
+use std::ops::Range;
+
+use api::v1::SemanticType;
+use common_recordbatch::filter::SimpleFilterEvaluator;
+use datatypes::arrow::array::{BinaryArray, BooleanArray};
+use datatypes::arrow::record_batch::RecordBatch;
+use mito_codec::primary_key_filter::is_partition_column;
+use mito_codec::row_converter::PrimaryKeyFilter;
+use snafu::{OptionExt, ResultExt};
+use store_api::metadata::{RegionMetadata, RegionMetadataRef};
+
+use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu};
+use crate::sst::parquet::flat_format::primary_key_column_index;
+use crate::sst::parquet::format::PrimaryKeyArray;
+
+/// Returns the row ranges of `input` whose primary key is accepted by `pk_filter`.
+///
+/// Rows are grouped into runs of consecutive equal dictionary keys. `pk_filter` is evaluated
+/// once per run, and adjacent matching runs are merged into a single range. A batch without
+/// any key yields the single range `0..input.num_rows()`.
+///
+/// # Errors
+///
+/// Returns an `Unexpected` error if the primary key column is not a [PrimaryKeyArray] or if
+/// its dictionary values are not a [BinaryArray].
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn matching_row_ranges_by_primary_key(
+    input: &RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<Vec<Range<usize>>> {
+    let primary_key_index = primary_key_column_index(input.num_columns());
+    let pk_dict_array = input
+        .column(primary_key_index)
+        .as_any()
+        .downcast_ref::<PrimaryKeyArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key column is not a dictionary array",
+        })?;
+    let pk_values = pk_dict_array
+        .values()
+        .as_any()
+        .downcast_ref::<BinaryArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key values are not binary array",
+        })?;
+
+    let key_values = pk_dict_array.keys().values();
+    if key_values.is_empty() {
+        return Ok(vec![0..input.num_rows()]);
+    }
+
+    let mut ranges: Vec<Range<usize>> = Vec::new();
+    let mut run_start = 0;
+    for run in key_values.chunk_by(|lhs, rhs| lhs == rhs) {
+        let run_end = run_start + run.len();
+        if pk_filter.matches(pk_values.value(run[0] as usize)) {
+            match ranges.last_mut() {
+                Some(last) if last.end == run_start => last.end = run_end,
+                _ => ranges.push(run_start..run_end),
+            }
+        }
+        run_start = run_end;
+    }
+
+    Ok(ranges)
+}
+
+/// Keeps only the rows of `input` whose primary key is accepted by `pk_filter`.
+///
+/// Returns `Ok(None)` when no row matches. An empty batch, or a batch in which every row
+/// matches, is returned as is; otherwise the matching rows are returned as a slice of `input`
+/// or as a newly filtered batch.
+///
+/// # Errors
+///
+/// Propagates errors from [matching_row_ranges_by_primary_key] and from the arrow filter kernel.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn prefilter_flat_batch_by_primary_key(
+    input: RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<Option<RecordBatch>> {
+    let num_rows = input.num_rows();
+    if num_rows == 0 {
+        return Ok(Some(input));
+    }
+
+    let matched = matching_row_ranges_by_primary_key(&input, pk_filter)?;
+    match matched.as_slice() {
+        [] => Ok(None),
+        // One range covering the whole batch: hand the input back untouched.
+        [only] if only.start == 0 && only.end == num_rows => Ok(Some(input)),
+        // One contiguous range: a slice of the input is enough.
+        [only] => Ok(Some(input.slice(only.start, only.end - only.start))),
+        // Several disjoint ranges: mark them in a boolean mask and filter with it.
+        _ => {
+            let mut mask = vec![false; num_rows];
+            for span in &matched {
+                mask[span.start..span.end].fill(true);
+            }
+
+            let filtered =
+                datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(mask))
+                    .context(ComputeArrowSnafu)?;
+            Ok((filtered.num_rows() > 0).then_some(filtered))
+        }
+    }
+}
+
+/// Retains only the filters that [is_usable_primary_key_filter] accepts.
+///
+/// `sst_metadata` and `expected_metadata` are forwarded unchanged for every filter.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn retain_usable_primary_key_filters(
+    sst_metadata: &RegionMetadataRef,
+    expected_metadata: Option<&RegionMetadata>,
+    filters: &mut Vec<SimpleFilterEvaluator>,
+) {
+    filters.retain(|filter| is_usable_primary_key_filter(sst_metadata, expected_metadata, filter));
+}
+
+/// Returns whether `filter` can be applied to the primary keys of an SST.
+///
+/// A filter is usable when its column is not the partition column, is a tag column in
+/// `sst_metadata` and belongs to the primary key of `sst_metadata`.
+///
+/// When `expected_metadata` is given, the column is resolved there by name and then looked up
+/// in `sst_metadata` by column id. The filter is rejected if either lookup fails or if the
+/// name, semantic type or data type differs between the two.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn is_usable_primary_key_filter(
+    sst_metadata: &RegionMetadataRef,
+    expected_metadata: Option<&RegionMetadata>,
+    filter: &SimpleFilterEvaluator,
+) -> bool {
+    let column_name = filter.column_name();
+    // TODO(yingwen): The primary key filter always skips the partition column. Consider using a flag
+    // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable.
+    if is_partition_column(column_name) {
+        return false;
+    }
+
+    let sst_column = match expected_metadata {
+        // No expected metadata: resolve the column directly in the SST by name.
+        None => sst_metadata.column_by_name(column_name),
+        // Otherwise go through the expected column and require an unchanged definition.
+        Some(expected_metadata) => expected_metadata
+            .column_by_name(column_name)
+            .and_then(|expected| {
+                sst_metadata.column_by_id(expected.column_id).filter(|sst| {
+                    sst.column_schema.name == expected.column_schema.name
+                        && sst.semantic_type == expected.semantic_type
+                        && sst.column_schema.data_type == expected.column_schema.data_type
+                })
+            }),
+    };
+
+    sst_column.is_some_and(|column| {
+        column.semantic_type == SemanticType::Tag
+            && sst_metadata.primary_key_index(column.column_id).is_some()
+    })
+}
+
+/// A [PrimaryKeyFilter] wrapper that remembers the result for the last primary key it saw.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) struct CachedPrimaryKeyFilter {
+    inner: Box<dyn PrimaryKeyFilter>,
+    last_primary_key: Vec<u8>,
+    last_match: Option<bool>,
+}
+
+impl CachedPrimaryKeyFilter {
+    /// Wraps `inner`; nothing is cached until the first call to `matches`.
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn new(inner: Box<dyn PrimaryKeyFilter>) -> Self {
+        Self {
+            inner,
+            last_primary_key: Vec::new(),
+            last_match: None,
+        }
+    }
+}
+
+impl PrimaryKeyFilter for CachedPrimaryKeyFilter {
+    fn matches(&mut self, pk: &[u8]) -> bool {
+        // A cached result is reused only if it was computed for exactly this key.
+        let cached = self.last_match.filter(|_| self.last_primary_key == pk);
+        cached.unwrap_or_else(|| {
+            let matched = self.inner.matches(pk);
+            self.last_primary_key.clear();
+            self.last_primary_key.extend_from_slice(pk);
+            self.last_match = Some(matched);
+            matched
+        })
+    }
+}
+
+/// Returns the primary key of `batch` if its first and last rows share one dictionary key.
+///
+/// Only the first and last keys are compared, so this presumably relies on rows with the same
+/// primary key being contiguous in the batch. Returns `Ok(None)` for a batch without keys or
+/// when the two keys differ.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result<Option<&[u8]>> {
+    let primary_key_index = primary_key_column_index(batch.num_columns());
+    let pk_dict_array = batch
+        .column(primary_key_index)
+        .as_any()
+        .downcast_ref::<PrimaryKeyArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key column is not a dictionary array",
+        })?;
+    let pk_values = pk_dict_array
+        .values()
+        .as_any()
+        .downcast_ref::<BinaryArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key values are not binary array",
+        })?;
+
+    let key_values = pk_dict_array.keys().values();
+    match (key_values.first(), key_values.last()) {
+        (Some(first), Some(last)) if first == last => Ok(Some(pk_values.value(*first as usize))),
+        _ => Ok(None),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use api::v1::SemanticType;
+    use common_recordbatch::filter::SimpleFilterEvaluator;
+    use datafusion_expr::{col, lit};
+    use datatypes::arrow::array::{
+        ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
+        UInt64Array,
+    };
+    use datatypes::arrow::datatypes::{Schema, UInt32Type};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use datatypes::prelude::ConcreteDataType;
+    use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec};
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
+    use store_api::storage::ColumnSchema;
+
+    use super::*;
+    use crate::sst::internal_fields;
+    use crate::sst::parquet::format::ReadFormat;
+    use crate::test_util::sst_util::{
+        new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding,
+    };
+
+    fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec<SimpleFilterEvaluator> {
+        exprs
+            .iter()
+            .filter_map(SimpleFilterEvaluator::try_new) // expressions `try_new` rejects are dropped
+            .collect()
+    }
+
+    fn expected_metadata_with_reused_tag_name(
+        old_metadata: &RegionMetadata,
+    ) -> Arc<RegionMetadata> {
+        let mut builder = RegionMetadataBuilder::new(old_metadata.region_id);
+        builder
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "tag_0".to_string(),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: 10, // presumably differs from the id of `tag_0` in the SST metadata
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "tag_1".to_string(),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: 1,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "field_0".to_string(),
+                    ConcreteDataType::uint64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: 2,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "ts".to_string(),
+                    ConcreteDataType::timestamp_millisecond_datatype(),
+                    false,
+                ),
+                semantic_type: SemanticType::Timestamp,
+                column_id: 3,
+            })
+            .primary_key(vec![10, 1]);
+
+        Arc::new(builder.build().unwrap())
+    }
+
+    fn new_raw_batch_with_metadata(
+        metadata: Arc<RegionMetadata>,
+        primary_keys: &[&[u8]],
+        field_values: &[u64],
+    ) -> RecordBatch {
+        assert_eq!(primary_keys.len(), field_values.len());
+        // Column layout: `field_0`, `ts`, then the fields from `internal_fields()`.
+        let arrow_schema = metadata.schema.arrow_schema();
+        let field_column = arrow_schema
+            .field(arrow_schema.index_of("field_0").unwrap())
+            .clone();
+        let time_index_column = arrow_schema
+            .field(arrow_schema.index_of("ts").unwrap())
+            .clone();
+        let mut fields = vec![field_column, time_index_column];
+        fields.extend(
+            internal_fields()
+                .into_iter()
+                .map(|field| field.as_ref().clone()),
+        );
+        let schema = Arc::new(Schema::new(fields));
+        // Dictionary-encode the keys: each distinct key is stored once, in first-seen order.
+        let mut dict_values = Vec::new();
+        let mut keys = Vec::with_capacity(primary_keys.len());
+        for pk in primary_keys {
+            let key = dict_values
+                .iter()
+                .position(|existing: &&[u8]| existing == pk)
+                .unwrap_or_else(|| {
+                    dict_values.push(*pk);
+                    dict_values.len() - 1
+                });
+            keys.push(key as u32);
+        }
+
+        let pk_array: ArrayRef = Arc::new(DictionaryArray::<UInt32Type>::new(
+            UInt32Array::from(keys),
+            Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())),
+        ));
+
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(UInt64Array::from(field_values.to_vec())),
+                Arc::new(TimestampMillisecondArray::from_iter_values(
+                    0..primary_keys.len() as i64,
+                )),
+                pk_array,
+                Arc::new(UInt64Array::from(vec![1; primary_keys.len()])),
+                Arc::new(UInt8Array::from(vec![1; primary_keys.len()])),
+            ],
+        )
+        .unwrap()
+    }
+
+    fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch {
+        new_raw_batch_with_metadata(Arc::new(sst_region_metadata()), primary_keys, field_values)
+    }
+
+    fn field_values(batch: &RecordBatch) -> Vec<u64> {
+        batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap()
+            .values()
+            .to_vec()
+    }
+
+    #[test]
+    fn test_retain_usable_primary_key_filters_skips_non_tag_filters() {
+        let metadata = Arc::new(sst_region_metadata());
+        let mut filters =
+            new_test_filters(&[col("field_0").eq(lit(1_u64)), col("ts").gt(lit(0_i64))]);
+
+        retain_usable_primary_key_filters(&metadata, None, &mut filters);
+        // Neither `field_0` nor `ts` is expected to be a tag, so no filter should survive.
+        assert!(filters.is_empty());
+    }
+
+    #[test]
+    fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() {
+        let metadata = Arc::new(sst_region_metadata());
+        let expected_metadata = expected_metadata_with_reused_tag_name(&metadata);
+        let mut filters = new_test_filters(&[col("tag_0").eq(lit("b"))]);
+
+        retain_usable_primary_key_filters(
+            &metadata,
+            Some(expected_metadata.as_ref()),
+            &mut filters,
+        );
+        // The expected metadata declares `tag_0` with column id 10; the filter must be dropped.
+        assert!(filters.is_empty());
+    }
+
+    #[test]
+    fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() {
+        let metadata = Arc::new(sst_region_metadata_with_encoding(
+            PrimaryKeyEncoding::Sparse,
+        ));
+        let read_format = ReadFormat::new_flat(
+            metadata.clone(),
+            metadata.column_metadatas.iter().map(|c| c.column_id),
+            None,
+            "test",
+            true,
+        )
+        .unwrap();
+        assert!(read_format.as_flat().is_some());
+        // NOTE(review): despite "skips" in the name, the filter is asserted to be usable.
+        let filter = SimpleFilterEvaluator::try_new(&col("tag_0").eq(lit("b"))).unwrap();
+        assert!(is_usable_primary_key_filter(&metadata, None, &filter));
+    }
+
+    #[test]
+    fn test_prefilter_primary_key_drops_single_dictionary_batch() {
+        let metadata = Arc::new(sst_region_metadata());
+        let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))]));
+        let mut primary_key_filter =
+            build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters);
+        let pk_a = new_primary_key(&["a", "x"]);
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]);
+        // Both rows share the key built from ["a", "x"]; the filter asks for tag_0 = "b".
+        let filtered =
+            prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap();
+
+        assert!(filtered.is_none());
+    }
+
+    #[test]
+    fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() {
+        let metadata = Arc::new(sst_region_metadata());
+        let filters = Arc::new(new_test_filters(&[col("tag_0")
+            .eq(lit("a"))
+            .or(col("tag_0").eq(lit("c")))]));
+        let mut primary_key_filter =
+            build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters);
+        let pk_a = new_primary_key(&["a", "x"]);
+        let pk_b = new_primary_key(&["b", "x"]);
+        let pk_c = new_primary_key(&["c", "x"]);
+        let pk_d = new_primary_key(&["d", "x"]);
+        let batch = new_raw_batch(
+            &[
+                pk_a.as_slice(),
+                pk_a.as_slice(),
+                pk_b.as_slice(),
+                pk_b.as_slice(),
+                pk_c.as_slice(),
+                pk_c.as_slice(),
+                pk_d.as_slice(),
+                pk_d.as_slice(),
+            ],
+            &[10, 11, 12, 13, 14, 15, 16, 17],
+        );
+        // Keys a and c match but b sits between them, so two disjoint row ranges survive.
+        let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut())
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(filtered.num_rows(), 4);
+        assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]);
+    }
+
+    struct CountingPrimaryKeyFilter {
+        hits: Arc<AtomicUsize>, // incremented on every `matches` call
+        expected: Vec<u8>,
+    }
+
+    impl PrimaryKeyFilter for CountingPrimaryKeyFilter {
+        fn matches(&mut self, pk: &[u8]) -> bool {
+            self.hits.fetch_add(1, Ordering::Relaxed);
+            pk == self.expected.as_slice()
+        }
+    }
+
+    #[test]
+    fn test_cached_primary_key_filter_reuses_previous_result() {
+        let expected = new_primary_key(&["a", "x"]);
+        let hits = Arc::new(AtomicUsize::new(0));
+        let mut filter = CachedPrimaryKeyFilter::new(Box::new(CountingPrimaryKeyFilter {
+            hits: Arc::clone(&hits),
+            expected: expected.clone(),
+        }));
+
+        assert!(filter.matches(expected.as_slice()));
+        assert!(filter.matches(expected.as_slice()));
+        assert!(!filter.matches(new_primary_key(&["b", "x"]).as_slice()));
+        // Three lookups, but the repeated key is answered from the cache: two inner calls.
+        assert_eq!(hits.load(Ordering::Relaxed), 2);
+    }
+
+    #[test]
+    fn test_batch_single_primary_key() {
+        let pk_a = new_primary_key(&["a", "x"]);
+        let pk_b = new_primary_key(&["b", "x"]);
+
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]);
+        assert_eq!(
+            batch_single_primary_key(&batch).unwrap(),
+            Some(pk_a.as_slice())
+        );
+
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]);
+        assert_eq!(batch_single_primary_key(&batch).unwrap(), None);
+    }
+}

From 9bd983ea4063191679f82eda1523839746cb6aa4 Mon Sep 17 00:00:00 2001
From: Weny Xu <wenymedia@gmail.com>
Date: Tue, 24 Mar 2026 12:24:15 +0800
Subject: [PATCH 33/42] fix: prevent stale in-flight cache refill after
 invalidation in CacheContainer (#7825)

* fix: prevent stale cache refill after invalidate

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: introduce `get_latest`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: styling

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: enforce construction-time cache init strategy

Make cache initialization behavior explicit via InitStrategy selected at construction and document dirty-vs-checked semantics. Keep latest-read call compatibility while partition manager uses strategy-driven get paths.

Signed-off-by: WenyXu <wenymedia@gmail.com>

* test: rename get_by_ref freshness test

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat: use `InitStrategy::VersionChecked` for table route cache

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
---
 src/catalog/src/kvbackend/table_cache.rs      |   8 +-
 src/common/meta/Cargo.toml                    |   5 +-
 src/common/meta/src/cache/container.rs        | 288 +++++++++++++++---
 .../meta/src/cache/flow/table_flownode.rs     |  24 +-
 src/common/meta/src/cache/table/schema.rs     |   8 +-
 src/common/meta/src/cache/table/table_info.rs |   8 +-
 src/common/meta/src/cache/table/table_name.rs |   8 +-
 .../meta/src/cache/table/table_route.rs       |  18 +-
 .../meta/src/cache/table/table_schema.rs      |   2 +-
 src/common/meta/src/cache/table/view_info.rs  |   8 +-
 src/common/meta/src/error.rs                  |  16 +-
 src/partition/src/cache.rs                    |   8 +-
 12 files changed, 313 insertions(+), 88 deletions(-)

diff --git a/src/catalog/src/kvbackend/table_cache.rs b/src/catalog/src/kvbackend/table_cache.rs
index ea328c3e17..42b3fbc74b 100644
--- a/src/catalog/src/kvbackend/table_cache.rs
+++ b/src/catalog/src/kvbackend/table_cache.rs
@@ -65,11 +65,13 @@ fn init_factory(
 
 fn invalidator<'a>(
     cache: &'a Cache<TableName, TableRef>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, MetaResult<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableName(table_name) = ident {
-            cache.invalidate(table_name).await
+        for ident in idents {
+            if let CacheIdent::TableName(table_name) = ident {
+                cache.invalidate(table_name).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml
index ec000c710d..f5ca9d2c09 100644
--- a/src/common/meta/Cargo.toml
+++ b/src/common/meta/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 testing = []
 pg_kvbackend = [
     "dep:tokio-postgres",
-    "dep:backon",
     "dep:deadpool-postgres",
     "dep:deadpool",
     "dep:tokio-postgres-rustls",
@@ -16,7 +15,7 @@ pg_kvbackend = [
     "dep:rustls-native-certs",
     "dep:rustls",
 ]
-mysql_kvbackend = ["dep:sqlx", "dep:backon"]
+mysql_kvbackend = ["dep:sqlx"]
 enterprise = ["prost-types"]
 
 [lints]
@@ -28,7 +27,7 @@ api.workspace = true
 async-recursion = "1.0"
 async-stream.workspace = true
 async-trait.workspace = true
-backon = { workspace = true, optional = true }
+backon.workspace = true
 base64.workspace = true
 bytes.workspace = true
 chrono.workspace = true
diff --git a/src/common/meta/src/cache/container.rs b/src/common/meta/src/cache/container.rs
index 0510476d15..e3a3e13a76 100644
--- a/src/common/meta/src/cache/container.rs
+++ b/src/common/meta/src/cache/container.rs
@@ -15,10 +15,14 @@
 use std::borrow::Borrow;
 use std::hash::Hash;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;
 
-use futures::future::{BoxFuture, join_all};
+use backon::{BackoffBuilder, ExponentialBuilder};
+use futures::future::BoxFuture;
 use moka::future::Cache;
 use snafu::{OptionExt, ResultExt};
+use tokio::time::sleep;
 
 use crate::cache_invalidator::{CacheInvalidator, Context};
 use crate::error::{self, Error, Result};
@@ -29,12 +33,29 @@ use crate::metrics;
 pub type TokenFilter<CacheToken> = Box<dyn Fn(&CacheToken) -> bool + Send + Sync>;
 
 /// Invalidates cached values by [CacheToken]s.
-pub type Invalidator<K, V, CacheToken> =
-    Box<dyn for<'a> Fn(&'a Cache<K, V>, &'a CacheToken) -> BoxFuture<'a, Result<()>> + Send + Sync>;
+pub type Invalidator<K, V, CacheToken> = Box<
+    dyn for<'a> Fn(&'a Cache<K, V>, &'a [&CacheToken]) -> BoxFuture<'a, Result<()>> + Send + Sync,
+>;
 
 /// Initializes value (i.e., fetches from remote).
 pub type Initializer<K, V> = Arc<dyn Fn(&'_ K) -> BoxFuture<'_, Result<Option<V>>> + Send + Sync>;
 
+/// Initialization strategy for cache-miss loading.
+///
+/// This strategy is selected when building [CacheContainer] and remains immutable
+/// for the lifetime of the container instance.
+#[derive(Debug, Clone, Copy)]
+pub enum InitStrategy {
+    /// Fast path: load once without version conflict retry.
+    ///
+    /// Under concurrent invalidation, callers may observe a stale/dirty value.
+    Unchecked,
+    /// Strict path: retry the load when the version changes during initialization.
+    ///
+    /// This avoids returning a dirty value under invalidate/load races.
+    VersionChecked,
+}
+
 /// [CacheContainer] provides ability to:
 /// - Cache value loaded by [Initializer].
 /// - Invalidate caches by [Invalidator].
@@ -44,6 +65,16 @@ pub struct CacheContainer<K, V, CacheToken> {
     invalidator: Invalidator<K, V, CacheToken>,
     initializer: Initializer<K, V>,
     token_filter: fn(&CacheToken) -> bool,
+    version: Arc<AtomicUsize>,
+    init_strategy: InitStrategy,
+}
+
+fn latest_get_backoff() -> impl Iterator<Item = Duration> {
+    ExponentialBuilder::default()
+        .with_min_delay(Duration::from_millis(10)) // delay before the first retry
+        .with_max_delay(Duration::from_millis(100)) // upper bound for any single delay
+        .with_max_times(3) // at most three delays are yielded
+        .build()
 }
 
 impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>
@@ -52,13 +83,37 @@ where
     V: Send + Sync,
     CacheToken: Send + Sync,
 {
-    /// Constructs an [CacheContainer].
+    /// Constructs a [CacheContainer] with [InitStrategy::Unchecked].
+    ///
+    /// This keeps the historical behavior and can return a stale/dirty value under
+    /// concurrent invalidation.
     pub fn new(
         name: String,
         cache: Cache<K, V>,
         invalidator: Invalidator<K, V, CacheToken>,
         initializer: Initializer<K, V>,
         token_filter: fn(&CacheToken) -> bool,
+    ) -> Self {
+        Self::with_strategy(
+            name,
+            cache,
+            invalidator,
+            initializer,
+            token_filter,
+            InitStrategy::Unchecked,
+        )
+    }
+
+    /// Constructs a [CacheContainer] with an explicit [InitStrategy].
+    ///
+    /// The strategy is fixed at construction time and cannot be changed later.
+    pub fn with_strategy(
+        name: String,
+        cache: Cache<K, V>,
+        invalidator: Invalidator<K, V, CacheToken>,
+        initializer: Initializer<K, V>,
+        token_filter: fn(&CacheToken) -> bool,
+        init_strategy: InitStrategy,
     ) -> Self {
         Self {
             name,
@@ -66,6 +121,8 @@ where
             invalidator,
             initializer,
             token_filter,
+            version: Arc::new(AtomicUsize::new(0)),
+            init_strategy,
         }
     }
 
@@ -75,6 +132,67 @@ where
     }
 }
 
+impl<K, V, CacheToken> CacheContainer<K, V, CacheToken> {
+    fn inc_version(&self) {
+        self.version.fetch_add(1, Ordering::Relaxed); // bumped by `invalidate` before it runs
+    }
+}
+
+/// Loads the value for `key` with `init`, counting a cache miss and timing the load.
+async fn init<'a, K, V>(init: Initializer<K, V>, key: K, cache_name: &'a str) -> Result<V>
+where
+    K: Send + Sync + 'a,
+    V: Send + 'a,
+{
+    metrics::CACHE_CONTAINER_CACHE_MISS
+        .with_label_values(&[cache_name])
+        .inc();
+    let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
+        .with_label_values(&[cache_name])
+        .start_timer();
+    // The initializer yields `Result<Option<V>>`; a missing value becomes `ValueNotExist`.
+    let loaded = init(&key).await;
+    loaded.transpose().context(error::ValueNotExistSnafu)?
+}
+
+/// Loads the value for `key` with `init`, retrying when `version` changes during the load.
+///
+/// Fails with `GetLatestCacheRetryExceeded` once `backoff` yields no further delay.
+async fn init_with_retry<'a, K, V>(
+    init: Initializer<K, V>,
+    key: K,
+    mut backoff: impl Iterator<Item = Duration> + 'a,
+    version: Arc<AtomicUsize>,
+    cache_name: &'a str,
+) -> Result<V>
+where
+    K: Send + Sync + 'a,
+    V: Send + 'a,
+{
+    let mut attempts = 1usize;
+    loop {
+        let pre_version = version.load(Ordering::Relaxed);
+        metrics::CACHE_CONTAINER_CACHE_MISS
+            .with_label_values(&[cache_name])
+            .inc();
+        let timer = metrics::CACHE_CONTAINER_LOAD_CACHE
+            .with_label_values(&[cache_name])
+            .start_timer();
+        let loaded = init(&key).await;
+        // Stop timing here so a later backoff sleep is not recorded as load latency.
+        drop(timer);
+        let value = loaded.transpose().context(error::ValueNotExistSnafu)??;
+        if pre_version == version.load(Ordering::Relaxed) {
+            return Ok(value);
+        }
+        let Some(duration) = backoff.next() else {
+            return error::GetLatestCacheRetryExceededSnafu { attempts }.fail();
+        };
+        sleep(duration).await;
+        attempts += 1;
+    }
+}
+
 #[async_trait::async_trait]
 impl<K, V> CacheInvalidator for CacheContainer<K, V, CacheIdent>
 where
@@ -82,14 +200,15 @@ where
     V: Send + Sync,
 {
     async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
-        let tasks = caches
+        let idents = caches
             .iter()
             .filter(|token| (self.token_filter)(token))
-            .map(|token| (self.invalidator)(&self.cache, token));
-        join_all(tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Vec<_>>();
+        if !idents.is_empty() {
+            self.inc_version();
+            (self.invalidator)(&self.cache, &idents).await?;
+        }
+
         Ok(())
     }
 }
@@ -99,27 +218,39 @@ where
     K: Copy + Hash + Eq + Send + Sync + 'static,
     V: Clone + Send + Sync + 'static,
 {
-    /// Returns a _clone_ of the value corresponding to the key.
+    /// Returns a value from cache for copyable keys.
+    ///
+    /// With [InitStrategy::Unchecked], this method prioritizes latency and may
+    /// return a stale/dirty value. With [InitStrategy::VersionChecked], this method
+    /// retries initialization on version change and avoids dirty returns.
     pub async fn get(&self, key: K) -> Result<Option<V>> {
         metrics::CACHE_CONTAINER_CACHE_GET
             .with_label_values(&[&self.name])
             .inc();
-        let moved_init = self.initializer.clone();
-        let moved_key = key;
-        let init = async move {
-            metrics::CACHE_CONTAINER_CACHE_MISS
-                .with_label_values(&[&self.name])
-                .inc();
-            let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
-                .with_label_values(&[&self.name])
-                .start_timer();
-            moved_init(&moved_key)
-                .await
-                .transpose()
-                .context(error::ValueNotExistSnafu)?
+
+        let result = match self.init_strategy {
+            InitStrategy::Unchecked => {
+                self.cache
+                    .try_get_with(key, init(self.initializer.clone(), key, &self.name))
+                    .await
+            }
+            InitStrategy::VersionChecked => {
+                self.cache
+                    .try_get_with(
+                        key,
+                        init_with_retry(
+                            self.initializer.clone(),
+                            key,
+                            latest_get_backoff(),
+                            self.version.clone(),
+                            &self.name,
+                        ),
+                    )
+                    .await
+            }
         };
 
-        match self.cache.try_get_with(key, init).await {
+        match result {
             Ok(value) => Ok(Some(value)),
             Err(err) => match err.as_ref() {
                 Error::ValueNotExist { .. } => Ok(None),
@@ -136,14 +267,15 @@ where
 {
     /// Invalidates cache by [CacheToken].
     pub async fn invalidate(&self, caches: &[CacheToken]) -> Result<()> {
-        let tasks = caches
+        let idents = caches
             .iter()
             .filter(|token| (self.token_filter)(token))
-            .map(|token| (self.invalidator)(&self.cache, token));
-        join_all(tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Vec<_>>();
+        if !idents.is_empty() {
+            self.inc_version();
+            (self.invalidator)(&self.cache, &idents).await?;
+        }
+
         Ok(())
     }
 
@@ -156,7 +288,11 @@ where
         self.cache.contains_key(key)
     }
 
-    /// Returns a _clone_ of the value corresponding to the key.
+    /// Returns a value from cache by key reference.
+    ///
+    /// With [InitStrategy::Unchecked], this method prioritizes latency and may
+    /// return a stale/dirty value. With [InitStrategy::VersionChecked], this method
+    /// retries initialization on version change and avoids dirty returns.
     pub async fn get_by_ref<Q>(&self, key: &Q) -> Result<Option<V>>
     where
         K: Borrow<Q>,
@@ -165,24 +301,32 @@ where
         metrics::CACHE_CONTAINER_CACHE_GET
             .with_label_values(&[&self.name])
             .inc();
-        let moved_init = self.initializer.clone();
-        let moved_key = key.to_owned();
-
-        let init = async move {
-            metrics::CACHE_CONTAINER_CACHE_MISS
-                .with_label_values(&[&self.name])
-                .inc();
-            let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
-                .with_label_values(&[&self.name])
-                .start_timer();
-
-            moved_init(&moved_key)
-                .await
-                .transpose()
-                .context(error::ValueNotExistSnafu)?
+        let result = match self.init_strategy {
+            InitStrategy::Unchecked => {
+                self.cache
+                    .try_get_with_by_ref(
+                        key,
+                        init(self.initializer.clone(), key.to_owned(), &self.name),
+                    )
+                    .await
+            }
+            InitStrategy::VersionChecked => {
+                self.cache
+                    .try_get_with_by_ref(
+                        key,
+                        init_with_retry(
+                            self.initializer.clone(),
+                            key.to_owned(),
+                            latest_get_backoff(),
+                            self.version.clone(),
+                            &self.name,
+                        ),
+                    )
+                    .await
+            }
         };
 
-        match self.cache.try_get_with_by_ref(key, init).await {
+        match result {
             Ok(value) => Ok(Some(value)),
             Err(err) => match err.as_ref() {
                 Error::ValueNotExist { .. } => Ok(None),
@@ -296,9 +440,11 @@ mod tests {
             moved_counter.fetch_add(1, Ordering::Relaxed);
             Box::pin(async { Ok(Some("hi".to_string())) })
         });
-        let invalidator: Invalidator<String, String, String> = Box::new(|cache, key| {
+        let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
             Box::pin(async move {
-                cache.invalidate(key).await;
+                for key in keys {
+                    cache.invalidate(*key).await;
+                }
                 Ok(())
             })
         });
@@ -323,4 +469,46 @@ mod tests {
         assert_eq!(value, "hi");
         assert_eq!(counter.load(Ordering::Relaxed), 2);
     }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_get_by_ref_returns_fresh_value_after_invalidate() {
+        let cache: Cache<String, String> = CacheBuilder::new(128).build();
+        let counter = Arc::new(AtomicI32::new(0));
+        let moved_counter = counter.clone();
+        let init: Initializer<String, String> = Arc::new(move |_| {
+            let counter = moved_counter.clone();
+            Box::pin(async move {
+                let n = counter.fetch_add(1, Ordering::Relaxed) + 1;
+                sleep(Duration::from_millis(100)).await;
+                Ok(Some(format!("v{n}")))
+            })
+        });
+        let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
+            Box::pin(async move {
+                for key in keys {
+                    cache.invalidate(*key).await;
+                }
+                Ok(())
+            })
+        });
+
+        let adv_cache = Arc::new(CacheContainer::with_strategy(
+            "test".to_string(),
+            cache,
+            invalidator,
+            init,
+            always_true_filter,
+            InitStrategy::VersionChecked,
+        ));
+
+        let moved_cache = adv_cache.clone();
+        let get_task = tokio::spawn(async move { moved_cache.get_by_ref("foo").await });
+
+        sleep(Duration::from_millis(50)).await;
+        adv_cache.invalidate(&["foo".to_string()]).await.unwrap();
+
+        let value = get_task.await.unwrap().unwrap().unwrap();
+        assert_eq!(value, "v2");
+        assert_eq!(counter.load(Ordering::Relaxed), 2);
+    }
 }
diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs
index a7777f3361..ebe3664202 100644
--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -170,20 +170,22 @@ async fn handle_drop_flow(
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, FlownodeFlowSet>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        match ident {
-            CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
-            CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
-            CacheIdent::FlowNodeAddressChange(node_id) => {
-                info!(
-                    "Invalidate flow node cache for node_id in table_flownode: {}",
-                    node_id
-                );
-                cache.invalidate_all();
+        for ident in idents {
+            match ident {
+                CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
+                CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
+                CacheIdent::FlowNodeAddressChange(node_id) => {
+                    info!(
+                        "Invalidate flow node cache for node_id in table_flownode: {}",
+                        node_id
+                    );
+                    cache.invalidate_all();
+                }
+                _ => {}
             }
-            _ => {}
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/schema.rs b/src/common/meta/src/cache/table/schema.rs
index bcf81d4fe6..bd9e8e6dc1 100644
--- a/src/common/meta/src/cache/table/schema.rs
+++ b/src/common/meta/src/cache/table/schema.rs
@@ -58,11 +58,13 @@ fn init_factory(schema_manager: SchemaManager) -> Initializer<SchemaName, Arc<Sc
 
 fn invalidator<'a>(
     cache: &'a Cache<SchemaName, Arc<SchemaNameValue>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, crate::error::Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::SchemaName(schema_name) = ident {
-            cache.invalidate(schema_name).await
+        for ident in idents {
+            if let CacheIdent::SchemaName(schema_name) = ident {
+                cache.invalidate(schema_name).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_info.rs b/src/common/meta/src/cache/table/table_info.rs
index b853d908e8..97af5bcdb7 100644
--- a/src/common/meta/src/cache/table/table_info.rs
+++ b/src/common/meta/src/cache/table/table_info.rs
@@ -61,11 +61,13 @@ fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer<TableId,
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, Arc<TableInfo>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableId(table_id) = ident {
-            cache.invalidate(table_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(table_id) = ident {
+                cache.invalidate(table_id).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_name.rs b/src/common/meta/src/cache/table/table_name.rs
index 540da5e5f4..927a5b3480 100644
--- a/src/common/meta/src/cache/table/table_name.rs
+++ b/src/common/meta/src/cache/table/table_name.rs
@@ -71,11 +71,13 @@ fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer<TableNam
 
 fn invalidator<'a>(
     cache: &'a Cache<TableName, TableId>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableName(table_name) = ident {
-            cache.invalidate(table_name).await
+        for ident in idents {
+            if let CacheIdent::TableName(table_name) = ident {
+                cache.invalidate(table_name).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_route.rs b/src/common/meta/src/cache/table/table_route.rs
index 47abdaa728..be820b0c52 100644
--- a/src/common/meta/src/cache/table/table_route.rs
+++ b/src/common/meta/src/cache/table/table_route.rs
@@ -19,6 +19,7 @@ use moka::future::Cache;
 use snafu::OptionExt;
 use store_api::storage::TableId;
 
+use crate::cache::container::InitStrategy;
 use crate::cache::{CacheContainer, Initializer};
 use crate::error;
 use crate::error::Result;
@@ -65,7 +66,14 @@ pub fn new_table_route_cache(
     let table_info_manager = Arc::new(TableRouteManager::new(kv_backend));
     let init = init_factory(table_info_manager);
 
-    CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
+    CacheContainer::with_strategy(
+        name,
+        cache,
+        Box::new(invalidator),
+        init,
+        filter,
+        InitStrategy::VersionChecked,
+    )
 }
 
 fn init_factory(
@@ -92,11 +100,13 @@ fn init_factory(
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, Arc<TableRoute>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableId(table_id) = ident {
-            cache.invalidate(table_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(table_id) = ident {
+                cache.invalidate(table_id).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_schema.rs b/src/common/meta/src/cache/table/table_schema.rs
index 99ece65683..33b1773f45 100644
--- a/src/common/meta/src/cache/table/table_schema.rs
+++ b/src/common/meta/src/cache/table/table_schema.rs
@@ -65,7 +65,7 @@ fn init_factory(table_info_manager: TableInfoManager) -> Initializer<TableId, Ar
 /// Never invalidates table id schema cache.
 fn invalidator<'a>(
     _cache: &'a Cache<TableId, Arc<SchemaName>>,
-    _ident: &'a CacheIdent,
+    _idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, error::Result<()>> {
     Box::pin(std::future::ready(Ok(())))
 }
diff --git a/src/common/meta/src/cache/table/view_info.rs b/src/common/meta/src/cache/table/view_info.rs
index 6a85493d42..d0e1058a7e 100644
--- a/src/common/meta/src/cache/table/view_info.rs
+++ b/src/common/meta/src/cache/table/view_info.rs
@@ -60,11 +60,13 @@ fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer<TableId, A
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, Arc<ViewInfoValue>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableId(view_id) = ident {
-            cache.invalidate(view_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(view_id) = ident {
+                cache.invalidate(view_id).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs
index c6613af828..b9fcbd6188 100644
--- a/src/common/meta/src/error.rs
+++ b/src/common/meta/src/error.rs
@@ -714,6 +714,16 @@ pub enum Error {
     #[snafu(display("Failed to get cache"))]
     GetCache { source: Arc<Error> },
 
+    #[snafu(display(
+        "Failed to get latest cache value after {} attempts due to concurrent invalidation",
+        attempts
+    ))]
+    GetLatestCacheRetryExceeded {
+        attempts: usize,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[cfg(feature = "pg_kvbackend")]
     #[snafu(display("Failed to execute via Postgres, sql: {}", sql))]
     PostgresExecution {
@@ -1063,6 +1073,7 @@ impl ErrorExt for Error {
             | ConnectEtcd { .. }
             | MoveValues { .. }
             | GetCache { .. }
+            | GetLatestCacheRetryExceeded { .. }
             | SerializeToJson { .. }
             | DeserializeFromJson { .. } => StatusCode::Internal,
 
@@ -1243,7 +1254,10 @@ impl Error {
 
     /// Determine whether it is a retry later type through [StatusCode]
     pub fn is_retry_later(&self) -> bool {
-        matches!(self, Error::RetryLater { .. })
+        matches!(
+            self,
+            Error::RetryLater { .. } | Error::GetLatestCacheRetryExceeded { .. }
+        )
     }
 
     /// Determine whether it needs to clean poisons.
diff --git a/src/partition/src/cache.rs b/src/partition/src/cache.rs
index a886e1e08d..4066b69aa3 100644
--- a/src/partition/src/cache.rs
+++ b/src/partition/src/cache.rs
@@ -121,10 +121,12 @@ pub fn new_partition_info_cache(
     CacheContainer::new(
         name,
         cache,
-        Box::new(|cache, ident| {
+        Box::new(|cache, idents| {
             Box::pin(async move {
-                if let CacheIdent::TableId(table_id) = ident {
-                    cache.invalidate(table_id).await
+                for ident in idents {
+                    if let CacheIdent::TableId(table_id) = ident {
+                        cache.invalidate(table_id).await
+                    }
                 }
                 Ok(())
             })

From 30e895abbef7ec63be7afd8dfdecf448ce88453e Mon Sep 17 00:00:00 2001
From: discord9 <55937128+discord9@users.noreply.github.com>
Date: Tue, 24 Mar 2026 14:24:52 +0800
Subject: [PATCH 34/42] fix: prom cast to f64 (#7840)

* fix: cast to f64

Signed-off-by: discord9 <discord9@163.com>

* test: div case

Signed-off-by: discord9 <discord9@163.com>

* test: int test

Signed-off-by: discord9 <discord9@163.com>

* chore: sqlness update

Signed-off-by: discord9 <discord9@163.com>

* chore: test

Signed-off-by: discord9 <discord9@163.com>

* chore: update test

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
---
 src/query/src/promql/planner.rs               |  55 ++--
 tests-integration/src/tests/promql_test.rs    | 238 +++++++++++++++++-
 .../explain/step_aggr_advance.result          |  90 +++----
 .../promql/anon_promql_ratio_repro.result     | 106 ++++++++
 .../common/promql/anon_promql_ratio_repro.sql |  63 +++++
 .../standalone/common/tql/tql-cte.result      |   4 +-
 6 files changed, 494 insertions(+), 62 deletions(-)
 create mode 100644 tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
 create mode 100644 tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql

diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs
index 427644e26a..b6f4f2d28f 100644
--- a/src/query/src/promql/planner.rs
+++ b/src/query/src/promql/planner.rs
@@ -3323,28 +3323,55 @@ impl PromPlanner {
     fn prom_token_to_binary_expr_builder(
         token: TokenType,
     ) -> Result<Box<dyn Fn(DfExpr, DfExpr) -> Result<DfExpr>>> {
+        let cast_float = |expr| {
+            if matches!(
+                &expr,
+                DfExpr::Cast(Cast {
+                    data_type: ArrowDataType::Float64,
+                    ..
+                })
+            ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _))
+            {
+                expr
+            } else {
+                DfExpr::Cast(Cast {
+                    expr: Box::new(expr),
+                    data_type: ArrowDataType::Float64,
+                })
+            }
+        };
         match token.id() {
-            token::T_ADD => Ok(Box::new(|lhs, rhs| Ok(lhs + rhs))),
-            token::T_SUB => Ok(Box::new(|lhs, rhs| Ok(lhs - rhs))),
-            token::T_MUL => Ok(Box::new(|lhs, rhs| Ok(lhs * rhs))),
-            token::T_DIV => Ok(Box::new(|lhs, rhs| Ok(lhs / rhs))),
-            token::T_MOD => Ok(Box::new(|lhs: DfExpr, rhs| Ok(lhs % rhs))),
+            token::T_ADD => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) + cast_float(rhs))
+            })),
+            token::T_SUB => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) - cast_float(rhs))
+            })),
+            token::T_MUL => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) * cast_float(rhs))
+            })),
+            token::T_DIV => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) / cast_float(rhs))
+            })),
+            token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| {
+                Ok(cast_float(lhs) % cast_float(rhs))
+            })),
             token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))),
             token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))),
             token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))),
             token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))),
             token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))),
             token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))),
-            token::T_POW => Ok(Box::new(|lhs, rhs| {
+            token::T_POW => Ok(Box::new(move |lhs, rhs| {
                 Ok(DfExpr::ScalarFunction(ScalarFunction {
                     func: datafusion_functions::math::power(),
-                    args: vec![lhs, rhs],
+                    args: vec![cast_float(lhs), cast_float(rhs)],
                 }))
             })),
-            token::T_ATAN2 => Ok(Box::new(|lhs, rhs| {
+            token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| {
                 Ok(DfExpr::ScalarFunction(ScalarFunction {
                     func: datafusion_functions::math::atan2(),
-                    args: vec![lhs, rhs],
+                    args: vec![cast_float(lhs), cast_float(rhs)],
                 }))
             })),
             _ => UnexpectedTokenSnafu { token }.fail(),
@@ -5169,7 +5196,7 @@ mod test {
                 .unwrap();
 
         let expected = String::from(
-            "Projection: rhs.tag_0, rhs.timestamp, lhs.field_0 + rhs.field_0 AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
+            "Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
             \n  Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5224,7 +5251,7 @@ mod test {
     async fn binary_op_literal_column() {
         let query = r#"1 + some_metric{tag_0="bar"}"#;
         let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + some_metric.field_0 AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
             \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5262,7 +5289,7 @@ mod test {
     async fn bool_with_additional_arithmetic() {
         let query = "some_metric + (1 == bool 2)";
         let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, some_metric.field_0 + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
             \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5372,7 +5399,7 @@ mod test {
             PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
                 .await
                 .unwrap();
-        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
+        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
             \n  Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\
             \n    SubqueryAlias: http_server_requests_seconds_sum\
             \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
@@ -5763,7 +5790,7 @@ mod test {
 
         let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric";
         let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, greptime_private.some_alt_metric.field_0 / some_metric.field_0 AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
             \n  Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
diff --git a/tests-integration/src/tests/promql_test.rs b/tests-integration/src/tests/promql_test.rs
index 7fbce91ea6..ede4663118 100644
--- a/tests-integration/src/tests/promql_test.rs
+++ b/tests-integration/src/tests/promql_test.rs
@@ -15,7 +15,9 @@
 use std::sync::Arc;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::util::collect_batches;
+use datatypes::arrow::array::{Float64Array, Int64Array};
 use frontend::instance::Instance;
 use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
 use rstest::rstest;
@@ -151,6 +153,103 @@ async fn create_insert_tql_assert(
     check_unordered_output_stream(query_output, expected).await;
 }
 
+async fn execute_all(instance: &Arc<Instance>, sql: &str, query_ctx: Arc<QueryContext>) {
+    instance
+        .do_query(sql, query_ctx)
+        .await
+        .into_iter()
+        .for_each(|v| {
+            let _ = v.unwrap();
+        });
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn promql_query_as_batches(
+    ins: Arc<Instance>,
+    promql: &str,
+    alias: Option<String>,
+    query_ctx: Arc<QueryContext>,
+    start: SystemTime,
+    end: SystemTime,
+    interval: Duration,
+    lookback: Duration,
+) -> common_recordbatch::RecordBatches {
+    let output = promql_query(
+        ins, promql, alias, query_ctx, start, end, interval, lookback,
+    )
+    .await
+    .unwrap();
+    match output.data {
+        OutputData::Stream(stream) => collect_batches(stream).await.unwrap(),
+        OutputData::RecordBatches(recordbatches) => recordbatches,
+        _ => unreachable!(),
+    }
+}
+
+const ANON_PROMQL_RATIO_REPRO_DB: &str = "repro_db";
+
+const ANON_PROMQL_RATIO_REPRO_CREATE: &str = r#"
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+"#;
+
+const ANON_PROMQL_RATIO_REPRO_INSERT: &str = r#"
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+"#;
+
+const ANON_PROMQL_RATIO_REPRO_NUMERATOR: &str = r#"count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50))"#;
+
+const ANON_PROMQL_RATIO_REPRO_DENOMINATOR: &str =
+    r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))"#;
+
+const ANON_PROMQL_RATIO_REPRO_WHOLE: &str = r#"(count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))) * 100"#;
+
+const ANON_PROMQL_RATIO_REPRO_SCALAR_DIV: &str =
+    r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m])) / 2"#;
+
 #[apply(both_instances_cases)]
 async fn sql_insert_tql_query_ceil(instance: Arc<dyn MockInstance>) {
     let instance = instance.frontend();
@@ -709,3 +808,140 @@ async fn cross_schema_query(instance: Arc<dyn MockInstance>) {
 
     check_unordered_output_stream(query_output, expected).await;
 }
+/// Checks a PromQL count ratio piecewise: numerator, denominator, percentage, and count / 2.
+#[apply(both_instances_cases)]
+async fn anon_promql_ratio_repro(instance: Arc<dyn MockInstance>) {
+    let ins = instance.frontend();
+    // Database creation uses a plain context; table DDL and inserts use a db-scoped one.
+    execute_all(
+        &ins,
+        &format!("CREATE DATABASE {ANON_PROMQL_RATIO_REPRO_DB}"),
+        QueryContext::arc(),
+    )
+    .await;
+
+    let repro_ctx: Arc<QueryContext> =
+        QueryContext::with_db_name(Some(ANON_PROMQL_RATIO_REPRO_DB)).into();
+    execute_all(&ins, ANON_PROMQL_RATIO_REPRO_CREATE, repro_ctx.clone()).await;
+    execute_all(&ins, ANON_PROMQL_RATIO_REPRO_INSERT, repro_ctx).await;
+    // NOTE(review): 180s/360s presumably line up with the t=180000/360000 samples (ms) - confirm.
+    let start = UNIX_EPOCH.checked_add(Duration::from_secs(180)).unwrap();
+    let end = UNIX_EPOCH.checked_add(Duration::from_secs(360)).unwrap();
+    let interval = Duration::from_secs(180);
+    let lookback = Duration::from_secs(1);
+    // Queries run with a plain context; each PromQL string carries a `__schema__` matcher.
+    let numerator = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_NUMERATOR,
+        Some("num".to_string()), // read back below as the "num" column
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let denominator = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_DENOMINATOR,
+        Some("den".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let whole = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_WHOLE,
+        Some("pct".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let scalar_div = promql_query_as_batches(
+        ins,
+        ANON_PROMQL_RATIO_REPRO_SCALAR_DIV,
+        Some("half_den".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    // Collect each result's batches into a Vec; only the first batch is inspected below.
+    let numerator = numerator.iter().collect::<Vec<_>>();
+    let denominator = denominator.iter().collect::<Vec<_>>();
+    let whole = whole.iter().collect::<Vec<_>>();
+    let scalar_div = scalar_div.iter().collect::<Vec<_>>();
+    // Counts are expected as Int64 columns and divisions as Float64; a mismatch panics here.
+    let numerator_values = numerator[0]
+        .column_by_name("num")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let denominator_values = denominator[0]
+        .column_by_name("den")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let percentage_values = whole[0]
+        .column_by_name("pct")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .unwrap();
+    let scalar_div_values = scalar_div[0]
+        .column_by_name("half_den")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .unwrap();
+    // Every query must yield exactly one value in the first batch of its result.
+    assert_eq!(numerator_values.len(), 1, "{}", numerator[0].pretty_print());
+    assert_eq!(
+        denominator_values.len(),
+        1,
+        "{}",
+        denominator[0].pretty_print()
+    );
+    assert_eq!(percentage_values.len(), 1, "{}", whole[0].pretty_print());
+    assert_eq!(
+        scalar_div_values.len(),
+        1,
+        "{}",
+        scalar_div[0].pretty_print()
+    );
+    // Expected: 1 series passes the > 0.50 filter out of 3 counted, so count / 2 is 1.5.
+    assert_eq!(
+        numerator_values.value(0),
+        1,
+        "{}",
+        numerator[0].pretty_print()
+    );
+    assert_eq!(
+        denominator_values.value(0),
+        3,
+        "{}",
+        denominator[0].pretty_print()
+    );
+    assert!(
+        (scalar_div_values.value(0) - 1.5).abs() < 1e-9,
+        "{}",
+        scalar_div[0].pretty_print()
+    );
+    // The percentage must match numerator / denominator * 100, i.e. 1 / 3 * 100.
+    let expected = 100.0 / 3.0;
+    assert!(
+        (percentage_values.value(0) - expected).abs() < 1e-9,
+        "{}",
+        whole[0].pretty_print()
+    );
+}
diff --git a/tests/cases/distributed/explain/step_aggr_advance.result b/tests/cases/distributed/explain/step_aggr_advance.result
index 4bd83b7afa..5938fa202d 100644
--- a/tests/cases/distributed/explain/step_aggr_advance.result
+++ b/tests/cases/distributed/explain/step_aggr_advance.result
@@ -442,54 +442,54 @@ Affected Rows: 0
 -- SQLNESS REPLACE (Hash.*) REDACTED
 tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
 
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan  | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
-|               |   Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp                                                                                                                                                                                                                                                                                                                                                      |
-|               |     MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                          |
-|               |       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               | SubqueryAlias: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               |   Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-|               |     Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                       |
-|               |       Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-|               |         Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d                                                                                                                                                                                                                                                           |
-|               |           PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |             PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               |               PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-|               |                 Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                          |
-|               |                   Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                       |
-|               |                     TableScan: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|               |     SubqueryAlias: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |       Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                     |
-|               |         Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                           |
-|               |           Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-|               |             Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c                                                                                                                                                                                                                                              |
-|               |               PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                          |
-|               |                 PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-|               |                   PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-|               |                     Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                        |
-|               |                       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                             |
-|               |   TableScan: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                               |
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan  | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, CAST(aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) / CAST(aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
+|               |   Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |     MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               | SubqueryAlias: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               |   Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|               |     Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |       Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+|               |         Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d                                                                                                                                                                                                                                                                                             |
+|               |           PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |             PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               |               PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+|               |                 Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |                   Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |                     TableScan: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |     SubqueryAlias: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |       Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|               |         Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                             |
+|               |           Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |             Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c                                                                                                                                                                                                                                                                                |
+|               |               PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |                 PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+|               |                   PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+|               |                     Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                          |
+|               |                       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+|               |   TableScan: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                 |
 |               |   REDACTED
-|               |     CoalescePartitionsExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |       AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                                                                                                                                                                                                                                                                            |
-|               |         FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               |           ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c]                                                                                                                                                                                                                                                                                                                 |
-|               |             PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp]                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               |               PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-|               |                 PromSeriesDivideExec: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-|               |                   SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+|               |     CoalescePartitionsExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |       AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                                                                                                                                                                                                                                                                                                              |
+|               |         FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               |           ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c]                                                                                                                                                                                                                                                                                                                                                   |
+|               |             PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               |               PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+|               |                 PromSeriesDivideExec: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+|               |                   SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
 |               |                     MergeScanExec: REDACTED
-|               |     SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|               |       CooperativeExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|               |     SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |       CooperativeExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
 |               |         MergeScanExec: REDACTED
-|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 
 -- SQLNESS REPLACE (metrics.*) REDACTED
 -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
new file mode 100644
index 0000000000..ab3c4db715
--- /dev/null
+++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
@@ -0,0 +1,106 @@
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+Affected Rows: 0
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+Affected Rows: 0
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+Affected Rows: 0
+
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+Affected Rows: 9
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+
+Affected Rows: 6
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+
++---------------------+-------------------------------------------------------------------+
+| t                   | count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) |
++---------------------+-------------------------------------------------------------------+
+| 1970-01-01T00:03:00 | 1                                                                 |
++---------------------+-------------------------------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+
++---------------------+---------------------------------------------+
+| t                   | count(prom_rate(t_range,v,t,Int64(180000))) |
++---------------------+---------------------------------------------+
+| 1970-01-01T00:03:00 | 3                                           |
++---------------------+---------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+
++---------------------+----------------------------------------------------------+
+| t                   | count(prom_rate(t_range,v,t,Int64(180000))) / Float64(2) |
++---------------------+----------------------------------------------------------+
+| 1970-01-01T00:03:00 | 1.5                                                      |
++---------------------+----------------------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+
++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+| t                   | metric_b.count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) / metric_a.count(prom_rate(t_range,v,t,Int64(180000))) * Float64(100) |
++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+| 1970-01-01T00:03:00 | 33.33333333333333                                                                                                                                |
++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+
+DROP TABLE metric_a;
+
+Affected Rows: 0
+
+DROP TABLE metric_b;
+
+Affected Rows: 0
+
+DROP TABLE phy;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql
new file mode 100644
index 0000000000..946d4f93a1
--- /dev/null
+++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql
@@ -0,0 +1,63 @@
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+
+DROP TABLE metric_a;
+DROP TABLE metric_b;
+DROP TABLE phy;
diff --git a/tests/cases/standalone/common/tql/tql-cte.result b/tests/cases/standalone/common/tql/tql-cte.result
index a8c0c45d5d..e8278e80bd 100644
--- a/tests/cases/standalone/common/tql/tql-cte.result
+++ b/tests/cases/standalone/common/tql/tql-cte.result
@@ -427,8 +427,8 @@ SELECT min(val) as min_computed, max(val) as max_computed FROM computed;
 |               |   Aggregate: groupBy=[[]], aggr=[[min(computed.val), max(computed.val)]]                                                    |
 |               |     SubqueryAlias: computed                                                                                                 |
 |               |       Projection: metric.ts AS ts, val * Float64(2) + Float64(1) AS val                                                     |
-|               |         Projection: metric.ts, val * Float64(2) + Float64(1) AS val * Float64(2) + Float64(1)                               |
-|               |           Projection: metric.ts, metric.val * Float64(2) AS val * Float64(2)                                                |
+|               |         Projection: metric.ts, CAST(val * Float64(2) AS Float64) + Float64(1) AS val * Float64(2) + Float64(1)              |
+|               |           Projection: metric.ts, CAST(metric.val AS Float64) * Float64(2) AS val * Float64(2)                               |
 |               |             PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts]                   |
 |               |               PromSeriesDivide: tags=[]                                                                                     |
 |               |                 Filter: metric.ts >= TimestampMillisecond(-299999, None) AND metric.ts <= TimestampMillisecond(40000, None) |

From 187b8d3798b69d4881d2bb667e4facd16729b115 Mon Sep 17 00:00:00 2001
From: liyang <daviderli614@gmail.com>
Date: Tue, 24 Mar 2026 17:19:18 +0800
Subject: [PATCH 35/42] ci: remove redundant directory level when uploading
 artifacts to S3 (#7852)

Signed-off-by: liyang <daviderli614@gmail.com>
---
 .github/scripts/upload-artifacts-to-s3.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/upload-artifacts-to-s3.sh b/.github/scripts/upload-artifacts-to-s3.sh
index 310575c069..1ddf32044b 100755
--- a/.github/scripts/upload-artifacts-to-s3.sh
+++ b/.github/scripts/upload-artifacts-to-s3.sh
@@ -33,7 +33,7 @@ function upload_artifacts() {
   #    └── greptime-darwin-amd64-v0.2.0.tar.gz
   find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
     filename=$(basename "$file")
-    TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION/$filename"
+    TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION"
 
     curl -X PUT \
       -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
@@ -49,7 +49,7 @@ function update_version_info() {
     if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
       echo "Updating latest-version.txt"
       echo "$VERSION" > latest-version.txt
-      TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-version.txt"
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
 
       curl -X PUT \
         -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
@@ -62,7 +62,7 @@ function update_version_info() {
       echo "Updating latest-nightly-version.txt"
       echo "$VERSION" > latest-nightly-version.txt
 
-      TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-nightly-version.txt"
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
       curl -X PUT \
         -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
         -F "file=@latest-nightly-version.txt" \

From 0e22d6a72b7ee66b5e3c284a47da97ec6af2837e Mon Sep 17 00:00:00 2001
From: Yingwen <realevenyag@gmail.com>
Date: Tue, 24 Mar 2026 18:01:13 +0800
Subject: [PATCH 36/42] feat: implement partition range cache stream (#7842)

* feat: add cache stream helpers, key construction, config wiring, and metrics for partition range cache

Add range result cache size config field and wire it through cache builder
chains. Implement cache key building (build_range_cache_key), stream
replay/store helpers (cached_flat_range_stream, cache_flat_range_stream),
dictionary compaction (compact_pk_dictionary), and partition range row group
collection. Add range cache metrics (size, hit, miss) to ScanMetricsSet
and PartitionMetrics. Move fingerprint tests from scan_region to
range_cache module. These functions are not yet wired into scan execution.

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: add benchmark for cache stream

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: move bench_util to test_util

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: share dict

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: test ptr_eq

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: fmt code

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: simplify value array handling

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: add todo for estimate size

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: simplify size calculation

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remove one test

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: update config test

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: address review comment

Only ignore exprs that can extract time ranges

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: fix tests

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
---
 src/mito2/Cargo.toml                       |   5 +
 src/mito2/benches/bench_cache_stream.rs    | 126 +++++
 src/mito2/benches/memtable_bench.rs        | 245 +-------
 src/mito2/src/cache.rs                     |  13 +-
 src/mito2/src/config.rs                    |   4 +
 src/mito2/src/memtable/bulk/part.rs        |  11 +-
 src/mito2/src/memtable/bulk/part_reader.rs |   2 +-
 src/mito2/src/read.rs                      |   3 +
 src/mito2/src/read/range_cache.rs          | 628 ++++++++++++++++++++-
 src/mito2/src/read/scan_region.rs          |  39 +-
 src/mito2/src/read/scan_util.rs            |  40 ++
 src/mito2/src/test_util.rs                 |   1 +
 src/mito2/src/test_util/bench_util.rs      | 259 +++++++++
 src/mito2/src/test_util/memtable_util.rs   |   2 +-
 src/mito2/src/worker.rs                    |   2 +
 src/table/src/predicate.rs                 |   2 +-
 tests-integration/tests/http.rs            |   1 +
 17 files changed, 1113 insertions(+), 270 deletions(-)
 create mode 100644 src/mito2/benches/bench_cache_stream.rs
 create mode 100644 src/mito2/src/test_util/bench_util.rs

diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml
index 1d7cf7b6d7..a78bf079b0 100644
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -108,6 +108,11 @@ name = "memtable_bench"
 harness = false
 required-features = ["test"]
 
+[[bench]]
+name = "bench_cache_stream"
+harness = false
+required-features = ["test"]
+
 [[bench]]
 name = "bench_filter_time_partition"
 harness = false
diff --git a/src/mito2/benches/bench_cache_stream.rs b/src/mito2/benches/bench_cache_stream.rs
new file mode 100644
index 0000000000..f2314f2ccb
--- /dev/null
+++ b/src/mito2/benches/bench_cache_stream.rs
@@ -0,0 +1,126 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Benchmarks for `cache_flat_range_stream` overhead.
+//!
+//! Compares consuming batches from a plain stream vs through the caching wrapper
+//! that clones batches for the range cache.
+//!
+//! Run with:
+//! ```sh
+//! cargo bench -p mito2 --features test --bench bench_cache_stream
+//! ```
+
+use std::collections::VecDeque;
+use std::sync::Arc;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use futures::TryStreamExt;
+use mito_codec::row_converter::DensePrimaryKeyCodec;
+use mito2::memtable::bulk::context::BulkIterContext;
+use mito2::memtable::bulk::part::{BulkPartConverter, BulkPartEncoder};
+use mito2::memtable::bulk::part_reader::EncodedBulkPartIter;
+use mito2::read::range_cache::bench_cache_flat_range_stream;
+use mito2::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
+use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
+use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
+
+fn cache_flat_range_stream_bench(c: &mut Criterion) {
+    let metadata = Arc::new(cpu_metadata());
+    let region_id = metadata.region_id;
+    let start_sec = 1710043200;
+    // 2000 hosts × 51 steps = 102,000 rows ≈ DEFAULT_ROW_GROUP_SIZE
+    let num_hosts = 2000;
+    let end_sec = start_sec + 510;
+    let generator = CpuDataGenerator::new(metadata.clone(), num_hosts, start_sec, end_sec);
+
+    // Build a BulkPart from all the generated data
+    let schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
+    let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
+
+    let mut converter = BulkPartConverter::new(
+        &metadata,
+        schema,
+        DEFAULT_ROW_GROUP_SIZE,
+        codec,
+        true, // store_pk_columns
+    );
+    for kvs in generator.iter() {
+        converter.append_key_values(&kvs).unwrap();
+    }
+    let bulk_part = converter.convert().unwrap();
+
+    // Encode to parquet
+    let encoder = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE).unwrap();
+    let encoded_part = encoder.encode_part(&bulk_part).unwrap().unwrap();
+
+    // Prepare the read context and row groups; batches are decoded inside each benchmark.
+    let num_row_groups = encoded_part.metadata().parquet_metadata.num_row_groups();
+    let context = Arc::new(
+        BulkIterContext::new(
+            metadata.clone(),
+            None, // No projection
+            None, // No predicate
+            false,
+        )
+        .unwrap(),
+    );
+    let row_groups: VecDeque<usize> = (0..num_row_groups).collect();
+
+    let rt = tokio::runtime::Runtime::new().unwrap();
+
+    let mut group = c.benchmark_group("cache_flat_range_stream");
+    group.sample_size(10);
+
+    group.bench_function("baseline_iter_stream", |b| {
+        b.iter(|| {
+            rt.block_on(async {
+                let iter = EncodedBulkPartIter::try_new(
+                    &encoded_part,
+                    context.clone(),
+                    row_groups.clone(),
+                    None,
+                    None,
+                )
+                .unwrap();
+                let stream: mito2::read::BoxedRecordBatchStream =
+                    Box::pin(futures::stream::iter(iter));
+                let mut stream = stream;
+                while let Some(_batch) = stream.try_next().await.unwrap() {} // drain every batch
+            });
+        });
+    });
+
+    group.bench_function("cache_flat_range_stream", |b| {
+        b.iter(|| {
+            rt.block_on(async {
+                let iter = EncodedBulkPartIter::try_new(
+                    &encoded_part,
+                    context.clone(),
+                    row_groups.clone(),
+                    None,
+                    None,
+                )
+                .unwrap();
+                let stream: mito2::read::BoxedRecordBatchStream =
+                    Box::pin(futures::stream::iter(iter));
+                let mut stream = bench_cache_flat_range_stream(stream, 64 * 1024 * 1024, region_id);
+                while let Some(_batch) = stream.try_next().await.unwrap() {} // drain every batch
+            });
+        });
+    });
+}
+
+criterion_group!(benches, cache_flat_range_stream_bench);
+criterion_main!(benches);
diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs
index df991f6f92..8336625e3c 100644
--- a/src/mito2/benches/memtable_bench.rs
+++ b/src/mito2/benches/memtable_bench.rs
@@ -12,15 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//! Benchmarks for memtable operations: writes, full scans, filtered scans,
+//! bulk part conversion, record batch iteration with filters, and flat merge.
+//!
+//! Run with:
+//! ```sh
+//! cargo bench -p mito2 --features test --bench memtable_bench
+//! ```
+
 use std::sync::Arc;
 
-use api::v1::value::ValueData;
-use api::v1::{Row, Rows, SemanticType};
 use criterion::{Criterion, criterion_group, criterion_main};
-use datafusion_common::Column;
-use datafusion_expr::{Expr, lit};
-use datatypes::data_type::ConcreteDataType;
-use datatypes::schema::ColumnSchema;
 use mito_codec::row_converter::DensePrimaryKeyCodec;
 use mito2::memtable::bulk::context::BulkIterContext;
 use mito2::memtable::bulk::part::BulkPartConverter;
@@ -28,20 +30,13 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter;
 use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig};
 use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
 use mito2::memtable::time_series::TimeSeriesMemtable;
-use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions};
+use mito2::memtable::{IterBuilder, Memtable, RangesOptions};
 use mito2::read::flat_merge::FlatMergeIterator;
 use mito2::read::scan_region::PredicateGroup;
 use mito2::region::options::MergeMode;
 use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
-use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
-use rand::Rng;
-use rand::rngs::ThreadRng;
-use rand::seq::IndexedRandom;
-use store_api::metadata::{
-    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
-};
-use store_api::storage::RegionId;
-use table::predicate::Predicate;
+use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
+use mito2::test_util::memtable_util;
 
 /// Writes rows.
 fn write_rows(c: &mut Criterion) {
@@ -216,224 +211,6 @@ fn filter_1_host(c: &mut Criterion) {
     });
 }
 
-struct Host {
-    hostname: String,
-    region: String,
-    datacenter: String,
-    rack: String,
-    os: String,
-    arch: String,
-    team: String,
-    service: String,
-    service_version: String,
-    service_environment: String,
-}
-
-impl Host {
-    fn random_with_id(id: usize) -> Host {
-        let mut rng = rand::rng();
-        let region = format!("ap-southeast-{}", rng.random_range(0..10));
-        let datacenter = format!(
-            "{}{}",
-            region,
-            ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
-        );
-        Host {
-            hostname: format!("host_{id}"),
-            region,
-            datacenter,
-            rack: rng.random_range(0..100).to_string(),
-            os: "Ubuntu16.04LTS".to_string(),
-            arch: "x86".to_string(),
-            team: "CHI".to_string(),
-            service: rng.random_range(0..100).to_string(),
-            service_version: rng.random_range(0..10).to_string(),
-            service_environment: "test".to_string(),
-        }
-    }
-
-    fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
-        let tags = [
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.hostname.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.region.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.datacenter.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.rack.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.os.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.arch.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.team.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service_version.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service_environment.clone())),
-            },
-        ];
-        for tag in tags {
-            values.push(tag);
-        }
-    }
-}
-
-struct CpuDataGenerator {
-    metadata: RegionMetadataRef,
-    column_schemas: Vec<api::v1::ColumnSchema>,
-    hosts: Vec<Host>,
-    start_sec: i64,
-    end_sec: i64,
-}
-
-impl CpuDataGenerator {
-    fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
-        let column_schemas = region_metadata_to_row_schema(&metadata);
-        Self {
-            metadata,
-            column_schemas,
-            hosts: Self::generate_hosts(num_hosts),
-            start_sec,
-            end_sec,
-        }
-    }
-
-    fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
-        // point per 10s.
-        (self.start_sec..self.end_sec)
-            .step_by(10)
-            .enumerate()
-            .map(|(seq, ts)| self.build_key_values(seq, ts))
-    }
-
-    fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
-        let rows = self
-            .hosts
-            .iter()
-            .map(|host| {
-                let mut rng = rand::rng();
-                let mut values = Vec::with_capacity(21);
-                values.push(api::v1::Value {
-                    value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
-                });
-                host.fill_values(&mut values);
-                for _ in 0..10 {
-                    values.push(api::v1::Value {
-                        value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
-                    });
-                }
-                Row { values }
-            })
-            .collect();
-        let mutation = api::v1::Mutation {
-            op_type: api::v1::OpType::Put as i32,
-            sequence: seq as u64,
-            rows: Some(Rows {
-                schema: self.column_schemas.clone(),
-                rows,
-            }),
-            write_hint: None,
-        };
-
-        KeyValues::new(&self.metadata, mutation).unwrap()
-    }
-
-    fn random_host_filter(&self) -> Predicate {
-        let host = self.random_hostname();
-        let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
-        Predicate::new(vec![expr])
-    }
-
-    fn random_host_filter_exprs(&self) -> Vec<Expr> {
-        let host = self.random_hostname();
-        vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))]
-    }
-
-    fn random_hostname(&self) -> String {
-        let mut rng = rand::rng();
-        self.hosts.choose(&mut rng).unwrap().hostname.clone()
-    }
-
-    fn random_f64(rng: &mut ThreadRng) -> f64 {
-        let base: u32 = rng.random_range(30..95);
-        base as f64
-    }
-
-    fn generate_hosts(num_hosts: usize) -> Vec<Host> {
-        (0..num_hosts).map(Host::random_with_id).collect()
-    }
-}
-
-/// Creates a metadata for TSBS cpu-like table.
-fn cpu_metadata() -> RegionMetadata {
-    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
-    builder.push_column_metadata(ColumnMetadata {
-        column_schema: ColumnSchema::new(
-            "ts",
-            ConcreteDataType::timestamp_millisecond_datatype(),
-            false,
-        ),
-        semantic_type: SemanticType::Timestamp,
-        column_id: 0,
-    });
-    let mut column_id = 1;
-    let tags = [
-        "hostname",
-        "region",
-        "datacenter",
-        "rack",
-        "os",
-        "arch",
-        "team",
-        "service",
-        "service_version",
-        "service_environment",
-    ];
-    for tag in tags {
-        builder.push_column_metadata(ColumnMetadata {
-            column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
-            semantic_type: SemanticType::Tag,
-            column_id,
-        });
-        column_id += 1;
-    }
-    let fields = [
-        "usage_user",
-        "usage_system",
-        "usage_idle",
-        "usage_nice",
-        "usage_iowait",
-        "usage_irq",
-        "usage_softirq",
-        "usage_steal",
-        "usage_guest",
-        "usage_guest_nice",
-    ];
-    for field in fields {
-        builder.push_column_metadata(ColumnMetadata {
-            column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
-            semantic_type: SemanticType::Field,
-            column_id,
-        });
-        column_id += 1;
-    }
-    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
-    builder.build().unwrap()
-}
-
 fn bulk_part_converter(c: &mut Criterion) {
     let metadata = Arc::new(cpu_metadata());
     let start_sec = 1710043200;
diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs
index c9a8b99166..35db74eee6 100644
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -350,7 +350,7 @@ impl CacheStrategy {
 
     /// Calls [CacheManager::get_range_result()].
     /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
-    #[cfg_attr(not(test), allow(dead_code))]
+    #[allow(dead_code)]
     pub(crate) fn get_range_result(
         &self,
         key: &RangeScanCacheKey,
@@ -363,7 +363,6 @@ impl CacheStrategy {
 
     /// Calls [CacheManager::put_range_result()].
     /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
-    #[cfg_attr(not(test), allow(dead_code))]
     pub(crate) fn put_range_result(
         &self,
         key: RangeScanCacheKey,
@@ -476,7 +475,6 @@ pub struct CacheManager {
     /// Cache for time series selectors.
     selector_result_cache: Option<SelectorResultCache>,
     /// Cache for range scan outputs in flat format.
-    #[cfg_attr(not(test), allow(dead_code))]
     range_result_cache: Option<RangeResultCache>,
     /// Cache for index result.
     index_result_cache: Option<IndexResultCache>,
@@ -713,7 +711,7 @@ impl CacheManager {
     }
 
     /// Gets cached result for range scan.
-    #[cfg_attr(not(test), allow(dead_code))]
+    #[allow(dead_code)]
     pub(crate) fn get_range_result(
         &self,
         key: &RangeScanCacheKey,
@@ -723,8 +721,7 @@ impl CacheManager {
             .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE))
     }
 
-    /// Puts range scan result into the cache.
-    #[cfg_attr(not(test), allow(dead_code))]
+    /// Puts range scan result into the cache.
     pub(crate) fn put_range_result(
         &self,
         key: RangeScanCacheKey,
@@ -949,7 +946,7 @@ impl CacheManagerBuilder {
             Cache::builder()
                 .max_capacity(self.range_result_cache_size)
                 .weigher(range_result_cache_weight)
-                .eviction_listener(|k, v, cause| {
+                .eviction_listener(move |k, v, cause| {
                     let size = range_result_cache_weight(&k, &v);
                     CACHE_BYTES
                         .with_label_values(&[RANGE_RESULT_TYPE])
@@ -1361,7 +1358,7 @@ mod tests {
             }
             .build(),
         };
-        let value = Arc::new(RangeScanCacheValue::new(Vec::new()));
+        let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0));
 
         assert!(cache.get_range_result(&key).is_none());
         cache.put_range_result(key.clone(), value.clone());
diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs
index 602f5508ba..0eee067ab6 100644
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -116,6 +116,8 @@ pub struct MitoConfig {
     pub page_cache_size: ReadableSize,
     /// Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
     pub selector_result_cache_size: ReadableSize,
+    /// Cache size for flat range scan results. Setting it to 0 to disable the cache.
+    pub range_result_cache_size: ReadableSize,
     /// Whether to enable the write cache.
     pub enable_write_cache: bool,
     /// File system path for write cache dir's root, defaults to `{data_home}`.
@@ -200,6 +202,7 @@ impl Default for MitoConfig {
             vector_cache_size: ReadableSize::mb(512),
             page_cache_size: ReadableSize::mb(512),
             selector_result_cache_size: ReadableSize::mb(512),
+            range_result_cache_size: ReadableSize::mb(512),
             enable_write_cache: false,
             write_cache_path: String::new(),
             write_cache_size: ReadableSize::gb(5),
@@ -336,6 +339,7 @@ impl MitoConfig {
         self.vector_cache_size = mem_cache_size;
         self.page_cache_size = page_cache_size;
         self.selector_result_cache_size = mem_cache_size;
+        self.range_result_cache_size = mem_cache_size;
 
         self.index.adjust_buffer_and_cache_size(sys_memory);
     }
diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs
index 71e49776c0..bf345c038e 100644
--- a/src/mito2/src/memtable/bulk/part.rs
+++ b/src/mito2/src/memtable/bulk/part.rs
@@ -967,7 +967,7 @@ impl EncodedBulkPart {
         Self { data, metadata }
     }
 
-    pub(crate) fn metadata(&self) -> &BulkPartMeta {
+    pub fn metadata(&self) -> &BulkPartMeta {
         &self.metadata
     }
 
@@ -977,7 +977,7 @@ impl EncodedBulkPart {
     }
 
     /// Returns the encoded data.
-    pub(crate) fn data(&self) -> &Bytes {
+    pub fn data(&self) -> &Bytes {
         &self.data
     }
 
@@ -1121,10 +1121,7 @@ pub struct BulkPartEncoder {
 }
 
 impl BulkPartEncoder {
-    pub(crate) fn new(
-        metadata: RegionMetadataRef,
-        row_group_size: usize,
-    ) -> Result<BulkPartEncoder> {
+    pub fn new(metadata: RegionMetadataRef, row_group_size: usize) -> Result<BulkPartEncoder> {
         // TODO(yingwen): Skip arrow schema if needed.
         let json = metadata.to_json().context(InvalidMetadataSnafu)?;
         let key_value_meta =
@@ -1216,7 +1213,7 @@ impl BulkPartEncoder {
     }
 
     /// Encodes bulk part to a [EncodedBulkPart], returns the encoded data.
-    fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
+    pub fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
         if part.batch.num_rows() == 0 {
             return Ok(None);
         }
diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs
index 1e9d955321..904aae8c90 100644
--- a/src/mito2/src/memtable/bulk/part_reader.rs
+++ b/src/mito2/src/memtable/bulk/part_reader.rs
@@ -50,7 +50,7 @@ pub struct EncodedBulkPartIter {
 
 impl EncodedBulkPartIter {
     /// Creates a new [BulkPartIter].
-    pub(crate) fn try_new(
+    pub fn try_new(
         encoded_part: &EncodedBulkPart,
         context: BulkIterContextRef,
         mut row_groups_to_read: VecDeque<usize>,
diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs
index 240a99c247..84931b9f37 100644
--- a/src/mito2/src/read.rs
+++ b/src/mito2/src/read.rs
@@ -27,6 +27,9 @@ pub mod projection;
 pub(crate) mod prune;
 pub(crate) mod pruner;
 pub mod range;
+#[cfg(feature = "test")]
+pub mod range_cache;
+#[cfg(not(feature = "test"))]
 pub(crate) mod range_cache;
 pub mod scan_region;
 pub mod scan_util;
diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs
index 5b90e68bae..5fc8931691 100644
--- a/src/mito2/src/read/range_cache.rs
+++ b/src/mito2/src/read/range_cache.rs
@@ -17,12 +17,23 @@
 use std::mem;
 use std::sync::Arc;
 
+use async_stream::try_stream;
+use common_time::range::TimestampRange;
+use datatypes::arrow::array::{Array, AsArray, DictionaryArray};
+use datatypes::arrow::datatypes::UInt32Type;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::prelude::ConcreteDataType;
+use futures::TryStreamExt;
+use store_api::region_engine::PartitionRange;
 use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector};
 
-use crate::memtable::record_batch_estimated_size;
+use crate::cache::CacheStrategy;
+use crate::read::BoxedRecordBatchStream;
+use crate::read::scan_region::StreamContext;
+use crate::read::scan_util::PartitionMetrics;
 use crate::region::options::MergeMode;
+use crate::sst::file::FileTimeRange;
+use crate::sst::parquet::flat_format::primary_key_column_index;
 
 /// Fingerprint of the scan request fields that affect partition range cache reuse.
 ///
@@ -124,7 +135,6 @@ impl ScanRequestFingerprint {
             .unwrap_or(&[])
     }
 
-    #[cfg(test)]
     pub(crate) fn without_time_filters(&self) -> Self {
         Self {
             inner: Arc::clone(&self.inner),
@@ -163,7 +173,7 @@ impl ScanRequestFingerprint {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub(crate) struct RangeScanCacheKey {
     pub(crate) region_id: RegionId,
-    /// Sorted (file_id, row_group_index) pairs that uniquely identify the covered data.
+    /// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers.
     pub(crate) row_groups: Vec<(FileId, i64)>,
     pub(crate) scan: ScanRequestFingerprint,
 }
@@ -179,30 +189,458 @@ impl RangeScanCacheKey {
 /// Cached result for one range scan.
 pub(crate) struct RangeScanCacheValue {
     pub(crate) batches: Vec<RecordBatch>,
+    /// Precomputed size of all batches, accounting for shared dictionary values.
+    estimated_batches_size: usize,
 }
 
 impl RangeScanCacheValue {
-    #[cfg_attr(not(test), allow(dead_code))]
-    pub(crate) fn new(batches: Vec<RecordBatch>) -> Self {
-        Self { batches }
+    pub(crate) fn new(batches: Vec<RecordBatch>, estimated_batches_size: usize) -> Self {
+        Self {
+            batches,
+            estimated_batches_size,
+        }
     }
 
     pub(crate) fn estimated_size(&self) -> usize {
         mem::size_of::<Self>()
             + self.batches.capacity() * mem::size_of::<RecordBatch>()
-            + self
-                .batches
-                .iter()
-                .map(record_batch_estimated_size)
-                .sum::<usize>()
+            + self.estimated_batches_size
     }
 }
 
+/// Row groups and whether all sources are file-only for a partition range.
+#[allow(dead_code)]
+pub(crate) struct PartitionRangeRowGroups {
+    /// Sorted (file_id, row_group_index) pairs.
+    pub(crate) row_groups: Vec<(FileId, i64)>,
+    pub(crate) only_file_sources: bool,
+}
+
+/// Collects (file_id, row_group_index) pairs from a partition range's row group indices.
+#[allow(dead_code)]
+pub(crate) fn collect_partition_range_row_groups(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> PartitionRangeRowGroups {
+    let range_meta = &stream_ctx.ranges[part_range.identifier];
+    let mut row_groups = Vec::new();
+    let mut only_file_sources = true;
+    // Indices that are not file ranges add no pair; they only clear `only_file_sources`.
+    for index in &range_meta.row_group_indices {
+        if stream_ctx.is_file_range_index(*index) {
+            let file_id = stream_ctx.input.file_from_index(*index).file_id().file_id();
+            row_groups.push((file_id, index.row_group_index));
+        } else {
+            only_file_sources = false;
+        }
+    }
+    // Order by file id bytes first, then by row group index.
+    row_groups.sort_unstable_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()).then(a.1.cmp(&b.1)));
+
+    PartitionRangeRowGroups {
+        row_groups,
+        only_file_sources,
+    }
+}
+
+/// Builds a cache key for the given partition range if it is eligible for caching.
+#[allow(dead_code)]
+pub(crate) fn build_range_cache_key(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> Option<RangeScanCacheKey> {
+    let fingerprint = stream_ctx.scan_fingerprint.as_ref()?;
+    // No fingerprint means the scan is not eligible for caching; `?` already returned `None`.
+    // Dyn filters can change at runtime, so we can't cache when they're present.
+    let has_dyn_filters = stream_ctx
+        .input
+        .predicate_group()
+        .predicate_without_region()
+        .is_some_and(|p| !p.dyn_filters().is_empty());
+    if has_dyn_filters {
+        return None;
+    }
+    // Ranges with any non-file source, or with no row groups at all, get no cache key.
+    let rg = collect_partition_range_row_groups(stream_ctx, part_range);
+    if !rg.only_file_sources || rg.row_groups.is_empty() {
+        return None;
+    }
+    // Time filters are dropped from the key when the query time range covers this range.
+    let range_meta = &stream_ctx.ranges[part_range.identifier];
+    let scan = if query_time_range_covers_partition_range(
+        stream_ctx.input.time_range.as_ref(),
+        range_meta.time_range,
+    ) {
+        fingerprint.without_time_filters()
+    } else {
+        fingerprint.clone()
+    };
+
+    Some(RangeScanCacheKey {
+        region_id: stream_ctx.input.region_metadata().region_id,
+        row_groups: rg.row_groups,
+        scan,
+    })
+}
+
+/// Returns true if the query has no time range or its range contains both partition bounds.
+#[allow(dead_code)]
+fn query_time_range_covers_partition_range(
+    query_time_range: Option<&TimestampRange>,
+    partition_time_range: FileTimeRange,
+) -> bool {
+    let (part_start, part_end) = partition_time_range;
+    match query_time_range {
+        None => true,
+        Some(range) => range.contains(&part_start) && range.contains(&part_end),
+    }
+}
+
+/// Builds a stream that yields clones of the cached record batches in their stored order.
+#[allow(dead_code)]
+pub(crate) fn cached_flat_range_stream(value: Arc<RangeScanCacheValue>) -> BoxedRecordBatchStream {
+    // Clone the batches out of the shared value so the stream owns what it yields.
+    let replay = value.batches.to_vec();
+    Box::pin(futures::stream::iter(replay.into_iter().map(Ok)))
+}
+
+/// Checks by pointer comparison whether two primary key dictionary arrays are backed
+/// by the same values array (data buffer, offsets and null buffer).
+///
+/// The primary key column is always `DictionaryArray<UInt32Type>` with `Binary` values.
+fn pk_values_ptr_eq(a: &DictionaryArray<UInt32Type>, b: &DictionaryArray<UInt32Type>) -> bool {
+    let (lhs, rhs) = (a.values().as_binary::<i32>(), b.values().as_binary::<i32>());
+    let nulls_shared = match (lhs.nulls(), rhs.nulls()) {
+        (Some(l), Some(r)) => l.inner().ptr_eq(r.inner()),
+        (None, None) => true,
+        _ => false,
+    };
+    // The data and offset buffers must be pointer-equal as well.
+    nulls_shared && lhs.values().ptr_eq(rhs.values()) && lhs.offsets().ptr_eq(rhs.offsets())
+}
+
+/// Buffers record batches for caching, tracking memory size while deduplicating
+/// shared dictionary values across batches.
+///
+/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK
+/// column's dictionary values are pointer-equal across batches, we assume all
+/// dictionary columns share their values and deduct the total dictionary values size.
+struct CacheBatchBuffer {
+    batches: Vec<RecordBatch>,
+    /// Running total of batch memory.
+    total_size: usize,
+    /// The first batch's PK dictionary array, for pointer comparison.
+    /// `None` if no dictionary PK column exists or no batch has been added yet.
+    first_pk_dict: Option<DictionaryArray<UInt32Type>>,
+    /// Sum of `get_array_memory_size()` of all dictionary value arrays from the first batch.
+    total_dict_values_size: usize,
+    /// Whether the PK dictionary is still shared across all batches seen so far.
+    shared: bool,
+}
+
+impl CacheBatchBuffer {
+    fn new() -> Self {
+        Self {
+            batches: Vec::new(),
+            total_size: 0,
+            first_pk_dict: None,
+            total_dict_values_size: 0,
+            shared: true,
+        }
+    }
+
+    fn push(&mut self, batch: RecordBatch) {
+        if self.batches.is_empty() {
+            self.init_first_batch(&batch);
+        } else {
+            self.add_subsequent_batch(&batch);
+        }
+        self.batches.push(batch);
+    }
+
+    /// Records the first batch: full size, total dictionary values size and PK dictionary.
+    fn init_first_batch(&mut self, batch: &RecordBatch) {
+        self.total_size += batch.get_array_memory_size();
+        let pk_position = primary_key_column_index(batch.num_columns());
+        let mut dict_values_bytes = 0;
+        for (position, column) in batch.columns().iter().enumerate() {
+            let Some(dict) = column.as_any().downcast_ref::<DictionaryArray<UInt32Type>>() else {
+                continue;
+            };
+            dict_values_bytes += dict.values().get_array_memory_size();
+            if position == pk_position {
+                self.first_pk_dict = Some(dict.clone());
+            }
+        }
+        self.total_dict_values_size = dict_values_bytes;
+    }
+
+    /// Adds a later batch's size, deducting dictionary values shared with the first batch.
+    fn add_subsequent_batch(&mut self, batch: &RecordBatch) {
+        let batch_size = batch.get_array_memory_size();
+        if self.shared
+            && let Some(first_pk_dict) = &self.first_pk_dict
+        {
+            let pk_col_idx = primary_key_column_index(batch.num_columns());
+            let col = batch.column(pk_col_idx);
+            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>()
+                && pk_values_ptr_eq(first_pk_dict, dict)
+            {
+                // Sharing is only verified for the PK column; saturate so a batch smaller
+                // than the first batch's dictionary values cannot underflow the estimate.
+                self.total_size += batch_size.saturating_sub(self.total_dict_values_size);
+                return;
+            }
+            // Dictionary diverged; count this and all later batches at full size.
+            self.shared = false;
+        }
+        self.total_size += batch_size;
+    }
+
+    fn estimated_batches_size(&self) -> usize {
+        self.total_size
+    }
+
+    fn into_batches(self) -> Vec<RecordBatch> {
+        self.batches
+    }
+}
+
+/// Wraps a stream so its batches are buffered and cached once it finishes without error.
+#[allow(dead_code)]
+pub(crate) fn cache_flat_range_stream(
+    mut stream: BoxedRecordBatchStream,
+    cache_strategy: CacheStrategy,
+    key: RangeScanCacheKey,
+    part_metrics: PartitionMetrics,
+) -> BoxedRecordBatchStream {
+    Box::pin(try_stream! {
+        let mut buffer = CacheBatchBuffer::new();
+        while let Some(batch) = stream.try_next().await? {
+            buffer.push(batch.clone());
+            yield batch;
+        }
+        // NOTE(review): the buffer above has no size cap; confirm callers bound range sizes.
+        let estimated_size = buffer.estimated_batches_size();
+        let batches = buffer.into_batches();
+        let value = Arc::new(RangeScanCacheValue::new(batches, estimated_size));
+        part_metrics.inc_range_cache_size(key.estimated_size() + value.estimated_size());
+        cache_strategy.put_range_result(key, value);
+    })
+}
+
+/// Creates a `cache_flat_range_stream` with dummy internals for benchmarking.
+///
+/// This avoids exposing `RangeScanCacheKey`, `ScanRequestFingerprint`, and
+/// `PartitionMetrics` publicly.
+#[cfg(feature = "test")]
+pub fn bench_cache_flat_range_stream(
+    stream: BoxedRecordBatchStream,
+    cache_size_bytes: u64,
+    region_id: RegionId,
+) -> BoxedRecordBatchStream {
+    use std::time::Instant;
+
+    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+
+    use crate::region::options::MergeMode;
+
+    let cache_manager = Arc::new(
+        crate::cache::CacheManager::builder()
+            .range_result_cache_size(cache_size_bytes)
+            .build(),
+    );
+    let cache_strategy = CacheStrategy::EnableAll(cache_manager);
+
+    let fingerprint = ScanRequestFingerprintBuilder {
+        read_column_ids: vec![],
+        read_column_types: vec![],
+        filters: vec![],
+        time_filters: vec![],
+        series_row_selector: None,
+        append_mode: false,
+        filter_deleted: false,
+        merge_mode: MergeMode::LastRow,
+        partition_expr_version: 0,
+    }
+    .build();
+
+    let key = RangeScanCacheKey {
+        region_id,
+        row_groups: vec![],
+        scan: fingerprint,
+    };
+
+    let metrics_set = ExecutionPlanMetricsSet::new();
+    let part_metrics =
+        PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set);
+
+    cache_flat_range_stream(stream, cache_strategy, key, part_metrics)
+}
+
 #[cfg(test)]
 mod tests {
-    use store_api::storage::TimeSeriesRowSelector;
+    use std::sync::Arc;
+    use std::time::Instant;
+
+    use common_time::Timestamp;
+    use common_time::range::TimestampRange;
+    use common_time::timestamp::TimeUnit;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{Expr, col, lit};
+    use smallvec::smallvec;
+    use store_api::storage::FileId;
 
     use super::*;
+    use crate::cache::CacheManager;
+    use crate::read::projection::ProjectionMapper;
+    use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex};
+    use crate::read::scan_region::{PredicateGroup, ScanInput};
+    use crate::test_util::memtable_util::metadata_with_primary_key;
+    use crate::test_util::scheduler_util::SchedulerEnv;
+    use crate::test_util::sst_util::sst_file_handle_with_file_id;
+
+    fn test_cache_strategy() -> CacheStrategy {
+        CacheStrategy::EnableAll(Arc::new(
+            CacheManager::builder()
+                .range_result_cache_size(1024)
+                .build(),
+        ))
+    }
+
+    async fn new_stream_context(
+        filters: Vec<Expr>,
+        query_time_range: Option<TimestampRange>,
+        partition_time_range: FileTimeRange,
+    ) -> (StreamContext, PartitionRange) {
+        let env = SchedulerEnv::new().await;
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap();
+        let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();
+        let file_id = FileId::random();
+        let file = sst_file_handle_with_file_id(
+            file_id,
+            partition_time_range.0.value(),
+            partition_time_range.1.value(),
+        );
+        let input = ScanInput::new(env.access_layer.clone(), mapper)
+            .with_predicate(predicate)
+            .with_time_range(query_time_range)
+            .with_files(vec![file])
+            .with_cache(test_cache_strategy())
+            .with_flat_format(true);
+        let range_meta = RangeMeta {
+            time_range: partition_time_range,
+            indices: smallvec![SourceIndex {
+                index: 0,
+                num_row_groups: 1,
+            }],
+            row_group_indices: smallvec![RowGroupIndex {
+                index: 0,
+                row_group_index: 0,
+            }],
+            num_rows: 10,
+        };
+        let partition_range = range_meta.new_partition_range(0);
+        let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input);
+        let stream_ctx = StreamContext {
+            input,
+            ranges: vec![range_meta],
+            scan_fingerprint,
+            query_start: Instant::now(),
+        };
+
+        (stream_ctx, partition_range)
+    }
+
+    /// Helper to create a timestamp millisecond literal.
+    fn ts_lit(val: i64) -> Expr {
+        lit(ScalarValue::TimestampMillisecond(Some(val), None))
+    }
+
+    #[tokio::test]
+    async fn strips_time_only_filters_when_query_covers_partition_range() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(1000)),
+                col("ts").lt(ts_lit(2001)),
+                col("ts").is_not_null(),
+                col("k0").eq(lit("foo")),
+            ],
+            TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
+            (
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Range-reducible time filters should be cleared when query covers partition range.
+        assert!(key.scan.time_filters().is_empty());
+        // Non-range time predicates stay in filters.
+        let mut expected_filters = [
+            col("k0").eq(lit("foo")).to_string(),
+            col("ts").is_not_null().to_string(),
+        ];
+        expected_filters.sort_unstable();
+        assert_eq!(key.scan.filters(), expected_filters.as_slice());
+    }
+
+    #[tokio::test]
+    async fn preserves_time_filters_when_query_does_not_cover_partition_range() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))],
+            TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond),
+            (
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Time filters should be preserved when query does not cover partition range.
+        assert_eq!(
+            key.scan.time_filters(),
+            [col("ts").gt_eq(ts_lit(1000)).to_string()].as_slice()
+        );
+        assert_eq!(
+            key.scan.filters(),
+            [col("k0").eq(lit("foo")).to_string()].as_slice()
+        );
+    }
+
+    #[tokio::test]
+    async fn strips_time_only_filters_when_query_has_no_time_range_limit() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(1000)),
+                col("ts").is_not_null(),
+                col("k0").eq(lit("foo")),
+            ],
+            None,
+            (
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Range-reducible time filters should be cleared when query has no time range limit.
+        assert!(key.scan.time_filters().is_empty());
+        // Non-range time predicates stay in filters.
+        let mut expected_filters = [
+            col("k0").eq(lit("foo")).to_string(),
+            col("ts").is_not_null().to_string(),
+        ];
+        expected_filters.sort_unstable();
+        assert_eq!(key.scan.filters(), expected_filters.as_slice());
+    }
 
     #[test]
     fn normalizes_and_clears_time_filters() {
@@ -249,4 +687,170 @@ mod tests {
             fingerprint.partition_expr_version
         );
     }
+
+    /// Creates a test schema with 5 columns where the primary key dictionary column
+    /// is at index 2 (`num_columns - 3`), matching the flat format layout.
+    ///
+    /// Layout: `[field0: Int64, field1: Int64, pk: Dictionary<UInt32,Binary>, ts: Int64, seq: Int64]`
+    fn dict_test_schema() -> Arc<datatypes::arrow::datatypes::Schema> {
+        use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+        Arc::new(Schema::new(vec![
+            Field::new("field0", ArrowDataType::Int64, false),
+            Field::new("field1", ArrowDataType::Int64, false),
+            Field::new(
+                "pk",
+                ArrowDataType::Dictionary(
+                    Box::new(ArrowDataType::UInt32),
+                    Box::new(ArrowDataType::Binary),
+                ),
+                false,
+            ),
+            Field::new("ts", ArrowDataType::Int64, false),
+            Field::new("seq", ArrowDataType::Int64, false),
+        ]))
+    }
+
+    /// Helper to create a record batch with a dictionary column at the primary key position.
+    fn make_dict_batch(
+        schema: Arc<datatypes::arrow::datatypes::Schema>,
+        dict_values: &datatypes::arrow::array::BinaryArray,
+        keys: &[u32],
+        int_values: &[i64],
+    ) -> RecordBatch {
+        use datatypes::arrow::array::{Int64Array, UInt32Array};
+
+        let key_array = UInt32Array::from(keys.to_vec());
+        let dict_array: DictionaryArray<UInt32Type> =
+            DictionaryArray::new(key_array, Arc::new(dict_values.clone()));
+        let int_array = Int64Array::from(int_values.to_vec());
+        let zeros = Int64Array::from(vec![0i64; int_values.len()]);
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(zeros.clone()),
+                Arc::new(int_array),
+                Arc::new(dict_array),
+                Arc::new(zeros.clone()),
+                Arc::new(zeros),
+            ],
+        )
+        .unwrap()
+    }
+
+    /// Computes the total `get_array_memory_size()` of all dictionary value arrays in a batch.
+    fn compute_total_dict_values_size(batch: &RecordBatch) -> usize {
+        batch
+            .columns()
+            .iter()
+            .filter_map(|col| {
+                col.as_any()
+                    .downcast_ref::<DictionaryArray<UInt32Type>>()
+                    .map(|dict| dict.values().get_array_memory_size())
+            })
+            .sum()
+    }
+
+    #[test]
+    fn cache_batch_buffer_empty() {
+        let buffer = CacheBatchBuffer::new();
+        assert_eq!(buffer.estimated_batches_size(), 0);
+        assert!(buffer.into_batches().is_empty());
+    }
+
+    #[test]
+    fn cache_batch_buffer_single_batch() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
+        let batch = make_dict_batch(schema, &dict_values, &[0, 1, 2], &[10, 20, 30]);
+
+        let full_size = batch.get_array_memory_size();
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch);
+        assert_eq!(buffer.estimated_batches_size(), full_size);
+        assert_eq!(buffer.into_batches().len(), 1);
+    }
+
+    #[test]
+    fn cache_batch_buffer_shared_dictionary() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]);
+
+        // Two batches sharing the same dictionary values array.
+        let batch1 = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]);
+        let batch2 = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]);
+
+        let batch1_full = batch1.get_array_memory_size();
+        let batch2_full = batch2.get_array_memory_size();
+
+        // The total dictionary values size that should be deduplicated for the second batch.
+        let dict_values_size = compute_total_dict_values_size(&batch2);
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+
+        // Second batch's dict values should not be counted again.
+        assert_eq!(
+            buffer.estimated_batches_size(),
+            batch1_full + batch2_full - dict_values_size
+        );
+        assert_eq!(buffer.into_batches().len(), 2);
+    }
+
+    #[test]
+    fn cache_batch_buffer_non_shared_dictionary() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values1 = BinaryArray::from_vec(vec![b"a", b"b"]);
+        let dict_values2 = BinaryArray::from_vec(vec![b"x", b"y"]);
+
+        let batch1 = make_dict_batch(schema.clone(), &dict_values1, &[0, 1], &[10, 20]);
+        let batch2 = make_dict_batch(schema, &dict_values2, &[0, 1], &[30, 40]);
+
+        let batch1_full = batch1.get_array_memory_size();
+        let batch2_full = batch2.get_array_memory_size();
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+
+        // Different dictionaries: full size for both.
+        assert_eq!(buffer.estimated_batches_size(), batch1_full + batch2_full);
+    }
+
+    #[test]
+    fn cache_batch_buffer_shared_then_diverged() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
+        let different_values = BinaryArray::from_vec(vec![b"x", b"y"]);
+
+        let batch1 = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]);
+        let batch2 = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]);
+        let batch3 = make_dict_batch(schema, &different_values, &[0], &[3]);
+
+        let size1 = batch1.get_array_memory_size();
+        let size2 = batch2.get_array_memory_size();
+        let size3 = batch3.get_array_memory_size();
+
+        let dict_values_size = compute_total_dict_values_size(&batch2);
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+        buffer.push(batch3);
+
+        // batch2 shares dict with batch1 (dedup), batch3 does not (full size).
+        assert_eq!(
+            buffer.estimated_batches_size(),
+            size1 + (size2 - dict_values_size) + size3
+        );
+    }
 }
diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs
index 5cb2d75e25..e7cae7e7b8 100644
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -40,7 +40,7 @@ use store_api::region_engine::{PartitionRange, RegionScannerRef};
 use store_api::storage::{
     ColumnId, RegionId, ScanRequest, SequenceRange, TimeSeriesDistribution, TimeSeriesRowSelector,
 };
-use table::predicate::{Predicate, build_time_range_predicate};
+use table::predicate::{Predicate, build_time_range_predicate, extract_time_range_from_expr};
 use tokio::sync::{Semaphore, mpsc};
 use tokio_stream::wrappers::ReceiverStream;
 
@@ -1420,7 +1420,6 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
 
 /// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible
 /// for partition range caching.
-#[cfg_attr(not(test), allow(dead_code))]
 pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
     let eligible = input.flat_format
         && !input.compaction
@@ -1439,7 +1438,14 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
         .map(|col| col.column_schema.name.as_str())
         .collect();
 
-    let time_index_name = metadata.time_index_column().column_schema.name.clone();
+    let time_index = metadata.time_index_column();
+    let time_index_name = time_index.column_schema.name.clone();
+    let ts_col_unit = time_index
+        .column_schema
+        .data_type
+        .as_timestamp()
+        .expect("Time index must have timestamp-compatible type")
+        .unit();
 
     let exprs = input
         .predicate_group()
@@ -1464,9 +1470,16 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
             _ => false,
         };
 
-        if is_time_only {
+        if is_time_only
+            && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some()
+        {
+            // Range-reducible time predicates can be safely dropped from the
+            // cache key when the query time range covers the partition range.
             time_filters.push(expr.to_string());
         } else {
+            // Non-time filters and non-range time predicates (those that
+            // extract_time_range_from_expr cannot convert to a TimestampRange)
+            // always stay in the cache key.
             filters.push(expr.to_string());
         }
     }
@@ -1511,6 +1524,10 @@ pub struct StreamContext {
     pub input: ScanInput,
     /// Metadata for partition ranges.
     pub(crate) ranges: Vec<RangeMeta>,
+    /// Precomputed scan fingerprint for partition range caching.
+    /// `None` when the scan is not eligible for caching.
+    #[allow(dead_code)]
+    pub(crate) scan_fingerprint: Option<ScanRequestFingerprint>,
 
     // Metrics:
     /// The start time of the query.
@@ -1523,10 +1540,12 @@ impl StreamContext {
         let query_start = input.query_start.unwrap_or_else(Instant::now);
         let ranges = RangeMeta::seq_scan_ranges(&input);
         READ_SST_COUNT.observe(input.num_files() as f64);
+        let scan_fingerprint = build_scan_fingerprint(&input);
 
         Self {
             input,
             ranges,
+            scan_fingerprint,
             query_start,
         }
     }
@@ -1536,10 +1555,12 @@ impl StreamContext {
         let query_start = input.query_start.unwrap_or_else(Instant::now);
         let ranges = RangeMeta::unordered_scan_ranges(&input);
         READ_SST_COUNT.observe(input.num_files() as f64);
+        let scan_fingerprint = build_scan_fingerprint(&input);
 
         Self {
             input,
             ranges,
+            scan_fingerprint,
             query_start,
         }
     }
@@ -1849,6 +1870,7 @@ mod tests {
     use std::sync::Arc;
 
     use datafusion::physical_plan::expressions::lit as physical_lit;
+    use datafusion_common::ScalarValue;
     use datafusion_expr::{col, lit};
     use datatypes::value::Value;
     use partition::expr::col as partition_col;
@@ -2035,13 +2057,18 @@ mod tests {
         assert!(scan_region.use_flat_format());
     }
 
+    /// Helper to create a timestamp millisecond literal.
+    fn ts_lit(val: i64) -> datafusion_expr::Expr {
+        lit(ScalarValue::TimestampMillisecond(Some(val), None))
+    }
+
     #[tokio::test]
     async fn test_build_scan_fingerprint_for_eligible_scan() {
         let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
         let input = new_scan_input(
             metadata.clone(),
             vec![
-                col("ts").gt_eq(lit(1000)),
+                col("ts").gt_eq(ts_lit(1000)),
                 col("k0").eq(lit("foo")),
                 col("v0").gt(lit(1)),
             ],
@@ -2071,7 +2098,7 @@ mod tests {
                 col("k0").eq(lit("foo")).to_string(),
                 col("v0").gt(lit(1)).to_string(),
             ],
-            time_filters: vec![col("ts").gt_eq(lit(1000)).to_string()],
+            time_filters: vec![col("ts").gt_eq(ts_lit(1000)).to_string()],
             series_row_selector: Some(TimeSeriesRowSelector::LastRow),
             append_mode: false,
             filter_deleted: false,
diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs
index 0ee6a4437d..6f68616709 100644
--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -247,6 +247,12 @@ pub(crate) struct ScanMetricsSet {
     num_range_builders: isize,
     /// Peak number of file range builders.
     num_peak_range_builders: isize,
+    /// Total bytes added to the range cache during this scan.
+    range_cache_size: usize,
+    /// Number of range cache hits during this scan.
+    range_cache_hit: usize,
+    /// Number of range cache misses during this scan.
+    range_cache_miss: usize,
 }
 
 /// Wrapper for file metrics that compares by total cost in reverse order.
@@ -345,6 +351,9 @@ impl fmt::Debug for ScanMetricsSet {
             build_ranges_peak_mem_size,
             num_range_builders: _,
             num_peak_range_builders,
+            range_cache_size,
+            range_cache_hit,
+            range_cache_miss,
         } = self;
 
         // Write core metrics
@@ -590,6 +599,16 @@ impl fmt::Debug for ScanMetricsSet {
             write!(f, "}}")?;
         }
 
+        if *range_cache_size > 0 {
+            write!(f, ", \"range_cache_size\":{range_cache_size}")?;
+        }
+        if *range_cache_hit > 0 {
+            write!(f, ", \"range_cache_hit\":{range_cache_hit}")?;
+        }
+        if *range_cache_miss > 0 {
+            write!(f, ", \"range_cache_miss\":{range_cache_miss}")?;
+        }
+
         write!(
             f,
             ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \
@@ -1097,6 +1116,27 @@ impl PartitionMetrics {
     pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
         self.0.clone()
     }
+
+    /// Increments the total bytes added to the range cache.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_size(&self, size: usize) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_size += size;
+    }
+
+    /// Increments the range cache hit counter.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_hit(&self) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_hit += 1;
+    }
+
+    /// Increments the range cache miss counter.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_miss(&self) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_miss += 1;
+    }
 }
 
 impl fmt::Debug for PartitionMetrics {
diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs
index 842689bba6..350195bfa9 100644
--- a/src/mito2/src/test_util.rs
+++ b/src/mito2/src/test_util.rs
@@ -15,6 +15,7 @@
 //! Utilities for testing.
 
 pub mod batch_util;
+pub mod bench_util;
 pub mod memtable_util;
 pub mod scheduler_util;
 pub mod sst_util;
diff --git a/src/mito2/src/test_util/bench_util.rs b/src/mito2/src/test_util/bench_util.rs
new file mode 100644
index 0000000000..8f182e4157
--- /dev/null
+++ b/src/mito2/src/test_util/bench_util.rs
@@ -0,0 +1,259 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared utilities for mito2 benchmarks.
+//!
+//! Provides a TSBS cpu-like data generator ([`CpuDataGenerator`]) and schema
+//! ([`cpu_metadata`]) shared by the mito2 benchmarks.
+
+use api::v1::value::ValueData;
+use api::v1::{Row, Rows, SemanticType};
+use datafusion_common::Column;
+use datafusion_expr::{Expr, lit};
+use datatypes::data_type::ConcreteDataType;
+use datatypes::schema::ColumnSchema;
+use rand::Rng;
+use rand::rngs::ThreadRng;
+use rand::seq::IndexedRandom;
+use store_api::metadata::{
+    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
+};
+use store_api::storage::RegionId;
+use table::predicate::Predicate;
+
+use crate::memtable::KeyValues;
+use crate::test_util::memtable_util::region_metadata_to_row_schema;
+
+pub struct Host {
+    pub hostname: String,
+    pub region: String,
+    pub datacenter: String,
+    pub rack: String,
+    pub os: String,
+    pub arch: String,
+    pub team: String,
+    pub service: String,
+    pub service_version: String,
+    pub service_environment: String,
+}
+
+impl Host {
+    pub fn random_with_id(id: usize) -> Host {
+        let mut rng = rand::rng();
+        let region = format!("ap-southeast-{}", rng.random_range(0..10));
+        let datacenter = format!(
+            "{}{}",
+            region,
+            ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
+        );
+        Host {
+            hostname: format!("host_{id}"),
+            region,
+            datacenter,
+            rack: rng.random_range(0..100).to_string(),
+            os: "Ubuntu16.04LTS".to_string(),
+            arch: "x86".to_string(),
+            team: "CHI".to_string(),
+            service: rng.random_range(0..100).to_string(),
+            service_version: rng.random_range(0..10).to_string(),
+            service_environment: "test".to_string(),
+        }
+    }
+
+    pub fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
+        let tags = [
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.hostname.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.region.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.datacenter.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.rack.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.os.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.arch.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.team.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.service.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.service_version.clone())),
+            },
+            api::v1::Value {
+                value_data: Some(ValueData::StringValue(self.service_environment.clone())),
+            },
+        ];
+        for tag in tags {
+            values.push(tag);
+        }
+    }
+}
+
+pub struct CpuDataGenerator {
+    pub metadata: RegionMetadataRef,
+    column_schemas: Vec<api::v1::ColumnSchema>,
+    hosts: Vec<Host>,
+    start_sec: i64,
+    end_sec: i64,
+}
+
+impl CpuDataGenerator {
+    pub fn new(
+        metadata: RegionMetadataRef,
+        num_hosts: usize,
+        start_sec: i64,
+        end_sec: i64,
+    ) -> Self {
+        let column_schemas = region_metadata_to_row_schema(&metadata);
+        Self {
+            metadata,
+            column_schemas,
+            hosts: Self::generate_hosts(num_hosts),
+            start_sec,
+            end_sec,
+        }
+    }
+
+    pub fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
+        // Emit one batch of data points every 10 seconds.
+        (self.start_sec..self.end_sec)
+            .step_by(10)
+            .enumerate()
+            .map(|(seq, ts)| self.build_key_values(seq, ts))
+    }
+
+    pub fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
+        let rows = self
+            .hosts
+            .iter()
+            .map(|host| {
+                let mut rng = rand::rng();
+                let mut values = Vec::with_capacity(21);
+                values.push(api::v1::Value {
+                    value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
+                });
+                host.fill_values(&mut values);
+                for _ in 0..10 {
+                    values.push(api::v1::Value {
+                        value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
+                    });
+                }
+                Row { values }
+            })
+            .collect();
+        let mutation = api::v1::Mutation {
+            op_type: api::v1::OpType::Put as i32,
+            sequence: seq as u64,
+            rows: Some(Rows {
+                schema: self.column_schemas.clone(),
+                rows,
+            }),
+            write_hint: None,
+        };
+
+        KeyValues::new(&self.metadata, mutation).unwrap()
+    }
+
+    pub fn random_host_filter(&self) -> Predicate {
+        let host = self.random_hostname();
+        let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
+        Predicate::new(vec![expr])
+    }
+
+    pub fn random_host_filter_exprs(&self) -> Vec<Expr> {
+        let host = self.random_hostname();
+        vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))]
+    }
+
+    pub fn random_hostname(&self) -> String {
+        let mut rng = rand::rng();
+        self.hosts.choose(&mut rng).unwrap().hostname.clone()
+    }
+
+    pub fn random_f64(rng: &mut ThreadRng) -> f64 {
+        let base: u32 = rng.random_range(30..95);
+        base as f64
+    }
+
+    pub fn generate_hosts(num_hosts: usize) -> Vec<Host> {
+        (0..num_hosts).map(Host::random_with_id).collect()
+    }
+}
+
+/// Creates the region metadata for a TSBS cpu-like table.
+pub fn cpu_metadata() -> RegionMetadata {
+    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
+    builder.push_column_metadata(ColumnMetadata {
+        column_schema: ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        ),
+        semantic_type: SemanticType::Timestamp,
+        column_id: 0,
+    });
+    let mut column_id = 1;
+    let tags = [
+        "hostname",
+        "region",
+        "datacenter",
+        "rack",
+        "os",
+        "arch",
+        "team",
+        "service",
+        "service_version",
+        "service_environment",
+    ];
+    for tag in tags {
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
+            semantic_type: SemanticType::Tag,
+            column_id,
+        });
+        column_id += 1;
+    }
+    let fields = [
+        "usage_user",
+        "usage_system",
+        "usage_idle",
+        "usage_nice",
+        "usage_iowait",
+        "usage_irq",
+        "usage_softirq",
+        "usage_steal",
+        "usage_guest",
+        "usage_guest_nice",
+    ];
+    for field in fields {
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
+            semantic_type: SemanticType::Field,
+            column_id,
+        });
+        column_id += 1;
+    }
+    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+    builder.build().unwrap()
+}
diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs
index 8917875250..25ab9bb8b4 100644
--- a/src/mito2/src/test_util/memtable_util.rs
+++ b/src/mito2/src/test_util/memtable_util.rs
@@ -30,7 +30,7 @@ use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortFi
 use store_api::metadata::{
     ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
 };
-use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange};
+use store_api::storage::{ColumnId, RegionId, SequenceNumber};
 
 use crate::error::Result;
 use crate::memtable::bulk::part::BulkPart;
diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs
index 71896b3d5d..fd5ad82f3f 100644
--- a/src/mito2/src/worker.rs
+++ b/src/mito2/src/worker.rs
@@ -207,6 +207,7 @@ impl WorkerGroup {
                 .vector_cache_size(config.vector_cache_size.as_bytes())
                 .page_cache_size(config.page_cache_size.as_bytes())
                 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                 .index_metadata_size(config.index.metadata_cache_size.as_bytes())
                 .index_content_size(config.index.content_cache_size.as_bytes())
                 .index_content_page_size(config.index.content_cache_page_size.as_bytes())
@@ -421,6 +422,7 @@ impl WorkerGroup {
                 .vector_cache_size(config.vector_cache_size.as_bytes())
                 .page_cache_size(config.page_cache_size.as_bytes())
                 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                 .write_cache(write_cache)
                 .build(),
         );
diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs
index f9be7be16e..2c9ac41560 100644
--- a/src/table/src/predicate.rs
+++ b/src/table/src/predicate.rs
@@ -203,7 +203,7 @@ pub fn build_time_range_predicate(
 
 /// Extract time range filter from `WHERE`/`IN (...)`/`BETWEEN` clauses.
 /// Return None if no time range can be found in expr.
-fn extract_time_range_from_expr(
+pub fn extract_time_range_from_expr(
     ts_col_name: &str,
     ts_col_unit: TimeUnit,
     expr: &Expr,
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index 65e56fa15e..7ae59ae9fc 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -1642,6 +1642,7 @@ fn drop_lines_with_inconsistent_results(input: String) -> String {
         "metadata_cache_size =",
         "content_cache_size =",
         "result_cache_size =",
+        "range_result_cache_size =",
         "name =",
         "recovery_parallelism =",
         "max_background_index_builds =",

From c8c2e09eedd5a2f42acd599d76d4301e29abae53 Mon Sep 17 00:00:00 2001
From: shuiyisong <113876041+shuiyisong@users.noreply.github.com>
Date: Tue, 24 Mar 2026 18:21:31 +0800
Subject: [PATCH 37/42] refactor: move election trait and implementations to
 the `common-meta` crate (#7820)

* refactor: move election impl to common-meta

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

* fix: adding back comment

Signed-off-by: shuiyisong <xixing.sys@gmail.com>

---------

Signed-off-by: shuiyisong <xixing.sys@gmail.com>
---
 src/{meta-srv => common/meta}/src/election.rs | 76 ++++++++++++++++++-
 .../meta}/src/election/etcd.rs                | 14 ++--
 .../meta}/src/election/rds.rs                 |  4 +-
 .../meta}/src/election/rds/mysql.rs           | 42 +++++-----
 .../meta}/src/election/rds/postgres.rs        | 45 ++++++-----
 src/common/meta/src/error.rs                  | 70 +++++++++++++++--
 src/common/meta/src/lib.rs                    |  1 +
 src/meta-srv/src/bootstrap.rs                 | 21 ++---
 src/meta-srv/src/cluster.rs                   |  4 +-
 src/meta-srv/src/lib.rs                       |  1 -
 src/meta-srv/src/metasrv.rs                   | 74 +-----------------
 src/meta-srv/src/service/admin/leader.rs      |  2 +-
 src/meta-srv/src/service/cluster.rs           |  5 +-
 src/meta-srv/src/service/heartbeat.rs         |  4 +-
 14 files changed, 218 insertions(+), 145 deletions(-)
 rename src/{meta-srv => common/meta}/src/election.rs (67%)
 rename src/{meta-srv => common/meta}/src/election/etcd.rs (94%)
 rename src/{meta-srv => common/meta}/src/election/rds.rs (96%)
 rename src/{meta-srv => common/meta}/src/election/rds/mysql.rs (97%)
 rename src/{meta-srv => common/meta}/src/election/rds/postgres.rs (97%)

diff --git a/src/meta-srv/src/election.rs b/src/common/meta/src/election.rs
similarity index 67%
rename from src/meta-srv/src/election.rs
rename to src/common/meta/src/election.rs
index 2d2826b286..12173beda8 100644
--- a/src/meta-srv/src/election.rs
+++ b/src/common/meta/src/election.rs
@@ -21,15 +21,85 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 
 use common_telemetry::{error, info, warn};
+use serde::{Deserialize, Serialize};
 use tokio::sync::broadcast::error::RecvError;
 use tokio::sync::broadcast::{self, Receiver, Sender};
 
 use crate::error::Result;
-use crate::metasrv::MetasrvNodeInfo;
 
-pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600;
+pub const CANDIDATE_LEASE_SECS: u64 = 600;
 const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2;
 
+/// The value of the leader. It is used to store the leader's address.
+pub struct LeaderValue(pub String);
+
+impl<T: AsRef<[u8]>> From<T> for LeaderValue {
+    fn from(value: T) -> Self {
+        let string = String::from_utf8_lossy(value.as_ref());
+        Self(string.to_string())
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MetasrvNodeInfo {
+    // The metasrv's address
+    pub addr: String,
+    // The node build version
+    pub version: String,
+    // The node build git commit hash
+    pub git_commit: String,
+    // The node start timestamp in milliseconds
+    pub start_time_ms: u64,
+    // The node total cpu millicores
+    #[serde(default)]
+    pub total_cpu_millicores: i64,
+    // The node total memory bytes
+    #[serde(default)]
+    pub total_memory_bytes: i64,
+    /// The node cpu usage in millicores
+    #[serde(default)]
+    pub cpu_usage_millicores: i64,
+    /// The node memory usage in bytes
+    #[serde(default)]
+    pub memory_usage_bytes: i64,
+    // The node hostname
+    #[serde(default)]
+    pub hostname: String,
+}
+
+// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
+#[allow(deprecated)]
+impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
+    fn from(node_info: MetasrvNodeInfo) -> Self {
+        Self {
+            peer: Some(api::v1::meta::Peer {
+                addr: node_info.addr,
+                ..Default::default()
+            }),
+            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
+            // New code should use the fields in `info.NodeInfo` instead.
+            version: node_info.version.clone(),
+            git_commit: node_info.git_commit.clone(),
+            start_time_ms: node_info.start_time_ms,
+            cpus: node_info.total_cpu_millicores as u32,
+            memory_bytes: node_info.total_memory_bytes as u64,
+            // The canonical location for node information.
+            info: Some(api::v1::meta::NodeInfo {
+                version: node_info.version,
+                git_commit: node_info.git_commit,
+                start_time_ms: node_info.start_time_ms,
+                total_cpu_millicores: node_info.total_cpu_millicores,
+                total_memory_bytes: node_info.total_memory_bytes,
+                cpu_usage_millicores: node_info.cpu_usage_millicores,
+                memory_usage_bytes: node_info.memory_usage_bytes,
+                cpus: node_info.total_cpu_millicores as u32,
+                memory_bytes: node_info.total_memory_bytes as u64,
+                hostname: node_info.hostname,
+            }),
+        }
+    }
+}
+
 /// Messages sent when the leader changes.
 #[derive(Debug, Clone)]
 pub enum LeaderChangeMessage {
@@ -168,3 +238,5 @@ pub trait Election: Send + Sync {
 
     fn subscribe_leader_change(&self) -> Receiver<LeaderChangeMessage>;
 }
+
+pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
diff --git a/src/meta-srv/src/election/etcd.rs b/src/common/meta/src/election/etcd.rs
similarity index 94%
rename from src/meta-srv/src/election/etcd.rs
rename to src/common/meta/src/election/etcd.rs
index 883f723d74..affad31ef4 100644
--- a/src/meta-srv/src/election/etcd.rs
+++ b/src/common/meta/src/election/etcd.rs
@@ -16,8 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 
-use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use etcd_client::{
     Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions,
@@ -27,13 +25,15 @@ use tokio::sync::broadcast;
 use tokio::sync::broadcast::Receiver;
 use tokio::time::{MissedTickBehavior, timeout};
 
+use crate::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
 use crate::election::{
-    CANDIDATE_LEASE_SECS, Election, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, LeaderKey,
-    listen_leader_change, send_leader_change_and_set_flags,
+    CANDIDATE_LEASE_SECS, Election, ElectionRef, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage,
+    LeaderKey, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error;
 use crate::error::Result;
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
 
 impl LeaderKey for EtcdLeaderKey {
     fn name(&self) -> &[u8] {
@@ -253,7 +253,7 @@ impl Election for EtcdElection {
                 .leader(self.election_key())
                 .await
                 .context(error::EtcdFailedSnafu)?;
-            let leader_value = res.kv().context(error::NoLeaderSnafu)?.value();
+            let leader_value = res.kv().context(error::ElectionNoLeaderSnafu)?.value();
             Ok(leader_value.into())
         }
     }
@@ -279,7 +279,7 @@ impl EtcdElection {
             ensure!(
                 res.ttl() > 0,
                 error::UnexpectedSnafu {
-                    violated: "Failed to refresh the lease",
+                    err_msg: "Failed to refresh the lease".to_string(),
                 }
             );
 
diff --git a/src/meta-srv/src/election/rds.rs b/src/common/meta/src/election/rds.rs
similarity index 96%
rename from src/meta-srv/src/election/rds.rs
rename to src/common/meta/src/election/rds.rs
index 16e113415a..6ee529ee02 100644
--- a/src/meta-srv/src/election/rds.rs
+++ b/src/common/meta/src/election/rds.rs
@@ -36,7 +36,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
             .split(LEASE_SEP)
             .collect_tuple()
             .with_context(|| UnexpectedSnafu {
-                violated: format!(
+                err_msg: format!(
                     "Invalid value {}, expect node info || {} || expire time",
                     value, LEASE_SEP
                 ),
@@ -45,7 +45,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
     let expire_time = match Timestamp::from_str(expire_time, None) {
         Ok(ts) => ts,
         Err(_) => UnexpectedSnafu {
-            violated: format!("Invalid timestamp: {}", expire_time),
+            err_msg: format!("Invalid timestamp: {}", expire_time),
         }
         .fail()?,
     };
diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/common/meta/src/election/rds/mysql.rs
similarity index 97%
rename from src/meta-srv/src/election/rds/mysql.rs
rename to src/common/meta/src/election/rds/mysql.rs
index 20051a2610..80f3d8ca7c 100644
--- a/src/meta-srv/src/election/rds/mysql.rs
+++ b/src/common/meta/src/election/rds/mysql.rs
@@ -16,7 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use common_time::Timestamp;
 use snafu::{OptionExt, ResultExt, ensure};
@@ -29,14 +28,15 @@ use tokio::time::MissedTickBehavior;
 
 use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
 use crate::election::{
-    Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
+    Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error::{
     AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu,
-    LeaderLeaseChangedSnafu, LeaderLeaseExpiredSnafu, MySqlExecutionSnafu, NoLeaderSnafu, Result,
-    SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
+    ElectionLeaderLeaseChangedSnafu, ElectionLeaderLeaseExpiredSnafu, ElectionNoLeaderSnafu,
+    MySqlExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
 };
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
 
 struct ElectionSqlFactory<'a> {
     table_name: &'a str,
@@ -592,7 +592,7 @@ impl Election for MySqlElection {
             ensure!(
                 lease.expire_time > lease.current,
                 UnexpectedSnafu {
-                    violated: format!(
+                    err_msg: format!(
                         "Candidate lease expired at {:?} (current time: {:?}), key: {:?}",
                         lease.expire_time,
                         lease.current,
@@ -667,10 +667,10 @@ impl Election for MySqlElection {
             let client = self.client.lock().await;
             let mut executor = Executor::Default(client);
             if let Some(lease) = self.get_value_with_lease(&key, &mut executor).await? {
-                ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+                ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
                 Ok(lease.leader_value.as_bytes().into())
             } else {
-                NoLeaderSnafu.fail()
+                ElectionNoLeaderSnafu.fail()
             }
         }
     }
@@ -705,7 +705,7 @@ impl MySqlElection {
         let current_time = match Timestamp::from_str(&current_time_str, None) {
             Ok(ts) => ts,
             Err(_) => UnexpectedSnafu {
-                violated: format!("Invalid timestamp: {}", current_time_str),
+                err_msg: format!("Invalid timestamp: {}", current_time_str),
             }
             .fail()?,
         };
@@ -740,7 +740,7 @@ impl MySqlElection {
             current = match Timestamp::from_str(current_time_str, None) {
                 Ok(ts) => ts,
                 Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                 }
                 .fail()?,
             };
@@ -777,7 +777,7 @@ impl MySqlElection {
         ensure!(
             res == 1,
             UnexpectedSnafu {
-                violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
+                err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
             }
         );
 
@@ -920,9 +920,12 @@ impl MySqlElection {
     ///   will be released.
     /// - **Case 2**: If all checks pass, the function returns without performing any actions.
     fn lease_check(&self, lease: &Option<Lease>) -> Result<Lease> {
-        let lease = lease.as_ref().context(NoLeaderSnafu)?;
+        let lease = lease.as_ref().context(ElectionNoLeaderSnafu)?;
         // Case 1: Lease expired
-        ensure!(lease.expire_time > lease.current, LeaderLeaseExpiredSnafu);
+        ensure!(
+            lease.expire_time > lease.current,
+            ElectionLeaderLeaseExpiredSnafu
+        );
         // Case 2: Everything is fine
         Ok(lease.clone())
     }
@@ -960,7 +963,7 @@ impl MySqlElection {
         let remote_lease = self.get_value_with_lease(&key, &mut executor).await?;
         ensure!(
             expected_lease.map(|lease| lease.origin) == remote_lease.map(|lease| lease.origin),
-            LeaderLeaseChangedSnafu
+            ElectionLeaderLeaseChangedSnafu
         );
         self.delete_value(&key, &mut executor).await?;
         self.put_value_with_lease(
@@ -987,12 +990,11 @@ mod tests {
     use std::assert_matches::assert_matches;
     use std::env;
 
-    use common_meta::maybe_skip_mysql_integration_test;
     use common_telemetry::init_default_ut_logging;
+    use sqlx::MySqlPool;
 
     use super::*;
-    use crate::error;
-    use crate::utils::mysql::create_mysql_pool;
+    use crate::{error, maybe_skip_mysql_integration_test};
 
     async fn create_mysql_client(
         table_name: Option<&str>,
@@ -1003,11 +1005,11 @@ mod tests {
         let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default();
         if endpoint.is_empty() {
             return UnexpectedSnafu {
-                violated: "MySQL endpoint is empty".to_string(),
+                err_msg: "MySQL endpoint is empty".to_string(),
             }
             .fail();
         }
-        let pool = create_mysql_pool(&[endpoint], None).await.unwrap();
+        let pool = MySqlPool::connect(&endpoint).await.unwrap();
         let mut client = ElectionMysqlClient::new(
             pool,
             execution_timeout,
@@ -1302,7 +1304,7 @@ mod tests {
         let err = elected(&leader_mysql_election, table_name, Some(incorrect_lease))
             .await
             .unwrap_err();
-        assert_matches!(err, error::Error::LeaderLeaseChanged { .. });
+        assert_matches!(err, error::Error::ElectionLeaderLeaseChanged { .. });
         let lease = get_lease(&leader_mysql_election).await;
         assert!(lease.is_none());
         drop_table(&leader_mysql_election.client, table_name).await;
diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/common/meta/src/election/rds/postgres.rs
similarity index 97%
rename from src/meta-srv/src/election/rds/postgres.rs
rename to src/common/meta/src/election/rds/postgres.rs
index c21efd780b..01910335a0 100644
--- a/src/meta-srv/src/election/rds/postgres.rs
+++ b/src/common/meta/src/election/rds/postgres.rs
@@ -16,7 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use common_time::Timestamp;
 use deadpool_postgres::{Manager, Pool};
@@ -28,13 +27,15 @@ use tokio_postgres::types::ToSql;
 
 use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
 use crate::election::{
-    Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
+    Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error::{
-    DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu,
-    Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
+    DeserializeFromJsonSnafu, ElectionNoLeaderSnafu, GetPostgresClientSnafu,
+    PostgresExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu,
+    UnexpectedSnafu,
 };
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
 
 struct ElectionSqlFactory<'a> {
     lock_id: u64,
@@ -404,13 +405,13 @@ impl Election for PgElection {
                 .get_value_with_lease(&key)
                 .await?
                 .context(UnexpectedSnafu {
-                    violated: format!("Failed to get lease for key: {:?}", key),
+                    err_msg: format!("Failed to get lease for key: {:?}", key),
                 })?;
 
             ensure!(
                 lease.expire_time > lease.current,
                 UnexpectedSnafu {
-                    violated: format!(
+                    err_msg: format!(
                         "Candidate lease expired at {:?} (current time {:?}), key: {:?}",
                         lease.expire_time, lease.current, key
                     ),
@@ -464,11 +465,11 @@ impl Election for PgElection {
                 .query(&self.sql_set.campaign, &[])
                 .await?;
             let row = res.first().context(UnexpectedSnafu {
-                violated: "Failed to get the result of acquiring advisory lock",
+                err_msg: "Failed to get the result of acquiring advisory lock".to_string(),
             })?;
             let is_leader = row.try_get(0).map_err(|_| {
                 UnexpectedSnafu {
-                    violated: "Failed to get the result of get lock",
+                    err_msg: "Failed to get the result of get lock".to_string(),
                 }
                 .build()
             })?;
@@ -500,10 +501,10 @@ impl Election for PgElection {
         } else {
             let key = self.election_key();
             if let Some(lease) = self.get_value_with_lease(&key).await? {
-                ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+                ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
                 Ok(lease.leader_value.as_bytes().into())
             } else {
-                NoLeaderSnafu.fail()
+                ElectionNoLeaderSnafu.fail()
             }
         }
     }
@@ -537,7 +538,7 @@ impl PgElection {
             let current_time = match Timestamp::from_str(current_time_str, None) {
                 Ok(ts) => ts,
                 Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                 }
                 .fail()?,
             };
@@ -576,7 +577,7 @@ impl PgElection {
             current = match Timestamp::from_str(current_time_str, None) {
                 Ok(ts) => ts,
                 Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                 }
                 .fail()?,
             };
@@ -613,7 +614,7 @@ impl PgElection {
         ensure!(
             res == 1,
             UnexpectedSnafu {
-                violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
+                err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
             }
         );
 
@@ -742,9 +743,9 @@ impl PgElection {
         let lease = self
             .get_value_with_lease(&key)
             .await?
-            .context(NoLeaderSnafu)?;
+            .context(ElectionNoLeaderSnafu)?;
         // Case 2
-        ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+        ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
         // Case 3
         Ok(())
     }
@@ -831,11 +832,11 @@ mod tests {
     use std::assert_matches::assert_matches;
     use std::env;
 
-    use common_meta::maybe_skip_postgres_integration_test;
+    use deadpool_postgres::{Config, Runtime};
+    use tokio_postgres::NoTls;
 
     use super::*;
-    use crate::error;
-    use crate::utils::postgres::create_postgres_pool;
+    use crate::{error, maybe_skip_postgres_integration_test};
 
     async fn create_postgres_client(
         table_name: Option<&str>,
@@ -846,11 +847,13 @@ mod tests {
         let endpoint = env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default();
         if endpoint.is_empty() {
             return UnexpectedSnafu {
-                violated: "Postgres endpoint is empty".to_string(),
+                err_msg: "Postgres endpoint is empty".to_string(),
             }
             .fail();
         }
-        let pool = create_postgres_pool(&[endpoint], None, None).await.unwrap();
+        let mut cfg = Config::new();
+        cfg.url = Some(endpoint);
+        let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).unwrap();
         let mut pg_client = ElectionPgClient::new(
             pool,
             execution_timeout,
diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs
index b9fcbd6188..05b5af393b 100644
--- a/src/common/meta/src/error.rs
+++ b/src/common/meta/src/error.rs
@@ -338,6 +338,24 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Metasrv election has no leader at this moment"))]
+    ElectionNoLeader {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Metasrv election leader lease expired"))]
+    ElectionLeaderLeaseExpired {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Metasrv election leader lease changed during election"))]
+    ElectionLeaderLeaseChanged {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display("Table already exists, table: {}", table_name))]
     TableAlreadyExists {
         table_name: String,
@@ -751,6 +769,15 @@ pub enum Error {
         location: Location,
     },
 
+    #[cfg(feature = "pg_kvbackend")]
+    #[snafu(display("Failed to get Postgres client"))]
+    GetPostgresClient {
+        #[snafu(source)]
+        error: deadpool::managed::PoolError<tokio_postgres::Error>,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[cfg(feature = "pg_kvbackend")]
     #[snafu(display("Failed to {} Postgres transaction", operation))]
     PostgresTransaction {
@@ -805,6 +832,24 @@ pub enum Error {
         location: Location,
     },
 
+    #[cfg(feature = "mysql_kvbackend")]
+    #[snafu(display("Failed to decode sql value"))]
+    DecodeSqlValue {
+        #[snafu(source)]
+        error: sqlx::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[cfg(feature = "mysql_kvbackend")]
+    #[snafu(display("Failed to acquire mysql client from pool"))]
+    AcquireMySqlClient {
+        #[snafu(source)]
+        error: sqlx::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[cfg(feature = "mysql_kvbackend")]
     #[snafu(display("Failed to {} MySql transaction", operation))]
     MySqlTransaction {
@@ -822,6 +867,15 @@ pub enum Error {
         location: Location,
     },
 
+    #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
+    #[snafu(display("Sql execution timeout, sql: {}, duration: {:?}", sql, duration))]
+    SqlExecutionTimeout {
+        sql: String,
+        duration: std::time::Duration,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display(
         "Datanode table info not found, table id: {}, datanode id: {}",
         table_id,
@@ -1075,7 +1129,10 @@ impl ErrorExt for Error {
             | GetCache { .. }
             | GetLatestCacheRetryExceeded { .. }
             | SerializeToJson { .. }
-            | DeserializeFromJson { .. } => StatusCode::Internal,
+            | DeserializeFromJson { .. }
+            | ElectionNoLeader { .. }
+            | ElectionLeaderLeaseExpired { .. }
+            | ElectionLeaderLeaseChanged { .. } => StatusCode::Internal,
 
             NoLeader { .. } => StatusCode::TableUnavailable,
             ValueNotExist { .. }
@@ -1198,15 +1255,18 @@ impl ErrorExt for Error {
             PostgresExecution { .. }
             | CreatePostgresPool { .. }
             | GetPostgresConnection { .. }
+            | GetPostgresClient { .. }
             | PostgresTransaction { .. }
             | PostgresTlsConfig { .. }
             | InvalidTlsConfig { .. } => StatusCode::Internal,
             #[cfg(feature = "mysql_kvbackend")]
-            MySqlExecution { .. } | CreateMySqlPool { .. } | MySqlTransaction { .. } => {
-                StatusCode::Internal
-            }
+            MySqlExecution { .. }
+            | CreateMySqlPool { .. }
+            | DecodeSqlValue { .. }
+            | AcquireMySqlClient { .. }
+            | MySqlTransaction { .. } => StatusCode::Internal,
             #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
-            RdsTransactionRetryFailed { .. } => StatusCode::Internal,
+            RdsTransactionRetryFailed { .. } | SqlExecutionTimeout { .. } => StatusCode::Internal,
             DatanodeTableInfoNotFound { .. } => StatusCode::Internal,
         }
     }
diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs
index 93cd229b16..36aae1026e 100644
--- a/src/common/meta/src/lib.rs
+++ b/src/common/meta/src/lib.rs
@@ -22,6 +22,7 @@ pub mod datanode;
 pub mod ddl;
 pub mod ddl_manager;
 pub mod distributed_time_constants;
+pub mod election;
 pub mod error;
 pub mod flow_name;
 pub mod heartbeat;
diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs
index 2cfe7d2f7d..eadb7cdc75 100644
--- a/src/meta-srv/src/bootstrap.rs
+++ b/src/meta-srv/src/bootstrap.rs
@@ -24,6 +24,8 @@ use common_base::Plugins;
 use common_config::Configurable;
 #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
 use common_meta::distributed_time_constants::META_LEASE_SECS;
+use common_meta::election::CANDIDATE_LEASE_SECS;
+use common_meta::election::etcd::EtcdElection;
 use common_meta::kv_backend::chroot::ChrootKvBackend;
 use common_meta::kv_backend::etcd::EtcdStore;
 use common_meta::kv_backend::memory::MemoryKvBackend;
@@ -42,9 +44,6 @@ use tonic::codec::CompressionEncoding;
 use tonic::transport::server::{Router, TcpIncoming};
 
 use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef};
-#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
-use crate::election::CANDIDATE_LEASE_SECS;
-use crate::election::etcd::EtcdElection;
 use crate::error::OtherSnafu;
 use crate::metasrv::builder::MetasrvBuilder;
 use crate::metasrv::{
@@ -281,7 +280,8 @@ pub async fn metasrv_builder(
                 etcd_client,
                 opts.store_key_prefix.clone(),
             )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
 
             (kv_backend, Some(election))
         }
@@ -290,10 +290,10 @@ pub async fn metasrv_builder(
             use std::time::Duration;
 
             use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS;
+            use common_meta::election::rds::postgres::{ElectionPgClient, PgElection};
             use common_meta::kv_backend::rds::PgStore;
             use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod};
 
-            use crate::election::rds::postgres::{ElectionPgClient, PgElection};
             use crate::utils::postgres::create_postgres_pool;
 
             let candidate_lease_ttl = Duration::from_secs(CANDIDATE_LEASE_SECS);
@@ -321,7 +321,8 @@ pub async fn metasrv_builder(
                 execution_timeout,
                 idle_session_timeout,
                 statement_timeout,
-            )?;
+            )
+            .context(error::KvBackendSnafu)?;
             let election = PgElection::with_pg_client(
                 opts.grpc.server_addr.clone(),
                 election_client,
@@ -332,7 +333,8 @@ pub async fn metasrv_builder(
                 &opts.meta_table_name,
                 opts.meta_election_lock_id,
             )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
 
             let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
                 .await?;
@@ -352,9 +354,9 @@ pub async fn metasrv_builder(
         (None, BackendImpl::MysqlStore) => {
             use std::time::Duration;
 
+            use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
             use common_meta::kv_backend::rds::MySqlStore;
 
-            use crate::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
             use crate::utils::mysql::create_mysql_pool;
 
             let pool = create_mysql_pool(&opts.store_addrs, opts.backend_tls.as_ref()).await?;
@@ -389,7 +391,8 @@ pub async fn metasrv_builder(
                 meta_lease_ttl,
                 &election_table_name,
             )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
             (kv_backend, Some(election))
         }
     };
diff --git a/src/meta-srv/src/cluster.rs b/src/meta-srv/src/cluster.rs
index 35b15b3b29..ef3ba07702 100644
--- a/src/meta-srv/src/cluster.rs
+++ b/src/meta-srv/src/cluster.rs
@@ -247,7 +247,7 @@ impl MetaPeerClient {
         // Safety: when self.is_leader() == false, election must not empty.
         let election = self.election.as_ref().unwrap();
 
-        let leader_addr = election.leader().await?.0;
+        let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
 
         let channel = self
             .channel_manager
@@ -279,7 +279,7 @@ impl MetaPeerClient {
         // Safety: when self.is_leader() == false, election must not empty.
         let election = self.election.as_ref().unwrap();
 
-        let leader_addr = election.leader().await?.0;
+        let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
 
         let channel = self
             .channel_manager
diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs
index c67bc32b40..0e87d4421a 100644
--- a/src/meta-srv/src/lib.rs
+++ b/src/meta-srv/src/lib.rs
@@ -21,7 +21,6 @@ pub mod bootstrap;
 pub mod cache_invalidator;
 pub mod cluster;
 pub mod discovery;
-pub mod election;
 pub mod error;
 pub mod events;
 mod failure_detector;
diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs
index 165efd0555..a1515d897e 100644
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -32,6 +32,8 @@ use common_meta::ddl_manager::DdlManagerRef;
 use common_meta::distributed_time_constants::{
     self, BASE_HEARTBEAT_INTERVAL, default_distributed_time_constants, frontend_heartbeat_interval,
 };
+use common_meta::election::LeaderChangeMessage;
+pub use common_meta::election::{ElectionRef, MetasrvNodeInfo};
 use common_meta::key::TableMetadataManagerRef;
 use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
 use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
@@ -64,7 +66,6 @@ use tokio::sync::broadcast::error::RecvError;
 
 use crate::cluster::MetaPeerClientRef;
 use crate::discovery;
-use crate::election::{Election, LeaderChangeMessage};
 use crate::error::{
     self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu,
     StartTelemetryTaskSnafu, StopProcedureManagerSnafu,
@@ -459,76 +460,6 @@ impl Context {
     }
 }
 
-/// The value of the leader. It is used to store the leader's address.
-pub struct LeaderValue(pub String);
-
-impl<T: AsRef<[u8]>> From<T> for LeaderValue {
-    fn from(value: T) -> Self {
-        let string = String::from_utf8_lossy(value.as_ref());
-        Self(string.to_string())
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct MetasrvNodeInfo {
-    // The metasrv's address
-    pub addr: String,
-    // The node build version
-    pub version: String,
-    // The node build git commit hash
-    pub git_commit: String,
-    // The node start timestamp in milliseconds
-    pub start_time_ms: u64,
-    // The node total cpu millicores
-    #[serde(default)]
-    pub total_cpu_millicores: i64,
-    // The node total memory bytes
-    #[serde(default)]
-    pub total_memory_bytes: i64,
-    /// The node build cpu usage millicores
-    #[serde(default)]
-    pub cpu_usage_millicores: i64,
-    /// The node build memory usage bytes
-    #[serde(default)]
-    pub memory_usage_bytes: i64,
-    // The node hostname
-    #[serde(default)]
-    pub hostname: String,
-}
-
-// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
-#[allow(deprecated)]
-impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
-    fn from(node_info: MetasrvNodeInfo) -> Self {
-        Self {
-            peer: Some(api::v1::meta::Peer {
-                addr: node_info.addr,
-                ..Default::default()
-            }),
-            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
-            // New code should use the fields in `info.NodeInfo` instead.
-            version: node_info.version.clone(),
-            git_commit: node_info.git_commit.clone(),
-            start_time_ms: node_info.start_time_ms,
-            cpus: node_info.total_cpu_millicores as u32,
-            memory_bytes: node_info.total_memory_bytes as u64,
-            // The canonical location for node information.
-            info: Some(api::v1::meta::NodeInfo {
-                version: node_info.version,
-                git_commit: node_info.git_commit,
-                start_time_ms: node_info.start_time_ms,
-                total_cpu_millicores: node_info.total_cpu_millicores,
-                total_memory_bytes: node_info.total_memory_bytes,
-                cpu_usage_millicores: node_info.cpu_usage_millicores,
-                memory_usage_bytes: node_info.memory_usage_bytes,
-                cpus: node_info.total_cpu_millicores as u32,
-                memory_bytes: node_info.total_memory_bytes as u64,
-                hostname: node_info.hostname,
-            }),
-        }
-    }
-}
-
 #[derive(Clone, Copy)]
 pub enum SelectTarget {
     Datanode,
@@ -552,7 +483,6 @@ pub struct SelectorContext {
 pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
 pub type RegionStatAwareSelectorRef =
     Arc<dyn RegionStatAwareSelector<Context = SelectorContext, Output = Vec<(RegionId, Peer)>>>;
-pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
 
 pub struct MetaStateHandler {
     subscribe_manager: Option<SubscriptionManagerRef>,
diff --git a/src/meta-srv/src/service/admin/leader.rs b/src/meta-srv/src/service/admin/leader.rs
index 1fadb4a3ef..17329e7b47 100644
--- a/src/meta-srv/src/service/admin/leader.rs
+++ b/src/meta-srv/src/service/admin/leader.rs
@@ -32,7 +32,7 @@ pub struct LeaderHandler {
 impl LeaderHandler {
     async fn get_leader(&self) -> Result<Option<String>> {
         if let Some(election) = &self.election {
-            let leader_addr = election.leader().await?.0;
+            let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
             return Ok(Some(leader_addr));
         }
         Ok(None)
diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs
index 5c0ae4c71f..366a8aa5fb 100644
--- a/src/meta-srv/src/service/cluster.rs
+++ b/src/meta-srv/src/service/cluster.rs
@@ -63,7 +63,10 @@ impl cluster_server::Cluster for Metasrv {
         let leader_addr = &self.options().grpc.server_addr;
         let (leader, followers) = match self.election() {
             Some(election) => {
-                let nodes = election.all_candidates().await?;
+                let nodes = election
+                    .all_candidates()
+                    .await
+                    .context(error::KvBackendSnafu)?;
                 let followers = nodes
                     .into_iter()
                     .filter(|node_info| &node_info.addr != leader_addr)
diff --git a/src/meta-srv/src/service/heartbeat.rs b/src/meta-srv/src/service/heartbeat.rs
index e09073546a..238ed99df2 100644
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -23,7 +23,7 @@ use api::v1::meta::{
 use common_telemetry::{debug, error, info, warn};
 use futures::StreamExt;
 use once_cell::sync::OnceCell;
-use snafu::OptionExt;
+use snafu::{OptionExt, ResultExt};
 use tokio::sync::mpsc;
 use tokio::sync::mpsc::Sender;
 use tokio_stream::wrappers::ReceiverStream;
@@ -148,7 +148,7 @@ async fn handle_ask_leader(_req: AskLeaderRequest, ctx: Context) -> Result<AskLe
             if election.is_leader() {
                 ctx.server_addr
             } else {
-                election.leader().await?.0
+                election.leader().await.context(error::KvBackendSnafu)?.0
             }
         }
         None => ctx.server_addr,

From 13cdfa9b59a2d2b7a1f166c3993f50597807a368 Mon Sep 17 00:00:00 2001
From: Ning Sun <sunng@protonmail.com>
Date: Tue, 24 Mar 2026 20:16:38 +0800
Subject: [PATCH 38/42] fix: update 8-bit int to smallint in postgres (#7854)

---
 src/servers/src/postgres/types.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs
index a95890e78c..d4d15ef64a 100644
--- a/src/servers/src/postgres/types.rs
+++ b/src/servers/src/postgres/types.rs
@@ -235,7 +235,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
     match origin {
         &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
         &ConcreteDataType::Boolean(_) => Ok(Type::BOOL),
-        &ConcreteDataType::Int8(_) => Ok(Type::CHAR),
+        &ConcreteDataType::Int8(_) => Ok(Type::INT2),
         &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2),
         &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4),
         &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8),
@@ -253,7 +253,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
         ConcreteDataType::List(list) => match list.item_type() {
             &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
             &ConcreteDataType::Boolean(_) => Ok(Type::BOOL_ARRAY),
-            &ConcreteDataType::Int8(_) => Ok(Type::CHAR_ARRAY),
+            &ConcreteDataType::Int8(_) => Ok(Type::INT2_ARRAY),
             &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2_ARRAY),
             &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4_ARRAY),
             &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8_ARRAY),
@@ -1151,7 +1151,7 @@ mod test {
         let pg_field_info = vec![
             FieldInfo::new("nulls".into(), None, None, Type::UNKNOWN, FieldFormat::Text),
             FieldInfo::new("bools".into(), None, None, Type::BOOL, FieldFormat::Text),
-            FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
+            FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
             FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),
@@ -1230,7 +1230,7 @@ mod test {
                 Type::NUMERIC,
                 FieldFormat::Text,
             ),
-            FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
+            FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
             FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),

From 04aa84af62640df1e2480ca671e3468649b99df2 Mon Sep 17 00:00:00 2001
From: Yingwen <realevenyag@gmail.com>
Date: Wed, 25 Mar 2026 11:10:19 +0800
Subject: [PATCH 39/42] feat: use ArrowReaderBuilder instead of the RowGroups
 API (#7853)

* feat: use ArrowReaderBuilder instead of the RowGroups API

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: make row_group_idx required

Signed-off-by: evenyag <realevenyag@gmail.com>

* chore: remove unused variant

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: collect total_fetch_elapsed metrics

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
---
 src/mito2/src/error.rs                        |  10 -
 src/mito2/src/memtable/bulk.rs                |   1 +
 src/mito2/src/memtable/bulk/chunk_reader.rs   |  65 +++
 src/mito2/src/memtable/bulk/part_reader.rs    |   1 -
 .../src/memtable/bulk/row_group_reader.rs     | 152 +-----
 src/mito2/src/read/last_row.rs                |   8 +-
 src/mito2/src/read/prune.rs                   |  21 +-
 src/mito2/src/read/scan_util.rs               |   2 +-
 src/mito2/src/sst/parquet.rs                  |   1 +
 src/mito2/src/sst/parquet/async_reader.rs     | 221 ++++++++
 src/mito2/src/sst/parquet/reader.rs           | 176 ++++---
 src/mito2/src/sst/parquet/row_group.rs        | 470 +-----------------
 12 files changed, 423 insertions(+), 705 deletions(-)
 create mode 100644 src/mito2/src/memtable/bulk/chunk_reader.rs
 create mode 100644 src/mito2/src/sst/parquet/async_reader.rs

diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs
index 923d8a2713..c6b69fe607 100644
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -616,15 +616,6 @@ pub enum Error {
         location: Location,
     },
 
-    #[snafu(display("Failed to read arrow record batch from parquet file {}", path))]
-    ArrowReader {
-        path: String,
-        #[snafu(source)]
-        error: ArrowError,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
     #[snafu(display("Column not found, column: {column}"))]
     ColumnNotFound {
         column: String,
@@ -1349,7 +1340,6 @@ impl ErrorExt for Error {
             RegionState { .. } | UpdateManifest { .. } => StatusCode::RegionNotReady,
             JsonOptions { .. } => StatusCode::InvalidArguments,
             EmptyRegionDir { .. } | EmptyManifestDir { .. } => StatusCode::RegionNotFound,
-            ArrowReader { .. } => StatusCode::StorageUnavailable,
             ConvertValue { source, .. } => source.status_code(),
             ApplyBloomFilterIndex { source, .. } => source.status_code(),
             InvalidPartitionExpr { source, .. } => source.status_code(),
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs
index e649681b76..502b61759d 100644
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -14,6 +14,7 @@
 
 //! Memtable implementation for bulk load
 
+pub(crate) mod chunk_reader;
 #[allow(unused)]
 pub mod context;
 #[allow(unused)]
diff --git a/src/mito2/src/memtable/bulk/chunk_reader.rs b/src/mito2/src/memtable/bulk/chunk_reader.rs
new file mode 100644
index 0000000000..e632cd1b37
--- /dev/null
+++ b/src/mito2/src/memtable/bulk/chunk_reader.rs
@@ -0,0 +1,65 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! ChunkReader implementation for in-memory parquet bytes.
+
+use std::io::Cursor;
+
+use bytes::Bytes;
+use parquet::errors::{ParquetError, Result};
+use parquet::file::reader::{ChunkReader, Length};
+
+/// A [ChunkReader] backed by parquet bytes that are already held in memory.
+///
+/// It serves reads directly from the wrapped [Bytes] buffer and is used for
+/// reading the parquet data of a bulk memtable.
+#[derive(Clone)]
+pub struct MemtableChunkReader {
+    /// The in-memory parquet data that every read is served from.
+    data: Bytes,
+}
+
+impl MemtableChunkReader {
+    /// Creates a new [MemtableChunkReader] that reads from the given `data`.
+    pub fn new(data: Bytes) -> Self {
+        Self { data }
+    }
+}
+
+impl Length for MemtableChunkReader {
+    fn len(&self) -> u64 {
+        self.data.len() as u64 // length of the wrapped buffer, widened to u64
+    }
+}
+
+impl ChunkReader for MemtableChunkReader {
+    type T = Cursor<Bytes>;
+
+    fn get_read(&self, start: u64) -> Result<Self::T> {
+        let start = usize::try_from(start).unwrap_or(usize::MAX); // never truncate the offset
+        if start > self.data.len() {
+            return Err(ParquetError::IndexOutOfBound(start, self.data.len()));
+        }
+        Ok(Cursor::new(self.data.slice(start..)))
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
+        let start = usize::try_from(start).unwrap_or(usize::MAX); // never truncate the offset
+        let end = start.saturating_add(length); // overflow is reported as out of bounds
+        if end > self.data.len() {
+            return Err(ParquetError::IndexOutOfBound(end, self.data.len()));
+        }
+        Ok(self.data.slice(start..end))
+    }
+}
diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs
index 904aae8c90..edb9ff52d9 100644
--- a/src/mito2/src/memtable/bulk/part_reader.rs
+++ b/src/mito2/src/memtable/bulk/part_reader.rs
@@ -30,7 +30,6 @@ use crate::memtable::{MemScanMetrics, MemScanMetricsData};
 use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
 use crate::sst::parquet::file_range::{PreFilterMode, TagDecodeState};
 use crate::sst::parquet::flat_format::sequence_column_index;
-use crate::sst::parquet::reader::RowGroupReaderContext;
 
 /// Iterator for reading data inside a bulk part.
 pub struct EncodedBulkPartIter {
diff --git a/src/mito2/src/memtable/bulk/row_group_reader.rs b/src/mito2/src/memtable/bulk/row_group_reader.rs
index fccd22db10..40a5b2f85d 100644
--- a/src/mito2/src/memtable/bulk/row_group_reader.rs
+++ b/src/mito2/src/memtable/bulk/row_group_reader.rs
@@ -12,124 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::ops::Range;
 use std::sync::Arc;
 
 use bytes::Bytes;
-use datatypes::arrow::array::RecordBatch;
-use datatypes::arrow::error::ArrowError;
-use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection};
-use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels};
-use parquet::column::page::{PageIterator, PageReader};
-use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
+use parquet::arrow::ProjectionMask;
+use parquet::arrow::arrow_reader::{
+    ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader,
+    ParquetRecordBatchReaderBuilder, RowSelection,
+};
+use parquet::file::metadata::ParquetMetaData;
 use snafu::ResultExt;
 
 use crate::error;
 use crate::error::ReadDataPartSnafu;
+use crate::memtable::bulk::chunk_reader::MemtableChunkReader;
 use crate::memtable::bulk::context::BulkIterContextRef;
 use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
-use crate::sst::parquet::format::ReadFormat;
-use crate::sst::parquet::reader::RowGroupReaderContext;
-use crate::sst::parquet::row_group::{ColumnChunkIterator, RowGroupBase};
-
-/// Helper for reading specific row group inside Memtable Parquet parts.
-// This is similar to [mito2::sst::parquet::row_group::InMemoryRowGroup] since
-// it's a workaround for lacking of keyword generics.
-pub struct MemtableRowGroupPageFetcher<'a> {
-    /// Shared structs for reading row group.
-    base: RowGroupBase<'a>,
-    bytes: Bytes,
-}
-
-impl<'a> MemtableRowGroupPageFetcher<'a> {
-    pub(crate) fn create(
-        row_group_idx: usize,
-        parquet_meta: &'a ParquetMetaData,
-        bytes: Bytes,
-    ) -> Self {
-        Self {
-            // the cached `column_uncompressed_pages` would never be used in Memtable readers.
-            base: RowGroupBase::new(parquet_meta, row_group_idx),
-            bytes,
-        }
-    }
-
-    /// Fetches column pages from memory file.
-    pub(crate) fn fetch(&mut self, projection: &ProjectionMask, selection: Option<&RowSelection>) {
-        if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) {
-            // Selection provided.
-            let (fetch_ranges, page_start_offsets) =
-                self.base
-                    .calc_sparse_read_ranges(projection, offset_index, selection);
-            if fetch_ranges.is_empty() {
-                return;
-            }
-            let chunk_data = self.fetch_bytes(&fetch_ranges);
-
-            self.base
-                .assign_sparse_chunk(projection, chunk_data, page_start_offsets);
-        } else {
-            let fetch_ranges = self.base.calc_dense_read_ranges(projection);
-            if fetch_ranges.is_empty() {
-                // Nothing to fetch.
-                return;
-            }
-            let chunk_data = self.fetch_bytes(&fetch_ranges);
-            self.base.assign_dense_chunk(projection, chunk_data);
-        }
-    }
-
-    fn fetch_bytes(&self, ranges: &[Range<u64>]) -> Vec<Bytes> {
-        ranges
-            .iter()
-            .map(|range| self.bytes.slice(range.start as usize..range.end as usize))
-            .collect()
-    }
-
-    /// Creates a page reader to read column at `i`.
-    fn column_page_reader(&self, i: usize) -> parquet::errors::Result<Box<dyn PageReader>> {
-        let reader = self.base.column_reader(i)?;
-        Ok(Box::new(reader))
-    }
-}
-
-impl RowGroups for MemtableRowGroupPageFetcher<'_> {
-    fn num_rows(&self) -> usize {
-        self.base.row_count
-    }
-
-    fn column_chunks(&self, i: usize) -> parquet::errors::Result<Box<dyn PageIterator>> {
-        Ok(Box::new(ColumnChunkIterator {
-            reader: Some(self.column_page_reader(i)),
-        }))
-    }
-
-    fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_> {
-        Box::new(std::iter::once(self.base.row_group_metadata()))
-    }
-
-    fn metadata(&self) -> &ParquetMetaData {
-        self.base.parquet_metadata()
-    }
-}
-
-impl RowGroupReaderContext for BulkIterContextRef {
-    fn map_result(
-        &self,
-        result: Result<Option<RecordBatch>, ArrowError>,
-    ) -> error::Result<Option<RecordBatch>> {
-        result.context(error::DecodeArrowRowGroupSnafu)
-    }
-
-    fn read_format(&self) -> &ReadFormat {
-        self.as_ref().read_format()
-    }
-}
 
 pub(crate) struct MemtableRowGroupReaderBuilder {
     projection: ProjectionMask,
     parquet_metadata: Arc<ParquetMetaData>,
-    field_levels: FieldLevels,
+    arrow_metadata: ArrowReaderMetadata,
     data: Bytes,
 }
 
@@ -140,15 +43,16 @@ impl MemtableRowGroupReaderBuilder {
         parquet_metadata: Arc<ParquetMetaData>,
         data: Bytes,
     ) -> error::Result<Self> {
-        let parquet_schema_desc = parquet_metadata.file_metadata().schema_descr();
-        let hint = Some(context.read_format().arrow_schema().fields());
-        let field_levels =
-            parquet_to_arrow_field_levels(parquet_schema_desc, projection.clone(), hint)
+        // Create ArrowReaderMetadata for building the reader.
+        let arrow_reader_options =
+            ArrowReaderOptions::new().with_schema(context.read_format().arrow_schema().clone());
+        let arrow_metadata =
+            ArrowReaderMetadata::try_new(parquet_metadata.clone(), arrow_reader_options)
                 .context(ReadDataPartSnafu)?;
         Ok(Self {
             projection,
             parquet_metadata,
-            field_levels,
+            arrow_metadata,
             data,
         })
     }
@@ -159,23 +63,21 @@ impl MemtableRowGroupReaderBuilder {
         row_group_idx: usize,
         row_selection: Option<RowSelection>,
     ) -> error::Result<ParquetRecordBatchReader> {
-        let mut row_group = MemtableRowGroupPageFetcher::create(
-            row_group_idx,
-            &self.parquet_metadata,
-            self.data.clone(),
-        );
-        // Fetches data from memory part. Currently, row selection is not supported.
-        row_group.fetch(&self.projection, row_selection.as_ref());
+        let chunk_reader = MemtableChunkReader::new(self.data.clone());
 
-        // Builds the parquet reader.
-        // Now the row selection is None.
-        ParquetRecordBatchReader::try_new_with_row_groups(
-            &self.field_levels,
-            &row_group,
-            DEFAULT_READ_BATCH_SIZE,
-            row_selection,
+        let mut builder = ParquetRecordBatchReaderBuilder::new_with_metadata(
+            chunk_reader,
+            self.arrow_metadata.clone(),
         )
-        .context(ReadDataPartSnafu)
+        .with_row_groups(vec![row_group_idx])
+        .with_projection(self.projection.clone())
+        .with_batch_size(DEFAULT_READ_BATCH_SIZE);
+
+        if let Some(selection) = row_selection {
+            builder = builder.with_row_selection(selection);
+        }
+
+        builder.build().context(ReadDataPartSnafu)
     }
 
     /// Computes whether to skip field filters for a specific row group based on PreFilterMode.
diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs
index 0c13c120a0..1dc4102311 100644
--- a/src/mito2/src/read/last_row.rs
+++ b/src/mito2/src/read/last_row.rs
@@ -333,10 +333,10 @@ impl FlatRowGroupLastRowCachedReader {
     }
 
     /// Returns the next RecordBatch.
-    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
         match self {
             FlatRowGroupLastRowCachedReader::Hit(r) => r.next_batch(),
-            FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch(),
+            FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch().await,
         }
     }
 
@@ -466,12 +466,12 @@ impl FlatRowGroupLastRowReader {
         Ok(Some(merged))
     }
 
-    fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+    async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
         if self.pending.is_full() {
             return self.flush_pending();
         }
 
-        while let Some(batch) = self.reader.next_batch()? {
+        while let Some(batch) = self.reader.next_batch().await? {
             self.selector.on_next(batch, &mut self.pending)?;
             if self.pending.is_full() {
                 return self.flush_pending();
diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs
index 2f9fa002d4..6766bf3f38 100644
--- a/src/mito2/src/read/prune.rs
+++ b/src/mito2/src/read/prune.rs
@@ -247,10 +247,10 @@ pub enum FlatSource {
 }
 
 impl FlatSource {
-    fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+    async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
         match self {
-            FlatSource::RowGroup(r) => r.next_batch(),
-            FlatSource::LastRow(r) => r.next_batch(),
+            FlatSource::RowGroup(r) => r.next_batch().await,
+            FlatSource::LastRow(r) => r.next_batch().await,
         }
     }
 }
@@ -297,13 +297,16 @@ impl FlatPruneReader {
         self.metrics.clone()
     }
 
-    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
-        while let Some(record_batch) = {
+    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+        loop {
             let start = std::time::Instant::now();
-            let batch = self.source.next_batch()?;
+            let batch = self.source.next_batch().await?;
             self.metrics.scan_cost += start.elapsed();
-            batch
-        } {
+
+            let Some(record_batch) = batch else {
+                return Ok(None);
+            };
+
             // Update metrics for the received batch
             self.metrics.num_rows += record_batch.num_rows();
             self.metrics.num_batches += 1;
@@ -317,8 +320,6 @@ impl FlatPruneReader {
                 }
             }
         }
-
-        Ok(None)
     }
 
     /// Prunes batches by the pushed down predicate and returns RecordBatch.
diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs
index 6f68616709..9bf1c17276 100644
--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -1533,7 +1533,7 @@ pub fn build_flat_file_range_scan_stream(
                 .transpose()?;
 
             let mapper = range.compaction_projection_mapper();
-            while let Some(record_batch) = reader.next_batch()? {
+            while let Some(record_batch) = reader.next_batch().await? {
                 let record_batch = if let Some(mapper) = mapper {
                     let batch = mapper.project(record_batch)?;
                     batch
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index fb8e1d1fc2..79a08a209d 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -24,6 +24,7 @@ use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
 use crate::sst::file::FileTimeRange;
 use crate::sst::index::IndexOutput;
 
+pub(crate) mod async_reader;
 pub mod file_range;
 pub mod flat_format;
 pub mod format;
diff --git a/src/mito2/src/sst/parquet/async_reader.rs b/src/mito2/src/sst/parquet/async_reader.rs
new file mode 100644
index 0000000000..a060fd367d
--- /dev/null
+++ b/src/mito2/src/sst/parquet/async_reader.rs
@@ -0,0 +1,221 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Async file reader implementation for SST parquet files.
+
+use std::ops::Range;
+use std::sync::Arc;
+
+use bytes::Bytes;
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use object_store::ObjectStore;
+use parquet::arrow::async_reader::AsyncFileReader;
+use parquet::errors::{ParquetError, Result as ParquetResult};
+use parquet::file::metadata::ParquetMetaData;
+
+use crate::cache::file_cache::{FileType, IndexKey};
+use crate::cache::{CacheStrategy, PageKey, PageValue};
+use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
+use crate::sst::file::RegionFileId;
+use crate::sst::parquet::helper::fetch_byte_ranges;
+use crate::sst::parquet::row_group::{ParquetFetchMetrics, compute_total_range_size};
+
+/// An [AsyncFileReader] implementation for SST parquet files.
+///
+/// This reader provides async byte access to parquet data in object storage,
+/// with caching support (page cache and write cache).
+pub struct SstAsyncFileReader {
+    /// Region file ID for cache key.
+    region_file_id: RegionFileId,
+    /// Path to the parquet file in object storage.
+    file_path: String,
+    /// Object store for reading data.
+    object_store: ObjectStore,
+    /// Cache strategy for reading pages.
+    cache_strategy: CacheStrategy,
+    /// Cached parquet metadata.
+    metadata: Arc<ParquetMetaData>,
+    /// Row group index for cache key.
+    row_group_idx: usize,
+    /// Optional metrics for tracking fetch operations.
+    fetch_metrics: Option<ParquetFetchMetrics>,
+}
+
+impl SstAsyncFileReader {
+    /// Creates a new [SstAsyncFileReader].
+    pub fn new(
+        region_file_id: RegionFileId,
+        file_path: String,
+        object_store: ObjectStore,
+        cache_strategy: CacheStrategy,
+        metadata: Arc<ParquetMetaData>,
+        row_group_idx: usize,
+    ) -> Self {
+        Self {
+            region_file_id,
+            file_path,
+            object_store,
+            cache_strategy,
+            metadata,
+            row_group_idx,
+            fetch_metrics: None,
+        }
+    }
+
+    /// Sets the fetch metrics.
+    pub fn with_fetch_metrics(mut self, metrics: Option<ParquetFetchMetrics>) -> Self {
+        self.fetch_metrics = metrics;
+        self
+    }
+
+    /// Fetches byte ranges from page cache, write cache, or object store.
+    async fn fetch_bytes_with_cache(&self, ranges: Vec<Range<u64>>) -> ParquetResult<Vec<Bytes>> {
+        let fetch_start = self
+            .fetch_metrics
+            .as_ref()
+            .map(|_| std::time::Instant::now());
+        let _timer = READ_STAGE_FETCH_PAGES.start_timer();
+
+        let page_key = PageKey::new(
+            self.region_file_id.file_id(),
+            self.row_group_idx,
+            ranges.clone(),
+        );
+
+        // Check page cache first.
+        if let Some(pages) = self.cache_strategy.get_pages(&page_key) {
+            if let Some(metrics) = &self.fetch_metrics {
+                let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+                let mut metrics_data = metrics.data.lock().unwrap();
+                metrics_data.page_cache_hit += 1;
+                metrics_data.pages_to_fetch_mem += ranges.len();
+                metrics_data.page_size_to_fetch_mem += total_size;
+                metrics_data.page_size_needed += total_size;
+                if let Some(start) = fetch_start {
+                    metrics_data.total_fetch_elapsed += start.elapsed();
+                }
+            }
+            return Ok(pages.compressed.clone());
+        }
+
+        // Calculate total range size for metrics.
+        let (total_range_size, unaligned_size) = compute_total_range_size(&ranges);
+
+        // Check write cache.
+        let key = IndexKey::new(
+            self.region_file_id.region_id(),
+            self.region_file_id.file_id(),
+            FileType::Parquet,
+        );
+        let fetch_write_cache_start = self
+            .fetch_metrics
+            .as_ref()
+            .map(|_| std::time::Instant::now());
+        let write_cache_result = self.fetch_ranges_from_write_cache(key, &ranges).await;
+
+        let pages = match write_cache_result {
+            Some(data) => {
+                if let Some(metrics) = &self.fetch_metrics {
+                    let elapsed = fetch_write_cache_start
+                        .map(|start| start.elapsed())
+                        .unwrap_or_default();
+                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+                    let mut metrics_data = metrics.data.lock().unwrap();
+                    metrics_data.write_cache_fetch_elapsed += elapsed;
+                    metrics_data.write_cache_hit += 1;
+                    metrics_data.pages_to_fetch_write_cache += ranges.len();
+                    metrics_data.page_size_to_fetch_write_cache += unaligned_size;
+                    metrics_data.page_size_needed += range_size_needed;
+                }
+                data
+            }
+            None => {
+                // Fetch data from object store.
+                let _timer = READ_STAGE_ELAPSED
+                    .with_label_values(&["cache_miss_read"])
+                    .start_timer();
+
+                let start = self
+                    .fetch_metrics
+                    .as_ref()
+                    .map(|_| std::time::Instant::now());
+                let data = fetch_byte_ranges(&self.file_path, self.object_store.clone(), &ranges)
+                    .await
+                    .map_err(|e| ParquetError::External(Box::new(e)))?;
+
+                if let Some(metrics) = &self.fetch_metrics {
+                    let elapsed = start.map(|start| start.elapsed()).unwrap_or_default();
+                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+                    let mut metrics_data = metrics.data.lock().unwrap();
+                    metrics_data.store_fetch_elapsed += elapsed;
+                    metrics_data.cache_miss += 1;
+                    metrics_data.pages_to_fetch_store += ranges.len();
+                    metrics_data.page_size_to_fetch_store += unaligned_size;
+                    metrics_data.page_size_needed += range_size_needed;
+                }
+                data
+            }
+        };
+
+        // Put the fetched pages into the page cache.
+        let page_value = PageValue::new(pages.clone(), total_range_size);
+        self.cache_strategy
+            .put_pages(page_key, Arc::new(page_value));
+
+        if let (Some(metrics), Some(start)) = (&self.fetch_metrics, fetch_start) {
+            metrics.data.lock().unwrap().total_fetch_elapsed += start.elapsed();
+        }
+
+        Ok(pages)
+    }
+
+    /// Fetches data from write cache.
+    /// Returns `None` if the data is not in the cache.
+    async fn fetch_ranges_from_write_cache(
+        &self,
+        key: IndexKey,
+        ranges: &[Range<u64>],
+    ) -> Option<Vec<Bytes>> {
+        if let Some(cache) = self.cache_strategy.write_cache() {
+            return cache.file_cache().read_ranges(key, ranges).await;
+        }
+        None
+    }
+}
+
+impl AsyncFileReader for SstAsyncFileReader {
+    fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, ParquetResult<Bytes>> {
+        async move {
+            let mut result = self.fetch_bytes_with_cache(vec![range]).await?;
+            Ok(result.pop().unwrap_or_default())
+        }
+        .boxed()
+    }
+
+    fn get_byte_ranges(
+        &mut self,
+        ranges: Vec<Range<u64>>,
+    ) -> BoxFuture<'_, ParquetResult<Vec<Bytes>>> {
+        async move { self.fetch_bytes_with_cache(ranges).await }.boxed()
+    }
+
+    fn get_metadata(
+        &mut self,
+        _options: Option<&parquet::arrow::arrow_reader::ArrowReaderOptions>,
+    ) -> BoxFuture<'_, ParquetResult<Arc<ParquetMetaData>>> {
+        // Metadata is already cached; return it immediately.
+        std::future::ready(Ok(self.metadata.clone())).boxed()
+    }
+}
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 855204b80e..f152c97075 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -26,14 +26,15 @@ use common_telemetry::{tracing, warn};
 use datafusion_expr::Expr;
 use datatypes::arrow::array::ArrayRef;
 use datatypes::arrow::datatypes::Field;
-use datatypes::arrow::error::ArrowError;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::data_type::ConcreteDataType;
 use datatypes::prelude::DataType;
+use futures::StreamExt;
 use mito_codec::row_converter::build_primary_key_codec;
 use object_store::ObjectStore;
-use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection};
-use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels};
+use parquet::arrow::ProjectionMask;
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions, RowSelection};
+use parquet::arrow::async_reader::{ParquetRecordBatchStream, ParquetRecordBatchStreamBuilder};
 use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
 use partition::expr::PartitionExpr;
 use snafu::ResultExt;
@@ -47,9 +48,7 @@ use crate::cache::index::result_cache::PredicateKey;
 use crate::cache::{CacheStrategy, CachedSstMeta};
 #[cfg(feature = "vector_index")]
 use crate::error::ApplyVectorIndexSnafu;
-use crate::error::{
-    ArrowReaderSnafu, ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu,
-};
+use crate::error::{ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu};
 use crate::metrics::{
     PRECISE_FILTER_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL,
     READ_ROWS_TOTAL, READ_STAGE_ELAPSED,
@@ -70,13 +69,14 @@ use crate::sst::index::inverted_index::applier::{
 #[cfg(feature = "vector_index")]
 use crate::sst::index::vector_index::applier::VectorIndexApplierRef;
 use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
+use crate::sst::parquet::async_reader::SstAsyncFileReader;
 use crate::sst::parquet::file_range::{
     FileRangeContext, FileRangeContextRef, PartitionFilterContext, PreFilterMode, RangeBase,
     row_group_contains_delete,
 };
 use crate::sst::parquet::format::{ReadFormat, need_override_sequence};
 use crate::sst::parquet::metadata::MetadataLoader;
-use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics};
+use crate::sst::parquet::row_group::ParquetFetchMetrics;
 use crate::sst::parquet::row_selection::RowGroupSelection;
 use crate::sst::parquet::stats::RowGroupPruningStats;
 use crate::sst::tag_maybe_to_dictionary_field;
@@ -415,6 +415,12 @@ impl ParquetReaderBuilder {
                 .set_override_sequence(self.file_handle.meta_ref().sequence.map(|x| x.get()));
         }
 
+        // Computes the projection mask.
+        let parquet_schema_desc = parquet_meta.file_metadata().schema_descr();
+        let indices = read_format.projection_indices();
+        // Now we assume we don't have nested schemas.
+        // TODO(yingwen): Revisit this if we introduce nested types such as JSON type.
+        let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied());
         let selection = self
             .row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics)
             .await;
@@ -446,26 +452,20 @@ impl ParquetReaderBuilder {
             .map(|meta| meta.schema.clone())
             .unwrap_or_else(|| region_meta.schema.clone());
 
-        // Computes the projection mask.
-        let parquet_schema_desc = parquet_meta.file_metadata().schema_descr();
-        let indices = read_format.projection_indices();
-        // Now we assumes we don't have nested schemas.
-        // TODO(yingwen): Revisit this if we introduce nested types such as JSON type.
-        let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied());
-
-        // Computes the field levels.
-        let hint = Some(read_format.arrow_schema().fields());
-        let field_levels =
-            parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint)
+        // Create ArrowReaderMetadata for async stream building.
+        let arrow_reader_options =
+            ArrowReaderOptions::new().with_schema(read_format.arrow_schema().clone());
+        let arrow_metadata =
+            ArrowReaderMetadata::try_new(parquet_meta.clone(), arrow_reader_options)
                 .context(ReadDataPartSnafu)?;
 
         let reader_builder = RowGroupReaderBuilder {
             file_handle: self.file_handle.clone(),
             file_path,
             parquet_meta,
+            arrow_metadata,
             object_store: self.object_store.clone(),
             projection: projection_mask,
-            field_levels,
             cache_strategy: self.cache_strategy.clone(),
         };
 
@@ -1640,7 +1640,7 @@ impl ReaderMetrics {
     }
 }
 
-/// Builder to build a [ParquetRecordBatchReader] for a row group.
+/// Builder to build a [ParquetRecordBatchStream] for a row group.
 pub(crate) struct RowGroupReaderBuilder {
     /// SST file to read.
     ///
@@ -1650,12 +1650,12 @@ pub(crate) struct RowGroupReaderBuilder {
     file_path: String,
     /// Metadata of the parquet file.
     parquet_meta: Arc<ParquetMetaData>,
+    /// Arrow reader metadata for building async stream.
+    arrow_metadata: ArrowReaderMetadata,
     /// Object store as an Operator.
     object_store: ObjectStore,
     /// Projection mask.
     projection: ProjectionMask,
-    /// Field levels to read.
-    field_levels: FieldLevels,
     /// Cache.
     cache_strategy: CacheStrategy,
 }
@@ -1679,48 +1679,43 @@ impl RowGroupReaderBuilder {
         &self.cache_strategy
     }
 
-    /// Builds a [ParquetRecordBatchReader] to read the row group at `row_group_idx`.
+    /// Builds a [ParquetRecordBatchStream] to read the row group at `row_group_idx`.
     pub(crate) async fn build(
         &self,
         row_group_idx: usize,
         row_selection: Option<RowSelection>,
         fetch_metrics: Option<&ParquetFetchMetrics>,
-    ) -> Result<ParquetRecordBatchReader> {
-        let fetch_start = Instant::now();
-
-        let mut row_group = InMemoryRowGroup::create(
-            self.file_handle.region_id(),
-            self.file_handle.file_id().file_id(),
-            &self.parquet_meta,
-            row_group_idx,
-            self.cache_strategy.clone(),
-            &self.file_path,
+    ) -> Result<ParquetRecordBatchStream<SstAsyncFileReader>> {
+        // Create async file reader with caching support.
+        let async_reader = SstAsyncFileReader::new(
+            self.file_handle.file_id(),
+            self.file_path.clone(),
             self.object_store.clone(),
-        );
-        // Fetches data into memory.
-        row_group
-            .fetch(&self.projection, row_selection.as_ref(), fetch_metrics)
-            .await
-            .context(ReadParquetSnafu {
-                path: &self.file_path,
-            })?;
+            self.cache_strategy.clone(),
+            self.parquet_meta.clone(),
+            row_group_idx,
+        )
+        .with_fetch_metrics(fetch_metrics.cloned());
 
-        // Record total fetch elapsed time.
-        if let Some(metrics) = fetch_metrics {
-            metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed();
+        // Build the async stream using ArrowReaderBuilder API.
+        let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
+            async_reader,
+            self.arrow_metadata.clone(),
+        );
+        builder = builder
+            .with_row_groups(vec![row_group_idx])
+            .with_projection(self.projection.clone())
+            .with_batch_size(DEFAULT_READ_BATCH_SIZE);
+
+        if let Some(selection) = row_selection {
+            builder = builder.with_row_selection(selection);
         }
 
-        // Builds the parquet reader.
-        // Now the row selection is None.
-        ParquetRecordBatchReader::try_new_with_row_groups(
-            &self.field_levels,
-            &row_group,
-            DEFAULT_READ_BATCH_SIZE,
-            row_selection,
-        )
-        .context(ReadParquetSnafu {
+        let stream = builder.build().context(ReadParquetSnafu {
             path: &self.file_path,
-        })
+        })?;
+
+        Ok(stream)
     }
 }
 
@@ -1850,7 +1845,7 @@ impl ParquetReader {
     pub async fn next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
         loop {
             if let Some(reader) = &mut self.reader {
-                if let Some(batch) = reader.next_batch()? {
+                if let Some(batch) = reader.next_batch().await? {
                     return Ok(Some(batch));
                 }
                 self.reader = None;
@@ -1929,27 +1924,19 @@ impl ParquetReader {
 /// RowGroupReaderContext represents the fields that cannot be shared
 /// between different `RowGroupReader`s.
 pub(crate) trait RowGroupReaderContext: Send {
-    fn map_result(
-        &self,
-        result: std::result::Result<Option<RecordBatch>, ArrowError>,
-    ) -> Result<Option<RecordBatch>>;
-
     fn read_format(&self) -> &ReadFormat;
+
+    fn file_path(&self) -> &str;
 }
 
 impl RowGroupReaderContext for FileRangeContextRef {
-    fn map_result(
-        &self,
-        result: std::result::Result<Option<RecordBatch>, ArrowError>,
-    ) -> Result<Option<RecordBatch>> {
-        result.context(ArrowReaderSnafu {
-            path: self.file_path(),
-        })
-    }
-
     fn read_format(&self) -> &ReadFormat {
         self.as_ref().read_format()
     }
+
+    fn file_path(&self) -> &str {
+        self.as_ref().file_path()
+    }
 }
 
 /// [RowGroupReader] that reads from [FileRange].
@@ -1957,8 +1944,11 @@ pub(crate) type RowGroupReader = RowGroupReaderBase<FileRangeContextRef>;
 
 impl RowGroupReader {
     /// Creates a new reader from file range.
-    pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self {
-        Self::create(context, reader)
+    pub(crate) fn new(
+        context: FileRangeContextRef,
+        stream: ParquetRecordBatchStream<SstAsyncFileReader>,
+    ) -> Self {
+        Self::create(context, stream)
     }
 }
 
@@ -1966,8 +1956,8 @@ impl RowGroupReader {
 pub(crate) struct RowGroupReaderBase<T> {
     /// Context of [RowGroupReader] so adapts to different underlying implementation.
     context: T,
-    /// Inner parquet reader.
-    reader: ParquetRecordBatchReader,
+    /// Inner parquet record batch stream.
+    stream: ParquetRecordBatchStream<SstAsyncFileReader>,
     /// Buffered batches to return.
     batches: VecDeque<Batch>,
     /// Local scan metrics.
@@ -1981,7 +1971,7 @@ where
     T: RowGroupReaderContext,
 {
     /// Creates a new reader to read the primary key format.
-    pub(crate) fn create(context: T, reader: ParquetRecordBatchReader) -> Self {
+    pub(crate) fn create(context: T, stream: ParquetRecordBatchStream<SstAsyncFileReader>) -> Self {
         // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE.
         let override_sequence = context
             .read_format()
@@ -1990,7 +1980,7 @@ where
 
         Self {
             context,
-            reader,
+            stream,
             batches: VecDeque::new(),
             metrics: ReaderMetrics::default(),
             override_sequence,
@@ -2007,13 +1997,18 @@ where
         self.context.read_format()
     }
 
-    /// Tries to fetch next [RecordBatch] from the reader.
-    fn fetch_next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
-        self.context.map_result(self.reader.next().transpose())
+    /// Tries to fetch next [RecordBatch] from the stream asynchronously.
+    async fn fetch_next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
+        match self.stream.next().await.transpose() {
+            Ok(batch) => Ok(batch),
+            Err(e) => Err(e).context(ReadParquetSnafu {
+                path: self.context.file_path(),
+            }),
+        }
     }
 
     /// Returns the next [Batch].
-    pub(crate) fn next_inner(&mut self) -> Result<Option<Batch>> {
+    pub(crate) async fn next_inner(&mut self) -> Result<Option<Batch>> {
         let scan_start = Instant::now();
         if let Some(batch) = self.batches.pop_front() {
             self.metrics.num_rows += batch.num_rows();
@@ -2023,7 +2018,7 @@ where
 
         // We need to fetch next record batch and convert it to batches.
         while self.batches.is_empty() {
-            let Some(record_batch) = self.fetch_next_record_batch()? else {
+            let Some(record_batch) = self.fetch_next_record_batch().await? else {
                 self.metrics.scan_cost += scan_start.elapsed();
                 return Ok(None);
             };
@@ -2051,10 +2046,10 @@ where
 #[async_trait::async_trait]
 impl<T> BatchReader for RowGroupReaderBase<T>
 where
-    T: RowGroupReaderContext,
+    T: RowGroupReaderContext + Send + Sync,
 {
     async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        self.next_inner()
+        self.next_inner().await
     }
 }
 
@@ -2062,15 +2057,18 @@ where
 pub(crate) struct FlatRowGroupReader {
     /// Context for file ranges.
     context: FileRangeContextRef,
-    /// Inner parquet reader.
-    reader: ParquetRecordBatchReader,
+    /// Inner parquet record batch stream.
+    stream: ParquetRecordBatchStream<SstAsyncFileReader>,
     /// Cached sequence array to override sequences.
     override_sequence: Option<ArrayRef>,
 }
 
 impl FlatRowGroupReader {
     /// Creates a new flat reader from file range.
-    pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self {
+    pub(crate) fn new(
+        context: FileRangeContextRef,
+        stream: ParquetRecordBatchStream<SstAsyncFileReader>,
+    ) -> Self {
         // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE.
         let override_sequence = context
             .read_format()
@@ -2078,16 +2076,16 @@ impl FlatRowGroupReader {
 
         Self {
             context,
-            reader,
+            stream,
             override_sequence,
         }
     }
 
     /// Returns the next RecordBatch.
-    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
-        match self.reader.next() {
+    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+        match self.stream.next().await {
             Some(batch_result) => {
-                let record_batch = batch_result.context(ArrowReaderSnafu {
+                let record_batch = batch_result.context(ReadParquetSnafu {
                     path: self.context.file_path(),
                 })?;
 
diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs
index 8f3f6c5f62..38ef62c6b8 100644
--- a/src/mito2/src/sst/parquet/row_group.rs
+++ b/src/mito2/src/sst/parquet/row_group.rs
@@ -12,28 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Ports private structs from [parquet crate](https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/arrow/async_reader/mod.rs#L644-L650).
+//! Parquet row group reading utilities.
 
 use std::ops::Range;
 use std::sync::Arc;
 
-use bytes::{Buf, Bytes};
-use object_store::ObjectStore;
-use parquet::arrow::ProjectionMask;
-use parquet::arrow::arrow_reader::{RowGroups, RowSelection};
-use parquet::column::page::{PageIterator, PageReader};
-use parquet::errors::{ParquetError, Result};
-use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
-use parquet::file::page_index::offset_index::OffsetIndexMetaData;
-use parquet::file::reader::{ChunkReader, Length};
-use parquet::file::serialized_reader::SerializedPageReader;
-use store_api::storage::{FileId, RegionId};
-use tokio::task::yield_now;
-
-use crate::cache::file_cache::{FileType, IndexKey};
-use crate::cache::{CacheStrategy, PageKey, PageValue};
-use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
-use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges};
+use crate::sst::parquet::helper::MERGE_GAP;
 
 /// Inner data for ParquetFetchMetrics.
 #[derive(Default, Debug, Clone)]
@@ -74,9 +58,9 @@ impl ParquetFetchMetricsData {
 }
 
 /// Metrics for tracking page/row group fetch operations.
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct ParquetFetchMetrics {
-    pub data: std::sync::Mutex<ParquetFetchMetricsData>,
+    pub data: Arc<std::sync::Mutex<ParquetFetchMetricsData>>,
 }
 
 impl std::fmt::Debug for ParquetFetchMetrics {
@@ -204,363 +188,12 @@ impl ParquetFetchMetrics {
     }
 }
 
-pub(crate) struct RowGroupBase<'a> {
-    parquet_metadata: &'a ParquetMetaData,
-    row_group_idx: usize,
-    pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>,
-    /// Compressed page of each column.
-    column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
-    pub(crate) row_count: usize,
-}
-
-impl<'a> RowGroupBase<'a> {
-    pub(crate) fn new(parquet_meta: &'a ParquetMetaData, row_group_idx: usize) -> Self {
-        let metadata = parquet_meta.row_group(row_group_idx);
-        // `offset_index` is always `None` if we don't set
-        // [with_page_index()](https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index)
-        // to `true`.
-        let offset_index = parquet_meta
-            .offset_index()
-            // filter out empty offset indexes (old versions specified Some(vec![]) when no present)
-            .filter(|index| !index.is_empty())
-            .map(|x| x[row_group_idx].as_slice());
-
-        Self {
-            parquet_metadata: parquet_meta,
-            row_group_idx,
-            offset_index,
-            column_chunks: vec![None; metadata.columns().len()],
-            row_count: metadata.num_rows() as usize,
-        }
-    }
-
-    pub(crate) fn calc_sparse_read_ranges(
-        &self,
-        projection: &ProjectionMask,
-        offset_index: &[OffsetIndexMetaData],
-        selection: &RowSelection,
-    ) -> (Vec<Range<u64>>, Vec<Vec<usize>>) {
-        // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the
-        // `RowSelection`
-        let mut page_start_offsets: Vec<Vec<usize>> = vec![];
-        let ranges = self
-            .column_chunks
-            .iter()
-            .zip(self.row_group_metadata().columns())
-            .enumerate()
-            .filter(|&(idx, (chunk, _chunk_meta))| chunk.is_none() && projection.leaf_included(idx))
-            .flat_map(|(idx, (_chunk, chunk_meta))| {
-                // If the first page does not start at the beginning of the column,
-                // then we need to also fetch a dictionary page.
-                let mut ranges = vec![];
-                let (start, _len) = chunk_meta.byte_range();
-                match offset_index[idx].page_locations.first() {
-                    Some(first) if first.offset as u64 != start => {
-                        ranges.push(start..first.offset as u64);
-                    }
-                    _ => (),
-                }
-
-                ranges.extend(
-                    selection
-                        .scan_ranges(&offset_index[idx].page_locations)
-                        .iter()
-                        .map(|range| range.start..range.end),
-                );
-                page_start_offsets.push(ranges.iter().map(|range| range.start as usize).collect());
-
-                ranges
-            })
-            .collect::<Vec<_>>();
-        (ranges, page_start_offsets)
-    }
-
-    pub(crate) fn assign_sparse_chunk(
-        &mut self,
-        projection: &ProjectionMask,
-        data: Vec<Bytes>,
-        page_start_offsets: Vec<Vec<usize>>,
-    ) {
-        let mut page_start_offsets = page_start_offsets.into_iter();
-        let mut chunk_data = data.into_iter();
-
-        for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
-            if chunk.is_some() || !projection.leaf_included(idx) {
-                continue;
-            }
-
-            if let Some(offsets) = page_start_offsets.next() {
-                let mut chunks = Vec::with_capacity(offsets.len());
-                for _ in 0..offsets.len() {
-                    chunks.push(chunk_data.next().unwrap());
-                }
-
-                let column = self
-                    .parquet_metadata
-                    .row_group(self.row_group_idx)
-                    .column(idx);
-                *chunk = Some(Arc::new(ColumnChunkData::Sparse {
-                    length: column.byte_range().1 as usize,
-                    data: offsets.into_iter().zip(chunks).collect(),
-                }))
-            }
-        }
-    }
-
-    pub(crate) fn calc_dense_read_ranges(&self, projection: &ProjectionMask) -> Vec<Range<u64>> {
-        self.column_chunks
-            .iter()
-            .enumerate()
-            .filter(|&(idx, chunk)| chunk.is_none() && projection.leaf_included(idx))
-            .map(|(idx, _chunk)| {
-                let column = self.row_group_metadata().column(idx);
-                let (start, length) = column.byte_range();
-                start..(start + length)
-            })
-            .collect::<Vec<_>>()
-    }
-
-    /// Assigns compressed chunk binary data to [RowGroupBase::column_chunks]
-    /// and returns the chunk offset and binary data assigned.
-    pub(crate) fn assign_dense_chunk(
-        &mut self,
-        projection: &ProjectionMask,
-        chunk_data: Vec<Bytes>,
-    ) {
-        let mut chunk_data = chunk_data.into_iter();
-
-        for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
-            if chunk.is_some() || !projection.leaf_included(idx) {
-                continue;
-            }
-
-            // Get the fetched page.
-            let Some(data) = chunk_data.next() else {
-                continue;
-            };
-
-            let column = self
-                .parquet_metadata
-                .row_group(self.row_group_idx)
-                .column(idx);
-            *chunk = Some(Arc::new(ColumnChunkData::Dense {
-                offset: column.byte_range().0 as usize,
-                data,
-            }));
-        }
-    }
-
-    /// Create [PageReader] from [RowGroupBase::column_chunks]
-    pub(crate) fn column_reader(
-        &self,
-        col_idx: usize,
-    ) -> Result<SerializedPageReader<ColumnChunkData>> {
-        let page_reader = match &self.column_chunks[col_idx] {
-            None => {
-                return Err(ParquetError::General(format!(
-                    "Invalid column index {col_idx}, column was not fetched"
-                )));
-            }
-            Some(data) => {
-                let page_locations = self
-                    .offset_index
-                    // filter out empty offset indexes (old versions specified Some(vec![]) when no present)
-                    .filter(|index| !index.is_empty())
-                    .map(|index| index[col_idx].page_locations.clone());
-                SerializedPageReader::new(
-                    data.clone(),
-                    self.row_group_metadata().column(col_idx),
-                    self.row_count,
-                    page_locations,
-                )?
-            }
-        };
-
-        Ok(page_reader)
-    }
-
-    pub(crate) fn parquet_metadata(&self) -> &ParquetMetaData {
-        self.parquet_metadata
-    }
-
-    pub(crate) fn row_group_metadata(&self) -> &RowGroupMetaData {
-        self.parquet_metadata().row_group(self.row_group_idx)
-    }
-}
-
-/// An in-memory collection of column chunks
-pub struct InMemoryRowGroup<'a> {
-    region_id: RegionId,
-    file_id: FileId,
-    row_group_idx: usize,
-    cache_strategy: CacheStrategy,
-    file_path: &'a str,
-    /// Object store.
-    object_store: ObjectStore,
-    base: RowGroupBase<'a>,
-}
-
-impl<'a> InMemoryRowGroup<'a> {
-    /// Creates a new [InMemoryRowGroup] by `row_group_idx`.
-    ///
-    /// # Panics
-    /// Panics if the `row_group_idx` is invalid.
-    pub fn create(
-        region_id: RegionId,
-        file_id: FileId,
-        parquet_meta: &'a ParquetMetaData,
-        row_group_idx: usize,
-        cache_strategy: CacheStrategy,
-        file_path: &'a str,
-        object_store: ObjectStore,
-    ) -> Self {
-        Self {
-            region_id,
-            file_id,
-            row_group_idx,
-            cache_strategy,
-            file_path,
-            object_store,
-            base: RowGroupBase::new(parquet_meta, row_group_idx),
-        }
-    }
-
-    /// Fetches the necessary column data into memory
-    pub async fn fetch(
-        &mut self,
-        projection: &ProjectionMask,
-        selection: Option<&RowSelection>,
-        metrics: Option<&ParquetFetchMetrics>,
-    ) -> Result<()> {
-        if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) {
-            let (fetch_ranges, page_start_offsets) =
-                self.base
-                    .calc_sparse_read_ranges(projection, offset_index, selection);
-
-            let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
-            // Assign sparse chunk data to base.
-            self.base
-                .assign_sparse_chunk(projection, chunk_data, page_start_offsets);
-        } else {
-            // Release the CPU to avoid blocking the runtime. Since `fetch_pages_from_cache`
-            // is a synchronous, CPU-bound operation.
-            yield_now().await;
-
-            // Calculate ranges to read.
-            let fetch_ranges = self.base.calc_dense_read_ranges(projection);
-
-            if fetch_ranges.is_empty() {
-                // Nothing to fetch.
-                return Ok(());
-            }
-
-            // Fetch data with ranges
-            let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
-
-            // Assigns fetched data to base.
-            self.base.assign_dense_chunk(projection, chunk_data);
-        }
-
-        Ok(())
-    }
-
-    /// Try to fetch data from the memory cache or the WriteCache,
-    /// if not in WriteCache, fetch data from object store directly.
-    async fn fetch_bytes(
-        &self,
-        ranges: &[Range<u64>],
-        metrics: Option<&ParquetFetchMetrics>,
-    ) -> Result<Vec<Bytes>> {
-        // Now fetch page timer includes the whole time to read pages.
-        let _timer = READ_STAGE_FETCH_PAGES.start_timer();
-
-        let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec());
-        if let Some(pages) = self.cache_strategy.get_pages(&page_key) {
-            if let Some(metrics) = metrics {
-                let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum();
-                let mut metrics_data = metrics.data.lock().unwrap();
-                metrics_data.page_cache_hit += 1;
-                metrics_data.pages_to_fetch_mem += ranges.len();
-                metrics_data.page_size_to_fetch_mem += total_size;
-                metrics_data.page_size_needed += total_size;
-            }
-            return Ok(pages.compressed.clone());
-        }
-
-        // Calculate total range size for metrics.
-        let (total_range_size, unaligned_size) = compute_total_range_size(ranges);
-
-        let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet);
-        let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now());
-        let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await;
-        let pages = match write_cache_result {
-            Some(data) => {
-                if let Some(metrics) = metrics {
-                    let elapsed = fetch_write_cache_start
-                        .map(|start| start.elapsed())
-                        .unwrap_or_default();
-                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
-                    let mut metrics_data = metrics.data.lock().unwrap();
-                    metrics_data.write_cache_fetch_elapsed += elapsed;
-                    metrics_data.write_cache_hit += 1;
-                    metrics_data.pages_to_fetch_write_cache += ranges.len();
-                    metrics_data.page_size_to_fetch_write_cache += unaligned_size;
-                    metrics_data.page_size_needed += range_size_needed;
-                }
-                data
-            }
-            None => {
-                // Fetch data from object store.
-                let _timer = READ_STAGE_ELAPSED
-                    .with_label_values(&["cache_miss_read"])
-                    .start_timer();
-
-                let start = metrics.map(|_| std::time::Instant::now());
-                let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges)
-                    .await
-                    .map_err(|e| ParquetError::External(Box::new(e)))?;
-                if let Some(metrics) = metrics {
-                    let elapsed = start.map(|start| start.elapsed()).unwrap_or_default();
-                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
-                    let mut metrics_data = metrics.data.lock().unwrap();
-                    metrics_data.store_fetch_elapsed += elapsed;
-                    metrics_data.cache_miss += 1;
-                    metrics_data.pages_to_fetch_store += ranges.len();
-                    metrics_data.page_size_to_fetch_store += unaligned_size;
-                    metrics_data.page_size_needed += range_size_needed;
-                }
-                data
-            }
-        };
-
-        // Put pages back to the cache.
-        let page_value = PageValue::new(pages.clone(), total_range_size);
-        self.cache_strategy
-            .put_pages(page_key, Arc::new(page_value));
-
-        Ok(pages)
-    }
-
-    /// Fetches data from write cache.
-    /// Returns `None` if the data is not in the cache.
-    async fn fetch_ranges_from_write_cache(
-        &self,
-        key: IndexKey,
-        ranges: &[Range<u64>],
-    ) -> Option<Vec<Bytes>> {
-        if let Some(cache) = self.cache_strategy.write_cache() {
-            return cache.file_cache().read_ranges(key, ranges).await;
-        }
-        None
-    }
-}
-
 /// Computes the max possible buffer size to read the given `ranges`.
 /// Returns (aligned_size, unaligned_size) where:
 /// - aligned_size: total size aligned to pooled buffer size
 /// - unaligned_size: actual total size without alignment
 // See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192
-fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
+pub(crate) fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
     if ranges.is_empty() {
         return (0, 0);
     }
@@ -602,96 +235,3 @@ fn align_to_pooled_buf_size(size: u64) -> u64 {
     const POOLED_BUF_SIZE: u64 = 2 * 1024 * 1024;
     size.div_ceil(POOLED_BUF_SIZE) * POOLED_BUF_SIZE
 }
-
-impl RowGroups for InMemoryRowGroup<'_> {
-    fn num_rows(&self) -> usize {
-        self.base.row_count
-    }
-
-    fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>> {
-        // Creates a page reader to read column at `i`.
-        let page_reader = self.base.column_reader(i)?;
-
-        Ok(Box::new(ColumnChunkIterator {
-            reader: Some(Ok(Box::new(page_reader))),
-        }))
-    }
-
-    fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_> {
-        Box::new(std::iter::once(self.base.row_group_metadata()))
-    }
-
-    fn metadata(&self) -> &ParquetMetaData {
-        self.base.parquet_metadata()
-    }
-}
-
-/// An in-memory column chunk
-#[derive(Clone)]
-pub(crate) enum ColumnChunkData {
-    /// Column chunk data representing only a subset of data pages
-    Sparse {
-        /// Length of the full column chunk
-        length: usize,
-        /// Set of data pages included in this sparse chunk. Each element is a tuple
-        /// of (page offset, page data)
-        data: Vec<(usize, Bytes)>,
-    },
-    /// Full column chunk and its offset
-    Dense { offset: usize, data: Bytes },
-}
-
-impl ColumnChunkData {
-    fn get(&self, start: u64) -> Result<Bytes> {
-        match &self {
-            ColumnChunkData::Sparse { data, .. } => data
-                .binary_search_by_key(&start, |(offset, _)| *offset as u64)
-                .map(|idx| data[idx].1.clone())
-                .map_err(|_| {
-                    ParquetError::General(format!(
-                        "Invalid offset in sparse column chunk data: {start}"
-                    ))
-                }),
-            ColumnChunkData::Dense { offset, data } => {
-                let start = start as usize - *offset;
-                Ok(data.slice(start..))
-            }
-        }
-    }
-}
-
-impl Length for ColumnChunkData {
-    fn len(&self) -> u64 {
-        match &self {
-            ColumnChunkData::Sparse { length, .. } => *length as u64,
-            ColumnChunkData::Dense { data, .. } => data.len() as u64,
-        }
-    }
-}
-
-impl ChunkReader for ColumnChunkData {
-    type T = bytes::buf::Reader<Bytes>;
-
-    fn get_read(&self, start: u64) -> Result<Self::T> {
-        Ok(self.get(start)?.reader())
-    }
-
-    fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
-        Ok(self.get(start)?.slice(..length))
-    }
-}
-
-/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`]
-pub(crate) struct ColumnChunkIterator {
-    pub(crate) reader: Option<Result<Box<dyn PageReader>>>,
-}
-
-impl Iterator for ColumnChunkIterator {
-    type Item = Result<Box<dyn PageReader>>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.reader.take()
-    }
-}
-
-impl PageIterator for ColumnChunkIterator {}

From 35c5a4adb7c390969d8d42f1ec23300ad14dc90b Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Wed, 25 Mar 2026 20:26:27 +0800
Subject: [PATCH 40/42] fix(mito2): accept post-truncate flush for skip-wal
 tables (#7858)

Allow flush edits with equal entry ids when flushed sequence advances, so close-time flush after truncate still succeeds for skip-wal regions while stale pre-truncate flushes are rejected. Add a regression test for create->truncate->write->close timing.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
---
 src/mito2/src/engine/skip_wal_test.rs | 77 ++++++++++++++++++++++++++-
 src/mito2/src/region.rs               | 17 +++++-
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/src/mito2/src/engine/skip_wal_test.rs b/src/mito2/src/engine/skip_wal_test.rs
index d1b38c47fb..c59be6ba2c 100644
--- a/src/mito2/src/engine/skip_wal_test.rs
+++ b/src/mito2/src/engine/skip_wal_test.rs
@@ -15,7 +15,9 @@
 use api::v1::Rows;
 use common_wal::options::{WAL_OPTIONS_KEY, WalOptions};
 use store_api::region_engine::{RegionEngine, RegionRole};
-use store_api::region_request::{RegionCloseRequest, RegionRequest};
+use store_api::region_request::{
+    RegionCloseRequest, RegionOpenRequest, RegionRequest, RegionTruncateRequest,
+};
 use store_api::storage::{RegionId, ScanRequest};
 
 use crate::config::MitoConfig;
@@ -168,3 +170,76 @@ async fn test_close_follower_region_skip_wal() {
     let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
     assert_eq!(0, total_rows);
 }
+
+#[tokio::test]
+async fn test_close_region_after_truncate_skip_wal() {
+    common_telemetry::init_default_ut_logging();
+    let mut env = TestEnv::with_prefix("close-truncate-skip-wal").await;
+    let engine = env.create_engine(MitoConfig::default()).await;
+
+    let region_id = RegionId::new(1, 1);
+    let mut request = CreateRequestBuilder::new().build();
+    let wal_options = WalOptions::Noop;
+    request.options.insert(
+        WAL_OPTIONS_KEY.to_string(),
+        serde_json::to_string(&wal_options).unwrap(),
+    );
+
+    engine
+        .handle_request(region_id, RegionRequest::Create(request.clone()))
+        .await
+        .unwrap();
+
+    engine
+        .handle_request(
+            region_id,
+            RegionRequest::Truncate(RegionTruncateRequest::All),
+        )
+        .await
+        .unwrap();
+
+    let region = engine.get_region(region_id).unwrap();
+    let version_data = region.version_control.current();
+    assert_eq!(
+        version_data.version.truncated_entry_id,
+        Some(version_data.last_entry_id)
+    );
+
+    let rows = Rows {
+        schema: rows_schema(&request),
+        rows: build_rows(0, 3),
+    };
+    put_rows(&engine, region_id, rows).await;
+
+    let region = engine.get_region(region_id).unwrap();
+    assert!(!region.version().memtables.is_empty());
+
+    engine
+        .handle_request(region_id, RegionRequest::Close(RegionCloseRequest {}))
+        .await
+        .unwrap();
+
+    engine
+        .handle_request(
+            region_id,
+            RegionRequest::Open(RegionOpenRequest {
+                engine: String::new(),
+                table_dir: request.table_dir,
+                path_type: store_api::region_request::PathType::Bare,
+                options: request.options,
+                skip_wal_replay: false,
+                checkpoint: None,
+            }),
+        )
+        .await
+        .unwrap();
+    let stream = engine
+        .scan_to_stream(region_id, ScanRequest::default())
+        .await
+        .unwrap();
+    let batches = common_recordbatch::RecordBatches::try_collect(stream)
+        .await
+        .unwrap();
+    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(3, total_rows);
+}
diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs
index de8927c4de..3020c9ecf4 100644
--- a/src/mito2/src/region.rs
+++ b/src/mito2/src/region.rs
@@ -973,8 +973,23 @@ impl ManifestContext {
 
             // This is an edit from flush.
             if let Some(flushed_entry_id) = edit.flushed_entry_id {
+                // A flush edit is valid after truncate in two cases:
+                // 1. `flushed_entry_id` moves past `truncated_entry_id`, meaning it definitely
+                //    flushed data newer than the truncate point.
+                // 2. `flushed_entry_id` equals `truncated_entry_id`, but the edit's
+                //    `flushed_sequence` exceeds the manifest's. This happens in skip-WAL tables
+                //    where entry id can stay at 0 while sequence still advances after truncate.
+                //
+                // We still reject stale flushes from before truncate: if entry id is equal and
+                // the edit's sequence is missing or does not advance, the flush is outdated.
+                let is_newer_entry = truncated_entry_id < flushed_entry_id;
+                let is_same_entry_with_newer_sequence = truncated_entry_id == flushed_entry_id
+                    && edit.flushed_sequence.is_some_and(|flushed_sequence| {
+                        manifest.flushed_sequence < flushed_sequence
+                    });
+
                 ensure!(
-                    truncated_entry_id < flushed_entry_id,
+                    is_newer_entry || is_same_entry_with_newer_sequence,
                     RegionTruncatedSnafu {
                         region_id: manifest.metadata.region_id,
                     }

From ec9d57cecc098b72a2382dd4de3817bb18fbdc12 Mon Sep 17 00:00:00 2001
From: Boudewijn van Groos <boudewijn@vangroos.nl>
Date: Wed, 25 Mar 2026 18:58:45 +0100
Subject: [PATCH 41/42] fix: nested views not working (#7857)

Signed-off-by: Boudewijn van Groos <boudewijn.vangroos@foundationzero.org>
---
 src/catalog/src/table_source.rs               |  6 +++-
 .../standalone/common/view/create.result      | 31 ++++++++++++++++++-
 tests/cases/standalone/common/view/create.sql |  8 ++++-
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs
index 132e02fe14..8aabf64e99 100644
--- a/src/catalog/src/table_source.rs
+++ b/src/catalog/src/table_source.rs
@@ -151,7 +151,11 @@ impl DfTableSourceProvider {
         let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone()));
         let logical_plan = self
             .plan_decoder
-            .decode(Bytes::from(view_info.view_info.clone()), catalog_list, true)
+            .decode(
+                Bytes::from(view_info.view_info.clone()),
+                catalog_list,
+                false,
+            )
             .await
             .context(DecodePlanSnafu {
                 name: &table.table_info().name,
diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result
index 1c6e0ee50b..76b9838628 100644
--- a/tests/cases/standalone/common/view/create.result
+++ b/tests/cases/standalone/common/view/create.result
@@ -30,6 +30,10 @@ CREATE VIEW test_view as SELECT * FROM public.numbers;
 
 Affected Rows: 0
 
+CREATE VIEW test_view2 as SELECT * FROM test_view;
+
+Affected Rows: 0
+
 --- View already exists ----
 CREATE VIEW test_view as SELECT * FROM public.numbers;
 
@@ -51,6 +55,7 @@ SHOW TABLES;
 | numbers          |
 | test_table       |
 | test_view        |
+| test_view2       |
 +------------------+
 
 SHOW FULL TABLES;
@@ -61,6 +66,7 @@ SHOW FULL TABLES;
 | numbers          | LOCAL TEMPORARY |
 | test_table       | BASE TABLE      |
 | test_view        | VIEW            |
+| test_view2       | VIEW            |
 +------------------+-----------------+
 
 -- psql: \dv
@@ -124,17 +130,19 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
 |greptime|information_schema|tables|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|public|test_table|BASETABLE|ID|ID|ID|ID|ID|ID|mito|ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N|
 |greptime|public|test_view|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N|
+|greptime|public|test_view2|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N|
 |greptime|information_schema|views|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 +++++++++++++++++++++++++
 
 -- SQLNESS REPLACE (\s\d+\s) ID
 -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME
-SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW';
+SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME;
 
 +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+
 | table_catalog | table_schema | table_name | table_type | table_id | data_length | max_data_length | index_length | max_index_length | avg_row_length | engine | version | row_format | table_rows | data_free | auto_increment | create_time         | update_time         | check_time | table_collation | checksum | create_options | table_comment | temporary |
 +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+
 | greptime      | public       | test_view  | VIEW       |ID    |ID          |ID              |ID           |ID               |ID             |        |ID     | Fixed      |ID         |ID        |ID             |DATETIME |DATETIME |            | utf8_bin        |ID       |                |               | N         |
+| greptime      | public       | test_view2 | VIEW       |ID    |ID          |ID              |ID           |ID               |ID             |        |ID     | Fixed      |ID         |ID        |ID             |DATETIME |DATETIME |            | utf8_bin        |ID       |                |               | N         |
 +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+
 
 SHOW COLUMNS FROM test_view;
@@ -169,10 +177,31 @@ SELECT * FROM test_view LIMIT 10;
 | 9      |
 +--------+
 
+SELECT * FROM test_view2 LIMIT 10;
+
++--------+
+| number |
++--------+
+| 0      |
+| 1      |
+| 2      |
+| 3      |
+| 4      |
+| 5      |
+| 6      |
+| 7      |
+| 8      |
+| 9      |
++--------+
+
 DROP VIEW test_view;
 
 Affected Rows: 0
 
+DROP VIEW test_view2;
+
+Affected Rows: 0
+
 DROP TABLE test_table;
 
 Affected Rows: 0
diff --git a/tests/cases/standalone/common/view/create.sql b/tests/cases/standalone/common/view/create.sql
index b82704d3a9..91149f44f4 100644
--- a/tests/cases/standalone/common/view/create.sql
+++ b/tests/cases/standalone/common/view/create.sql
@@ -16,6 +16,8 @@ CREATE OR REPLACE VIEW test_table as SELECT * FROM public.numbers;
 
 CREATE VIEW test_view as SELECT * FROM public.numbers;
 
+CREATE VIEW test_view2 as SELECT * FROM test_view;
+
 --- View already exists ----
 CREATE VIEW test_view as SELECT * FROM public.numbers;
 
@@ -48,7 +50,7 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
 
 -- SQLNESS REPLACE (\s\d+\s) ID
 -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME
-SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW';
+SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME;
 
 SHOW COLUMNS FROM test_view;
 
@@ -58,8 +60,12 @@ SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'test_view';
 
 SELECT * FROM test_view LIMIT 10;
 
+SELECT * FROM test_view2 LIMIT 10;
+
 DROP VIEW test_view;
 
+DROP VIEW test_view2;
+
 DROP TABLE test_table;
 
 SELECT * FROM test_view LIMIT 10;

From 59dd4186297f4cbc026fbe43c43289b8477f68e9 Mon Sep 17 00:00:00 2001
From: Ruihang Xia <waynestxia@gmail.com>
Date: Thu, 26 Mar 2026 08:08:38 +0800
Subject: [PATCH 42/42] feat: simplify nested aggr inside count query (#7859)

* as optimizer rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* dump changes

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* perf: tighten count-count optimizer rewrite

* extend inner op set

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* simplify and more coverage

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove prom-non-null

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* preserve value column through pruning

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* more sqlness cases

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* enforce is not null before inner aggr

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* finalize

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
---
 src/query/src/optimizer.rs                    |   1 +
 src/query/src/optimizer/count_nest_aggr.rs    | 346 ++++++++++++++++++
 src/query/src/planner.rs                      | 210 ++++++++++-
 src/query/src/promql/planner.rs               | 170 +++++++++
 src/query/src/query_engine/state.rs           |   2 +
 .../standalone/common/promql/scalar.result    | 130 ++++++-
 .../cases/standalone/common/promql/scalar.sql |  56 ++-
 .../tql-explain-analyze/tsid_column.result    |  59 ++-
 .../tql-explain-analyze/tsid_column.sql       |  10 +-
 9 files changed, 973 insertions(+), 11 deletions(-)
 create mode 100644 src/query/src/optimizer/count_nest_aggr.rs

diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs
index 4259b587ba..aaac1e3124 100644
--- a/src/query/src/optimizer.rs
+++ b/src/query/src/optimizer.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod constant_term;
+pub mod count_nest_aggr;
 pub mod count_wildcard;
 pub mod parallelize_scan;
 pub mod pass_distribution;
diff --git a/src/query/src/optimizer/count_nest_aggr.rs b/src/query/src/optimizer/count_nest_aggr.rs
new file mode 100644
index 0000000000..89ba426074
--- /dev/null
+++ b/src/query/src/optimizer/count_nest_aggr.rs
@@ -0,0 +1,346 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use datafusion::config::ConfigOptions;
+use datafusion::functions_aggregate::count::count_udaf;
+use datafusion::logical_expr::{Extension, LogicalPlan, LogicalPlanBuilder, Sort};
+use datafusion_common::Result;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_expr::{Expr, UserDefinedLogicalNodeCore, lit};
+use promql::extension_plan::{InstantManipulate, SeriesDivide, SeriesNormalize};
+use store_api::metric_engine_consts::DATA_SCHEMA_TSID_COLUMN_NAME;
+
+use crate::QueryEngineContext;
+use crate::optimizer::ExtensionAnalyzerRule;
+
+/// Rewrites `count(<presence-preserving-agg>(<vector_selector>) by (...))` into a presence-based
+/// group count.
+///
+/// This stays intentionally narrow:
+/// - the outer aggregate must be plain `count`
+/// - the inner aggregate must be a plain aggregate whose result existence is equivalent to input
+///   group existence
+/// - the inner input must be the direct instant-vector-selector plan
+/// - the outer count must only group by the evaluation timestamp
+#[derive(Debug)]
+pub struct CountNestAggrRule;
+
+impl ExtensionAnalyzerRule for CountNestAggrRule {
+    fn analyze(
+        &self,
+        plan: LogicalPlan,
+        _ctx: &QueryEngineContext,
+        _config: &ConfigOptions,
+    ) -> Result<LogicalPlan> {
+        plan.transform_down(&Self::rewrite_plan).map(|x| x.data)
+    }
+}
+
+impl CountNestAggrRule {
+    fn rewrite_plan(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
+        // Anything that is not a `Sort` node is handed back unchanged.
+        let LogicalPlan::Sort(sort) = plan else {
+            return Ok(Transformed::no(plan));
+        };
+
+        Ok(match Self::try_rewrite_sort(&sort)? {
+            Some(rewritten) => Transformed::yes(rewritten),
+            None => Transformed::no(LogicalPlan::Sort(sort)),
+        })
+    }
+
+    fn try_rewrite_sort(sort: &Sort) -> Result<Option<LogicalPlan>> {
+        if sort.fetch.is_some() {
+            return Ok(None);
+        }
+
+        let LogicalPlan::Aggregate(outer_agg) = sort.input.as_ref() else {
+            return Ok(None);
+        };
+        if outer_agg.group_expr.len() != 1 || outer_agg.aggr_expr.len() != 1 {
+            return Ok(None);
+        }
+        let outer_time_expr = outer_agg.group_expr[0].clone();
+        let outer_count_arg =
+            match Self::aggregate_if(&outer_agg.aggr_expr[0], |name| name == "count") {
+                Some((_, arg)) => arg,
+                None => return Ok(None),
+            };
+
+        let LogicalPlan::Sort(inner_sort) = outer_agg.input.as_ref() else {
+            return Ok(None);
+        };
+        if inner_sort.fetch.is_some() {
+            return Ok(None);
+        }
+
+        let LogicalPlan::Aggregate(inner_agg) = inner_sort.input.as_ref() else {
+            return Ok(None);
+        };
+        if inner_agg.aggr_expr.len() != 1 || inner_agg.group_expr.is_empty() {
+            return Ok(None);
+        }
+        let (inner_is_count, inner_value_expr) =
+            match Self::aggregate_if(&inner_agg.aggr_expr[0], |name| {
+                Self::is_supported_inner_aggregate(name)
+            }) {
+                Some((name, arg)) => (name == "count", arg),
+                None => return Ok(None),
+            };
+        let Expr::Column(_) = inner_value_expr else {
+            return Ok(None);
+        };
+
+        let Expr::Column(outer_count_column) = outer_count_arg else {
+            return Ok(None);
+        };
+        let inner_output_field = inner_agg.schema.field(inner_agg.group_expr.len());
+        if outer_count_column.name != *inner_output_field.name() {
+            return Ok(None);
+        }
+
+        if !Self::is_projection_chain_to_instant(inner_agg.input.as_ref()) {
+            return Ok(None);
+        }
+
+        if !inner_agg
+            .group_expr
+            .iter()
+            .all(|expr| matches!(expr, Expr::Column(_)))
+        {
+            return Ok(None);
+        }
+
+        let Some(time_expr_pos) = inner_agg
+            .group_expr
+            .iter()
+            .position(|expr| expr == &outer_time_expr)
+        else {
+            return Ok(None);
+        };
+
+        let mut presence_group_exprs = Vec::with_capacity(inner_agg.group_expr.len());
+        presence_group_exprs.push(outer_time_expr.clone());
+        presence_group_exprs.extend(
+            inner_agg
+                .group_expr
+                .iter()
+                .enumerate()
+                .filter(|(idx, _)| *idx != time_expr_pos)
+                .map(|(_, expr)| expr.clone()),
+        );
+
+        let mut required_input_columns =
+            Self::collect_required_input_columns(&presence_group_exprs, inner_value_expr);
+        required_input_columns.extend(Self::collect_required_instant_columns(
+            inner_agg.input.as_ref(),
+        ));
+        let presence_source = Self::rebuild_projection_chain_to_instant(
+            inner_agg.input.as_ref(),
+            &required_input_columns,
+        )?;
+
+        let outer_value_name = outer_agg
+            .schema
+            .field(outer_agg.group_expr.len())
+            .name()
+            .clone();
+        let mut presence_input = LogicalPlanBuilder::from(presence_source);
+        if !inner_is_count {
+            presence_input = presence_input.filter(inner_value_expr.clone().is_not_null())?;
+        }
+        let presence_input = presence_input
+            .project(presence_group_exprs.clone())?
+            .distinct()?
+            .build()?;
+
+        let rewritten = LogicalPlanBuilder::from(presence_input)
+            .aggregate(
+                outer_agg.group_expr.clone(),
+                vec![count_udaf().call(vec![lit(1_i64)]).alias(outer_value_name)],
+            )?
+            .sort(sort.expr.clone())?
+            .build()?;
+
+        Ok(Some(rewritten))
+    }
+
+    /// Gathers the names of the columns referenced by the grouping keys and by the
+    /// aggregated value. Only plain `Expr::Column` expressions contribute a name; any
+    /// other expression kind is ignored.
+    fn collect_required_input_columns(group_exprs: &[Expr], value_expr: &Expr) -> HashSet<String> {
+        // The value column is chained after the grouping keys so that it stays in the
+        // pruned instant input: `InstantManipulate` can then still perform stale-NaN
+        // filtering before we project down to keys.
+        group_exprs
+            .iter()
+            .chain(std::iter::once(value_expr))
+            .filter_map(|expr| match expr {
+                Expr::Column(column) => Some(column.name.clone()),
+                _ => None,
+            })
+            .collect()
+    }
+
+    fn collect_required_instant_columns(plan: &LogicalPlan) -> HashSet<String> {
+        let mut required = HashSet::new();
+        Self::collect_required_instant_columns_into(plan, &mut required);
+        required
+    }
+
+    fn collect_required_instant_columns_into(plan: &LogicalPlan, required: &mut HashSet<String>) {
+        match plan {
+            LogicalPlan::Projection(projection) => {
+                Self::collect_required_instant_columns_into(projection.input.as_ref(), required);
+            }
+            LogicalPlan::Extension(extension) => {
+                for expr in extension.node.expressions() {
+                    if let Expr::Column(column) = expr {
+                        required.insert(column.name);
+                    }
+                }
+
+                if extension.node.as_any().is::<SeriesDivide>()
+                    && extension.node.inputs()[0]
+                        .schema()
+                        .fields()
+                        .iter()
+                        .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
+                {
+                    required.insert(DATA_SCHEMA_TSID_COLUMN_NAME.to_string());
+                }
+
+                if let Some(input) = extension.node.inputs().into_iter().next() {
+                    Self::collect_required_instant_columns_into(input, required);
+                }
+            }
+            _ => {}
+        }
+    }
+
+    /// Returns `(name, arg)` for an accepted plain (no filter/distinct/order) one-arg aggregate.
+    fn aggregate_if<F>(expr: &Expr, accept_name: F) -> Option<(&str, &Expr)>
+    where
+        F: FnOnce(&str) -> bool,
+    {
+        let Expr::AggregateFunction(func) = expr else {
+            return None;
+        };
+        let name = func.func.name();
+        if !accept_name(name) {
+            return None;
+        }
+        let params = &func.params;
+        let is_plain = params.filter.is_none() && !params.distinct && params.order_by.is_empty();
+        match params.args.as_slice() {
+            [arg] if is_plain => Some((name, arg)),
+            _ => None,
+        }
+    }
+
+    /// Tells whether `name` is one of the inner aggregate functions this rule accepts.
+    fn is_supported_inner_aggregate(name: &str) -> bool {
+        const ACCEPTED_NAMES: &[&str] =
+            &["count", "sum", "avg", "min", "max", "stddev_pop", "var_pop"];
+        ACCEPTED_NAMES.contains(&name)
+    }
+
+    /// Checks that `plan` is zero or more `Projection` nodes stacked on an extension node
+    /// whose concrete type is `InstantManipulate`. Any other node kind in the chain, or
+    /// any other extension type at its end, yields `false`.
+    fn is_projection_chain_to_instant(plan: &LogicalPlan) -> bool {
+        match plan {
+            LogicalPlan::Projection(projection) => {
+                Self::is_projection_chain_to_instant(projection.input.as_ref())
+            }
+            LogicalPlan::Extension(ext) => ext.node.as_any().is::<InstantManipulate>(),
+            _ => false,
+        }
+    }
+
+    fn rebuild_projection_chain_to_instant(
+        plan: &LogicalPlan,
+        required_columns: &HashSet<String>,
+    ) -> Result<LogicalPlan> {
+        match plan {
+            LogicalPlan::Projection(projection) => {
+                let input = Self::rebuild_projection_chain_to_instant(
+                    projection.input.as_ref(),
+                    required_columns,
+                )?;
+                LogicalPlanBuilder::from(input)
+                    .project(projection.expr.clone())?
+                    .build()
+            }
+            LogicalPlan::Extension(extension) => {
+                if let Some(instant) = extension.node.as_any().downcast_ref::<InstantManipulate>() {
+                    let input =
+                        Self::prune_instant_input(extension.node.inputs()[0], required_columns)?;
+                    return Ok(LogicalPlan::Extension(Extension {
+                        node: Arc::new(instant.with_exprs_and_inputs(vec![], vec![input])?),
+                    }));
+                }
+
+                Ok(plan.clone())
+            }
+            _ => Ok(plan.clone()),
+        }
+    }
+
+    fn prune_instant_input(
+        plan: &LogicalPlan,
+        required_columns: &HashSet<String>,
+    ) -> Result<LogicalPlan> {
+        match plan {
+            LogicalPlan::Extension(extension) => {
+                if let Some(normalize) = extension.node.as_any().downcast_ref::<SeriesNormalize>() {
+                    let input =
+                        Self::prune_instant_input(extension.node.inputs()[0], required_columns)?;
+                    return Ok(LogicalPlan::Extension(Extension {
+                        node: Arc::new(normalize.with_exprs_and_inputs(vec![], vec![input])?),
+                    }));
+                }
+
+                if let Some(divide) = extension.node.as_any().downcast_ref::<SeriesDivide>() {
+                    let divide_input = extension.node.inputs()[0].clone();
+
+                    let projection_exprs = divide_input
+                        .schema()
+                        .fields()
+                        .iter()
+                        .filter(|field| required_columns.contains(field.name()))
+                        .map(|field| {
+                            Expr::Column(datafusion_common::Column::from_name(field.name().clone()))
+                        })
+                        .collect::<Vec<_>>();
+                    let projected_input = LogicalPlanBuilder::from(divide_input)
+                        .project(projection_exprs)?
+                        .build()?;
+
+                    return Ok(LogicalPlan::Extension(Extension {
+                        node: Arc::new(
+                            divide.with_exprs_and_inputs(vec![], vec![projected_input])?,
+                        ),
+                    }));
+                }
+
+                Ok(plan.clone())
+            }
+            _ => Ok(plan.clone()),
+        }
+    }
+}
diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs
index f522dc567a..6b206b9d8d 100644
--- a/src/query/src/planner.rs
+++ b/src/query/src/planner.rs
@@ -278,17 +278,22 @@ impl DfLogicalPlanner {
         let table_provider = DfTableSourceProvider::new(
             self.engine_state.catalog_manager().clone(),
             self.engine_state.disallow_cross_catalog_query(),
-            query_ctx,
+            query_ctx.clone(),
             plan_decoder,
             self.session_state
                 .config_options()
                 .sql_parser
                 .enable_ident_normalization,
         );
-        PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state)
+        let plan = PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state)
             .await
             .map_err(BoxedError::new)
-            .context(QueryPlanSnafu)
+            .context(QueryPlanSnafu)?;
+
+        let context = QueryEngineContext::new(self.session_state.clone(), query_ctx);
+        Ok(self
+            .engine_state
+            .optimize_by_extension_rules(plan, &context)?)
     }
 
     #[tracing::instrument(skip_all)]
@@ -571,15 +576,22 @@ mod tests {
     use std::sync::Arc;
 
     use arrow_schema::DataType;
+    use catalog::RegisterTableRequest;
+    use catalog::memory::MemoryCatalogManager;
+    use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
     use datatypes::prelude::ConcreteDataType;
     use datatypes::schema::{ColumnSchema, Schema};
     use session::context::QueryContext;
+    use store_api::metric_engine_consts::{
+        DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY,
+        METRIC_ENGINE_NAME,
+    };
     use table::metadata::{TableInfoBuilder, TableMetaBuilder};
     use table::test_util::EmptyTable;
 
     use super::*;
-    use crate::QueryEngineRef;
-    use crate::parser::QueryLanguageParser;
+    use crate::parser::{PromQuery, QueryLanguageParser};
+    use crate::{QueryEngineFactory, QueryEngineRef};
 
     async fn create_test_engine() -> QueryEngineRef {
         let columns = vec![
@@ -600,6 +612,109 @@ mod tests {
         crate::tests::new_query_engine_with_table(table)
     }
 
+    fn create_promql_test_engine() -> QueryEngineRef {
+        let catalog_manager = MemoryCatalogManager::with_default_setup();
+        let physical_table_name = "phy";
+        let physical_table_id = 999u32;
+
+        let physical_schema = Arc::new(Schema::new(vec![
+            ColumnSchema::new(
+                DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(),
+                ConcreteDataType::uint32_datatype(),
+                false,
+            ),
+            ColumnSchema::new(
+                DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
+                ConcreteDataType::uint64_datatype(),
+                false,
+            ),
+            ColumnSchema::new("tag_0", ConcreteDataType::string_datatype(), false),
+            ColumnSchema::new("tag_1", ConcreteDataType::string_datatype(), false),
+            ColumnSchema::new(
+                "timestamp",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
+            ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true),
+        ]));
+        let physical_meta = TableMetaBuilder::empty()
+            .schema(physical_schema)
+            .primary_key_indices(vec![0, 1, 2, 3])
+            .value_indices(vec![4, 5])
+            .engine(METRIC_ENGINE_NAME.to_string())
+            .next_column_id(1024)
+            .build()
+            .unwrap();
+        let physical_info = TableInfoBuilder::default()
+            .table_id(physical_table_id)
+            .name(physical_table_name)
+            .meta(physical_meta)
+            .build()
+            .unwrap();
+        catalog_manager
+            .register_table_sync(RegisterTableRequest {
+                catalog: DEFAULT_CATALOG_NAME.to_string(),
+                schema: DEFAULT_SCHEMA_NAME.to_string(),
+                table_name: physical_table_name.to_string(),
+                table_id: physical_table_id,
+                table: EmptyTable::from_table_info(&physical_info),
+            })
+            .unwrap();
+
+        let mut options = table::requests::TableOptions::default();
+        options.extra_options.insert(
+            LOGICAL_TABLE_METADATA_KEY.to_string(),
+            physical_table_name.to_string(),
+        );
+        let logical_schema = Arc::new(Schema::new(vec![
+            ColumnSchema::new("tag_0", ConcreteDataType::string_datatype(), false),
+            ColumnSchema::new("tag_1", ConcreteDataType::string_datatype(), false),
+            ColumnSchema::new(
+                "timestamp",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
+            ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true),
+        ]));
+        let logical_meta = TableMetaBuilder::empty()
+            .schema(logical_schema)
+            .primary_key_indices(vec![0, 1])
+            .value_indices(vec![3])
+            .engine(METRIC_ENGINE_NAME.to_string())
+            .options(options)
+            .next_column_id(1024)
+            .build()
+            .unwrap();
+        let logical_info = TableInfoBuilder::default()
+            .table_id(1024)
+            .name("some_metric")
+            .meta(logical_meta)
+            .build()
+            .unwrap();
+        catalog_manager
+            .register_table_sync(RegisterTableRequest {
+                catalog: DEFAULT_CATALOG_NAME.to_string(),
+                schema: DEFAULT_SCHEMA_NAME.to_string(),
+                table_name: "some_metric".to_string(),
+                table_id: 1024,
+                table: EmptyTable::from_table_info(&logical_info),
+            })
+            .unwrap();
+
+        QueryEngineFactory::new(
+            catalog_manager,
+            None,
+            None,
+            None,
+            None,
+            false,
+            crate::options::QueryOptions::default(),
+        )
+        .query_engine()
+    }
+
     async fn parse_sql_to_plan(sql: &str) -> LogicalPlan {
         let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap();
         let engine = create_test_engine().await;
@@ -610,6 +725,25 @@ mod tests {
             .unwrap()
     }
 
+    async fn parse_promql_to_plan(query: &str) -> LogicalPlan {
+        let engine = create_promql_test_engine();
+        let query_ctx = QueryContext::arc();
+        let stmt = QueryLanguageParser::parse_promql(
+            &PromQuery {
+                query: query.to_string(),
+                start: "0".to_string(),
+                end: "10".to_string(),
+                step: "5s".to_string(),
+                lookback: "300s".to_string(),
+                alias: None,
+            },
+            &query_ctx,
+        )
+        .unwrap();
+
+        engine.planner().plan(&stmt, query_ctx).await.unwrap()
+    }
+
     #[tokio::test]
     async fn test_extract_placeholder_cast_types_multiple() {
         let plan = parse_sql_to_plan(
@@ -646,6 +780,72 @@ mod tests {
         assert_eq!(type_3, &Some(DataType::Int32));
     }
 
+    #[tokio::test]
+    async fn test_plan_pql_applies_extension_rules() {
+        for inner_agg in ["count", "sum", "avg", "min", "max", "stddev", "stdvar"] {
+            let plan = parse_promql_to_plan(&format!(
+                "sum(irate(some_metric[1h])) / scalar(count({inner_agg}(some_metric) by (tag_0)))"
+            ))
+            .await;
+            let plan_str = plan.display_indent_schema().to_string();
+            assert!(plan_str.contains("Distinct:"), "{inner_agg}: {plan_str}");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_plan_pql_filters_null_only_groups_for_non_count_inner_aggs() {
+        let count_plan = parse_promql_to_plan("scalar(count(count(some_metric) by (tag_0)))").await;
+        let count_plan_str = count_plan.display_indent_schema().to_string();
+        assert!(
+            !count_plan_str.contains("field_0 IS NOT NULL"),
+            "{count_plan_str}"
+        );
+
+        for inner_agg in ["sum", "avg", "min", "max", "stddev", "stdvar"] {
+            let plan = parse_promql_to_plan(&format!(
+                "scalar(count({inner_agg}(some_metric) by (tag_0)))"
+            ))
+            .await;
+            let plan_str = plan.display_indent_schema().to_string();
+            assert!(
+                plan_str.contains("field_0 IS NOT NULL"),
+                "{inner_agg}: {plan_str}"
+            );
+        }
+    }
+
+    #[tokio::test]
+    async fn test_plan_pql_skips_extension_rules_for_non_direct_or_unsupported_inner_agg() {
+        for query in [
+            "sum(irate(some_metric[1h])) / scalar(count(sum(irate(some_metric[1h])) by (tag_0)))",
+            "sum(irate(some_metric[1h])) / scalar(count(group(some_metric) by (tag_0)))",
+        ] {
+            let plan = parse_promql_to_plan(query).await;
+            let plan_str = plan.display_indent_schema().to_string();
+            assert!(!plan_str.contains("Distinct:"), "{query}: {plan_str}");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_plan_sql_does_not_apply_nested_count_rule() {
+        let plan = parse_sql_to_plan(
+            "SELECT id, count(inner_count) \
+             FROM ( \
+                 SELECT id, count(name) AS inner_count \
+                 FROM test \
+                 GROUP BY id \
+                 ORDER BY id \
+                 LIMIT 1000000 \
+             ) t \
+             GROUP BY id \
+             ORDER BY id",
+        )
+        .await;
+
+        let plan_str = plan.display_indent_schema().to_string();
+        assert!(!plan_str.contains("Distinct:"), "{plan_str}");
+    }
+
     #[tokio::test]
     async fn test_get_inferred_parameter_types_subquery() {
         let plan = parse_sql_to_plan(
diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs
index b6f4f2d28f..23d654d2b6 100644
--- a/src/query/src/promql/planner.rs
+++ b/src/query/src/promql/planner.rs
@@ -4056,6 +4056,7 @@ mod test {
     use table::test_util::EmptyTable;
 
     use super::*;
+    use crate::QueryEngineContext;
     use crate::options::QueryOptions;
     use crate::parser::QueryLanguageParser;
 
@@ -4073,6 +4074,64 @@ mod test {
         )
     }
 
+    async fn build_optimized_promql_plan(
+        table_provider: DfTableSourceProvider,
+        eval_stmt: &EvalStmt,
+    ) -> LogicalPlan {
+        let state = build_query_engine_state();
+        let raw_plan = PromPlanner::stmt_to_plan(table_provider, eval_stmt, &state)
+            .await
+            .unwrap();
+        let context = QueryEngineContext::new(state.session_state(), QueryContext::arc());
+        state
+            .optimize_by_extension_rules(raw_plan, &context)
+            .unwrap()
+    }
+
+    async fn build_optimized_tsid_plan(
+        query: &str,
+        num_tag: usize,
+        num_field: usize,
+        end_secs: u64,
+        lookback_secs: u64,
+    ) -> String {
+        let eval_stmt = EvalStmt {
+            expr: parser::parse(query).unwrap(),
+            start: UNIX_EPOCH,
+            end: UNIX_EPOCH
+                .checked_add(Duration::from_secs(end_secs))
+                .unwrap(),
+            interval: Duration::from_secs(5),
+            lookback_delta: Duration::from_secs(lookback_secs),
+        };
+        let table_provider = build_test_table_provider_with_tsid(
+            &[(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())],
+            num_tag,
+            num_field,
+        )
+        .await;
+
+        build_optimized_promql_plan(table_provider, &eval_stmt)
+            .await
+            .display_indent_schema()
+            .to_string()
+    }
+
+    async fn assert_nested_count_rewrite_applies(query: &str, expected_outer_agg: &str) {
+        let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
+
+        assert!(plan_str.contains("PromSeriesDivide: tags=[\"__tsid\"]"));
+        assert!(plan_str.contains("Projection: some_metric.timestamp, some_metric.tag_0"));
+        assert!(plan_str.contains("Distinct:"));
+        assert!(plan_str.contains(expected_outer_agg), "{plan_str}");
+        assert!(!plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]"));
+    }
+
+    async fn assert_nested_count_rewrite_missing(query: &str, num_tag: usize, lookback_secs: u64) {
+        let plan_str = build_optimized_tsid_plan(query, num_tag, 1, 100_000, lookback_secs).await;
+        assert!(!plan_str.contains("Distinct:"), "{plan_str}");
+    }
+
     async fn build_test_table_provider(
         table_name_tuples: &[(String, String)],
         num_tag: usize,
@@ -4685,6 +4744,117 @@ mod test {
         );
     }
 
+    #[tokio::test]
+    async fn scalar_count_count_range_keeps_full_window() {
+        let plan_str = build_optimized_tsid_plan(
+            "scalar(count(count(some_metric) by (tag_0)))",
+            1,
+            1,
+            100_000,
+            1,
+        )
+        .await;
+        assert!(plan_str.contains("ScalarCalculate: tags=[]"));
+        assert!(plan_str.contains("PromInstantManipulate: range=[0..100000000]"));
+        assert!(!plan_str.contains("PromInstantManipulate: range=[99999000..99999000]"));
+    }
+
+    #[tokio::test]
+    async fn scalar_count_count_rewrite_applies_inside_binary_expr_for_tsid_input() {
+        let plan_str = build_optimized_tsid_plan(
+            "sum(irate(some_metric[1h])) / scalar(count(count(some_metric) by (tag_0)))",
+            2,
+            1,
+            10,
+            300,
+        )
+        .await;
+        assert!(plan_str.contains("Distinct:"), "{plan_str}");
+    }
+
+    #[tokio::test]
+    async fn nested_count_rewrite_keeps_full_series_key_with_tsid_input() {
+        assert_nested_count_rewrite_applies(
+            "count(count(some_metric) by (tag_0))",
+            "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(count(some_metric.field_0))]]"
+        )
+        .await;
+    }
+
+    #[tokio::test]
+    async fn nested_sum_count_rewrite_keeps_full_series_key_with_tsid_input() {
+        assert_nested_count_rewrite_applies(
+            "count(sum(some_metric) by (tag_0))",
+            "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(sum(some_metric.field_0))]]"
+        )
+        .await;
+    }
+
+    #[tokio::test]
+    async fn nested_supported_inner_aggs_rewrite_apply_for_tsid_input() {
+        for (query, expected_outer_agg) in [
+            (
+                "count(avg(some_metric) by (tag_0))",
+                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(avg(some_metric.field_0))]]",
+            ),
+            (
+                "count(min(some_metric) by (tag_0))",
+                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(min(some_metric.field_0))]]",
+            ),
+            (
+                "count(max(some_metric) by (tag_0))",
+                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(max(some_metric.field_0))]]",
+            ),
+            (
+                "count(stddev(some_metric) by (tag_0))",
+                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(stddev_pop(some_metric.field_0))]]",
+            ),
+            (
+                "count(stdvar(some_metric) by (tag_0))",
+                "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(var_pop(some_metric.field_0))]]",
+            ),
+        ] {
+            assert_nested_count_rewrite_applies(query, expected_outer_agg).await;
+        }
+    }
+
+    #[tokio::test]
+    async fn nested_non_count_inner_aggs_rewrite_filter_null_values_for_tsid_input() {
+        let count_plan =
+            build_optimized_tsid_plan("count(count(some_metric) by (tag_0))", 2, 1, 100_000, 1)
+                .await;
+        assert!(
+            !count_plan.contains("some_metric.field_0 IS NOT NULL"),
+            "{count_plan}"
+        );
+
+        for query in [
+            "count(sum(some_metric) by (tag_0))",
+            "count(avg(some_metric) by (tag_0))",
+            "count(min(some_metric) by (tag_0))",
+            "count(max(some_metric) by (tag_0))",
+            "count(stddev(some_metric) by (tag_0))",
+            "count(stdvar(some_metric) by (tag_0))",
+        ] {
+            let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
+            assert!(
+                plan_str.contains("Filter: some_metric.field_0 IS NOT NULL"),
+                "{query}: {plan_str}"
+            );
+        }
+    }
+
+    #[tokio::test]
+    async fn nested_unsupported_or_non_direct_inner_aggs_do_not_rewrite() {
+        assert_nested_count_rewrite_missing("count(group(some_metric) by (tag_0))", 2, 1).await;
+        assert_nested_count_rewrite_missing(
+            "count(sum(irate(some_metric[1h])) by (tag_0))",
+            2,
+            300,
+        )
+        .await;
+    }
+
     #[tokio::test]
     async fn physical_table_name_is_not_leaked_in_plan() {
         let prom_expr = parser::parse("some_metric").unwrap();
diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs
index a45fc4c896..f696c8b53e 100644
--- a/src/query/src/query_engine/state.rs
+++ b/src/query/src/query_engine/state.rs
@@ -60,6 +60,7 @@ use crate::dist_plan::{
 use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES};
 use crate::optimizer::ExtensionAnalyzerRule;
 use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
+use crate::optimizer::count_nest_aggr::CountNestAggrRule;
 use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
 use crate::optimizer::parallelize_scan::ParallelizeScan;
 use crate::optimizer::pass_distribution::PassDistribution;
@@ -146,6 +147,7 @@ impl QueryEngineState {
 
         // The [`TypeConversionRule`] must be at first
         extension_rules.insert(0, Arc::new(TypeConversionRule) as _);
+        extension_rules.push(Arc::new(CountNestAggrRule) as _);
 
         // Apply the datafusion rules
         let mut analyzer = Analyzer::new();
diff --git a/tests/cases/standalone/common/promql/scalar.result b/tests/cases/standalone/common/promql/scalar.result
index c5c3e5ebd1..c3292b4f5c 100644
--- a/tests/cases/standalone/common/promql/scalar.result
+++ b/tests/cases/standalone/common/promql/scalar.result
@@ -136,6 +136,42 @@ TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host)));
 | 1970-01-01T00:00:15 | 2.0                            |
 +---------------------+--------------------------------+
 
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host)));
+
++---------------------+------------------------------+
+| ts                  | scalar(count(sum(host.val))) |
++---------------------+------------------------------+
+| 1970-01-01T00:00:00 | 2.0                          |
+| 1970-01-01T00:00:05 | 2.0                          |
+| 1970-01-01T00:00:10 | 2.0                          |
+| 1970-01-01T00:00:15 | 2.0                          |
++---------------------+------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host)));
+
++---------------------+------------------------------+
+| ts                  | scalar(count(avg(host.val))) |
++---------------------+------------------------------+
+| 1970-01-01T00:00:00 | 2.0                          |
+| 1970-01-01T00:00:05 | 2.0                          |
+| 1970-01-01T00:00:10 | 2.0                          |
+| 1970-01-01T00:00:15 | 2.0                          |
++---------------------+------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host)));
+
++---------------------+-------------------------------------+
+| ts                  | scalar(count(stddev_pop(host.val))) |
++---------------------+-------------------------------------+
+| 1970-01-01T00:00:00 | 2.0                                 |
+| 1970-01-01T00:00:05 | 2.0                                 |
+| 1970-01-01T00:00:10 | 2.0                                 |
+| 1970-01-01T00:00:15 | 2.0                                 |
++---------------------+-------------------------------------+
+
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"}));
 
@@ -516,7 +552,99 @@ TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6);
 | 1970-01-01T00:00:15 | 6.0                                                     | host1 |
 +---------------------+---------------------------------------------------------+-------+
 
-Drop table host;
+DROP TABLE host;
+
+Affected Rows: 0
+
+CREATE TABLE presence_metric (
+  ts timestamp(3) time index,
+  instance STRING,
+  cpu STRING,
+  shard STRING,
+  val DOUBLE,
+  PRIMARY KEY (instance, cpu, shard),
+);
+
+Affected Rows: 0
+
+INSERT INTO TABLE presence_metric VALUES
+    (0,      'i1', 'cpu0', 'a', 1.0),
+    (0,      'i1', 'cpu0', 'b', 2.0),
+    (0,      'i1', 'cpu1', 'a', 10.0),
+    (0,      'i1', 'cpu2', 'a', 20.0),
+    (0,      'i2', 'cpu9', 'a', 100.0),
+    (200000, 'i1', 'cpu0', 'a', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu1', 'a', 11.0),
+    (200000, 'i1', 'cpu2', 'a', NULL),
+    (200000, 'i2', 'cpu9', 'a', 101.0),
+    (400000, 'i1', 'cpu1', 'a', 12.0),
+    (400000, 'i2', 'cpu9', 'a', 102.0),
+    (600000, 'i1', 'cpu0', 'a', 7.0),
+    (600000, 'i1', 'cpu0', 'b', 8.0),
+    (600000, 'i2', 'cpu9', 'a', 103.0);
+
+Affected Rows: 15
+
+-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2`
+-- still leaves a zero-valued row in `count(...) by (cpu)`.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu);
+
++------+---------------------+----------------------------+
+| cpu  | ts                  | count(presence_metric.val) |
++------+---------------------+----------------------------+
+| cpu0 | 1970-01-01T00:00:00 | 2                          |
+| cpu0 | 1970-01-01T00:10:00 | 2                          |
+| cpu1 | 1970-01-01T00:00:00 | 1                          |
+| cpu1 | 1970-01-01T00:03:20 | 1                          |
+| cpu1 | 1970-01-01T00:06:40 | 1                          |
+| cpu1 | 1970-01-01T00:10:00 | 1                          |
+| cpu2 | 1970-01-01T00:00:00 | 1                          |
+| cpu2 | 1970-01-01T00:03:20 | 0                          |
+| cpu2 | 1970-01-01T00:06:40 | 0                          |
++------+---------------------+----------------------------+
+
+-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu)));
+
++---------------------+-------------------------------------------+
+| ts                  | scalar(count(count(presence_metric.val))) |
++---------------------+-------------------------------------------+
+| 1970-01-01T00:00:00 | 3.0                                       |
+| 1970-01-01T00:03:20 | 2.0                                       |
+| 1970-01-01T00:06:40 | 2.0                                       |
+| 1970-01-01T00:10:00 | 2.0                                       |
++---------------------+-------------------------------------------+
+
+-- Non-count inner aggregates must drop NULL-only groups before the outer count.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu)));
+
++---------------------+-----------------------------------------+
+| ts                  | scalar(count(sum(presence_metric.val))) |
++---------------------+-----------------------------------------+
+| 1970-01-01T00:00:00 | 3.0                                     |
+| 1970-01-01T00:03:20 | 1.0                                     |
+| 1970-01-01T00:06:40 | 1.0                                     |
+| 1970-01-01T00:10:00 | 2.0                                     |
++---------------------+-----------------------------------------+
+
+-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance));
+
++---------------------+-------------------------------------------+
+| ts                  | scalar(count(count(presence_metric.val))) |
++---------------------+-------------------------------------------+
+| 1970-01-01T00:00:00 | NaN                                       |
+| 1970-01-01T00:03:20 | NaN                                       |
+| 1970-01-01T00:06:40 | NaN                                       |
+| 1970-01-01T00:10:00 | NaN                                       |
++---------------------+-------------------------------------------+
+
+DROP TABLE presence_metric;
 
 Affected Rows: 0
 
diff --git a/tests/cases/standalone/common/promql/scalar.sql b/tests/cases/standalone/common/promql/scalar.sql
index b4007bbf15..662f9665fe 100644
--- a/tests/cases/standalone/common/promql/scalar.sql
+++ b/tests/cases/standalone/common/promql/scalar.sql
@@ -43,6 +43,15 @@ TQL EVAL (0, 15, '5s') scalar(host{host="host1"}) + host;
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host)));
 
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host)));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host)));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host)));
+
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"}));
 
@@ -149,4 +158,49 @@ TQL EVAL (0, 15, '5s') clamp(clamp_min(host{host="host1"}, 1), 0, 12);
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6);
 
-Drop table host;
+DROP TABLE host;
+
+CREATE TABLE presence_metric (
+  ts timestamp(3) time index,
+  instance STRING,
+  cpu STRING,
+  shard STRING,
+  val DOUBLE,
+  PRIMARY KEY (instance, cpu, shard),
+);
+
+INSERT INTO TABLE presence_metric VALUES
+    (0,      'i1', 'cpu0', 'a', 1.0),
+    (0,      'i1', 'cpu0', 'b', 2.0),
+    (0,      'i1', 'cpu1', 'a', 10.0),
+    (0,      'i1', 'cpu2', 'a', 20.0),
+    (0,      'i2', 'cpu9', 'a', 100.0),
+    (200000, 'i1', 'cpu0', 'a', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu1', 'a', 11.0),
+    (200000, 'i1', 'cpu2', 'a', NULL),
+    (200000, 'i2', 'cpu9', 'a', 101.0),
+    (400000, 'i1', 'cpu1', 'a', 12.0),
+    (400000, 'i2', 'cpu9', 'a', 102.0),
+    (600000, 'i1', 'cpu0', 'a', 7.0),
+    (600000, 'i1', 'cpu0', 'b', 8.0),
+    (600000, 'i2', 'cpu9', 'a', 103.0);
+
+-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2`
+-- still leaves a zero-valued row in `count(...) by (cpu)`.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu);
+
+-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu)));
+
+-- Non-count inner aggregates must drop NULL-only groups before the outer count.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu)));
+
+-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance));
+
+DROP TABLE presence_metric;
diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.result b/tests/cases/standalone/tql-explain-analyze/tsid_column.result
index 84544b1655..4a7a875060 100644
--- a/tests/cases/standalone/tql-explain-analyze/tsid_column.result
+++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.result
@@ -112,10 +112,63 @@ TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(count(tsid
 |_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED
 |_|_|_RepartitionExec: partitioning=REDACTED
 |_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED
-|_|_|_ProjectionExec: expr=[ts@1 as ts, count(tsid_metric.val)@2 as count(tsid_metric.val)] REDACTED
-|_|_|_AggregateExec: mode=FinalPartitioned, gby=[job@0 as job, ts@1 as ts], aggr=[count(tsid_metric.val)] REDACTED
+|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
 |_|_|_RepartitionExec: partitioning=REDACTED
-|_|_|_AggregateExec: mode=Partial, gby=[job@1 as job, ts@2 as ts], aggr=[count(tsid_metric.val)] REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
+|_|_|_ProjectionExec: expr=[ts@3 as ts, job@1 as job] REDACTED
+|_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED
+|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED
+|_|_|_ProjectionExec: expr=[val@1 as val, job@3 as job, __tsid@2 as __tsid, ts@0 as ts] REDACTED
+|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
+|_|_|_|
+| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED
+|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED
+|_|_|_RepartitionExec: partitioning=REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED
+|_|_|_FilterExec: prom_irate(ts_range,val)@1 IS NOT NULL REDACTED
+|_|_|_ProjectionExec: expr=[ts@2 as ts, prom_irate(ts_range@3, val@0) as prom_irate(ts_range,val)] REDACTED
+|_|_|_PromRangeManipulateExec: req range=[0..10000], interval=[5000], eval range=[3600000], time index=[ts] REDACTED
+|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[ts], filter NaN: [true] REDACTED
+|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED
+|_|_|_ProjectionExec: expr=[val@1 as val, __tsid@2 as __tsid, ts@0 as ts] REDACTED
+|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
+|_|_|_|
+|_|_| Total rows: 2_|
++-+-+-+
+
+-- SQLNESS REPLACE (metrics.*) REDACTED
+-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
+-- SQLNESS REPLACE (-+) -
+-- SQLNESS REPLACE (\s\s+) _
+-- SQLNESS REPLACE (peers.*) REDACTED
+-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
+-- SQLNESS REPLACE (Hash.*) REDACTED
+TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job)));
+
++-+-+-+
+| stage | node | plan_|
++-+-+-+
+| 0_| 0_|_ProjectionExec: expr=[ts@1 as ts, sum(prom_irate(ts_range,val))@2 / scalar(count(sum(tsid_metric.val)))@0 as lhs.sum(prom_irate(ts_range,val)) / rhs.scalar(count(sum(tsid_metric.val)))] REDACTED
+|_|_|_REDACTED
+|_|_|_ScalarCalculateExec: tags=[] REDACTED
+|_|_|_CoalescePartitionsExec REDACTED
+|_|_|_MergeScanExec: REDACTED
+|_|_|_CooperativeExec REDACTED
+|_|_|_MergeScanExec: REDACTED
+|_|_|_|
+| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED
+|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED
+|_|_|_RepartitionExec: partitioning=REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED
+|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
+|_|_|_RepartitionExec: partitioning=REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
+|_|_|_ProjectionExec: expr=[ts@1 as ts, job@0 as job] REDACTED
+|_|_|_FilterExec: val@0 IS NOT NULL, projection=[job@1, ts@2] REDACTED
 |_|_|_ProjectionExec: expr=[val@0 as val, job@1 as job, ts@3 as ts] REDACTED
 |_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED
 |_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED
diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql
index 7b3de23f33..dedce2dfb1 100644
--- a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql
+++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql
@@ -51,6 +51,14 @@ TQL ANALYZE (0, 10, '5s') sum by (job, instance) (tsid_metric);
 -- SQLNESS REPLACE (Hash.*) REDACTED
 TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(count(tsid_metric) by (job)));
 
+-- SQLNESS REPLACE (metrics.*) REDACTED
+-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
+-- SQLNESS REPLACE (-+) -
+-- SQLNESS REPLACE (\s\s+) _
+-- SQLNESS REPLACE (peers.*) REDACTED
+-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
+-- SQLNESS REPLACE (Hash.*) REDACTED
+TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job)));
+
 DROP TABLE tsid_metric;
 DROP TABLE tsid_physical;
-