diff --git a/.github/actions/release-cn-artifacts/action.yaml b/.github/actions/release-cn-artifacts/action.yaml
index 2825d3f5d0..fe78d5a760 100644
--- a/.github/actions/release-cn-artifacts/action.yaml
+++ b/.github/actions/release-cn-artifacts/action.yaml
@@ -37,17 +37,14 @@ inputs:
     description: Whether to push the latest tag of the image
     required: false
     default: 'true'
-  aws-cn-s3-bucket:
-    description: S3 bucket to store released artifacts in CN region
+  proxy-url:
+    description: The url of the S3 proxy server
     required: true
-  aws-cn-access-key-id:
-    description: AWS access key id in CN region
+  proxy-username:
+    description: The username of the S3 proxy
     required: true
-  aws-cn-secret-access-key:
-    description: AWS secret access key in CN region
-    required: true
-  aws-cn-region:
-    description: AWS region in CN
+  proxy-password:
+    description: The password of the S3 proxy
     required: true
   upload-to-s3:
     description: Upload to S3
@@ -77,21 +74,13 @@ runs:
       with:
         path: ${{ inputs.artifacts-dir }}
 
-    - name: Install s5cmd
-      shell: bash
-      run: |
-        wget https://github.com/peak/s5cmd/releases/download/v2.3.0/s5cmd_2.3.0_Linux-64bit.tar.gz
-        tar -xzf s5cmd_2.3.0_Linux-64bit.tar.gz
-        sudo mv s5cmd /usr/local/bin/
-        sudo chmod +x /usr/local/bin/s5cmd
-
     - name: Release artifacts to cn region
       uses: nick-invision/retry@v2
       if: ${{ inputs.upload-to-s3 == 'true' }}
       env:
-        AWS_ACCESS_KEY_ID: ${{ inputs.aws-cn-access-key-id }}
-        AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-cn-secret-access-key }}
-        AWS_REGION: ${{ inputs.aws-cn-region }}
+        PROXY_URL: ${{ inputs.proxy-url }}
+        PROXY_USERNAME: ${{ inputs.proxy-username }}
+        PROXY_PASSWORD: ${{ inputs.proxy-password }}
         UPDATE_VERSION_INFO: ${{ inputs.update-version-info }}
       with:
         max_attempts: ${{ inputs.upload-max-retry-times }}
@@ -99,8 +88,7 @@ runs:
         command: |
           ./.github/scripts/upload-artifacts-to-s3.sh \
             ${{ inputs.artifacts-dir }} \
-            ${{ inputs.version }} \
-            ${{ inputs.aws-cn-s3-bucket }}
+            ${{ inputs.version }}
 
     - name: Push greptimedb image from Dockerhub to ACR
       shell: bash
diff --git a/.github/scripts/upload-artifacts-to-s3.sh b/.github/scripts/upload-artifacts-to-s3.sh
index 75c8f8d932..1ddf32044b 100755
--- a/.github/scripts/upload-artifacts-to-s3.sh
+++ b/.github/scripts/upload-artifacts-to-s3.sh
@@ -5,16 +5,15 @@ set -o pipefail
 
 ARTIFACTS_DIR=$1
 VERSION=$2
-AWS_S3_BUCKET=$3
 RELEASE_DIRS="releases/greptimedb"
 GREPTIMEDB_REPO="GreptimeTeam/greptimedb"
 
 # Check if necessary variables are set.
 function check_vars() {
-  for var in AWS_S3_BUCKET VERSION ARTIFACTS_DIR; do
+  for var in VERSION ARTIFACTS_DIR; do
     if [ -z "${!var}" ]; then
       echo "$var is not set or empty."
-      echo "Usage: $0 <artifacts-dir> <version> <aws-s3-bucket>"
+      echo "Usage: $0 <artifacts-dir> <version>"
       exit 1
     fi
   done
@@ -33,8 +32,13 @@ function upload_artifacts() {
   #    ├── greptime-darwin-amd64-v0.2.0.sha256sum
   #    └── greptime-darwin-amd64-v0.2.0.tar.gz
   find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
-    s5cmd cp \
-      "$file" "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/$VERSION/$(basename "$file")"
+    # --fail: make curl return non-zero on HTTP errors (>= 400) instead of exiting 0.
+    TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION"
+
+    curl --fail -X PUT \
+      -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
+      -F "file=@$file" \
+      "$TARGET_URL"
   done
 }
 
@@ -45,16 +49,24 @@ function update_version_info() {
     if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
       echo "Updating latest-version.txt"
       echo "$VERSION" > latest-version.txt
-      s5cmd cp \
-        latest-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-version.txt"
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
+      # --fail: make curl return non-zero on HTTP errors (>= 400) instead of exiting 0.
+      curl --fail -X PUT \
+        -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
+        -F "file=@latest-version.txt" \
+        "$TARGET_URL"
     fi
 
     # If it's the nightly release, update latest-nightly-version.txt.
     if [[ "$VERSION" == *"nightly"* ]]; then
       echo "Updating latest-nightly-version.txt"
       echo "$VERSION" > latest-nightly-version.txt
-      s5cmd cp \
-        latest-nightly-version.txt "s3://$AWS_S3_BUCKET/$RELEASE_DIRS/latest-nightly-version.txt"
+      # --fail: make curl return non-zero on HTTP errors (>= 400) instead of exiting 0.
+      TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
+      curl --fail -X PUT \
+        -u "$PROXY_USERNAME:$PROXY_PASSWORD" \
+        -F "file=@latest-nightly-version.txt" \
+        "$TARGET_URL"
     fi
   fi
 }
@@ -93,10 +105,10 @@ function main() {
 }
 
 # Usage example:
-#   AWS_ACCESS_KEY_ID=<your_access_key_id> \
-#   AWS_SECRET_ACCESS_KEY=<your_secret_access_key> \
-#   AWS_DEFAULT_REGION=<your_region> \
+#   PROXY_URL=<proxy_url> \
+#   PROXY_USERNAME=<proxy_username> \
+#   PROXY_PASSWORD=<proxy_password> \
 #   UPDATE_VERSION_INFO=true \
 #   DOWNLOAD_ARTIFACTS_FROM_GITHUB=false \
-#     ./upload-artifacts-to-s3.sh <artifacts-dir> <version> <aws-s3-bucket>
+#     ./upload-artifacts-to-s3.sh <artifacts-dir> <version>
 main
diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml
index 021867e4ed..d03fbeff14 100644
--- a/.github/workflows/dev-build.yml
+++ b/.github/workflows/dev-build.yml
@@ -285,10 +285,9 @@ jobs:
           dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
           dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
           version: ${{ needs.allocate-runners.outputs.version }}
-          aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
-          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
-          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
-          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          proxy-url: ${{ secrets.PROXY_URL }}
+          proxy-username: ${{ secrets.PROXY_USERNAME }}
+          proxy-password: ${{ secrets.PROXY_PASSWORD }}
           upload-to-s3: ${{ inputs.upload_artifacts_to_s3 }}
           dev-mode: true                     # Only build the standard images(exclude centos images).
           push-latest-tag: false             # Don't push the latest tag to registry.
diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml
index 0238e92c8d..b6ab0f8926 100644
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -319,7 +319,13 @@ jobs:
         include:
           - target: "fuzz_repartition_table"
             mode:
-              name: "Local WAL Repartition GC"
+              name: "Local WAL mito table repartition"
+              minio: true
+              kafka: false
+              values: "with-minio-repartition-gc.yaml"
+          - target: "fuzz_repartition_metric_table"
+            mode:
+              name: "Local WAL metric table repartition"
               minio: true
               kafka: false
               values: "with-minio-repartition-gc.yaml"
@@ -455,6 +461,14 @@ jobs:
           path: /tmp/fuzz-monitor-dumps
           if-no-files-found: warn
           retention-days: 3
+      - name: Upload CSV dumps
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzz-tests-csv-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
+          path: /tmp/greptime-fuzz-dumps
+          if-no-files-found: warn
+          retention-days: 3
       - name: Delete cluster
         if: success()
         shell: bash
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
index 9eaa38c789..14ebb6e715 100644
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -236,10 +236,9 @@ jobs:
           dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
           dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
           version: ${{ needs.allocate-runners.outputs.version }}
-          aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
-          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
-          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
-          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          proxy-url: ${{ secrets.PROXY_URL }}
+          proxy-username: ${{ secrets.PROXY_USERNAME }}
+          proxy-password: ${{ secrets.PROXY_PASSWORD }}
           upload-to-s3: false
           dev-mode: false
           update-version-info: false  # Don't update version info in S3.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3b0eb2d68c..9f8f2d9703 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -358,10 +358,9 @@ jobs:
           dst-image-registry: ${{ vars.ACR_IMAGE_REGISTRY }}
           dst-image-namespace: ${{ vars.IMAGE_NAMESPACE }}
           version: ${{ needs.allocate-runners.outputs.version }}
-          aws-cn-s3-bucket: ${{ vars.AWS_RELEASE_BUCKET }}
-          aws-cn-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
-          aws-cn-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
-          aws-cn-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
+          proxy-url: ${{ secrets.PROXY_URL }}
+          proxy-username: ${{ secrets.PROXY_USERNAME }}
+          proxy-password: ${{ secrets.PROXY_PASSWORD }}
           dev-mode: false
           upload-to-s3: true
           update-version-info: true
diff --git a/.gitignore b/.gitignore
index 862eb8c5b4..87412d570c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,3 +70,6 @@ CLAUDE.md
 
 # AGENTS.md
 AGENTS.md
+
+# local design docs
+docs/specs/
diff --git a/Cargo.lock b/Cargo.lock
index 85c2b1ed2d..32f9aa27d4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1946,6 +1946,7 @@ dependencies = [
  "tokio",
  "tracing-appender",
  "url",
+ "uuid",
 ]
 
 [[package]]
@@ -2488,7 +2489,6 @@ version = "1.0.0-rc.2"
 dependencies = [
  "common-error",
  "common-macro",
- "common-telemetry",
  "humantime",
  "serde",
  "snafu 0.8.6",
@@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -7887,6 +7887,7 @@ dependencies = [
  "common-base",
  "common-error",
  "common-function",
+ "common-grpc",
  "common-macro",
  "common-meta",
  "common-query",
@@ -9619,9 +9620,9 @@ dependencies = [
 
 [[package]]
 name = "pgwire"
-version = "0.38.0"
+version = "0.38.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89d5e5a60d3f6e40c91f6a2a7f8d09665e636272bd5611977253559b6651aabb"
+checksum = "f2a798d130b8975a566c2cf6d8955746e1f09a9ee2c3ff2e6020a2c6528c5bd1"
 dependencies = [
  "async-trait",
  "base64 0.22.1",
@@ -10771,9 +10772,9 @@ dependencies = [
 
 [[package]]
 name = "quinn-proto"
-version = "0.11.12"
+version = "0.11.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
 dependencies = [
  "bytes",
  "getrandom 0.3.3",
@@ -11634,9 +11635,9 @@ dependencies = [
 
 [[package]]
 name = "rustls-webpki"
-version = "0.103.3"
+version = "0.103.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
 dependencies = [
  "ring",
  "rustls-pki-types",
@@ -13403,9 +13404,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
 [[package]]
 name = "tar"
-version = "0.4.44"
+version = "0.4.45"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a"
+checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
 dependencies = [
  "filetime",
  "libc",
diff --git a/docs/rfcs/2025-12-30-export-import-v2.md b/docs/rfcs/2025-12-30-export-import-v2.md
index 197eb7cc9d..6bc8428300 100644
--- a/docs/rfcs/2025-12-30-export-import-v2.md
+++ b/docs/rfcs/2025-12-30-export-import-v2.md
@@ -67,6 +67,7 @@ snapshot-20250101/
 - Self-contained (all information needed for restore)
 - Immutable (content never changes after creation)
 - Verifiable (checksums at file, chunk, and snapshot levels)
+- Schema-only snapshots contain only `manifest.json` and `schema/`; `data/` is absent, `chunks` is empty, and later data append is rejected (use `--force` to recreate)
 
 ### Chunk
 
@@ -116,6 +117,8 @@ greptime export create \
   --schema-only \
   --to s3://my-bucket/snapshots/prod-schema-only
 
+# Note: schema-only snapshots cannot be resumed with data; use --force to recreate.
+
 # Export with specific format (default: parquet)
 greptime export create \
   --format csv \
@@ -173,7 +176,9 @@ The manifest is a JSON file containing snapshot metadata and chunk index:
 - `snapshot_id`: Unique identifier (UUID)
 - `catalog`, `schemas`: Catalog and schema list
 - `time_range`: Overall time range covered
+- `schema_only`: Whether the snapshot contains schema only
 - `chunks[]`: Array of chunk metadata
+- `format`: Data format for exported files
 - `checksum`: Snapshot-level SHA256 checksum
 
 **Chunk metadata structure**:
@@ -182,7 +187,7 @@ Each chunk entry in the manifest contains:
 
 - `id`: Chunk identifier (sequential number)
 - `time_range`: Start and end timestamps
-- `status`: Export status (Pending, Completed, Failed)
+- `status`: Export status (Pending, InProgress, Completed, Failed)
 - `files`: List of data files in the chunk directory
 - `checksum`: Chunk-level checksum for integrity verification
 
@@ -292,9 +297,9 @@ Checksums are verified during import before data is written to the database.
 
 **Resume capability**:
 
-- Manifest tracks chunk status (Pending, Completed, Failed)
+- Manifest tracks chunk status (Pending, InProgress, Completed, Failed)
 - Export/import automatically resumes when executed on existing snapshot
-- Skips completed chunks, retries failed chunks, processes pending chunks
+- Skips completed chunks, retries failed/in-progress chunks, processes pending chunks
 - Works across process restarts
 - Use `--force` (export only) to delete existing snapshot and start over
 
diff --git a/docs/rfcs/2026-03-16-flow-inc-query.md b/docs/rfcs/2026-03-16-flow-inc-query.md
new file mode 100644
index 0000000000..8041d37d2b
--- /dev/null
+++ b/docs/rfcs/2026-03-16-flow-inc-query.md
@@ -0,0 +1,190 @@
+---
+Feature Name: Flow Batching Sequence-Based Incremental Query Plan (Lite)
+Tracking Issue: TBD
+Date: 2026-03-16
+Author: @discord9
+---
+
+# Summary
+
+This RFC proposes a correctness-first incremental query mode for Flow batching.
+Flow queries can read only `seq > checkpoint` and advance checkpoints using per-region correctness watermarks.
+When incremental reads are stale or correctness cannot be proven, Flow falls back to full recomputation.
+
+# Motivation
+
+Flow batching currently has to recompute old data in the same time window repeatedly. Reading only newly written rows through an incremental query avoids this repeated work and improves Flow performance.
+
+# Goals
+
+1. Add opt-in incremental reads (`seq > given_seq`) for Flow.
+2. Return per-region correctness watermarks for checkpoint advancement.
+3. Keep existing query behavior unchanged unless explicitly enabled.
+4. Define deterministic fallback for stale or unprovable incremental reads.
+
+# Non-Goals
+
+1. No business-schema changes (no synthetic watermark columns in result rows).
+2. No global throughput optimization in v1 (correctness first).
+3. No observational watermark output when correctness is unprovable.
+
+# Proposal
+
+## 1) Query options
+
+Introduce three `QueryContext` extension keys:
+
+- `flow.incremental_after_seqs`
+- `flow.incremental_mode`
+- `flow.return_region_seq`
+
+These options are opt-in and only affect Flow incremental execution paths.
+
+## 2) Scan mapping
+
+When incremental mode is enabled:
+
+- map `after_seq` to `memtable_min_sequence` (exclusive lower bound)
+- keep existing snapshot upper-bound behavior (`memtable_max_sequence`)
+
+Important limitation in v1:
+
+- incremental filtering is correctness-proven only for memtable rows
+- SST files do not preserve detailed row-level sequence metadata; they only expose coarser file-level sequence information
+- therefore `seq > checkpoint` must not assume precise incremental pruning across memtable->SST flush boundaries
+
+If required incremental parameters are missing or invalid, return an argument error.
+
+## 3) Stale protection
+
+Add a dedicated stale error:
+
+- `IncrementalQueryStale { region_id, given_seq, min_readable_seq }`
+
+Behavior:
+
+- if `given_seq < min_readable_seq`, return stale error
+- if `given_seq == min_readable_seq`, query is valid and reads `seq > given_seq`
+- if `given_seq > min_readable_seq`, query is also valid and reads `seq > given_seq`
+
+`IncrementalQueryStale` also covers the case where rows newer than the checkpoint have crossed a memtable->SST flush boundary and sequence-precise incremental exclusion can no longer be proven.
+In other words, the flush-boundary case is not a separate fallback category in v1; it is one concrete way an incremental cursor becomes stale.
+
+## 4) Watermark return
+
+Extend query metrics with an optional per-region watermark map:
+
+- `region_latest_sequences: Vec<(region_id: u64, latest_sequence: u64)>`
+
+Rules:
+
+- only the terminal metrics of a successful query can advance checkpoints
+- for a multi-region query, the watermark must be either a complete map covering every participating region or absent
+- if correctness is unprovable, business rows may still be returned but the watermark is absent
+
+## 5) Flow state machine
+
+Checkpoint and watermark state are kept only in flownode memory in v1; they are not persisted as durable flow metadata.
+Cold start or flownode restart therefore always re-enters through a full snapshot read.
+Only after that full query succeeds with a complete correctness watermark may Flow switch back to incremental mode.
+
+Flow starts in full mode, then transitions:
+
+1. Full query succeeds with correctness watermark -> enter incremental mode
+2. Incremental query succeeds with correctness watermark -> advance checkpoint
+3. Incremental stale/failure -> fallback to full mode
+4. Full query without correctness watermark -> remain in full mode
+
+```mermaid
+stateDiagram-v2
+    [*] --> FullSnapshot: Flow starts
+
+    state FullSnapshot {
+        [*] --> RunFull
+        RunFull --> RunFull: Full query succeeds but watermark is unprovable<br/>no region_latest_sequences returned
+    }
+
+    FullSnapshot --> Incremental: Full query succeeds and correctness watermark is returned<br/>(checkpoint updated)
+
+    state Incremental {
+        [*] --> RunInc
+        RunInc --> RunInc: Incremental succeeds<br/>(checkpoint advances)
+    }
+
+    Incremental --> FullSnapshot: IncrementalQueryStale<br/>(cursor too old, fallback required)
+    Incremental --> FullSnapshot: Incremental fails<br/> and fallback policy is triggered
+
+    FullSnapshot --> [*]: Flow stops
+    Incremental --> [*]: Flow stops
+```
+
+### Fallback Policy
+
+Fallback to full mode is deterministic and is triggered by any of the following:
+
+1. `IncrementalQueryStale` is returned.
+2. Incremental query fails with execution errors.
+3. Incremental query succeeds but watermark is absent or incomplete for participating regions.
+
+Policy behavior:
+
+1. Do not advance any checkpoint in the failed/incomplete round.
+2. Switch to full mode for the affected flow/window in the next round.
+3. Return to incremental mode only after a full query succeeds with a complete correctness watermark map.
+
+### Persistence and recovery model
+
+The v1 design is intentionally correctness-first and keeps the progress cursor lightweight:
+
+1. Watermarks/checkpoints live only in flownode memory; v1 does not persist them separately.
+2. On cold start, the flow re-establishes progress by running a successful full-query snapshot read, then resumes incremental mode only after that round returns a complete correctness watermark map.
+3. Sequence-precise incremental correctness is currently limited to rows still visible in memtables.
+4. Once relevant rows have been flushed into SST, the system cannot use `seq > checkpoint` alone to prove precise incremental exclusion, because SST lacks detailed row-level sequence metadata.
+5. In that case the correct behavior is to fall back to full recomputation, not to continue a best-effort incremental scan.
+
+# Distributed and Compatibility Requirements
+
+1. Distributed path must preserve region-level snapshot/read-bound semantics end-to-end.
+2. `snapshot_seqs` transport and `flow.*` options must both be carried correctly.
+   - `snapshot_seqs` means the per-region snapshot upper-bound map: `region_id -> sequence`.
+3. New metrics fields must be backward-compatible (old clients ignore unknown fields).
+
+# Rollout Plan
+
+## Phase 1 (MVP, correctness first)
+
+1. Add extension constants and parsing.
+2. Add incremental scan mapping and stale detection.
+3. Add watermark metrics field and terminal-watermark checkpoint update path.
+4. Complete standalone and distributed passthrough.
+
+## Phase 2 (performance and observability)
+
+1. Improve batching key strategy with sequence/watermark context.
+2. Optimize watermark serialization overhead.
+3. Add metrics: incremental hit rate, fallback rate, fallback window size.
+
+# Testing Plan
+
+1. Unit tests for incremental bounds and stale detection.
+2. Query-path tests for extension mapping and watermark semantics.
+3. Flow integration tests for full->incremental->fallback transitions.
+4. Distributed tests for end-to-end snapshot/watermark propagation.
+5. Compatibility tests for old/new client-server combinations.
+
+# Risks
+
+1. Boundary semantic mismatch (`<` vs `<=`) may cause correctness bugs.
+2. Incomplete distributed propagation can silently invalidate watermark safety.
+3. Frequent fallback can reduce throughput before phase-2 optimizations.
+4. Memtable->SST flushes may force more full recomputation than expected until finer-grained SST sequence tracking exists.
+
+# Alternatives
+
+1. Put watermark into business rows (rejected: schema pollution).
+2. Add new dedicated Flight message type in v1 (deferred to reduce scope).
+
+# Conclusion
+
+This plan enables a practical, correctness-first incremental path for Flow batching.
+It reuses existing sequence scan capability, adds strict stale handling, and advances checkpoints only from correctness-proven per-region watermarks.
diff --git a/src/catalog/src/kvbackend/table_cache.rs b/src/catalog/src/kvbackend/table_cache.rs
index ea328c3e17..42b3fbc74b 100644
--- a/src/catalog/src/kvbackend/table_cache.rs
+++ b/src/catalog/src/kvbackend/table_cache.rs
@@ -65,11 +65,13 @@ fn init_factory(
 
 fn invalidator<'a>(
     cache: &'a Cache<TableName, TableRef>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, MetaResult<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableName(table_name) = ident {
-            cache.invalidate(table_name).await
+        for ident in idents {
+            if let CacheIdent::TableName(table_name) = ident {
+                cache.invalidate(table_name).await
+            }
         }
         Ok(())
     })
diff --git a/src/catalog/src/system_schema/information_schema/region_peers.rs b/src/catalog/src/system_schema/information_schema/region_peers.rs
index 5bc91d207e..b1438ef53d 100644
--- a/src/catalog/src/system_schema/information_schema/region_peers.rs
+++ b/src/catalog/src/system_schema/information_schema/region_peers.rs
@@ -267,7 +267,7 @@ impl InformationSchemaRegionPeersBuilder {
             ];
 
             if !predicates.eval(&row) {
-                return;
+                continue;
             }
 
             self.table_catalogs.push(Some(table_catalog));
diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs
index 132e02fe14..8aabf64e99 100644
--- a/src/catalog/src/table_source.rs
+++ b/src/catalog/src/table_source.rs
@@ -151,7 +151,11 @@ impl DfTableSourceProvider {
         let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone()));
         let logical_plan = self
             .plan_decoder
-            .decode(Bytes::from(view_info.view_info.clone()), catalog_list, true)
+            .decode(
+                Bytes::from(view_info.view_info.clone()),
+                catalog_list,
+                false,
+            )
             .await
             .context(DecodePlanSnafu {
                 name: &table.table_info().name,
diff --git a/src/cli/Cargo.toml b/src/cli/Cargo.toml
index 46e79efd00..1eb2736007 100644
--- a/src/cli/Cargo.toml
+++ b/src/cli/Cargo.toml
@@ -65,6 +65,8 @@ store-api.workspace = true
 table.workspace = true
 tokio.workspace = true
 tracing-appender.workspace = true
+url.workspace = true
+uuid.workspace = true
 
 [dev-dependencies]
 common-meta = { workspace = true, features = ["testing"] }
@@ -72,4 +74,3 @@ common-test-util.workspace = true
 common-version.workspace = true
 serde.workspace = true
 tempfile.workspace = true
-url.workspace = true
diff --git a/src/cli/src/data.rs b/src/cli/src/data.rs
index 5966040a3b..114886542e 100644
--- a/src/cli/src/data.rs
+++ b/src/cli/src/data.rs
@@ -13,7 +13,12 @@
 // limitations under the License.
 
 mod export;
+pub mod export_v2;
 mod import;
+pub mod import_v2;
+pub(crate) mod path;
+pub mod snapshot_storage;
+pub(crate) mod sql;
 mod storage_export;
 
 use clap::Subcommand;
@@ -22,15 +27,24 @@ use common_error::ext::BoxedError;
 
 use crate::Tool;
 use crate::data::export::ExportCommand;
+use crate::data::export_v2::ExportV2Command;
 use crate::data::import::ImportCommand;
+use crate::data::import_v2::ImportV2Command;
 
 pub(crate) const COPY_PATH_PLACEHOLDER: &str = "<PATH/TO/FILES>";
 
 /// Command for data operations including exporting data from and importing data into GreptimeDB.
 #[derive(Subcommand)]
 pub enum DataCommand {
+    /// Export data (V1 - legacy).
     Export(ExportCommand),
+    /// Import data (V1 - legacy).
     Import(ImportCommand),
+    /// Export V2 - JSON-based schema export with manifest support.
+    #[clap(subcommand)]
+    ExportV2(ExportV2Command),
+    /// Import V2 - Import from V2 snapshot.
+    ImportV2(ImportV2Command),
 }
 
 impl DataCommand {
@@ -38,6 +52,8 @@ impl DataCommand {
         match self {
             DataCommand::Export(cmd) => cmd.build().await,
             DataCommand::Import(cmd) => cmd.build().await,
+            DataCommand::ExportV2(cmd) => cmd.build().await,
+            DataCommand::ImportV2(cmd) => cmd.build().await,
         }
     }
 }
diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs
index 1cdb159336..b5d547d4f3 100644
--- a/src/cli/src/data/export.rs
+++ b/src/cli/src/data/export.rs
@@ -107,13 +107,16 @@ pub struct ExportCommand {
     #[clap(long, value_parser = humantime::parse_duration)]
     timeout: Option<Duration>,
 
-    /// The proxy server address to connect, if set, will override the system proxy.
+    /// The proxy server address to connect.
     ///
-    /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
     #[clap(long)]
     proxy: Option<String>,
 
-    /// Disable proxy server, if set, will not use any proxy.
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
     #[clap(long)]
     no_proxy: bool,
 
@@ -173,6 +176,7 @@ impl ExportCommand {
             // Treats `None` as `0s` to disable server-side default timeout.
             self.timeout.unwrap_or_default(),
             proxy,
+            self.no_proxy,
         );
 
         Ok(Box::new(Export {
diff --git a/src/cli/src/data/export_v2.rs b/src/cli/src/data/export_v2.rs
new file mode 100644
index 0000000000..91020d2f2e
--- /dev/null
+++ b/src/cli/src/data/export_v2.rs
@@ -0,0 +1,49 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Export V2 module.
+//!
+//! This module provides the V2 implementation of database export functionality,
+//! featuring:
+//! - JSON-based schema export (version-agnostic)
+//! - Manifest-based snapshot management
+//! - Support for multiple storage backends (S3, OSS, GCS, Azure Blob, local FS)
+//! - Resume capability for interrupted exports
+//!
+//! # Example
+//!
+//! ```bash
+//! # Export schema only
+//! greptime cli data export-v2 create \
+//!   --addr 127.0.0.1:4000 \
+//!   --to file:///tmp/snapshot \
+//!   --schema-only
+//!
+//! # Export with time range (M2)
+//! greptime cli data export-v2 create \
+//!   --addr 127.0.0.1:4000 \
+//!   --to s3://bucket/snapshots/prod-20250101 \
+//!   --start-time 2025-01-01T00:00:00Z \
+//!   --end-time 2025-01-31T23:59:59Z
+//! ```
+
+mod command;
+pub mod error;
+pub mod extractor;
+pub mod manifest;
+pub mod schema;
+pub use command::ExportV2Command;
+
+#[cfg(test)]
+mod tests;
diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs
new file mode 100644
index 0000000000..341436fe0f
--- /dev/null
+++ b/src/cli/src/data/export_v2/command.rs
@@ -0,0 +1,496 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Export V2 CLI commands.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use clap::{Parser, Subcommand};
+use common_error::ext::BoxedError;
+use common_telemetry::info;
+use serde_json::Value;
+use snafu::{OptionExt, ResultExt};
+
+use crate::Tool;
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::error::{
+    CannotResumeSchemaOnlySnafu, DataExportNotImplementedSnafu, DatabaseSnafu, EmptyResultSnafu,
+    ManifestVersionMismatchSnafu, Result, UnexpectedValueTypeSnafu,
+};
+use crate::data::export_v2::extractor::SchemaExtractor;
+use crate::data::export_v2::manifest::{DataFormat, MANIFEST_VERSION, Manifest};
+use crate::data::path::ddl_path_for_schema;
+use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
+use crate::database::{DatabaseClient, parse_proxy_opts};
+
+// NOTE: clap's derive macros reuse the `///` comments on this enum and its
+// variants as CLI help text, so editing them changes what users see.
+/// Export V2 commands.
+#[derive(Debug, Subcommand)]
+pub enum ExportV2Command {
+    /// Create a new snapshot.
+    // Carries the arguments parsed for the `create` subcommand.
+    Create(ExportCreateCommand),
+}
+
+impl ExportV2Command {
+    /// Builds the runnable [`Tool`] for whichever subcommand was parsed.
+    ///
+    /// This only dispatches: the selected subcommand's own `build` does the
+    /// work and its result is returned unchanged.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        match self {
+            Self::Create(create) => create.build().await,
+        }
+    }
+}
+
+// Arguments of the `create` subcommand.
+//
+// NOTE: clap's derive macros reuse the `///` comments below as CLI help text,
+// so maintainer-facing notes in this struct are written as plain `//` comments.
+/// Create a new snapshot.
+#[derive(Debug, Parser)]
+pub struct ExportCreateCommand {
+    /// Server address to connect (e.g., 127.0.0.1:4000).
+    #[clap(long)]
+    addr: String,
+
+    // Validated with `validate_uri` and opened via `OpenDalStorage::from_uri` in `build`.
+    /// Target storage location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    to: String,
+
+    /// Catalog name.
+    #[clap(long, default_value = "greptime")]
+    catalog: String,
+
+    // `build` turns an empty list into `None`, i.e. "no explicit selection".
+    /// Schema list to export (default: all non-system schemas).
+    /// Can be specified multiple times or comma-separated.
+    #[clap(long, value_delimiter = ',')]
+    schemas: Vec<String>,
+
+    // `build` currently fails with `DataExportNotImplemented` unless this flag is set.
+    /// Export schema only, no data.
+    #[clap(long)]
+    schema_only: bool,
+
+    // NOTE(review): `start_time` and `end_time` are accepted but never read by
+    // `build`; presumably reserved for the data export milestone — confirm.
+    /// Time range start (ISO 8601 format, e.g., 2024-01-01T00:00:00Z).
+    #[clap(long)]
+    start_time: Option<String>,
+
+    /// Time range end (ISO 8601 format, e.g., 2024-12-31T23:59:59Z).
+    #[clap(long)]
+    end_time: Option<String>,
+
+    // Forwarded to `ExportCreate::_format`, which no method reads yet.
+    /// Data format: parquet, csv, json.
+    #[clap(long, value_enum, default_value = "parquet")]
+    format: DataFormat,
+
+    /// Delete existing snapshot and recreate.
+    #[clap(long)]
+    force: bool,
+
+    // Forwarded to `ExportCreate::_parallelism`, which no method reads yet.
+    /// Concurrency level (for future use).
+    #[clap(long, default_value = "1")]
+    parallelism: usize,
+
+    /// Basic authentication (user:password).
+    #[clap(long)]
+    auth_basic: Option<String>,
+
+    // Parsed by `humantime::parse_duration`; `build` falls back to 60 seconds when absent.
+    /// Request timeout.
+    #[clap(long, value_parser = humantime::parse_duration)]
+    timeout: Option<Duration>,
+
+    /// Proxy server address.
+    ///
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
+    #[clap(long)]
+    proxy: Option<String>,
+
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
+    #[clap(long)]
+    no_proxy: bool,
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ExportCreateCommand {
+    /// Validates the parsed arguments and assembles an [`ExportCreate`] tool.
+    ///
+    /// Steps, in order:
+    /// 1. Validate the `--to` URI.
+    /// 2. Reject the run unless `--schema-only` is set (data export is not
+    ///    implemented yet).
+    /// 3. Open the snapshot storage for the target URI.
+    /// 4. Build the database client from the proxy options, using a 60-second
+    ///    timeout when `--timeout` is not given.
+    ///
+    /// # Errors
+    ///
+    /// Returns a [`BoxedError`] for an invalid URI, for a non-schema-only
+    /// request, for a storage construction failure, or for invalid proxy options.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        validate_uri(&self.to).map_err(BoxedError::new)?;
+
+        if !self.schema_only {
+            let not_implemented = DataExportNotImplementedSnafu.build();
+            return Err(BoxedError::new(not_implemented));
+        }
+
+        // An empty `--schemas` list means "no explicit selection".
+        let schemas = (!self.schemas.is_empty()).then(|| self.schemas.clone());
+
+        let storage = OpenDalStorage::from_uri(&self.to, &self.storage).map_err(BoxedError::new)?;
+
+        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
+        let timeout = self.timeout.unwrap_or(Duration::from_secs(60));
+        let database_client = DatabaseClient::new(
+            self.addr.clone(),
+            self.catalog.clone(),
+            self.auth_basic.clone(),
+            timeout,
+            proxy,
+            self.no_proxy,
+        );
+
+        let tool = ExportCreate {
+            catalog: self.catalog.clone(),
+            schemas,
+            schema_only: self.schema_only,
+            _format: self.format,
+            force: self.force,
+            _parallelism: self.parallelism,
+            storage: Box::new(storage),
+            database_client,
+        };
+        Ok(Box::new(tool))
+    }
+}
+
+/// Export tool implementation.
+///
+/// Built by `ExportCreateCommand::build` and executed through `Tool::do_work`.
+pub struct ExportCreate {
+    /// Catalog whose schemas are exported.
+    catalog: String,
+    /// Explicit schema selection; `None` lets the extractor pick all non-system schemas.
+    schemas: Option<Vec<String>>,
+    /// Whether the caller requested a schema-only snapshot.
+    schema_only: bool,
+    /// Requested data format. Stored but not read by any method of this type yet.
+    _format: DataFormat,
+    /// When true, an existing snapshot at the target is deleted before exporting.
+    force: bool,
+    /// Requested concurrency level. Stored but not read by any method of this type yet.
+    _parallelism: usize,
+    /// Storage backend the snapshot is read from and written to.
+    storage: Box<dyn SnapshotStorage>,
+    /// Client used to query schema metadata and `SHOW CREATE` statements.
+    database_client: DatabaseClient,
+}
+
+#[async_trait]
+impl Tool for ExportCreate {
+    /// Runs the export and boxes any export error for the generic [`Tool`] caller.
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        match self.run().await {
+            Ok(()) => Ok(()),
+            Err(err) => Err(BoxedError::new(err)),
+        }
+    }
+}
+
+impl ExportCreate {
+    /// Creates the snapshot, or inspects an existing one when resuming.
+    ///
+    /// When a snapshot already exists at the target:
+    /// - with `--force` it is deleted and the export starts from scratch;
+    /// - otherwise its manifest is read and checked (manifest version,
+    ///   schema-only mismatch), progress is logged, and the method returns
+    ///   without writing anything, because resuming is not implemented yet.
+    ///
+    /// A fresh snapshot is written in this order: schema index, per-schema DDL
+    /// files, and finally the manifest.
+    ///
+    /// # Errors
+    ///
+    /// Propagates storage and database errors, `ManifestVersionMismatch`, and
+    /// `CannotResumeSchemaOnly`.
+    async fn run(&self) -> Result<()> {
+        // 1. An existing snapshot is either wiped (--force) or treated as a resume.
+        if self.storage.exists().await? {
+            if !self.force {
+                let existing = self.storage.read_manifest().await?;
+
+                // Refuse to touch a manifest written in another format version.
+                if existing.version != MANIFEST_VERSION {
+                    return ManifestVersionMismatchSnafu {
+                        expected: MANIFEST_VERSION,
+                        found: existing.version,
+                    }
+                    .fail();
+                }
+
+                // A schema-only snapshot cannot be continued as a data export.
+                if existing.schema_only && !self.schema_only {
+                    return CannotResumeSchemaOnlySnafu.fail();
+                }
+
+                info!(
+                    "Resuming existing snapshot: {} (completed: {}/{} chunks)",
+                    existing.snapshot_id,
+                    existing.completed_count(),
+                    existing.chunks.len()
+                );
+
+                // M1 only handles schema-only exports; chunk resume arrives with M2.
+                if existing.is_complete() {
+                    info!("Snapshot is already complete");
+                } else {
+                    // TODO: Resume data export in M2
+                    info!("Data export resume not yet implemented (M2)");
+                }
+                return Ok(());
+            }
+
+            info!("Deleting existing snapshot (--force)");
+            self.storage.delete_snapshot().await?;
+        }
+
+        // 2. Resolve and extract the schemas to export.
+        let snapshot = SchemaExtractor::new(&self.database_client, &self.catalog)
+            .extract(self.schemas.as_deref())
+            .await?;
+
+        let mut schema_names: Vec<String> = Vec::with_capacity(snapshot.schemas.len());
+        for definition in snapshot.schemas.iter() {
+            schema_names.push(definition.name.clone());
+        }
+        info!("Exporting schemas: {:?}", schema_names);
+
+        // 3. Prepare the manifest in memory; it is persisted only at the very end.
+        let manifest = Manifest::new_schema_only(self.catalog.clone(), schema_names.clone());
+
+        // 4. Persist the schema index.
+        self.storage.write_schema(&snapshot).await?;
+        info!("Exported {} schemas", snapshot.schemas.len());
+
+        // 5. Persist one DDL file per schema for import recovery.
+        for (schema, ddl) in self.build_ddl_by_schema(&schema_names).await? {
+            let ddl_path = ddl_path_for_schema(&schema);
+            self.storage.write_text(&ddl_path, &ddl).await?;
+            info!("Exported DDL for schema {} to {}", schema, ddl_path);
+        }
+
+        // 6. Write the manifest last.
+        //
+        // The manifest is the snapshot commit point: writing it only after the
+        // schema index and every DDL file means a crash cannot leave behind a
+        // "valid" snapshot that lacks required schema artifacts.
+        self.storage.write_manifest(&manifest).await?;
+        info!("Snapshot created: {}", manifest.snapshot_id);
+
+        Ok(())
+    }
+
+    /// Builds the DDL script for every schema in `schema_names`.
+    ///
+    /// Schemas are processed in sorted order. For each one the script contains
+    /// the `SHOW CREATE DATABASE` output followed by the `SHOW CREATE` output of
+    /// its metric physical tables, its remaining tables, and its views, each
+    /// group sorted by name.
+    ///
+    /// Returns `(schema name, DDL text)` pairs in sorted schema order.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from the underlying database queries.
+    async fn build_ddl_by_schema(&self, schema_names: &[String]) -> Result<Vec<(String, String)>> {
+        let mut schemas = schema_names.to_vec();
+        schemas.sort();
+
+        let mut ddl_by_schema = Vec::with_capacity(schemas.len());
+        for schema in schemas {
+            let create_database = self.show_create("DATABASE", &schema, None).await?;
+
+            let (physical_tables, tables, views) = self.get_schema_objects(&schema).await?;
+            // Physical tables are emitted before the other tables, and views last.
+            let physical_ddls = self
+                .show_create_sorted("TABLE", &schema, physical_tables)
+                .await?;
+            let table_ddls = self.show_create_sorted("TABLE", &schema, tables).await?;
+            let view_ddls = self.show_create_sorted("VIEW", &schema, views).await?;
+
+            let ddl = build_schema_ddl(
+                &schema,
+                create_database,
+                physical_ddls,
+                table_ddls,
+                view_ddls,
+            );
+            ddl_by_schema.push((schema, ddl));
+        }
+
+        Ok(ddl_by_schema)
+    }
+
+    /// Sorts `names` and returns the `SHOW CREATE <show_type>` statement of each
+    /// object in `schema`, in that sorted order.
+    async fn show_create_sorted(
+        &self,
+        show_type: &str,
+        schema: &str,
+        mut names: Vec<String>,
+    ) -> Result<Vec<String>> {
+        names.sort();
+
+        let mut ddls = Vec::with_capacity(names.len());
+        for name in &names {
+            ddls.push(
+                self.show_create(show_type, schema, Some(name.as_str()))
+                    .await?,
+            );
+        }
+
+        Ok(ddls)
+    }
+
+    /// Lists the objects of `schema`, split into three groups.
+    ///
+    /// Returns `(physical_tables, tables, views)`:
+    /// - `physical_tables` comes from `get_metric_physical_tables`;
+    /// - `tables` and `views` come from `information_schema.tables` (rows of
+    ///   type `BASE TABLE` or `VIEW`), excluding any name already reported as
+    ///   a physical table.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Database` when a query fails and `UnexpectedValueType` when a
+    /// row does not hold two string columns.
+    async fn get_schema_objects(
+        &self,
+        schema: &str,
+    ) -> Result<(Vec<String>, Vec<String>, Vec<String>)> {
+        let physical_tables = self.get_metric_physical_tables(schema).await?;
+        let physical_set: HashSet<&str> = physical_tables.iter().map(String::as_str).collect();
+
+        let catalog_literal = escape_sql_literal(&self.catalog);
+        let schema_literal = escape_sql_literal(schema);
+        let sql = format!(
+            "SELECT table_name, table_type FROM information_schema.tables \
+             WHERE table_catalog = '{}' AND table_schema = '{}' \
+             AND (table_type = 'BASE TABLE' OR table_type = 'VIEW')",
+            catalog_literal, schema_literal
+        );
+
+        // A missing result set is treated the same as an empty one.
+        let rows: Vec<Vec<Value>> = self
+            .database_client
+            .sql_in_public(&sql)
+            .await
+            .context(DatabaseSnafu)?
+            .unwrap_or_default();
+
+        let (mut tables, mut views) = (Vec::new(), Vec::new());
+        for row in rows {
+            let (Some(Value::String(name)), Some(Value::String(table_type))) =
+                (row.first(), row.get(1))
+            else {
+                return UnexpectedValueTypeSnafu.fail();
+            };
+
+            // Physical tables are reported separately, so skip them here.
+            if physical_set.contains(name.as_str()) {
+                continue;
+            }
+
+            if table_type.as_str() == "VIEW" {
+                views.push(name.clone());
+            } else {
+                tables.push(name.clone());
+            }
+        }
+
+        Ok((physical_tables, tables, views))
+    }
+
+    /// Returns the names of the tables in `schema` that have a `__tsid` column.
+    ///
+    /// The caller treats these as the schema's metric physical tables. The
+    /// result is sorted and free of duplicates, so repeated calls against the
+    /// same database state yield the same vector.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Database` when the query fails and `UnexpectedValueType` when
+    /// a row's first column is not a string.
+    async fn get_metric_physical_tables(&self, schema: &str) -> Result<Vec<String>> {
+        let sql = format!(
+            "SELECT DISTINCT table_name FROM information_schema.columns \
+             WHERE table_catalog = '{}' AND table_schema = '{}' AND column_name = '__tsid'",
+            escape_sql_literal(&self.catalog),
+            escape_sql_literal(schema)
+        );
+        let records: Option<Vec<Vec<Value>>> = self
+            .database_client
+            .sql_in_public(&sql)
+            .await
+            .context(DatabaseSnafu)?;
+
+        let mut tables = Vec::new();
+        // A missing result set is treated the same as an empty one.
+        for row in records.unwrap_or_default() {
+            match row.first() {
+                Some(Value::String(name)) => tables.push(name.clone()),
+                _ => return UnexpectedValueTypeSnafu.fail(),
+            }
+        }
+
+        // Sort before deduplicating so the output order is deterministic
+        // instead of depending on hash iteration order.
+        tables.sort();
+        tables.dedup();
+
+        Ok(tables)
+    }
+
+    /// Runs `SHOW CREATE <show_type>` and returns the statement text.
+    ///
+    /// The target is the quoted `catalog.schema` name, extended with `table`
+    /// when one is given. The returned text is the second column of the first
+    /// result row with `";\n"` appended.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Database` when the query fails, `EmptyResult` when there is no
+    /// result set or no row, and `UnexpectedValueType` when the second column
+    /// is not a string.
+    async fn show_create(
+        &self,
+        show_type: &str,
+        schema: &str,
+        table: Option<&str>,
+    ) -> Result<String> {
+        // Build the quoted, dot-separated object name first.
+        let mut target = format!(
+            r#""{}"."{}""#,
+            escape_sql_identifier(&self.catalog),
+            escape_sql_identifier(schema)
+        );
+        if let Some(table) = table {
+            target.push_str(&format!(r#"."{}""#, escape_sql_identifier(table)));
+        }
+        let sql = format!("SHOW CREATE {} {}", show_type, target);
+
+        let rows = self
+            .database_client
+            .sql_in_public(&sql)
+            .await
+            .context(DatabaseSnafu)?
+            .context(EmptyResultSnafu)?;
+
+        match rows.first().context(EmptyResultSnafu)?.get(1) {
+            Some(Value::String(create)) => Ok(format!("{};\n", create)),
+            _ => UnexpectedValueTypeSnafu.fail(),
+        }
+    }
+}
+
+/// Concatenates the DDL statements of one schema into a single script.
+///
+/// The script starts with a `-- Schema: <name>` comment line, followed by
+/// `create_database`, then every statement of `physical_tables`, `tables`, and
+/// `views` in that order, and ends with one extra newline. Statements are
+/// appended verbatim; no separator is inserted between them.
+fn build_schema_ddl(
+    schema: &str,
+    create_database: String,
+    physical_tables: Vec<String>,
+    tables: Vec<String>,
+    views: Vec<String>,
+) -> String {
+    let statements = physical_tables.into_iter().chain(tables).chain(views);
+
+    let mut script = format!("-- Schema: {}\n", schema);
+    script.push_str(&create_database);
+    script.extend(statements);
+    script.push('\n');
+    script
+}
+
+#[cfg(test)]
+mod tests {
+    use clap::Parser;
+
+    use super::*;
+    use crate::data::path::ddl_path_for_schema;
+
+    /// DDL paths live under `schema/ddl/` and percent-encode unsafe characters.
+    #[test]
+    fn test_ddl_path_for_schema() {
+        let plain = ddl_path_for_schema("public");
+        assert_eq!(plain, "schema/ddl/public.sql");
+
+        let traversal = ddl_path_for_schema("../evil");
+        assert_eq!(traversal, "schema/ddl/%2E%2E%2Fevil.sql");
+    }
+
+    /// Statements must appear as: database, physical tables, tables, views.
+    #[test]
+    fn test_build_schema_ddl_order() {
+        let ddl = build_schema_ddl(
+            "public",
+            "CREATE DATABASE public;\n".to_string(),
+            vec!["PHYSICAL;\n".to_string()],
+            vec!["TABLE;\n".to_string()],
+            vec!["VIEW;\n".to_string()],
+        );
+
+        let position_of = |needle: &str| ddl.find(needle).unwrap();
+        let db_pos = position_of("CREATE DATABASE");
+        let physical_pos = position_of("PHYSICAL;");
+        let table_pos = position_of("TABLE;");
+        let view_pos = position_of("VIEW;");
+
+        assert!(db_pos < physical_pos);
+        assert!(physical_pos < table_pos);
+        assert!(table_pos < view_pos);
+    }
+
+    /// Without `--schema-only`, `build` must fail with the not-implemented error.
+    #[tokio::test]
+    async fn test_build_rejects_non_schema_only_export() {
+        let args = [
+            "export-v2-create",
+            "--addr",
+            "127.0.0.1:4000",
+            "--to",
+            "file:///tmp/export-v2-test",
+        ];
+        let cmd = ExportCreateCommand::parse_from(args);
+
+        let outcome = cmd.build().await;
+        assert!(outcome.is_err());
+
+        let message = outcome.err().unwrap().to_string();
+        assert!(message.contains("Data export is not implemented yet"));
+    }
+}
diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs
new file mode 100644
index 0000000000..2db71d5326
--- /dev/null
+++ b/src/cli/src/data/export_v2/error.rs
@@ -0,0 +1,181 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::{Location, Snafu};
+
+// Errors produced by the Export V2 code paths.
+//
+// Each variant's user-visible message is its `#[snafu(display(...))]` string;
+// the `location` field is filled in implicitly by snafu.
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    // A storage URI was rejected; `reason` carries the explanation.
+    #[snafu(display("Invalid URI '{}': {}", uri, reason))]
+    InvalidUri {
+        uri: String,
+        reason: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Unsupported storage scheme: {}", scheme))]
+    UnsupportedScheme {
+        scheme: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Wraps an object-store failure; `operation` names the failed operation.
+    #[snafu(display("Storage operation '{}' failed", operation))]
+    StorageOperation {
+        operation: String,
+        #[snafu(source)]
+        error: object_store::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Wraps a serde_json error raised while parsing a manifest.
+    #[snafu(display("Failed to parse manifest"))]
+    ManifestParse {
+        #[snafu(source)]
+        error: serde_json::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Wraps a serde_json error raised while serializing a manifest.
+    #[snafu(display("Failed to serialize manifest"))]
+    ManifestSerialize {
+        #[snafu(source)]
+        error: serde_json::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to decode text file as UTF-8"))]
+    TextDecode {
+        #[snafu(source)]
+        error: std::string::FromUtf8Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Raised on resume when the stored manifest is schema-only but the current
+    // run is not.
+    #[snafu(display(
+        "Cannot resume schema-only snapshot with data export. Use --force to recreate."
+    ))]
+    CannotResumeSchemaOnly {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Raised by the create command when `--schema-only` is not set.
+    #[snafu(display(
+        "Data export is not implemented yet. Use --schema-only to create a schema snapshot."
+    ))]
+    DataExportNotImplemented {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // A query returned no result set, or no row where one was required.
+    #[snafu(display("Empty result from query"))]
+    EmptyResult {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // A result column was missing or was not of the expected JSON type.
+    #[snafu(display("Unexpected value type in query result"))]
+    UnexpectedValueType {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Wraps an error returned by the CLI's database client.
+    #[snafu(display("Database error"))]
+    Database {
+        #[snafu(source)]
+        error: crate::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Snapshot not found at '{}'", uri))]
+    SnapshotNotFound {
+        uri: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // A requested schema does not exist in the given catalog.
+    #[snafu(display("Schema '{}' not found in catalog '{}'", schema, catalog))]
+    SchemaNotFound {
+        catalog: String,
+        schema: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to parse URL"))]
+    UrlParse {
+        #[snafu(source)]
+        error: url::ParseError,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Failed to build object store"))]
+    BuildObjectStore {
+        #[snafu(source)]
+        error: object_store::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    // Raised on resume when the stored manifest version differs from the
+    // version this build writes.
+    #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
+    ManifestVersionMismatch {
+        expected: u32,
+        found: u32,
+        #[snafu(implicit)]
+        location: Location,
+    },
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl ErrorExt for Error {
+    /// Maps each export error variant to its [`StatusCode`].
+    fn status_code(&self) -> StatusCode {
+        match self {
+            // A wrapped database error keeps the status code of its source.
+            Error::Database { error, .. } => error.status_code(),
+
+            Error::SchemaNotFound { .. } => StatusCode::DatabaseNotFound,
+
+            // Reported as invalid arguments.
+            Error::InvalidUri { .. }
+            | Error::UnsupportedScheme { .. }
+            | Error::SnapshotNotFound { .. }
+            | Error::CannotResumeSchemaOnly { .. }
+            | Error::DataExportNotImplemented { .. }
+            | Error::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments,
+
+            // Reported as storage unavailable.
+            Error::BuildObjectStore { .. }
+            | Error::StorageOperation { .. }
+            | Error::ManifestParse { .. }
+            | Error::ManifestSerialize { .. }
+            | Error::TextDecode { .. } => StatusCode::StorageUnavailable,
+
+            // Reported as internal errors.
+            Error::UrlParse { .. }
+            | Error::EmptyResult { .. }
+            | Error::UnexpectedValueType { .. } => StatusCode::Internal,
+        }
+    }
+
+    /// Exposes the error as [`Any`] so callers can downcast it.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
diff --git a/src/cli/src/data/export_v2/extractor.rs b/src/cli/src/data/export_v2/extractor.rs
new file mode 100644
index 0000000000..ae15b199af
--- /dev/null
+++ b/src/cli/src/data/export_v2/extractor.rs
@@ -0,0 +1,254 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Schema extraction from information_schema.
+//!
+//! For V2 DDL-only snapshots, extractor only persists the schema index.
+
+use std::collections::{HashMap, HashSet};
+
+use serde_json::Value;
+use snafu::ResultExt;
+
+use crate::data::export_v2::error::{
+    DatabaseSnafu, EmptyResultSnafu, Result, SchemaNotFoundSnafu, UnexpectedValueTypeSnafu,
+};
+use crate::data::export_v2::schema::{SchemaDefinition, SchemaSnapshot};
+use crate::data::sql::escape_sql_literal;
+use crate::database::DatabaseClient;
+
+/// System schemas that should be excluded from export.
+///
+/// Names are compared exactly (case-sensitively) against the `schema_name`
+/// values returned by `information_schema.schemata`.
+const SYSTEM_SCHEMAS: &[&str] = &["information_schema", "pg_catalog"];
+
+/// Extracts schema definitions from information_schema.
+///
+/// Borrows both the database client and the catalog name, so an extractor
+/// cannot outlive either of them.
+pub struct SchemaExtractor<'a> {
+    /// Client used to run the `information_schema` queries.
+    client: &'a DatabaseClient,
+    /// Catalog whose schemas are inspected.
+    catalog: &'a str,
+}
+
+impl<'a> SchemaExtractor<'a> {
+    /// Builds an extractor that queries `client` for the schemas of `catalog`.
+    pub fn new(client: &'a DatabaseClient, catalog: &'a str) -> Self {
+        SchemaExtractor { client, catalog }
+    }
+
+    /// Builds the schema index for the requested schemas.
+    ///
+    /// With `Some(names)` the names are validated against the catalog first;
+    /// with `None` every non-system schema is used.
+    ///
+    /// # Errors
+    ///
+    /// Returns `SchemaNotFound` for an unknown schema and propagates query errors.
+    pub async fn extract(&self, schemas: Option<&[String]>) -> Result<SchemaSnapshot> {
+        let mut snapshot = SchemaSnapshot::new();
+
+        let schema_names = if let Some(requested) = schemas {
+            self.validate_schemas(requested).await?
+        } else {
+            self.get_all_schemas().await?
+        };
+
+        for schema_name in schema_names.iter() {
+            snapshot.add_schema(self.extract_schema_definition(schema_name).await?);
+        }
+
+        Ok(snapshot)
+    }
+
+    /// Lists every schema of the catalog except those in `SYSTEM_SCHEMAS`.
+    async fn get_all_schemas(&self) -> Result<Vec<String>> {
+        let sql = format!(
+            "SELECT schema_name FROM information_schema.schemata \
+             WHERE catalog_name = '{}'",
+            escape_sql_literal(self.catalog)
+        );
+
+        let rows = self.query(&sql).await?;
+        let mut user_schemas = Vec::new();
+        for row in &rows {
+            let name = extract_string(row, 0)?;
+            if SYSTEM_SCHEMAS.contains(&name.as_str()) {
+                continue;
+            }
+            user_schemas.push(name);
+        }
+
+        Ok(user_schemas)
+    }
+
+    /// Checks that every requested schema exists and returns the de-duplicated
+    /// names as the catalog spells them.
+    async fn validate_schemas(&self, schemas: &[String]) -> Result<Vec<String>> {
+        let available = self.get_all_schemas().await?;
+        dedupe_canonicalized_schemas(schemas, &available, self.catalog)
+    }
+
+    /// Reads the name and options of one schema (database).
+    ///
+    /// Returns `SchemaNotFound` when the query yields no row. A missing or
+    /// empty options column results in an empty options map.
+    async fn extract_schema_definition(&self, schema: &str) -> Result<SchemaDefinition> {
+        let sql = format!(
+            "SELECT schema_name, options FROM information_schema.schemata \
+             WHERE catalog_name = '{}' AND schema_name = '{}'",
+            escape_sql_literal(self.catalog),
+            escape_sql_literal(schema)
+        );
+
+        let rows = self.query(&sql).await?;
+        let Some(row) = rows.first() else {
+            return SchemaNotFoundSnafu {
+                catalog: self.catalog,
+                schema,
+            }
+            .fail();
+        };
+
+        let name = extract_string(row, 0)?;
+        let options = match extract_optional_string(row, 1) {
+            Some(raw) => parse_options(&raw),
+            None => HashMap::new(),
+        };
+
+        Ok(SchemaDefinition {
+            catalog: self.catalog.to_string(),
+            name,
+            options,
+        })
+    }
+
+    /// Runs `sql` and returns its rows.
+    ///
+    /// Returns `Database` when the client fails and `EmptyResult` when the
+    /// client reports no result set.
+    async fn query(&self, sql: &str) -> Result<Vec<Vec<Value>>> {
+        let outcome = self.client.sql_in_public(sql).await.context(DatabaseSnafu)?;
+        match outcome {
+            Some(rows) => Ok(rows),
+            None => EmptyResultSnafu.fail(),
+        }
+    }
+}
+
+/// Returns the string stored in column `index` of `row`.
+///
+/// A missing column, a JSON null, and any other non-string value are all
+/// rejected with `UnexpectedValueType`.
+fn extract_string(row: &[Value], index: usize) -> Result<String> {
+    if let Some(Value::String(text)) = row.get(index) {
+        return Ok(text.clone());
+    }
+    UnexpectedValueTypeSnafu.fail()
+}
+
+/// Returns the non-empty string stored in column `index` of `row`, if any.
+///
+/// A missing column, a non-string value, and an empty string all yield `None`.
+fn extract_optional_string(row: &[Value], index: usize) -> Option<String> {
+    let Some(Value::String(text)) = row.get(index) else {
+        return None;
+    };
+    (!text.is_empty()).then(|| text.clone())
+}
+
+/// Parses a schema options string into a key/value map.
+///
+/// Three input shapes are accepted, tried in this order:
+/// 1. a JSON object whose values are all strings, e.g. `{"ttl": "30d"}`;
+/// 2. one `'key'='value'` pair per line (values may contain spaces);
+/// 3. whitespace-separated `key=value` tokens on a line.
+///
+/// Shapes 2 and 3 are decided per line. Tokens without `=` are ignored, and a
+/// later occurrence of a key overwrites an earlier one.
+fn parse_options(options_str: &str) -> HashMap<String, String> {
+    let as_json = serde_json::from_str::<HashMap<String, String>>(options_str).ok();
+    if let Some(map) = as_json {
+        return map;
+    }
+
+    let mut options = HashMap::new();
+    let lines = options_str
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty());
+    for line in lines {
+        match parse_quoted_option_line(line) {
+            Some((key, value)) => {
+                options.insert(key, value);
+            }
+            None => {
+                let pairs = line
+                    .split_whitespace()
+                    .filter_map(|token| token.split_once('='));
+                for (key, value) in pairs {
+                    options.insert(key.to_string(), value.to_string());
+                }
+            }
+        }
+    }
+    options
+}
+
+/// Parses a single `'key'='value'` line into its key and value.
+///
+/// Returns `None` unless the line starts with `'`, contains the `'='`
+/// separator, and ends with `'`. The key ends at the first `'='`.
+fn parse_quoted_option_line(line: &str) -> Option<(String, String)> {
+    let (key, rest) = line.strip_prefix('\'')?.split_once("'='")?;
+    rest.strip_suffix('\'')
+        .map(|value| (key.to_string(), value.to_string()))
+}
+
+/// Resolves each requested schema name to the spelling used in `available`
+/// and drops repeated results, keeping first-seen order.
+///
+/// An exact (case-sensitive) match is preferred. Only when there is none does
+/// the lookup fall back to an ASCII case-insensitive match, so a request for
+/// `foo` is not redirected to `Foo` when both schemas exist.
+///
+/// # Errors
+///
+/// Returns `SchemaNotFound` (carrying `catalog`) for the first requested name
+/// that matches no available schema.
+fn dedupe_canonicalized_schemas(
+    requested: &[String],
+    available: &[String],
+    catalog: &str,
+) -> Result<Vec<String>> {
+    let mut canonicalized = Vec::new();
+    // Keyed by the canonical spelling so schemas that differ only by case
+    // stay distinct.
+    let mut seen: HashSet<&str> = HashSet::new();
+
+    for schema in requested {
+        let exact = available.iter().find(|candidate| *candidate == schema);
+        let resolved = exact.or_else(|| {
+            available
+                .iter()
+                .find(|candidate| candidate.eq_ignore_ascii_case(schema))
+        });
+        let Some(canonical) = resolved else {
+            return SchemaNotFoundSnafu { catalog, schema }.fail();
+        };
+
+        if seen.insert(canonical.as_str()) {
+            canonicalized.push(canonical.clone());
+        }
+    }
+
+    Ok(canonicalized)
+}
+
+#[cfg(test)]
+mod tests {
+    use serde_json::Value;
+
+    use super::*;
+
+    /// A JSON object with string values is parsed directly.
+    #[test]
+    fn test_parse_options_json() {
+        let parsed = parse_options(r#"{"ttl": "30d", "custom": "value"}"#);
+        assert_eq!(parsed.get("ttl").map(String::as_str), Some("30d"));
+        assert_eq!(parsed.get("custom").map(String::as_str), Some("value"));
+    }
+
+    /// Whitespace-separated `key=value` tokens are accepted as a fallback.
+    #[test]
+    fn test_parse_options_key_value() {
+        let parsed = parse_options("ttl=30d custom=value");
+        assert_eq!(parsed.get("ttl").map(String::as_str), Some("30d"));
+        assert_eq!(parsed.get("custom").map(String::as_str), Some("value"));
+    }
+
+    /// One `'key'='value'` pair per line keeps spaces inside the value.
+    #[test]
+    fn test_parse_options_schema_display_format() {
+        let parsed = parse_options("'ttl'='30d'\n'custom'='value with spaces'\n");
+        assert_eq!(parsed.get("ttl").map(String::as_str), Some("30d"));
+        assert_eq!(
+            parsed.get("custom").map(String::as_str),
+            Some("value with spaces")
+        );
+    }
+
+    /// A JSON null is not accepted where a string is required.
+    #[test]
+    fn test_extract_string_rejects_null() {
+        let row = [Value::Null];
+        let outcome = extract_string(&row, 0);
+        assert!(outcome.is_err());
+    }
+
+    /// Differently-cased requests collapse onto the catalog's spelling.
+    #[test]
+    fn test_dedupe_canonicalized_schemas() {
+        let available: Vec<String> = ["public", "test_db"]
+            .iter()
+            .map(|name| name.to_string())
+            .collect();
+        let requested: Vec<String> = ["PUBLIC", "public", "Test_Db"]
+            .iter()
+            .map(|name| name.to_string())
+            .collect();
+
+        let canonicalized = dedupe_canonicalized_schemas(&requested, &available, "greptime")
+            .expect("schemas should be canonicalized");
+
+        assert_eq!(canonicalized, vec!["public", "test_db"]);
+    }
+}
diff --git a/src/cli/src/data/export_v2/manifest.rs b/src/cli/src/data/export_v2/manifest.rs
new file mode 100644
index 0000000000..0ebf753fa4
--- /dev/null
+++ b/src/cli/src/data/export_v2/manifest.rs
@@ -0,0 +1,381 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Manifest data structures for Export/Import V2.
+
+use std::{fmt, str};
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+/// Current manifest format version.
+pub const MANIFEST_VERSION: u32 = 1;
+
+/// Manifest file name within snapshot directory.
+pub const MANIFEST_FILE: &str = "manifest.json";
+
+/// Half-open time range `[start, end)` selecting the data to export.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct TimeRange {
+    /// Inclusive start; `None` means earliest available data (omitted when serialized).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub start: Option<DateTime<Utc>>,
+    /// Exclusive end; `None` means current time (omitted when serialized).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub end: Option<DateTime<Utc>>,
+}
+
+impl TimeRange {
+    /// Builds a time range from the given optional bounds.
+    pub fn new(start: Option<DateTime<Utc>>, end: Option<DateTime<Utc>>) -> Self {
+        TimeRange { start, end }
+    }
+
+    /// Builds a time range with neither bound set (all data).
+    ///
+    /// Produces the same value as `TimeRange::new(None, None)`, for which
+    /// `is_unbounded` returns `true`.
+    pub fn unbounded() -> Self {
+        Self::new(None, None)
+    }
+
+    /// Reports whether both `start` and `end` are unset.
+    pub fn is_unbounded(&self) -> bool {
+        matches!((&self.start, &self.end), (None, None))
+    }
+}
+
+impl Default for TimeRange {
+    fn default() -> TimeRange {
+        TimeRange::new(None, None)
+    }
+}
+
+/// Status of a chunk during export/import; serialized in snake_case (e.g. `in_progress`).
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "snake_case")]
+pub enum ChunkStatus {
+    /// Chunk is pending export (this is the default status).
+    #[default]
+    Pending,
+    /// Chunk export is in progress.
+    InProgress,
+    /// Chunk export completed successfully.
+    Completed,
+    /// Chunk export failed; `ChunkMeta::error` carries the message.
+    Failed,
+}
+
+/// Metadata for a single chunk of exported data.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChunkMeta {
+    /// Chunk identifier (sequential number starting from 1).
+    pub id: u32,
+    /// Time range covered by this chunk.
+    pub time_range: TimeRange,
+    /// Export status.
+    pub status: ChunkStatus,
+    /// Data files in this chunk (relative paths from snapshot root); empty if absent on load.
+    #[serde(default)]
+    pub files: Vec<String>,
+    /// Aggregated SHA256 checksum of this chunk's files; omitted from output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub checksum: Option<String>,
+    /// Error message if status is `Failed`; omitted from output when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub error: Option<String>,
+}
+
+impl ChunkMeta {
+    /// Builds a chunk in the `Pending` state with no files, checksum or error.
+    pub fn new(id: u32, time_range: TimeRange) -> Self {
+        ChunkMeta {
+            status: ChunkStatus::Pending,
+            error: None,
+            checksum: None,
+            files: Vec::new(),
+            time_range,
+            id,
+        }
+    }
+
+    /// Switches the chunk to `InProgress` and drops any recorded error.
+    pub fn mark_in_progress(&mut self) {
+        self.error = None;
+        self.status = ChunkStatus::InProgress;
+    }
+
+    /// Switches the chunk to `Completed`, storing its files and checksum and clearing the error.
+    pub fn mark_completed(&mut self, files: Vec<String>, checksum: Option<String>) {
+        self.error = None;
+        self.checksum = checksum;
+        self.files = files;
+        self.status = ChunkStatus::Completed;
+    }
+
+    /// Switches the chunk to `Failed` and records `error` as the failure message.
+    pub fn mark_failed(&mut self, error: String) {
+        self.error = Some(error);
+        self.status = ChunkStatus::Failed;
+    }
+}
+
+/// Supported data formats for export; serde and clap both use the lowercase variant names.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default, clap::ValueEnum)]
+#[serde(rename_all = "lowercase")]
+#[value(rename_all = "lowercase")]
+pub enum DataFormat {
+    /// Apache Parquet format (default, recommended for production).
+    #[default]
+    Parquet,
+    /// CSV format (human-readable).
+    Csv,
+    /// JSON format (structured text).
+    Json,
+}
+
+impl fmt::Display for DataFormat {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(match *self {
+            Self::Parquet => "parquet",
+            Self::Csv => "csv",
+            Self::Json => "json",
+        })
+    }
+}
+
+impl str::FromStr for DataFormat {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Matching ignores case; the error message echoes the input unchanged.
+        Ok(match s.to_lowercase().as_str() {
+            "parquet" => Self::Parquet,
+            "csv" => Self::Csv,
+            "json" => Self::Json,
+            _ => {
+                return Err(format!("invalid format '{}': expected one of parquet, csv, json", s));
+            }
+        })
+    }
+}
+
+/// Snapshot manifest containing all metadata.
+///
+/// The manifest is stored as `manifest.json` in the snapshot root directory.
+/// It contains:
+/// - Snapshot identification (UUID, timestamps)
+/// - Scope (catalog, schemas, time range)
+/// - Export configuration (format, schema_only)
+/// - Chunk metadata for resume support
+/// - Integrity checksums
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Manifest {
+    /// Manifest format version for compatibility checking (see `MANIFEST_VERSION`).
+    pub version: u32,
+    /// Unique snapshot identifier (a random v4 UUID when built via the constructors).
+    pub snapshot_id: Uuid,
+    /// Catalog name.
+    pub catalog: String,
+    /// List of schemas included in this snapshot.
+    pub schemas: Vec<String>,
+    /// Overall time range covered by this snapshot.
+    pub time_range: TimeRange,
+    /// Whether this is a schema-only snapshot (no data).
+    pub schema_only: bool,
+    /// Data format used for export.
+    pub format: DataFormat,
+    /// Chunk metadata (empty for schema-only snapshots); defaults to empty if absent on load.
+    #[serde(default)]
+    pub chunks: Vec<ChunkMeta>,
+    /// Snapshot-level SHA256 checksum (aggregated from all chunks); omitted when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub checksum: Option<String>,
+    /// Creation timestamp.
+    pub created_at: DateTime<Utc>,
+    /// Last updated timestamp (refreshed by `touch`).
+    pub updated_at: DateTime<Utc>,
+}
+
+impl Manifest {
+    /// Builds the manifest for a schema-only export: no chunks, unbounded time range.
+    pub fn new_schema_only(catalog: String, schemas: Vec<String>) -> Self {
+        let created = Utc::now();
+        Manifest {
+            version: MANIFEST_VERSION,
+            snapshot_id: Uuid::new_v4(),
+            created_at: created,
+            updated_at: created,
+            catalog,
+            schemas,
+            schema_only: true,
+            time_range: TimeRange::unbounded(),
+            format: DataFormat::Parquet,
+            chunks: Vec::new(),
+            checksum: None,
+        }
+    }
+
+    /// Builds the manifest for a full export; chunks start empty and are added later.
+    pub fn new_full(
+        catalog: String,
+        schemas: Vec<String>,
+        time_range: TimeRange,
+        format: DataFormat,
+    ) -> Self {
+        let created = Utc::now();
+        Manifest {
+            version: MANIFEST_VERSION,
+            snapshot_id: Uuid::new_v4(),
+            created_at: created,
+            updated_at: created,
+            catalog,
+            schemas,
+            schema_only: false,
+            time_range,
+            format,
+            chunks: Vec::new(),
+            checksum: None,
+        }
+    }
+
+    /// Schema-only manifests are always complete; otherwise at least one chunk
+    /// must exist and every chunk must be `Completed`.
+    pub fn is_complete(&self) -> bool {
+        if self.schema_only {
+            return true;
+        }
+        let done = self.completed_count();
+        done > 0 && done == self.chunks.len()
+    }
+
+    /// Number of chunks whose status is `Pending`.
+    pub fn pending_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|chunk| matches!(chunk.status, ChunkStatus::Pending))
+            .count()
+    }
+
+    /// Number of chunks whose status is `InProgress`.
+    pub fn in_progress_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|chunk| matches!(chunk.status, ChunkStatus::InProgress))
+            .count()
+    }
+
+    /// Number of chunks whose status is `Completed`.
+    pub fn completed_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|chunk| matches!(chunk.status, ChunkStatus::Completed))
+            .count()
+    }
+
+    /// Number of chunks whose status is `Failed`.
+    pub fn failed_count(&self) -> usize {
+        self.chunks
+            .iter()
+            .filter(|chunk| matches!(chunk.status, ChunkStatus::Failed))
+            .count()
+    }
+
+    /// Sets `updated_at` to the current UTC time.
+    pub fn touch(&mut self) {
+        self.updated_at = Utc::now();
+    }
+
+    /// Appends `chunk` to the chunk list and refreshes `updated_at`.
+    pub fn add_chunk(&mut self, chunk: ChunkMeta) {
+        self.chunks.push(chunk);
+        self.updated_at = Utc::now();
+    }
+
+    /// Runs `updater` on the first chunk with `id` and refreshes `updated_at`; no-op if absent.
+    pub fn update_chunk(&mut self, id: u32, updater: impl FnOnce(&mut ChunkMeta)) {
+        let target = self.chunks.iter_mut().find(|chunk| chunk.id == id);
+        if target.map(updater).is_some() {
+            self.touch();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_time_range_serialization() {
+        // Unset bounds are skipped, so an unbounded range is an empty object.
+        let encoded = serde_json::to_string(&TimeRange::unbounded()).unwrap();
+        assert_eq!(encoded, "{}");
+
+        let decoded: TimeRange = serde_json::from_str("{}").unwrap();
+        assert!(decoded.is_unbounded());
+    }
+
+    #[test]
+    fn test_manifest_schema_only() {
+        let schema_only =
+            Manifest::new_schema_only(String::from("greptime"), vec![String::from("public")]);
+
+        assert_eq!(schema_only.version, MANIFEST_VERSION);
+        assert!(schema_only.schema_only);
+        assert_eq!(schema_only.chunks.len(), 0);
+        assert!(schema_only.is_complete());
+    }
+
+    #[test]
+    fn test_manifest_full() {
+        let full = Manifest::new_full(
+            String::from("greptime"),
+            vec![String::from("public")],
+            TimeRange::unbounded(),
+            DataFormat::Parquet,
+        );
+
+        assert!(!full.schema_only);
+        assert_eq!(full.chunks.len(), 0);
+        assert!(!full.is_complete());
+    }
+
+    #[test]
+    fn test_data_format_parsing() {
+        // Parsing ignores case; unknown names are rejected.
+        let parse = |text: &str| text.parse::<DataFormat>();
+
+        assert_eq!(parse("parquet").unwrap(), DataFormat::Parquet);
+        assert_eq!(parse("CSV").unwrap(), DataFormat::Csv);
+        assert_eq!(parse("JSON").unwrap(), DataFormat::Json);
+        assert!(parse("invalid").is_err());
+    }
+
+    #[test]
+    fn test_chunk_status_transitions() {
+        // Pending -> InProgress -> Completed.
+        let mut meta = ChunkMeta::new(1, TimeRange::unbounded());
+        assert_eq!(meta.status, ChunkStatus::Pending);
+
+        meta.mark_in_progress();
+        assert_eq!(meta.status, ChunkStatus::InProgress);
+
+        let files = vec![String::from("file1.parquet")];
+        let checksum = Some(String::from("abc123"));
+        meta.mark_completed(files, checksum);
+        assert_eq!(meta.status, ChunkStatus::Completed);
+        assert_eq!(meta.files.len(), 1);
+    }
+}
diff --git a/src/cli/src/data/export_v2/schema.rs b/src/cli/src/data/export_v2/schema.rs
new file mode 100644
index 0000000000..1aab6ac900
--- /dev/null
+++ b/src/cli/src/data/export_v2/schema.rs
@@ -0,0 +1,98 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Minimal schema index structures for Export/Import V2.
+//!
+//! The canonical schema representation is the per-schema DDL file under
+//! `schema/ddl/`. `schemas.json` only records which schemas exist in a snapshot.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// Schema directory name within snapshot.
+pub const SCHEMA_DIR: &str = "schema";
+
+/// DDL directory name within schema directory.
+pub const DDL_DIR: &str = "ddl";
+
+/// Schema definition file name.
+pub const SCHEMAS_FILE: &str = "schemas.json";
+
+/// Definition of one schema (database) recorded in a snapshot.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SchemaDefinition {
+    /// Catalog name.
+    pub catalog: String,
+    /// Schema (database) name.
+    pub name: String,
+    /// Schema options; an empty map is omitted when serialized and restored when absent.
+    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
+    pub options: HashMap<String, String>,
+}
+
+/// Minimal schema index stored in a snapshot; the default value holds no schemas.
+#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
+pub struct SchemaSnapshot {
+    /// Schema (database) definitions, kept in the order they were added.
+    pub schemas: Vec<SchemaDefinition>,
+}
+
+impl SchemaSnapshot {
+    /// Builds a snapshot that holds no schema definitions.
+    pub fn new() -> Self {
+        SchemaSnapshot { schemas: Vec::new() }
+    }
+
+    /// Appends `schema` to the end of the definition list.
+    pub fn add_schema(&mut self, schema: SchemaDefinition) {
+        self.schemas.push(schema);
+    }
+
+    /// Returns a copy keeping only definitions whose `name` appears (exact match) in `schemas`.
+    pub fn filter_schemas(&self, schemas: &[String]) -> Self {
+        let mut kept = Self::new();
+        for definition in &self.schemas {
+            let wanted = schemas.iter().any(|name| *name == definition.name);
+            if wanted {
+                kept.schemas.push(definition.clone());
+            }
+        }
+        kept
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_schema_snapshot_filter() {
+        let definition = |name: &str| SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: name.to_string(),
+            options: HashMap::new(),
+        };
+        let mut snapshot = SchemaSnapshot::new();
+        snapshot.add_schema(definition("public"));
+        snapshot.add_schema(definition("private"));
+
+        let wanted = vec!["public".to_string()];
+        let filtered = snapshot.filter_schemas(&wanted);
+        let names: Vec<&str> = filtered.schemas.iter().map(|s| s.name.as_str()).collect();
+        // Only the requested schema survives the filter.
+        assert_eq!(filtered.schemas.len(), 1);
+        assert_eq!(names, ["public"]);
+    }
+}
diff --git a/src/cli/src/data/export_v2/tests.rs b/src/cli/src/data/export_v2/tests.rs
new file mode 100644
index 0000000000..bd28801a0d
--- /dev/null
+++ b/src/cli/src/data/export_v2/tests.rs
@@ -0,0 +1,341 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::env;
+use std::time::Duration;
+
+use clap::Parser;
+use common_error::ext::BoxedError;
+use snafu::ResultExt;
+use tempfile::tempdir;
+use url::Url;
+
+use super::command::ExportCreateCommand;
+use crate::common::ObjectStoreConfig;
+use crate::data::import_v2::ImportV2Command;
+use crate::data::snapshot_storage::OpenDalStorage;
+use crate::database::DatabaseClient;
+use crate::error::{FileIoSnafu, InvalidArgumentsSnafu, OtherSnafu, Result};
+
+#[tokio::test]
+#[ignore] // Skipped by default: the body issues SQL and CLI commands against the server at GREPTIME_ADDR.
+async fn export_import_v2_schema_parity_e2e() -> Result<()> {
+    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
+    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
+    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok(); // optional; forwarded as `--auth-basic` when set
+    let schema = "test_db_schema_parity";
+
+    let database_client = DatabaseClient::new(
+        addr.clone(),
+        catalog.clone(),
+        auth_basic.clone(),
+        Duration::from_secs(60),
+        None,
+        false,
+    );
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) // start from a clean slate
+        .await?;
+    database_client
+        .sql_in_public(&format!("CREATE DATABASE {schema}"))
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING PRIMARY KEY, \
+                cpu DOUBLE DEFAULT 0.0, \
+                region_name STRING \
+            ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE logs (\
+                ts TIMESTAMP TIME INDEX, \
+                app STRING PRIMARY KEY, \
+                msg STRING NOT NULL COMMENT 'log message' \
+            ) ENGINE = mito",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics_physical (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING, \
+                region_name STRING, \
+                cpu DOUBLE DEFAULT 0.0, \
+                PRIMARY KEY (host, region_name) \
+            ) ENGINE = metric WITH (physical_metric_table='true')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics_logical (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING, \
+                region_name STRING, \
+                cpu DOUBLE DEFAULT 0.0, \
+                PRIMARY KEY (host, region_name) \
+            ) ENGINE = metric WITH (on_physical_table='metrics_physical')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE VIEW metrics_view AS SELECT * FROM metrics WHERE cpu > 0.5",
+            schema,
+        )
+        .await?;
+
+    let src_dir = tempdir().context(FileIoSnafu)?; // destination of the first (source) export
+    let src_uri = Url::from_directory_path(src_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE {schema}")) // remove the original before importing from the snapshot
+        .await?;
+
+    let mut import_args = vec![
+        "import-v2",
+        "--addr",
+        &addr,
+        "--from",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+    ];
+    if let Some(auth) = &auth_basic {
+        import_args.push("--auth-basic");
+        import_args.push(auth);
+    }
+    let import_cmd = ImportV2Command::parse_from(import_args);
+    import_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    let dst_dir = tempdir().context(FileIoSnafu)?; // destination of the second export, taken after the import
+    let dst_uri = Url::from_directory_path(dst_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &dst_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    let storage_config = ObjectStoreConfig::default();
+    let src_storage = OpenDalStorage::from_uri(&src_uri, &storage_config)
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    let dst_storage = OpenDalStorage::from_uri(&dst_uri, &storage_config)
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+
+    let src_schema_snapshot = src_storage
+        .read_schema()
+        .await
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    let dst_schema_snapshot = dst_storage
+        .read_schema()
+        .await
+        .map_err(BoxedError::new)
+        .context(OtherSnafu)?;
+    assert_eq!(src_schema_snapshot, dst_schema_snapshot); // export -> import -> export must yield the same schema index
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) // cleanup; not reached if an earlier step fails
+        .await?;
+
+    Ok(())
+}
+
+#[tokio::test]
+#[ignore] // Skipped by default: the body issues SQL and CLI commands against the server at GREPTIME_ADDR.
+async fn import_v2_ddl_dry_run_e2e() -> Result<()> {
+    let addr = env::var("GREPTIME_ADDR").unwrap_or_else(|_| "127.0.0.1:4000".to_string());
+    let catalog = env::var("GREPTIME_CATALOG").unwrap_or_else(|_| "greptime".to_string());
+    let auth_basic = env::var("GREPTIME_AUTH_BASIC").ok(); // optional; forwarded as `--auth-basic` when set
+    let schema = "test_db_ddl_dry_run";
+
+    let database_client = DatabaseClient::new(
+        addr.clone(),
+        catalog.clone(),
+        auth_basic.clone(),
+        Duration::from_secs(60),
+        None,
+        false,
+    );
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) // start from a clean slate
+        .await?;
+    database_client
+        .sql_in_public(&format!("CREATE DATABASE {schema}"))
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE metrics (\
+                ts TIMESTAMP TIME INDEX, \
+                host STRING PRIMARY KEY, \
+                cpu DOUBLE DEFAULT 0.0, \
+                region_name STRING \
+            ) ENGINE = mito WITH (ttl='7d', 'compaction.type'='twcs')",
+            schema,
+        )
+        .await?;
+    database_client
+        .sql(
+            "CREATE TABLE logs (\
+                ts TIMESTAMP TIME INDEX, \
+                app STRING PRIMARY KEY, \
+                msg STRING NOT NULL COMMENT 'log message' \
+            ) ENGINE = mito",
+            schema,
+        )
+        .await?;
+
+    let src_dir = tempdir().context(FileIoSnafu)?; // snapshot destination for the schema-only export
+    let src_uri = Url::from_directory_path(src_dir.path())
+        .map_err(|_| {
+            InvalidArgumentsSnafu {
+                msg: "invalid temp dir path".to_string(),
+            }
+            .build()
+        })?
+        .to_string();
+
+    let mut export_args = vec![
+        "export-v2-create",
+        "--addr",
+        &addr,
+        "--to",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--schema-only",
+    ];
+    if let Some(auth) = &auth_basic {
+        export_args.push("--auth-basic");
+        export_args.push(auth);
+    }
+    let export_cmd = ExportCreateCommand::parse_from(export_args);
+    export_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    let mut import_args = vec![
+        "import-v2",
+        "--addr",
+        &addr,
+        "--from",
+        &src_uri,
+        "--catalog",
+        &catalog,
+        "--schemas",
+        schema,
+        "--dry-run", // the test only checks that the dry-run import returns Ok
+    ];
+    if let Some(auth) = &auth_basic {
+        import_args.push("--auth-basic");
+        import_args.push(auth);
+    }
+    let import_cmd = ImportV2Command::parse_from(import_args);
+    import_cmd
+        .build()
+        .await
+        .context(OtherSnafu)?
+        .do_work()
+        .await
+        .context(OtherSnafu)?;
+
+    database_client
+        .sql_in_public(&format!("DROP DATABASE IF EXISTS {schema}")) // cleanup; not reached if an earlier step fails
+        .await?;
+
+    Ok(())
+}
diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs
index ffe8b62c7e..f5c234f1a7 100644
--- a/src/cli/src/data/import.rs
+++ b/src/cli/src/data/import.rs
@@ -81,13 +81,16 @@ pub struct ImportCommand {
     #[clap(long, value_parser = humantime::parse_duration)]
     timeout: Option<Duration>,
 
-    /// The proxy server address to connect, if set, will override the system proxy.
+    /// The proxy server address to connect.
     ///
-    /// The default behavior will use the system proxy if neither `proxy` nor `no_proxy` is set.
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
     #[clap(long)]
     proxy: Option<String>,
 
-    /// Disable proxy server, if set, will not use any proxy.
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
     #[clap(long, default_value = "false")]
     no_proxy: bool,
 }
@@ -104,6 +107,7 @@ impl ImportCommand {
             // Treats `None` as `0s` to disable server-side default timeout.
             self.timeout.unwrap_or_default(),
             proxy,
+            self.no_proxy,
         );
 
         Ok(Box::new(Import {
@@ -314,6 +318,7 @@ mod tests {
                 None,
                 Duration::from_secs(0),
                 None,
+                false,
             ),
             input_dir: input_dir.to_string(),
             parallelism: 1,
diff --git a/src/cli/src/data/import_v2.rs b/src/cli/src/data/import_v2.rs
new file mode 100644
index 0000000000..772e18cc93
--- /dev/null
+++ b/src/cli/src/data/import_v2.rs
@@ -0,0 +1,41 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Import V2 module.
+//!
+//! This module provides the V2 implementation of database import functionality,
+//! featuring:
+//! - DDL-based schema import
+//! - Dry-run mode for verification
+//!
+//! # Example
+//!
+//! ```bash
+//! # Dry-run import (verify without executing)
+//! greptime cli data import-v2 \
+//!   --addr 127.0.0.1:4000 \
+//!   --from file:///tmp/snapshot \
+//!   --dry-run
+//!
+//! # Actual import
+//! greptime cli data import-v2 \
+//!   --addr 127.0.0.1:4000 \
+//!   --from s3://bucket/snapshots/prod-20250101
+//! ```
+
+mod command;
+pub mod error;
+pub mod executor;
+
+pub use command::ImportV2Command;
diff --git a/src/cli/src/data/import_v2/command.rs b/src/cli/src/data/import_v2/command.rs
new file mode 100644
index 0000000000..544763d92b
--- /dev/null
+++ b/src/cli/src/data/import_v2/command.rs
@@ -0,0 +1,542 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Import V2 CLI command.
+
+use std::collections::HashSet;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use clap::Parser;
+use common_error::ext::BoxedError;
+use common_telemetry::info;
+use snafu::ResultExt;
+
+use crate::Tool;
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::manifest::MANIFEST_VERSION;
+use crate::data::import_v2::error::{
+    ManifestVersionMismatchSnafu, Result, SchemaNotInSnapshotSnafu, SnapshotStorageSnafu,
+};
+use crate::data::import_v2::executor::{DdlExecutor, DdlStatement};
+use crate::data::path::ddl_path_for_schema;
+use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::database::{DatabaseClient, parse_proxy_opts};
+
+/// Import from a snapshot.
+#[derive(Debug, Parser)] // clap uses the `///` comments below as `--help` text
+pub struct ImportV2Command {
+    /// Server address to connect (e.g., 127.0.0.1:4000).
+    #[clap(long)]
+    addr: String,
+
+    /// Source snapshot location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    from: String, // checked by `validate_uri`, then opened with `OpenDalStorage::from_uri`
+
+    /// Target catalog name.
+    #[clap(long, default_value = "greptime")]
+    catalog: String,
+
+    /// Schema list to import (default: all in snapshot).
+    /// Can be specified multiple times or comma-separated.
+    #[clap(long, value_delimiter = ',')]
+    schemas: Vec<String>, // `build` turns an empty list into `None` (no filter)
+
+    /// Verify without importing (dry-run).
+    #[clap(long)]
+    dry_run: bool,
+
+    /// Concurrency level (for future use).
+    #[clap(long, default_value = "1")]
+    parallelism: usize, // only copied into `Import::_parallelism` by `build`
+
+    /// Basic authentication (user:password).
+    #[clap(long)]
+    auth_basic: Option<String>,
+
+    /// Request timeout.
+    #[clap(long, value_parser = humantime::parse_duration)]
+    timeout: Option<Duration>, // `build` falls back to 60 seconds when unset
+
+    /// Proxy server address.
+    ///
+    /// If set, it overrides the system proxy unless `--no-proxy` is specified.
+    /// If neither `--proxy` nor `--no-proxy` is set, system proxy (env) may be used.
+    #[clap(long)]
+    proxy: Option<String>,
+
+    /// Disable all proxy usage (ignores `--proxy` and system proxy).
+    ///
+    /// When set and `--proxy` is not provided, this explicitly disables system proxy.
+    #[clap(long)]
+    no_proxy: bool, // passed to both `parse_proxy_opts` and `DatabaseClient::new`
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ImportV2Command {
+    /// Builds the import [`Tool`] from the parsed command-line arguments.
+    ///
+    /// Returns an error if the `--from` URI is rejected by `validate_uri`, if the
+    /// snapshot storage cannot be constructed, or if the proxy options fail to parse.
+    /// The request timeout defaults to 60 seconds when `--timeout` is not given.
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        // Check the URI first, then open the storage it points to.
+        validate_uri(&self.from)
+            .context(SnapshotStorageSnafu)
+            .map_err(BoxedError::new)?;
+
+        let storage = OpenDalStorage::from_uri(&self.from, &self.storage)
+            .context(SnapshotStorageSnafu)
+            .map_err(BoxedError::new)?;
+
+        let proxy = parse_proxy_opts(self.proxy.clone(), self.no_proxy)?;
+        let timeout = self.timeout.unwrap_or(Duration::from_secs(60));
+        let database_client = DatabaseClient::new(
+            self.addr.clone(),
+            self.catalog.clone(),
+            self.auth_basic.clone(),
+            timeout,
+            proxy,
+            self.no_proxy,
+        );
+
+        // An empty `--schemas` list means "no filter": import every schema.
+        let schemas = (!self.schemas.is_empty()).then(|| self.schemas.clone());
+
+        Ok(Box::new(Import {
+            schemas,
+            dry_run: self.dry_run,
+            _parallelism: self.parallelism,
+            storage: Box::new(storage),
+            database_client,
+        }))
+    }
+}
+
+/// Import tool: reads a snapshot's DDL files and replays them against a database.
+pub struct Import {
+    schemas: Option<Vec<String>>, // `None`: import every schema listed in the manifest
+    dry_run: bool, // print the DDL instead of executing it
+    _parallelism: usize, // stored but not read by `run` or `read_ddl_statements`
+    storage: Box<dyn SnapshotStorage>,
+    database_client: DatabaseClient,
+}
+
+#[async_trait]
+impl Tool for Import {
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        self.run().await.map_err(BoxedError::new) // box the import error for the `Tool` API
+    }
+}
+
+impl Import {
+    async fn run(&self) -> Result<()> {
+        // 1. Read the snapshot manifest from storage.
+        let manifest = self
+            .storage
+            .read_manifest()
+            .await
+            .context(SnapshotStorageSnafu)?;
+
+        info!(
+            "Loading snapshot: {} (version: {}, schema_only: {})",
+            manifest.snapshot_id, manifest.version, manifest.schema_only
+        );
+
+        // Only a manifest whose version equals MANIFEST_VERSION exactly is accepted.
+        if manifest.version != MANIFEST_VERSION {
+            return ManifestVersionMismatchSnafu {
+                expected: MANIFEST_VERSION,
+                found: manifest.version,
+            }
+            .fail();
+        }
+
+        info!("Snapshot contains {} schema(s)", manifest.schemas.len());
+
+        // 2. Resolve the schema filter against the manifest (no filter: import all schemas).
+        let schemas_to_import = match &self.schemas {
+            Some(filter) => canonicalize_schema_filter(filter, &manifest.schemas)?,
+            None => manifest.schemas.clone(),
+        };
+
+        info!("Importing schemas: {:?}", schemas_to_import);
+
+        // 3. Load the DDL file of every selected schema and split it into statements.
+        let ddl_statements = self.read_ddl_statements(&schemas_to_import).await?;
+
+        info!("Generated {} DDL statements", ddl_statements.len());
+
+        // 4. Dry-run mode: print the DDL to stdout and return without executing it.
+        if self.dry_run {
+            info!("Dry-run mode - DDL statements to execute:");
+            println!();
+            for (i, stmt) in ddl_statements.iter().enumerate() {
+                println!("-- Statement {}", i + 1);
+                println!("{};", stmt.sql);
+                println!();
+            }
+            return Ok(());
+        }
+
+        // 5. Execute the DDL in order; the first failing statement aborts the import.
+        let executor = DdlExecutor::new(&self.database_client);
+        executor.execute_strict(&ddl_statements).await?;
+
+        info!(
+            "Import completed: {} DDL statements executed",
+            ddl_statements.len()
+        );
+
+        // 6. Data import is not implemented yet; only log how many chunks were left out.
+        if !manifest.schema_only && !manifest.chunks.is_empty() {
+            info!(
+                "Data import not yet implemented (M3). {} chunks pending.",
+                manifest.chunks.len()
+            );
+        }
+
+        Ok(())
+    }
+
+    async fn read_ddl_statements(&self, schemas: &[String]) -> Result<Vec<DdlStatement>> {
+        let mut statements = Vec::new();
+        for schema in schemas {
+            let path = ddl_path_for_schema(schema); // one DDL file per schema
+            let content = self
+                .storage
+                .read_text(&path)
+                .await
+                .context(SnapshotStorageSnafu)?;
+            statements.extend(
+                parse_ddl_statements(&content)
+                    .into_iter()
+                    .map(|sql| ddl_statement_for_schema(schema, sql)), // attach schema context
+            );
+        }
+
+        Ok(statements)
+    }
+}
+
+/// Splits the contents of a DDL file into individual SQL statements.
+///
+/// Statements are separated by `;`. A `;` inside a single-quoted string or a
+/// double-quoted identifier does not split, and a doubled quote (`''` or `""`)
+/// inside such a literal is kept as an escaped quote. `--` line comments and
+/// `/* */` block comments are removed; each is replaced by whitespace so that the
+/// tokens on either side stay separated (`a/**/b` yields `a b`, not `ab`).
+///
+/// Every statement is trimmed and returned without its terminating `;`; empty
+/// statements are skipped. Backslash escapes, backtick-quoted identifiers and
+/// nested block comments are not recognized.
+fn parse_ddl_statements(content: &str) -> Vec<String> {
+    /// What the scanner is currently inside of.
+    #[derive(Clone, Copy)]
+    enum State {
+        /// Plain SQL text outside of any literal or comment.
+        Normal,
+        /// A `--` comment, which ends at the next newline.
+        LineComment,
+        /// A `/* */` comment.
+        BlockComment,
+        /// A literal delimited by the given quote character (`'` or `"`).
+        Quoted(char),
+    }
+
+    let mut statements = Vec::new();
+    let mut current = String::new();
+    let mut chars = content.chars().peekable();
+    let mut state = State::Normal;
+
+    while let Some(ch) = chars.next() {
+        match state {
+            State::LineComment => {
+                if ch == '\n' {
+                    // Keep the newline so the text around the comment stays separated.
+                    current.push('\n');
+                    state = State::Normal;
+                }
+            }
+            State::BlockComment => {
+                if ch == '*' && chars.next_if_eq(&'/').is_some() {
+                    // Stand in for the removed comment so adjacent tokens do not merge.
+                    current.push(' ');
+                    state = State::Normal;
+                }
+            }
+            State::Quoted(quote) => {
+                current.push(ch);
+                if ch == quote {
+                    // A doubled quote is an escaped quote, not the end of the literal.
+                    match chars.next_if_eq(&quote) {
+                        Some(escaped) => current.push(escaped),
+                        None => state = State::Normal,
+                    }
+                }
+            }
+            State::Normal => match ch {
+                // `--` and `/*` open comments; the guard consumes the second character.
+                '-' if chars.next_if_eq(&'-').is_some() => state = State::LineComment,
+                '/' if chars.next_if_eq(&'*').is_some() => state = State::BlockComment,
+                '\'' | '"' => {
+                    current.push(ch);
+                    state = State::Quoted(ch);
+                }
+                ';' => {
+                    // End of a statement: emit it unless it is blank.
+                    let statement = current.trim();
+                    if !statement.is_empty() {
+                        statements.push(statement.to_string());
+                    }
+                    current.clear();
+                }
+                _ => current.push(ch),
+            },
+        }
+    }
+
+    // Keep a final statement that has no terminating `;`.
+    let statement = current.trim();
+    if !statement.is_empty() {
+        statements.push(statement.to_string());
+    }
+
+    statements
+}
+
+/// Wraps `sql` in a [`DdlStatement`]; schema-scoped statements get `schema` as context.
+fn ddl_statement_for_schema(schema: &str, sql: String) -> DdlStatement {
+    if !is_schema_scoped_statement(&sql) {
+        return DdlStatement::new(sql);
+    }
+    DdlStatement::with_execution_schema(sql, schema.to_string())
+}
+
+/// Returns `true` if `sql` creates a table or a view, i.e. has the shape
+/// `CREATE [OR REPLACE] [EXTERNAL] TABLE ...` or `CREATE [OR REPLACE] [EXTERNAL] VIEW ...`.
+///
+/// Leading whitespace is ignored and keywords are matched with
+/// [`starts_with_keyword`]. `CREATE OR` that is not followed by `REPLACE` yields `false`.
+fn is_schema_scoped_statement(sql: &str) -> bool {
+    /// Removes `keyword` and the whitespace after it from the front of `input`,
+    /// or returns `None` if `input` does not start with that keyword.
+    fn strip_keyword<'a>(input: &'a str, keyword: &str) -> Option<&'a str> {
+        if !starts_with_keyword(input, keyword) {
+            return None;
+        }
+        input.get(keyword.len()..).map(str::trim_start)
+    }
+
+    let Some(mut rest) = strip_keyword(sql.trim_start(), "CREATE") else {
+        return false;
+    };
+
+    if let Some(after_or) = strip_keyword(rest, "OR") {
+        // `OR` is only accepted here as part of `OR REPLACE`.
+        match strip_keyword(after_or, "REPLACE") {
+            Some(after_replace) => rest = after_replace,
+            None => return false,
+        }
+    }
+    // `EXTERNAL` is optional.
+    if let Some(after_external) = strip_keyword(rest, "EXTERNAL") {
+        rest = after_external;
+    }
+
+    starts_with_keyword(rest, "TABLE") || starts_with_keyword(rest, "VIEW")
+}
+
+/// Returns `true` if `input` starts with `keyword` (ASCII case-insensitive) as a whole word.
+fn starts_with_keyword(input: &str, keyword: &str) -> bool {
+    let Some(prefix) = input.get(..keyword.len()) else {
+        return false;
+    };
+    let at_word_end = match input.as_bytes().get(keyword.len()) {
+        Some(next) => !(next.is_ascii_alphanumeric() || *next == b'_'),
+        None => true,
+    };
+    prefix.eq_ignore_ascii_case(keyword) && at_word_end
+}
+
+/// Resolves each requested schema to the spelling used in the snapshot manifest.
+///
+/// Matching is ASCII case-insensitive; duplicates are dropped, keeping request order.
+fn canonicalize_schema_filter(
+    filter: &[String],
+    manifest_schemas: &[String],
+) -> Result<Vec<String>> {
+    let mut seen = HashSet::new();
+    let mut canonicalized = Vec::new();
+    for requested in filter {
+        let Some(canonical) = manifest_schemas
+            .iter()
+            .find(|candidate| candidate.eq_ignore_ascii_case(requested))
+        else {
+            return SchemaNotInSnapshotSnafu {
+                schema: requested.clone(),
+            }
+            .fail();
+        };
+        if seen.insert(canonical.to_ascii_lowercase()) {
+            canonicalized.push(canonical.clone());
+        }
+    }
+
+    Ok(canonicalized)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_ddl_statements() {
+        let content = r#"
+-- Schema: public
+CREATE DATABASE public;
+CREATE TABLE t (ts TIMESTAMP TIME INDEX, host STRING, PRIMARY KEY (host)) ENGINE=mito;
+
+-- comment
+CREATE VIEW v AS SELECT * FROM t;
+"#;
+        let statements = parse_ddl_statements(content);
+        assert_eq!(statements.len(), 3); // the `--` comment lines yield no statements
+        assert!(statements[0].starts_with("CREATE DATABASE public"));
+        assert!(statements[1].starts_with("CREATE TABLE t"));
+        assert!(statements[2].starts_with("CREATE VIEW v"));
+    }
+
+    #[test]
+    fn test_parse_ddl_statements_preserves_semicolons_in_string_literals() {
+        let content = r#"
+CREATE TABLE t (
+    host STRING DEFAULT 'a;b'
+);
+CREATE VIEW v AS SELECT ';' AS marker;
+"#;
+
+        let statements = parse_ddl_statements(content);
+
+        assert_eq!(statements.len(), 2); // a `;` inside quotes must not split a statement
+        assert!(statements[0].contains("'a;b'"));
+        assert!(statements[1].contains("';' AS marker"));
+    }
+
+    #[test]
+    fn test_parse_ddl_statements_handles_comments_without_splitting() {
+        let content = r#"
+-- leading comment
+CREATE TABLE t (ts TIMESTAMP TIME INDEX); /* block; comment */
+CREATE VIEW v AS SELECT 1;
+"#;
+
+        let statements = parse_ddl_statements(content);
+
+        assert_eq!(statements.len(), 2); // the `;` inside the block comment is ignored
+        assert!(statements[0].starts_with("CREATE TABLE t"));
+        assert!(statements[1].starts_with("CREATE VIEW v"));
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_uses_manifest_casing() {
+        let filter = vec!["TEST_DB".to_string(), "PUBLIC".to_string()];
+        let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
+
+        let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
+
+        assert_eq!(canonicalized, vec!["test_db", "public"]); // manifest spelling, filter order
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_dedupes_case_insensitive_matches() {
+        let filter = vec![
+            "TEST_DB".to_string(),
+            "test_db".to_string(),
+            "PUBLIC".to_string(),
+            "public".to_string(),
+        ];
+        let manifest_schemas = vec!["test_db".to_string(), "public".to_string()];
+
+        let canonicalized = canonicalize_schema_filter(&filter, &manifest_schemas).unwrap();
+
+        assert_eq!(canonicalized, vec!["test_db", "public"]);
+    }
+
+    #[test]
+    fn test_canonicalize_schema_filter_rejects_missing_schema() {
+        let filter = vec!["missing".to_string()];
+        let manifest_schemas = vec!["test_db".to_string()];
+
+        let error = canonicalize_schema_filter(&filter, &manifest_schemas)
+            .expect_err("missing schema should fail")
+            .to_string();
+
+        assert!(error.contains("missing")); // the message names the schema that was requested
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_table_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX) ENGINE=mito".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_view_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_or_replace_view_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE OR REPLACE VIEW metrics_view AS SELECT * FROM metrics".to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_external_table_uses_execution_schema() {
+        let stmt = ddl_statement_for_schema(
+            "test_db",
+            "CREATE EXTERNAL TABLE IF NOT EXISTS ext_metrics (ts TIMESTAMP TIME INDEX) ENGINE=file"
+                .to_string(),
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some("test_db"));
+    }
+
+    #[test]
+    fn test_ddl_statement_for_schema_create_database_uses_public_context() {
+        let stmt = ddl_statement_for_schema("test_db", "CREATE DATABASE test_db".to_string());
+        assert_eq!(stmt.execution_schema, None); // `CREATE DATABASE` is not schema-scoped
+    }
+
+    #[test]
+    fn test_starts_with_keyword_requires_word_boundary() {
+        assert!(starts_with_keyword("CREATE TABLE t", "CREATE"));
+        assert!(!starts_with_keyword("CREATED TABLE t", "CREATE")); // `D` continues the word
+        assert!(!starts_with_keyword("TABLESPACE foo", "TABLE"));
+    }
+}
diff --git a/src/cli/src/data/import_v2/error.rs b/src/cli/src/data/import_v2/error.rs
new file mode 100644
index 0000000000..5ae3db1583
--- /dev/null
+++ b/src/cli/src/data/import_v2/error.rs
@@ -0,0 +1,82 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::{Location, Snafu};
+
+#[derive(Snafu)]
+#[snafu(visibility(pub))]
+#[stack_trace_debug]
+pub enum Error {
+    #[snafu(display("Snapshot not found at '{}'", uri))]
+    SnapshotNotFound { // reported as `StatusCode::InvalidArguments`
+        uri: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Manifest version mismatch: expected {}, found {}", expected, found))]
+    ManifestVersionMismatch { // raised by `Import::run` when the versions differ
+        expected: u32,
+        found: u32,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Schema '{}' not found in snapshot", schema))]
+    SchemaNotInSnapshot { // raised by `canonicalize_schema_filter` for an unknown schema
+        schema: String,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Snapshot storage error"))]
+    SnapshotStorage { // wraps an export-v2 error; its status code is passed through
+        #[snafu(source)]
+        error: crate::data::export_v2::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Database error"))]
+    Database { // wraps a crate-level error; its status code is passed through
+        #[snafu(source)]
+        error: crate::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl ErrorExt for Error {
+    /// Argument-style failures map to `InvalidArguments`; wrappers defer to their source.
+    fn status_code(&self) -> StatusCode {
+        match self {
+            Self::SnapshotNotFound { .. }
+            | Self::SchemaNotInSnapshot { .. }
+            | Self::ManifestVersionMismatch { .. } => StatusCode::InvalidArguments,
+            Self::SnapshotStorage { error, .. } => error.status_code(),
+            Self::Database { error, .. } => error.status_code(),
+        }
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
diff --git a/src/cli/src/data/import_v2/executor.rs b/src/cli/src/data/import_v2/executor.rs
new file mode 100644
index 0000000000..3f2bf66ae6
--- /dev/null
+++ b/src/cli/src/data/import_v2/executor.rs
@@ -0,0 +1,122 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! DDL execution for import.
+
+use common_telemetry::info;
+use snafu::ResultExt;
+
+use crate::data::import_v2::error::{DatabaseSnafu, Result};
+use crate::database::DatabaseClient;
+
+/// A DDL statement paired with the schema it should be executed in, if any.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DdlStatement {
+    pub sql: String,
+    pub execution_schema: Option<String>, // `None`: no specific schema requested
+}
+
+impl DdlStatement {
+    pub fn new(sql: String) -> Self {
+        Self {
+            execution_schema: None,
+            sql,
+        }
+    }
+
+    /// Like [`DdlStatement::new`], but records `schema` as the execution schema.
+    pub fn with_execution_schema(sql: String, schema: String) -> Self {
+        let mut statement = Self::new(sql);
+        statement.execution_schema = Some(schema);
+        statement
+    }
+}
+
+/// Runs DDL statements through a borrowed [`DatabaseClient`].
+pub struct DdlExecutor<'a> {
+    client: &'a DatabaseClient,
+}
+
+impl<'a> DdlExecutor<'a> {
+    /// Creates an executor that issues its statements through `client`.
+    pub fn new(client: &'a DatabaseClient) -> Self {
+        Self { client }
+    }
+
+    /// Executes `statements` in order and returns the first error encountered.
+    ///
+    /// A statement with an execution schema is sent with `DatabaseClient::sql`;
+    /// one without is sent with `DatabaseClient::sql_in_public`.
+    pub async fn execute_strict(&self, statements: &[DdlStatement]) -> Result<()> {
+        let total = statements.len();
+        for (index, statement) in statements.iter().enumerate() {
+            let preview = preview_sql(&statement.sql);
+            info!("Executing DDL ({}/{}): {}", index + 1, total, preview);
+
+            match statement.execution_schema.as_deref() {
+                Some(schema) => {
+                    let response = self.client.sql(&statement.sql, schema).await;
+                    response.context(DatabaseSnafu)?;
+                }
+                None => {
+                    let response = self.client.sql_in_public(&statement.sql).await;
+                    response.context(DatabaseSnafu)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Returns the first 80 characters of `sql`, followed by `...` when `sql` is longer.
+///
+/// Counts characters rather than bytes, so multi-byte text is never split.
+fn preview_sql(sql: &str) -> String {
+    match sql.char_indices().nth(80) {
+        Some((cut, _)) => format!("{}...", &sql[..cut]),
+        None => sql.to_string(),
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_statement_without_execution_schema_uses_public() {
+        let stmt = DdlStatement::new("CREATE DATABASE IF NOT EXISTS test_db".to_string());
+        assert_eq!(stmt.execution_schema, None); // `execute_strict` then uses `sql_in_public`
+    }
+
+    #[test]
+    fn test_statement_with_execution_schema_preserves_context() {
+        let stmt = DdlStatement::with_execution_schema(
+            r#"CREATE TABLE IF NOT EXISTS "my""schema"."metrics" (ts TIMESTAMP TIME INDEX)"#
+                .to_string(),
+            r#"my"schema"#.to_string(), // schema name containing a quote is stored verbatim
+        );
+        assert_eq!(stmt.execution_schema.as_deref(), Some(r#"my"schema"#));
+    }
+
+    #[test]
+    fn test_preview_sql_truncates_at_char_boundary() {
+        let sql = format!(
+            "CREATE TABLE {} (ts TIMESTAMP TIME INDEX)",
+            "测".repeat(100) // multi-byte characters push the 80-character cut inside them
+        );
+        let preview = preview_sql(&sql);
+        assert!(preview.ends_with("..."));
+        assert!(preview.is_char_boundary(preview.len())); // always true for a valid `String`
+    }
+}
diff --git a/src/cli/src/data/path.rs b/src/cli/src/data/path.rs
new file mode 100644
index 0000000000..2e0f5d3f1a
--- /dev/null
+++ b/src/cli/src/data/path.rs
@@ -0,0 +1,76 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared path helpers for export/import data files.
+
+use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR};
+
+/// Returns the snapshot-relative path of the DDL file for `schema`.
+///
+/// The schema name is passed through [`encode_path_segment`] so it always forms a
+/// single path segment: `<SCHEMA_DIR>/<DDL_DIR>/<encoded schema>.sql`.
+pub(crate) fn ddl_path_for_schema(schema: &str) -> String {
+    let file_stem = encode_path_segment(schema);
+    format!("{SCHEMA_DIR}/{DDL_DIR}/{file_stem}.sql")
+}
+
+/// Percent-encodes `value` for use as a single path segment.
+///
+/// ASCII letters, digits, `-` and `_` are kept as they are. Every other byte,
+/// including `.`, `/`, `\` and each byte of a multi-byte UTF-8 character, is
+/// written as `%` followed by the two hex digits produced by [`hex_char`].
+pub(crate) fn encode_path_segment(value: &str) -> String {
+    let mut encoded = String::with_capacity(value.len());
+    for &byte in value.as_bytes() {
+        if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_') {
+            encoded.push(char::from(byte));
+        } else {
+            encoded.extend(['%', hex_char(byte >> 4), hex_char(byte & 0x0F)]);
+        }
+    }
+    encoded
+}
+
+/// Maps a nibble (`0..=15`) to its upper-case hex digit; panics on any larger value.
+fn hex_char(nibble: u8) -> char {
+    match char::from_digit(u32::from(nibble), 16) {
+        Some(digit) => digit.to_ascii_uppercase(),
+        None => unreachable!("nibble must be in 0..=15"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_encode_path_segment_preserves_safe_ascii() {
+        assert_eq!(encode_path_segment("test_db"), "test_db");
+    }
+
+    #[test]
+    fn test_encode_path_segment_escapes_path_traversal_chars() {
+        assert_eq!(encode_path_segment("../evil"), "%2E%2E%2Fevil"); // `.` and `/` are escaped
+        assert_eq!(encode_path_segment(r"..\\evil"), "%2E%2E%5C%5Cevil"); // raw: two backslashes
+    }
+
+    #[test]
+    fn test_ddl_path_for_schema_encodes_schema_segment() {
+        assert_eq!(ddl_path_for_schema("public"), "schema/ddl/public.sql");
+        assert_eq!(
+            ddl_path_for_schema("../evil"),
+            "schema/ddl/%2E%2E%2Fevil.sql" // the encoded name stays inside `schema/ddl/`
+        );
+    }
+}
diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs
new file mode 100644
index 0000000000..50c8734a67
--- /dev/null
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -0,0 +1,669 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Storage abstraction for Export/Import V2.
+//!
+//! This module provides a unified interface for reading and writing snapshot data
+//! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).
+
+use async_trait::async_trait;
+use object_store::services::{Azblob, Fs, Gcs, Oss, S3};
+use object_store::util::{with_instrument_layers, with_retry_layers};
+use object_store::{AzblobConnection, GcsConnection, ObjectStore, OssConnection, S3Connection};
+use snafu::ResultExt;
+use url::Url;
+
+use crate::common::ObjectStoreConfig;
+use crate::data::export_v2::error::{
+    BuildObjectStoreSnafu, InvalidUriSnafu, ManifestParseSnafu, ManifestSerializeSnafu, Result,
+    SnapshotNotFoundSnafu, StorageOperationSnafu, TextDecodeSnafu, UnsupportedSchemeSnafu,
+    UrlParseSnafu,
+};
+use crate::data::export_v2::manifest::{MANIFEST_FILE, Manifest};
+#[cfg(test)]
+use crate::data::export_v2::schema::SchemaDefinition;
+use crate::data::export_v2::schema::{SCHEMA_DIR, SCHEMAS_FILE, SchemaSnapshot};
+
+struct RemoteLocation {
+    bucket_or_container: String, // host part of the URI
+    root: String, // URI path with its leading `/` characters removed
+}
+
+/// Storage backends selectable through the scheme of a snapshot URI.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum StorageScheme {
+    /// Amazon S3 (`s3://`).
+    S3,
+    /// Alibaba Cloud OSS (`oss://`).
+    Oss,
+    /// Google Cloud Storage (`gs://` or `gcs://`).
+    Gcs,
+    /// Azure Blob Storage (`azblob://`).
+    Azblob,
+    /// Local filesystem (`file://`).
+    File,
+}
+
+impl StorageScheme {
+    /// Maps the scheme of `uri` to a backend; fails on parse errors or unknown schemes.
+    pub fn from_uri(uri: &str) -> Result<Self> {
+        let url = Url::parse(uri).context(UrlParseSnafu)?;
+        let backend = match url.scheme() {
+            "s3" => Self::S3,
+            "oss" => Self::Oss,
+            "gs" | "gcs" => Self::Gcs,
+            "azblob" => Self::Azblob,
+            "file" => Self::File,
+            scheme => return UnsupportedSchemeSnafu { scheme }.fail(),
+        };
+        Ok(backend)
+    }
+}
+
+/// Splits a remote snapshot URI into its bucket/container (the URI host) and its
+/// root (the URI path without leading `/`); fails if either part is empty.
+fn extract_remote_location(uri: &str) -> Result<RemoteLocation> {
+    let url = Url::parse(uri).context(UrlParseSnafu)?;
+
+    let Some(bucket_or_container) = url.host_str().filter(|host| !host.is_empty()) else {
+        return InvalidUriSnafu {
+            uri,
+            reason: "URI must include bucket/container in host",
+        }
+        .fail();
+    };
+
+    let root = url.path().trim_start_matches('/');
+    if root.is_empty() {
+        return InvalidUriSnafu {
+            uri,
+            reason: "snapshot URI must include a non-empty path after the bucket/container",
+        }
+        .fail();
+    }
+    Ok(RemoteLocation {
+        bucket_or_container: bucket_or_container.to_string(),
+        root: root.to_string(),
+    })
+}
+
+/// Checks that `uri` carries a supported scheme and returns that scheme.
+///
+/// Bare paths (e.g., `/tmp/backup`, `./backup`) are rejected because:
+/// - Schema export (CLI) and data export (server) run in different processes
+/// - Using bare paths would split the snapshot across machines
+///
+/// Supported URI schemes:
+/// - `s3://bucket/path` - Amazon S3
+/// - `oss://bucket/path` - Alibaba Cloud OSS
+/// - `gs://bucket/path` or `gcs://bucket/path` - Google Cloud Storage
+/// - `azblob://container/path` - Azure Blob Storage
+/// - `file:///absolute/path` - Local filesystem
+pub fn validate_uri(uri: &str) -> Result<StorageScheme> {
+    if uri.contains("://") {
+        // The scheme itself is checked by `StorageScheme::from_uri`.
+        return StorageScheme::from_uri(uri);
+    }
+
+    InvalidUriSnafu {
+        uri,
+        reason: "URI must have a scheme (e.g., s3://, file://). Bare paths are not supported.",
+    }
+    .fail()
+}
+
+fn schema_index_path() -> String {
+    format!("{SCHEMA_DIR}/{SCHEMAS_FILE}") // snapshot-relative path of the schema index
+}
+
+/// Extracts the absolute filesystem path from a `file://` URI (host empty or `localhost`).
+fn extract_file_path_from_uri(uri: &str) -> Result<String> {
+    let url = Url::parse(uri).context(UrlParseSnafu)?;
+
+    let host = url.host_str().unwrap_or_default();
+    if !(host.is_empty() || host == "localhost") {
+        return InvalidUriSnafu {
+            uri,
+            reason: "file:// URI must use an absolute path like file:///tmp/backup",
+        }
+        .fail();
+    }
+
+    let Ok(path) = url.to_file_path() else {
+        return InvalidUriSnafu {
+            uri,
+            reason: "file:// URI must use a valid absolute filesystem path",
+        }
+        .fail();
+    };
+    Ok(path.to_string_lossy().into_owned())
+}
+
+/// Fails with `SnapshotNotFound` unless `storage` reports an existing snapshot.
+async fn ensure_snapshot_exists(storage: &OpenDalStorage) -> Result<()> {
+    if !storage.exists().await? {
+        return SnapshotNotFoundSnafu {
+            uri: storage.target_uri.as_str(),
+        }
+        .fail();
+    }
+    Ok(())
+}
+
+/// Snapshot storage abstraction.
+///
+/// Async operations for reading, writing and deleting the files of one snapshot.
+#[async_trait]
+pub trait SnapshotStorage: Send + Sync {
+    /// Checks if a snapshot exists at this location (manifest.json exists).
+    async fn exists(&self) -> Result<bool>;
+
+    /// Reads the manifest file and returns it as a [`Manifest`].
+    async fn read_manifest(&self) -> Result<Manifest>;
+
+    /// Writes `manifest` to the manifest file.
+    async fn write_manifest(&self, manifest: &Manifest) -> Result<()>;
+
+    /// Writes the schema index to schema/schemas.json.
+    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()>;
+
+    /// Writes `content` as a text file at `path`, relative to the snapshot root.
+    async fn write_text(&self, path: &str, content: &str) -> Result<()>;
+
+    /// Reads the text file at `path`, relative to the snapshot root.
+    async fn read_text(&self, path: &str) -> Result<String>;
+
+    /// Deletes the entire snapshot (for --force).
+    async fn delete_snapshot(&self) -> Result<()>;
+}
+
+/// OpenDAL-based implementation of SnapshotStorage.
+pub struct OpenDalStorage {
+    // Backend handle through which every read, write, stat and delete is issued.
+    object_store: ObjectStore,
+    // The URI this storage was created from; reported in the
+    // "snapshot not found" error.
+    target_uri: String,
+}
+
+impl OpenDalStorage {
+    /// Pairs an already-built object store with the URI the caller asked for.
+    fn new_operator_rooted(object_store: ObjectStore, target_uri: &str) -> Self {
+        let target_uri = target_uri.to_string();
+        Self {
+            object_store,
+            target_uri,
+        }
+    }
+
+    /// Finishes a local store: instrumentation layers only.
+    fn finish_local_store(object_store: ObjectStore) -> ObjectStore {
+        with_instrument_layers(object_store, false)
+    }
+
+    /// Finishes a remote store: retry layers first, then instrumentation layers.
+    fn finish_remote_store(object_store: ObjectStore) -> ObjectStore {
+        let retrying = with_retry_layers(object_store);
+        with_instrument_layers(retrying, false)
+    }
+
+    /// Rejects `uri` with `reason` unless the matching backend flag is set.
+    fn ensure_backend_enabled(uri: &str, enabled: bool, reason: &'static str) -> Result<()> {
+        if !enabled {
+            return InvalidUriSnafu { uri, reason }.fail();
+        }
+        Ok(())
+    }
+
+    /// Turns a backend config validation failure into an invalid-URI error
+    /// that names the backend and carries the validator's message.
+    fn validate_remote_config<E: std::fmt::Display>(
+        uri: &str,
+        backend: &str,
+        result: std::result::Result<(), E>,
+    ) -> Result<()> {
+        match result {
+            Ok(()) => Ok(()),
+            Err(error) => InvalidUriSnafu {
+                uri,
+                reason: format!("invalid {} config: {}", backend, error),
+            }
+            .fail(),
+        }
+    }
+
+    /// Creates a new storage from a file:// URI.
+    ///
+    /// The filesystem backend is rooted at the path extracted from the URI.
+    pub fn from_file_uri(uri: &str) -> Result<Self> {
+        let root = extract_file_path_from_uri(uri)?;
+
+        let raw_store = ObjectStore::new(Fs::default().root(&root))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        let object_store = Self::finish_local_store(raw_store);
+        Ok(Self::new_operator_rooted(object_store, uri))
+    }
+
+    /// Same as [`Self::from_file_uri`], but refuses to proceed when any
+    /// remote backend flag is set in `storage`.
+    fn from_file_uri_with_config(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        let remote_flags = [
+            storage.enable_s3,
+            storage.enable_oss,
+            storage.enable_gcs,
+            storage.enable_azblob,
+        ];
+        if remote_flags.contains(&true) {
+            return InvalidUriSnafu {
+                uri,
+                reason: "file:// cannot be used with remote storage flags",
+            }
+            .fail();
+        }
+
+        Self::from_file_uri(uri)
+    }
+
+    /// Builds an S3-backed storage. The bucket and root taken from the URI
+    /// overwrite the corresponding fields of the cloned S3 config.
+    fn from_s3_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        let enabled = storage.enable_s3;
+        Self::ensure_backend_enabled(uri, enabled, "s3:// requires --s3 and related options")?;
+
+        let remote = extract_remote_location(uri)?;
+        let mut s3_config = storage.s3.clone();
+        s3_config.s3_bucket = remote.bucket_or_container;
+        s3_config.s3_root = remote.root;
+        Self::validate_remote_config(uri, "s3", s3_config.validate())?;
+
+        let connection: S3Connection = s3_config.into();
+        let raw_store = ObjectStore::new(S3::from(&connection))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        let object_store = Self::finish_remote_store(raw_store);
+        Ok(Self::new_operator_rooted(object_store, uri))
+    }
+
+    /// Builds an OSS-backed storage. The bucket and root taken from the URI
+    /// overwrite the corresponding fields of the cloned OSS config.
+    fn from_oss_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        let enabled = storage.enable_oss;
+        Self::ensure_backend_enabled(uri, enabled, "oss:// requires --oss and related options")?;
+
+        let remote = extract_remote_location(uri)?;
+        let mut oss_config = storage.oss.clone();
+        oss_config.oss_bucket = remote.bucket_or_container;
+        oss_config.oss_root = remote.root;
+        Self::validate_remote_config(uri, "oss", oss_config.validate())?;
+
+        let connection: OssConnection = oss_config.into();
+        let raw_store = ObjectStore::new(Oss::from(&connection))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        let object_store = Self::finish_remote_store(raw_store);
+        Ok(Self::new_operator_rooted(object_store, uri))
+    }
+
+    /// Builds a GCS-backed storage. The bucket and root taken from the URI
+    /// overwrite the corresponding fields of the cloned GCS config.
+    fn from_gcs_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        let enabled = storage.enable_gcs;
+        Self::ensure_backend_enabled(
+            uri,
+            enabled,
+            "gs:// or gcs:// requires --gcs and related options",
+        )?;
+
+        let remote = extract_remote_location(uri)?;
+        let mut gcs_config = storage.gcs.clone();
+        gcs_config.gcs_bucket = remote.bucket_or_container;
+        gcs_config.gcs_root = remote.root;
+        Self::validate_remote_config(uri, "gcs", gcs_config.validate())?;
+
+        let connection: GcsConnection = gcs_config.into();
+        let raw_store = ObjectStore::new(Gcs::from(&connection))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        let object_store = Self::finish_remote_store(raw_store);
+        Ok(Self::new_operator_rooted(object_store, uri))
+    }
+
+    /// Builds an Azure Blob-backed storage. The container and root taken from
+    /// the URI overwrite the corresponding fields of the cloned Azblob config.
+    fn from_azblob_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        let enabled = storage.enable_azblob;
+        Self::ensure_backend_enabled(
+            uri,
+            enabled,
+            "azblob:// requires --azblob and related options",
+        )?;
+
+        let remote = extract_remote_location(uri)?;
+        let mut azblob_config = storage.azblob.clone();
+        azblob_config.azblob_container = remote.bucket_or_container;
+        azblob_config.azblob_root = remote.root;
+        Self::validate_remote_config(uri, "azblob", azblob_config.validate())?;
+
+        let connection: AzblobConnection = azblob_config.into();
+        let raw_store = ObjectStore::new(Azblob::from(&connection))
+            .context(BuildObjectStoreSnafu)?
+            .finish();
+        let object_store = Self::finish_remote_store(raw_store);
+        Ok(Self::new_operator_rooted(object_store, uri))
+    }
+
+    /// Creates a new storage from a URI and object store config.
+    ///
+    /// The URI scheme selects the backend-specific constructor.
+    pub fn from_uri(uri: &str, storage: &ObjectStoreConfig) -> Result<Self> {
+        let scheme = StorageScheme::from_uri(uri)?;
+        match scheme {
+            StorageScheme::S3 => Self::from_s3_uri(uri, storage),
+            StorageScheme::Oss => Self::from_oss_uri(uri, storage),
+            StorageScheme::Gcs => Self::from_gcs_uri(uri, storage),
+            StorageScheme::Azblob => Self::from_azblob_uri(uri, storage),
+            StorageScheme::File => Self::from_file_uri_with_config(uri, storage),
+        }
+    }
+
+    /// Reads a file as bytes.
+    async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
+        let operation = format!("read {}", path);
+        let buffer = self
+            .object_store
+            .read(path)
+            .await
+            .context(StorageOperationSnafu { operation })?;
+        Ok(buffer.to_vec())
+    }
+
+    /// Writes bytes to a file.
+    async fn write_file(&self, path: &str, data: Vec<u8>) -> Result<()> {
+        let operation = format!("write {}", path);
+        self.object_store
+            .write(path, data)
+            .await
+            .context(StorageOperationSnafu { operation })?;
+        Ok(())
+    }
+
+    /// Checks if a file exists using stat.
+    ///
+    /// A `NotFound` error means "does not exist"; any other error is propagated.
+    async fn file_exists(&self, path: &str) -> Result<bool> {
+        let error = match self.object_store.stat(path).await {
+            Ok(_) => return Ok(true),
+            Err(error) => error,
+        };
+
+        if error.kind() == object_store::ErrorKind::NotFound {
+            return Ok(false);
+        }
+
+        Err(error).context(StorageOperationSnafu {
+            operation: format!("check exists {}", path),
+        })
+    }
+
+    /// Test-only: loads the schema index, yielding an empty snapshot when the
+    /// index file is absent.
+    #[cfg(test)]
+    pub async fn read_schema(&self) -> Result<SchemaSnapshot> {
+        let schemas_path = schema_index_path();
+        if !self.file_exists(&schemas_path).await? {
+            return Ok(SchemaSnapshot { schemas: vec![] });
+        }
+
+        let bytes = self.read_file(&schemas_path).await?;
+        let schemas: Vec<SchemaDefinition> =
+            serde_json::from_slice(&bytes).context(ManifestParseSnafu)?;
+        Ok(SchemaSnapshot { schemas })
+    }
+}
+
+#[async_trait]
+impl SnapshotStorage for OpenDalStorage {
+    /// A snapshot exists when the manifest file can be stat'ed.
+    async fn exists(&self) -> Result<bool> {
+        self.file_exists(MANIFEST_FILE).await
+    }
+
+    /// Loads and parses the manifest, failing first if no snapshot is present.
+    async fn read_manifest(&self) -> Result<Manifest> {
+        ensure_snapshot_exists(self).await?;
+
+        let bytes = self.read_file(MANIFEST_FILE).await?;
+        let manifest: Manifest = serde_json::from_slice(&bytes).context(ManifestParseSnafu)?;
+        Ok(manifest)
+    }
+
+    /// Serializes the manifest as pretty-printed JSON and stores it.
+    async fn write_manifest(&self, manifest: &Manifest) -> Result<()> {
+        let encoded = serde_json::to_vec_pretty(manifest).context(ManifestSerializeSnafu)?;
+        self.write_file(MANIFEST_FILE, encoded).await
+    }
+
+    /// Serializes the schema list as pretty-printed JSON and stores it at the
+    /// schema index path.
+    async fn write_schema(&self, schema: &SchemaSnapshot) -> Result<()> {
+        let encoded =
+            serde_json::to_vec_pretty(&schema.schemas).context(ManifestSerializeSnafu)?;
+        let index_path = schema_index_path();
+        self.write_file(&index_path, encoded).await
+    }
+
+    /// Stores `content` as raw bytes at `path`.
+    async fn write_text(&self, path: &str, content: &str) -> Result<()> {
+        let bytes = Vec::from(content.as_bytes());
+        self.write_file(path, bytes).await
+    }
+
+    /// Reads `path` and decodes it as UTF-8, failing on invalid sequences.
+    async fn read_text(&self, path: &str) -> Result<String> {
+        let bytes = self.read_file(path).await?;
+        String::from_utf8(bytes).context(TextDecodeSnafu)
+    }
+
+    /// Removes everything under the store's root.
+    async fn delete_snapshot(&self) -> Result<()> {
+        let removal = self.object_store.remove_all("/").await;
+        removal.context(StorageOperationSnafu {
+            operation: "delete snapshot",
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::path::Path;
+
+    use object_store::ObjectStore;
+    use object_store::services::Fs;
+    use tempfile::tempdir;
+    use url::Url;
+
+    use super::*;
+    use crate::data::export_v2::manifest::{DataFormat, TimeRange};
+    use crate::data::export_v2::schema::SchemaDefinition;
+
+    /// Builds a storage whose filesystem backend is rooted at `dir`.
+    fn make_storage_with_rooted_fs(dir: &Path) -> OpenDalStorage {
+        let root = dir.to_str().unwrap();
+        let raw_store = ObjectStore::new(Fs::default().root(root))
+            .unwrap()
+            .finish();
+        let target_uri = Url::from_directory_path(dir).unwrap();
+        OpenDalStorage::new_operator_rooted(
+            OpenDalStorage::finish_local_store(raw_store),
+            target_uri.as_ref(),
+        )
+    }
+
+    /// Every supported scheme maps to its `StorageScheme` variant.
+    #[test]
+    fn test_validate_uri_valid() {
+        let cases = [
+            ("s3://bucket/path", StorageScheme::S3),
+            ("oss://bucket/path", StorageScheme::Oss),
+            ("gs://bucket/path", StorageScheme::Gcs),
+            ("gcs://bucket/path", StorageScheme::Gcs),
+            ("azblob://container/path", StorageScheme::Azblob),
+            ("file:///tmp/backup", StorageScheme::File),
+        ];
+
+        for (uri, expected) in cases {
+            assert_eq!(validate_uri(uri).unwrap(), expected);
+        }
+    }
+
+    /// Bare paths and unknown schemes are rejected.
+    #[test]
+    fn test_validate_uri_invalid() {
+        // Bare paths should be rejected
+        for bare_path in ["/tmp/backup", "./backup", "backup"] {
+            assert!(validate_uri(bare_path).is_err());
+        }
+
+        // Unknown schemes
+        assert!(validate_uri("ftp://server/path").is_err());
+    }
+
+    /// Remote URIs without a path below the bucket/container are rejected.
+    #[test]
+    fn test_extract_remote_location_requires_non_empty_root() {
+        let rootless = [
+            "s3://bucket",
+            "s3://bucket/",
+            "oss://bucket",
+            "gs://bucket",
+            "azblob://container",
+        ];
+
+        for uri in rootless {
+            assert!(extract_remote_location(uri).is_err());
+        }
+    }
+
+    /// Both the empty-host and `localhost` forms resolve to the same path.
+    #[cfg(not(windows))]
+    #[test]
+    fn test_extract_path_from_uri_unix_examples() {
+        for uri in ["file:///tmp/backup", "file://localhost/tmp/backup"] {
+            assert_eq!(extract_file_path_from_uri(uri).unwrap(), "/tmp/backup");
+        }
+    }
+
+    /// `file://tmp/backup` parses `tmp` as a host and must be refused.
+    #[test]
+    fn test_extract_file_path_from_uri_rejects_file_host() {
+        let outcome = extract_file_path_from_uri("file://tmp/backup");
+        assert!(outcome.is_err());
+    }
+
+    /// A directory URL produced by `Url` converts back to the same path.
+    #[test]
+    fn test_extract_file_path_from_uri_round_trips_directory_url() {
+        let dir = tempdir().unwrap();
+        let directory_url = Url::from_directory_path(dir.path()).unwrap().to_string();
+
+        let extracted = extract_file_path_from_uri(&directory_url).unwrap();
+
+        assert_eq!(Path::new(&extracted), dir.path());
+    }
+
+    /// Reading a missing manifest yields an error that names the requested URI.
+    #[tokio::test]
+    async fn test_read_manifest_reports_requested_uri() {
+        let dir = tempdir().unwrap();
+        let requested = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let storage = OpenDalStorage::from_file_uri(&requested).unwrap();
+
+        let message = storage.read_manifest().await.unwrap_err().to_string();
+
+        assert!(message.contains(requested.as_str()));
+    }
+
+    /// A manifest written to storage reads back with the same key fields.
+    #[tokio::test]
+    async fn test_manifest_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        let original = Manifest::new_full(
+            "greptime".to_string(),
+            vec!["public".to_string()],
+            TimeRange::unbounded(),
+            DataFormat::Parquet,
+        );
+
+        storage.write_manifest(&original).await.unwrap();
+        let restored = storage.read_manifest().await.unwrap();
+
+        assert_eq!(restored.catalog, original.catalog);
+        assert_eq!(restored.schemas, original.schemas);
+        assert_eq!(restored.schema_only, original.schema_only);
+        assert_eq!(restored.format, original.format);
+        assert_eq!(restored.snapshot_id, original.snapshot_id);
+    }
+
+    /// A schema snapshot written to storage reads back unchanged.
+    #[tokio::test]
+    async fn test_schema_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        let definition = SchemaDefinition {
+            catalog: "greptime".to_string(),
+            name: "test_db".to_string(),
+            options: HashMap::from([("ttl".to_string(), "7d".to_string())]),
+        };
+        let mut original = SchemaSnapshot::new();
+        original.add_schema(definition);
+
+        storage.write_schema(&original).await.unwrap();
+        let restored = storage.read_schema().await.unwrap();
+
+        assert_eq!(restored, original);
+    }
+
+    /// Text written to a nested relative path reads back unchanged.
+    #[tokio::test]
+    async fn test_text_round_trip() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+        let ddl_path = "schema/ddl/public.sql";
+        let ddl = "CREATE TABLE metrics (ts TIMESTAMP TIME INDEX);";
+
+        storage.write_text(ddl_path, ddl).await.unwrap();
+        let restored = storage.read_text(ddl_path).await.unwrap();
+
+        assert_eq!(restored, ddl);
+    }
+
+    /// Bytes that are not valid UTF-8 make `read_text` fail with a UTF-8 error.
+    #[tokio::test]
+    async fn test_read_text_rejects_invalid_utf8() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+        let ddl_path = "schema/ddl/public.sql";
+
+        storage
+            .write_file(ddl_path, vec![0xff, 0xfe, 0xfd])
+            .await
+            .unwrap();
+
+        let failure = storage.read_text(ddl_path).await.unwrap_err();
+        assert!(failure.to_string().contains("UTF-8"));
+    }
+
+    /// `exists` flips from false to true once a manifest has been written.
+    #[tokio::test]
+    async fn test_exists_follows_manifest_presence() {
+        let dir = tempdir().unwrap();
+        let storage = make_storage_with_rooted_fs(dir.path());
+
+        assert!(!storage.exists().await.unwrap());
+
+        let manifest =
+            Manifest::new_schema_only("greptime".to_string(), vec!["public".to_string()]);
+        storage.write_manifest(&manifest).await.unwrap();
+
+        assert!(storage.exists().await.unwrap());
+    }
+
+    /// Deleting a snapshot leaves a sibling directory of the root untouched.
+    #[tokio::test]
+    async fn test_delete_snapshot_only_removes_rooted_contents() {
+        let parent = tempdir().unwrap();
+        let snapshot_root = parent.path().join("snapshot");
+        let sibling = parent.path().join("sibling");
+        for directory in [&snapshot_root, &sibling] {
+            std::fs::create_dir_all(directory).unwrap();
+        }
+        let manifest_file = snapshot_root.join("manifest.json");
+        let sibling_file = sibling.join("keep.txt");
+        std::fs::write(&manifest_file, b"{}").unwrap();
+        std::fs::write(&sibling_file, b"keep").unwrap();
+
+        let storage = make_storage_with_rooted_fs(&snapshot_root);
+        storage.delete_snapshot().await.unwrap();
+
+        assert!(!manifest_file.exists());
+        assert!(sibling_file.exists());
+    }
+}
diff --git a/src/cli/src/data/sql.rs b/src/cli/src/data/sql.rs
new file mode 100644
index 0000000000..7de4206b26
--- /dev/null
+++ b/src/cli/src/data/sql.rs
@@ -0,0 +1,40 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared SQL escaping helpers for CLI-generated statements.
+
+/// Escapes `value` for use inside a single-quoted SQL string literal by
+/// doubling every single quote. No other character is altered.
+pub(crate) fn escape_sql_literal(value: &str) -> String {
+    let pieces: Vec<&str> = value.split('\'').collect();
+    pieces.join("''")
+}
+
+/// Escapes `value` for use inside a double-quoted SQL identifier by doubling
+/// every double quote. No other character is altered.
+pub(crate) fn escape_sql_identifier(value: &str) -> String {
+    let pieces: Vec<&str> = value.split('"').collect();
+    pieces.join("\"\"")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Plain text passes through; each `'` is doubled.
+    #[test]
+    fn test_escape_sql_literal_escapes_single_quotes() {
+        let cases = [("test_db", "test_db"), ("te'st", "te''st")];
+        for (input, expected) in cases {
+            assert_eq!(escape_sql_literal(input), expected);
+        }
+    }
+
+    /// Plain text passes through; each `"` is doubled.
+    #[test]
+    fn test_escape_sql_identifier_escapes_double_quotes() {
+        let cases = [("test_db", "test_db"), (r#"te"st"#, r#"te""st"#)];
+        for (input, expected) in cases {
+            assert_eq!(escape_sql_identifier(input), expected);
+        }
+    }
+}
diff --git a/src/cli/src/database.rs b/src/cli/src/database.rs
index db98c38e38..fa3f6faefb 100644
--- a/src/cli/src/database.rs
+++ b/src/cli/src/database.rs
@@ -36,6 +36,7 @@ pub struct DatabaseClient {
     auth_header: Option<String>,
     timeout: Duration,
     proxy: Option<reqwest::Proxy>,
+    no_proxy: bool,
 }
 
 pub fn parse_proxy_opts(
@@ -61,6 +62,7 @@ impl DatabaseClient {
         auth_basic: Option<String>,
         timeout: Duration,
         proxy: Option<reqwest::Proxy>,
+        no_proxy: bool,
     ) -> Self {
         let auth_header = if let Some(basic) = auth_basic {
             let encoded = general_purpose::STANDARD.encode(basic);
@@ -69,7 +71,9 @@ impl DatabaseClient {
             None
         };
 
-        if let Some(ref proxy) = proxy {
+        if no_proxy {
+            common_telemetry::info!("Proxy disabled");
+        } else if let Some(ref proxy) = proxy {
             common_telemetry::info!("Using proxy: {:?}", proxy);
         } else {
             common_telemetry::info!("Using system proxy(if any)");
@@ -81,6 +85,7 @@ impl DatabaseClient {
             auth_header,
             timeout,
             proxy,
+            no_proxy,
         }
     }
 
@@ -95,12 +100,14 @@ impl DatabaseClient {
             ("db", format!("{}-{}", self.catalog, schema)),
             ("sql", sql.to_string()),
         ];
-        let client = self
-            .proxy
-            .clone()
-            .map(|proxy| reqwest::Client::builder().proxy(proxy).build())
-            .unwrap_or_else(|| Ok(reqwest::Client::new()))
-            .context(BuildClientSnafu)?;
+        let mut builder = reqwest::Client::builder();
+        if let Some(proxy) = self.proxy.clone() {
+            builder = builder.proxy(proxy);
+        }
+        if self.no_proxy {
+            builder = builder.no_proxy();
+        }
+        let client = builder.build().context(BuildClientSnafu)?;
         let mut request = client
             .post(&url)
             .form(&params)
diff --git a/src/cli/src/lib.rs b/src/cli/src/lib.rs
index acf5df4086..4305da9c8f 100644
--- a/src/cli/src/lib.rs
+++ b/src/cli/src/lib.rs
@@ -29,7 +29,7 @@ pub use database::DatabaseClient;
 use error::Result;
 
 pub use crate::bench::BenchTableMetadataCommand;
-pub use crate::data::DataCommand;
+pub use crate::data::{DataCommand, export_v2, import_v2};
 pub use crate::metadata::MetadataCommand;
 
 #[async_trait]
diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs
index d8f53b9d71..f6d8674d4c 100644
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -20,13 +20,14 @@ use clap::Parser;
 use colored::Colorize;
 use datanode::config::RegionEngineConfig;
 use datanode::store;
-use either::Either;
+use futures::stream;
 use mito2::access_layer::{
     AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType,
 };
 use mito2::cache::{CacheManager, CacheManagerRef};
 use mito2::config::{FulltextIndexConfig, MitoConfig, Mode};
-use mito2::read::Source;
+use mito2::read::FlatSource;
+use mito2::sst::FormatType;
 use mito2::sst::file::{FileHandle, FileMeta};
 use mito2::sst::file_purger::{FilePurger, FilePurgerRef};
 use mito2::sst::index::intermediate::IntermediateManager;
@@ -210,6 +211,7 @@ impl ObjbenchCommand {
             object_store.clone(),
         )
         .expected_metadata(Some(region_meta.clone()))
+        .flat_format(true)
         .build()
         .await
         .map_err(|e| {
@@ -231,6 +233,10 @@ impl ObjbenchCommand {
         let reader_build_elapsed = reader_build_start.elapsed();
         let total_rows = reader.parquet_metadata().file_metadata().num_rows();
         println!("{} Reader built in {:?}", "✓".green(), reader_build_elapsed);
+        let reader_stream = Box::pin(stream::try_unfold(reader, |mut reader| async move {
+            let batch = reader.next_record_batch().await?;
+            Ok(batch.map(|batch| (batch, reader)))
+        }));
 
         // Build write request
         let fulltext_index_config = FulltextIndexConfig {
@@ -241,10 +247,11 @@ impl ObjbenchCommand {
         let write_req = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata: region_meta,
-            source: Either::Left(Source::Reader(Box::new(reader))),
+            source: FlatSource::Stream(reader_stream),
             cache_manager,
             storage: None,
             max_sequence: None,
+            sst_write_format: FormatType::PrimaryKey,
             index_options: Default::default(),
             index_config: mito_engine_config.index.clone(),
             inverted_index_config: MitoConfig::default().inverted_index,
diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs
index 92638d3c4a..215bea0ec5 100644
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -32,14 +32,15 @@ use common_meta::cache::LayeredCacheRegistryBuilder;
 use common_meta::ddl::flow_meta::FlowMetadataAllocator;
 use common_meta::ddl::table_meta::TableMetadataAllocator;
 use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl};
-use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef};
+use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef, DdlManagerRef};
 use common_meta::key::flow::FlowMetadataManager;
 use common_meta::key::{TableMetadataManager, TableMetadataManagerRef};
 use common_meta::kv_backend::KvBackendRef;
-use common_meta::procedure_executor::LocalProcedureExecutor;
+use common_meta::node_manager::{FlownodeRef, NodeManagerRef};
+use common_meta::procedure_executor::{LocalProcedureExecutor, ProcedureExecutorRef};
 use common_meta::region_keeper::MemoryRegionKeeper;
 use common_meta::region_registry::LeaderRegionRegistry;
-use common_meta::sequence::SequenceBuilder;
+use common_meta::sequence::{Sequence, SequenceBuilder};
 use common_meta::wal_provider::{WalProviderRef, build_wal_provider};
 use common_procedure::ProcedureManagerRef;
 use common_query::prelude::set_default_prefix;
@@ -49,6 +50,7 @@ use common_time::timezone::set_default_timezone;
 use common_version::{short_version, verbose_version};
 use datanode::config::DatanodeOptions;
 use datanode::datanode::{Datanode, DatanodeBuilder};
+use datanode::region_server::RegionServer;
 use flow::{
     FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker,
     GrpcQueryHandlerWithBoxedError,
@@ -58,6 +60,7 @@ use frontend::instance::StandaloneDatanodeManager;
 use frontend::instance::builder::FrontendBuilder;
 use frontend::server::Services;
 use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ};
+use plugins::PluginOptions;
 use plugins::frontend::context::{
     CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext,
 };
@@ -130,6 +133,18 @@ impl Instance {
     pub fn server_addr(&self, name: &str) -> Option<SocketAddr> {
         self.frontend.server_handlers().addr(name)
     }
+
+    /// Get the mutable Frontend component of this Standalone instance for external modification
+    /// by others (might not be in this code base, so don't delete this function).
+    pub fn mut_frontend(&mut self) -> &mut Frontend {
+        &mut self.frontend
+    }
+
+    /// Get the Datanode component of this Standalone instance for external usage
+    /// by others (might not be in this code base, so don't delete this function).
+    pub fn datanode(&self) -> &Datanode {
+        &self.datanode
+    }
 }
 
 #[async_trait]
@@ -342,9 +357,18 @@ impl StartCommand {
         info!("Standalone start command: {:#?}", self);
         info!("Standalone options: {opts:#?}");
 
+        let (mut instance, _) =
+            Self::build_with(opts.component, opts.plugins, InstanceCreator::default()).await?;
+        instance._guard.extend(guard);
+        Ok(instance)
+    }
+
+    pub async fn build_with(
+        mut opts: StandaloneOptions,
+        plugin_opts: Vec<PluginOptions>,
+        creator: InstanceCreator,
+    ) -> Result<(Instance, InstanceCreatorResult)> {
         let mut plugins = Plugins::new();
-        let plugin_opts = opts.plugins;
-        let mut opts = opts.component;
         set_default_prefix(opts.default_column_prefix.as_deref())
             .map_err(BoxedError::new)
             .context(error::BuildCliSnafu)?;
@@ -462,17 +486,16 @@ impl StartCommand {
                 .await;
         }
 
-        let node_manager = Arc::new(StandaloneDatanodeManager {
-            region_server: datanode.region_server(),
-            flow_server: flownode.flow_engine(),
-        });
+        let node_manager = creator
+            .node_manager_creator
+            .create(
+                &kv_backend,
+                datanode.region_server(),
+                flownode.flow_engine(),
+            )
+            .await?;
 
-        let table_id_allocator = Arc::new(
-            SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone())
-                .initial(MIN_USER_TABLE_ID as u64)
-                .step(10)
-                .build(),
-        );
+        let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend);
         let flow_id_sequence = Arc::new(
             SequenceBuilder::new(FLOW_ID_SEQ, kv_backend.clone())
                 .initial(MIN_USER_FLOW_ID as u64)
@@ -489,7 +512,7 @@ impl StartCommand {
             .context(error::BuildWalProviderSnafu)?;
         let wal_provider = Arc::new(wal_provider);
         let table_metadata_allocator = Arc::new(TableMetadataAllocator::new(
-            table_id_allocator,
+            table_id_allocator.clone(),
             wal_provider.clone(),
         ));
         let flow_metadata_allocator = Arc::new(FlowMetadataAllocator::with_noop_peer_allocator(
@@ -532,10 +555,10 @@ impl StartCommand {
             ddl_manager
         };
 
-        let procedure_executor = Arc::new(LocalProcedureExecutor::new(
-            Arc::new(ddl_manager),
-            procedure_manager.clone(),
-        ));
+        let procedure_executor = creator
+            .procedure_executor_creator
+            .create(Arc::new(ddl_manager), procedure_manager.clone())
+            .await?;
 
         let fe_instance = FrontendBuilder::new(
             fe_opts.clone(),
@@ -568,7 +591,7 @@ impl StartCommand {
             kv_backend.clone(),
             layered_cache_registry.clone(),
             procedure_executor,
-            node_manager,
+            node_manager.clone(),
         )
         .await
         .context(StartFlownodeSnafu)?;
@@ -584,14 +607,20 @@ impl StartCommand {
             heartbeat_task: None,
         };
 
-        Ok(Instance {
+        let instance = Instance {
             datanode,
             frontend,
             flownode,
             procedure_manager,
             wal_provider,
-            _guard: guard,
-        })
+            _guard: vec![],
+        };
+        let result = InstanceCreatorResult {
+            kv_backend,
+            node_manager,
+            table_id_allocator,
+        };
+        Ok((instance, result))
     }
 
     pub async fn create_table_metadata_manager(
@@ -608,6 +637,115 @@ impl StartCommand {
     }
 }
 
+/// Factory for the node manager used when building a Standalone instance.
+///
+/// `create` receives the kv backend plus the datanode's region server and the
+/// flownode's flow engine, and returns the [`NodeManagerRef`] to use.
+#[async_trait]
+pub trait NodeManagerCreator {
+    async fn create(
+        &self,
+        kv_backend: &KvBackendRef,
+        region_server: RegionServer,
+        flow_server: FlownodeRef,
+    ) -> Result<NodeManagerRef>;
+}
+
+/// Stock [`NodeManagerCreator`]: wraps the region server and flow server in a
+/// [`StandaloneDatanodeManager`]. The kv backend is not used.
+pub struct DefaultNodeManagerCreator;
+
+#[async_trait]
+impl NodeManagerCreator for DefaultNodeManagerCreator {
+    async fn create(
+        &self,
+        _: &KvBackendRef,
+        region_server: RegionServer,
+        flow_server: FlownodeRef,
+    ) -> Result<NodeManagerRef> {
+        let manager = StandaloneDatanodeManager {
+            region_server,
+            flow_server,
+        };
+        Ok(Arc::new(manager))
+    }
+}
+
+/// Factory for the table id allocator used when building a Standalone instance.
+///
+/// `create` receives the kv backend and returns the shared [`Sequence`] that
+/// is handed to the table metadata allocator.
+pub trait TableIdAllocatorCreator {
+    fn create(&self, kv_backend: &KvBackendRef) -> Arc<Sequence>;
+}
+
+/// Stock [`TableIdAllocatorCreator`]: a [`Sequence`] named `TABLE_ID_SEQ` on the
+/// given kv backend, built with initial value `MIN_USER_TABLE_ID` and `step(10)`.
+///
+/// Public, like the other `Default*Creator` types, so that callers of
+/// `InstanceCreator::new` who customise only some components can still pass
+/// the stock allocator for this one.
+pub struct DefaultTableIdAllocatorCreator;
+
+impl TableIdAllocatorCreator for DefaultTableIdAllocatorCreator {
+    fn create(&self, kv_backend: &KvBackendRef) -> Arc<Sequence> {
+        Arc::new(
+            SequenceBuilder::new(TABLE_ID_SEQ, kv_backend.clone())
+                .initial(MIN_USER_TABLE_ID as u64)
+                .step(10)
+                .build(),
+        )
+    }
+}
+
+/// Factory for the procedure executor used when building a Standalone instance.
+///
+/// `create` receives the DDL manager and the procedure manager and returns the
+/// [`ProcedureExecutorRef`] to use.
+#[async_trait]
+pub trait ProcedureExecutorCreator {
+    async fn create(
+        &self,
+        ddl_manager: DdlManagerRef,
+        procedure_manager: ProcedureManagerRef,
+    ) -> Result<ProcedureExecutorRef>;
+}
+
+/// Stock [`ProcedureExecutorCreator`]: builds a [`LocalProcedureExecutor`] from
+/// the given DDL manager and procedure manager.
+pub struct DefaultProcedureExecutorCreator;
+
+#[async_trait]
+impl ProcedureExecutorCreator for DefaultProcedureExecutorCreator {
+    async fn create(
+        &self,
+        ddl_manager: DdlManagerRef,
+        procedure_manager: ProcedureManagerRef,
+    ) -> Result<ProcedureExecutorRef> {
+        let executor = LocalProcedureExecutor::new(ddl_manager, procedure_manager);
+        Ok(Arc::new(executor))
+    }
+}
+
+/// `InstanceCreator` bundles the component creators consulted while building
+/// the Standalone instance, so that callers can customise how it is built.
+pub struct InstanceCreator {
+    node_manager_creator: Box<dyn NodeManagerCreator>,
+    table_id_allocator_creator: Box<dyn TableIdAllocatorCreator>,
+    procedure_executor_creator: Box<dyn ProcedureExecutorCreator>,
+}
+
+impl InstanceCreator {
+    /// Assembles an `InstanceCreator` from one creator per customisable component.
+    pub fn new(
+        node_manager_creator: Box<dyn NodeManagerCreator>,
+        table_id_allocator_creator: Box<dyn TableIdAllocatorCreator>,
+        procedure_executor_creator: Box<dyn ProcedureExecutorCreator>,
+    ) -> Self {
+        Self {
+            node_manager_creator,
+            table_id_allocator_creator,
+            procedure_executor_creator,
+        }
+    }
+}
+
+impl Default for InstanceCreator {
+    /// Uses the stock `Default*Creator` for every component.
+    fn default() -> Self {
+        Self::new(
+            Box::new(DefaultNodeManagerCreator),
+            Box::new(DefaultTableIdAllocatorCreator),
+            Box::new(DefaultProcedureExecutorCreator),
+        )
+    }
+}
+
+/// `InstanceCreatorResult` is expected to be used paired with [InstanceCreator].
+/// It stores the created and other important components for further reusing.
+pub struct InstanceCreatorResult {
+    /// The kv backend that was passed to the component creators.
+    pub kv_backend: KvBackendRef,
+    /// The node manager returned by the node manager creator.
+    pub node_manager: NodeManagerRef,
+    /// The table id allocator returned by the table id allocator creator.
+    pub table_id_allocator: Arc<Sequence>,
+}
+
 #[cfg(test)]
 mod tests {
     use std::default::Default;
diff --git a/src/common/config/src/config.rs b/src/common/config/src/config.rs
index e25c46a0c0..85ce3d206f 100644
--- a/src/common/config/src/config.rs
+++ b/src/common/config/src/config.rs
@@ -53,7 +53,7 @@ pub trait Configurable: Serialize + DeserializeOwned + Default + Sized {
 
             env.try_parsing(true)
                 .separator(ENV_VAR_SEP)
-                .ignore_empty(true)
+                .ignore_empty(false)
         };
 
         // Workaround: Replacement for `Config::try_from(&default_opts)` due to
@@ -237,4 +237,31 @@ mod tests {
             },
         );
     }
+
+    #[derive(Debug, Serialize, Deserialize, Default)]
+    struct SimpleConfig {
+        name: Option<String>,
+        prefix: Option<String>,
+    }
+
+    impl Configurable for SimpleConfig {}
+
+    #[test]
+    fn test_empty_env_var_is_not_ignored() {
+        let env_prefix = "SIMPLE_CFG_UT";
+        temp_env::with_vars(
+            [(
+                [env_prefix.to_string(), "PREFIX".to_string()].join(ENV_VAR_SEP),
+                Some(""),
+            )],
+            || {
+                let opts = SimpleConfig::load_layered_options(None, env_prefix).unwrap();
+                // With ignore_empty(false), an empty env var should yield Some("")
+                // rather than None (which was the previous behavior with ignore_empty(true)).
+                assert_eq!(opts.prefix, Some("".to_string()));
+                // Unset env var should remain None.
+                assert_eq!(opts.name, None);
+            },
+        );
+    }
 }
diff --git a/src/common/function/src/aggrs/aggr_wrapper.rs b/src/common/function/src/aggrs/aggr_wrapper.rs
index 3780d39582..6242ab9454 100644
--- a/src/common/function/src/aggrs/aggr_wrapper.rs
+++ b/src/common/function/src/aggrs/aggr_wrapper.rs
@@ -25,7 +25,7 @@
 use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
-use arrow::array::StructArray;
+use arrow::array::{ArrayRef, BooleanArray, StructArray};
 use arrow_schema::{FieldRef, Fields};
 use common_telemetry::debug;
 use datafusion::functions_aggregate::all_default_aggregate_functions;
@@ -38,8 +38,8 @@ use datafusion_common::{Column, ScalarValue};
 use datafusion_expr::expr::{AggregateFunction, AggregateFunctionParams};
 use datafusion_expr::function::StateFieldsArgs;
 use datafusion_expr::{
-    Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, Expr, ExprSchemable, LogicalPlan,
-    Signature,
+    Accumulator, Aggregate, AggregateUDF, AggregateUDFImpl, EmitTo, Expr, ExprSchemable,
+    GroupsAccumulator, LogicalPlan, Signature,
 };
 use datafusion_physical_expr::aggregate::AggregateFunctionExpr;
 use datatypes::arrow::datatypes::{DataType, Field};
@@ -322,6 +322,14 @@ impl StateWrapper {
             );
         })
     }
+
+    fn fix_inner_acc_args<'b>(
+        &self,
+        mut acc_args: datafusion_expr::function::AccumulatorArgs<'b>,
+    ) -> datafusion_common::Result<datafusion_expr::function::AccumulatorArgs<'b>> {
+        acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?;
+        Ok(acc_args)
+    }
 }
 
 impl AggregateUDFImpl for StateWrapper {
@@ -331,15 +339,32 @@ impl AggregateUDFImpl for StateWrapper {
     ) -> datafusion_common::Result<Box<dyn Accumulator>> {
         // fix and recover proper acc args for the original aggregate function.
         let state_type = acc_args.return_type().clone();
-        let inner = {
-            let mut new_acc_args = acc_args.clone();
-            new_acc_args.return_field = self.deduce_aggr_return_type(&acc_args)?;
-            self.inner.accumulator(new_acc_args)?
-        };
+        let inner = self.inner.accumulator(self.fix_inner_acc_args(acc_args)?)?;
 
         Ok(Box::new(StateAccum::new(inner, state_type)?))
     }
 
+    fn groups_accumulator_supported(
+        &self,
+        acc_args: datafusion_expr::function::AccumulatorArgs,
+    ) -> bool {
+        self.fix_inner_acc_args(acc_args)
+            .map(|args| self.inner.inner().groups_accumulator_supported(args))
+            .unwrap_or(false)
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        acc_args: datafusion_expr::function::AccumulatorArgs,
+    ) -> datafusion_common::Result<Box<dyn GroupsAccumulator>> {
+        let state_type = acc_args.return_type().clone();
+        let inner = self
+            .inner
+            .inner()
+            .create_groups_accumulator(self.fix_inner_acc_args(acc_args)?)?;
+        Ok(Box::new(StateGroupsAccum::new(inner, state_type)?))
+    }
+
     fn as_any(&self) -> &dyn std::any::Any {
         self
     }
@@ -462,6 +487,118 @@ pub struct StateAccum {
     state_fields: Fields,
 }
 
+pub struct StateGroupsAccum {
+    inner: Box<dyn GroupsAccumulator>,
+    state_fields: Fields,
+}
+
+impl StateGroupsAccum {
+    fn new(
+        inner: Box<dyn GroupsAccumulator>,
+        state_type: DataType,
+    ) -> datafusion_common::Result<Self> {
+        let DataType::Struct(fields) = state_type else {
+            return Err(datafusion_common::DataFusionError::Internal(format!(
+                "Expected a struct type for state, got: {:?}",
+                state_type
+            )));
+        };
+        Ok(Self {
+            inner,
+            state_fields: fields,
+        })
+    }
+
+    /// Bundles the inner accumulator's state columns into one struct array.
+    ///
+    /// When the column types line up with `state_fields`, those fields become
+    /// the struct schema. Otherwise the mismatch is logged at debug level and
+    /// placeholder fields named `col_{index}[mismatch_state]` are used instead.
+    fn wrap_state_arrays(&self, arrays: Vec<ArrayRef>) -> datafusion_common::Result<ArrayRef> {
+        let array_type: Vec<_> = arrays
+            .iter()
+            .map(|array| array.data_type().clone())
+            .collect();
+        // Comparing element-wise also reports a mismatch when the lengths differ.
+        let types_match = array_type
+            .iter()
+            .eq(self.state_fields.iter().map(|field| field.data_type()));
+        let fields = if types_match {
+            self.state_fields.clone()
+        } else {
+            debug!(
+                "State mismatch, expected: {}, got: {} for expected fields: {:?} and given array types: {:?}",
+                self.state_fields.len(),
+                arrays.len(),
+                self.state_fields,
+                array_type,
+            );
+            arrays
+                .iter()
+                .enumerate()
+                .map(|(index, array)| {
+                    Field::new(
+                        format!("col_{index}[mismatch_state]").as_str(),
+                        array.data_type().clone(),
+                        true,
+                    )
+                })
+                .collect::<Fields>()
+        };
+
+        Ok(Arc::new(StructArray::try_new(fields, arrays, None)?))
+    }
+}
+
+impl GroupsAccumulator for StateGroupsAccum {
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> datafusion_common::Result<()> {
+        self.inner
+            .update_batch(values, group_indices, opt_filter, total_num_groups)
+    }
+
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indices: &[usize],
+        opt_filter: Option<&BooleanArray>,
+        total_num_groups: usize,
+    ) -> datafusion_common::Result<()> {
+        self.inner
+            .merge_batch(values, group_indices, opt_filter, total_num_groups)
+    }
+
+    fn evaluate(&mut self, emit_to: EmitTo) -> datafusion_common::Result<ArrayRef> {
+        let state = self.inner.state(emit_to)?;
+        self.wrap_state_arrays(state)
+    }
+
+    fn state(&mut self, emit_to: EmitTo) -> datafusion_common::Result<Vec<ArrayRef>> {
+        self.inner.state(emit_to)
+    }
+
+    fn convert_to_state(
+        &self,
+        values: &[ArrayRef],
+        opt_filter: Option<&BooleanArray>,
+    ) -> datafusion_common::Result<Vec<ArrayRef>> {
+        self.inner.convert_to_state(values, opt_filter)
+    }
+
+    fn supports_convert_to_state(&self) -> bool {
+        self.inner.supports_convert_to_state()
+    }
+
+    fn size(&self) -> usize {
+        self.inner.size()
+    }
+}
+
 impl StateAccum {
     pub fn new(
         inner: Box<dyn Accumulator>,
diff --git a/src/common/function/src/aggrs/aggr_wrapper/tests.rs b/src/common/function/src/aggrs/aggr_wrapper/tests.rs
index 8821b9fd24..de3a77df6b 100644
--- a/src/common/function/src/aggrs/aggr_wrapper/tests.rs
+++ b/src/common/function/src/aggrs/aggr_wrapper/tests.rs
@@ -40,10 +40,13 @@ use datafusion_common::arrow::array::AsArray;
 use datafusion_common::arrow::datatypes::{Float64Type, UInt64Type};
 use datafusion_common::{Column, TableReference};
 use datafusion_expr::expr::{AggregateFunction, NullTreatment};
+use datafusion_expr::function::AccumulatorArgs;
 use datafusion_expr::{
-    Aggregate, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr, TableScan, lit,
+    Aggregate, AggregateUDFImpl, ColumnarValue, Expr, LogicalPlan, ScalarFunctionArgs, SortExpr,
+    TableScan, lit,
 };
 use datafusion_physical_expr::aggregate::AggregateExprBuilder;
+use datafusion_physical_expr::expressions::col;
 use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
 use datatypes::arrow_array::StringArray;
 use futures::{Stream, StreamExt as _};
@@ -256,6 +259,38 @@ fn dummy_table_scan_with_ts() -> LogicalPlan {
     )
 }
 
+fn create_avg_state_groups_accumulator() -> Box<dyn GroupsAccumulator> {
+    let state_wrapper = StateWrapper::new((*avg_udaf()).clone()).unwrap();
+    let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
+        "number",
+        DataType::Float64,
+        true,
+    )]));
+    let expr = col("number", &schema).unwrap();
+    let expr_field = expr.return_field(&schema).unwrap();
+    let return_field = Arc::new(Field::new(
+        "__avg_state(number)",
+        state_wrapper.return_type(&[DataType::Float64]).unwrap(),
+        true,
+    ));
+    let exprs = [expr];
+    let expr_fields = [expr_field];
+    let acc_args = AccumulatorArgs {
+        return_field,
+        schema: &schema,
+        ignore_nulls: false,
+        order_bys: &[],
+        is_reversed: false,
+        name: "__avg_state(number)",
+        is_distinct: false,
+        exprs: &exprs,
+        expr_fields: &expr_fields,
+    };
+
+    assert!(state_wrapper.groups_accumulator_supported(acc_args.clone()));
+    state_wrapper.create_groups_accumulator(acc_args).unwrap()
+}
+
 #[tokio::test]
 async fn test_sum_udaf() {
     let ctx = SessionContext::new();
@@ -796,6 +831,95 @@ async fn test_last_value_order_by_udaf() {
     assert_eq!(merge_eval_res, ScalarValue::Int64(Some(4)));
 }
 
+#[test]
+fn test_avg_state_groups_accumulator_evaluate() {
+    let mut state_accum = create_avg_state_groups_accumulator();
+    let values = vec![Arc::new(Float64Array::from(vec![
+        Some(1.0),
+        Some(2.0),
+        None,
+        Some(3.0),
+        Some(4.0),
+        Some(5.0),
+    ])) as ArrayRef];
+    let group_indices = vec![0, 1, 0, 0, 1, 2];
+
+    state_accum
+        .update_batch(&values, &group_indices, None, 3)
+        .unwrap();
+
+    let result = state_accum.evaluate(EmitTo::All).unwrap();
+    let result = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+    assert_eq!(
+        result
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap(),
+        &UInt64Array::from(vec![2, 2, 1])
+    );
+    assert_eq!(
+        result
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap(),
+        &Float64Array::from(vec![4.0, 6.0, 5.0])
+    );
+}
+
+#[test]
+fn test_avg_state_groups_accumulator_state_merge_evaluate() {
+    let mut source_accum = create_avg_state_groups_accumulator();
+    let source_values = vec![Arc::new(Float64Array::from(vec![
+        Some(1.0),
+        Some(2.0),
+        None,
+        Some(3.0),
+        Some(4.0),
+        Some(5.0),
+    ])) as ArrayRef];
+    let source_group_indices = vec![0, 1, 0, 0, 1, 2];
+
+    source_accum
+        .update_batch(&source_values, &source_group_indices, None, 3)
+        .unwrap();
+    let source_state = source_accum.state(EmitTo::All).unwrap();
+
+    let mut merged_accum = create_avg_state_groups_accumulator();
+    let merged_values =
+        vec![Arc::new(Float64Array::from(vec![Some(10.0), Some(20.0), Some(30.0)])) as ArrayRef];
+    let merged_group_indices = vec![0, 1, 2];
+
+    merged_accum
+        .update_batch(&merged_values, &merged_group_indices, None, 3)
+        .unwrap();
+    merged_accum
+        .merge_batch(&source_state, &[1, 2, 0], None, 3)
+        .unwrap();
+
+    let result = merged_accum.evaluate(EmitTo::All).unwrap();
+    let result = result.as_any().downcast_ref::<StructArray>().unwrap();
+
+    assert_eq!(
+        result
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap(),
+        &UInt64Array::from(vec![2, 3, 3])
+    );
+    assert_eq!(
+        result
+            .column(1)
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap(),
+        &Float64Array::from(vec![15.0, 24.0, 36.0])
+    );
+}
+
 /// For testing whether the UDAF state fields are correctly implemented.
 /// esp. for our own custom UDAF's state fields.
 /// By compare eval results before and after split to state/merge functions.
diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs
index 6c0cc260b2..6364dff4de 100644
--- a/src/common/function/src/scalars/json/json_to_string.rs
+++ b/src/common/function/src/scalars/json/json_to_string.rs
@@ -19,6 +19,7 @@ use datafusion_common::DataFusionError;
 use datafusion_common::arrow::array::{Array, AsArray, StringViewBuilder};
 use datafusion_common::arrow::datatypes::DataType;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
+use datatypes::types::jsonb_to_string;
 
 use crate::function::{Function, extract_args};
 
@@ -74,7 +75,7 @@ impl Function for JsonToStringFunction {
         for i in 0..size {
             let json = jsons.is_valid(i).then(|| jsons.value(i));
             let result = json
-                .map(|json| jsonb::from_slice(json).map(|x| x.to_string()))
+                .map(jsonb_to_string)
                 .transpose()
                 .map_err(|e| DataFusionError::Execution(format!("invalid json binary: {e}")))?;
 
diff --git a/src/common/memory-manager/Cargo.toml b/src/common/memory-manager/Cargo.toml
index a6be50f774..6686c98167 100644
--- a/src/common/memory-manager/Cargo.toml
+++ b/src/common/memory-manager/Cargo.toml
@@ -10,7 +10,6 @@ workspace = true
 [dependencies]
 common-error = { workspace = true }
 common-macro = { workspace = true }
-common-telemetry = { workspace = true }
 humantime = { workspace = true }
 serde = { workspace = true }
 snafu = { workspace = true }
diff --git a/src/common/memory-manager/src/guard.rs b/src/common/memory-manager/src/guard.rs
index 770b6dec24..ad3111581b 100644
--- a/src/common/memory-manager/src/guard.rs
+++ b/src/common/memory-manager/src/guard.rs
@@ -14,14 +14,13 @@
 
 use std::{fmt, mem};
 
-use common_telemetry::debug;
 use snafu::ensure;
 use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
 
 use crate::error::{
     MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
 };
-use crate::manager::{MemoryMetrics, MemoryQuota};
+use crate::manager::{MemoryMetrics, MemoryQuota, UnlimitedMemoryQuota};
 use crate::policy::OnExhaustedPolicy;
 
 /// Guard representing a slice of reserved memory.
@@ -30,31 +29,57 @@ pub struct MemoryGuard<M: MemoryMetrics> {
 }
 
 pub(crate) enum GuardState<M: MemoryMetrics> {
-    Unlimited,
+    Released,
+    Unlimited {
+        quota: UnlimitedMemoryQuota<M>,
+        granted_bytes: u64,
+    },
     Limited {
-        permit: OwnedSemaphorePermit,
         quota: MemoryQuota<M>,
+        permit: OwnedSemaphorePermit,
     },
 }
 
+impl<M: MemoryMetrics> GuardState<M> {
+    /// Consumes the state and gives back whatever it still holds.
+    ///
+    /// An unlimited state subtracts its granted bytes from the shared counter,
+    /// a limited state releases its permit, and `Released` holds nothing.
+    fn release(self) {
+        match self {
+            GuardState::Released => {}
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => quota.sub_in_use(granted_bytes),
+            GuardState::Limited { quota, permit } => quota.release_permit(permit),
+        }
+    }
+}
+
 impl<M: MemoryMetrics> MemoryGuard<M> {
-    pub(crate) fn unlimited() -> Self {
+    pub(crate) fn unlimited(quota: UnlimitedMemoryQuota<M>, bytes: u64) -> Self {
+        quota.add_in_use(bytes);
         Self {
-            state: GuardState::Unlimited,
+            state: GuardState::Unlimited {
+                quota,
+                granted_bytes: bytes,
+            },
         }
     }
 
-    pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota<M>) -> Self {
+    pub(crate) fn limited(quota: MemoryQuota<M>, permit: OwnedSemaphorePermit) -> Self {
         Self {
-            state: GuardState::Limited { permit, quota },
+            state: GuardState::Limited { quota, permit },
         }
     }
 
     /// Returns granted quota in bytes.
     pub fn granted_bytes(&self) -> u64 {
         match &self.state {
-            GuardState::Unlimited => 0,
-            GuardState::Limited { permit, quota } => {
+            GuardState::Released => 0,
+            GuardState::Unlimited { granted_bytes, .. } => *granted_bytes,
+            GuardState::Limited { quota, permit } => {
                 quota.permits_to_bytes(permit.num_permits() as u32)
             }
         }
@@ -68,13 +93,24 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
     /// - Returns error if requested bytes would exceed the manager's total limit
     /// - Returns error if the semaphore is unexpectedly closed
     pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> {
-        match &mut self.state {
-            GuardState::Unlimited => Ok(()),
-            GuardState::Limited { permit, quota } => {
-                if bytes == 0 {
-                    return Ok(());
-                }
+        if bytes == 0 {
+            return Ok(());
+        }
 
+        match &mut self.state {
+            GuardState::Released => {
+                debug_assert!(false, "released memory guard state should not be reused");
+                Ok(())
+            }
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                quota.add_in_use(bytes);
+                *granted_bytes = granted_bytes.saturating_add(bytes);
+                Ok(())
+            }
+            GuardState::Limited { quota, permit } => {
                 let additional_permits = quota.bytes_to_permits(bytes);
                 let current_permits = permit.num_permits() as u32;
 
@@ -95,7 +131,6 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
 
                 permit.merge(additional_permit);
                 quota.update_in_use_metric();
-                debug!("Acquired additional {} bytes", bytes);
                 Ok(())
             }
         }
@@ -106,13 +141,24 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
     /// On success, merges the new memory into this guard and returns true.
     /// On failure, returns false and leaves this guard unchanged.
     pub fn try_acquire_additional(&mut self, bytes: u64) -> bool {
-        match &mut self.state {
-            GuardState::Unlimited => true,
-            GuardState::Limited { permit, quota } => {
-                if bytes == 0 {
-                    return true;
-                }
+        if bytes == 0 {
+            return true;
+        }
 
+        match &mut self.state {
+            GuardState::Released => {
+                debug_assert!(false, "released memory guard state should not be reused");
+                false
+            }
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                quota.add_in_use(bytes);
+                *granted_bytes = granted_bytes.saturating_add(bytes);
+                true
+            }
+            GuardState::Limited { quota, permit } => {
                 let additional_permits = quota.bytes_to_permits(bytes);
 
                 match quota
@@ -123,7 +169,6 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
                     Ok(additional_permit) => {
                         permit.merge(additional_permit);
                         quota.update_in_use_metric();
-                        debug!("Acquired additional {} bytes", bytes);
                         true
                     }
                     Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
@@ -168,7 +213,8 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
                     MemoryLimitExceededSnafu {
                         requested_bytes: bytes,
                         limit_bytes: match &self.state {
-                            GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds
+                            GuardState::Released => 0,
+                            GuardState::Unlimited { .. } => 0, // unreachable: unlimited mode always succeeds
                             GuardState::Limited { quota, .. } => {
                                 quota.permits_to_bytes(quota.limit_permits)
                             }
@@ -184,22 +230,30 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
     ///
     /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted.
     pub fn release_partial(&mut self, bytes: u64) -> bool {
+        if bytes == 0 {
+            return true;
+        }
+
         match &mut self.state {
-            GuardState::Unlimited => true,
-            GuardState::Limited { permit, quota } => {
-                if bytes == 0 {
-                    return true;
+            GuardState::Released => true,
+            GuardState::Unlimited {
+                quota,
+                granted_bytes,
+            } => {
+                if bytes > *granted_bytes {
+                    return false;
                 }
 
+                quota.sub_in_use(bytes);
+                *granted_bytes = granted_bytes.saturating_sub(bytes);
+                true
+            }
+            GuardState::Limited { quota, permit } => {
                 let release_permits = quota.bytes_to_permits(bytes);
 
                 match permit.split(release_permits as usize) {
                     Some(released_permit) => {
-                        let released_bytes =
-                            quota.permits_to_bytes(released_permit.num_permits() as u32);
-                        drop(released_permit);
-                        quota.update_in_use_metric();
-                        debug!("Released {} bytes from memory guard", released_bytes);
+                        quota.release_permit(released_permit);
                         true
                     }
                     None => false,
@@ -211,14 +265,7 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
 
 impl<M: MemoryMetrics> Drop for MemoryGuard<M> {
     fn drop(&mut self) {
-        if let GuardState::Limited { permit, quota } =
-            mem::replace(&mut self.state, GuardState::Unlimited)
-        {
-            let bytes = quota.permits_to_bytes(permit.num_permits() as u32);
-            drop(permit);
-            quota.update_in_use_metric();
-            debug!("Released memory: {} bytes", bytes);
-        }
+        mem::replace(&mut self.state, GuardState::Released).release();
     }
 }
 
diff --git a/src/common/memory-manager/src/manager.rs b/src/common/memory-manager/src/manager.rs
index 50360d2a31..8cca5f220c 100644
--- a/src/common/memory-manager/src/manager.rs
+++ b/src/common/memory-manager/src/manager.rs
@@ -13,9 +13,10 @@
 // limitations under the License.
 
 use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
 
 use snafu::ensure;
-use tokio::sync::{Semaphore, TryAcquireError};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
 
 use crate::error::{
     MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
@@ -34,7 +35,7 @@ pub trait MemoryMetrics: Clone + Send + Sync + 'static {
 /// Generic memory manager for quota-controlled operations.
 #[derive(Clone)]
 pub struct MemoryManager<M: MemoryMetrics> {
-    quota: Option<MemoryQuota<M>>,
+    quota: MemoryQuotaState<M>,
 }
 
 impl<M: MemoryMetrics + Default> Default for MemoryManager<M> {
@@ -51,6 +52,18 @@ pub(crate) struct MemoryQuota<M: MemoryMetrics> {
     pub(crate) metrics: M,
 }
 
+#[derive(Clone)]
+pub(crate) struct UnlimitedMemoryQuota<M: MemoryMetrics> {
+    pub(crate) current_bytes: Arc<AtomicU64>,
+    pub(crate) metrics: M,
+}
+
+#[derive(Clone)]
+pub(crate) enum MemoryQuotaState<M: MemoryMetrics> {
+    Unlimited(UnlimitedMemoryQuota<M>),
+    Limited(MemoryQuota<M>),
+}
+
 impl<M: MemoryMetrics> MemoryManager<M> {
     /// Creates a new memory manager with the given limit in bytes.
     /// `limit_bytes = 0` disables the limit.
@@ -62,7 +75,12 @@ impl<M: MemoryMetrics> MemoryManager<M> {
     pub fn with_granularity(limit_bytes: u64, granularity: PermitGranularity, metrics: M) -> Self {
         if limit_bytes == 0 {
             metrics.set_limit(0);
-            return Self { quota: None };
+            return Self {
+                quota: MemoryQuotaState::Unlimited(UnlimitedMemoryQuota {
+                    current_bytes: Arc::new(AtomicU64::new(0)),
+                    metrics,
+                }),
+            };
         }
 
         let limit_permits = granularity.bytes_to_permits(limit_bytes);
@@ -70,7 +88,7 @@ impl<M: MemoryMetrics> MemoryManager<M> {
         metrics.set_limit(limit_aligned_bytes as i64);
 
         Self {
-            quota: Some(MemoryQuota {
+            quota: MemoryQuotaState::Limited(MemoryQuota {
                 semaphore: Arc::new(Semaphore::new(limit_permits as usize)),
                 limit_permits,
                 granularity,
@@ -81,26 +99,30 @@ impl<M: MemoryMetrics> MemoryManager<M> {
 
     /// Returns the configured limit in bytes (0 if unlimited).
     pub fn limit_bytes(&self) -> u64 {
-        self.quota
-            .as_ref()
-            .map(|quota| quota.permits_to_bytes(quota.limit_permits))
-            .unwrap_or(0)
+        match &self.quota {
+            MemoryQuotaState::Unlimited(_) => 0,
+            MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.limit_permits),
+        }
     }
 
     /// Returns currently used bytes.
     pub fn used_bytes(&self) -> u64 {
-        self.quota
-            .as_ref()
-            .map(|quota| quota.permits_to_bytes(quota.used_permits()))
-            .unwrap_or(0)
+        match &self.quota {
+            MemoryQuotaState::Unlimited(quota) => quota.current_bytes.load(Ordering::Acquire),
+            MemoryQuotaState::Limited(quota) => quota.permits_to_bytes(quota.used_permits()),
+        }
     }
 
     /// Returns available bytes.
+    ///
+    /// Unlimited managers report `u64::MAX`.
     pub fn available_bytes(&self) -> u64 {
-        self.quota
-            .as_ref()
-            .map(|quota| quota.permits_to_bytes(quota.available_permits_clamped()))
-            .unwrap_or(0)
+        match &self.quota {
+            MemoryQuotaState::Unlimited(_) => u64::MAX,
+            MemoryQuotaState::Limited(quota) => {
+                quota.permits_to_bytes(quota.available_permits_clamped())
+            }
+        }
     }
 
     /// Acquires memory, waiting if necessary until enough is available.
@@ -110,8 +132,8 @@ impl<M: MemoryMetrics> MemoryManager<M> {
     /// - Returns error if the semaphore is unexpectedly closed
     pub async fn acquire(&self, bytes: u64) -> Result<MemoryGuard<M>> {
         match &self.quota {
-            None => Ok(MemoryGuard::unlimited()),
-            Some(quota) => {
+            MemoryQuotaState::Unlimited(quota) => Ok(MemoryGuard::unlimited(quota.clone(), bytes)),
+            MemoryQuotaState::Limited(quota) => {
                 let permits = quota.bytes_to_permits(bytes);
 
                 ensure!(
@@ -129,7 +151,7 @@ impl<M: MemoryMetrics> MemoryManager<M> {
                     .await
                     .map_err(|_| MemorySemaphoreClosedSnafu.build())?;
                 quota.update_in_use_metric();
-                Ok(MemoryGuard::limited(permit, quota.clone()))
+                Ok(MemoryGuard::limited(quota.clone(), permit))
             }
         }
     }
@@ -137,14 +159,16 @@ impl<M: MemoryMetrics> MemoryManager<M> {
     /// Tries to acquire memory. Returns Some(guard) on success, None if insufficient.
     pub fn try_acquire(&self, bytes: u64) -> Option<MemoryGuard<M>> {
         match &self.quota {
-            None => Some(MemoryGuard::unlimited()),
-            Some(quota) => {
+            MemoryQuotaState::Unlimited(quota) => {
+                Some(MemoryGuard::unlimited(quota.clone(), bytes))
+            }
+            MemoryQuotaState::Limited(quota) => {
                 let permits = quota.bytes_to_permits(bytes);
 
                 match quota.semaphore.clone().try_acquire_many_owned(permits) {
                     Ok(permit) => {
                         quota.update_in_use_metric();
-                        Some(MemoryGuard::limited(permit, quota.clone()))
+                        Some(MemoryGuard::limited(quota.clone(), permit))
                     }
                     Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
                         quota.metrics.inc_rejected("try_acquire");
@@ -219,4 +243,63 @@ impl<M: MemoryMetrics> MemoryQuota<M> {
         let bytes = self.permits_to_bytes(self.used_permits());
         self.metrics.set_in_use(bytes as i64);
     }
+
+    /// Drops `permit` and then refreshes the in-use metric so that it reflects
+    /// the permits that are still held.
+    pub(crate) fn release_permit(&self, permit: OwnedSemaphorePermit) {
+        drop(permit);
+        self.update_in_use_metric();
+    }
+}
+
+impl<M: MemoryMetrics> UnlimitedMemoryQuota<M> {
+    /// Adds `bytes` to the tracked usage and publishes the new total to the
+    /// metrics. The counter saturates at `u64::MAX`; a zero request is a no-op.
+    pub(crate) fn add_in_use(&self, bytes: u64) {
+        if bytes == 0 {
+            return;
+        }
+
+        // The closure always returns `Some`, so `fetch_update` cannot fail.
+        let previous = self
+            .current_bytes
+            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
+                Some(current.saturating_add(bytes))
+            })
+            .unwrap();
+        // A saturated sum can never be smaller than `previous`, so overflow has
+        // to be detected with `checked_add` instead of a comparison.
+        debug_assert!(
+            previous.checked_add(bytes).is_some(),
+            "unlimited memory usage counter overflowed"
+        );
+        self.publish_in_use(previous.saturating_add(bytes));
+    }
+
+    /// Subtracts `bytes` from the tracked usage and publishes the new total to
+    /// the metrics. The counter saturates at zero; a zero request is a no-op.
+    pub(crate) fn sub_in_use(&self, bytes: u64) {
+        if bytes == 0 {
+            return;
+        }
+
+        // The closure always returns `Some`, so `fetch_update` cannot fail.
+        let previous = self
+            .current_bytes
+            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
+                Some(current.saturating_sub(bytes))
+            })
+            .unwrap();
+        debug_assert!(
+            previous >= bytes,
+            "unlimited memory usage counter underflowed: current={previous}, release={bytes}"
+        );
+        self.publish_in_use(previous.saturating_sub(bytes));
+    }
+
+    /// Reports `total` through the in-use metric, clamping to `i64::MAX` so a
+    /// very large counter never shows up as a negative value.
+    fn publish_in_use(&self, total: u64) {
+        self.metrics.set_in_use(total.min(i64::MAX as u64) as i64);
+    }
 }
diff --git a/src/common/memory-manager/src/tests.rs b/src/common/memory-manager/src/tests.rs
index 886eef9dac..fe02703f0b 100644
--- a/src/common/memory-manager/src/tests.rs
+++ b/src/common/memory-manager/src/tests.rs
@@ -24,7 +24,9 @@ fn test_try_acquire_unlimited() {
     let manager = MemoryManager::new(0, NoOpMetrics);
     let guard = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
     assert_eq!(manager.limit_bytes(), 0);
-    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(manager.available_bytes(), u64::MAX);
+    assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
 }
 
 #[test]
@@ -136,7 +138,10 @@ fn test_request_additional_unlimited() {
 
     // Should always succeed with unlimited manager
     assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES));
-    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 105 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 105 * PERMIT_GRANULARITY_BYTES);
+
+    drop(guard);
     assert_eq!(manager.used_bytes(), 0);
 }
 
@@ -187,9 +192,10 @@ fn test_early_release_partial_unlimited() {
     let manager = MemoryManager::new(0, NoOpMetrics);
     let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();
 
-    // Unlimited guard - release should succeed (no-op)
+    // Unlimited guard should track and release exact bytes.
     assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES));
-    assert_eq!(guard.granted_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 50 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 50 * PERMIT_GRANULARITY_BYTES);
 }
 
 #[test]
@@ -406,6 +412,6 @@ async fn test_acquire_additional_unlimited() {
         .acquire_additional(1000 * PERMIT_GRANULARITY_BYTES)
         .await
         .unwrap();
-    assert_eq!(guard.granted_bytes(), 0);
-    assert_eq!(manager.used_bytes(), 0);
+    assert_eq!(guard.granted_bytes(), 1000 * PERMIT_GRANULARITY_BYTES);
+    assert_eq!(manager.used_bytes(), 1000 * PERMIT_GRANULARITY_BYTES);
 }
diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml
index ec000c710d..f5ca9d2c09 100644
--- a/src/common/meta/Cargo.toml
+++ b/src/common/meta/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 testing = []
 pg_kvbackend = [
     "dep:tokio-postgres",
-    "dep:backon",
     "dep:deadpool-postgres",
     "dep:deadpool",
     "dep:tokio-postgres-rustls",
@@ -16,7 +15,7 @@ pg_kvbackend = [
     "dep:rustls-native-certs",
     "dep:rustls",
 ]
-mysql_kvbackend = ["dep:sqlx", "dep:backon"]
+mysql_kvbackend = ["dep:sqlx"]
 enterprise = ["prost-types"]
 
 [lints]
@@ -28,7 +27,7 @@ api.workspace = true
 async-recursion = "1.0"
 async-stream.workspace = true
 async-trait.workspace = true
-backon = { workspace = true, optional = true }
+backon.workspace = true
 base64.workspace = true
 bytes.workspace = true
 chrono.workspace = true
diff --git a/src/common/meta/src/cache/container.rs b/src/common/meta/src/cache/container.rs
index 0510476d15..e3a3e13a76 100644
--- a/src/common/meta/src/cache/container.rs
+++ b/src/common/meta/src/cache/container.rs
@@ -15,10 +15,14 @@
 use std::borrow::Borrow;
 use std::hash::Hash;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::time::Duration;
 
-use futures::future::{BoxFuture, join_all};
+use backon::{BackoffBuilder, ExponentialBuilder};
+use futures::future::BoxFuture;
 use moka::future::Cache;
 use snafu::{OptionExt, ResultExt};
+use tokio::time::sleep;
 
 use crate::cache_invalidator::{CacheInvalidator, Context};
 use crate::error::{self, Error, Result};
@@ -29,12 +33,29 @@ use crate::metrics;
 pub type TokenFilter<CacheToken> = Box<dyn Fn(&CacheToken) -> bool + Send + Sync>;
 
 /// Invalidates cached values by [CacheToken]s.
-pub type Invalidator<K, V, CacheToken> =
-    Box<dyn for<'a> Fn(&'a Cache<K, V>, &'a CacheToken) -> BoxFuture<'a, Result<()>> + Send + Sync>;
+pub type Invalidator<K, V, CacheToken> = Box<
+    dyn for<'a> Fn(&'a Cache<K, V>, &'a [&CacheToken]) -> BoxFuture<'a, Result<()>> + Send + Sync,
+>;
 
 /// Initializes value (i.e., fetches from remote).
 pub type Initializer<K, V> = Arc<dyn Fn(&'_ K) -> BoxFuture<'_, Result<Option<V>>> + Send + Sync>;
 
+/// Initialization strategy for cache-miss loading.
+///
+/// This strategy is selected when building [CacheContainer] and remains immutable
+/// for the lifetime of the container instance.
+#[derive(Debug, Clone, Copy)]
+pub enum InitStrategy {
+    /// Fast path: load once without version conflict retry.
+    ///
+    /// Under concurrent invalidation, callers may observe a stale/dirty value.
+    Unchecked,
+    /// Strict path: retry the load when the version changes during initialization.
+    ///
+    /// This avoids returning a dirty value under invalidate/load races.
+    VersionChecked,
+}
+
 /// [CacheContainer] provides ability to:
 /// - Cache value loaded by [Initializer].
 /// - Invalidate caches by [Invalidator].
@@ -44,6 +65,16 @@ pub struct CacheContainer<K, V, CacheToken> {
     invalidator: Invalidator<K, V, CacheToken>,
     initializer: Initializer<K, V>,
     token_filter: fn(&CacheToken) -> bool,
+    version: Arc<AtomicUsize>,
+    init_strategy: InitStrategy,
+}
+
+fn latest_get_backoff() -> impl Iterator<Item = Duration> {
+    ExponentialBuilder::default()
+        .with_min_delay(Duration::from_millis(10))
+        .with_max_delay(Duration::from_millis(100))
+        .with_max_times(3)
+        .build()
 }
 
 impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>
@@ -52,13 +83,37 @@ where
     V: Send + Sync,
     CacheToken: Send + Sync,
 {
-    /// Constructs an [CacheContainer].
+    /// Constructs a [CacheContainer] with [InitStrategy::Unchecked].
+    ///
+    /// This keeps the historical behavior and can return stale/dirty value under
+    /// concurrent invalidation.
     pub fn new(
         name: String,
         cache: Cache<K, V>,
         invalidator: Invalidator<K, V, CacheToken>,
         initializer: Initializer<K, V>,
         token_filter: fn(&CacheToken) -> bool,
+    ) -> Self {
+        Self::with_strategy(
+            name,
+            cache,
+            invalidator,
+            initializer,
+            token_filter,
+            InitStrategy::Unchecked,
+        )
+    }
+
+    /// Constructs a [CacheContainer] with an explicit [InitStrategy].
+    ///
+    /// The strategy is fixed at construction time and cannot be changed later.
+    pub fn with_strategy(
+        name: String,
+        cache: Cache<K, V>,
+        invalidator: Invalidator<K, V, CacheToken>,
+        initializer: Initializer<K, V>,
+        token_filter: fn(&CacheToken) -> bool,
+        init_strategy: InitStrategy,
     ) -> Self {
         Self {
             name,
@@ -66,6 +121,8 @@ where
             invalidator,
             initializer,
             token_filter,
+            version: Arc::new(AtomicUsize::new(0)),
+            init_strategy,
         }
     }
 
@@ -75,6 +132,67 @@ where
     }
 }
 
+impl<K, V, CacheToken> CacheContainer<K, V, CacheToken> {
+    fn inc_version(&self) {
+        self.version.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+async fn init<'a, K, V>(init: Initializer<K, V>, key: K, cache_name: &'a str) -> Result<V>
+where
+    K: Send + Sync + 'a,
+    V: Send + 'a,
+{
+    metrics::CACHE_CONTAINER_CACHE_MISS
+        .with_label_values(&[cache_name])
+        .inc();
+    let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
+        .with_label_values(&[cache_name])
+        .start_timer();
+    init(&key)
+        .await
+        .transpose()
+        .context(error::ValueNotExistSnafu)?
+}
+
+/// Loads a value via `init`, reloading (with backoff) while `version` changes mid-load.
+async fn init_with_retry<'a, K, V>(
+    init: Initializer<K, V>,
+    key: K,
+    mut backoff: impl Iterator<Item = Duration> + 'a,
+    version: Arc<AtomicUsize>,
+    cache_name: &'a str,
+) -> Result<V>
+where
+    K: Send + Sync + 'a,
+    V: Send + 'a,
+{
+    let mut attempts = 1usize;
+    loop {
+        let pre_version = version.load(Ordering::Relaxed);
+        metrics::CACHE_CONTAINER_CACHE_MISS
+            .with_label_values(&[cache_name])
+            .inc();
+        let timer = metrics::CACHE_CONTAINER_LOAD_CACHE
+            .with_label_values(&[cache_name])
+            .start_timer();
+        let value = init(&key)
+            .await
+            .transpose()
+            .context(error::ValueNotExistSnafu)??;
+        drop(timer); // Stop timing here so the backoff sleep is not recorded as load latency.
+        if pre_version == version.load(Ordering::Relaxed) {
+            return Ok(value);
+        }
+        if let Some(duration) = backoff.next() {
+            sleep(duration).await;
+            attempts += 1;
+        } else {
+            return error::GetLatestCacheRetryExceededSnafu { attempts }.fail();
+        }
+    }
+}
+
 #[async_trait::async_trait]
 impl<K, V> CacheInvalidator for CacheContainer<K, V, CacheIdent>
 where
@@ -82,14 +200,15 @@ where
     V: Send + Sync,
 {
     async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
-        let tasks = caches
+        let idents = caches
             .iter()
             .filter(|token| (self.token_filter)(token))
-            .map(|token| (self.invalidator)(&self.cache, token));
-        join_all(tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Vec<_>>();
+        if !idents.is_empty() {
+            self.inc_version();
+            (self.invalidator)(&self.cache, &idents).await?;
+        }
+
         Ok(())
     }
 }
@@ -99,27 +218,39 @@ where
     K: Copy + Hash + Eq + Send + Sync + 'static,
     V: Clone + Send + Sync + 'static,
 {
-    /// Returns a _clone_ of the value corresponding to the key.
+    /// Returns a value from cache for copyable keys.
+    ///
+    /// With [InitStrategy::Unchecked], this method prioritizes latency and may
+    /// return stale/dirty value. With [InitStrategy::VersionChecked], this method
+    /// retries initialization on version change and avoids dirty returns.
     pub async fn get(&self, key: K) -> Result<Option<V>> {
         metrics::CACHE_CONTAINER_CACHE_GET
             .with_label_values(&[&self.name])
             .inc();
-        let moved_init = self.initializer.clone();
-        let moved_key = key;
-        let init = async move {
-            metrics::CACHE_CONTAINER_CACHE_MISS
-                .with_label_values(&[&self.name])
-                .inc();
-            let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
-                .with_label_values(&[&self.name])
-                .start_timer();
-            moved_init(&moved_key)
-                .await
-                .transpose()
-                .context(error::ValueNotExistSnafu)?
+
+        let result = match self.init_strategy {
+            InitStrategy::Unchecked => {
+                self.cache
+                    .try_get_with(key, init(self.initializer.clone(), key, &self.name))
+                    .await
+            }
+            InitStrategy::VersionChecked => {
+                self.cache
+                    .try_get_with(
+                        key,
+                        init_with_retry(
+                            self.initializer.clone(),
+                            key,
+                            latest_get_backoff(),
+                            self.version.clone(),
+                            &self.name,
+                        ),
+                    )
+                    .await
+            }
         };
 
-        match self.cache.try_get_with(key, init).await {
+        match result {
             Ok(value) => Ok(Some(value)),
             Err(err) => match err.as_ref() {
                 Error::ValueNotExist { .. } => Ok(None),
@@ -136,14 +267,15 @@ where
 {
     /// Invalidates cache by [CacheToken].
     pub async fn invalidate(&self, caches: &[CacheToken]) -> Result<()> {
-        let tasks = caches
+        let idents = caches
             .iter()
             .filter(|token| (self.token_filter)(token))
-            .map(|token| (self.invalidator)(&self.cache, token));
-        join_all(tasks)
-            .await
-            .into_iter()
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Vec<_>>();
+        if !idents.is_empty() {
+            self.inc_version();
+            (self.invalidator)(&self.cache, &idents).await?;
+        }
+
         Ok(())
     }
 
@@ -156,7 +288,11 @@ where
         self.cache.contains_key(key)
     }
 
-    /// Returns a _clone_ of the value corresponding to the key.
+    /// Returns a value from cache by key reference.
+    ///
+    /// With [InitStrategy::Unchecked], this method prioritizes latency and may
+    /// return stale/dirty value. With [InitStrategy::VersionChecked], this method
+    /// retries initialization on version change and avoids dirty returns.
     pub async fn get_by_ref<Q>(&self, key: &Q) -> Result<Option<V>>
     where
         K: Borrow<Q>,
@@ -165,24 +301,32 @@ where
         metrics::CACHE_CONTAINER_CACHE_GET
             .with_label_values(&[&self.name])
             .inc();
-        let moved_init = self.initializer.clone();
-        let moved_key = key.to_owned();
-
-        let init = async move {
-            metrics::CACHE_CONTAINER_CACHE_MISS
-                .with_label_values(&[&self.name])
-                .inc();
-            let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
-                .with_label_values(&[&self.name])
-                .start_timer();
-
-            moved_init(&moved_key)
-                .await
-                .transpose()
-                .context(error::ValueNotExistSnafu)?
+        let result = match self.init_strategy {
+            InitStrategy::Unchecked => {
+                self.cache
+                    .try_get_with_by_ref(
+                        key,
+                        init(self.initializer.clone(), key.to_owned(), &self.name),
+                    )
+                    .await
+            }
+            InitStrategy::VersionChecked => {
+                self.cache
+                    .try_get_with_by_ref(
+                        key,
+                        init_with_retry(
+                            self.initializer.clone(),
+                            key.to_owned(),
+                            latest_get_backoff(),
+                            self.version.clone(),
+                            &self.name,
+                        ),
+                    )
+                    .await
+            }
         };
 
-        match self.cache.try_get_with_by_ref(key, init).await {
+        match result {
             Ok(value) => Ok(Some(value)),
             Err(err) => match err.as_ref() {
                 Error::ValueNotExist { .. } => Ok(None),
@@ -296,9 +440,11 @@ mod tests {
             moved_counter.fetch_add(1, Ordering::Relaxed);
             Box::pin(async { Ok(Some("hi".to_string())) })
         });
-        let invalidator: Invalidator<String, String, String> = Box::new(|cache, key| {
+        let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
             Box::pin(async move {
-                cache.invalidate(key).await;
+                for key in keys {
+                    cache.invalidate(*key).await;
+                }
                 Ok(())
             })
         });
@@ -323,4 +469,46 @@ mod tests {
         assert_eq!(value, "hi");
         assert_eq!(counter.load(Ordering::Relaxed), 2);
     }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_get_by_ref_returns_fresh_value_after_invalidate() {
+        let cache: Cache<String, String> = CacheBuilder::new(128).build();
+        let counter = Arc::new(AtomicI32::new(0));
+        let moved_counter = counter.clone();
+        let init: Initializer<String, String> = Arc::new(move |_| {
+            let counter = moved_counter.clone();
+            Box::pin(async move {
+                let n = counter.fetch_add(1, Ordering::Relaxed) + 1;
+                sleep(Duration::from_millis(100)).await;
+                Ok(Some(format!("v{n}")))
+            })
+        });
+        let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
+            Box::pin(async move {
+                for key in keys {
+                    cache.invalidate(*key).await;
+                }
+                Ok(())
+            })
+        });
+
+        let adv_cache = Arc::new(CacheContainer::with_strategy(
+            "test".to_string(),
+            cache,
+            invalidator,
+            init,
+            always_true_filter,
+            InitStrategy::VersionChecked,
+        ));
+
+        let moved_cache = adv_cache.clone();
+        let get_task = tokio::spawn(async move { moved_cache.get_by_ref("foo").await });
+
+        sleep(Duration::from_millis(50)).await;
+        adv_cache.invalidate(&["foo".to_string()]).await.unwrap();
+
+        let value = get_task.await.unwrap().unwrap().unwrap();
+        assert_eq!(value, "v2");
+        assert_eq!(counter.load(Ordering::Relaxed), 2);
+    }
 }
diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs
index a7777f3361..ebe3664202 100644
--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -170,20 +170,22 @@ async fn handle_drop_flow(
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, FlownodeFlowSet>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        match ident {
-            CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
-            CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
-            CacheIdent::FlowNodeAddressChange(node_id) => {
-                info!(
-                    "Invalidate flow node cache for node_id in table_flownode: {}",
-                    node_id
-                );
-                cache.invalidate_all();
+        for ident in idents {
+            match ident {
+                CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
+                CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
+                CacheIdent::FlowNodeAddressChange(node_id) => {
+                    info!(
+                        "Invalidate flow node cache for node_id in table_flownode: {}",
+                        node_id
+                    );
+                    cache.invalidate_all();
+                }
+                _ => {}
             }
-            _ => {}
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/schema.rs b/src/common/meta/src/cache/table/schema.rs
index bcf81d4fe6..bd9e8e6dc1 100644
--- a/src/common/meta/src/cache/table/schema.rs
+++ b/src/common/meta/src/cache/table/schema.rs
@@ -58,11 +58,13 @@ fn init_factory(schema_manager: SchemaManager) -> Initializer<SchemaName, Arc<Sc
 
 fn invalidator<'a>(
     cache: &'a Cache<SchemaName, Arc<SchemaNameValue>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, crate::error::Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::SchemaName(schema_name) = ident {
-            cache.invalidate(schema_name).await
+        for ident in idents {
+            if let CacheIdent::SchemaName(schema_name) = ident {
+                cache.invalidate(schema_name).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_info.rs b/src/common/meta/src/cache/table/table_info.rs
index b853d908e8..97af5bcdb7 100644
--- a/src/common/meta/src/cache/table/table_info.rs
+++ b/src/common/meta/src/cache/table/table_info.rs
@@ -61,11 +61,13 @@ fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer<TableId,
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, Arc<TableInfo>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableId(table_id) = ident {
-            cache.invalidate(table_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(table_id) = ident {
+                cache.invalidate(table_id).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_name.rs b/src/common/meta/src/cache/table/table_name.rs
index 540da5e5f4..927a5b3480 100644
--- a/src/common/meta/src/cache/table/table_name.rs
+++ b/src/common/meta/src/cache/table/table_name.rs
@@ -71,11 +71,13 @@ fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer<TableNam
 
 fn invalidator<'a>(
     cache: &'a Cache<TableName, TableId>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableName(table_name) = ident {
-            cache.invalidate(table_name).await
+        for ident in idents {
+            if let CacheIdent::TableName(table_name) = ident {
+                cache.invalidate(table_name).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_route.rs b/src/common/meta/src/cache/table/table_route.rs
index 47abdaa728..be820b0c52 100644
--- a/src/common/meta/src/cache/table/table_route.rs
+++ b/src/common/meta/src/cache/table/table_route.rs
@@ -19,6 +19,7 @@ use moka::future::Cache;
 use snafu::OptionExt;
 use store_api::storage::TableId;
 
+use crate::cache::container::InitStrategy;
 use crate::cache::{CacheContainer, Initializer};
 use crate::error;
 use crate::error::Result;
@@ -65,7 +66,14 @@ pub fn new_table_route_cache(
     let table_info_manager = Arc::new(TableRouteManager::new(kv_backend));
     let init = init_factory(table_info_manager);
 
-    CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
+    CacheContainer::with_strategy(
+        name,
+        cache,
+        Box::new(invalidator),
+        init,
+        filter,
+        InitStrategy::VersionChecked,
+    )
 }
 
 fn init_factory(
@@ -92,11 +100,13 @@ fn init_factory(
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, Arc<TableRoute>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableId(table_id) = ident {
-            cache.invalidate(table_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(table_id) = ident {
+                cache.invalidate(table_id).await
+            }
         }
         Ok(())
     })
diff --git a/src/common/meta/src/cache/table/table_schema.rs b/src/common/meta/src/cache/table/table_schema.rs
index 99ece65683..33b1773f45 100644
--- a/src/common/meta/src/cache/table/table_schema.rs
+++ b/src/common/meta/src/cache/table/table_schema.rs
@@ -65,7 +65,7 @@ fn init_factory(table_info_manager: TableInfoManager) -> Initializer<TableId, Ar
 /// Never invalidates table id schema cache.
 fn invalidator<'a>(
     _cache: &'a Cache<TableId, Arc<SchemaName>>,
-    _ident: &'a CacheIdent,
+    _idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, error::Result<()>> {
     Box::pin(std::future::ready(Ok(())))
 }
diff --git a/src/common/meta/src/cache/table/view_info.rs b/src/common/meta/src/cache/table/view_info.rs
index 6a85493d42..d0e1058a7e 100644
--- a/src/common/meta/src/cache/table/view_info.rs
+++ b/src/common/meta/src/cache/table/view_info.rs
@@ -60,11 +60,13 @@ fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer<TableId, A
 
 fn invalidator<'a>(
     cache: &'a Cache<TableId, Arc<ViewInfoValue>>,
-    ident: &'a CacheIdent,
+    idents: &'a [&CacheIdent],
 ) -> BoxFuture<'a, Result<()>> {
     Box::pin(async move {
-        if let CacheIdent::TableId(view_id) = ident {
-            cache.invalidate(view_id).await
+        for ident in idents {
+            if let CacheIdent::TableId(view_id) = ident {
+                cache.invalidate(view_id).await
+            }
         }
         Ok(())
     })
diff --git a/src/meta-srv/src/election.rs b/src/common/meta/src/election.rs
similarity index 67%
rename from src/meta-srv/src/election.rs
rename to src/common/meta/src/election.rs
index 2d2826b286..12173beda8 100644
--- a/src/meta-srv/src/election.rs
+++ b/src/common/meta/src/election.rs
@@ -21,15 +21,85 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 
 use common_telemetry::{error, info, warn};
+use serde::{Deserialize, Serialize};
 use tokio::sync::broadcast::error::RecvError;
 use tokio::sync::broadcast::{self, Receiver, Sender};
 
 use crate::error::Result;
-use crate::metasrv::MetasrvNodeInfo;
 
-pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600;
+pub const CANDIDATE_LEASE_SECS: u64 = 600;
 const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2;
 
+/// The value of the leader. It is used to store the leader's address.
+pub struct LeaderValue(pub String);
+
+impl<T: AsRef<[u8]>> From<T> for LeaderValue {
+    /// Decodes `value` as UTF-8, replacing invalid sequences with U+FFFD.
+    fn from(value: T) -> Self {
+        Self(String::from_utf8_lossy(value.as_ref()).into_owned())
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MetasrvNodeInfo {
+    /// The metasrv's address.
+    pub addr: String,
+    /// The node build version.
+    pub version: String,
+    /// The node build git commit hash.
+    pub git_commit: String,
+    /// The node start timestamp in milliseconds.
+    pub start_time_ms: u64,
+    /// The node total CPU in millicores.
+    #[serde(default)]
+    pub total_cpu_millicores: i64,
+    /// The node total memory in bytes.
+    #[serde(default)]
+    pub total_memory_bytes: i64,
+    /// The node CPU usage in millicores.
+    #[serde(default)]
+    pub cpu_usage_millicores: i64,
+    /// The node memory usage in bytes.
+    #[serde(default)]
+    pub memory_usage_bytes: i64,
+    /// The node hostname.
+    #[serde(default)]
+    pub hostname: String,
+}
+
+// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
+#[allow(deprecated)]
+impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
+    fn from(node_info: MetasrvNodeInfo) -> Self {
+        Self {
+            peer: Some(api::v1::meta::Peer {
+                addr: node_info.addr,
+                ..Default::default()
+            }),
+            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
+            // New code should use the fields in `info.NodeInfo` instead.
+            version: node_info.version.clone(),
+            git_commit: node_info.git_commit.clone(),
+            start_time_ms: node_info.start_time_ms,
+            cpus: node_info.total_cpu_millicores as u32,
+            memory_bytes: node_info.total_memory_bytes as u64,
+            // The canonical location for node information.
+            info: Some(api::v1::meta::NodeInfo {
+                version: node_info.version,
+                git_commit: node_info.git_commit,
+                start_time_ms: node_info.start_time_ms,
+                total_cpu_millicores: node_info.total_cpu_millicores,
+                total_memory_bytes: node_info.total_memory_bytes,
+                cpu_usage_millicores: node_info.cpu_usage_millicores,
+                memory_usage_bytes: node_info.memory_usage_bytes,
+                cpus: node_info.total_cpu_millicores as u32,
+                memory_bytes: node_info.total_memory_bytes as u64,
+                hostname: node_info.hostname,
+            }),
+        }
+    }
+}
+
 /// Messages sent when the leader changes.
 #[derive(Debug, Clone)]
 pub enum LeaderChangeMessage {
@@ -168,3 +238,5 @@ pub trait Election: Send + Sync {
 
     fn subscribe_leader_change(&self) -> Receiver<LeaderChangeMessage>;
 }
+
+pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
diff --git a/src/meta-srv/src/election/etcd.rs b/src/common/meta/src/election/etcd.rs
similarity index 94%
rename from src/meta-srv/src/election/etcd.rs
rename to src/common/meta/src/election/etcd.rs
index 883f723d74..affad31ef4 100644
--- a/src/meta-srv/src/election/etcd.rs
+++ b/src/common/meta/src/election/etcd.rs
@@ -16,8 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 
-use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use etcd_client::{
     Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions,
@@ -27,13 +25,15 @@ use tokio::sync::broadcast;
 use tokio::sync::broadcast::Receiver;
 use tokio::time::{MissedTickBehavior, timeout};
 
+use crate::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
 use crate::election::{
-    CANDIDATE_LEASE_SECS, Election, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, LeaderKey,
-    listen_leader_change, send_leader_change_and_set_flags,
+    CANDIDATE_LEASE_SECS, Election, ElectionRef, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage,
+    LeaderKey, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error;
 use crate::error::Result;
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
 
 impl LeaderKey for EtcdLeaderKey {
     fn name(&self) -> &[u8] {
@@ -253,7 +253,7 @@ impl Election for EtcdElection {
                 .leader(self.election_key())
                 .await
                 .context(error::EtcdFailedSnafu)?;
-            let leader_value = res.kv().context(error::NoLeaderSnafu)?.value();
+            let leader_value = res.kv().context(error::ElectionNoLeaderSnafu)?.value();
             Ok(leader_value.into())
         }
     }
@@ -279,7 +279,7 @@ impl EtcdElection {
             ensure!(
                 res.ttl() > 0,
                 error::UnexpectedSnafu {
-                    violated: "Failed to refresh the lease",
+                    err_msg: "Failed to refresh the lease".to_string(),
                 }
             );
 
diff --git a/src/meta-srv/src/election/rds.rs b/src/common/meta/src/election/rds.rs
similarity index 96%
rename from src/meta-srv/src/election/rds.rs
rename to src/common/meta/src/election/rds.rs
index 16e113415a..6ee529ee02 100644
--- a/src/meta-srv/src/election/rds.rs
+++ b/src/common/meta/src/election/rds.rs
@@ -36,7 +36,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
             .split(LEASE_SEP)
             .collect_tuple()
             .with_context(|| UnexpectedSnafu {
-                violated: format!(
+                err_msg: format!(
                     "Invalid value {}, expect node info || {} || expire time",
                     value, LEASE_SEP
                 ),
@@ -45,7 +45,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
     let expire_time = match Timestamp::from_str(expire_time, None) {
         Ok(ts) => ts,
         Err(_) => UnexpectedSnafu {
-            violated: format!("Invalid timestamp: {}", expire_time),
+            err_msg: format!("Invalid timestamp: {}", expire_time),
         }
         .fail()?,
     };
diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/common/meta/src/election/rds/mysql.rs
similarity index 97%
rename from src/meta-srv/src/election/rds/mysql.rs
rename to src/common/meta/src/election/rds/mysql.rs
index 20051a2610..80f3d8ca7c 100644
--- a/src/meta-srv/src/election/rds/mysql.rs
+++ b/src/common/meta/src/election/rds/mysql.rs
@@ -16,7 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use common_time::Timestamp;
 use snafu::{OptionExt, ResultExt, ensure};
@@ -29,14 +28,15 @@ use tokio::time::MissedTickBehavior;
 
 use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
 use crate::election::{
-    Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
+    Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error::{
     AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu,
-    LeaderLeaseChangedSnafu, LeaderLeaseExpiredSnafu, MySqlExecutionSnafu, NoLeaderSnafu, Result,
-    SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
+    ElectionLeaderLeaseChangedSnafu, ElectionLeaderLeaseExpiredSnafu, ElectionNoLeaderSnafu,
+    MySqlExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
 };
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
 
 struct ElectionSqlFactory<'a> {
     table_name: &'a str,
@@ -592,7 +592,7 @@ impl Election for MySqlElection {
             ensure!(
                 lease.expire_time > lease.current,
                 UnexpectedSnafu {
-                    violated: format!(
+                    err_msg: format!(
                         "Candidate lease expired at {:?} (current time: {:?}), key: {:?}",
                         lease.expire_time,
                         lease.current,
@@ -667,10 +667,10 @@ impl Election for MySqlElection {
             let client = self.client.lock().await;
             let mut executor = Executor::Default(client);
             if let Some(lease) = self.get_value_with_lease(&key, &mut executor).await? {
-                ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+                ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
                 Ok(lease.leader_value.as_bytes().into())
             } else {
-                NoLeaderSnafu.fail()
+                ElectionNoLeaderSnafu.fail()
             }
         }
     }
@@ -705,7 +705,7 @@ impl MySqlElection {
         let current_time = match Timestamp::from_str(&current_time_str, None) {
             Ok(ts) => ts,
             Err(_) => UnexpectedSnafu {
-                violated: format!("Invalid timestamp: {}", current_time_str),
+                err_msg: format!("Invalid timestamp: {}", current_time_str),
             }
             .fail()?,
         };
@@ -740,7 +740,7 @@ impl MySqlElection {
             current = match Timestamp::from_str(current_time_str, None) {
                 Ok(ts) => ts,
                 Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                 }
                 .fail()?,
             };
@@ -777,7 +777,7 @@ impl MySqlElection {
         ensure!(
             res == 1,
             UnexpectedSnafu {
-                violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
+                err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
             }
         );
 
@@ -920,9 +920,12 @@ impl MySqlElection {
     ///   will be released.
     /// - **Case 2**: If all checks pass, the function returns without performing any actions.
     fn lease_check(&self, lease: &Option<Lease>) -> Result<Lease> {
-        let lease = lease.as_ref().context(NoLeaderSnafu)?;
+        let lease = lease.as_ref().context(ElectionNoLeaderSnafu)?;
         // Case 1: Lease expired
-        ensure!(lease.expire_time > lease.current, LeaderLeaseExpiredSnafu);
+        ensure!(
+            lease.expire_time > lease.current,
+            ElectionLeaderLeaseExpiredSnafu
+        );
         // Case 2: Everything is fine
         Ok(lease.clone())
     }
@@ -960,7 +963,7 @@ impl MySqlElection {
         let remote_lease = self.get_value_with_lease(&key, &mut executor).await?;
         ensure!(
             expected_lease.map(|lease| lease.origin) == remote_lease.map(|lease| lease.origin),
-            LeaderLeaseChangedSnafu
+            ElectionLeaderLeaseChangedSnafu
         );
         self.delete_value(&key, &mut executor).await?;
         self.put_value_with_lease(
@@ -987,12 +990,11 @@ mod tests {
     use std::assert_matches::assert_matches;
     use std::env;
 
-    use common_meta::maybe_skip_mysql_integration_test;
     use common_telemetry::init_default_ut_logging;
+    use sqlx::MySqlPool;
 
     use super::*;
-    use crate::error;
-    use crate::utils::mysql::create_mysql_pool;
+    use crate::{error, maybe_skip_mysql_integration_test};
 
     async fn create_mysql_client(
         table_name: Option<&str>,
@@ -1003,11 +1005,11 @@ mod tests {
         let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default();
         if endpoint.is_empty() {
             return UnexpectedSnafu {
-                violated: "MySQL endpoint is empty".to_string(),
+                err_msg: "MySQL endpoint is empty".to_string(),
             }
             .fail();
         }
-        let pool = create_mysql_pool(&[endpoint], None).await.unwrap();
+        let pool = MySqlPool::connect(&endpoint).await.unwrap();
         let mut client = ElectionMysqlClient::new(
             pool,
             execution_timeout,
@@ -1302,7 +1304,7 @@ mod tests {
         let err = elected(&leader_mysql_election, table_name, Some(incorrect_lease))
             .await
             .unwrap_err();
-        assert_matches!(err, error::Error::LeaderLeaseChanged { .. });
+        assert_matches!(err, error::Error::ElectionLeaderLeaseChanged { .. });
         let lease = get_lease(&leader_mysql_election).await;
         assert!(lease.is_none());
         drop_table(&leader_mysql_election.client, table_name).await;
diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/common/meta/src/election/rds/postgres.rs
similarity index 97%
rename from src/meta-srv/src/election/rds/postgres.rs
rename to src/common/meta/src/election/rds/postgres.rs
index c21efd780b..01910335a0 100644
--- a/src/meta-srv/src/election/rds/postgres.rs
+++ b/src/common/meta/src/election/rds/postgres.rs
@@ -16,7 +16,6 @@ use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::time::Duration;
 
-use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
 use common_telemetry::{error, info, warn};
 use common_time::Timestamp;
 use deadpool_postgres::{Manager, Pool};
@@ -28,13 +27,15 @@ use tokio_postgres::types::ToSql;
 
 use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
 use crate::election::{
-    Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
+    Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
+    send_leader_change_and_set_flags,
 };
 use crate::error::{
-    DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu,
-    Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
+    DeserializeFromJsonSnafu, ElectionNoLeaderSnafu, GetPostgresClientSnafu,
+    PostgresExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu,
+    UnexpectedSnafu,
 };
-use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
+use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
 
 struct ElectionSqlFactory<'a> {
     lock_id: u64,
@@ -404,13 +405,13 @@ impl Election for PgElection {
                 .get_value_with_lease(&key)
                 .await?
                 .context(UnexpectedSnafu {
-                    violated: format!("Failed to get lease for key: {:?}", key),
+                    err_msg: format!("Failed to get lease for key: {:?}", key),
                 })?;
 
             ensure!(
                 lease.expire_time > lease.current,
                 UnexpectedSnafu {
-                    violated: format!(
+                    err_msg: format!(
                         "Candidate lease expired at {:?} (current time {:?}), key: {:?}",
                         lease.expire_time, lease.current, key
                     ),
@@ -464,11 +465,11 @@ impl Election for PgElection {
                 .query(&self.sql_set.campaign, &[])
                 .await?;
             let row = res.first().context(UnexpectedSnafu {
-                violated: "Failed to get the result of acquiring advisory lock",
+                err_msg: "Failed to get the result of acquiring advisory lock".to_string(),
             })?;
             let is_leader = row.try_get(0).map_err(|_| {
                 UnexpectedSnafu {
-                    violated: "Failed to get the result of get lock",
+                    err_msg: "Failed to get the result of get lock".to_string(),
                 }
                 .build()
             })?;
@@ -500,10 +501,10 @@ impl Election for PgElection {
         } else {
             let key = self.election_key();
             if let Some(lease) = self.get_value_with_lease(&key).await? {
-                ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+                ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
                 Ok(lease.leader_value.as_bytes().into())
             } else {
-                NoLeaderSnafu.fail()
+                ElectionNoLeaderSnafu.fail()
             }
         }
     }
@@ -537,7 +538,7 @@ impl PgElection {
             let current_time = match Timestamp::from_str(current_time_str, None) {
                 Ok(ts) => ts,
                 Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                 }
                 .fail()?,
             };
@@ -576,7 +577,7 @@ impl PgElection {
             current = match Timestamp::from_str(current_time_str, None) {
                 Ok(ts) => ts,
                 Err(_) => UnexpectedSnafu {
-                    violated: format!("Invalid timestamp: {}", current_time_str),
+                    err_msg: format!("Invalid timestamp: {}", current_time_str),
                 }
                 .fail()?,
             };
@@ -613,7 +614,7 @@ impl PgElection {
         ensure!(
             res == 1,
             UnexpectedSnafu {
-                violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
+                err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
             }
         );
 
@@ -742,9 +743,9 @@ impl PgElection {
         let lease = self
             .get_value_with_lease(&key)
             .await?
-            .context(NoLeaderSnafu)?;
+            .context(ElectionNoLeaderSnafu)?;
         // Case 2
-        ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
+        ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
         // Case 3
         Ok(())
     }
@@ -831,11 +832,11 @@ mod tests {
     use std::assert_matches::assert_matches;
     use std::env;
 
-    use common_meta::maybe_skip_postgres_integration_test;
+    use deadpool_postgres::{Config, Runtime};
+    use tokio_postgres::NoTls;
 
     use super::*;
-    use crate::error;
-    use crate::utils::postgres::create_postgres_pool;
+    use crate::{error, maybe_skip_postgres_integration_test};
 
     async fn create_postgres_client(
         table_name: Option<&str>,
@@ -846,11 +847,13 @@ mod tests {
         let endpoint = env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default();
         if endpoint.is_empty() {
             return UnexpectedSnafu {
-                violated: "Postgres endpoint is empty".to_string(),
+                err_msg: "Postgres endpoint is empty".to_string(),
             }
             .fail();
         }
-        let pool = create_postgres_pool(&[endpoint], None, None).await.unwrap();
+        let mut cfg = Config::new();
+        cfg.url = Some(endpoint);
+        let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).unwrap();
         let mut pg_client = ElectionPgClient::new(
             pool,
             execution_timeout,
diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs
index c6613af828..05b5af393b 100644
--- a/src/common/meta/src/error.rs
+++ b/src/common/meta/src/error.rs
@@ -338,6 +338,24 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Metasrv election has no leader at this moment"))]
+    ElectionNoLeader {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Metasrv election leader lease expired"))]
+    ElectionLeaderLeaseExpired {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[snafu(display("Metasrv election leader lease changed during election"))]
+    ElectionLeaderLeaseChanged {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display("Table already exists, table: {}", table_name))]
     TableAlreadyExists {
         table_name: String,
@@ -714,6 +732,16 @@ pub enum Error {
     #[snafu(display("Failed to get cache"))]
     GetCache { source: Arc<Error> },
 
+    #[snafu(display(
+        "Failed to get latest cache value after {} attempts due to concurrent invalidation",
+        attempts
+    ))]
+    GetLatestCacheRetryExceeded {
+        attempts: usize,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[cfg(feature = "pg_kvbackend")]
     #[snafu(display("Failed to execute via Postgres, sql: {}", sql))]
     PostgresExecution {
@@ -741,6 +769,15 @@ pub enum Error {
         location: Location,
     },
 
+    #[cfg(feature = "pg_kvbackend")]
+    #[snafu(display("Failed to get Postgres client"))]
+    GetPostgresClient {
+        #[snafu(source)]
+        error: deadpool::managed::PoolError<tokio_postgres::Error>,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[cfg(feature = "pg_kvbackend")]
     #[snafu(display("Failed to {} Postgres transaction", operation))]
     PostgresTransaction {
@@ -795,6 +832,24 @@ pub enum Error {
         location: Location,
     },
 
+    #[cfg(feature = "mysql_kvbackend")]
+    #[snafu(display("Failed to decode sql value"))]
+    DecodeSqlValue {
+        #[snafu(source)]
+        error: sqlx::error::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
+    #[cfg(feature = "mysql_kvbackend")]
+    #[snafu(display("Failed to acquire mysql client from pool"))]
+    AcquireMySqlClient {
+        #[snafu(source)]
+        error: sqlx::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[cfg(feature = "mysql_kvbackend")]
     #[snafu(display("Failed to {} MySql transaction", operation))]
     MySqlTransaction {
@@ -812,6 +867,15 @@ pub enum Error {
         location: Location,
     },
 
+    #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
+    #[snafu(display("Sql execution timeout, sql: {}, duration: {:?}", sql, duration))]
+    SqlExecutionTimeout {
+        sql: String,
+        duration: std::time::Duration,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display(
         "Datanode table info not found, table id: {}, datanode id: {}",
         table_id,
@@ -1063,8 +1127,12 @@ impl ErrorExt for Error {
             | ConnectEtcd { .. }
             | MoveValues { .. }
             | GetCache { .. }
+            | GetLatestCacheRetryExceeded { .. }
             | SerializeToJson { .. }
-            | DeserializeFromJson { .. } => StatusCode::Internal,
+            | DeserializeFromJson { .. }
+            | ElectionNoLeader { .. }
+            | ElectionLeaderLeaseExpired { .. }
+            | ElectionLeaderLeaseChanged { .. } => StatusCode::Internal,
 
             NoLeader { .. } => StatusCode::TableUnavailable,
             ValueNotExist { .. }
@@ -1187,15 +1255,18 @@ impl ErrorExt for Error {
             PostgresExecution { .. }
             | CreatePostgresPool { .. }
             | GetPostgresConnection { .. }
+            | GetPostgresClient { .. }
             | PostgresTransaction { .. }
             | PostgresTlsConfig { .. }
             | InvalidTlsConfig { .. } => StatusCode::Internal,
             #[cfg(feature = "mysql_kvbackend")]
-            MySqlExecution { .. } | CreateMySqlPool { .. } | MySqlTransaction { .. } => {
-                StatusCode::Internal
-            }
+            MySqlExecution { .. }
+            | CreateMySqlPool { .. }
+            | DecodeSqlValue { .. }
+            | AcquireMySqlClient { .. }
+            | MySqlTransaction { .. } => StatusCode::Internal,
             #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
-            RdsTransactionRetryFailed { .. } => StatusCode::Internal,
+            RdsTransactionRetryFailed { .. } | SqlExecutionTimeout { .. } => StatusCode::Internal,
             DatanodeTableInfoNotFound { .. } => StatusCode::Internal,
         }
     }
@@ -1243,7 +1314,10 @@ impl Error {
 
     /// Determine whether it is a retry later type through [StatusCode]
     pub fn is_retry_later(&self) -> bool {
-        matches!(self, Error::RetryLater { .. })
+        matches!(
+            self,
+            Error::RetryLater { .. } | Error::GetLatestCacheRetryExceeded { .. }
+        )
     }
 
     /// Determine whether it needs to clean poisons.
diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs
index 93cd229b16..36aae1026e 100644
--- a/src/common/meta/src/lib.rs
+++ b/src/common/meta/src/lib.rs
@@ -22,6 +22,7 @@ pub mod datanode;
 pub mod ddl;
 pub mod ddl_manager;
 pub mod distributed_time_constants;
+pub mod election;
 pub mod error;
 pub mod flow_name;
 pub mod heartbeat;
diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs
index 454afb95b3..46dcef11d4 100644
--- a/src/common/procedure/src/local/runner.rs
+++ b/src/common/procedure/src/local/runner.rs
@@ -17,6 +17,8 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use backon::{BackoffBuilder, ExponentialBuilder};
+use common_error::ext::PlainError;
+use common_error::status_code::StatusCode;
 use common_event_recorder::EventRecorderRef;
 use common_telemetry::tracing_context::{FutureExt, TracingContext};
 use common_telemetry::{debug, error, info, tracing};
@@ -90,6 +92,45 @@ impl Drop for ProcedureGuard {
     }
 }
 
+/// Collects the lock keys requested by a child procedure that clash with its parent's.
+///
+/// A key clashes when both procedures lock it and at least one side wants it exclusively:
+/// - Share + Share => Compatible
+/// - Exclusive + Any => Conflict
+/// - Any + Exclusive => Conflict
+///
+/// Keys the parent does not lock never conflict. Conflicting keys are returned in the
+/// order the child lists them.
+fn find_lock_conflicts<'a>(
+    parent_keys: impl Iterator<Item = &'a StringKey>,
+    child_keys: impl Iterator<Item = &'a StringKey>,
+) -> Vec<String> {
+    use std::collections::HashMap;
+
+    // key -> "parent holds it exclusively"; an exclusive entry is never downgraded.
+    let mut held: HashMap<&str, bool> = HashMap::new();
+    for key in parent_keys {
+        let (name, exclusive) = match key {
+            StringKey::Exclusive(name) => (name, true),
+            StringKey::Share(name) => (name, false),
+        };
+        *held.entry(name.as_str()).or_insert(false) |= exclusive;
+    }
+
+    child_keys
+        .filter_map(|key| {
+            let (name, wants_exclusive) = match key {
+                StringKey::Exclusive(name) => (name, true),
+                StringKey::Share(name) => (name, false),
+            };
+            // Keys the parent does not hold cannot conflict.
+            let parent_exclusive = *held.get(name.as_str())?;
+            // Only a shared/shared pairing is compatible.
+            (parent_exclusive || wants_exclusive).then(|| name.clone())
+        })
+        .collect()
+}
+
 pub(crate) struct Runner {
     pub(crate) meta: ProcedureMetaRef,
     pub(crate) procedure: BoxedProcedure,
@@ -512,6 +553,41 @@ impl Runner {
 
     async fn on_suspended(&mut self, subprocedures: Vec<ProcedureWithId>) {
         let has_child = !subprocedures.is_empty();
+
+        // Pre-check: detect potential deadlocks BEFORE submitting any subprocedure.
+        // If a child shares conflicting lock keys with the parent, submitting it would
+        // cause a Hold-and-Wait deadlock — the child blocks on lock acquisition while
+        // the parent holds the lock and waits for the child to finish.
+        for sub in &subprocedures {
+            let conflicting = find_lock_conflicts(
+                self.meta.lock_key.keys_to_lock(),
+                sub.procedure.lock_key().keys_to_lock(),
+            );
+            if !conflicting.is_empty() {
+                let err_msg = format!(
+                    "Deadlock prevented: subprocedure {}-{} shares conflicting lock key(s) {:?} \
+                     with parent {}-{}. Parent holds these locks and would wait for child \
+                     completion, but child cannot acquire them.",
+                    sub.procedure.type_name(),
+                    sub.id,
+                    conflicting,
+                    self.procedure.type_name(),
+                    self.meta.id,
+                );
+                error!("{}", err_msg);
+                let err = Arc::new(Error::external(PlainError::new(
+                    err_msg,
+                    StatusCode::Internal,
+                )));
+                if self.procedure.rollback_supported() {
+                    self.meta.set_state(ProcedureState::prepare_rollback(err));
+                } else {
+                    self.meta.set_state(ProcedureState::failed(err));
+                }
+                return;
+            }
+        }
+
         for subprocedure in subprocedures {
             info!(
                 "Procedure {}-{} submit subprocedure {}-{}",
@@ -1939,4 +2015,169 @@ mod tests {
         join_all(tasks).await;
         assert_eq!(shared_atomic_value.load(Ordering::Relaxed), 2);
     }
+    #[tokio::test]
+    async fn test_on_suspend_deadlock_detected_no_rollback() {
+        // Parent holds Exclusive("catalog.schema.table") and suspends on a child requesting the same key.
+        // The parent has no rollback function, so its state should become Failed.
+        let child_id = ProcedureId::random();
+        let exec_fn = move |_| {
+            async move {
+                let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
+                let child = ProcedureAdapter {
+                    data: "child".to_string(),
+                    lock_key: LockKey::single_exclusive("catalog.schema.table"),
+                    poison_keys: PoisonKeys::default(),
+                    exec_fn: child_exec_fn,
+                    rollback_fn: None,
+                };
+                Ok(Status::Suspended {
+                    subprocedures: vec![ProcedureWithId {
+                        id: child_id,
+                        procedure: Box::new(child),
+                    }],
+                    persist: false,
+                })
+            }
+            .boxed()
+        };
+        let parent = ProcedureAdapter {
+            data: "parent".to_string(),
+            lock_key: LockKey::single_exclusive("catalog.schema.table"),
+            poison_keys: PoisonKeys::default(),
+            exec_fn,
+            rollback_fn: None, // No rollback support
+        };
+
+        let dir = create_temp_dir("deadlock_no_rollback");
+        let meta = parent.new_meta(ROOT_ID);
+        let ctx = context_without_provider(meta.id);
+        let object_store = test_util::new_object_store(&dir);
+        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
+        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
+        runner.manager_ctx.start();
+
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(state.is_failed(), "Expected Failed, got {state:?}");
+        // The Failed state must carry the error that caused it
+        assert!(
+            state.error().is_some(),
+            "Failed state should contain an error"
+        );
+        // Child should NOT have been submitted
+        assert!(
+            !runner.manager_ctx.contains_procedure(child_id),
+            "Child procedure should not be submitted when deadlock is detected"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_on_suspend_deadlock_detected_with_rollback() {
+        // Parent holds Exclusive("catalog.schema.table") and suspends on a child requesting the same key.
+        // The parent provides a rollback function, so its state should become PrepareRollback.
+        let child_id = ProcedureId::random();
+        let exec_fn = move |_| {
+            async move {
+                let child_exec_fn = |_| async { Ok(Status::done()) }.boxed();
+                let child = ProcedureAdapter {
+                    data: "child".to_string(),
+                    lock_key: LockKey::single_exclusive("catalog.schema.table"),
+                    poison_keys: PoisonKeys::default(),
+                    exec_fn: child_exec_fn,
+                    rollback_fn: None,
+                };
+                Ok(Status::Suspended {
+                    subprocedures: vec![ProcedureWithId {
+                        id: child_id,
+                        procedure: Box::new(child),
+                    }],
+                    persist: false,
+                })
+            }
+            .boxed()
+        };
+        let rollback_fn = move |_| async move { Ok(()) }.boxed();
+        let parent = ProcedureAdapter {
+            data: "parent".to_string(),
+            lock_key: LockKey::single_exclusive("catalog.schema.table"),
+            poison_keys: PoisonKeys::default(),
+            exec_fn,
+            rollback_fn: Some(Box::new(rollback_fn)), // Supports rollback
+        };
+
+        let dir = create_temp_dir("deadlock_with_rollback");
+        let meta = parent.new_meta(ROOT_ID);
+        let ctx = context_without_provider(meta.id);
+        let object_store = test_util::new_object_store(&dir);
+        let procedure_store = Arc::new(ProcedureStore::from_object_store(object_store.clone()));
+        let mut runner = new_runner(meta.clone(), Box::new(parent), procedure_store);
+        runner.manager_ctx.start();
+
+        runner.execute_once(&ctx).await;
+        let state = runner.meta.state();
+        assert!(
+            state.is_prepare_rollback(),
+            "Expected PrepareRollback, got {state:?}"
+        );
+        // The PrepareRollback variant must carry a non-empty error message
+        match &state {
+            ProcedureState::PrepareRollback { error } => {
+                assert!(!error.to_string().is_empty(), "Error should not be empty");
+            }
+            _ => panic!("Expected PrepareRollback, got {state:?}"),
+        }
+        // Child should NOT have been submitted
+        assert!(
+            !runner.manager_ctx.contains_procedure(child_id),
+            "Child procedure should not be submitted when deadlock is detected"
+        );
+    }
+
+    #[test]
+    fn test_find_lock_conflicts() {
+        use crate::procedure::StringKey;
+
+        // 1. Share + Share = No conflict (Compatible)
+        let parent = [StringKey::Share("A".to_string())];
+        let child = [StringKey::Share("A".to_string())];
+        assert!(super::find_lock_conflicts(parent.iter(), child.iter()).is_empty());
+
+        // 2. Share + Exclusive = Conflict
+        let parent = [StringKey::Share("A".to_string())];
+        let child = [StringKey::Exclusive("A".to_string())];
+        assert_eq!(
+            super::find_lock_conflicts(parent.iter(), child.iter()),
+            vec!["A".to_string()]
+        );
+
+        // 3. Exclusive + Share = Conflict
+        let parent = [StringKey::Exclusive("A".to_string())];
+        let child = [StringKey::Share("A".to_string())];
+        assert_eq!(
+            super::find_lock_conflicts(parent.iter(), child.iter()),
+            vec!["A".to_string()]
+        );
+
+        // 4. Exclusive + Exclusive = Conflict
+        let parent = [StringKey::Exclusive("A".to_string())];
+        let child = [StringKey::Exclusive("A".to_string())];
+        assert_eq!(
+            super::find_lock_conflicts(parent.iter(), child.iter()),
+            vec!["A".to_string()]
+        );
+
+        // 5. Multiple keys, partial overlap: only keys the parent also holds are reported
+        let parent = [
+            StringKey::Share("A".to_string()),
+            StringKey::Exclusive("B".to_string()),
+        ];
+        let child = [
+            StringKey::Exclusive("A".to_string()), // Conflict with Share("A")
+            StringKey::Share("B".to_string()),     // Conflict with Exclusive("B")
+            StringKey::Exclusive("C".to_string()), // No conflict, parent doesn't hold C
+        ];
+        let mut conflicts = super::find_lock_conflicts(parent.iter(), child.iter());
+        conflicts.sort();
+        assert_eq!(conflicts, vec!["A".to_string(), "B".to_string()]);
+    }
 }
diff --git a/src/common/query/src/prelude.rs b/src/common/query/src/prelude.rs
index c27b94294e..50668bbbb1 100644
--- a/src/common/query/src/prelude.rs
+++ b/src/common/query/src/prelude.rs
@@ -27,7 +27,16 @@ static GREPTIME_TIMESTAMP_CELL: OnceCell<String> = OnceCell::new();
 static GREPTIME_VALUE_CELL: OnceCell<String> = OnceCell::new();
 
 pub fn set_default_prefix(prefix: Option<&str>) -> Result<()> {
-    match prefix {
+    // Strip surrounding double quotes as a defensive measure against upstream
+    // sources (scripts, CI, template engines, incorrect shell escaping) that may
+    // pass literal `""` as the value instead of an empty string.
+    let stripped = prefix.map(|s| {
+        s.strip_prefix('"')
+            .and_then(|s| s.strip_suffix('"'))
+            .unwrap_or(s)
+    });
+
+    match stripped {
         None => {
             // use default greptime prefix
             GREPTIME_TIMESTAMP_CELL.get_or_init(|| GREPTIME_TIMESTAMP.to_string());
@@ -70,3 +79,45 @@ const GREPTIME_VALUE: &str = "greptime_value";
 pub const GREPTIME_COUNT: &str = "greptime_count";
 /// Default physical table name
 pub const GREPTIME_PHYSICAL_TABLE: &str = "greptime_physical_table";
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // These tests rely on `cargo nextest` running each test in its own process: the prefix
+    // is kept in process-wide `OnceCell`s, so their state must not leak between tests.
+
+    #[test]
+    fn test_set_default_prefix_none() {
+        set_default_prefix(None).unwrap();
+        assert_eq!(greptime_timestamp(), "greptime_timestamp");
+        assert_eq!(greptime_value(), "greptime_value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_empty_string() {
+        set_default_prefix(Some("")).unwrap();
+        assert_eq!(greptime_timestamp(), "timestamp");
+        assert_eq!(greptime_value(), "value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_quoted_empty() {
+        // A literal `""` from an upstream source must behave exactly like an empty string
+        set_default_prefix(Some("\"\"")).unwrap();
+        assert_eq!(greptime_timestamp(), "timestamp");
+        assert_eq!(greptime_value(), "value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_custom() {
+        set_default_prefix(Some("mydb")).unwrap();
+        assert_eq!(greptime_timestamp(), "mydb_timestamp");
+        assert_eq!(greptime_value(), "mydb_value");
+    }
+
+    #[test]
+    fn test_set_default_prefix_invalid() {
+        assert!(set_default_prefix(Some("invalid prefix!")).is_err());
+    }
+}
diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs
index 9070e2babe..50f2dba270 100644
--- a/src/datatypes/src/schema.rs
+++ b/src/datatypes/src/schema.rs
@@ -16,8 +16,8 @@ mod column_schema;
 pub mod constraint;
 
 use std::collections::HashMap;
-use std::fmt;
 use std::sync::Arc;
+use std::{fmt, mem};
 
 use arrow::datatypes::{Field, Schema as ArrowSchema};
 use datafusion_common::DFSchemaRef;
@@ -177,6 +177,26 @@ impl Schema {
         &self.arrow_schema.metadata
     }
 
+    /// Returns the estimated memory footprint of this schema in bytes: the struct itself,
+    /// its column schemas, the name index and the underlying arrow schema.
+    pub fn estimated_size(&self) -> usize {
+        // Only the part of each column beyond its inline slot; the slots themselves are
+        // counted through the vector capacity below. Saturate so this can never underflow.
+        let columns_extra = self
+            .column_schemas
+            .iter()
+            .map(|c| c.estimated_size().saturating_sub(mem::size_of::<ColumnSchema>()))
+            .sum::<usize>();
+        let index_names = self.name_to_index.keys().map(|name| name.capacity()).sum::<usize>();
+
+        mem::size_of_val(self)
+            + mem::size_of::<ColumnSchema>() * self.column_schemas.capacity()
+            + columns_extra
+            + mem::size_of::<(String, usize)>() * self.name_to_index.capacity()
+            + index_names
+            + arrow_schema_size(self.arrow_schema.as_ref())
+    }
+
     /// Generate a new projected schema
     ///
     /// # Panic
@@ -213,6 +233,17 @@ impl Schema {
     }
 }
 
+/// Estimates the bytes used by an arrow schema: the struct itself, fields and metadata.
+fn arrow_schema_size(schema: &ArrowSchema) -> usize {
+    let entries_size = mem::size_of::<(String, String)>() * schema.metadata.capacity();
+    let strings_size: usize = schema
+        .metadata
+        .iter()
+        .map(|(key, value)| key.capacity() + value.capacity())
+        .sum();
+    mem::size_of_val(schema) + schema.fields.size() + entries_size + strings_size
+}
+
 #[derive(Default)]
 pub struct SchemaBuilder {
     column_schemas: Vec<ColumnSchema>,
diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs
index 183cf05da8..2479f4fc41 100644
--- a/src/datatypes/src/schema/column_schema.rs
+++ b/src/datatypes/src/schema/column_schema.rs
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 use std::collections::HashMap;
-use std::fmt;
 use std::str::FromStr;
+use std::{fmt, mem};
 
 use arrow::datatypes::Field;
 use arrow_schema::extension::{
@@ -178,6 +178,19 @@ impl ColumnSchema {
         self
     }
 
+    /// Returns the estimated memory footprint of this column schema, in bytes.
+    pub fn estimated_size(&self) -> usize {
+        // Count the struct without its inline `data_type`; the size reported by the
+        // corresponding arrow type is used for that field instead.
+        let base = mem::size_of_val(self) - mem::size_of_val(&self.data_type);
+        let type_size = self.data_type.as_arrow_type().size();
+        let constraint = match self.default_constraint.as_ref() {
+            Some(default_constraint) => column_default_constraint_size(default_constraint),
+            None => 0,
+        };
+        base + type_size + self.name.capacity() + constraint + metadata_size(&self.metadata)
+    }
+
     /// Set the inverted index for the column.
     /// Similar to [with_inverted_index] but don't take the ownership.
     ///
@@ -493,6 +506,21 @@ impl ColumnSchema {
     }
 }
 
+/// Estimates the bytes held by a metadata map: its entries plus key/value buffers.
+fn metadata_size(metadata: &Metadata) -> usize {
+    let strings_size = metadata
+        .iter()
+        .fold(0, |acc, (key, value)| acc + key.capacity() + value.capacity());
+    mem::size_of::<(String, String)>() * metadata.capacity() + strings_size
+}
+
+fn column_default_constraint_size(default_constraint: &ColumnDefaultConstraint) -> usize {
+    match default_constraint {
+        ColumnDefaultConstraint::Value(value) => value.as_value_ref().data_size(), // literal
+        ColumnDefaultConstraint::Function(expr) => expr.capacity(), // expression text buffer
+    }
+}
+
 /// Column extended type set in column schema's metadata.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ColumnExtType {
diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs
index 61586fc460..912bbfca54 100644
--- a/src/datatypes/src/types/json_type.rs
+++ b/src/datatypes/src/types/json_type.rs
@@ -396,7 +396,7 @@ pub fn jsonb_to_string(val: &[u8]) -> Result<String> {
     match jsonb::from_slice(val) {
         Ok(jsonb_value) => {
             let serialized = jsonb_value.to_string();
-            Ok(serialized)
+            fix_unicode_point(&serialized)
         }
         Err(e) => InvalidJsonbSnafu { error: e }.fail(),
     }
@@ -405,18 +405,12 @@ pub fn jsonb_to_string(val: &[u8]) -> Result<String> {
 /// Converts a json type value to serde_json::Value
 pub fn jsonb_to_serde_json(val: &[u8]) -> Result<serde_json::Value> {
     let json_string = jsonb_to_string(val)?;
-    jsonb_string_to_serde_value(&json_string)
+    serde_json::Value::from_str(&json_string).context(DeserializeSnafu { json: json_string })
 }
 
-/// Attempts to deserialize a JSON text into `serde_json::Value`, with a best-effort
-/// fallback for Rust-style Unicode escape sequences.
+/// Normalizes a JSON string by converting Rust-style Unicode escape sequences to JSON-compatible format.
 ///
-/// This function is intended to be used on JSON strings produced from the internal
-/// JSONB representation (e.g. via [`jsonb_to_string`]). It first calls
-/// `serde_json::Value::from_str` directly. If that succeeds, the parsed value is
-/// returned as-is.
-///
-/// If the initial parse fails, the input is scanned for Rust-style Unicode code
+/// The input is scanned for Rust-style Unicode code
 /// point escapes of the form `\\u{H...}` (a backslash, `u`, an opening brace,
 /// followed by 1–6 hexadecimal digits, and a closing brace). Each such escape is
 /// converted into JSON-compatible UTF‑16 escape sequences:
@@ -427,59 +421,44 @@ pub fn jsonb_to_serde_json(val: &[u8]) -> Result<serde_json::Value> {
 ///   the code point is encoded as a UTF‑16 surrogate pair and emitted as two consecutive
 ///   `\\uXXXX` sequences (as JSON format required).
 ///
-/// After this normalization, the function retries parsing the resulting string as
-/// JSON and returns the deserialized value or a `DeserializeSnafu` error if it
-/// still cannot be parsed.
-fn jsonb_string_to_serde_value(json: &str) -> Result<serde_json::Value> {
-    match serde_json::Value::from_str(json) {
-        Ok(v) => Ok(v),
-        Err(e) => {
-            // If above deserialization is failed, the JSON string might contain some Rust chars
-            // that are somehow incorrectly represented as Unicode code point literal. For example,
-            // "\u{fe0f}". We have to convert them to JSON compatible format, like "\uFE0F", then
-            // try to deserialize the JSON string again.
-            if !e.is_syntax() || !e.to_string().contains("invalid escape") {
-                return Err(e).context(DeserializeSnafu { json });
-            }
+/// Returns the normalized string; escapes beyond U+10FFFF are left unchanged.
+fn fix_unicode_point(json: &str) -> Result<String> {
+    static UNICODE_CODE_POINT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
+        // Match either an escaped backslash (`\\`, consumed first so a following `u{...}` stays
+        // plain text) or a literal "\u{...}" capturing 1–6 hex digits inside braces.
+        Regex::new(r"\\\\|\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e))
+    });
 
-            static UNICODE_CODE_POINT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
-                // Match literal "\u{...}" sequences, capturing 1–6 (code point range) hex digits
-                // inside braces.
-                Regex::new(r"\\u\{([0-9a-fA-F]{1,6})}").unwrap_or_else(|e| panic!("{}", e))
-            });
+    let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| {
+        // Hex payload without braces; absent when the match is an escaped backslash.
+        let hex = caps.get(1).map_or("", |m| m.as_str());
+        let Ok(code) = u32::from_str_radix(hex, 16) else {
+            // No payload or parse failure: emit the matched text unchanged.
+            return caps[0].to_string();
+        };
 
-            let v = UNICODE_CODE_POINT_PATTERN.replace_all(json, |caps: &Captures| {
-                // Extract the hex payload (without braces) and parse to a code point.
-                let hex = &caps[1];
-                let Ok(code) = u32::from_str_radix(hex, 16) else {
-                    // On parse failure, leave the original escape sequence unchanged.
-                    return caps[0].to_string();
-                };
+        if code <= 0xFFFF {
+            // Basic Multilingual Plane: JSON can represent this directly as \uXXXX.
+            format!("\\u{:04X}", code)
+        } else if code > 0x10FFFF {
+            // Beyond max Unicode code point
+            caps[0].to_string()
+        } else {
+            // Supplementary planes: JSON needs UTF-16 surrogate pairs.
+            // Convert the code point to a 20-bit value.
+            let code = code - 0x10000;
 
-                if code <= 0xFFFF {
-                    // Basic Multilingual Plane: JSON can represent this directly as \uXXXX.
-                    format!("\\u{:04X}", code)
-                } else if code > 0x10FFFF {
-                    // Beyond max Unicode code point
-                    caps[0].to_string()
-                } else {
-                    // Supplementary planes: JSON needs UTF-16 surrogate pairs.
-                    // Convert the code point to a 20-bit value.
-                    let code = code - 0x10000;
+            // High surrogate: top 10 bits, offset by 0xD800.
+            let high = 0xD800 + ((code >> 10) & 0x3FF);
 
-                    // High surrogate: top 10 bits, offset by 0xD800.
-                    let high = 0xD800 + ((code >> 10) & 0x3FF);
+            // Low surrogate: bottom 10 bits, offset by 0xDC00.
+            let low = 0xDC00 + (code & 0x3FF);
 
-                    // Low surrogate: bottom 10 bits, offset by 0xDC00.
-                    let low = 0xDC00 + (code & 0x3FF);
-
-                    // Emit two \uXXXX escapes in sequence.
-                    format!("\\u{:04X}\\u{:04X}", high, low)
-                }
-            });
-            serde_json::Value::from_str(&v).context(DeserializeSnafu { json })
+            // Emit two \uXXXX escapes in sequence.
+            format!("\\u{:04X}\\u{:04X}", high, low)
         }
-    }
+    });
+    Ok(v.into_owned())
 }
 
 /// Parses a string to a json type value
@@ -495,45 +474,54 @@ mod tests {
     use crate::json::JsonStructureSettings;
 
     #[test]
-    fn test_jsonb_string_to_serde_value() -> Result<()> {
+    fn test_fix_unicode_point() -> Result<()> {
         let valid_cases = vec![
-            (r#"{"data": "simple ascii"}"#, r#"{"data":"simple ascii"}"#),
+            (r#"{"data": "simple ascii"}"#, r#"{"data": "simple ascii"}"#),
             (
-                r#"{"data": "Greek sigma: \u{03a3}"}"#,
-                r#"{"data":"Greek sigma: Σ"}"#,
+                r#"{"data":"Greek sigma: \u{03a3}"}"#,
+                r#"{"data":"Greek sigma: \u03A3"}"#,
             ),
             (
-                r#"{"data": "Joker card: \u{1f0df}"}"#,
-                r#"{"data":"Joker card: 🃟"}"#,
+                r#"{"data":"Joker card: \u{1f0df}"}"#,
+                r#"{"data":"Joker card: \uD83C\uDCDF"}"#,
             ),
             (
-                r#"{"data": "BMP boundary: \u{ffff}"}"#,
-                r#"{"data":"BMP boundary: ￿"}"#,
+                r#"{"data":"BMP boundary: \u{ffff}"}"#,
+                r#"{"data":"BMP boundary: \uFFFF"}"#,
             ),
             (
-                r#"{"data": "Supplementary min: \u{10000}"}"#,
-                r#"{"data":"Supplementary min: 𐀀"}"#,
+                r#"{"data":"Supplementary min: \u{10000}"}"#,
+                r#"{"data":"Supplementary min: \uD800\uDC00"}"#,
             ),
             (
-                r#"{"data": "Supplementary max: \u{10ffff}"}"#,
-                r#"{"data":"Supplementary max: 􏿿"}"#,
+                r#"{"data":"Supplementary max: \u{10ffff}"}"#,
+                r#"{"data":"Supplementary max: \uDBFF\uDFFF"}"#,
             ),
         ];
         for (input, expect) in valid_cases {
-            let v = jsonb_string_to_serde_value(input)?;
-            assert_eq!(v.to_string(), expect);
+            let v = fix_unicode_point(input)?;
+            assert_eq!(v, expect);
         }
 
-        let invalid_cases = vec![
-            r#"{"data": "Invalid hex: \u{gggg}"}"#,
-            r#"{"data": "Beyond max Unicode code point: \u{110000}"}"#,
-            r#"{"data": "Out of range: \u{1100000}"}"#, // 7 digit
-            r#"{"data": "Empty braces: \u{}"}"#,
+        let invalid_escape_cases = vec![
+            (
+                r#"{"data": "Invalid hex: \u{gggg}"}"#,
+                r#"{"data": "Invalid hex: \u{gggg}"}"#,
+            ),
+            (
+                r#"{"data": "Empty braces: \u{}"}"#,
+                r#"{"data": "Empty braces: \u{}"}"#,
+            ),
+            (
+                r#"{"data": "Out of range: \u{1100000}"}"#,
+                r#"{"data": "Out of range: \u{1100000}"}"#,
+            ),
         ];
-        for input in invalid_cases {
-            let result = jsonb_string_to_serde_value(input);
-            assert!(result.is_err());
+        for (input, expect) in invalid_escape_cases {
+            let v = fix_unicode_point(input)?;
+            assert_eq!(v, expect);
         }
+
         Ok(())
     }
 
diff --git a/src/flow/src/df_optimizer.rs b/src/flow/src/df_optimizer.rs
index 1d41d09346..614b79ccf1 100644
--- a/src/flow/src/df_optimizer.rs
+++ b/src/flow/src/df_optimizer.rs
@@ -16,30 +16,19 @@
 
 #![warn(unused)]
 
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::sync::Arc;
 
 use common_error::ext::BoxedError;
 use common_telemetry::debug;
 use datafusion::config::ConfigOptions;
 use datafusion::error::DataFusionError;
-use datafusion::functions_aggregate::count::count_udaf;
-use datafusion::functions_aggregate::sum::sum_udaf;
 use datafusion::optimizer::analyzer::type_coercion::TypeCoercion;
 use datafusion::optimizer::common_subexpr_eliminate::CommonSubexprEliminate;
 use datafusion::optimizer::optimize_projections::OptimizeProjections;
 use datafusion::optimizer::simplify_expressions::SimplifyExpressions;
-use datafusion::optimizer::utils::NamePreserver;
 use datafusion::optimizer::{Analyzer, AnalyzerRule, Optimizer, OptimizerContext};
-use datafusion_common::tree_node::{
-    Transformed, TreeNode, TreeNodeRecursion, TreeNodeRewriter, TreeNodeVisitor,
-};
-use datafusion_common::{Column, DFSchema, ScalarValue};
-use datafusion_expr::utils::merge_schema;
-use datafusion_expr::{
-    BinaryExpr, ColumnarValue, Expr, Literal, Operator, Projection, ScalarFunctionArgs,
-    ScalarUDFImpl, Signature, TypeSignature, Volatility,
-};
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, TreeNodeVisitor};
 use query::QueryEngine;
 use query::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
 use query::parser::QueryLanguageParser;
@@ -52,7 +41,6 @@ use substrait::DFLogicalSubstraitConvertor;
 
 use crate::adapter::FlownodeContext;
 use crate::error::{DatafusionSnafu, Error, ExternalSnafu, UnexpectedSnafu};
-use crate::expr::{TUMBLE_END, TUMBLE_START};
 use crate::plan::TypedPlan;
 
 // TODO(discord9): use `Analyzer` to manage rules if more `AnalyzerRule` is needed
@@ -63,8 +51,6 @@ pub async fn apply_df_optimizer(
     let cfg = query_ctx.create_config_options();
     let analyzer = Analyzer::with_rules(vec![
         Arc::new(CountWildcardToTimeIndexRule),
-        Arc::new(AvgExpandRule),
-        Arc::new(TumbleExpandRule),
         Arc::new(CheckGroupByRule::new()),
         Arc::new(TypeCoercion::new()),
     ]);
@@ -127,390 +113,6 @@ pub async fn sql_to_flow_plan(
     Ok(flow_plan)
 }
 
-#[derive(Debug)]
-struct AvgExpandRule;
-
-impl AnalyzerRule for AvgExpandRule {
-    fn analyze(
-        &self,
-        plan: datafusion_expr::LogicalPlan,
-        _config: &ConfigOptions,
-    ) -> datafusion_common::Result<datafusion_expr::LogicalPlan> {
-        let transformed = plan
-            .transform_up_with_subqueries(expand_avg_analyzer)?
-            .data
-            .transform_down_with_subqueries(put_aggr_to_proj_analyzer)?
-            .data;
-        Ok(transformed)
-    }
-
-    fn name(&self) -> &str {
-        "avg_expand"
-    }
-}
-
-/// lift aggr's composite aggr_expr to outer proj, and leave aggr only with simple direct aggr expr
-/// i.e.
-/// ```ignore
-/// proj: avg(x)
-/// -- aggr: [sum(x)/count(x) as avg(x)]
-/// ```
-/// becomes:
-/// ```ignore
-/// proj: sum(x)/count(x) as avg(x)
-/// -- aggr: [sum(x), count(x)]
-/// ```
-fn put_aggr_to_proj_analyzer(
-    plan: datafusion_expr::LogicalPlan,
-) -> Result<Transformed<datafusion_expr::LogicalPlan>, DataFusionError> {
-    if let datafusion_expr::LogicalPlan::Projection(proj) = &plan
-        && let datafusion_expr::LogicalPlan::Aggregate(aggr) = proj.input.as_ref()
-    {
-        let mut replace_old_proj_exprs = HashMap::new();
-        let mut expanded_aggr_exprs = vec![];
-        for aggr_expr in &aggr.aggr_expr {
-            let mut is_composite = false;
-            if let Expr::AggregateFunction(_) = &aggr_expr {
-                expanded_aggr_exprs.push(aggr_expr.clone());
-            } else {
-                let old_name = aggr_expr.name_for_alias()?;
-                let new_proj_expr = aggr_expr
-                    .clone()
-                    .transform(|ch| {
-                        if let Expr::AggregateFunction(_) = &ch {
-                            is_composite = true;
-                            expanded_aggr_exprs.push(ch.clone());
-                            Ok(Transformed::yes(Expr::Column(Column::from_qualified_name(
-                                ch.name_for_alias()?,
-                            ))))
-                        } else {
-                            Ok(Transformed::no(ch))
-                        }
-                    })?
-                    .data;
-                replace_old_proj_exprs.insert(old_name, new_proj_expr);
-            }
-        }
-
-        if expanded_aggr_exprs.len() > aggr.aggr_expr.len() {
-            let mut aggr = aggr.clone();
-            aggr.aggr_expr = expanded_aggr_exprs;
-            let mut aggr_plan = datafusion_expr::LogicalPlan::Aggregate(aggr);
-            // important to recompute schema after changing aggr_expr
-            aggr_plan = aggr_plan.recompute_schema()?;
-
-            // reconstruct proj with new proj_exprs
-            let mut new_proj_exprs = proj.expr.clone();
-            for proj_expr in new_proj_exprs.iter_mut() {
-                if let Some(new_proj_expr) =
-                    replace_old_proj_exprs.get(&proj_expr.name_for_alias()?)
-                {
-                    *proj_expr = new_proj_expr.clone();
-                }
-                *proj_expr = proj_expr
-                    .clone()
-                    .transform(|expr| {
-                        if let Some(new_expr) = replace_old_proj_exprs.get(&expr.name_for_alias()?)
-                        {
-                            Ok(Transformed::yes(new_expr.clone()))
-                        } else {
-                            Ok(Transformed::no(expr))
-                        }
-                    })?
-                    .data;
-            }
-            let proj = datafusion_expr::LogicalPlan::Projection(Projection::try_new(
-                new_proj_exprs,
-                Arc::new(aggr_plan),
-            )?);
-            return Ok(Transformed::yes(proj));
-        }
-    }
-    Ok(Transformed::no(plan))
-}
-
-/// expand `avg(<expr>)` function into `cast(sum((<expr>) AS f64)/count((<expr>)`
-fn expand_avg_analyzer(
-    plan: datafusion_expr::LogicalPlan,
-) -> Result<Transformed<datafusion_expr::LogicalPlan>, DataFusionError> {
-    let mut schema = merge_schema(&plan.inputs());
-
-    if let datafusion_expr::LogicalPlan::TableScan(ts) = &plan {
-        let source_schema =
-            DFSchema::try_from_qualified_schema(ts.table_name.clone(), &ts.source.schema())?;
-        schema.merge(&source_schema);
-    }
-
-    let mut expr_rewrite = ExpandAvgRewriter::new(&schema);
-
-    let name_preserver = NamePreserver::new(&plan);
-    // apply coercion rewrite all expressions in the plan individually
-    plan.map_expressions(|expr| {
-        let original_name = name_preserver.save(&expr);
-        Ok(expr
-            .rewrite(&mut expr_rewrite)?
-            .update_data(|expr| original_name.restore(expr)))
-    })?
-    .map_data(|plan| plan.recompute_schema())
-}
-
-/// rewrite `avg(<expr>)` function into `CASE WHEN count(<expr>) !=0 THEN  cast(sum((<expr>) AS avg_return_type)/count((<expr>) ELSE 0`
-///
-/// TODO(discord9): support avg return type decimal128
-///
-/// see impl details at https://github.com/apache/datafusion/blob/4ad4f90d86c57226a4e0fb1f79dfaaf0d404c273/datafusion/expr/src/type_coercion/aggregates.rs#L457-L462
-pub(crate) struct ExpandAvgRewriter<'a> {
-    /// schema of the plan
-    #[allow(unused)]
-    pub(crate) schema: &'a DFSchema,
-}
-
-impl<'a> ExpandAvgRewriter<'a> {
-    fn new(schema: &'a DFSchema) -> Self {
-        Self { schema }
-    }
-}
-
-impl TreeNodeRewriter for ExpandAvgRewriter<'_> {
-    type Node = Expr;
-
-    fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>, DataFusionError> {
-        if let Expr::AggregateFunction(aggr_func) = &expr
-            && aggr_func.func.name() == "avg"
-        {
-            let sum_expr = {
-                let mut tmp = aggr_func.clone();
-                tmp.func = sum_udaf();
-                Expr::AggregateFunction(tmp)
-            };
-            let sum_cast = {
-                let mut tmp = sum_expr.clone();
-                tmp = Expr::Cast(datafusion_expr::Cast {
-                    expr: Box::new(tmp),
-                    data_type: arrow_schema::DataType::Float64,
-                });
-                tmp
-            };
-
-            let count_expr = {
-                let mut tmp = aggr_func.clone();
-                tmp.func = count_udaf();
-
-                Expr::AggregateFunction(tmp)
-            };
-            let count_expr_ref =
-                Expr::Column(Column::from_qualified_name(count_expr.name_for_alias()?));
-
-            let div = BinaryExpr::new(Box::new(sum_cast), Operator::Divide, Box::new(count_expr));
-            let div_expr = Box::new(Expr::BinaryExpr(div));
-
-            let zero = Box::new(0.lit());
-            let not_zero = BinaryExpr::new(Box::new(count_expr_ref), Operator::NotEq, zero.clone());
-            let not_zero = Box::new(Expr::BinaryExpr(not_zero));
-            let null = Box::new(Expr::Literal(ScalarValue::Null, None));
-
-            let case_when =
-                datafusion_expr::Case::new(None, vec![(not_zero, div_expr)], Some(null));
-            let case_when_expr = Expr::Case(case_when);
-
-            return Ok(Transformed::yes(case_when_expr));
-        }
-
-        Ok(Transformed::no(expr))
-    }
-}
-
-/// expand tumble in aggr expr to tumble_start and tumble_end with column name like `window_start`
-#[derive(Debug)]
-struct TumbleExpandRule;
-
-impl AnalyzerRule for TumbleExpandRule {
-    fn analyze(
-        &self,
-        plan: datafusion_expr::LogicalPlan,
-        _config: &ConfigOptions,
-    ) -> datafusion_common::Result<datafusion_expr::LogicalPlan> {
-        let transformed = plan
-            .transform_up_with_subqueries(expand_tumble_analyzer)?
-            .data;
-        Ok(transformed)
-    }
-
-    fn name(&self) -> &str {
-        "tumble_expand"
-    }
-}
-
-/// expand `tumble` in aggr expr to `tumble_start` and `tumble_end`, also expand related alias and column ref
-///
-/// will add `tumble_start` and `tumble_end` to outer projection if not exist before
-fn expand_tumble_analyzer(
-    plan: datafusion_expr::LogicalPlan,
-) -> Result<Transformed<datafusion_expr::LogicalPlan>, DataFusionError> {
-    if let datafusion_expr::LogicalPlan::Projection(proj) = &plan
-        && let datafusion_expr::LogicalPlan::Aggregate(aggr) = proj.input.as_ref()
-    {
-        let mut new_group_expr = vec![];
-        let mut alias_to_expand = HashMap::new();
-        let mut encountered_tumble = false;
-        for expr in aggr.group_expr.iter() {
-            match expr {
-                datafusion_expr::Expr::ScalarFunction(func) if func.name() == "tumble" => {
-                    encountered_tumble = true;
-
-                    let tumble_start = TumbleExpand::new(TUMBLE_START);
-                    let tumble_start = datafusion_expr::expr::ScalarFunction::new_udf(
-                        Arc::new(tumble_start.into()),
-                        func.args.clone(),
-                    );
-                    let tumble_start = datafusion_expr::Expr::ScalarFunction(tumble_start);
-                    let start_col_name = tumble_start.name_for_alias()?;
-                    new_group_expr.push(tumble_start);
-
-                    let tumble_end = TumbleExpand::new(TUMBLE_END);
-                    let tumble_end = datafusion_expr::expr::ScalarFunction::new_udf(
-                        Arc::new(tumble_end.into()),
-                        func.args.clone(),
-                    );
-                    let tumble_end = datafusion_expr::Expr::ScalarFunction(tumble_end);
-                    let end_col_name = tumble_end.name_for_alias()?;
-                    new_group_expr.push(tumble_end);
-
-                    alias_to_expand.insert(expr.name_for_alias()?, (start_col_name, end_col_name));
-                }
-                _ => new_group_expr.push(expr.clone()),
-            }
-        }
-        if !encountered_tumble {
-            return Ok(Transformed::no(plan));
-        }
-        let mut new_aggr = aggr.clone();
-        new_aggr.group_expr = new_group_expr;
-        let new_aggr = datafusion_expr::LogicalPlan::Aggregate(new_aggr).recompute_schema()?;
-        // replace alias in projection if needed, and add new column ref if necessary
-        let mut new_proj_expr = vec![];
-        let mut have_expanded = false;
-
-        for proj_expr in proj.expr.iter() {
-            if let Some((start_col_name, end_col_name)) =
-                alias_to_expand.get(&proj_expr.name_for_alias()?)
-            {
-                let start_col = Column::from_qualified_name(start_col_name);
-                let end_col = Column::from_qualified_name(end_col_name);
-                new_proj_expr.push(datafusion_expr::Expr::Column(start_col));
-                new_proj_expr.push(datafusion_expr::Expr::Column(end_col));
-                have_expanded = true;
-            } else {
-                new_proj_expr.push(proj_expr.clone());
-            }
-        }
-
-        // append to end of projection if not exist
-        if !have_expanded {
-            for (start_col_name, end_col_name) in alias_to_expand.values() {
-                let start_col = Column::from_qualified_name(start_col_name);
-                let end_col = Column::from_qualified_name(end_col_name);
-                new_proj_expr.push(datafusion_expr::Expr::Column(start_col).alias("window_start"));
-                new_proj_expr.push(datafusion_expr::Expr::Column(end_col).alias("window_end"));
-            }
-        }
-
-        let new_proj = datafusion_expr::LogicalPlan::Projection(Projection::try_new(
-            new_proj_expr,
-            Arc::new(new_aggr),
-        )?);
-        return Ok(Transformed::yes(new_proj));
-    }
-
-    Ok(Transformed::no(plan))
-}
-
-/// This is a placeholder for tumble_start and tumble_end function, so that datafusion can
-/// recognize them as scalar function
-#[derive(Debug, PartialEq, Eq, Hash)]
-pub struct TumbleExpand {
-    signature: Signature,
-    name: String,
-}
-
-impl TumbleExpand {
-    pub fn new(name: &str) -> Self {
-        Self {
-            signature: Signature::new(TypeSignature::UserDefined, Volatility::Immutable),
-            name: name.to_string(),
-        }
-    }
-}
-
-impl ScalarUDFImpl for TumbleExpand {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    fn name(&self) -> &str {
-        &self.name
-    }
-
-    /// elide the signature for now
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn coerce_types(
-        &self,
-        arg_types: &[arrow_schema::DataType],
-    ) -> datafusion_common::Result<Vec<arrow_schema::DataType>> {
-        match (arg_types.first(), arg_types.get(1), arg_types.get(2)) {
-            (Some(ts), Some(window), opt) => {
-                use arrow_schema::DataType::*;
-                if !matches!(ts, Date32 | Timestamp(_, _)) {
-                    return Err(DataFusionError::Plan(
-                        format!("Expect timestamp column as first arg for tumble_start, found {:?}", ts)
-                    ));
-                }
-                if !matches!(window, Utf8 | Interval(_)) {
-                    return Err(DataFusionError::Plan(
-                        format!("Expect second arg for window size's type being interval for tumble_start, found {:?}", window),
-                    ));
-                }
-
-                if let Some(start_time) = opt
-                    && !matches!(start_time,  Utf8 | Date32 | Timestamp(_, _)){
-                        return Err(DataFusionError::Plan(
-                            format!("Expect start_time to either be date, timestamp or string, found {:?}", start_time)
-                        ));
-                    }
-
-                Ok(arg_types.to_vec())
-            }
-            _ => Err(DataFusionError::Plan(
-                "Expect tumble function have at least two arg(timestamp column and window size) and a third optional arg for starting time".to_string(),
-            )),
-        }
-    }
-
-    fn return_type(
-        &self,
-        arg_types: &[arrow_schema::DataType],
-    ) -> Result<arrow_schema::DataType, DataFusionError> {
-        arg_types.first().cloned().ok_or_else(|| {
-            DataFusionError::Plan(
-                "Expect tumble function have at least two arg(timestamp column and window size)"
-                    .to_string(),
-            )
-        })
-    }
-
-    fn invoke_with_args(
-        &self,
-        _args: ScalarFunctionArgs,
-    ) -> datafusion_common::Result<ColumnarValue> {
-        Err(DataFusionError::Plan(
-            "This function should not be executed by datafusion".to_string(),
-        ))
-    }
-}
-
 /// This rule check all group by exprs, and make sure they are also in select clause in a aggr query
 #[derive(Debug)]
 struct CheckGroupByRule {}
diff --git a/src/flow/src/transform/aggr.rs b/src/flow/src/transform/aggr.rs
index 579f0e8ee3..861ca8fe65 100644
--- a/src/flow/src/transform/aggr.rs
+++ b/src/flow/src/transform/aggr.rs
@@ -382,10 +382,9 @@ impl TypedPlan {
 
 #[cfg(test)]
 mod test {
-    use std::time::Duration;
 
     use bytes::BytesMut;
-    use common_time::{IntervalMonthDayNano, Timestamp};
+    use common_time::IntervalMonthDayNano;
     use datatypes::data_type::ConcreteDataType as CDT;
     use datatypes::prelude::ConcreteDataType;
     use datatypes::value::Value;
@@ -397,898 +396,6 @@ mod test {
     use crate::repr::{ColumnType, RelationType};
     use crate::transform::test::{create_test_ctx, create_test_query_engine, sql_to_substrait};
 
-    #[tokio::test]
-    async fn test_df_func_basic() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT sum(abs(number)) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00');";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected =
-            TypedPlan {
-                schema: RelationType::new(vec![
-                    ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                    ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                    ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                ])
-                .with_key(vec![2])
-                .with_time_index(Some(1))
-                .into_named(vec![
-                    Some("sum(abs(numbers_with_ts.number))".to_string()),
-                    Some("window_start".to_string()),
-                    Some("window_end".to_string()),
-                ]),
-                plan: Plan::Mfp {
-                    input: Box::new(
-                        Plan::Reduce {
-                            input: Box::new(
-                                Plan::Get {
-                                    id: crate::expr::Id::Global(GlobalId::User(1)),
-                                }
-                                .with_types(
-                                    RelationType::new(vec![
-                                        ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                        ColumnType::new(
-                                            ConcreteDataType::timestamp_millisecond_datatype(),
-                                            false,
-                                        ),
-                                    ])
-                                    .into_named(vec![
-                                        Some("number".to_string()),
-                                        Some("ts".to_string()),
-                                    ]),
-                                )
-                                .mfp(MapFilterProject::new(2).into_safe())
-                                .unwrap(),
-                            ),
-                            key_val_plan: KeyValPlan {
-                                key_plan: MapFilterProject::new(2)
-                                    .map(vec![
-                                        ScalarExpr::Column(1).call_unary(
-                                            UnaryFunc::TumbleWindowFloor {
-                                                window_size: Duration::from_nanos(1_000_000_000),
-                                                start_time: Some(Timestamp::new_millisecond(
-                                                    1625097600000,
-                                                )),
-                                            },
-                                        ),
-                                        ScalarExpr::Column(1).call_unary(
-                                            UnaryFunc::TumbleWindowCeiling {
-                                                window_size: Duration::from_nanos(1_000_000_000),
-                                                start_time: Some(Timestamp::new_millisecond(
-                                                    1625097600000,
-                                                )),
-                                            },
-                                        ),
-                                    ])
-                                    .unwrap()
-                                    .project(vec![2, 3])
-                                    .unwrap()
-                                    .into_safe(),
-                                val_plan: MapFilterProject::new(2)
-                                    .map(vec![ScalarExpr::CallDf {
-                                    df_scalar_fn: DfScalarFunction::try_from_raw_fn(
-                                        RawDfScalarFn {
-                                            f: BytesMut::from(
-                                                b"\x08\x02\"\x08\x1a\x06\x12\x04\n\x02\x12\0"
-                                                    .as_ref(),
-                                            ),
-                                            input_schema: RelationType::new(vec![ColumnType::new(
-                                                ConcreteDataType::uint32_datatype(),
-                                                false,
-                                            )])
-                                            .into_unnamed(),
-                                            extensions: FunctionExtensions::from_iter(
-                                                [
-                                                    (0, "tumble_start".to_string()),
-                                                    (1, "tumble_end".to_string()),
-                                                    (2, "abs".to_string()),
-                                                    (3, "sum".to_string()),
-                                                ]
-                                                .into_iter(),
-                                            ),
-                                        },
-                                    )
-                                    .await
-                                    .unwrap(),
-                                    exprs: vec![ScalarExpr::Column(0)],
-                                }
-                                .cast(CDT::uint64_datatype())])
-                                    .unwrap()
-                                    .project(vec![2])
-                                    .unwrap()
-                                    .into_safe(),
-                            },
-                            reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                                full_aggrs: vec![aggr_expr.clone()],
-                                simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                                distinct_aggrs: vec![],
-                            }),
-                        }
-                        .with_types(
-                            RelationType::new(vec![
-                                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                                ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                            ])
-                            .with_key(vec![1])
-                            .with_time_index(Some(0))
-                            .into_unnamed(),
-                        ),
-                    ),
-                    mfp: MapFilterProject::new(3)
-                        .map(vec![
-                            ScalarExpr::Column(2),
-                            ScalarExpr::Column(0),
-                            ScalarExpr::Column(1),
-                        ])
-                        .unwrap()
-                        .project(vec![3, 4, 5])
-                        .unwrap(),
-                },
-            };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_df_func_expr_tree() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT abs(sum(number)) FROM numbers_with_ts GROUP BY tumble(ts, '1 second', '2021-07-01 00:00:00');";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![2])
-            .with_time_index(Some(1))
-            .into_named(vec![
-                Some("abs(sum(numbers_with_ts.number))".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(1_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(1_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())])
-                                .unwrap()
-                                .project(vec![2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: vec![aggr_expr.clone()],
-                            simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                            ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                        ])
-                        .with_key(vec![1])
-                        .with_time_index(Some(0))
-                        .into_named(vec![None, None, None]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        ScalarExpr::CallDf {
-                            df_scalar_fn: DfScalarFunction::try_from_raw_fn(RawDfScalarFn {
-                                f: BytesMut::from(b"\"\x08\x1a\x06\x12\x04\n\x02\x12\0".as_ref()),
-                                input_schema: RelationType::new(vec![ColumnType::new(
-                                    ConcreteDataType::uint64_datatype(),
-                                    true,
-                                )])
-                                .into_unnamed(),
-                                extensions: FunctionExtensions::from_iter(
-                                    [
-                                        (0, "abs".to_string()),
-                                        (1, "tumble_start".to_string()),
-                                        (2, "tumble_end".to_string()),
-                                        (3, "sum".to_string()),
-                                    ]
-                                    .into_iter(),
-                                ),
-                            })
-                            .await
-                            .unwrap(),
-                            exprs: vec![ScalarExpr::Column(2)],
-                        },
-                        ScalarExpr::Column(0),
-                        ScalarExpr::Column(1),
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4, 5])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    /// TODO(discord9): add more illegal sql tests
-    #[tokio::test]
-    async fn test_tumble_composite() {
-        let engine = create_test_query_engine();
-        let sql =
-            "SELECT number, avg(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour'), number";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_exprs = vec![
-            AggregateExpr {
-                func: AggregateFunc::SumUInt64,
-                expr: ScalarExpr::Column(0),
-                distinct: false,
-            },
-            AggregateExpr {
-                func: AggregateFunc::Count,
-                expr: ScalarExpr::Column(1),
-                distinct: false,
-            },
-        ];
-        let avg_expr = ScalarExpr::If {
-            cond: Box::new(ScalarExpr::Column(4).call_binary(
-                ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()),
-                BinaryFunc::NotEq,
-            )),
-            then: Box::new(
-                ScalarExpr::Column(3)
-                    .cast(CDT::float64_datatype())
-                    .call_binary(
-                        ScalarExpr::Column(4).cast(CDT::float64_datatype()),
-                        BinaryFunc::DivFloat64,
-                    ),
-            ),
-            els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())),
-        };
-        let expected = TypedPlan {
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3, 4])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(0).cast(CDT::uint64_datatype()),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: aggr_exprs.clone(),
-                            simple_aggrs: vec![
-                                AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0),
-                                AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1),
-                            ],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            // keys
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start(time index)
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end(pk)
-                            ColumnType::new(CDT::uint32_datatype(), false), // number(pk)
-                            // values
-                            ColumnType::new(CDT::uint64_datatype(), true), // avg.sum(number)
-                            ColumnType::new(CDT::int64_datatype(), true),  // avg.count(number)
-                        ])
-                        .with_key(vec![1, 2])
-                        .with_time_index(Some(0))
-                        .into_named(vec![
-                            None,
-                            None,
-                            Some("number".to_string()),
-                            None,
-                            None,
-                        ]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(5)
-                    .map(vec![
-                        ScalarExpr::Column(2), // number(pk)
-                        avg_expr,
-                        ScalarExpr::Column(0), // window start
-                        ScalarExpr::Column(1), // window end
-                    ])
-                    .unwrap()
-                    .project(vec![5, 6, 7, 8])
-                    .unwrap(),
-            },
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint32_datatype(), false), // number
-                ColumnType::new(CDT::float64_datatype(), true), // avg(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![0, 3])
-            .with_time_index(Some(2))
-            .into_named(vec![
-                Some("number".to_string()),
-                Some("avg(numbers_with_ts.number)".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_tumble_parse_optional() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour')";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![2])
-            .with_time_index(Some(1))
-            .into_named(vec![
-                Some("sum(numbers_with_ts.number)".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: None,
-                                        },
-                                    ),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())])
-                                .unwrap()
-                                .project(vec![2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: vec![aggr_expr.clone()],
-                            simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                            ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                        ])
-                        .with_key(vec![1])
-                        .with_time_index(Some(0))
-                        .into_named(vec![None, None, None]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        ScalarExpr::Column(2),
-                        ScalarExpr::Column(0),
-                        ScalarExpr::Column(1),
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4, 5])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_tumble_parse() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour', '2021-07-01 00:00:00')";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_expr = AggregateExpr {
-            func: AggregateFunc::SumUInt64,
-            expr: ScalarExpr::Column(0),
-            distinct: false,
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::uint64_datatype(), true), // sum(number)
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-            ])
-            .with_key(vec![2])
-            .with_time_index(Some(1))
-            .into_named(vec![
-                Some("sum(numbers_with_ts.number)".to_string()),
-                Some("window_start".to_string()),
-                Some("window_end".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(1)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![
-                                    ColumnType::new(ConcreteDataType::uint32_datatype(), false),
-                                    ColumnType::new(
-                                        ConcreteDataType::timestamp_millisecond_datatype(),
-                                        false,
-                                    ),
-                                ])
-                                .into_named(vec![
-                                    Some("number".to_string()),
-                                    Some("ts".to_string()),
-                                ]),
-                            )
-                            .mfp(MapFilterProject::new(2).into_safe())
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(2)
-                                .map(vec![
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowFloor {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                    ScalarExpr::Column(1).call_unary(
-                                        UnaryFunc::TumbleWindowCeiling {
-                                            window_size: Duration::from_nanos(3_600_000_000_000),
-                                            start_time: Some(Timestamp::new_millisecond(
-                                                1625097600000,
-                                            )),
-                                        },
-                                    ),
-                                ])
-                                .unwrap()
-                                .project(vec![2, 3])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(2)
-                                .map(vec![ScalarExpr::Column(0).cast(CDT::uint64_datatype())])
-                                .unwrap()
-                                .project(vec![2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: vec![aggr_expr.clone()],
-                            simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window start
-                            ColumnType::new(CDT::timestamp_millisecond_datatype(), true), // window end
-                            ColumnType::new(CDT::uint64_datatype(), true), //sum(number)
-                        ])
-                        .with_key(vec![1])
-                        .with_time_index(Some(0))
-                        .into_unnamed(),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        ScalarExpr::Column(2),
-                        ScalarExpr::Column(0),
-                        ScalarExpr::Column(1),
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4, 5])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
-    #[tokio::test]
-    async fn test_avg_group_by() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT avg(number), number FROM numbers GROUP BY number";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan).await;
-
-        let aggr_exprs = vec![
-            AggregateExpr {
-                func: AggregateFunc::SumUInt64,
-                expr: ScalarExpr::Column(0),
-                distinct: false,
-            },
-            AggregateExpr {
-                func: AggregateFunc::Count,
-                expr: ScalarExpr::Column(1),
-                distinct: false,
-            },
-        ];
-        let avg_expr = ScalarExpr::If {
-            cond: Box::new(ScalarExpr::Column(2).call_binary(
-                ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()),
-                BinaryFunc::NotEq,
-            )),
-            then: Box::new(
-                ScalarExpr::Column(1)
-                    .cast(CDT::float64_datatype())
-                    .call_binary(
-                        ScalarExpr::Column(2).cast(CDT::float64_datatype()),
-                        BinaryFunc::DivFloat64,
-                    ),
-            ),
-            els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())),
-        };
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![
-                ColumnType::new(CDT::float64_datatype(), true), // avg(number: u32) -> f64
-                ColumnType::new(CDT::uint32_datatype(), false), // number
-            ])
-            .with_key(vec![1])
-            .into_named(vec![
-                Some("avg(numbers.number)".to_string()),
-                Some("number".to_string()),
-            ]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Get {
-                                id: crate::expr::Id::Global(GlobalId::User(0)),
-                            }
-                            .with_types(
-                                RelationType::new(vec![ColumnType::new(
-                                    ConcreteDataType::uint32_datatype(),
-                                    false,
-                                )])
-                                .into_named(vec![Some("number".to_string())]),
-                            )
-                            .mfp(
-                                MapFilterProject::new(1)
-                                    .project(vec![0])
-                                    .unwrap()
-                                    .into_safe(),
-                            )
-                            .unwrap(),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(1)
-                                .map(vec![ScalarExpr::Column(0)])
-                                .unwrap()
-                                .project(vec![1])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(1)
-                                .map(vec![
-                                    ScalarExpr::Column(0).cast(CDT::uint64_datatype()),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![1, 2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: aggr_exprs.clone(),
-                            simple_aggrs: vec![
-                                AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0),
-                                AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1),
-                            ],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(ConcreteDataType::uint32_datatype(), false), // key: number
-                            ColumnType::new(ConcreteDataType::uint64_datatype(), true),  // sum
-                            ColumnType::new(ConcreteDataType::int64_datatype(), true),   // count
-                        ])
-                        .with_key(vec![0])
-                        .into_named(vec![
-                            Some("number".to_string()),
-                            None,
-                            None,
-                        ]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(3)
-                    .map(vec![
-                        avg_expr, // col 3
-                        ScalarExpr::Column(0),
-                        // TODO(discord9): optimize mfp so to remove indirect ref
-                    ])
-                    .unwrap()
-                    .project(vec![3, 4])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan.unwrap(), expected);
-    }
-
-    #[tokio::test]
-    async fn test_avg() {
-        let engine = create_test_query_engine();
-        let sql = "SELECT avg(number) FROM numbers";
-        let plan = sql_to_substrait(engine.clone(), sql).await;
-
-        let mut ctx = create_test_ctx();
-
-        let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan)
-            .await
-            .unwrap();
-
-        let aggr_exprs = vec![
-            AggregateExpr {
-                func: AggregateFunc::SumUInt64,
-                expr: ScalarExpr::Column(0),
-                distinct: false,
-            },
-            AggregateExpr {
-                func: AggregateFunc::Count,
-                expr: ScalarExpr::Column(1),
-                distinct: false,
-            },
-        ];
-        let avg_expr = ScalarExpr::If {
-            cond: Box::new(ScalarExpr::Column(1).call_binary(
-                ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()),
-                BinaryFunc::NotEq,
-            )),
-            then: Box::new(
-                ScalarExpr::Column(0)
-                    .cast(CDT::float64_datatype())
-                    .call_binary(
-                        ScalarExpr::Column(1).cast(CDT::float64_datatype()),
-                        BinaryFunc::DivFloat64,
-                    ),
-            ),
-            els: Box::new(ScalarExpr::Literal(Value::Null, CDT::float64_datatype())),
-        };
-        let input = Box::new(
-            Plan::Get {
-                id: crate::expr::Id::Global(GlobalId::User(0)),
-            }
-            .with_types(
-                RelationType::new(vec![ColumnType::new(
-                    ConcreteDataType::uint32_datatype(),
-                    false,
-                )])
-                .into_named(vec![Some("number".to_string())]),
-            ),
-        );
-        let expected = TypedPlan {
-            schema: RelationType::new(vec![ColumnType::new(CDT::float64_datatype(), true)])
-                .into_named(vec![Some("avg(numbers.number)".to_string())]),
-            plan: Plan::Mfp {
-                input: Box::new(
-                    Plan::Reduce {
-                        input: Box::new(
-                            Plan::Mfp {
-                                input: input.clone(),
-                                mfp: MapFilterProject::new(1).project(vec![0]).unwrap(),
-                            }
-                            .with_types(
-                                RelationType::new(vec![ColumnType::new(
-                                    CDT::uint32_datatype(),
-                                    false,
-                                )])
-                                .into_named(vec![Some("number".to_string())]),
-                            ),
-                        ),
-                        key_val_plan: KeyValPlan {
-                            key_plan: MapFilterProject::new(1)
-                                .project(vec![])
-                                .unwrap()
-                                .into_safe(),
-                            val_plan: MapFilterProject::new(1)
-                                .map(vec![
-                                    ScalarExpr::Column(0).cast(CDT::uint64_datatype()),
-                                    ScalarExpr::Column(0),
-                                ])
-                                .unwrap()
-                                .project(vec![1, 2])
-                                .unwrap()
-                                .into_safe(),
-                        },
-                        reduce_plan: ReducePlan::Accumulable(AccumulablePlan {
-                            full_aggrs: aggr_exprs.clone(),
-                            simple_aggrs: vec![
-                                AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0),
-                                AggrWithIndex::new(aggr_exprs[1].clone(), 1, 1),
-                            ],
-                            distinct_aggrs: vec![],
-                        }),
-                    }
-                    .with_types(
-                        RelationType::new(vec![
-                            ColumnType::new(ConcreteDataType::uint64_datatype(), true), // sum
-                            ColumnType::new(ConcreteDataType::int64_datatype(), true),  // count
-                        ])
-                        .into_named(vec![None, None]),
-                    ),
-                ),
-                mfp: MapFilterProject::new(2)
-                    .map(vec![
-                        avg_expr,
-                        // TODO(discord9): optimize mfp so to remove indirect ref
-                    ])
-                    .unwrap()
-                    .project(vec![2])
-                    .unwrap(),
-            },
-        };
-        assert_eq!(flow_plan, expected);
-    }
-
     #[tokio::test]
     async fn test_sum() {
         let engine = create_test_query_engine();
diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs
index fa8a74cad2..ce589bb677 100644
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod builder;
+mod dashboard;
 mod grpc;
 mod influxdb;
 mod jaeger;
diff --git a/src/frontend/src/instance/dashboard.rs b/src/frontend/src/instance/dashboard.rs
new file mode 100644
index 0000000000..373961dbfa
--- /dev/null
+++ b/src/frontend/src/instance/dashboard.rs
@@ -0,0 +1,405 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use api::v1::value::ValueData;
+use api::v1::{
+    ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest,
+    RowInsertRequests, Rows, SemanticType,
+};
+use async_trait::async_trait;
+use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine};
+use common_error::ext::BoxedError;
+use common_query::OutputData;
+use common_recordbatch::util as record_util;
+use common_telemetry::info;
+use common_time::FOREVER;
+use datafusion::datasource::DefaultTableSource;
+use datafusion::logical_expr::col;
+use datafusion::sql::TableReference;
+use datafusion_expr::{DmlStatement, LogicalPlan, lit};
+use datatypes::arrow::array::{Array, AsArray};
+use servers::error::{
+    CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, ExecuteQuerySnafu, NotSupportedSnafu,
+    TableNotFoundSnafu,
+};
+use servers::query_handler::DashboardDefinition;
+use session::context::{QueryContextBuilder, QueryContextRef};
+use snafu::{OptionExt, ResultExt};
+use table::TableRef;
+use table::metadata::TableInfo;
+use table::requests::TTL_KEY;
+use table::table::adapter::DfTableProviderAdapter;
+
+use crate::instance::Instance;
+
+pub const DASHBOARD_TABLE_NAME: &str = "dashboard";
+pub const DASHBOARD_TABLE_NAME_COLUMN_NAME: &str = "name";
+pub const DASHBOARD_TABLE_DEFINITION_COLUMN_NAME: &str = "definition";
+pub const DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME: &str = "created_at";
+
+impl Instance {
+    /// Build the schema of the dashboard table (all three columns are non-nullable).
+    /// Returns the (time index name, primary key names, column definitions) tuple.
+    fn build_dashboard_schema() -> (String, Vec<String>, Vec<ColumnDef>) {
+        (
+            DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(), // time index
+            vec![DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string()], // primary keys
+            vec![
+                ColumnDef {
+                    name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(),
+                    data_type: ColumnDataType::String as i32,
+                    is_nullable: false,
+                    default_constraint: vec![],
+                    semantic_type: SemanticType::Tag as i32,
+                    comment: String::new(),
+                    datatype_extension: None,
+                    options: None,
+                },
+                ColumnDef {
+                    name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(),
+                    data_type: ColumnDataType::String as i32,
+                    is_nullable: false,
+                    default_constraint: vec![],
+                    semantic_type: SemanticType::Field as i32,
+                    comment: String::new(),
+                    datatype_extension: None,
+                    options: None,
+                },
+                ColumnDef {
+                    name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(),
+                    data_type: ColumnDataType::TimestampNanosecond as i32,
+                    is_nullable: false,
+                    default_constraint: vec![],
+                    semantic_type: SemanticType::Timestamp as i32,
+                    comment: String::new(),
+                    datatype_extension: None,
+                    options: None,
+                },
+            ],
+        )
+    }
+
+    /// Build the column schemas, in `name`, `definition`, `created_at` order, for a row insert.
+    fn build_dashboard_insert_column_schemas() -> Vec<PbColumnSchema> {
+        vec![
+            PbColumnSchema {
+                column_name: DASHBOARD_TABLE_NAME_COLUMN_NAME.to_string(),
+                datatype: ColumnDataType::String.into(),
+                semantic_type: SemanticType::Tag.into(),
+                ..Default::default()
+            },
+            PbColumnSchema {
+                column_name: DASHBOARD_TABLE_DEFINITION_COLUMN_NAME.to_string(),
+                datatype: ColumnDataType::String.into(),
+                semantic_type: SemanticType::Field.into(),
+                ..Default::default()
+            },
+            PbColumnSchema {
+                column_name: DASHBOARD_TABLE_CREATED_AT_COLUMN_NAME.to_string(),
+                datatype: ColumnDataType::TimestampNanosecond.into(),
+                semantic_type: SemanticType::Timestamp.into(),
+                ..Default::default()
+            },
+        ]
+    }
+
+    fn dashboard_query_ctx(table_info: &TableInfo) -> QueryContextRef {
+        QueryContextBuilder::default()
+            .current_catalog(table_info.catalog_name.clone()) // catalog recorded in table_info
+            .current_schema(table_info.schema_name.clone()) // schema recorded in table_info
+            .build()
+            .into()
+    }
+
+    async fn create_dashboard_table_if_not_exists(
+        &self,
+        ctx: QueryContextRef,
+    ) -> servers::error::Result<TableRef> {
+        let catalog = ctx.current_catalog(); // the schema is always DEFAULT_PRIVATE_SCHEMA_NAME
+
+        if let Some(table) = self
+            .catalog_manager
+            .table(
+                catalog,
+                DEFAULT_PRIVATE_SCHEMA_NAME,
+                DASHBOARD_TABLE_NAME,
+                Some(&ctx),
+            )
+            .await
+            .context(CatalogSnafu)?
+        {
+            return Ok(table); // fast path: the table already exists
+        }
+
+        let (time_index, primary_keys, column_defs) = Self::build_dashboard_schema();
+
+        let mut table_options = HashMap::new();
+        table_options.insert(TTL_KEY.to_string(), FOREVER.to_string()); // TTL = FOREVER
+
+        let mut create_table_expr = api::v1::CreateTableExpr {
+            catalog_name: catalog.to_string(),
+            schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(),
+            table_name: DASHBOARD_TABLE_NAME.to_string(),
+            desc: "GreptimeDB dashboard table".to_string(),
+            column_defs,
+            time_index,
+            primary_keys,
+            create_if_not_exists: true, // NOTE(review): presumably tolerates a racing creator
+            table_options,
+            table_id: None,
+            engine: default_engine().to_string(),
+        };
+
+        self.statement_executor
+            .create_table_inner(&mut create_table_expr, None, ctx.clone())
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let table = self // look the table up again now that the create call has returned
+            .catalog_manager
+            .table(
+                catalog,
+                DEFAULT_PRIVATE_SCHEMA_NAME,
+                DASHBOARD_TABLE_NAME,
+                Some(&ctx),
+            )
+            .await
+            .context(CatalogSnafu)?
+            .context(TableNotFoundSnafu {
+                catalog: catalog.to_string(),
+                schema: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(),
+                table: DASHBOARD_TABLE_NAME.to_string(),
+            })?;
+
+        Ok(table)
+    }
+
+    /// Insert a dashboard row (`created_at` is always 0), creating the table first if missing.
+    async fn insert_dashboard(
+        &self,
+        name: &str,
+        definition: &str,
+        query_ctx: QueryContextRef,
+    ) -> servers::error::Result<()> {
+        let table = self
+            .create_dashboard_table_if_not_exists(query_ctx.clone())
+            .await?;
+        let table_info = table.table_info();
+
+        let insert = RowInsertRequest {
+            table_name: DASHBOARD_TABLE_NAME.to_string(),
+            rows: Some(Rows {
+                schema: Self::build_dashboard_insert_column_schemas(),
+                rows: vec![Row {
+                    values: vec![
+                        ValueData::StringValue(name.to_string()).into(),
+                        ValueData::StringValue(definition.to_string()).into(),
+                        ValueData::TimestampNanosecondValue(0).into(), // NOTE(review): confirm
+                    ],
+                }],
+            }),
+        };
+
+        let requests = RowInsertRequests {
+            inserts: vec![insert],
+        };
+
+        let output = self
+            .inserter
+            .handle_row_inserts(
+                requests,
+                Self::dashboard_query_ctx(&table_info), // context derived from the table itself
+                &self.statement_executor,
+                false,
+                false,
+            )
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        info!(
+            "Insert dashboard success, name: {}, table: {}, output: {:?}",
+            name,
+            table_info.full_table_name(),
+            output
+        );
+
+        Ok(())
+    }
+
+    /// List all dashboards; returns an empty list when the dashboard table does not exist.
+    async fn list_dashboards(
+        &self,
+        query_ctx: QueryContextRef,
+    ) -> servers::error::Result<Vec<DashboardDefinition>> {
+        let table = if let Some(table) = self
+            .catalog_manager
+            .table(
+                query_ctx.current_catalog(),
+                DEFAULT_PRIVATE_SCHEMA_NAME,
+                DASHBOARD_TABLE_NAME,
+                Some(&query_ctx),
+            )
+            .await
+            .context(CatalogSnafu)?
+        {
+            table
+        } else {
+            return Ok(vec![]); // no table, so nothing to list; the table is not created here
+        };
+
+        let table_info = table.table_info();
+
+        let dataframe = self
+            .query_engine
+            .read_table(table.clone())
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let dataframe = dataframe
+            .select_columns(&[
+                DASHBOARD_TABLE_NAME_COLUMN_NAME, // read back below as column 0
+                DASHBOARD_TABLE_DEFINITION_COLUMN_NAME, // read back below as column 1
+            ])
+            .context(DataFusionSnafu)?;
+
+        let plan = dataframe.into_parts().1;
+
+        let output = self
+            .query_engine
+            .execute(plan, Self::dashboard_query_ctx(&table_info))
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let stream = match output.data {
+            OutputData::Stream(stream) => stream,
+            OutputData::RecordBatches(record_batches) => record_batches.as_stream(),
+            _ => unreachable!(), // NOTE(review): panics on any other output variant
+        };
+
+        let records = record_util::collect(stream)
+            .await
+            .context(CollectRecordbatchSnafu)?;
+
+        let mut dashboards = Vec::new();
+
+        for r in &records {
+            let name_column = r.column(0);
+            let definition_column = r.column(1);
+
+            let name = name_column
+                .as_string_opt::<i32>()
+                .context(NotSupportedSnafu {
+                    feat: "Invalid data type for greptime_private.dashboard.name",
+                })?;
+
+            let definition =
+                definition_column
+                    .as_string_opt::<i32>()
+                    .context(NotSupportedSnafu {
+                        feat: "Invalid data type for greptime_private.dashboard.definition",
+                    })?;
+
+            for i in 0..name.len() {
+                dashboards.push(DashboardDefinition {
+                    name: name.value(i).to_string(),
+                    definition: definition.value(i).to_string(),
+                });
+            }
+        }
+
+        Ok(dashboards)
+    }
+
+    /// Delete a dashboard by name (the dashboard table is created first if it does not exist).
+    async fn delete_dashboard(
+        &self,
+        name: &str,
+        query_ctx: QueryContextRef,
+    ) -> servers::error::Result<()> {
+        let table = self
+            .create_dashboard_table_if_not_exists(query_ctx.clone()) // NOTE(review): DDL on delete?
+            .await?;
+        let table_info = table.table_info();
+
+        let dataframe = self
+            .query_engine
+            .read_table(table.clone())
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        let name_condition = col(DASHBOARD_TABLE_NAME_COLUMN_NAME).eq(lit(name)); // name = <name>
+
+        let dataframe = dataframe.filter(name_condition).context(DataFusionSnafu)?;
+
+        let table_name = TableReference::full(
+            table_info.catalog_name.clone(),
+            table_info.schema_name.clone(),
+            table_info.name.clone(),
+        );
+
+        let table_provider = Arc::new(DfTableProviderAdapter::new(table.clone()));
+        let table_source = Arc::new(DefaultTableSource::new(table_provider));
+
+        let stmt = DmlStatement::new(
+            table_name,
+            table_source,
+            datafusion_expr::WriteOp::Delete,
+            Arc::new(dataframe.into_parts().1), // input plan: the scan filtered by name
+        );
+
+        let plan = LogicalPlan::Dml(stmt);
+
+        let output = self
+            .query_engine
+            .execute(plan, Self::dashboard_query_ctx(&table_info))
+            .await
+            .map_err(BoxedError::new)
+            .context(ExecuteQuerySnafu)?;
+
+        info!(
+            "Delete dashboard success, name: {}, table: {}, output: {:?}",
+            name,
+            table_info.full_table_name(),
+            output
+        );
+
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl servers::query_handler::DashboardHandler for Instance {
+    async fn save(
+        &self,
+        name: &str,
+        definition: &str,
+        ctx: QueryContextRef,
+    ) -> servers::error::Result<()> {
+        self.insert_dashboard(name, definition, ctx).await // delegates to the inherent method
+    }
+
+    async fn list(&self, ctx: QueryContextRef) -> servers::error::Result<Vec<DashboardDefinition>> {
+        self.list_dashboards(ctx).await // delegates to the inherent method
+    }
+
+    async fn delete(&self, name: &str, ctx: QueryContextRef) -> servers::error::Result<()> {
+        self.delete_dashboard(name, ctx).await // delegates to the inherent method
+    }
+}
diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs
index c4191145f8..70ff50fadc 100644
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -27,7 +27,6 @@ use api::v1::{
 use async_stream::try_stream;
 use async_trait::async_trait;
 use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
-use common_base::AffectedRows;
 use common_error::ext::BoxedError;
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
@@ -260,62 +259,6 @@ impl GrpcQueryHandler for Instance {
             .context(server_error::ExecuteGrpcQuerySnafu)
     }
 
-    async fn put_record_batch(
-        &self,
-        request: servers::grpc::flight::PutRecordBatchRequest,
-        table_ref: &mut Option<TableRef>,
-        ctx: QueryContextRef,
-    ) -> server_error::Result<AffectedRows> {
-        let result: Result<AffectedRows> = async {
-            let table = if let Some(table) = table_ref {
-                table.clone()
-            } else {
-                let table = self
-                    .catalog_manager()
-                    .table(
-                        &request.table_name.catalog_name,
-                        &request.table_name.schema_name,
-                        &request.table_name.table_name,
-                        None,
-                    )
-                    .await
-                    .context(CatalogSnafu)?
-                    .with_context(|| TableNotFoundSnafu {
-                        table_name: request.table_name.to_string(),
-                    })?;
-                *table_ref = Some(table.clone());
-                table
-            };
-
-            let interceptor_ref = self.plugins.get::<GrpcQueryInterceptorRef<Error>>();
-            let interceptor = interceptor_ref.as_ref();
-            interceptor.pre_bulk_insert(table.clone(), ctx.clone())?;
-
-            self.plugins
-                .get::<PermissionCheckerRef>()
-                .as_ref()
-                .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
-                .context(PermissionSnafu)?;
-
-            // do we check limit for bulk insert?
-
-            self.inserter
-                .handle_bulk_insert(
-                    table,
-                    request.flight_data,
-                    request.record_batch,
-                    request.schema_bytes,
-                )
-                .await
-                .context(TableOperationSnafu)
-        }
-        .await;
-
-        result
-            .map_err(BoxedError::new)
-            .context(server_error::ExecuteGrpcRequestSnafu)
-    }
-
     fn handle_put_record_batch_stream(
         &self,
         stream: servers::grpc::flight::PutRecordBatchRequestStream,
diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs
index 45c3ec3649..4b51efbd33 100644
--- a/src/frontend/src/server.rs
+++ b/src/frontend/src/server.rs
@@ -143,6 +143,8 @@ where
             builder = builder.with_jaeger_handler(self.instance.clone());
         }
 
+        builder = builder.with_dashboard_handler(self.instance.clone());
+
         if let Some(configurator) = self.plugins.get::<RouterConfigurator>() {
             info!("Adding extra router from plugins");
             builder = builder.with_extra_router(configurator.router());
diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs
index 2cfe7d2f7d..eadb7cdc75 100644
--- a/src/meta-srv/src/bootstrap.rs
+++ b/src/meta-srv/src/bootstrap.rs
@@ -24,6 +24,8 @@ use common_base::Plugins;
 use common_config::Configurable;
 #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
 use common_meta::distributed_time_constants::META_LEASE_SECS;
+use common_meta::election::CANDIDATE_LEASE_SECS;
+use common_meta::election::etcd::EtcdElection;
 use common_meta::kv_backend::chroot::ChrootKvBackend;
 use common_meta::kv_backend::etcd::EtcdStore;
 use common_meta::kv_backend::memory::MemoryKvBackend;
@@ -42,9 +44,6 @@ use tonic::codec::CompressionEncoding;
 use tonic::transport::server::{Router, TcpIncoming};
 
 use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef};
-#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
-use crate::election::CANDIDATE_LEASE_SECS;
-use crate::election::etcd::EtcdElection;
 use crate::error::OtherSnafu;
 use crate::metasrv::builder::MetasrvBuilder;
 use crate::metasrv::{
@@ -281,7 +280,8 @@ pub async fn metasrv_builder(
                 etcd_client,
                 opts.store_key_prefix.clone(),
             )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
 
             (kv_backend, Some(election))
         }
@@ -290,10 +290,10 @@ pub async fn metasrv_builder(
             use std::time::Duration;
 
             use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS;
+            use common_meta::election::rds::postgres::{ElectionPgClient, PgElection};
             use common_meta::kv_backend::rds::PgStore;
             use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod};
 
-            use crate::election::rds::postgres::{ElectionPgClient, PgElection};
             use crate::utils::postgres::create_postgres_pool;
 
             let candidate_lease_ttl = Duration::from_secs(CANDIDATE_LEASE_SECS);
@@ -321,7 +321,8 @@ pub async fn metasrv_builder(
                 execution_timeout,
                 idle_session_timeout,
                 statement_timeout,
-            )?;
+            )
+            .context(error::KvBackendSnafu)?;
             let election = PgElection::with_pg_client(
                 opts.grpc.server_addr.clone(),
                 election_client,
@@ -332,7 +333,8 @@ pub async fn metasrv_builder(
                 &opts.meta_table_name,
                 opts.meta_election_lock_id,
             )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
 
             let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
                 .await?;
@@ -352,9 +354,9 @@ pub async fn metasrv_builder(
         (None, BackendImpl::MysqlStore) => {
             use std::time::Duration;
 
+            use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
             use common_meta::kv_backend::rds::MySqlStore;
 
-            use crate::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
             use crate::utils::mysql::create_mysql_pool;
 
             let pool = create_mysql_pool(&opts.store_addrs, opts.backend_tls.as_ref()).await?;
@@ -389,7 +391,8 @@ pub async fn metasrv_builder(
                 meta_lease_ttl,
                 &election_table_name,
             )
-            .await?;
+            .await
+            .context(error::KvBackendSnafu)?;
             (kv_backend, Some(election))
         }
     };
diff --git a/src/meta-srv/src/cluster.rs b/src/meta-srv/src/cluster.rs
index 35b15b3b29..ef3ba07702 100644
--- a/src/meta-srv/src/cluster.rs
+++ b/src/meta-srv/src/cluster.rs
@@ -247,7 +247,7 @@ impl MetaPeerClient {
         // Safety: when self.is_leader() == false, election must not empty.
         let election = self.election.as_ref().unwrap();
 
-        let leader_addr = election.leader().await?.0;
+        let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
 
         let channel = self
             .channel_manager
@@ -279,7 +279,7 @@ impl MetaPeerClient {
         // Safety: when self.is_leader() == false, election must not empty.
         let election = self.election.as_ref().unwrap();
 
-        let leader_addr = election.leader().await?.0;
+        let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
 
         let channel = self
             .channel_manager
diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs
index c67bc32b40..0e87d4421a 100644
--- a/src/meta-srv/src/lib.rs
+++ b/src/meta-srv/src/lib.rs
@@ -21,7 +21,6 @@ pub mod bootstrap;
 pub mod cache_invalidator;
 pub mod cluster;
 pub mod discovery;
-pub mod election;
 pub mod error;
 pub mod events;
 mod failure_detector;
diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs
index 165efd0555..a1515d897e 100644
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -32,6 +32,8 @@ use common_meta::ddl_manager::DdlManagerRef;
 use common_meta::distributed_time_constants::{
     self, BASE_HEARTBEAT_INTERVAL, default_distributed_time_constants, frontend_heartbeat_interval,
 };
+use common_meta::election::LeaderChangeMessage;
+pub use common_meta::election::{ElectionRef, MetasrvNodeInfo};
 use common_meta::key::TableMetadataManagerRef;
 use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
 use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
@@ -64,7 +66,6 @@ use tokio::sync::broadcast::error::RecvError;
 
 use crate::cluster::MetaPeerClientRef;
 use crate::discovery;
-use crate::election::{Election, LeaderChangeMessage};
 use crate::error::{
     self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu,
     StartTelemetryTaskSnafu, StopProcedureManagerSnafu,
@@ -459,76 +460,6 @@ impl Context {
     }
 }
 
-/// The value of the leader. It is used to store the leader's address.
-pub struct LeaderValue(pub String);
-
-impl<T: AsRef<[u8]>> From<T> for LeaderValue {
-    fn from(value: T) -> Self {
-        let string = String::from_utf8_lossy(value.as_ref());
-        Self(string.to_string())
-    }
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct MetasrvNodeInfo {
-    // The metasrv's address
-    pub addr: String,
-    // The node build version
-    pub version: String,
-    // The node build git commit hash
-    pub git_commit: String,
-    // The node start timestamp in milliseconds
-    pub start_time_ms: u64,
-    // The node total cpu millicores
-    #[serde(default)]
-    pub total_cpu_millicores: i64,
-    // The node total memory bytes
-    #[serde(default)]
-    pub total_memory_bytes: i64,
-    /// The node build cpu usage millicores
-    #[serde(default)]
-    pub cpu_usage_millicores: i64,
-    /// The node build memory usage bytes
-    #[serde(default)]
-    pub memory_usage_bytes: i64,
-    // The node hostname
-    #[serde(default)]
-    pub hostname: String,
-}
-
-// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
-#[allow(deprecated)]
-impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
-    fn from(node_info: MetasrvNodeInfo) -> Self {
-        Self {
-            peer: Some(api::v1::meta::Peer {
-                addr: node_info.addr,
-                ..Default::default()
-            }),
-            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
-            // New code should use the fields in `info.NodeInfo` instead.
-            version: node_info.version.clone(),
-            git_commit: node_info.git_commit.clone(),
-            start_time_ms: node_info.start_time_ms,
-            cpus: node_info.total_cpu_millicores as u32,
-            memory_bytes: node_info.total_memory_bytes as u64,
-            // The canonical location for node information.
-            info: Some(api::v1::meta::NodeInfo {
-                version: node_info.version,
-                git_commit: node_info.git_commit,
-                start_time_ms: node_info.start_time_ms,
-                total_cpu_millicores: node_info.total_cpu_millicores,
-                total_memory_bytes: node_info.total_memory_bytes,
-                cpu_usage_millicores: node_info.cpu_usage_millicores,
-                memory_usage_bytes: node_info.memory_usage_bytes,
-                cpus: node_info.total_cpu_millicores as u32,
-                memory_bytes: node_info.total_memory_bytes as u64,
-                hostname: node_info.hostname,
-            }),
-        }
-    }
-}
-
 #[derive(Clone, Copy)]
 pub enum SelectTarget {
     Datanode,
@@ -552,7 +483,6 @@ pub struct SelectorContext {
 pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
 pub type RegionStatAwareSelectorRef =
     Arc<dyn RegionStatAwareSelector<Context = SelectorContext, Output = Vec<(RegionId, Peer)>>>;
-pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
 
 pub struct MetaStateHandler {
     subscribe_manager: Option<SubscriptionManagerRef>,
diff --git a/src/meta-srv/src/service/admin/leader.rs b/src/meta-srv/src/service/admin/leader.rs
index 1fadb4a3ef..17329e7b47 100644
--- a/src/meta-srv/src/service/admin/leader.rs
+++ b/src/meta-srv/src/service/admin/leader.rs
@@ -32,7 +32,7 @@ pub struct LeaderHandler {
 impl LeaderHandler {
     async fn get_leader(&self) -> Result<Option<String>> {
         if let Some(election) = &self.election {
-            let leader_addr = election.leader().await?.0;
+            let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
             return Ok(Some(leader_addr));
         }
         Ok(None)
diff --git a/src/meta-srv/src/service/cluster.rs b/src/meta-srv/src/service/cluster.rs
index 5c0ae4c71f..366a8aa5fb 100644
--- a/src/meta-srv/src/service/cluster.rs
+++ b/src/meta-srv/src/service/cluster.rs
@@ -63,7 +63,10 @@ impl cluster_server::Cluster for Metasrv {
         let leader_addr = &self.options().grpc.server_addr;
         let (leader, followers) = match self.election() {
             Some(election) => {
-                let nodes = election.all_candidates().await?;
+                let nodes = election
+                    .all_candidates()
+                    .await
+                    .context(error::KvBackendSnafu)?;
                 let followers = nodes
                     .into_iter()
                     .filter(|node_info| &node_info.addr != leader_addr)
diff --git a/src/meta-srv/src/service/heartbeat.rs b/src/meta-srv/src/service/heartbeat.rs
index e09073546a..238ed99df2 100644
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -23,7 +23,7 @@ use api::v1::meta::{
 use common_telemetry::{debug, error, info, warn};
 use futures::StreamExt;
 use once_cell::sync::OnceCell;
-use snafu::OptionExt;
+use snafu::{OptionExt, ResultExt};
 use tokio::sync::mpsc;
 use tokio::sync::mpsc::Sender;
 use tokio_stream::wrappers::ReceiverStream;
@@ -148,7 +148,7 @@ async fn handle_ask_leader(_req: AskLeaderRequest, ctx: Context) -> Result<AskLe
             if election.is_leader() {
                 ctx.server_addr
             } else {
-                election.leader().await?.0
+                election.leader().await.context(error::KvBackendSnafu)?.0
             }
         }
         None => ctx.server_addr,
diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml
index 567210b952..5b561997ab 100644
--- a/src/metric-engine/Cargo.toml
+++ b/src/metric-engine/Cargo.toml
@@ -17,6 +17,7 @@ bytes.workspace = true
 fxhash = "0.2"
 common-base.workspace = true
 common-error.workspace = true
+common-grpc.workspace = true
 common-macro.workspace = true
 common-query.workspace = true
 common-recordbatch.workspace = true
diff --git a/src/metric-engine/src/batch_modifier.rs b/src/metric-engine/src/batch_modifier.rs
new file mode 100644
index 0000000000..8a5774889b
--- /dev/null
+++ b/src/metric-engine/src/batch_modifier.rs
@@ -0,0 +1,426 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::hash::Hasher;
+use std::sync::Arc;
+
+use datatypes::arrow::array::{Array, BinaryBuilder, StringArray, UInt64Array};
+use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+use datatypes::arrow::record_batch::RecordBatch;
+use datatypes::value::ValueRef;
+use fxhash::FxHasher;
+use mito_codec::row_converter::SparsePrimaryKeyCodec;
+use snafu::ResultExt;
+use store_api::storage::ColumnId;
+use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId};
+
+use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu};
+
+/// A tag column of an input `RecordBatch`, as needed for TSID computation and sparse primary key encoding.
+#[allow(dead_code)]
+pub(crate) struct TagColumnInfo {
+    /// Column name; its bytes feed the label-name hash of the TSID.
+    pub name: String,
+    /// Position of the column inside the `RecordBatch`.
+    pub index: usize,
+    /// Column ID in the physical region; paired with the tag value in the sparse primary key.
+    pub column_id: ColumnId,
+}
+
+/// Computes one `__tsid` per row of `batch` by hashing tag names and tag values with `FxHasher`; `tag_arrays[i]` must belong to `sorted_tag_columns[i]`, and null tags are left out of the hash.
+#[allow(dead_code)]
+pub(crate) fn compute_tsid_array(
+    batch: &RecordBatch,
+    sorted_tag_columns: &[TagColumnInfo],
+    tag_arrays: &[&StringArray],
+) -> UInt64Array {
+    let num_rows = batch.num_rows();
+    // Hash of all tag names in slice order; shared by every row without a null tag.
+    let label_name_hash = {
+        let mut hasher = FxHasher::default();
+        for tag_col in sorted_tag_columns {
+            hasher.write(tag_col.name.as_bytes());
+            hasher.write_u8(0xff); // 0xff byte is written after each name
+        }
+        hasher.finish()
+    };
+
+    let mut tsid_values = Vec::with_capacity(num_rows);
+    for row in 0..num_rows {
+        let has_null = tag_arrays.iter().any(|arr| arr.is_null(row));
+        // Rows with a null tag hash only their non-null names and values, as if those columns were absent.
+        let tsid = if !has_null {
+            let mut hasher = FxHasher::default();
+            hasher.write_u64(label_name_hash);
+            for arr in tag_arrays {
+                hasher.write(arr.value(row).as_bytes());
+                hasher.write_u8(0xff);
+            }
+            hasher.finish()
+        } else {
+            let mut name_hasher = FxHasher::default();
+            for (tc, arr) in sorted_tag_columns.iter().zip(tag_arrays.iter()) {
+                if !arr.is_null(row) {
+                    name_hasher.write(tc.name.as_bytes());
+                    name_hasher.write_u8(0xff);
+                }
+            }
+            let row_label_hash = name_hasher.finish();
+            // Seed the value hash with the per-row name hash, mirroring the no-null branch.
+            let mut val_hasher = FxHasher::default();
+            val_hasher.write_u64(row_label_hash);
+            for arr in tag_arrays {
+                if !arr.is_null(row) {
+                    val_hasher.write(arr.value(row).as_bytes());
+                    val_hasher.write_u8(0xff);
+                }
+            }
+            val_hasher.finish()
+        };
+
+        tsid_values.push(tsid);
+    }
+
+    UInt64Array::from(tsid_values)
+}
+/// Downcasts each tag column of `batch` to a `StringArray`, in `sorted_tag_columns` order; panics if a column is not a `StringArray`.
+fn build_tag_arrays<'a>(
+    batch: &'a RecordBatch,
+    sorted_tag_columns: &[TagColumnInfo],
+) -> Vec<&'a StringArray> {
+    sorted_tag_columns
+        .iter()
+        .map(|tc| {
+            batch
+                .column(tc.index)
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .expect("tag column must be utf8") // NOTE(review): panics instead of returning an error; confirm callers validate tag column types
+        })
+        .collect()
+}
+
+/// Replaces the tag columns of `batch` with one non-nullable binary primary-key column (table id, `__tsid`, then non-null tags), followed by the columns at `non_tag_column_indices`.
+#[allow(dead_code)]
+pub(crate) fn modify_batch_sparse(
+    batch: RecordBatch,
+    table_id: u32,
+    sorted_tag_columns: &[TagColumnInfo],
+    non_tag_column_indices: &[usize],
+) -> Result<RecordBatch> {
+    let num_rows = batch.num_rows();
+    let codec = SparsePrimaryKeyCodec::schemaless();
+    let tag_arrays: Vec<&StringArray> = build_tag_arrays(&batch, sorted_tag_columns);
+    let tsid_array = compute_tsid_array(&batch, sorted_tag_columns, &tag_arrays);
+    // One encoded key per row; `buffer` is reused across rows and cleared at the top of each iteration.
+    let mut pk_builder = BinaryBuilder::with_capacity(num_rows, 0);
+    let mut buffer = Vec::new();
+    for row in 0..num_rows {
+        buffer.clear();
+        let internal = [
+            (ReservedColumnId::table_id(), ValueRef::UInt32(table_id)),
+            (
+                ReservedColumnId::tsid(),
+                ValueRef::UInt64(tsid_array.value(row)),
+            ),
+        ];
+        codec
+            .encode_to_vec(internal.into_iter(), &mut buffer)
+            .context(EncodePrimaryKeySnafu)?;
+        // Tags go into the same buffer after the reserved columns; null tags are filtered out.
+        let tags = sorted_tag_columns
+            .iter()
+            .zip(tag_arrays.iter())
+            .filter(|(_, arr)| !arr.is_null(row))
+            .map(|(tc, arr)| (tc.column_id, ValueRef::String(arr.value(row))));
+        codec
+            .encode_to_vec(tags, &mut buffer)
+            .context(EncodePrimaryKeySnafu)?;
+
+        pk_builder.append_value(&buffer);
+    }
+
+    let pk_array = pk_builder.finish();
+    // Output layout: primary key first, then the non-tag columns in the order given.
+    let mut fields = vec![Arc::new(Field::new(
+        PRIMARY_KEY_COLUMN_NAME,
+        DataType::Binary,
+        false,
+    ))];
+    let mut columns: Vec<Arc<dyn Array>> = vec![Arc::new(pk_array)];
+
+    for &idx in non_tag_column_indices {
+        fields.push(batch.schema().fields()[idx].clone());
+        columns.push(batch.column(idx).clone());
+    }
+    // A failure to assemble the batch is reported as an `UnexpectedRequest` error.
+    let new_schema = Arc::new(ArrowSchema::new(fields));
+    RecordBatch::try_new(new_schema, columns).map_err(|e| {
+        UnexpectedRequestSnafu {
+            reason: format!("Failed to build modified sparse RecordBatch: {e}"),
+        }
+        .build()
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::sync::Arc;
+
+    use api::v1::value::ValueData;
+    use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
+    use datatypes::arrow::array::{BinaryArray, Int64Array, StringArray};
+    use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME;
+
+    use super::*;
+    use crate::row_modifier::{RowModifier, RowsIter, TableIdInput};
+    /// Single-row batch with `greptime_timestamp`, `greptime_value` and the `namespace`/`host` tags.
+    fn build_sparse_test_batch() -> RecordBatch {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("greptime_timestamp", DataType::Int64, false),
+            Field::new("greptime_value", DataType::Float64, true),
+            Field::new("namespace", DataType::Utf8, true),
+            Field::new("host", DataType::Utf8, true),
+        ]));
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int64Array::from(vec![1000])),
+                Arc::new(datatypes::arrow::array::Float64Array::from(vec![42.0])),
+                Arc::new(StringArray::from(vec!["greptimedb"])),
+                Arc::new(StringArray::from(vec!["127.0.0.1"])),
+            ],
+        )
+        .unwrap()
+    }
+    /// Tag descriptors for `build_sparse_test_batch`, ordered by name (`host` before `namespace`).
+    fn sparse_tag_columns() -> Vec<TagColumnInfo> {
+        vec![
+            TagColumnInfo {
+                name: "host".to_string(),
+                index: 3,
+                column_id: 3,
+            },
+            TagColumnInfo {
+                name: "namespace".to_string(),
+                index: 2,
+                column_id: 2,
+            },
+        ]
+    }
+    /// Pins the TSID of a row without null tags to a fixed expected value.
+    #[test]
+    fn test_compute_tsid_basic() {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("namespace", DataType::Utf8, true),
+            Field::new("host", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(StringArray::from(vec!["greptimedb"])),
+                Arc::new(StringArray::from(vec!["127.0.0.1"])),
+            ],
+        )
+        .unwrap();
+
+        let tag_columns: Vec<TagColumnInfo> = vec![
+            TagColumnInfo {
+                name: "host".to_string(),
+                index: 1,
+                column_id: 2,
+            },
+            TagColumnInfo {
+                name: "namespace".to_string(),
+                index: 0,
+                column_id: 1,
+            },
+        ];
+        let tag_arrays = build_tag_arrays(&batch, &tag_columns);
+        let tsid_array = compute_tsid_array(&batch, &tag_columns, &tag_arrays);
+        // Golden value: any change to the hashing scheme makes this assertion fail.
+        assert_eq!(tsid_array.value(0), 2721566936019240841);
+    }
+    /// A null tag must not change the TSID: (a, b, c = NULL) hashes like (a, b).
+    #[test]
+    fn test_compute_tsid_with_nulls() {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("a", DataType::Utf8, true),
+            Field::new("b", DataType::Utf8, true),
+        ]));
+        let batch_no_null = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(StringArray::from(vec!["A"])),
+                Arc::new(StringArray::from(vec!["B"])),
+            ],
+        )
+        .unwrap();
+        let tag_cols_2: Vec<TagColumnInfo> = vec![
+            TagColumnInfo {
+                name: "a".to_string(),
+                index: 0,
+                column_id: 1,
+            },
+            TagColumnInfo {
+                name: "b".to_string(),
+                index: 1,
+                column_id: 2,
+            },
+        ];
+        let tag_arrays_2 = build_tag_arrays(&batch_no_null, &tag_cols_2);
+        let tsid_no_null = compute_tsid_array(&batch_no_null, &tag_cols_2, &tag_arrays_2);
+        // Same tags plus a third column `c` that is NULL for the only row.
+        let schema3 = Arc::new(ArrowSchema::new(vec![
+            Field::new("a", DataType::Utf8, true),
+            Field::new("b", DataType::Utf8, true),
+            Field::new("c", DataType::Utf8, true),
+        ]));
+        let batch_with_null = RecordBatch::try_new(
+            schema3,
+            vec![
+                Arc::new(StringArray::from(vec!["A"])),
+                Arc::new(StringArray::from(vec!["B"])),
+                Arc::new(StringArray::from(vec![None as Option<&str>])),
+            ],
+        )
+        .unwrap();
+        let tag_cols_3: Vec<TagColumnInfo> = vec![
+            TagColumnInfo {
+                name: "a".to_string(),
+                index: 0,
+                column_id: 1,
+            },
+            TagColumnInfo {
+                name: "b".to_string(),
+                index: 1,
+                column_id: 2,
+            },
+            TagColumnInfo {
+                name: "c".to_string(),
+                index: 2,
+                column_id: 3,
+            },
+        ];
+        let tag_arrays_3 = build_tag_arrays(&batch_with_null, &tag_cols_3);
+        let tsid_with_null = compute_tsid_array(&batch_with_null, &tag_cols_3, &tag_arrays_3);
+
+        assert_eq!(tsid_no_null.value(0), tsid_with_null.value(0));
+    }
+    /// Checks the output column layout: primary key first, then the two non-tag columns.
+    #[test]
+    fn test_modify_batch_sparse() {
+        let batch = build_sparse_test_batch();
+        let tag_columns = sparse_tag_columns();
+        let non_tag_indices = vec![0, 1];
+        let table_id: u32 = 1025;
+
+        let modified =
+            modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap();
+
+        assert_eq!(modified.num_columns(), 3);
+        assert_eq!(modified.schema().field(0).name(), PRIMARY_KEY_COLUMN_NAME);
+        assert_eq!(modified.schema().field(1).name(), "greptime_timestamp");
+        assert_eq!(modified.schema().field(2).name(), "greptime_value");
+    }
+    /// The batch path must produce the same sparse primary key bytes as `RowModifier` for equivalent input.
+    #[test]
+    fn test_modify_batch_sparse_matches_row_modifier() {
+        let batch = build_sparse_test_batch();
+        let tag_columns = sparse_tag_columns();
+        let non_tag_indices = vec![0, 1];
+        let table_id: u32 = 1025;
+        let modified =
+            modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap();
+        // Column IDs for the tags match the `column_id`s used in `sparse_tag_columns`.
+        let name_to_column_id: HashMap<String, ColumnId> = [
+            ("greptime_timestamp".to_string(), 0),
+            ("greptime_value".to_string(), 1),
+            ("namespace".to_string(), 2),
+            ("host".to_string(), 3),
+        ]
+        .into_iter()
+        .collect();
+        // Same row values as `build_sparse_test_batch`, expressed as `api::v1::Rows`.
+        let rows = Rows {
+            schema: vec![
+                ColumnSchema {
+                    column_name: "greptime_timestamp".to_string(),
+                    datatype: ColumnDataType::TimestampMillisecond as i32,
+                    semantic_type: SemanticType::Timestamp as i32,
+                    ..Default::default()
+                },
+                ColumnSchema {
+                    column_name: "greptime_value".to_string(),
+                    datatype: ColumnDataType::Float64 as i32,
+                    semantic_type: SemanticType::Field as i32,
+                    ..Default::default()
+                },
+                ColumnSchema {
+                    column_name: "namespace".to_string(),
+                    datatype: ColumnDataType::String as i32,
+                    semantic_type: SemanticType::Tag as i32,
+                    ..Default::default()
+                },
+                ColumnSchema {
+                    column_name: "host".to_string(),
+                    datatype: ColumnDataType::String as i32,
+                    semantic_type: SemanticType::Tag as i32,
+                    ..Default::default()
+                },
+            ],
+            rows: vec![Row {
+                values: vec![
+                    Value {
+                        value_data: Some(ValueData::TimestampMillisecondValue(1000)),
+                    },
+                    Value {
+                        value_data: Some(ValueData::F64Value(42.0)),
+                    },
+                    Value {
+                        value_data: Some(ValueData::StringValue("greptimedb".to_string())),
+                    },
+                    Value {
+                        value_data: Some(ValueData::StringValue("127.0.0.1".to_string())),
+                    },
+                ],
+            }],
+        };
+        // Reference primary key produced by `RowModifier` with sparse encoding.
+        let row_iter = RowsIter::new(rows, &name_to_column_id);
+        let rows = RowModifier::default()
+            .modify_rows(
+                row_iter,
+                TableIdInput::Single(table_id),
+                PrimaryKeyEncoding::Sparse,
+            )
+            .unwrap();
+        let ValueData::BinaryValue(expected_pk) =
+            rows.rows[0].values[0].value_data.clone().unwrap()
+        else {
+            panic!("expected binary primary key");
+        };
+
+        let actual_array = modified
+            .column(0)
+            .as_any()
+            .downcast_ref::<BinaryArray>()
+            .unwrap();
+        assert_eq!(actual_array.value(0), expected_pk.as_slice());
+    }
+}
diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs
index 7a1efedac4..ba90ca960d 100644
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 mod alter;
+mod bulk_insert;
 mod catchup;
 mod close;
 mod create;
@@ -288,9 +289,8 @@ impl RegionEngine for MetricEngine {
                 debug_assert_eq!(region_id, resp_region_id);
                 return response;
             }
-            RegionRequest::BulkInserts(_) => {
-                // todo(hl): find a way to support bulk inserts in metric engine.
-                UnsupportedRegionRequestSnafu { request }.fail()
+            RegionRequest::BulkInserts(bulk) => {
+                self.inner.bulk_insert_region(region_id, bulk).await
             }
         };
 
diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs
new file mode 100644
index 0000000000..2a3c26c80c
--- /dev/null
+++ b/src/metric-engine/src/engine/bulk_insert.rs
@@ -0,0 +1,783 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+
+use api::v1::{ArrowIpc, ColumnDataType, SemanticType};
+use bytes::Bytes;
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_grpc::flight::{FlightEncoder, FlightMessage};
+use common_query::prelude::{greptime_timestamp, greptime_value};
+use datatypes::arrow::array::{Array, Float64Array, StringArray, TimestampMillisecondArray};
+use datatypes::arrow::record_batch::RecordBatch;
+use snafu::{OptionExt, ensure};
+use store_api::codec::PrimaryKeyEncoding;
+use store_api::metadata::RegionMetadataRef;
+use store_api::region_request::{
+    AffectedRows, RegionBulkInsertsRequest, RegionPutRequest, RegionRequest,
+};
+use store_api::storage::RegionId;
+
+use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse};
+use crate::engine::MetricEngineInner;
+use crate::error;
+use crate::error::Result;
+
+impl MetricEngineInner {
+    /// Bulk-inserts logical rows into a metric region.
+    ///
+    /// `request.payload` is a logical `RecordBatch` (timestamp, value and tag columns) for the
+    /// logical `region_id`. Physical regions, and regions whose primary key encoding is not
+    /// sparse, are rejected with an `UnsupportedRegionRequest` error.
+    ///
+    /// The batch is rewritten by `modify_batch_sparse`, encoded to Arrow IPC and forwarded as a
+    /// `BulkInserts` request to the data region, along with the original `partition_expr_version`.
+    /// If the data region reports `StatusCode::Unsupported`, the original logical batch is
+    /// converted into `api::v1::Rows` and written through `put_region` instead.
+    ///
+    /// Returns the number of affected rows, or `0` if the input batch is empty.
+    pub async fn bulk_insert_region(
+        &self,
+        region_id: RegionId,
+        request: RegionBulkInsertsRequest,
+    ) -> Result<AffectedRows> {
+        ensure!(
+            !self.is_physical_region(region_id),
+            error::UnsupportedRegionRequestSnafu {
+                request: RegionRequest::BulkInserts(request),
+            }
+        );
+
+        let (physical_region_id, data_region_id, primary_key_encoding) =
+            self.find_data_region_meta(region_id)?;
+        // Only the sparse primary key encoding is handled by this path.
+        if primary_key_encoding != PrimaryKeyEncoding::Sparse {
+            return error::UnsupportedRegionRequestSnafu {
+                request: RegionRequest::BulkInserts(request),
+            }
+            .fail();
+        }
+
+        let batch = request.payload;
+        if batch.num_rows() == 0 {
+            return Ok(0);
+        }
+        // Split the batch columns into tags and non-tags, then fold the tags into the primary key.
+        let logical_metadata = self
+            .logical_region_metadata(physical_region_id, region_id)
+            .await?;
+        let (tag_columns, non_tag_indices) = self.resolve_tag_columns_from_metadata(
+            region_id,
+            data_region_id,
+            &batch,
+            &logical_metadata,
+        )?;
+        let modified_batch = modify_batch_sparse(
+            batch.clone(), // the original `batch` is still needed for the `Put` fallback below
+            region_id.table_id(),
+            &tag_columns,
+            &non_tag_indices,
+        )?;
+        let (schema, data_header, payload) = record_batch_to_ipc(&modified_batch)?;
+        // Read the version before `request` is shadowed by the data-region request.
+        let partition_expr_version = request.partition_expr_version;
+        let request = RegionBulkInsertsRequest {
+            region_id: data_region_id,
+            payload: modified_batch,
+            raw_data: ArrowIpc {
+                schema,
+                data_header,
+                payload,
+            },
+            partition_expr_version,
+        };
+        match self
+            .data_region
+            .write_data(data_region_id, RegionRequest::BulkInserts(request))
+            .await
+        {
+            Ok(affected_rows) => Ok(affected_rows),
+            Err(err) if err.status_code() == StatusCode::Unsupported => {
+                // todo(hl): fallback path for PartitionTreeMemtable, remove this once we remove it
+                let rows = record_batch_to_rows(&batch, region_id)?;
+                self.put_region(
+                    region_id,
+                    RegionPutRequest {
+                        rows,
+                        hint: None,
+                        partition_expr_version,
+                    },
+                )
+                .await
+            }
+            Err(err) => Err(err),
+        }
+    }
+    /// Splits the columns of `batch` into tag columns (sorted by name, with their physical column IDs) and the indices of all other columns; fails if the physical region state or any batch column is unknown.
+    fn resolve_tag_columns_from_metadata(
+        &self,
+        logical_region_id: RegionId,
+        data_region_id: RegionId,
+        batch: &RecordBatch,
+        logical_metadata: &RegionMetadataRef,
+    ) -> Result<(Vec<TagColumnInfo>, Vec<usize>)> {
+        let tag_names: HashSet<&str> = logical_metadata
+            .column_metadatas
+            .iter()
+            .filter_map(|column| {
+                if column.semantic_type == SemanticType::Tag {
+                    Some(column.column_schema.name.as_str())
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        let mut tag_columns = Vec::new();
+        let mut non_tag_indices = Vec::new();
+        { // inner scope: the `state` read guard is dropped at its closing brace
+            let state = self.state.read().unwrap();
+            let physical_columns = state
+                .physical_region_states()
+                .get(&data_region_id)
+                .context(error::PhysicalRegionNotFoundSnafu {
+                    region_id: data_region_id,
+                })?
+                .physical_columns();
+            // Every batch column must exist in the physical region, whether it is a tag or not.
+            for (index, field) in batch.schema().fields().iter().enumerate() {
+                let name = field.name();
+                let column_id =
+                    *physical_columns
+                        .get(name)
+                        .with_context(|| error::ColumnNotFoundSnafu {
+                            name: name.clone(),
+                            region_id: logical_region_id,
+                        })?;
+                if tag_names.contains(name.as_str()) {
+                    tag_columns.push(TagColumnInfo {
+                        name: name.clone(),
+                        index,
+                        column_id,
+                    });
+                } else {
+                    non_tag_indices.push(index);
+                }
+            }
+        }
+        // Sort by name so the tag order does not depend on the batch's column order.
+        tag_columns.sort_by(|a, b| a.name.cmp(&b.name));
+        Ok((tag_columns, non_tag_indices))
+    }
+}
+
+/// Converts a logical `RecordBatch` into `api::v1::Rows`.
+///
+/// Column roles are derived from the batch schema alone:
+/// - the column named `greptime_timestamp()` is the timestamp and must have a
+///   millisecond timestamp type;
+/// - the column named `greptime_value()` is the field and must be `Float64`;
+/// - every other column is treated as a tag and must be `Utf8`.
+///
+/// The output schema (and every row) is laid out as timestamp, value, then the tags
+/// in batch order. Arrow nulls become values whose `value_data` is `None`.
+///
+/// # Errors
+///
+/// Returns `UnexpectedRequest` if a column has an incompatible type, or if the
+/// timestamp or value column is missing. `logical_region_id` is only used to build
+/// these error messages.
+fn record_batch_to_rows(batch: &RecordBatch, logical_region_id: RegionId) -> Result<api::v1::Rows> {
+    use api::v1::value::ValueData;
+    use datatypes::arrow::datatypes::{DataType as ArrowType, TimeUnit as ArrowTimeUnit};
+
+    let schema_ref = batch.schema();
+    let fields = schema_ref.fields();
+
+    // Pass 1: classify every column by name and validate its Arrow type. The first
+    // offending column (in schema order) aborts the conversion.
+    let mut ts_idx = None;
+    let mut val_idx = None;
+    let mut tag_indices = Vec::new();
+    for (idx, field) in fields.iter().enumerate() {
+        let (name, data_type) = (field.name(), field.data_type());
+        let mismatch = if name == greptime_timestamp() {
+            ts_idx = Some(idx);
+            let is_millis = matches!(
+                data_type,
+                ArrowType::Timestamp(ArrowTimeUnit::Millisecond, _)
+            );
+            (!is_millis).then(|| {
+                format!(
+                    "Timestamp column '{}' in region {:?} has incompatible type: {:?}",
+                    name,
+                    logical_region_id,
+                    data_type
+                )
+            })
+        } else if name == greptime_value() {
+            val_idx = Some(idx);
+            (!matches!(data_type, ArrowType::Float64)).then(|| {
+                format!(
+                    "Value column '{}' in region {:?} has incompatible type: {:?}",
+                    name,
+                    logical_region_id,
+                    data_type
+                )
+            })
+        } else {
+            // Anything that is neither the timestamp nor the value is a tag.
+            tag_indices.push(idx);
+            (!matches!(data_type, ArrowType::Utf8)).then(|| {
+                format!(
+                    "Tag column '{}' in region {:?} must be Utf8, found: {:?}",
+                    name,
+                    logical_region_id,
+                    data_type
+                )
+            })
+        };
+        if let Some(reason) = mismatch {
+            return error::UnexpectedRequestSnafu { reason }.fail();
+        }
+    }
+
+    // Both special columns are mandatory.
+    let Some(ts_idx) = ts_idx else {
+        return error::UnexpectedRequestSnafu {
+            reason: format!(
+                "Timestamp column '{}' not found in RecordBatch for region {:?}",
+                greptime_timestamp(),
+                logical_region_id
+            ),
+        }
+        .fail();
+    };
+    let Some(val_idx) = val_idx else {
+        return error::UnexpectedRequestSnafu {
+            reason: format!(
+                "Value column '{}' not found in RecordBatch for region {:?}",
+                greptime_value(),
+                logical_region_id
+            ),
+        }
+        .fail();
+    };
+
+    // Output schema: timestamp, value, then one string tag per remaining column.
+    let column = |column_name: String, datatype: ColumnDataType, semantic: SemanticType| {
+        api::v1::ColumnSchema {
+            column_name,
+            datatype: datatype as i32,
+            semantic_type: semantic as i32,
+            datatype_extension: None,
+            options: None,
+        }
+    };
+    let mut schema = Vec::with_capacity(2 + tag_indices.len());
+    schema.push(column(
+        greptime_timestamp().to_string(),
+        ColumnDataType::TimestampMillisecond,
+        SemanticType::Timestamp,
+    ));
+    schema.push(column(
+        greptime_value().to_string(),
+        ColumnDataType::Float64,
+        SemanticType::Field,
+    ));
+    schema.extend(tag_indices.iter().map(|&idx| {
+        column(
+            fields[idx].name().clone(),
+            ColumnDataType::String,
+            SemanticType::Tag,
+        )
+    }));
+
+    // These downcasts rely on the type validation done in pass 1.
+    let ts_array = batch
+        .column(ts_idx)
+        .as_any()
+        .downcast_ref::<TimestampMillisecondArray>()
+        .expect("validated as TimestampMillisecond");
+    let val_array = batch
+        .column(val_idx)
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .expect("validated as Float64");
+    let tag_arrays: Vec<&StringArray> = tag_indices
+        .iter()
+        .map(|&idx| {
+            let any = batch.column(idx).as_any();
+            any.downcast_ref::<StringArray>().expect("validated as Utf8")
+        })
+        .collect();
+
+    // Pass 2: materialize the rows, mapping Arrow nulls to `value_data: None`.
+    let to_value = |value_data: Option<ValueData>| api::v1::Value { value_data };
+    let width = 2 + tag_arrays.len();
+    let rows: Vec<api::v1::Row> = (0..batch.num_rows())
+        .map(|row_idx| {
+            let mut values = Vec::with_capacity(width);
+            // Fixed columns first, in schema order.
+            values.push(to_value((!ts_array.is_null(row_idx)).then(|| {
+                ValueData::TimestampMillisecondValue(ts_array.value(row_idx))
+            })));
+            values.push(to_value((!val_array.is_null(row_idx)).then(|| {
+                ValueData::F64Value(val_array.value(row_idx))
+            })));
+            // Then one value per tag column.
+            values.extend(tag_arrays.iter().map(|arr| {
+                to_value(
+                    (!arr.is_null(row_idx))
+                        .then(|| ValueData::StringValue(arr.value(row_idx).to_string())),
+                )
+            }));
+            api::v1::Row { values }
+        })
+        .collect();
+
+    Ok(api::v1::Rows { schema, rows })
+}
+
+/// Encodes `record_batch` with `FlightEncoder` and returns the schema header, the batch
+/// header and the batch body. Fails unless the encoder yields exactly one message.
+fn record_batch_to_ipc(record_batch: &RecordBatch) -> Result<(Bytes, Bytes, Bytes)> {
+    let mut encoder = FlightEncoder::default();
+    let schema = encoder.encode_schema(record_batch.schema().as_ref());
+    let message = FlightMessage::RecordBatch(record_batch.clone());
+    let mut encoded = encoder.encode(message).into_iter();
+
+    let Some(flight_data) = encoded.next() else {
+        return error::UnexpectedRequestSnafu {
+            reason: "Failed to encode empty flight data",
+        }
+        .fail();
+    };
+    if encoded.next().is_some() {
+        // A second message is reported as an unsupported dictionary-array batch.
+        return error::UnexpectedRequestSnafu {
+            reason: "Bulk insert RecordBatch with dictionary arrays is unsupported".to_string(),
+        }
+        .fail();
+    }
+
+    let header = schema.data_header;
+    let (data_header, payload) = (flight_data.data_header, flight_data.data_body);
+    Ok((header, data_header, payload))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::assert_matches::assert_matches;
+    use std::sync::Arc;
+
+    use api::v1::ArrowIpc;
+    use common_error::ext::ErrorExt;
+    use common_query::prelude::{greptime_timestamp, greptime_value};
+    use common_recordbatch::RecordBatches;
+    use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray};
+    use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use store_api::metric_engine_consts::MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING;
+    use store_api::path_utils::table_dir;
+    use store_api::region_engine::RegionEngine;
+    use store_api::region_request::{RegionBulkInsertsRequest, RegionPutRequest, RegionRequest};
+    use store_api::storage::{RegionId, ScanRequest};
+
+    use super::record_batch_to_ipc;
+    use crate::error::Error;
+    use crate::test_util::{self, TestEnv};
+
+    /// Builds a batch of `rows` rows for the test logical table.
+    ///
+    /// Timestamp and value both run over `start..start + rows`; `job` is always `"tag_0"`.
+    fn build_logical_batch(start: usize, rows: usize) -> RecordBatch {
+        let ts_type = DataType::Timestamp(TimeUnit::Millisecond, None);
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(greptime_timestamp(), ts_type, false),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("job", DataType::Utf8, true),
+        ]));
+
+        // One timestamp/value pair per index in the requested range.
+        let range = start..start + rows;
+        let ts: Vec<i64> = range.clone().map(|i| i as i64).collect();
+        let values: Vec<f64> = range.map(|i| i as f64).collect();
+        let tags = vec!["tag_0".to_string(); rows];
+
+        let ts_column = TimestampMillisecondArray::from(ts);
+        let value_column = Float64Array::from(values);
+        let job_column = StringArray::from(tags);
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(ts_column),
+                Arc::new(value_column),
+                Arc::new(job_column),
+            ],
+        )
+        .unwrap()
+    }
+
+    /// Builds a `BulkInserts` request for `logical_region_id` that carries `batch`
+    /// together with its Arrow IPC encoding.
+    fn build_bulk_request(logical_region_id: RegionId, batch: RecordBatch) -> RegionRequest {
+        let (schema, data_header, payload) = record_batch_to_ipc(&batch).unwrap();
+        let raw_data = ArrowIpc { schema, data_header, payload };
+        let request = RegionBulkInsertsRequest {
+            region_id: logical_region_id,
+            payload: batch,
+            raw_data,
+            partition_expr_version: None,
+        };
+        RegionRequest::BulkInserts(request)
+    }
+
+    /// Creates the default physical region with the primary key encoding option set to
+    /// `"dense"`, then a logical region with a `job` tag on it. Returns the logical id.
+    async fn init_dense_metric_region(env: &TestEnv) -> RegionId {
+        let physical_region_id = env.default_physical_region_id();
+        let dense_encoding = (
+            MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING.to_string(),
+            "dense".to_string(),
+        );
+        let physical_dir = TestEnv::default_table_dir();
+        env.create_physical_region(physical_region_id, &physical_dir, vec![dense_encoding]).await;
+
+        let logical_region_id = env.default_logical_region_id();
+        let logical_dir = table_dir("test", logical_region_id.table_id());
+        let create = RegionRequest::Create(test_util::create_logical_region_request(
+            &["job"],
+            physical_region_id,
+            &logical_dir,
+        ));
+        env.metric()
+            .handle_request(logical_region_id, create)
+            .await
+            .unwrap();
+        logical_region_id
+    }
+
+    /// A zero-row batch is accepted and reports zero affected rows.
+    #[tokio::test]
+    async fn test_bulk_insert_empty_batch_returns_zero() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+        // The request is built by hand with a default `ArrowIpc` as raw data.
+        let empty_request = RegionBulkInsertsRequest {
+            region_id: logical_region_id,
+            payload: build_logical_batch(0, 0),
+            raw_data: ArrowIpc::default(),
+            partition_expr_version: None,
+        };
+        let response = env
+            .metric()
+            .handle_request(logical_region_id, RegionRequest::BulkInserts(empty_request))
+            .await
+            .unwrap();
+        assert_eq!(response.affected_rows, 0);
+    }
+
+    /// Bulk inserts addressed to the physical region id fail with `UnsupportedRegionRequest`.
+    #[tokio::test]
+    async fn test_bulk_insert_physical_region_rejected() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+
+        let physical_region_id = env.default_physical_region_id();
+        let request = build_bulk_request(physical_region_id, build_logical_batch(0, 2));
+        let failure = env
+            .metric()
+            .handle_request(physical_region_id, request)
+            .await
+            .unwrap_err();
+        let concrete = failure
+            .as_any()
+            .downcast_ref::<Error>()
+            .expect("unexpected error type");
+        assert_matches!(concrete, Error::UnsupportedRegionRequest { .. });
+    }
+
+    /// A batch column that cannot be resolved for the region yields `ColumnNotFound`.
+    #[tokio::test]
+    async fn test_bulk_insert_unknown_column_errors() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+
+        // `nonexistent_column` is expected to be unknown to the region.
+        let ts_type = DataType::Timestamp(TimeUnit::Millisecond, None);
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(greptime_timestamp(), ts_type, false),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("nonexistent_column", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(vec![0i64])),
+                Arc::new(Float64Array::from(vec![1.0])),
+                Arc::new(StringArray::from(vec!["val"])),
+            ],
+        )
+        .unwrap();
+
+        let request = build_bulk_request(logical_region_id, batch);
+        let failure = env
+            .metric()
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap_err();
+        let concrete = failure
+            .as_any()
+            .downcast_ref::<Error>()
+            .expect("unexpected error type");
+        assert_matches!(concrete, Error::ColumnNotFound { .. });
+    }
+
+    /// Three rows with two tags (`host`, `region`) are all inserted and scanned back.
+    #[tokio::test]
+    async fn test_bulk_insert_multiple_tag_columns() {
+        let env = TestEnv::new().await;
+        let physical_region_id = env.default_physical_region_id();
+        env.create_physical_region(physical_region_id, &TestEnv::default_table_dir(), vec![])
+            .await;
+        let logical_region_id = env.default_logical_region_id();
+        let logical_dir = table_dir("test", logical_region_id.table_id());
+        let create = RegionRequest::Create(test_util::create_logical_region_request(
+            &["host", "region"],
+            physical_region_id,
+            &logical_dir,
+        ));
+        env.metric()
+            .handle_request(logical_region_id, create)
+            .await
+            .unwrap();
+
+        let ts_type = DataType::Timestamp(TimeUnit::Millisecond, None);
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(greptime_timestamp(), ts_type, false),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("host", DataType::Utf8, true),
+            Field::new("region", DataType::Utf8, true),
+        ]));
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(vec![0i64, 1, 2])),
+                Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0])),
+                Arc::new(StringArray::from(vec!["h1", "h2", "h1"])),
+                Arc::new(StringArray::from(vec!["us-east", "us-west", "eu-west"])),
+            ],
+        )
+        .unwrap();
+
+        let bulk_request = build_bulk_request(logical_region_id, batch);
+        let inserted = env
+            .metric()
+            .handle_request(logical_region_id, bulk_request)
+            .await
+            .unwrap();
+        assert_eq!(inserted.affected_rows, 3);
+
+        let stream = env
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let batches = RecordBatches::try_collect(stream).await.unwrap();
+        let scanned_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(scanned_rows, 3);
+    }
+
+    /// Two consecutive bulk inserts (3 rows, then 5 rows) each report their own row
+    /// count, and a scan afterwards returns all 8 rows.
+    #[tokio::test]
+    async fn test_bulk_insert_accumulates_rows() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+
+        // (start, rows) of each batch; the timestamp ranges do not overlap.
+        for (start, rows) in [(0, 3), (3, 5)] {
+            let batch = build_logical_batch(start, rows);
+            let request = build_bulk_request(logical_region_id, batch);
+            let response = env
+                .metric()
+                .handle_request(logical_region_id, request)
+                .await
+                .unwrap();
+            assert_eq!(response.affected_rows, rows);
+        }
+
+        // The scan must see the rows of both batches.
+        let stream = env
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let batches = RecordBatches::try_collect(stream).await.unwrap();
+        let scanned_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(scanned_rows, 8);
+    }
+
+    /// Bulk insert into the region set up by `init_metric_region`: 4 rows in, 4 rows scanned.
+    #[tokio::test]
+    async fn test_bulk_insert_sparse_encoding() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let logical_region_id = env.default_logical_region_id();
+        let engine = env.metric();
+        let request = build_bulk_request(logical_region_id, build_logical_batch(0, 4));
+        let inserted = engine
+            .handle_request(logical_region_id, request)
+            .await
+            .unwrap();
+        assert_eq!(inserted.affected_rows, 4);
+
+        let stream = engine
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let batches = RecordBatches::try_collect(stream).await.unwrap();
+        let scanned_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(scanned_rows, 4);
+    }
+
+    /// Bulk insert into a dense-encoded region fails with `UnsupportedRegionRequest`.
+    #[tokio::test]
+    async fn test_bulk_insert_dense_encoding_rejected() {
+        let env = TestEnv::new().await;
+        let logical_region_id = init_dense_metric_region(&env).await;
+
+        let request = build_bulk_request(logical_region_id, build_logical_batch(0, 2));
+        let outcome = env.metric().handle_request(logical_region_id, request).await;
+        let failure = outcome.unwrap_err();
+
+        let concrete = failure
+            .as_any()
+            .downcast_ref::<Error>()
+            .expect("unexpected error type");
+        assert_matches!(concrete, Error::UnsupportedRegionRequest { .. });
+    }
+
+    /// Writes through `Put` and through bulk insert in two separate environments and
+    /// expects both scans to pretty-print identically.
+    #[tokio::test]
+    async fn test_bulk_insert_matches_put() {
+        // Reference output: the row-based `Put` path.
+        let env_put = TestEnv::new().await;
+        env_put.init_metric_region().await;
+        let logical_region_id = env_put.default_logical_region_id();
+        let rows = api::v1::Rows {
+            schema: test_util::row_schema_with_tags(&["job"]),
+            rows: test_util::build_rows(1, 5),
+        };
+        let put = RegionPutRequest { rows, hint: None, partition_expr_version: None };
+        env_put
+            .metric()
+            .handle_request(logical_region_id, RegionRequest::Put(put))
+            .await
+            .unwrap();
+        let put_stream = env_put
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let put_batches = RecordBatches::try_collect(put_stream).await.unwrap();
+        let put_output = put_batches.pretty_print().unwrap();
+
+        // Bulk insert path, in a fresh environment; reuses the id taken from `env_put`.
+        let env_bulk = TestEnv::new().await;
+        env_bulk.init_metric_region().await;
+        let bulk_request = build_bulk_request(logical_region_id, build_logical_batch(0, 5));
+        env_bulk
+            .metric()
+            .handle_request(logical_region_id, bulk_request)
+            .await
+            .unwrap();
+        let bulk_stream = env_bulk
+            .metric()
+            .scan_to_stream(logical_region_id, ScanRequest::default())
+            .await
+            .unwrap();
+        let bulk_batches = RecordBatches::try_collect(bulk_stream).await.unwrap();
+        let bulk_output = bulk_batches.pretty_print().unwrap();
+
+        assert_eq!(put_output, bulk_output);
+    }
+
+    /// Checks the null handling of `record_batch_to_rows`: every Arrow null, in any
+    /// column, must come out as a value without `value_data`, and every non-null
+    /// cell must carry one.
+    #[test]
+    fn test_record_batch_to_rows_with_null_values() {
+        // Not imported at module level, unlike the Arrow types used below.
+        use crate::engine::bulk_insert::record_batch_to_rows;
+
+        // All four columns are nullable, including the timestamp.
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new(
+                greptime_timestamp(),
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                true,
+            ),
+            Field::new(greptime_value(), DataType::Float64, true),
+            Field::new("job", DataType::Utf8, true),
+            Field::new("host", DataType::Utf8, true),
+        ]));
+
+        // Each row below has a different null pattern.
+        let ts_array = TimestampMillisecondArray::from(vec![Some(1000), None, Some(3000)]);
+        let val_array = Float64Array::from(vec![Some(1.0), Some(2.0), None]);
+        let job_array = StringArray::from(vec![Some("job1"), None, Some("job3")]);
+        let host_array = StringArray::from(vec![None, Some("host2"), Some("host3")]);
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(ts_array),
+                Arc::new(val_array),
+                Arc::new(job_array),
+                Arc::new(host_array),
+            ],
+        )
+        .unwrap();
+
+        let rows = record_batch_to_rows(&batch, RegionId::new(1, 1)).unwrap();
+        assert_eq!(rows.rows.len(), 3);
+        assert_eq!(rows.schema.len(), 4);
+
+        // Whether each cell is expected to carry a value, per row, in output column
+        // order: timestamp, value, job, host.
+        let expected = [
+            // Row 0: all non-null except host
+            [true, true, true, false],
+            // Row 1: null timestamp, null job
+            [false, true, false, true],
+            // Row 2: null value
+            [true, false, true, true],
+        ];
+
+        // Index directly so that a missing row or cell panics instead of being skipped.
+        for (row_idx, flags) in expected.iter().enumerate() {
+            let values = &rows.rows[row_idx].values;
+            for (col_idx, has_value) in flags.iter().enumerate() {
+                assert_eq!(values[col_idx].value_data.is_some(), *has_value);
+            }
+        }
+    }
+}
diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs
index 9251605aea..edae0d2bb4 100644
--- a/src/metric-engine/src/engine/put.rs
+++ b/src/metric-engine/src/engine/put.rs
@@ -460,7 +460,7 @@ impl MetricEngineInner {
             .await
     }
 
-    fn find_data_region_meta(
+    pub(crate) fn find_data_region_meta(
         &self,
         logical_region_id: RegionId,
     ) -> Result<(RegionId, RegionId, PrimaryKeyEncoding)> {
diff --git a/src/metric-engine/src/lib.rs b/src/metric-engine/src/lib.rs
index 30daa80b91..b93029f2f4 100644
--- a/src/metric-engine/src/lib.rs
+++ b/src/metric-engine/src/lib.rs
@@ -52,6 +52,7 @@
 
 #![feature(assert_matches)]
 
+mod batch_modifier;
 pub mod config;
 mod data_region;
 pub mod engine;
diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml
index 1d7cf7b6d7..a78bf079b0 100644
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -108,6 +108,11 @@ name = "memtable_bench"
 harness = false
 required-features = ["test"]
 
+[[bench]]
+name = "bench_cache_stream"
+harness = false
+required-features = ["test"]
+
 [[bench]]
 name = "bench_filter_time_partition"
 harness = false
diff --git a/src/mito2/benches/bench_cache_stream.rs b/src/mito2/benches/bench_cache_stream.rs
new file mode 100644
index 0000000000..f2314f2ccb
--- /dev/null
+++ b/src/mito2/benches/bench_cache_stream.rs
@@ -0,0 +1,126 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Benchmarks for `cache_flat_range_stream` overhead.
+//!
+//! Compares consuming batches from a plain stream vs through the caching wrapper
+//! that clones batches for the range cache.
+//!
+//! Run with:
+//! ```sh
+//! cargo bench -p mito2 --features test --bench bench_cache_stream
+//! ```
+
+use std::collections::VecDeque;
+use std::sync::Arc;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use futures::TryStreamExt;
+use mito_codec::row_converter::DensePrimaryKeyCodec;
+use mito2::memtable::bulk::context::BulkIterContext;
+use mito2::memtable::bulk::part::{BulkPartConverter, BulkPartEncoder};
+use mito2::memtable::bulk::part_reader::EncodedBulkPartIter;
+use mito2::read::range_cache::bench_cache_flat_range_stream;
+use mito2::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
+use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
+use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
+
+/// Measures the overhead of `bench_cache_flat_range_stream` on top of a plain stream.
+///
+/// Both benchmarks rebuild an iterator over the same encoded bulk part on every
+/// iteration and drain the resulting stream:
+/// - `baseline_iter_stream` consumes the boxed stream directly;
+/// - `cache_flat_range_stream` consumes it through the wrapper under test.
+fn cache_flat_range_stream_bench(c: &mut Criterion) {
+    let metadata = Arc::new(cpu_metadata());
+    let region_id = metadata.region_id;
+    let start_sec = 1710043200;
+    // 2000 hosts × 51 steps = 102,000 rows ≈ DEFAULT_ROW_GROUP_SIZE
+    let num_hosts = 2000;
+    let end_sec = start_sec + 510;
+    let generator = CpuDataGenerator::new(metadata.clone(), num_hosts, start_sec, end_sec);
+
+    // Convert all generated key-values into a single bulk part.
+    let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
+    let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
+    let store_pk_columns = true;
+    let mut converter = BulkPartConverter::new(
+        &metadata,
+        flat_schema,
+        DEFAULT_ROW_GROUP_SIZE,
+        codec,
+        store_pk_columns,
+    );
+    for kvs in generator.iter() {
+        converter.append_key_values(&kvs).unwrap();
+    }
+    let bulk_part = converter.convert().unwrap();
+
+    // Encode the part to parquet.
+    let encoder = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE).unwrap();
+    let encoded_part = encoder.encode_part(&bulk_part).unwrap().unwrap();
+
+    // Read every row group of the encoded part.
+    let num_row_groups = encoded_part.metadata().parquet_metadata.num_row_groups();
+    let row_groups: VecDeque<usize> = (0..num_row_groups).collect();
+    let context = BulkIterContext::new(
+        metadata.clone(),
+        None, // No projection
+        None, // No predicate
+        false,
+    )
+    .unwrap();
+    let context = Arc::new(context);
+
+    // Builds a fresh boxed stream over the encoded part. It is called inside the
+    // measured closure, so creating the iterator is part of both measurements.
+    let new_stream = || -> mito2::read::BoxedRecordBatchStream {
+        let iter = EncodedBulkPartIter::try_new(
+            &encoded_part,
+            context.clone(),
+            row_groups.clone(),
+            None,
+            None,
+        )
+        .unwrap();
+        Box::pin(futures::stream::iter(iter))
+    };
+
+    let rt = tokio::runtime::Runtime::new().unwrap();
+
+    let mut group = c.benchmark_group("cache_flat_range_stream");
+    group.sample_size(10);
+
+    group.bench_function("baseline_iter_stream", |b| {
+        b.iter(|| {
+            rt.block_on(async {
+                let mut stream = new_stream();
+                while let Some(_batch) = stream.try_next().await.unwrap() {}
+            });
+        });
+    });
+
+    group.bench_function("cache_flat_range_stream", |b| {
+        b.iter(|| {
+            rt.block_on(async {
+                let plain = new_stream();
+                let mut stream = bench_cache_flat_range_stream(plain, 64 * 1024 * 1024, region_id);
+                while let Some(_batch) = stream.try_next().await.unwrap() {}
+            });
+        });
+    });
+}
+
+criterion_group!(benches, cache_flat_range_stream_bench);
+criterion_main!(benches);
diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs
index ebe994f861..8336625e3c 100644
--- a/src/mito2/benches/memtable_bench.rs
+++ b/src/mito2/benches/memtable_bench.rs
@@ -12,15 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+//! Benchmarks for memtable operations: writes, full scans, filtered scans,
+//! bulk part conversion, record batch iteration with filters, and flat merge.
+//!
+//! Run with:
+//! ```sh
+//! cargo bench -p mito2 --features test --bench memtable_bench
+//! ```
+
 use std::sync::Arc;
 
-use api::v1::value::ValueData;
-use api::v1::{Row, Rows, SemanticType};
 use criterion::{Criterion, criterion_group, criterion_main};
-use datafusion_common::Column;
-use datafusion_expr::{Expr, lit};
-use datatypes::data_type::ConcreteDataType;
-use datatypes::schema::ColumnSchema;
 use mito_codec::row_converter::DensePrimaryKeyCodec;
 use mito2::memtable::bulk::context::BulkIterContext;
 use mito2::memtable::bulk::part::BulkPartConverter;
@@ -28,20 +30,13 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter;
 use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig};
 use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
 use mito2::memtable::time_series::TimeSeriesMemtable;
-use mito2::memtable::{KeyValues, Memtable, RangesOptions};
+use mito2::memtable::{IterBuilder, Memtable, RangesOptions};
 use mito2::read::flat_merge::FlatMergeIterator;
 use mito2::read::scan_region::PredicateGroup;
 use mito2::region::options::MergeMode;
 use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
-use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
-use rand::Rng;
-use rand::rngs::ThreadRng;
-use rand::seq::IndexedRandom;
-use store_api::metadata::{
-    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
-};
-use store_api::storage::RegionId;
-use table::predicate::Predicate;
+use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
+use mito2::test_util::memtable_util;
 
 /// Writes rows.
 fn write_rows(c: &mut Criterion) {
@@ -105,7 +100,11 @@ fn full_scan(c: &mut Criterion) {
         }
 
         b.iter(|| {
-            let iter = memtable.iter(None, None, None).unwrap();
+            let iter = memtable
+                .ranges(None, RangesOptions::default())
+                .unwrap()
+                .build(None)
+                .unwrap();
             for batch in iter {
                 let _batch = batch.unwrap();
             }
@@ -145,7 +144,17 @@ fn filter_1_host(c: &mut Criterion) {
         let predicate = generator.random_host_filter();
 
         b.iter(|| {
-            let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap();
+            let iter = memtable
+                .ranges(
+                    None,
+                    RangesOptions {
+                        predicate: PredicateGroup::new(&metadata, predicate.exprs()).unwrap(),
+                        ..Default::default()
+                    },
+                )
+                .unwrap()
+                .build(None)
+                .unwrap();
             for batch in iter {
                 let _batch = batch.unwrap();
             }
@@ -202,224 +211,6 @@ fn filter_1_host(c: &mut Criterion) {
     });
 }
 
-struct Host {
-    hostname: String,
-    region: String,
-    datacenter: String,
-    rack: String,
-    os: String,
-    arch: String,
-    team: String,
-    service: String,
-    service_version: String,
-    service_environment: String,
-}
-
-impl Host {
-    fn random_with_id(id: usize) -> Host {
-        let mut rng = rand::rng();
-        let region = format!("ap-southeast-{}", rng.random_range(0..10));
-        let datacenter = format!(
-            "{}{}",
-            region,
-            ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
-        );
-        Host {
-            hostname: format!("host_{id}"),
-            region,
-            datacenter,
-            rack: rng.random_range(0..100).to_string(),
-            os: "Ubuntu16.04LTS".to_string(),
-            arch: "x86".to_string(),
-            team: "CHI".to_string(),
-            service: rng.random_range(0..100).to_string(),
-            service_version: rng.random_range(0..10).to_string(),
-            service_environment: "test".to_string(),
-        }
-    }
-
-    fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
-        let tags = [
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.hostname.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.region.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.datacenter.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.rack.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.os.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.arch.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.team.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service_version.clone())),
-            },
-            api::v1::Value {
-                value_data: Some(ValueData::StringValue(self.service_environment.clone())),
-            },
-        ];
-        for tag in tags {
-            values.push(tag);
-        }
-    }
-}
-
-struct CpuDataGenerator {
-    metadata: RegionMetadataRef,
-    column_schemas: Vec<api::v1::ColumnSchema>,
-    hosts: Vec<Host>,
-    start_sec: i64,
-    end_sec: i64,
-}
-
-impl CpuDataGenerator {
-    fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
-        let column_schemas = region_metadata_to_row_schema(&metadata);
-        Self {
-            metadata,
-            column_schemas,
-            hosts: Self::generate_hosts(num_hosts),
-            start_sec,
-            end_sec,
-        }
-    }
-
-    fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
-        // point per 10s.
-        (self.start_sec..self.end_sec)
-            .step_by(10)
-            .enumerate()
-            .map(|(seq, ts)| self.build_key_values(seq, ts))
-    }
-
-    fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
-        let rows = self
-            .hosts
-            .iter()
-            .map(|host| {
-                let mut rng = rand::rng();
-                let mut values = Vec::with_capacity(21);
-                values.push(api::v1::Value {
-                    value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
-                });
-                host.fill_values(&mut values);
-                for _ in 0..10 {
-                    values.push(api::v1::Value {
-                        value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
-                    });
-                }
-                Row { values }
-            })
-            .collect();
-        let mutation = api::v1::Mutation {
-            op_type: api::v1::OpType::Put as i32,
-            sequence: seq as u64,
-            rows: Some(Rows {
-                schema: self.column_schemas.clone(),
-                rows,
-            }),
-            write_hint: None,
-        };
-
-        KeyValues::new(&self.metadata, mutation).unwrap()
-    }
-
-    fn random_host_filter(&self) -> Predicate {
-        let host = self.random_hostname();
-        let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
-        Predicate::new(vec![expr])
-    }
-
-    fn random_host_filter_exprs(&self) -> Vec<Expr> {
-        let host = self.random_hostname();
-        vec![Expr::Column(Column::from_name("hostname")).eq(lit(host))]
-    }
-
-    fn random_hostname(&self) -> String {
-        let mut rng = rand::rng();
-        self.hosts.choose(&mut rng).unwrap().hostname.clone()
-    }
-
-    fn random_f64(rng: &mut ThreadRng) -> f64 {
-        let base: u32 = rng.random_range(30..95);
-        base as f64
-    }
-
-    fn generate_hosts(num_hosts: usize) -> Vec<Host> {
-        (0..num_hosts).map(Host::random_with_id).collect()
-    }
-}
-
-/// Creates a metadata for TSBS cpu-like table.
-fn cpu_metadata() -> RegionMetadata {
-    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
-    builder.push_column_metadata(ColumnMetadata {
-        column_schema: ColumnSchema::new(
-            "ts",
-            ConcreteDataType::timestamp_millisecond_datatype(),
-            false,
-        ),
-        semantic_type: SemanticType::Timestamp,
-        column_id: 0,
-    });
-    let mut column_id = 1;
-    let tags = [
-        "hostname",
-        "region",
-        "datacenter",
-        "rack",
-        "os",
-        "arch",
-        "team",
-        "service",
-        "service_version",
-        "service_environment",
-    ];
-    for tag in tags {
-        builder.push_column_metadata(ColumnMetadata {
-            column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
-            semantic_type: SemanticType::Tag,
-            column_id,
-        });
-        column_id += 1;
-    }
-    let fields = [
-        "usage_user",
-        "usage_system",
-        "usage_idle",
-        "usage_nice",
-        "usage_iowait",
-        "usage_irq",
-        "usage_softirq",
-        "usage_steal",
-        "usage_guest",
-        "usage_guest_nice",
-    ];
-    for field in fields {
-        builder.push_column_metadata(ColumnMetadata {
-            column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
-            semantic_type: SemanticType::Field,
-            column_id,
-        });
-        column_id += 1;
-    }
-    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
-    builder.build().unwrap()
-}
-
 fn bulk_part_converter(c: &mut Criterion) {
     let metadata = Arc::new(cpu_metadata());
     let start_sec = 1710043200;
diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs
index 0277397768..05035734de 100644
--- a/src/mito2/benches/simple_bulk_memtable.rs
+++ b/src/mito2/benches/simple_bulk_memtable.rs
@@ -21,7 +21,7 @@ use criterion::{Criterion, criterion_group, criterion_main};
 use datatypes::data_type::ConcreteDataType;
 use datatypes::schema::ColumnSchema;
 use mito2::memtable::simple_bulk_memtable::SimpleBulkMemtable;
-use mito2::memtable::{KeyValues, Memtable, MemtableRanges, RangesOptions};
+use mito2::memtable::{IterBuilder, KeyValues, Memtable, MemtableRanges, RangesOptions};
 use mito2::read;
 use mito2::read::Source;
 use mito2::read::dedup::DedupReader;
@@ -156,7 +156,11 @@ async fn flush(mem: &SimpleBulkMemtable) {
 }
 
 async fn flush_original(mem: &SimpleBulkMemtable) {
-    let iter = mem.iter(None, None, None).unwrap();
+    let iter = mem
+        .ranges(None, RangesOptions::default())
+        .unwrap()
+        .build(None)
+        .unwrap();
     for b in iter {
         black_box(b.unwrap());
     }
diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs
index 92c8a3bc36..33180ebf46 100644
--- a/src/mito2/src/access_layer.rs
+++ b/src/mito2/src/access_layer.rs
@@ -17,7 +17,6 @@ use std::time::{Duration, Instant};
 
 use async_stream::try_stream;
 use common_time::Timestamp;
-use either::Either;
 use futures::{Stream, TryStreamExt};
 use object_store::services::Fs;
 use object_store::util::{join_dir, with_instrument_layers};
@@ -37,7 +36,7 @@ use crate::error::{
     CleanDirSnafu, DeleteIndexSnafu, DeleteIndexesSnafu, DeleteSstsSnafu, OpenDalSnafu, Result,
 };
 use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED};
-use crate::read::{FlatSource, Source};
+use crate::read::FlatSource;
 use crate::region::options::IndexOptions;
 use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId};
 use crate::sst::index::IndexerBuilderImpl;
@@ -47,7 +46,7 @@ use crate::sst::location::{self, region_dir_from_table_dir};
 use crate::sst::parquet::reader::ParquetReaderBuilder;
 use crate::sst::parquet::writer::ParquetWriter;
 use crate::sst::parquet::{SstInfo, WriteOptions};
-use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY};
+use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FormatType};
 
 pub type AccessLayerRef = Arc<AccessLayer>;
 /// SST write results.
@@ -339,6 +338,7 @@ impl AccessLayer {
         metrics: &mut Metrics,
     ) -> Result<SstInfoArray> {
         let region_id = request.metadata.region_id;
+        let region_metadata = request.metadata.clone();
         let cache_manager = request.cache_manager.clone();
 
         let sst_info = if let Some(write_cache) = cache_manager.write_cache() {
@@ -391,15 +391,19 @@ impl AccessLayer {
             )
             .await
             .with_file_cleaner(cleaner);
-            match request.source {
-                Either::Left(source) => {
+            match request.sst_write_format {
+                FormatType::PrimaryKey => {
                     writer
-                        .write_all(source, request.max_sequence, write_opts)
+                        .write_all_flat_as_primary_key(
+                            request.source,
+                            request.max_sequence,
+                            write_opts,
+                        )
                         .await?
                 }
-                Either::Right(flat_source) => {
+                FormatType::Flat => {
                     writer
-                        .write_all_flat(flat_source, request.max_sequence, write_opts)
+                        .write_all_flat(request.source, request.max_sequence, write_opts)
                         .await?
                 }
             }
@@ -412,6 +416,7 @@ impl AccessLayer {
                     cache_manager.put_parquet_meta_data(
                         RegionFileId::new(region_id, sst.file_id),
                         parquet_metadata.clone(),
+                        Some(region_metadata.clone()),
                     )
                 }
             }
@@ -520,11 +525,12 @@ pub enum OperationType {
 pub struct SstWriteRequest {
     pub op_type: OperationType,
     pub metadata: RegionMetadataRef,
-    pub source: Either<Source, FlatSource>,
+    pub source: FlatSource,
     pub cache_manager: CacheManagerRef,
     #[allow(dead_code)]
     pub storage: Option<String>,
     pub max_sequence: Option<SequenceNumber>,
+    pub sst_write_format: FormatType,
 
     /// Configs for index
     pub index_options: IndexOptions,
diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs
index 3ad71d2a61..35db74eee6 100644
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -28,6 +28,7 @@ use std::ops::Range;
 use std::sync::Arc;
 
 use bytes::Bytes;
+use common_telemetry::warn;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::value::Value;
 use datatypes::vectors::VectorRef;
@@ -36,8 +37,10 @@ use index::result_cache::IndexResultCache;
 use moka::notification::RemovalCause;
 use moka::sync::Cache;
 use object_store::ObjectStore;
-use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
+use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData};
 use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
+use snafu::{OptionExt, ResultExt};
+use store_api::metadata::RegionMetadataRef;
 use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
 
 use crate::cache::cache_size::parquet_meta_size;
@@ -46,10 +49,13 @@ use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCache
 #[cfg(feature = "vector_index")]
 use crate::cache::index::vector_index::{VectorIndexCache, VectorIndexCacheRef};
 use crate::cache::write_cache::WriteCacheRef;
+use crate::error::{InvalidMetadataSnafu, InvalidParquetSnafu, Result};
 use crate::memtable::record_batch_estimated_size;
 use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS};
 use crate::read::Batch;
+use crate::read::range_cache::{RangeScanCacheKey, RangeScanCacheValue};
 use crate::sst::file::{RegionFileId, RegionIndexId};
+use crate::sst::parquet::PARQUET_METADATA_KEY;
 use crate::sst::parquet::reader::MetadataCacheMetrics;
 
 /// Metrics type key for sst meta.
@@ -64,6 +70,108 @@ const FILE_TYPE: &str = "file";
 const INDEX_TYPE: &str = "index";
 /// Metrics type key for selector result cache.
 const SELECTOR_RESULT_TYPE: &str = "selector_result";
+/// Metrics type key for range scan result cache.
+const RANGE_RESULT_TYPE: &str = "range_result";
+
+/// Cached SST metadata combines the parquet footer with the decoded region metadata.
+///
+/// The cached parquet footer strips the `greptime:metadata` JSON payload and stores the decoded
+/// [RegionMetadata] separately so readers can skip repeated deserialization work.
+#[derive(Debug)]
+pub(crate) struct CachedSstMeta {
+    parquet_metadata: Arc<ParquetMetaData>,
+    region_metadata: RegionMetadataRef,
+    region_metadata_weight: usize,
+}
+
+impl CachedSstMeta {
+    pub(crate) fn try_new(file_path: &str, parquet_metadata: ParquetMetaData) -> Result<Self> {
+        Self::try_new_with_region_metadata(file_path, parquet_metadata, None)
+    }
+
+    pub(crate) fn try_new_with_region_metadata(
+        file_path: &str,
+        parquet_metadata: ParquetMetaData,
+        region_metadata: Option<RegionMetadataRef>,
+    ) -> Result<Self> {
+        let file_metadata = parquet_metadata.file_metadata();
+        let key_values = file_metadata
+            .key_value_metadata()
+            .context(InvalidParquetSnafu {
+                file: file_path,
+                reason: "missing key value meta",
+            })?;
+        let meta_value = key_values
+            .iter()
+            .find(|kv| kv.key == PARQUET_METADATA_KEY)
+            .with_context(|| InvalidParquetSnafu {
+                file: file_path,
+                reason: format!("key {} not found", PARQUET_METADATA_KEY),
+            })?;
+        let json = meta_value
+            .value
+            .as_ref()
+            .with_context(|| InvalidParquetSnafu {
+                file: file_path,
+                reason: format!("No value for key {}", PARQUET_METADATA_KEY),
+            })?;
+        let region_metadata = match region_metadata {
+            Some(region_metadata) => region_metadata,
+            None => Arc::new(
+                store_api::metadata::RegionMetadata::from_json(json)
+                    .context(InvalidMetadataSnafu)?,
+            ),
+        };
+        // Keep the previous JSON-byte floor and charge the decoded structures as well.
+        let region_metadata_weight = region_metadata.estimated_size().max(json.len());
+        let parquet_metadata = Arc::new(strip_region_metadata_from_parquet(parquet_metadata));
+
+        Ok(Self {
+            parquet_metadata,
+            region_metadata,
+            region_metadata_weight,
+        })
+    }
+
+    pub(crate) fn parquet_metadata(&self) -> Arc<ParquetMetaData> {
+        self.parquet_metadata.clone()
+    }
+
+    pub(crate) fn region_metadata(&self) -> RegionMetadataRef {
+        self.region_metadata.clone()
+    }
+}
+
+fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> ParquetMetaData {
+    let file_metadata = parquet_metadata.file_metadata();
+    let filtered_key_values = file_metadata.key_value_metadata().and_then(|key_values| {
+        let filtered = key_values
+            .iter()
+            .filter(|kv| kv.key != PARQUET_METADATA_KEY)
+            .cloned()
+            .collect::<Vec<_>>();
+        (!filtered.is_empty()).then_some(filtered)
+    });
+    let stripped_file_metadata = FileMetaData::new(
+        file_metadata.version(),
+        file_metadata.num_rows(),
+        file_metadata.created_by().map(ToString::to_string),
+        filtered_key_values,
+        file_metadata.schema_descr_ptr(),
+        file_metadata.column_orders().cloned(),
+    );
+
+    let mut builder = parquet_metadata.into_builder();
+    let row_groups = builder.take_row_groups();
+    let column_index = builder.take_column_index();
+    let offset_index = builder.take_offset_index();
+
+    parquet::file::metadata::ParquetMetaDataBuilder::new(stripped_file_metadata)
+        .set_row_groups(row_groups)
+        .set_column_index(column_index)
+        .set_offset_index(offset_index)
+        .build()
+}
 
 /// Cache strategies that may only enable a subset of caches.
 #[derive(Clone)]
@@ -81,18 +189,17 @@ pub enum CacheStrategy {
 }
 
 impl CacheStrategy {
-    /// Gets parquet metadata with cache metrics tracking.
-    /// Returns the metadata and updates the provided metrics.
-    pub(crate) async fn get_parquet_meta_data(
+    /// Gets fused SST metadata with cache metrics tracking.
+    pub(crate) async fn get_sst_meta_data(
         &self,
         file_id: RegionFileId,
         metrics: &mut MetadataCacheMetrics,
         page_index_policy: PageIndexPolicy,
-    ) -> Option<Arc<ParquetMetaData>> {
+    ) -> Option<Arc<CachedSstMeta>> {
         match self {
             CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
                 cache_manager
-                    .get_parquet_meta_data(file_id, metrics, page_index_policy)
+                    .get_sst_meta_data(file_id, metrics, page_index_policy)
                     .await
             }
             CacheStrategy::Disabled => {
@@ -102,30 +209,48 @@ impl CacheStrategy {
         }
     }
 
-    /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()].
-    pub fn get_parquet_meta_data_from_mem_cache(
+    /// Calls [CacheManager::get_sst_meta_data_from_mem_cache()].
+    pub(crate) fn get_sst_meta_data_from_mem_cache(
         &self,
         file_id: RegionFileId,
-    ) -> Option<Arc<ParquetMetaData>> {
+    ) -> Option<Arc<CachedSstMeta>> {
         match self {
-            CacheStrategy::EnableAll(cache_manager) => {
-                cache_manager.get_parquet_meta_data_from_mem_cache(file_id)
-            }
-            CacheStrategy::Compaction(cache_manager) => {
-                cache_manager.get_parquet_meta_data_from_mem_cache(file_id)
+            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
+                cache_manager.get_sst_meta_data_from_mem_cache(file_id)
             }
             CacheStrategy::Disabled => None,
         }
     }
 
-    /// Calls [CacheManager::put_parquet_meta_data()].
-    pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc<ParquetMetaData>) {
+    /// Calls [CacheManager::get_parquet_meta_data_from_mem_cache()].
+    pub fn get_parquet_meta_data_from_mem_cache(
+        &self,
+        file_id: RegionFileId,
+    ) -> Option<Arc<ParquetMetaData>> {
+        self.get_sst_meta_data_from_mem_cache(file_id)
+            .map(|metadata| metadata.parquet_metadata())
+    }
+
+    /// Calls [CacheManager::put_sst_meta_data()].
+    pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc<CachedSstMeta>) {
         match self {
-            CacheStrategy::EnableAll(cache_manager) => {
-                cache_manager.put_parquet_meta_data(file_id, metadata);
+            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
+                cache_manager.put_sst_meta_data(file_id, metadata);
             }
-            CacheStrategy::Compaction(cache_manager) => {
-                cache_manager.put_parquet_meta_data(file_id, metadata);
+            CacheStrategy::Disabled => {}
+        }
+    }
+
+    /// Calls [CacheManager::put_parquet_meta_data()].
+    pub fn put_parquet_meta_data(
+        &self,
+        file_id: RegionFileId,
+        metadata: Arc<ParquetMetaData>,
+        region_metadata: Option<RegionMetadataRef>,
+    ) {
+        match self {
+            CacheStrategy::EnableAll(cache_manager) | CacheStrategy::Compaction(cache_manager) => {
+                cache_manager.put_parquet_meta_data(file_id, metadata, region_metadata);
             }
             CacheStrategy::Disabled => {}
         }
@@ -223,6 +348,31 @@ impl CacheStrategy {
         }
     }
 
+    /// Calls [CacheManager::get_range_result()].
+    /// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
+    #[allow(dead_code)]
+    pub(crate) fn get_range_result(
+        &self,
+        key: &RangeScanCacheKey,
+    ) -> Option<Arc<RangeScanCacheValue>> {
+        match self {
+            CacheStrategy::EnableAll(cache_manager) => cache_manager.get_range_result(key),
+            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None,
+        }
+    }
+
+    /// Calls [CacheManager::put_range_result()].
+    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
+    pub(crate) fn put_range_result(
+        &self,
+        key: RangeScanCacheKey,
+        result: Arc<RangeScanCacheValue>,
+    ) {
+        if let CacheStrategy::EnableAll(cache_manager) = self {
+            cache_manager.put_range_result(key, result);
+        }
+    }
+
     /// Calls [CacheManager::write_cache()].
     /// It returns None if the strategy is [CacheStrategy::Disabled].
     pub fn write_cache(&self) -> Option<&WriteCacheRef> {
@@ -324,6 +474,8 @@ pub struct CacheManager {
     puffin_metadata_cache: Option<PuffinMetadataCacheRef>,
     /// Cache for time series selectors.
     selector_result_cache: Option<SelectorResultCache>,
+    /// Cache for range scan outputs in flat format.
+    range_result_cache: Option<RangeResultCache>,
     /// Cache for index result.
     index_result_cache: Option<IndexResultCache>,
 }
@@ -336,6 +488,35 @@ impl CacheManager {
         CacheManagerBuilder::default()
     }
 
+    /// Gets fused SST metadata with metrics tracking.
+    /// Tries in-memory cache first, then file cache, updating metrics accordingly.
+    pub(crate) async fn get_sst_meta_data(
+        &self,
+        file_id: RegionFileId,
+        metrics: &mut MetadataCacheMetrics,
+        page_index_policy: PageIndexPolicy,
+    ) -> Option<Arc<CachedSstMeta>> {
+        if let Some(metadata) = self.get_sst_meta_data_from_mem_cache(file_id) {
+            metrics.mem_cache_hit += 1;
+            return Some(metadata);
+        }
+
+        let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet);
+        if let Some(write_cache) = &self.write_cache
+            && let Some(metadata) = write_cache
+                .file_cache()
+                .get_sst_meta_data(key, metrics, page_index_policy)
+                .await
+        {
+            metrics.file_cache_hit += 1;
+            self.put_sst_meta_data(file_id, metadata.clone());
+            return Some(metadata);
+        }
+
+        metrics.cache_miss += 1;
+        None
+    }
+
     /// Gets cached [ParquetMetaData] with metrics tracking.
     /// Tries in-memory cache first, then file cache, updating metrics accordingly.
     pub(crate) async fn get_parquet_meta_data(
@@ -344,29 +525,21 @@ impl CacheManager {
         metrics: &mut MetadataCacheMetrics,
         page_index_policy: PageIndexPolicy,
     ) -> Option<Arc<ParquetMetaData>> {
-        // Try to get metadata from sst meta cache
-        if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) {
-            metrics.mem_cache_hit += 1;
-            return Some(metadata);
-        }
+        self.get_sst_meta_data(file_id, metrics, page_index_policy)
+            .await
+            .map(|metadata| metadata.parquet_metadata())
+    }
 
-        // Try to get metadata from write cache
-        let key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet);
-        if let Some(write_cache) = &self.write_cache
-            && let Some(metadata) = write_cache
-                .file_cache()
-                .get_parquet_meta_data(key, metrics, page_index_policy)
-                .await
-        {
-            metrics.file_cache_hit += 1;
-            let metadata = Arc::new(metadata);
-            // Put metadata into sst meta cache
-            self.put_parquet_meta_data(file_id, metadata.clone());
-            return Some(metadata);
-        };
-        metrics.cache_miss += 1;
-
-        None
+    /// Gets cached fused SST metadata from in-memory cache.
+    /// This method does not perform I/O.
+    pub(crate) fn get_sst_meta_data_from_mem_cache(
+        &self,
+        file_id: RegionFileId,
+    ) -> Option<Arc<CachedSstMeta>> {
+        self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
+            let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id()));
+            update_hit_miss(value, SST_META_TYPE)
+        })
     }
 
     /// Gets cached [ParquetMetaData] from in-memory cache.
@@ -375,15 +548,12 @@ impl CacheManager {
         &self,
         file_id: RegionFileId,
     ) -> Option<Arc<ParquetMetaData>> {
-        // Try to get metadata from sst meta cache
-        self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
-            let value = sst_meta_cache.get(&SstMetaKey(file_id.region_id(), file_id.file_id()));
-            update_hit_miss(value, SST_META_TYPE)
-        })
+        self.get_sst_meta_data_from_mem_cache(file_id)
+            .map(|metadata| metadata.parquet_metadata())
     }
 
-    /// Puts [ParquetMetaData] into the cache.
-    pub fn put_parquet_meta_data(&self, file_id: RegionFileId, metadata: Arc<ParquetMetaData>) {
+    /// Puts fused SST metadata into the cache.
+    pub(crate) fn put_sst_meta_data(&self, file_id: RegionFileId, metadata: Arc<CachedSstMeta>) {
         if let Some(cache) = &self.sst_meta_cache {
             let key = SstMetaKey(file_id.region_id(), file_id.file_id());
             CACHE_BYTES
@@ -393,6 +563,34 @@ impl CacheManager {
         }
     }
 
+    /// Puts [ParquetMetaData] into the cache.
+    pub fn put_parquet_meta_data(
+        &self,
+        file_id: RegionFileId,
+        metadata: Arc<ParquetMetaData>,
+        region_metadata: Option<RegionMetadataRef>,
+    ) {
+        if self.sst_meta_cache.is_some() {
+            let file_path = format!(
+                "region_id={}, file_id={}",
+                file_id.region_id(),
+                file_id.file_id()
+            );
+            match CachedSstMeta::try_new_with_region_metadata(
+                &file_path,
+                Arc::unwrap_or_clone(metadata),
+                region_metadata,
+            ) {
+                Ok(metadata) => self.put_sst_meta_data(file_id, Arc::new(metadata)),
+                Err(err) => warn!(
+                    err; "Failed to decode region metadata while caching parquet metadata, region_id: {}, file_id: {}",
+                    file_id.region_id(),
+                    file_id.file_id()
+                ),
+            }
+        }
+    }
+
     /// Removes [ParquetMetaData] from the cache.
     pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) {
         if let Some(cache) = &self.sst_meta_cache {
@@ -512,6 +710,31 @@ impl CacheManager {
         }
     }
 
+    /// Gets cached result for range scan.
+    #[allow(dead_code)]
+    pub(crate) fn get_range_result(
+        &self,
+        key: &RangeScanCacheKey,
+    ) -> Option<Arc<RangeScanCacheValue>> {
+        self.range_result_cache
+            .as_ref()
+            .and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE))
+    }
+
+    /// Puts range scan result into cache.
+    pub(crate) fn put_range_result(
+        &self,
+        key: RangeScanCacheKey,
+        result: Arc<RangeScanCacheValue>,
+    ) {
+        if let Some(cache) = &self.range_result_cache {
+            CACHE_BYTES
+                .with_label_values(&[RANGE_RESULT_TYPE])
+                .add(range_result_cache_weight(&key, &result).into());
+            cache.insert(key, result);
+        }
+    }
+
     /// Gets the write cache.
     pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
         self.write_cache.as_ref()
@@ -562,6 +785,7 @@ pub struct CacheManagerBuilder {
     puffin_metadata_size: u64,
     write_cache: Option<WriteCacheRef>,
     selector_result_cache_size: u64,
+    range_result_cache_size: u64,
 }
 
 impl CacheManagerBuilder {
@@ -625,6 +849,12 @@ impl CacheManagerBuilder {
         self
     }
 
+    /// Sets range result cache size.
+    pub fn range_result_cache_size(mut self, bytes: u64) -> Self {
+        self.range_result_cache_size = bytes;
+        self
+    }
+
     /// Builds the [CacheManager].
     pub fn build(self) -> CacheManager {
         fn to_str(cause: RemovalCause) -> &'static str {
@@ -712,6 +942,21 @@ impl CacheManagerBuilder {
                 })
                 .build()
         });
+        let range_result_cache = (self.range_result_cache_size != 0).then(|| {
+            Cache::builder()
+                .max_capacity(self.range_result_cache_size)
+                .weigher(range_result_cache_weight)
+                .eviction_listener(move |k, v, cause| {
+                    let size = range_result_cache_weight(&k, &v);
+                    CACHE_BYTES
+                        .with_label_values(&[RANGE_RESULT_TYPE])
+                        .sub(size.into());
+                    CACHE_EVICTION
+                        .with_label_values(&[RANGE_RESULT_TYPE, to_str(cause)])
+                        .inc();
+                })
+                .build()
+        });
         CacheManager {
             sst_meta_cache,
             vector_cache,
@@ -723,14 +968,15 @@ impl CacheManagerBuilder {
             vector_index_cache,
             puffin_metadata_cache: Some(Arc::new(puffin_metadata_cache)),
             selector_result_cache,
+            range_result_cache,
             index_result_cache,
         }
     }
 }
 
-fn meta_cache_weight(k: &SstMetaKey, v: &Arc<ParquetMetaData>) -> u32 {
+fn meta_cache_weight(k: &SstMetaKey, v: &Arc<CachedSstMeta>) -> u32 {
     // We ignore the size of `Arc`.
-    (k.estimated_size() + parquet_meta_size(v)) as u32
+    (k.estimated_size() + parquet_meta_size(&v.parquet_metadata) + v.region_metadata_weight) as u32
 }
 
 fn vector_cache_weight(_k: &(ConcreteDataType, Value), v: &VectorRef) -> u32 {
@@ -746,6 +992,10 @@ fn selector_result_cache_weight(k: &SelectorResultKey, v: &Arc<SelectorResultVal
     (mem::size_of_val(k) + v.estimated_size()) as u32
 }
 
+fn range_result_cache_weight(k: &RangeScanCacheKey, v: &Arc<RangeScanCacheValue>) -> u32 {
+    (k.estimated_size() + v.estimated_size()) as u32
+}
+
 /// Updates cache hit/miss metrics.
 fn update_hit_miss<T>(value: Option<T>, cache_type: &str) -> Option<T> {
     if value.is_some() {
@@ -892,8 +1142,8 @@ impl SelectorResultValue {
     }
 }
 
-/// Maps (region id, file id) to [ParquetMetaData].
-type SstMetaCache = Cache<SstMetaKey, Arc<ParquetMetaData>>;
+/// Maps (region id, file id) to fused SST metadata.
+type SstMetaCache = Cache<SstMetaKey, Arc<CachedSstMeta>>;
 /// Maps [Value] to a vector that holds this value repeatedly.
 ///
 /// e.g. `"hello" => ["hello", "hello", "hello"]`
@@ -902,20 +1152,30 @@ type VectorCache = Cache<(ConcreteDataType, Value), VectorRef>;
 type PageCache = Cache<PageKey, Arc<PageValue>>;
 /// Maps (file id, row group id, time series row selector) to [SelectorResultValue].
 type SelectorResultCache = Cache<SelectorResultKey, Arc<SelectorResultValue>>;
+/// Maps partition-range scan key to cached flat batches.
+type RangeResultCache = Cache<RangeScanCacheKey, Arc<RangeScanCacheValue>>;
 
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
 
+    use api::v1::SemanticType;
     use api::v1::index::{BloomFilterMeta, InvertedIndexMetas};
+    use datatypes::schema::ColumnSchema;
     use datatypes::vectors::Int64Vector;
     use puffin::file_metadata::FileMetadata;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
     use store_api::storage::ColumnId;
 
     use super::*;
     use crate::cache::index::bloom_filter_index::Tag;
     use crate::cache::index::result_cache::PredicateKey;
-    use crate::cache::test_util::parquet_meta;
+    use crate::cache::test_util::{
+        parquet_meta, sst_parquet_meta, sst_parquet_meta_with_region_metadata,
+    };
+    use crate::read::range_cache::{
+        RangeScanCacheKey, RangeScanCacheValue, ScanRequestFingerprintBuilder,
+    };
     use crate::sst::parquet::row_selection::RowGroupSelection;
 
     #[tokio::test]
@@ -929,7 +1189,7 @@ mod tests {
         let file_id = RegionFileId::new(region_id, FileId::random());
         let metadata = parquet_meta();
         let mut metrics = MetadataCacheMetrics::default();
-        cache.put_parquet_meta_data(file_id, metadata);
+        cache.put_parquet_meta_data(file_id, metadata, None);
         assert!(
             cache
                 .get_parquet_meta_data(file_id, &mut metrics, Default::default())
@@ -966,13 +1226,23 @@ mod tests {
                 .await
                 .is_none()
         );
-        let metadata = parquet_meta();
-        cache.put_parquet_meta_data(file_id, metadata);
+        let (metadata, region_metadata) = sst_parquet_meta();
+        cache.put_parquet_meta_data(file_id, metadata, None);
+        let cached = cache
+            .get_sst_meta_data(file_id, &mut metrics, Default::default())
+            .await
+            .unwrap();
+        assert_eq!(region_metadata, cached.region_metadata());
         assert!(
-            cache
-                .get_parquet_meta_data(file_id, &mut metrics, Default::default())
-                .await
-                .is_some()
+            cached
+                .parquet_metadata()
+                .file_metadata()
+                .key_value_metadata()
+                .is_none_or(|key_values| {
+                    key_values
+                        .iter()
+                        .all(|key_value| key_value.key != PARQUET_METADATA_KEY)
+                })
         );
         cache.remove_parquet_meta_data(file_id);
         assert!(
@@ -983,6 +1253,42 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_parquet_meta_cache_with_provided_region_metadata() {
+        let cache = CacheManager::builder().sst_meta_cache_size(2000).build();
+        let mut metrics = MetadataCacheMetrics::default();
+        let region_id = RegionId::new(1, 1);
+        let file_id = RegionFileId::new(region_id, FileId::random());
+        let (metadata, region_metadata) = sst_parquet_meta();
+
+        cache.put_parquet_meta_data(file_id, metadata, Some(region_metadata.clone()));
+
+        let cached = cache
+            .get_sst_meta_data(file_id, &mut metrics, Default::default())
+            .await
+            .unwrap();
+        assert!(Arc::ptr_eq(&region_metadata, &cached.region_metadata()));
+    }
+
+    #[test]
+    fn test_meta_cache_weight_accounts_for_decoded_region_metadata() {
+        let region_metadata = Arc::new(wide_region_metadata(128));
+        let json_len = region_metadata.to_json().unwrap().len();
+        let metadata = sst_parquet_meta_with_region_metadata(region_metadata.clone());
+        let cached = Arc::new(
+            CachedSstMeta::try_new("test.parquet", Arc::unwrap_or_clone(metadata)).unwrap(),
+        );
+        let key = SstMetaKey(region_metadata.region_id, FileId::random());
+
+        assert!(cached.region_metadata_weight > json_len);
+        assert_eq!(
+            meta_cache_weight(&key, &cached) as usize,
+            key.estimated_size()
+                + parquet_meta_size(&cached.parquet_metadata)
+                + cached.region_metadata_weight
+        );
+    }
+
     #[test]
     fn test_repeated_vector_cache() {
         let cache = CacheManager::builder().vector_cache_size(4096).build();
@@ -1028,6 +1334,50 @@ mod tests {
         assert!(cache.get_selector_result(&key).is_some());
     }
 
+    #[test]
+    fn test_range_result_cache() {
+        let cache = Arc::new(
+            CacheManager::builder()
+                .range_result_cache_size(1024 * 1024)
+                .build(),
+        );
+
+        let key = RangeScanCacheKey {
+            region_id: RegionId::new(1, 1),
+            row_groups: vec![(FileId::random(), 0)],
+            scan: ScanRequestFingerprintBuilder {
+                read_column_ids: vec![],
+                read_column_types: vec![],
+                filters: vec!["tag_0 = 1".to_string()],
+                time_filters: vec![],
+                series_row_selector: None,
+                append_mode: false,
+                filter_deleted: true,
+                merge_mode: crate::region::options::MergeMode::LastRow,
+                partition_expr_version: 0,
+            }
+            .build(),
+        };
+        let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0));
+
+        assert!(cache.get_range_result(&key).is_none());
+        cache.put_range_result(key.clone(), value.clone());
+        assert!(cache.get_range_result(&key).is_some());
+
+        let enable_all = CacheStrategy::EnableAll(cache.clone());
+        assert!(enable_all.get_range_result(&key).is_some());
+
+        let compaction = CacheStrategy::Compaction(cache.clone());
+        assert!(compaction.get_range_result(&key).is_none());
+        compaction.put_range_result(key.clone(), value.clone());
+        assert!(cache.get_range_result(&key).is_some());
+
+        let disabled = CacheStrategy::Disabled;
+        assert!(disabled.get_range_result(&key).is_none());
+        disabled.put_range_result(key.clone(), value);
+        assert!(cache.get_range_result(&key).is_some());
+    }
+
     #[tokio::test]
     async fn test_evict_puffin_cache_clears_all_entries() {
         use std::collections::{BTreeMap, HashMap};
@@ -1122,4 +1472,45 @@ mod tests {
         assert!(result_cache.get(&predicate, index_id.file_id()).is_none());
         assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none());
     }
+
+    fn wide_region_metadata(column_count: u32) -> RegionMetadata {
+        let region_id = RegionId::new(1024, 7);
+        let mut builder = RegionMetadataBuilder::new(region_id);
+        let mut primary_key = Vec::new();
+
+        for column_id in 0..column_count {
+            let semantic_type = if column_id < 32 {
+                primary_key.push(column_id);
+                SemanticType::Tag
+            } else {
+                SemanticType::Field
+            };
+            let mut column_schema = ColumnSchema::new(
+                format!("wide_column_{column_id}"),
+                ConcreteDataType::string_datatype(),
+                true,
+            );
+            column_schema
+                .mut_metadata()
+                .insert(format!("cache_key_{column_id}"), "cache_value".repeat(4));
+            builder.push_column_metadata(ColumnMetadata {
+                column_schema,
+                semantic_type,
+                column_id,
+            });
+        }
+
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(
+                "ts",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            ),
+            semantic_type: SemanticType::Timestamp,
+            column_id: column_count,
+        });
+        builder.primary_key(primary_key);
+
+        builder.build().unwrap()
+    }
 }
diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs
index 32a276d0e4..278838b369 100644
--- a/src/mito2/src/cache/file_cache.rs
+++ b/src/mito2/src/cache/file_cache.rs
@@ -34,7 +34,7 @@ use store_api::storage::{FileId, RegionId};
 use tokio::sync::mpsc::{Sender, UnboundedReceiver};
 
 use crate::access_layer::TempFileCleaner;
-use crate::cache::{FILE_TYPE, INDEX_TYPE};
+use crate::cache::{CachedSstMeta, FILE_TYPE, INDEX_TYPE};
 use crate::error::{self, OpenDalSnafu, Result};
 use crate::metrics::{
     CACHE_BYTES, CACHE_HIT, CACHE_MISS, WRITE_CACHE_DOWNLOAD_BYTES_TOTAL,
@@ -612,6 +612,34 @@ impl FileCache {
         }
     }
 
+    /// Get fused SST metadata from the file cache.
+    /// If the file is not in the cache, or metadata loading/decoding fails, return None.
+    pub(crate) async fn get_sst_meta_data(
+        &self,
+        key: IndexKey,
+        cache_metrics: &mut MetadataCacheMetrics,
+        page_index_policy: PageIndexPolicy,
+    ) -> Option<Arc<CachedSstMeta>> {
+        let file_path = self.inner.cache_file_path(key);
+        self.get_parquet_meta_data(key, cache_metrics, page_index_policy)
+            .await
+            .and_then(
+                |metadata| match CachedSstMeta::try_new(&file_path, metadata) {
+                    Ok(metadata) => Some(Arc::new(metadata)),
+                    Err(err) => {
+                        CACHE_MISS
+                            .with_label_values(&[key.file_type.metric_label()])
+                            .inc();
+                        warn!(
+                            err; "Failed to decode cached parquet metadata for key {:?}",
+                            key
+                        );
+                        None
+                    }
+                },
+            )
+    }
+
     async fn get_reader(&self, file_path: &str) -> object_store::Result<Option<Reader>> {
         if self.inner.local_store.exists(file_path).await? {
             Ok(Some(self.inner.local_store.reader(file_path).await?))
diff --git a/src/mito2/src/cache/test_util.rs b/src/mito2/src/cache/test_util.rs
index 65ad9d87eb..ef3d8e9315 100644
--- a/src/mito2/src/cache/test_util.rs
+++ b/src/mito2/src/cache/test_util.rs
@@ -23,8 +23,13 @@ use object_store::ObjectStore;
 use object_store::services::Fs;
 use parquet::arrow::ArrowWriter;
 use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::file::metadata::ParquetMetaData;
+use parquet::file::metadata::{KeyValue, ParquetMetaData};
+use parquet::file::properties::WriterProperties;
 use parquet::file::statistics::Statistics;
+use store_api::metadata::RegionMetadataRef;
+
+use crate::sst::parquet::PARQUET_METADATA_KEY;
+use crate::test_util::sst_util::sst_region_metadata;
 
 /// Returns a parquet meta data.
 pub(crate) fn parquet_meta() -> Arc<ParquetMetaData> {
@@ -33,13 +38,43 @@ pub(crate) fn parquet_meta() -> Arc<ParquetMetaData> {
     builder.metadata().clone()
 }
 
+/// Returns parquet metadata for an SST parquet file and its decoded region metadata.
+pub(crate) fn sst_parquet_meta() -> (Arc<ParquetMetaData>, RegionMetadataRef) {
+    let region_metadata = Arc::new(sst_region_metadata());
+    let file_data = parquet_file_data_with_region_metadata(&region_metadata);
+    let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap();
+    (builder.metadata().clone(), region_metadata)
+}
+
+/// Returns parquet metadata for an SST parquet file with custom region metadata.
+pub(crate) fn sst_parquet_meta_with_region_metadata(
+    region_metadata: RegionMetadataRef,
+) -> Arc<ParquetMetaData> {
+    let file_data = parquet_file_data_with_region_metadata(&region_metadata);
+    let builder = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(file_data)).unwrap();
+    builder.metadata().clone()
+}
+
 /// Write a test parquet file to a buffer
 fn parquet_file_data() -> Vec<u8> {
+    parquet_file_data_inner(None)
+}
+
+fn parquet_file_data_with_region_metadata(region_metadata: &RegionMetadataRef) -> Vec<u8> {
+    let json = region_metadata.to_json().unwrap();
+    let key_value = KeyValue::new(PARQUET_METADATA_KEY.to_string(), json);
+    parquet_file_data_inner(Some(vec![key_value]))
+}
+
+fn parquet_file_data_inner(key_value_metadata: Option<Vec<KeyValue>>) -> Vec<u8> {
     let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
     let to_write = RecordBatch::try_from_iter([("col", col)]).unwrap();
 
     let mut buffer = Vec::new();
-    let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), None).unwrap();
+    let props = WriterProperties::builder()
+        .set_key_value_metadata(key_value_metadata)
+        .build();
+    let mut writer = ArrowWriter::try_new(&mut buffer, to_write.schema(), Some(props)).unwrap();
     writer.write(&to_write).unwrap();
     writer.close().unwrap();
 
diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs
index a28df3f54c..e2483ed4e4 100644
--- a/src/mito2/src/cache/write_cache.rs
+++ b/src/mito2/src/cache/write_cache.rs
@@ -244,15 +244,19 @@ impl WriteCache {
         .await
         .with_file_cleaner(cleaner);
 
-        let sst_info = match write_request.source {
-            either::Left(source) => {
+        let sst_info = match write_request.sst_write_format {
+            crate::sst::FormatType::PrimaryKey => {
                 writer
-                    .write_all(source, write_request.max_sequence, write_opts)
+                    .write_all_flat_as_primary_key(
+                        write_request.source,
+                        write_request.max_sequence,
+                        write_opts,
+                    )
                     .await?
             }
-            either::Right(flat_source) => {
+            crate::sst::FormatType::Flat => {
                 writer
-                    .write_all_flat(flat_source, write_request.max_sequence, write_opts)
+                    .write_all_flat(write_request.source, write_request.max_sequence, write_opts)
                     .await?
             }
         };
@@ -509,12 +513,13 @@ mod tests {
     use crate::cache::test_util::{assert_parquet_metadata_equal, new_fs_store};
     use crate::cache::{CacheManager, CacheStrategy};
     use crate::error::InvalidBatchSnafu;
-    use crate::read::Source;
+    use crate::read::FlatSource;
     use crate::region::options::IndexOptions;
     use crate::sst::parquet::reader::ParquetReaderBuilder;
     use crate::test_util::TestEnv;
     use crate::test_util::sst_util::{
-        new_batch_by_range, new_source, sst_file_handle_with_file_id, sst_region_metadata,
+        new_flat_source_from_record_batches, new_record_batch_by_range,
+        sst_file_handle_with_file_id, sst_region_metadata,
     };
 
     #[tokio::test]
@@ -532,21 +537,22 @@ mod tests {
             .create_write_cache(local_store.clone(), ReadableSize::mb(10))
             .await;
 
-        // Create Source
+        // Create source.
         let metadata = Arc::new(sst_region_metadata());
         let region_id = metadata.region_id;
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
 
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata,
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: Default::default(),
             index_options: IndexOptions::default(),
             index_config: Default::default(),
@@ -636,19 +642,20 @@ mod tests {
         // Create source
         let metadata = Arc::new(sst_region_metadata());
 
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
 
         // Write to local cache and upload sst to mock remote store
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata,
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: cache_manager.clone(),
             index_options: IndexOptions::default(),
             index_config: Default::default(),
@@ -686,9 +693,15 @@ mod tests {
         .cache(CacheStrategy::EnableAll(cache_manager.clone()))
         .page_index_policy(PageIndexPolicy::Optional);
         let reader = builder.build().await.unwrap().unwrap();
+        let cached_write_parquet_metadata = crate::cache::CachedSstMeta::try_new(
+            "test.sst",
+            Arc::unwrap_or_clone(write_parquet_metadata),
+        )
+        .unwrap()
+        .parquet_metadata();
 
         // Check parquet metadata
-        assert_parquet_metadata_equal(write_parquet_metadata, reader.parquet_metadata());
+        assert_parquet_metadata_equal(cached_write_parquet_metadata, reader.parquet_metadata());
     }
 
     #[tokio::test]
@@ -715,9 +728,9 @@ mod tests {
         let metadata = Arc::new(sst_region_metadata());
 
         // Creates a source that can return an error to abort the writer.
-        let source = Source::Iter(Box::new(
+        let source = FlatSource::Iter(Box::new(
             [
-                Ok(new_batch_by_range(&["a", "d"], 0, 60)),
+                Ok(new_record_batch_by_range(&["a", "d"], 0, 60)),
                 InvalidBatchSnafu {
                     reason: "Abort the writer",
                 }
@@ -730,9 +743,10 @@ mod tests {
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata,
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: cache_manager.clone(),
             index_options: IndexOptions::default(),
             index_config: Default::default(),
diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs
index 6d51d1dd59..ba6957fdae 100644
--- a/src/mito2/src/compaction.rs
+++ b/src/mito2/src/compaction.rs
@@ -58,10 +58,10 @@ use crate::error::{
     TimeRangePredicateOverflowSnafu, TimeoutSnafu,
 };
 use crate::metrics::{COMPACTION_STAGE_ELAPSED, INFLIGHT_COMPACTION_COUNT};
+use crate::read::BoxedRecordBatchStream;
 use crate::read::projection::ProjectionMapper;
 use crate::read::scan_region::{PredicateGroup, ScanInput};
 use crate::read::seq_scan::SeqScan;
-use crate::read::{BoxedBatchReader, BoxedRecordBatchStream};
 use crate::region::options::{MergeMode, RegionOptions};
 use crate::region::version::VersionControlRef;
 use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState};
@@ -828,7 +828,7 @@ pub struct SerializedCompactionOutput {
     output_time_range: Option<TimestampRange>,
 }
 
-/// Builders to create [BoxedBatchReader] for compaction.
+/// Builders to create [BoxedRecordBatchStream] for compaction.
 struct CompactionSstReaderBuilder<'a> {
     metadata: RegionMetadataRef,
     sst_layer: AccessLayerRef,
@@ -841,24 +841,17 @@ struct CompactionSstReaderBuilder<'a> {
 }
 
 impl CompactionSstReaderBuilder<'_> {
-    /// Builds [BoxedBatchReader] that reads all SST files and yields batches in primary key order.
-    async fn build_sst_reader(self) -> Result<BoxedBatchReader> {
-        let scan_input = self.build_scan_input(false)?.with_compaction(true);
-
-        SeqScan::new(scan_input).build_reader_for_compaction().await
-    }
-
     /// Builds [BoxedRecordBatchStream] that reads all SST files and yields batches in flat format for compaction.
     async fn build_flat_sst_reader(self) -> Result<BoxedRecordBatchStream> {
-        let scan_input = self.build_scan_input(true)?.with_compaction(true);
+        let scan_input = self.build_scan_input()?.with_compaction(true);
 
         SeqScan::new(scan_input)
             .build_flat_reader_for_compaction()
             .await
     }
 
-    fn build_scan_input(self, flat_format: bool) -> Result<ScanInput> {
-        let mapper = ProjectionMapper::all(&self.metadata, flat_format)?;
+    fn build_scan_input(self) -> Result<ScanInput> {
+        let mapper = ProjectionMapper::all(&self.metadata, true)?;
         let mut scan_input = ScanInput::new(self.sst_layer, mapper)
             .with_files(self.inputs.to_vec())
             .with_append_mode(self.append_mode)
@@ -868,7 +861,7 @@ impl CompactionSstReaderBuilder<'_> {
             // We ignore file not found error during compaction.
             .with_ignore_file_not_found(true)
             .with_merge_mode(self.merge_mode)
-            .with_flat_format(flat_format);
+            .with_flat_format(true);
 
         // This serves as a workaround of https://github.com/GreptimeTeam/greptimedb/issues/3944
         // by converting time ranges into predicate.
diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs
index 1876972b0d..b03e6415e8 100644
--- a/src/mito2/src/compaction/compactor.rs
+++ b/src/mito2/src/compaction/compactor.rs
@@ -43,7 +43,7 @@ use crate::error::{
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
 use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
 use crate::metrics;
-use crate::read::{FlatSource, Source};
+use crate::read::FlatSource;
 use crate::region::options::RegionOptions;
 use crate::region::version::VersionRef;
 use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState};
@@ -356,13 +356,8 @@ impl DefaultCompactor {
             time_range: output.output_time_range,
             merge_mode,
         };
-        let source = if flat_format {
-            let reader = builder.build_flat_sst_reader().await?;
-            Either::Right(FlatSource::Stream(reader))
-        } else {
-            let reader = builder.build_sst_reader().await?;
-            Either::Left(Source::Reader(reader))
-        };
+        let reader = builder.build_flat_sst_reader().await?;
+        let source = FlatSource::Stream(reader);
         let mut metrics = Metrics::new(WriteType::Compaction);
         let region_metadata = compaction_region.region_metadata.clone();
         let sst_infos = compaction_region
@@ -375,6 +370,11 @@ impl DefaultCompactor {
                     cache_manager: compaction_region.cache_manager.clone(),
                     storage,
                     max_sequence: max_sequence.map(NonZero::get),
+                    sst_write_format: if flat_format {
+                        FormatType::Flat
+                    } else {
+                        FormatType::PrimaryKey
+                    },
                     index_options,
                     index_config,
                     inverted_index_config,
diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs
index 602f5508ba..0eee067ab6 100644
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -116,6 +116,8 @@ pub struct MitoConfig {
     pub page_cache_size: ReadableSize,
     /// Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
     pub selector_result_cache_size: ReadableSize,
+    /// Cache size for flat range scan results. Setting it to 0 to disable the cache.
+    pub range_result_cache_size: ReadableSize,
     /// Whether to enable the write cache.
     pub enable_write_cache: bool,
     /// File system path for write cache dir's root, defaults to `{data_home}`.
@@ -200,6 +202,7 @@ impl Default for MitoConfig {
             vector_cache_size: ReadableSize::mb(512),
             page_cache_size: ReadableSize::mb(512),
             selector_result_cache_size: ReadableSize::mb(512),
+            range_result_cache_size: ReadableSize::mb(512),
             enable_write_cache: false,
             write_cache_path: String::new(),
             write_cache_size: ReadableSize::gb(5),
@@ -336,6 +339,7 @@ impl MitoConfig {
         self.vector_cache_size = mem_cache_size;
         self.page_cache_size = page_cache_size;
         self.selector_result_cache_size = mem_cache_size;
+        self.range_result_cache_size = mem_cache_size;
 
         self.index.adjust_buffer_and_cache_size(sys_memory);
     }
diff --git a/src/mito2/src/engine/row_selector_test.rs b/src/mito2/src/engine/row_selector_test.rs
index 317ede5a97..d79152e57f 100644
--- a/src/mito2/src/engine/row_selector_test.rs
+++ b/src/mito2/src/engine/row_selector_test.rs
@@ -24,7 +24,7 @@ use crate::test_util::{
     CreateRequestBuilder, TestEnv, build_rows_for_key, flush_region, put_rows, rows_schema,
 };
 
-async fn test_last_row(append_mode: bool) {
+async fn test_last_row(append_mode: bool, flat_format: bool) {
     let mut env = TestEnv::new().await;
     let engine = env.create_engine(MitoConfig::default()).await;
     let region_id = RegionId::new(1, 1);
@@ -39,9 +39,12 @@ async fn test_last_row(append_mode: bool) {
             env.get_kv_backend(),
         )
         .await;
-    let request = CreateRequestBuilder::new()
-        .insert_option("append_mode", &append_mode.to_string())
-        .build();
+    let mut request_builder =
+        CreateRequestBuilder::new().insert_option("append_mode", &append_mode.to_string());
+    if flat_format {
+        request_builder = request_builder.insert_option("sst_format", "flat");
+    }
+    let request = request_builder.build();
     let column_schemas = rows_schema(&request);
     engine
         .handle_request(region_id, RegionRequest::Create(request))
@@ -106,10 +109,20 @@ async fn test_last_row(append_mode: bool) {
 
 #[tokio::test]
 async fn test_last_row_append_mode_disabled() {
-    test_last_row(false).await;
+    test_last_row(false, false).await;
 }
 
 #[tokio::test]
 async fn test_last_row_append_mode_enabled() {
-    test_last_row(true).await;
+    test_last_row(true, false).await;
+}
+
+#[tokio::test]
+async fn test_last_row_flat_format_append_mode_disabled() {
+    test_last_row(false, true).await;
+}
+
+#[tokio::test]
+async fn test_last_row_flat_format_append_mode_enabled() {
+    test_last_row(true, true).await;
 }
diff --git a/src/mito2/src/engine/skip_wal_test.rs b/src/mito2/src/engine/skip_wal_test.rs
index d1b38c47fb..c59be6ba2c 100644
--- a/src/mito2/src/engine/skip_wal_test.rs
+++ b/src/mito2/src/engine/skip_wal_test.rs
@@ -15,7 +15,9 @@
 use api::v1::Rows;
 use common_wal::options::{WAL_OPTIONS_KEY, WalOptions};
 use store_api::region_engine::{RegionEngine, RegionRole};
-use store_api::region_request::{RegionCloseRequest, RegionRequest};
+use store_api::region_request::{
+    RegionCloseRequest, RegionOpenRequest, RegionRequest, RegionTruncateRequest,
+};
 use store_api::storage::{RegionId, ScanRequest};
 
 use crate::config::MitoConfig;
@@ -168,3 +170,76 @@ async fn test_close_follower_region_skip_wal() {
     let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
     assert_eq!(0, total_rows);
 }
+
+#[tokio::test]
+async fn test_close_region_after_truncate_skip_wal() {
+    common_telemetry::init_default_ut_logging();
+    let mut env = TestEnv::with_prefix("close-truncate-skip-wal").await;
+    let engine = env.create_engine(MitoConfig::default()).await;
+
+    let region_id = RegionId::new(1, 1);
+    let mut request = CreateRequestBuilder::new().build();
+    let wal_options = WalOptions::Noop;
+    request.options.insert(
+        WAL_OPTIONS_KEY.to_string(),
+        serde_json::to_string(&wal_options).unwrap(),
+    );
+
+    engine
+        .handle_request(region_id, RegionRequest::Create(request.clone()))
+        .await
+        .unwrap();
+
+    engine
+        .handle_request(
+            region_id,
+            RegionRequest::Truncate(RegionTruncateRequest::All),
+        )
+        .await
+        .unwrap();
+
+    let region = engine.get_region(region_id).unwrap();
+    let version_data = region.version_control.current();
+    assert_eq!(
+        version_data.version.truncated_entry_id,
+        Some(version_data.last_entry_id)
+    );
+
+    let rows = Rows {
+        schema: rows_schema(&request),
+        rows: build_rows(0, 3),
+    };
+    put_rows(&engine, region_id, rows).await;
+
+    let region = engine.get_region(region_id).unwrap();
+    assert!(!region.version().memtables.is_empty());
+
+    engine
+        .handle_request(region_id, RegionRequest::Close(RegionCloseRequest {}))
+        .await
+        .unwrap();
+
+    engine
+        .handle_request(
+            region_id,
+            RegionRequest::Open(RegionOpenRequest {
+                engine: String::new(),
+                table_dir: request.table_dir,
+                path_type: store_api::region_request::PathType::Bare,
+                options: request.options,
+                skip_wal_replay: false,
+                checkpoint: None,
+            }),
+        )
+        .await
+        .unwrap();
+    let stream = engine
+        .scan_to_stream(region_id, ScanRequest::default())
+        .await
+        .unwrap();
+    let batches = common_recordbatch::RecordBatches::try_collect(stream)
+        .await
+        .unwrap();
+    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+    assert_eq!(3, total_rows);
+}
diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs
index 923d8a2713..c6b69fe607 100644
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -616,15 +616,6 @@ pub enum Error {
         location: Location,
     },
 
-    #[snafu(display("Failed to read arrow record batch from parquet file {}", path))]
-    ArrowReader {
-        path: String,
-        #[snafu(source)]
-        error: ArrowError,
-        #[snafu(implicit)]
-        location: Location,
-    },
-
     #[snafu(display("Column not found, column: {column}"))]
     ColumnNotFound {
         column: String,
@@ -1349,7 +1340,6 @@ impl ErrorExt for Error {
             RegionState { .. } | UpdateManifest { .. } => StatusCode::RegionNotReady,
             JsonOptions { .. } => StatusCode::InvalidArguments,
             EmptyRegionDir { .. } | EmptyManifestDir { .. } => StatusCode::RegionNotFound,
-            ArrowReader { .. } => StatusCode::StorageUnavailable,
             ConvertValue { source, .. } => source.status_code(),
             ApplyBloomFilterIndex { source, .. } => source.status_code(),
             InvalidPartitionExpr { source, .. } => source.status_code(),
diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs
index 0c16544b6e..fedac95d27 100644
--- a/src/mito2/src/flush.rs
+++ b/src/mito2/src/flush.rs
@@ -22,7 +22,6 @@ use std::time::Instant;
 
 use common_telemetry::{debug, error, info};
 use datatypes::arrow::datatypes::SchemaRef;
-use either::Either;
 use partition::expr::PartitionExpr;
 use smallvec::{SmallVec, smallvec};
 use snafu::ResultExt;
@@ -41,18 +40,14 @@ use crate::error::{
 };
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
 use crate::memtable::bulk::ENCODE_ROW_THRESHOLD;
-use crate::memtable::{
-    BoxedRecordBatchIterator, EncodedRange, IterBuilder, MemtableRanges, RangesOptions,
-};
+use crate::memtable::{BoxedRecordBatchIterator, EncodedRange, MemtableRanges, RangesOptions};
 use crate::metrics::{
     FLUSH_BYTES_TOTAL, FLUSH_ELAPSED, FLUSH_FAILURE_TOTAL, FLUSH_FILE_TOTAL, FLUSH_REQUESTS_TOTAL,
     INFLIGHT_FLUSH_COUNT,
 };
-use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
+use crate::read::FlatSource;
 use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow};
 use crate::read::flat_merge::FlatMergeIterator;
-use crate::read::merge::MergeReaderBuilder;
-use crate::read::{FlatSource, Source};
 use crate::region::options::{IndexOptions, MergeMode, RegionOptions};
 use crate::region::version::{VersionControlData, VersionControlRef, VersionRef};
 use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState, parse_partition_expr};
@@ -62,8 +57,10 @@ use crate::request::{
 };
 use crate::schedule::scheduler::{Job, SchedulerRef};
 use crate::sst::file::FileMeta;
-use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions};
-use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
+use crate::sst::parquet::{
+    DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE, SstInfo, WriteOptions, flat_format,
+};
+use crate::sst::{FlatSchemaOptions, FormatType, to_flat_sst_arrow_schema};
 use crate::worker::WorkerListener;
 
 /// Global write buffer (memtable) manager.
@@ -480,78 +477,29 @@ impl RegionFlushTask {
             // the counter may have more series than the actual series count.
             series_count += memtable_series_count;
 
-            if mem_ranges.is_record_batch() {
-                let flush_start = Instant::now();
-                let FlushFlatMemResult {
-                    num_encoded,
-                    num_sources,
-                    results,
-                } = self
-                    .flush_flat_mem_ranges(version, &write_opts, mem_ranges)
-                    .await?;
-                encoded_part_count += num_encoded;
-                for (source_idx, result) in results.into_iter().enumerate() {
-                    let (max_sequence, ssts_written, metrics) = result?;
-                    if ssts_written.is_empty() {
-                        // No data written.
-                        continue;
-                    }
-
-                    common_telemetry::debug!(
-                        "Region {} flush one memtable {} {}/{}, metrics: {:?}",
-                        self.region_id,
-                        memtable_id,
-                        source_idx,
-                        num_sources,
-                        metrics
-                    );
-
-                    flush_metrics = flush_metrics.merge(metrics);
-
-                    file_metas.extend(ssts_written.into_iter().map(|sst_info| {
-                        flushed_bytes += sst_info.file_size;
-                        Self::new_file_meta(
-                            self.region_id,
-                            max_sequence,
-                            sst_info,
-                            partition_expr.clone(),
-                        )
-                    }));
-                }
-
-                common_telemetry::debug!(
-                    "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}",
-                    self.region_id,
-                    num_sources,
-                    memtable_id,
-                    num_mem_ranges,
-                    num_encoded,
-                    num_mem_rows,
-                    flush_start.elapsed(),
-                    compact_cost,
-                );
-            } else {
-                let max_sequence = mem_ranges.max_sequence();
-                let source = memtable_source(mem_ranges, &version.options).await?;
-
-                // Flush to level 0.
-                let source = Either::Left(source);
-                let write_request = self.new_write_request(version, max_sequence, source);
-
-                let mut metrics = Metrics::new(WriteType::Flush);
-                let ssts_written = self
-                    .access_layer
-                    .write_sst(write_request, &write_opts, &mut metrics)
-                    .await?;
-                FLUSH_FILE_TOTAL.inc_by(ssts_written.len() as u64);
+            let flush_start = Instant::now();
+            let FlushFlatMemResult {
+                num_encoded,
+                num_sources,
+                results,
+            } = self
+                .flush_flat_mem_ranges(version, &write_opts, mem_ranges)
+                .await?;
+            encoded_part_count += num_encoded;
+            for (source_idx, result) in results.into_iter().enumerate() {
+                let (max_sequence, ssts_written, metrics) = result?;
                 if ssts_written.is_empty() {
                     // No data written.
                     continue;
                 }
 
-                debug!(
-                    "Region {} flush one memtable, num_mem_ranges: {}, num_rows: {}, metrics: {:?}",
-                    self.region_id, num_mem_ranges, num_mem_rows, metrics
+                common_telemetry::debug!(
+                    "Region {} flush one memtable {} {}/{}, metrics: {:?}",
+                    self.region_id,
+                    memtable_id,
+                    source_idx,
+                    num_sources,
+                    metrics
                 );
 
                 flush_metrics = flush_metrics.merge(metrics);
@@ -565,7 +513,19 @@ impl RegionFlushTask {
                         partition_expr.clone(),
                     )
                 }));
-            };
+            }
+
+            common_telemetry::debug!(
+                "Region {} flush {} memtables for {}, num_mem_ranges: {}, num_encoded: {}, num_rows: {}, flush_cost: {:?}, compact_cost: {:?}",
+                self.region_id,
+                num_sources,
+                memtable_id,
+                num_mem_ranges,
+                num_encoded,
+                num_mem_rows,
+                flush_start.elapsed(),
+                compact_cost,
+            );
         }
 
         Ok(DoFlushMemtablesResult {
@@ -587,16 +547,17 @@ impl RegionFlushTask {
             &version.metadata,
             &FlatSchemaOptions::from_encoding(version.metadata.primary_key_encoding),
         );
+        let field_column_start =
+            flat_format::field_column_start(&version.metadata, batch_schema.fields().len());
         let flat_sources = memtable_flat_sources(
             batch_schema,
             mem_ranges,
             &version.options,
-            version.metadata.primary_key.len(),
+            field_column_start,
         )?;
         let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len());
         let num_encoded = flat_sources.encoded.len();
         for (source, max_sequence) in flat_sources.sources {
-            let source = Either::Right(source);
             let write_request = self.new_write_request(version, max_sequence, source);
             let access_layer = self.access_layer.clone();
             let write_opts = write_opts.clone();
@@ -667,8 +628,13 @@ impl RegionFlushTask {
         &self,
         version: &VersionRef,
         max_sequence: u64,
-        source: Either<Source, FlatSource>,
+        source: FlatSource,
     ) -> SstWriteRequest {
+        let flat_format = version
+            .options
+            .sst_format
+            .map(|f| f == FormatType::Flat)
+            .unwrap_or(self.engine_config.default_experimental_flat_format);
         SstWriteRequest {
             op_type: OperationType::Flush,
             metadata: version.metadata.clone(),
@@ -676,6 +642,11 @@ impl RegionFlushTask {
             cache_manager: self.cache_manager.clone(),
             storage: version.options.storage.clone(),
             max_sequence: Some(max_sequence),
+            sst_write_format: if flat_format {
+                FormatType::Flat
+            } else {
+                FormatType::PrimaryKey
+            },
             index_options: self.index_options.clone(),
             index_config: self.engine_config.index.clone(),
             inverted_index_config: self.engine_config.inverted_index.clone(),
@@ -722,41 +693,6 @@ struct DoFlushMemtablesResult {
     flush_metrics: Metrics,
 }
 
-/// Returns a [Source] for the given memtable.
-async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) -> Result<Source> {
-    let source = if mem_ranges.ranges.len() == 1 {
-        let only_range = mem_ranges.ranges.into_values().next().unwrap();
-        let iter = only_range.build_iter()?;
-        Source::Iter(iter)
-    } else {
-        // todo(hl): a workaround since sync version of MergeReader is wip.
-        let sources = mem_ranges
-            .ranges
-            .into_values()
-            .map(|r| r.build_iter().map(Source::Iter))
-            .collect::<Result<Vec<_>>>()?;
-        let merge_reader = MergeReaderBuilder::from_sources(sources).build().await?;
-        let maybe_dedup = if options.append_mode {
-            // no dedup in append mode
-            Box::new(merge_reader) as _
-        } else {
-            // dedup according to merge mode
-            match options.merge_mode.unwrap_or(MergeMode::LastRow) {
-                MergeMode::LastRow => {
-                    Box::new(DedupReader::new(merge_reader, LastRow::new(false), None)) as _
-                }
-                MergeMode::LastNonNull => Box::new(DedupReader::new(
-                    merge_reader,
-                    LastNonNull::new(false),
-                    None,
-                )) as _,
-            }
-        };
-        Source::Reader(maybe_dedup)
-    };
-    Ok(source)
-}
-
 struct FlatSources {
     sources: SmallVec<[(FlatSource, SequenceNumber); 4]>,
     encoded: SmallVec<[(EncodedRange, SequenceNumber); 4]>,
diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs
index c39bbfa346..3ebfdd3628 100644
--- a/src/mito2/src/memtable.rs
+++ b/src/mito2/src/memtable.rs
@@ -28,6 +28,7 @@ use mito_codec::key_values::KeyValue;
 pub use mito_codec::key_values::KeyValues;
 use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
 use serde::{Deserialize, Serialize};
+use snafu::ensure;
 use store_api::metadata::RegionMetadataRef;
 use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
 
@@ -231,10 +232,17 @@ impl MemtableRanges {
 
 impl IterBuilder for MemtableRanges {
     fn build(&self, _metrics: Option<MemScanMetrics>) -> Result<BoxedBatchIterator> {
-        UnsupportedOperationSnafu {
-            err_msg: "MemtableRanges does not support build iterator",
-        }
-        .fail()
+        ensure!(
+            self.ranges.len() == 1,
+            UnsupportedOperationSnafu {
+                err_msg: format!(
+                    "Building an iterator from MemtableRanges expects 1 range, but got {}",
+                    self.ranges.len()
+                ),
+            }
+        );
+        // The check above guarantees exactly one range, so this `unwrap` cannot panic.
+        self.ranges.values().next().unwrap().build_iter()
     }
 
     fn is_record_batch(&self) -> bool {
@@ -256,20 +264,6 @@ pub trait Memtable: Send + Sync + fmt::Debug {
     /// Writes an encoded batch of into memtable.
     fn write_bulk(&self, part: crate::memtable::bulk::part::BulkPart) -> Result<()>;
 
-    /// Scans the memtable.
-    /// `projection` selects columns to read, `None` means reading all columns.
-    /// `filters` are the predicates to be pushed down to memtable.
-    ///
-    /// # Note
-    /// This method should only be used for tests.
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        predicate: Option<table::predicate::Predicate>,
-        sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator>;
-
     /// Returns the ranges in the memtable.
     ///
     /// The returned map contains the range id and the range after applying the predicate.
@@ -543,11 +537,15 @@ pub trait IterBuilder: Send + Sync {
     }
 
     /// Returns the record batch iterator to read the range.
+    /// ## Note
+    /// Implementations should ensure the iterator only yields data within the given time range.
     fn build_record_batch(
         &self,
+        time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         let _metrics = metrics;
+        let _ = time_range;
         UnsupportedOperationSnafu {
             err_msg: "Record batch iterator is not supported by this memtable",
         }
@@ -706,7 +704,7 @@ impl MemtableRange {
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         if self.context.builder.is_record_batch() {
-            return self.context.builder.build_record_batch(metrics);
+            return self.context.builder.build_record_batch(time_range, metrics);
         }
 
         if let Some(context) = self.context.batch_to_record_batch.as_ref() {
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs
index cf2ced06fe..502b61759d 100644
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -14,6 +14,7 @@
 
 //! Memtable implementation for bulk load
 
+pub(crate) mod chunk_reader;
 #[allow(unused)]
 pub mod context;
 #[allow(unused)]
@@ -34,6 +35,7 @@ fn env_usize(name: &str, default: usize) -> usize {
         .unwrap_or(default)
 }
 
+use common_time::Timestamp;
 use datatypes::arrow::datatypes::SchemaRef;
 use mito_codec::key_values::KeyValue;
 use rayon::prelude::*;
@@ -57,7 +59,7 @@ use crate::memtable::{
 use crate::read::flat_dedup::{FlatDedupIterator, FlatLastNonNull, FlatLastRow};
 use crate::read::flat_merge::FlatMergeIterator;
 use crate::region::options::MergeMode;
-use crate::sst::parquet::format::FIXED_POS_COLUMN_NUM;
+use crate::sst::parquet::flat_format::field_column_start;
 use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, DEFAULT_ROW_GROUP_SIZE};
 use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
 
@@ -462,16 +464,6 @@ impl Memtable for BulkMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        _projection: Option<&[ColumnId]>,
-        _predicate: Option<table::predicate::Predicate>,
-        _sequence: Option<SequenceRange>,
-    ) -> Result<crate::memtable::BoxedBatchIterator> {
-        todo!()
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -802,6 +794,7 @@ impl IterBuilder for BulkRangeIterBuilder {
 
     fn build_record_batch(
         &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         let series_count = self.part.estimated_series_count();
@@ -835,6 +828,7 @@ impl IterBuilder for MultiBulkRangeIterBuilder {
 
     fn build_record_batch(
         &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         self.part
@@ -874,6 +868,7 @@ impl IterBuilder for EncodedBulkRangeIterBuilder {
 
     fn build_record_batch(
         &self,
+        _time_range: Option<(Timestamp, Timestamp)>,
         metrics: Option<MemScanMetrics>,
     ) -> Result<BoxedRecordBatchIterator> {
         if let Some(iter) = self
@@ -1186,13 +1181,8 @@ impl MemtableCompactor {
                     Box::new(dedup_iter)
                 }
                 MergeMode::LastNonNull => {
-                    // Calculates field column start: total columns - fixed columns - field columns
-                    // Field column count = total metadata columns - time index column - primary key columns
-                    let field_column_count =
-                        metadata.column_metadatas.len() - 1 - metadata.primary_key.len();
-                    let total_columns = arrow_schema.fields().len();
                     let field_column_start =
-                        total_columns - FIXED_POS_COLUMN_NUM - field_column_count;
+                        field_column_start(metadata, arrow_schema.fields().len());
 
                     let dedup_iter = FlatDedupIterator::new(
                         merged_iter,
diff --git a/src/mito2/src/memtable/bulk/chunk_reader.rs b/src/mito2/src/memtable/bulk/chunk_reader.rs
new file mode 100644
index 0000000000..e632cd1b37
--- /dev/null
+++ b/src/mito2/src/memtable/bulk/chunk_reader.rs
@@ -0,0 +1,82 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! ChunkReader implementation for in-memory parquet bytes.
+
+use std::io::Cursor;
+
+use bytes::Bytes;
+use parquet::errors::{ParquetError, Result};
+use parquet::file::reader::{ChunkReader, Length};
+
+/// A [ChunkReader] implementation for in-memory parquet bytes.
+///
+/// This provides byte access to parquet data stored in memory (Bytes),
+/// used for reading parquet data from bulk memtable.
+#[derive(Clone)]
+pub struct MemtableChunkReader {
+    /// The in-memory parquet data.
+    data: Bytes,
+}
+
+impl MemtableChunkReader {
+    /// Creates a new [MemtableChunkReader] from the given bytes.
+    pub fn new(data: Bytes) -> Self {
+        Self { data }
+    }
+
+    /// Converts a parquet byte offset into an in-memory index.
+    ///
+    /// Offsets that do not fit in `usize` are clamped to `usize::MAX`, so they
+    /// fail the callers' bounds checks instead of being silently truncated.
+    fn offset_to_index(offset: u64) -> usize {
+        usize::try_from(offset).unwrap_or(usize::MAX)
+    }
+}
+
+impl Length for MemtableChunkReader {
+    /// Returns the total size of the in-memory data in bytes.
+    fn len(&self) -> u64 {
+        self.data.len() as u64
+    }
+}
+
+impl ChunkReader for MemtableChunkReader {
+    type T = Cursor<Bytes>;
+
+    /// Returns a cursor over the data from `start` to the end.
+    ///
+    /// Fails with [ParquetError::IndexOutOfBound] if `start` is past the end of the data.
+    fn get_read(&self, start: u64) -> Result<Self::T> {
+        let start = Self::offset_to_index(start);
+        if start > self.data.len() {
+            return Err(ParquetError::IndexOutOfBound(start, self.data.len()));
+        }
+        Ok(Cursor::new(self.data.slice(start..)))
+    }
+
+    /// Returns the `length` bytes starting at `start` as a slice of the underlying buffer.
+    ///
+    /// Fails with [ParquetError::IndexOutOfBound] if the range exceeds the data.
+    fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
+        let start = Self::offset_to_index(start);
+        // Saturate so that an overflowing range is rejected by the bounds check below
+        // rather than wrapping around (or panicking in debug builds).
+        let end = start.saturating_add(length);
+        if end > self.data.len() {
+            return Err(ParquetError::IndexOutOfBound(end, self.data.len()));
+        }
+        Ok(self.data.slice(start..end))
+    }
+}
diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs
index 71e49776c0..bf345c038e 100644
--- a/src/mito2/src/memtable/bulk/part.rs
+++ b/src/mito2/src/memtable/bulk/part.rs
@@ -967,7 +967,7 @@ impl EncodedBulkPart {
         Self { data, metadata }
     }
 
-    pub(crate) fn metadata(&self) -> &BulkPartMeta {
+    pub fn metadata(&self) -> &BulkPartMeta {
         &self.metadata
     }
 
@@ -977,7 +977,7 @@ impl EncodedBulkPart {
     }
 
     /// Returns the encoded data.
-    pub(crate) fn data(&self) -> &Bytes {
+    pub fn data(&self) -> &Bytes {
         &self.data
     }
 
@@ -1121,10 +1121,7 @@ pub struct BulkPartEncoder {
 }
 
 impl BulkPartEncoder {
-    pub(crate) fn new(
-        metadata: RegionMetadataRef,
-        row_group_size: usize,
-    ) -> Result<BulkPartEncoder> {
+    pub fn new(metadata: RegionMetadataRef, row_group_size: usize) -> Result<BulkPartEncoder> {
         // TODO(yingwen): Skip arrow schema if needed.
         let json = metadata.to_json().context(InvalidMetadataSnafu)?;
         let key_value_meta =
@@ -1216,7 +1213,7 @@ impl BulkPartEncoder {
     }
 
     /// Encodes bulk part to a [EncodedBulkPart], returns the encoded data.
-    fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
+    pub fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
         if part.batch.num_rows() == 0 {
             return Ok(None);
         }
diff --git a/src/mito2/src/memtable/bulk/part_reader.rs b/src/mito2/src/memtable/bulk/part_reader.rs
index 1e9d955321..edb9ff52d9 100644
--- a/src/mito2/src/memtable/bulk/part_reader.rs
+++ b/src/mito2/src/memtable/bulk/part_reader.rs
@@ -30,7 +30,6 @@ use crate::memtable::{MemScanMetrics, MemScanMetricsData};
 use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
 use crate::sst::parquet::file_range::{PreFilterMode, TagDecodeState};
 use crate::sst::parquet::flat_format::sequence_column_index;
-use crate::sst::parquet::reader::RowGroupReaderContext;
 
 /// Iterator for reading data inside a bulk part.
 pub struct EncodedBulkPartIter {
@@ -50,7 +49,7 @@ pub struct EncodedBulkPartIter {
 
 impl EncodedBulkPartIter {
     /// Creates a new [BulkPartIter].
-    pub(crate) fn try_new(
+    pub fn try_new(
         encoded_part: &EncodedBulkPart,
         context: BulkIterContextRef,
         mut row_groups_to_read: VecDeque<usize>,
diff --git a/src/mito2/src/memtable/bulk/row_group_reader.rs b/src/mito2/src/memtable/bulk/row_group_reader.rs
index fccd22db10..40a5b2f85d 100644
--- a/src/mito2/src/memtable/bulk/row_group_reader.rs
+++ b/src/mito2/src/memtable/bulk/row_group_reader.rs
@@ -12,124 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::ops::Range;
 use std::sync::Arc;
 
 use bytes::Bytes;
-use datatypes::arrow::array::RecordBatch;
-use datatypes::arrow::error::ArrowError;
-use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowGroups, RowSelection};
-use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels};
-use parquet::column::page::{PageIterator, PageReader};
-use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
+use parquet::arrow::ProjectionMask;
+use parquet::arrow::arrow_reader::{
+    ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader,
+    ParquetRecordBatchReaderBuilder, RowSelection,
+};
+use parquet::file::metadata::ParquetMetaData;
 use snafu::ResultExt;
 
 use crate::error;
 use crate::error::ReadDataPartSnafu;
+use crate::memtable::bulk::chunk_reader::MemtableChunkReader;
 use crate::memtable::bulk::context::BulkIterContextRef;
 use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
-use crate::sst::parquet::format::ReadFormat;
-use crate::sst::parquet::reader::RowGroupReaderContext;
-use crate::sst::parquet::row_group::{ColumnChunkIterator, RowGroupBase};
-
-/// Helper for reading specific row group inside Memtable Parquet parts.
-// This is similar to [mito2::sst::parquet::row_group::InMemoryRowGroup] since
-// it's a workaround for lacking of keyword generics.
-pub struct MemtableRowGroupPageFetcher<'a> {
-    /// Shared structs for reading row group.
-    base: RowGroupBase<'a>,
-    bytes: Bytes,
-}
-
-impl<'a> MemtableRowGroupPageFetcher<'a> {
-    pub(crate) fn create(
-        row_group_idx: usize,
-        parquet_meta: &'a ParquetMetaData,
-        bytes: Bytes,
-    ) -> Self {
-        Self {
-            // the cached `column_uncompressed_pages` would never be used in Memtable readers.
-            base: RowGroupBase::new(parquet_meta, row_group_idx),
-            bytes,
-        }
-    }
-
-    /// Fetches column pages from memory file.
-    pub(crate) fn fetch(&mut self, projection: &ProjectionMask, selection: Option<&RowSelection>) {
-        if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) {
-            // Selection provided.
-            let (fetch_ranges, page_start_offsets) =
-                self.base
-                    .calc_sparse_read_ranges(projection, offset_index, selection);
-            if fetch_ranges.is_empty() {
-                return;
-            }
-            let chunk_data = self.fetch_bytes(&fetch_ranges);
-
-            self.base
-                .assign_sparse_chunk(projection, chunk_data, page_start_offsets);
-        } else {
-            let fetch_ranges = self.base.calc_dense_read_ranges(projection);
-            if fetch_ranges.is_empty() {
-                // Nothing to fetch.
-                return;
-            }
-            let chunk_data = self.fetch_bytes(&fetch_ranges);
-            self.base.assign_dense_chunk(projection, chunk_data);
-        }
-    }
-
-    fn fetch_bytes(&self, ranges: &[Range<u64>]) -> Vec<Bytes> {
-        ranges
-            .iter()
-            .map(|range| self.bytes.slice(range.start as usize..range.end as usize))
-            .collect()
-    }
-
-    /// Creates a page reader to read column at `i`.
-    fn column_page_reader(&self, i: usize) -> parquet::errors::Result<Box<dyn PageReader>> {
-        let reader = self.base.column_reader(i)?;
-        Ok(Box::new(reader))
-    }
-}
-
-impl RowGroups for MemtableRowGroupPageFetcher<'_> {
-    fn num_rows(&self) -> usize {
-        self.base.row_count
-    }
-
-    fn column_chunks(&self, i: usize) -> parquet::errors::Result<Box<dyn PageIterator>> {
-        Ok(Box::new(ColumnChunkIterator {
-            reader: Some(self.column_page_reader(i)),
-        }))
-    }
-
-    fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_> {
-        Box::new(std::iter::once(self.base.row_group_metadata()))
-    }
-
-    fn metadata(&self) -> &ParquetMetaData {
-        self.base.parquet_metadata()
-    }
-}
-
-impl RowGroupReaderContext for BulkIterContextRef {
-    fn map_result(
-        &self,
-        result: Result<Option<RecordBatch>, ArrowError>,
-    ) -> error::Result<Option<RecordBatch>> {
-        result.context(error::DecodeArrowRowGroupSnafu)
-    }
-
-    fn read_format(&self) -> &ReadFormat {
-        self.as_ref().read_format()
-    }
-}
 
 pub(crate) struct MemtableRowGroupReaderBuilder {
     projection: ProjectionMask,
     parquet_metadata: Arc<ParquetMetaData>,
-    field_levels: FieldLevels,
+    arrow_metadata: ArrowReaderMetadata,
     data: Bytes,
 }
 
@@ -140,15 +43,16 @@ impl MemtableRowGroupReaderBuilder {
         parquet_metadata: Arc<ParquetMetaData>,
         data: Bytes,
     ) -> error::Result<Self> {
-        let parquet_schema_desc = parquet_metadata.file_metadata().schema_descr();
-        let hint = Some(context.read_format().arrow_schema().fields());
-        let field_levels =
-            parquet_to_arrow_field_levels(parquet_schema_desc, projection.clone(), hint)
+        // Create ArrowReaderMetadata for building the reader.
+        let arrow_reader_options =
+            ArrowReaderOptions::new().with_schema(context.read_format().arrow_schema().clone());
+        let arrow_metadata =
+            ArrowReaderMetadata::try_new(parquet_metadata.clone(), arrow_reader_options)
                 .context(ReadDataPartSnafu)?;
         Ok(Self {
             projection,
             parquet_metadata,
-            field_levels,
+            arrow_metadata,
             data,
         })
     }
@@ -159,23 +63,21 @@ impl MemtableRowGroupReaderBuilder {
         row_group_idx: usize,
         row_selection: Option<RowSelection>,
     ) -> error::Result<ParquetRecordBatchReader> {
-        let mut row_group = MemtableRowGroupPageFetcher::create(
-            row_group_idx,
-            &self.parquet_metadata,
-            self.data.clone(),
-        );
-        // Fetches data from memory part. Currently, row selection is not supported.
-        row_group.fetch(&self.projection, row_selection.as_ref());
+        let chunk_reader = MemtableChunkReader::new(self.data.clone());
 
-        // Builds the parquet reader.
-        // Now the row selection is None.
-        ParquetRecordBatchReader::try_new_with_row_groups(
-            &self.field_levels,
-            &row_group,
-            DEFAULT_READ_BATCH_SIZE,
-            row_selection,
+        let mut builder = ParquetRecordBatchReaderBuilder::new_with_metadata(
+            chunk_reader,
+            self.arrow_metadata.clone(),
         )
-        .context(ReadDataPartSnafu)
+        .with_row_groups(vec![row_group_idx])
+        .with_projection(self.projection.clone())
+        .with_batch_size(DEFAULT_READ_BATCH_SIZE);
+
+        if let Some(selection) = row_selection {
+            builder = builder.with_row_selection(selection);
+        }
+
+        builder.build().context(ReadDataPartSnafu)
     }
 
     /// Computes whether to skip field filters for a specific row group based on PreFilterMode.
diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs
index febae46784..662bfd99f6 100644
--- a/src/mito2/src/memtable/partition_tree.rs
+++ b/src/mito2/src/memtable/partition_tree.rs
@@ -177,16 +177,6 @@ impl Memtable for PartitionTreeMemtable {
         .fail()
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        predicate: Option<Predicate>,
-        sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator> {
-        self.tree.read(projection, predicate, sequence, None)
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -396,8 +386,6 @@ mod tests {
     use api::v1::{Mutation, OpType, Rows, SemanticType};
     use common_query::prelude::{greptime_timestamp, greptime_value};
     use common_time::Timestamp;
-    use datafusion_common::Column;
-    use datafusion_expr::{BinaryExpr, Expr, Literal, Operator};
     use datatypes::data_type::ConcreteDataType;
     use datatypes::prelude::Vector;
     use datatypes::scalars::ScalarVector;
@@ -548,7 +536,10 @@ mod tests {
         let expect = (0..100).collect::<Vec<_>>();
         let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
         memtable.write(&kvs).unwrap();
-        let iter = memtable.iter(Some(&[3]), None, None).unwrap();
+        let ranges = memtable
+            .ranges(Some(&[3]), RangesOptions::default())
+            .unwrap();
+        let iter = ranges.build(None).unwrap();
 
         let mut v0_all = vec![];
         for res in iter {
@@ -625,41 +616,6 @@ mod tests {
         assert_eq!(expect, read);
     }
 
-    #[test]
-    fn test_memtable_filter() {
-        let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![0, 1], false));
-        // Try to build a memtable via the builder.
-        let memtable = PartitionTreeMemtableBuilder::new(
-            PartitionTreeConfig {
-                index_max_keys_per_shard: 40,
-                ..Default::default()
-            },
-            None,
-        )
-        .build(1, &metadata);
-
-        for i in 0..100 {
-            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
-            let kvs =
-                memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
-            memtable.write(&kvs).unwrap();
-        }
-
-        for i in 0..100 {
-            let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
-            let expr = Expr::BinaryExpr(BinaryExpr {
-                left: Box::new(Expr::Column(Column::from_name("k1"))),
-                op: Operator::Eq,
-                right: Box::new((i as u32).lit()),
-            });
-            let iter = memtable
-                .iter(None, Some(Predicate::new(vec![expr])), None)
-                .unwrap();
-            let read = collect_iter_timestamps(iter);
-            assert_eq!(timestamps, read);
-        }
-    }
-
     #[test]
     fn test_deserialize_config() {
         let config = PartitionTreeConfig {
@@ -811,7 +767,11 @@ mod tests {
             ))
             .unwrap();
 
-        let mut reader = new_memtable.iter(None, None, None).unwrap();
+        let mut reader = new_memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = reader.next().unwrap().unwrap();
         let pk = codec.decode(batch.primary_key()).unwrap().into_dense();
         if let Value::String(s) = &pk[2] {
@@ -916,7 +876,14 @@ mod tests {
             .unwrap();
         memtable.freeze().unwrap();
         assert_eq!(
-            collect_kvs(memtable.iter(None, None, None).unwrap(), &metadata),
+            collect_kvs(
+                memtable
+                    .ranges(None, RangesOptions::default())
+                    .unwrap()
+                    .build(None)
+                    .unwrap(),
+                &metadata
+            ),
             ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect()
         );
         let forked = memtable.fork(2, &metadata);
@@ -925,7 +892,14 @@ mod tests {
         forked.write(&key_values(&metadata, keys.iter())).unwrap();
         forked.freeze().unwrap();
         assert_eq!(
-            collect_kvs(forked.iter(None, None, None).unwrap(), &metadata),
+            collect_kvs(
+                forked
+                    .ranges(None, RangesOptions::default())
+                    .unwrap()
+                    .build(None)
+                    .unwrap(),
+                &metadata
+            ),
             keys.iter()
                 .map(|c| (c.to_string(), c.to_string()))
                 .collect()
@@ -936,7 +910,14 @@ mod tests {
         let keys = ["g", "e", "a", "f", "b", "c", "h"];
         forked2.write(&key_values(&metadata, keys.iter())).unwrap();
 
-        let kvs = collect_kvs(forked2.iter(None, None, None).unwrap(), &metadata);
+        let kvs = collect_kvs(
+            forked2
+                .ranges(None, RangesOptions::default())
+                .unwrap()
+                .build(None)
+                .unwrap(),
+            &metadata,
+        );
         let expected = keys
             .iter()
             .map(|c| (c.to_string(), c.to_string()))
diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs
index 4dcaa2bac0..6d91f00361 100644
--- a/src/mito2/src/memtable/simple_bulk_memtable.rs
+++ b/src/mito2/src/memtable/simple_bulk_memtable.rs
@@ -213,22 +213,6 @@ impl Memtable for SimpleBulkMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        _predicate: Option<table::predicate::Predicate>,
-        sequence: Option<store_api::storage::SequenceRange>,
-    ) -> error::Result<BoxedBatchIterator> {
-        let iter = self.create_iter(projection, sequence)?.build(None)?;
-        if self.merge_mode == MergeMode::LastNonNull {
-            let iter = LastNonNullIter::new(iter);
-            Ok(Box::new(iter))
-        } else {
-            Ok(Box::new(iter))
-        }
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -526,7 +510,11 @@ mod tests {
             ))
             .unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(2, batch.num_rows());
         assert_eq!(2, batch.fields().len());
@@ -551,7 +539,11 @@ mod tests {
             ))
             .unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(1, batch.num_rows());
         assert_eq!(2, batch.fields().len());
@@ -565,7 +557,11 @@ mod tests {
 
         // Only project column 2 (f1)
         let projection = vec![2];
-        let mut iter = memtable.iter(Some(&projection), None, None).unwrap();
+        let mut iter = memtable
+            .ranges(Some(&projection), RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
 
         assert_eq!(1, batch.num_rows());
@@ -592,7 +588,11 @@ mod tests {
                 OpType::Put,
             ))
             .unwrap();
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
 
         assert_eq!(1, batch.num_rows()); // deduped to 1 row
@@ -611,7 +611,11 @@ mod tests {
         let kv = kvs.iter().next().unwrap();
         memtable.write_one(kv).unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(1, batch.num_rows());
     }
@@ -745,7 +749,11 @@ mod tests {
         };
         memtable.write_bulk(part).unwrap();
 
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(2, batch.num_rows());
 
@@ -764,7 +772,11 @@ mod tests {
             OpType::Put,
         );
         memtable.write(&kvs).unwrap();
-        let mut iter = memtable.iter(None, None, None).unwrap();
+        let mut iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(3, batch.num_rows());
         assert_eq!(
@@ -854,7 +866,15 @@ mod tests {
 
         // Filter with sequence 0 should only return first write
         let mut iter = memtable
-            .iter(None, None, Some(SequenceRange::LtEq { max: 0 }))
+            .ranges(
+                None,
+                RangesOptions {
+                    sequence: Some(SequenceRange::LtEq { max: 0 }),
+                    ..Default::default()
+                },
+            )
+            .unwrap()
+            .build(None)
             .unwrap();
         let batch = iter.next().unwrap().unwrap();
         assert_eq!(1, batch.num_rows());
diff --git a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs
index b71a86c554..08edebdbb2 100644
--- a/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs
+++ b/src/mito2/src/memtable/simple_bulk_memtable/test_only.rs
@@ -12,98 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::HashSet;
-use std::time::Instant;
-
 use store_api::metadata::RegionMetadataRef;
-use store_api::storage::{ColumnId, SequenceRange};
 
-use crate::error;
-use crate::memtable::simple_bulk_memtable::{Iter, SimpleBulkMemtable};
-use crate::memtable::time_series::Values;
-use crate::memtable::{BoxedBatchIterator, IterBuilder, MemScanMetrics};
-use crate::read::dedup::LastNonNullIter;
-use crate::region::options::MergeMode;
+use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable;
 
 impl SimpleBulkMemtable {
     pub fn region_metadata(&self) -> RegionMetadataRef {
         self.region_metadata.clone()
     }
-
-    pub(crate) fn create_iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        sequence: Option<SequenceRange>,
-    ) -> error::Result<BatchIterBuilderDeprecated> {
-        let mut series = self.series.write().unwrap();
-
-        let values = if series.is_empty() {
-            None
-        } else {
-            Some(series.compact(&self.region_metadata)?.clone())
-        };
-        let projection = self.build_projection(projection);
-        Ok(BatchIterBuilderDeprecated {
-            region_metadata: self.region_metadata.clone(),
-            values,
-            projection,
-            dedup: self.dedup,
-            sequence,
-            merge_mode: self.merge_mode,
-        })
-    }
-}
-
-#[derive(Clone)]
-pub(crate) struct BatchIterBuilderDeprecated {
-    region_metadata: RegionMetadataRef,
-    values: Option<Values>,
-    projection: HashSet<ColumnId>,
-    sequence: Option<SequenceRange>,
-    dedup: bool,
-    merge_mode: MergeMode,
-}
-
-impl IterBuilder for BatchIterBuilderDeprecated {
-    fn build(&self, metrics: Option<MemScanMetrics>) -> error::Result<BoxedBatchIterator> {
-        let start_time = Instant::now();
-        let Some(values) = self.values.clone() else {
-            return Ok(Box::new(Iter { batch: None }));
-        };
-
-        let maybe_batch = values
-            .to_batch(
-                &[],
-                &self.region_metadata,
-                &self.projection,
-                self.sequence,
-                self.dedup,
-                self.merge_mode,
-            )
-            .map(Some)
-            .transpose();
-
-        // Collect metrics from the batch
-        if let Some(metrics) = metrics {
-            let (num_rows, num_batches) = match &maybe_batch {
-                Some(Ok(batch)) => (batch.num_rows(), 1),
-                _ => (0, 0),
-            };
-            let inner = crate::memtable::MemScanMetricsData {
-                total_series: 1,
-                num_rows,
-                num_batches,
-                scan_cost: start_time.elapsed(),
-            };
-            metrics.merge_inner(&inner);
-        }
-
-        let iter = Iter { batch: maybe_batch };
-
-        if self.merge_mode == MergeMode::LastNonNull {
-            Ok(Box::new(LastNonNullIter::new(iter)))
-        } else {
-            Ok(Box::new(iter))
-        }
-    }
 }
diff --git a/src/mito2/src/memtable/time_partition.rs b/src/mito2/src/memtable/time_partition.rs
index 6f11c813cb..ee695aceb8 100644
--- a/src/mito2/src/memtable/time_partition.rs
+++ b/src/mito2/src/memtable/time_partition.rs
@@ -827,6 +827,7 @@ mod tests {
     use super::*;
     use crate::memtable::partition_tree::PartitionTreeMemtableBuilder;
     use crate::memtable::time_series::TimeSeriesMemtableBuilder;
+    use crate::memtable::{IterBuilder, RangesOptions};
     use crate::test_util::memtable_util::{self, collect_iter_timestamps};
 
     #[test]
@@ -852,7 +853,11 @@ mod tests {
         partitions.list_memtables(&mut memtables);
         assert_eq!(0, memtables[0].id());
 
-        let iter = memtables[0].iter(None, None, None).unwrap();
+        let iter = memtables[0]
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 3000, 5000, 6000, 7000], &timestamps[..]);
     }
@@ -890,7 +895,11 @@ mod tests {
 
         let mut memtables = Vec::new();
         partitions.list_memtables(&mut memtables);
-        let iter = memtables[0].iter(None, None, None).unwrap();
+        let iter = memtables[0]
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[0, 2000, 3000, 4000, 5000, 7000], &timestamps[..]);
         let parts = partitions.list_partitions();
@@ -943,7 +952,12 @@ mod tests {
         let partitions = new_multi_partitions(&metadata);
 
         let parts = partitions.list_partitions();
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(0, parts[0].memtable.id());
         assert_eq!(
@@ -955,7 +969,12 @@ mod tests {
             parts[0].time_range.max_timestamp
         );
         assert_eq!(&[0, 2000, 3000, 4000], &timestamps[..]);
-        let iter = parts[1].memtable.iter(None, None, None).unwrap();
+        let iter = parts[1]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         assert_eq!(1, parts[1].memtable.id());
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[5000, 7000], &timestamps[..]);
@@ -1273,7 +1292,12 @@ mod tests {
 
         let parts = partitions.list_partitions();
         assert_eq!(1, parts.len());
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 2000, 3000], &timestamps[..]);
 
@@ -1284,11 +1308,21 @@ mod tests {
         let parts = partitions.list_partitions();
         assert_eq!(2, parts.len());
         // Check first partition [0, 5000)
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 2000, 3000, 4000], &timestamps[..]);
         // Check second partition [5000, 10000)
-        let iter = parts[1].memtable.iter(None, None, None).unwrap();
+        let iter = parts[1]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[5000, 6000], &timestamps[..]);
 
@@ -1301,7 +1335,12 @@ mod tests {
         assert_eq!(3, parts.len());
 
         // Check new partition [10000, 15000)
-        let iter = parts[2].memtable.iter(None, None, None).unwrap();
+        let iter = parts[2]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[11000, 12000], &timestamps[..]);
 
@@ -1314,7 +1353,12 @@ mod tests {
 
         let parts = partitions.list_partitions();
         assert_eq!(1, parts.len());
-        let iter = parts[0].memtable.iter(None, None, None).unwrap();
+        let iter = parts[0]
+            .memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let timestamps = collect_iter_timestamps(iter);
         assert_eq!(&[1000, 5000, 9000], &timestamps[..]);
     }
diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs
index 271a9343eb..d3d00d0703 100644
--- a/src/mito2/src/memtable/time_series.rs
+++ b/src/mito2/src/memtable/time_series.rs
@@ -51,15 +51,18 @@ use crate::memtable::bulk::part::BulkPart;
 use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable;
 use crate::memtable::stats::WriteMetrics;
 use crate::memtable::{
-    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
-    MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
-    MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
+    AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, BoxedRecordBatchIterator,
+    IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange,
+    MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions,
+    read_column_ids_from_projection,
 };
 use crate::metrics::{
     MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL,
     READ_STAGE_ELAPSED,
 };
 use crate::read::dedup::LastNonNullIter;
+use crate::read::prune::PruneTimeIterator;
+use crate::read::scan_region::PredicateGroup;
 use crate::read::{Batch, BatchBuilder, BatchColumn};
 use crate::region::options::MergeMode;
 
@@ -267,39 +270,6 @@ impl Memtable for TimeSeriesMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        projection: Option<&[ColumnId]>,
-        filters: Option<Predicate>,
-        sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator> {
-        let projection = if let Some(projection) = projection {
-            projection.iter().copied().collect()
-        } else {
-            self.region_metadata
-                .field_columns()
-                .map(|c| c.column_id)
-                .collect()
-        };
-
-        let iter = self.series_set.iter_series(
-            projection,
-            filters,
-            self.dedup,
-            self.merge_mode,
-            sequence,
-            None,
-        )?;
-
-        if self.merge_mode == MergeMode::LastNonNull {
-            let iter = LastNonNullIter::new(iter);
-            Ok(Box::new(iter))
-        } else {
-            Ok(Box::new(iter))
-        }
-    }
-
     fn ranges(
         &self,
         projection: Option<&[ColumnId]>,
@@ -316,25 +286,20 @@ impl Memtable for TimeSeriesMemtable {
                 .map(|c| c.column_id)
                 .collect()
         };
-        let builder = Box::new(TimeSeriesIterBuilder {
-            series_set: self.series_set.clone(),
-            projection,
-            predicate: predicate.predicate().cloned(),
-            dedup: self.dedup,
-            merge_mode: self.merge_mode,
-            sequence,
-        });
-        let adapter_context = Arc::new(BatchToRecordBatchContext::new(
+        let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
             self.region_metadata.clone(),
             read_column_ids,
         ));
-        let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
-            self.id,
-            builder,
-            predicate,
-            Some(adapter_context),
-        ));
-
+        let builder = Box::new(TimeSeriesIterBuilder {
+            series_set: self.series_set.clone(),
+            projection,
+            predicate: predicate.clone(),
+            dedup: self.dedup,
+            merge_mode: self.merge_mode,
+            sequence,
+            batch_to_record_batch,
+        });
+        let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
         let range_stats = self.stats();
         let range = MemtableRange::new(context, range_stats);
         Ok(MemtableRanges {
@@ -476,7 +441,7 @@ impl SeriesSet {
     fn iter_series(
         &self,
         projection: HashSet<ColumnId>,
-        predicate: Option<Predicate>,
+        predicate: PredicateGroup,
         dedup: bool,
         merge_mode: MergeMode,
         sequence: Option<SequenceRange>,
@@ -493,7 +458,7 @@ impl SeriesSet {
             self.region_metadata.clone(),
             self.series.clone(),
             projection,
-            predicate,
+            predicate.predicate().cloned(),
             primary_key_schema,
             primary_key_datatypes,
             self.codec.clone(),
@@ -1278,10 +1243,11 @@ impl From<ValueBuilder> for Values {
 struct TimeSeriesIterBuilder {
     series_set: SeriesSet,
     projection: HashSet<ColumnId>,
-    predicate: Option<Predicate>,
+    predicate: PredicateGroup,
     dedup: bool,
     sequence: Option<SequenceRange>,
     merge_mode: MergeMode,
+    batch_to_record_batch: Arc<BatchToRecordBatchContext>,
 }
 
 impl IterBuilder for TimeSeriesIterBuilder {
@@ -1301,6 +1267,25 @@ impl IterBuilder for TimeSeriesIterBuilder {
             Ok(Box::new(iter))
         }
     }
+
+    fn is_record_batch(&self) -> bool {
+        true // Always true: this builder implements `build_record_batch`.
+    }
+
+    /// Builds the batch iterator, wraps it in a `PruneTimeIterator` (with the predicate's
+    /// time filters) when `time_range` is given, then adapts it to a record batch iterator.
+    fn build_record_batch(
+        &self,
+        time_range: Option<(Timestamp, Timestamp)>,
+        metrics: Option<MemScanMetrics>,
+    ) -> Result<BoxedRecordBatchIterator> {
+        let mut iter = self.build(metrics)?;
+        if let Some(time_range) = time_range {
+            let time_filters = self.predicate.time_filters();
+            iter = Box::new(PruneTimeIterator::new(iter, time_range, time_filters));
+        }
+        Ok(self.batch_to_record_batch.adapt_iter(iter))
+    }
 }
 
 #[cfg(test)]
@@ -1798,7 +1783,9 @@ mod tests {
             *expected_ts.entry(ts).or_default() += if dedup { 1 } else { 2 };
         }
 
-        let iter = memtable.iter(None, None, None).unwrap();
+        let ranges = memtable.ranges(None, RangesOptions::default()).unwrap();
+        let range = ranges.ranges.into_values().next().unwrap();
+        let iter = range.build_iter().unwrap();
         let mut read = HashMap::new();
 
         for ts in iter
@@ -1838,7 +1825,11 @@ mod tests {
         let memtable = TimeSeriesMemtable::new(schema, 42, None, true, MergeMode::LastRow);
         memtable.write(&kvs).unwrap();
 
-        let iter = memtable.iter(Some(&[3]), None, None).unwrap();
+        let iter = memtable
+            .ranges(Some(&[3]), RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
 
         let mut v0_all = vec![];
 
@@ -1917,7 +1908,11 @@ mod tests {
                 barrier.wait();
 
                 for _ in 0..10 {
-                    let iter = memtable.iter(None, None, None).unwrap();
+                    let iter = memtable
+                        .ranges(None, RangesOptions::default())
+                        .unwrap()
+                        .build(None)
+                        .unwrap();
                     for batch_result in iter {
                         let _ = batch_result.unwrap();
                     }
@@ -1936,7 +1931,11 @@ mod tests {
             handle.join().unwrap();
         }
 
-        let iter = memtable.iter(None, None, None).unwrap();
+        let iter = memtable
+            .ranges(None, RangesOptions::default())
+            .unwrap()
+            .build(None)
+            .unwrap();
         let mut series_count = 0;
         let mut row_count = 0;
 
@@ -2033,4 +2032,265 @@ mod tests {
         all_timestamps.sort();
         assert_eq!(vec![3, 4, 5, 6, 7], all_timestamps);
     }
+
+    /// Test helper: assembles a `TimeSeriesIterBuilder` over `memtable`'s series set with a
+    /// default predicate. Without a `projection`, every field column of `schema` is read.
+    fn build_iter_builder(
+        schema: &RegionMetadataRef,
+        memtable: &TimeSeriesMemtable,
+        projection: Option<&[ColumnId]>,
+        dedup: bool,
+        merge_mode: MergeMode,
+        sequence: Option<SequenceRange>,
+    ) -> TimeSeriesIterBuilder {
+        let field_projection = match projection {
+            Some(column_ids) => column_ids.iter().copied().collect(),
+            None => schema.field_columns().map(|c| c.column_id).collect(),
+        };
+        let read_column_ids = read_column_ids_from_projection(schema, projection);
+        let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
+            schema.clone(),
+            read_column_ids,
+        ));
+        TimeSeriesIterBuilder {
+            series_set: memtable.series_set.clone(),
+            projection: field_projection,
+            predicate: PredicateGroup::default(),
+            dedup,
+            merge_mode,
+            sequence,
+            batch_to_record_batch,
+        }
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_basic() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+        // Write one set of key values; the assertion below expects 10 rows back.
+        let kvs = build_key_values(&schema, "hello".to_string(), 42, 10);
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+        // No time range and no metrics are passed to the builder.
+        let mut iter = builder.build_record_batch(None, None).unwrap();
+        let rb = iter.next().transpose().unwrap().unwrap();
+        assert_eq!(10, rb.num_rows());
+        // Check the column layout of the record batch, including the `__`-prefixed columns.
+        let rb_schema = rb.schema();
+        let col_names: Vec<_> = rb_schema
+            .fields()
+            .iter()
+            .map(|f| f.name().as_str())
+            .collect();
+        assert_eq!(
+            col_names,
+            vec![
+                "k0",
+                "k1",
+                "v0",
+                "v1",
+                "ts",
+                "__primary_key",
+                "__sequence",
+                "__op_type",
+            ]
+        );
+        // Exactly one record batch is expected.
+        assert!(iter.next().is_none());
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_with_projection() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+        // Write one set of key values; the assertion below expects 5 rows back.
+        let kvs = build_key_values(&schema, "test".to_string(), 1, 5);
+        memtable.write(&kvs).unwrap();
+
+        // Project only field v0 (column_id=3) and ts (column_id=2).
+        let projection = vec![2, 3];
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            Some(&projection),
+            true,
+            MergeMode::LastRow,
+            None,
+        );
+        // No time range and no metrics are passed to the builder.
+        let mut iter = builder.build_record_batch(None, None).unwrap();
+        let rb = iter.next().transpose().unwrap().unwrap();
+        assert_eq!(5, rb.num_rows());
+
+        let rb_schema = rb.schema();
+        let col_names: Vec<_> = rb_schema
+            .fields()
+            .iter()
+            .map(|f| f.name().as_str())
+            .collect();
+        // Only projected columns + internal columns.
+        assert_eq!(
+            col_names,
+            vec!["v0", "ts", "__primary_key", "__sequence", "__op_type",]
+        );
+        // Exactly one record batch is expected.
+        assert!(iter.next().is_none());
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_multiple_series() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+        // Write two sets of key values under different keys; 3 + 4 = 7 rows are expected back.
+        let kvs_a = build_key_values(&schema, "aaa".to_string(), 1, 3);
+        let kvs_b = build_key_values(&schema, "bbb".to_string(), 2, 4);
+        memtable.write(&kvs_a).unwrap();
+        memtable.write(&kvs_b).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+        // Sum the rows over all record batches; every batch must have 8 columns.
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let mut total_rows = 0;
+        for rb in iter {
+            let rb = rb.unwrap();
+            total_rows += rb.num_rows();
+            assert_eq!(8, rb.num_columns());
+        }
+        assert_eq!(7, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_dedup() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        // Write same data twice — dedup should keep only one copy per timestamp.
+        let kvs = build_key_values(&schema, "dup".to_string(), 10, 5);
+        memtable.write(&kvs).unwrap();
+        memtable.write(&kvs).unwrap();
+        // Both the memtable and the builder are created with dedup enabled.
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+        // The same 5 rows were written twice; only 5 rows should be read back.
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(5, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_no_dedup() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, false, MergeMode::LastRow);
+        // Write the same key values twice; dedup is disabled on the memtable and the builder.
+        let kvs = build_key_values(&schema, "dup".to_string(), 10, 5);
+        memtable.write(&kvs).unwrap();
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, false, MergeMode::LastRow, None);
+        // Both copies are expected back: 2 writes of 5 rows = 10 rows.
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(10, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_with_sequence_filter() {
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        // build_key_values creates a mutation with base sequence=0.
+        // Each row gets sequence = base + row_index, so 5 rows get sequences 0,1,2,3,4.
+        let kvs = build_key_values(&schema, "seq".to_string(), 1, 5);
+        memtable.write(&kvs).unwrap();
+
+        // Filter to sequence > 4 — the highest written sequence is 4, so no rows are expected.
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            None,
+            true,
+            MergeMode::LastRow,
+            Some(SequenceRange::Gt { min: 4 }),
+        );
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(0, total_rows);
+
+        // Filter to sequence <= 2 — should yield 3 rows (sequences 0, 1, 2).
+        let builder = build_iter_builder(
+            &schema,
+            &memtable,
+            None,
+            true,
+            MergeMode::LastRow,
+            Some(SequenceRange::LtEq { max: 2 }),
+        );
+
+        let iter = builder.build_record_batch(None, None).unwrap();
+        let total_rows: usize = iter.map(|rb| rb.unwrap().num_rows()).sum();
+        assert_eq!(3, total_rows);
+    }
+
+    #[test]
+    fn test_iter_builder_build_record_batch_data_correctness() {
+        use datatypes::arrow::array::{
+            Float64Array, Int64Array, TimestampMillisecondArray, UInt8Array,
+        };
+
+        let schema = schema_for_test();
+        let memtable = TimeSeriesMemtable::new(schema.clone(), 1, None, true, MergeMode::LastRow);
+
+        let kvs = build_key_values(&schema, "check".to_string(), 7, 3);
+        memtable.write(&kvs).unwrap();
+
+        let builder = build_iter_builder(&schema, &memtable, None, true, MergeMode::LastRow, None);
+        // The first record batch is expected to hold all 3 written rows.
+        let mut iter = builder.build_record_batch(None, None).unwrap();
+        let rb = iter.next().transpose().unwrap().unwrap();
+        assert_eq!(3, rb.num_rows());
+
+        // Verify timestamp values.
+        let ts_col = rb
+            .column_by_name("ts")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<TimestampMillisecondArray>()
+            .unwrap();
+        let timestamps: Vec<_> = (0..ts_col.len()).map(|i| ts_col.value(i)).collect();
+        assert_eq!(vec![0, 1, 2], timestamps);
+
+        // Verify field v0 values.
+        let v0_col = rb
+            .column_by_name("v0")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap();
+        let v0_values: Vec<_> = (0..v0_col.len()).map(|i| v0_col.value(i)).collect();
+        assert_eq!(vec![0, 1, 2], v0_values);
+
+        // Verify field v1 values.
+        let v1_col = rb
+            .column_by_name("v1")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<Float64Array>()
+            .unwrap();
+        let v1_values: Vec<_> = (0..v1_col.len()).map(|i| v1_col.value(i)).collect();
+        assert_eq!(vec![0.0, 1.0, 2.0], v1_values);
+
+        // Verify every row's op_type equals `OpType::Put`.
+        let op_col = rb
+            .column_by_name("__op_type")
+            .unwrap()
+            .as_any()
+            .downcast_ref::<UInt8Array>()
+            .unwrap();
+        for i in 0..op_col.len() {
+            assert_eq!(OpType::Put as u8, op_col.value(i));
+        }
+        // No further record batches are expected.
+        assert!(iter.next().is_none());
+    }
 }
diff --git a/src/mito2/src/read.rs b/src/mito2/src/read.rs
index 5fbd63ce8b..84931b9f37 100644
--- a/src/mito2/src/read.rs
+++ b/src/mito2/src/read.rs
@@ -27,6 +27,10 @@ pub mod projection;
 pub(crate) mod prune;
 pub(crate) mod pruner;
 pub mod range;
+#[cfg(feature = "test")]
+pub mod range_cache;
+#[cfg(not(feature = "test"))]
+pub(crate) mod range_cache;
 pub mod scan_region;
 pub mod scan_util;
 pub(crate) mod seq_scan;
diff --git a/src/mito2/src/read/batch_adapter.rs b/src/mito2/src/read/batch_adapter.rs
index 461dbeba69..4698229c5b 100644
--- a/src/mito2/src/read/batch_adapter.rs
+++ b/src/mito2/src/read/batch_adapter.rs
@@ -59,7 +59,7 @@ impl BatchToRecordBatchAdapter {
     /// - `metadata`: region metadata describing the schema.
     /// - `codec`: codec for decoding the encoded primary key bytes.
     /// - `read_column_ids`: projected column ids to read.
-    pub(crate) fn new(
+    pub fn new(
         iter: BoxedBatchIterator,
         metadata: RegionMetadataRef,
         codec: Arc<dyn PrimaryKeyCodec>,
diff --git a/src/mito2/src/read/flat_projection.rs b/src/mito2/src/read/flat_projection.rs
index 3e0f1169df..02b4c6b3c1 100644
--- a/src/mito2/src/read/flat_projection.rs
+++ b/src/mito2/src/read/flat_projection.rs
@@ -18,18 +18,21 @@ use std::sync::Arc;
 
 use api::v1::SemanticType;
 use common_error::ext::BoxedError;
-use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu};
+use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu};
 use common_recordbatch::{DfRecordBatch, RecordBatch};
-use datatypes::arrow::datatypes::Field;
+use datatypes::arrow::array::Array;
+use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
 use datatypes::prelude::{ConcreteDataType, DataType};
 use datatypes::schema::{Schema, SchemaRef};
+use datatypes::value::Value;
 use datatypes::vectors::Helper;
 use snafu::{OptionExt, ResultExt};
 use store_api::metadata::{RegionMetadata, RegionMetadataRef};
 use store_api::storage::ColumnId;
 
+use crate::cache::CacheStrategy;
 use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
-use crate::read::projection::read_column_ids_from_projection;
+use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
 use crate::sst::parquet::flat_format::sst_column_id_indices;
 use crate::sst::parquet::format::FormatProjection;
 use crate::sst::{
@@ -248,12 +251,55 @@ impl FlatProjectionMapper {
     pub(crate) fn convert(
         &self,
         batch: &datatypes::arrow::record_batch::RecordBatch,
+        cache_strategy: &CacheStrategy,
     ) -> common_recordbatch::error::Result<RecordBatch> {
         if self.is_empty_projection {
             return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
         }
-        let columns = self.project_vectors(batch)?;
-        RecordBatch::new(self.output_schema.clone(), columns)
+        // Construct output record batch directly from Arrow arrays to avoid
+        // Arrow -> Vector -> Arrow roundtrips in the hot path.
+        let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
+        for (output_idx, index) in self.batch_indices.iter().enumerate() {
+            let mut array = batch.column(*index).clone();
+            // Cast dictionary values to the target type.
+            if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
+                // When a string dictionary column contains only a single value, reuse a cached
+                // repeated vector to avoid repeatedly expanding the dictionary.
+                if let Some(dict_array) = single_value_string_dictionary(
+                    &array,
+                    &self.output_schema.column_schemas()[output_idx].data_type,
+                    value_type.as_ref(),
+                ) {
+                    let dict_values = dict_array.values();
+                    let value = if dict_values.is_null(0) {
+                        Value::Null
+                    } else {
+                        Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
+                    };
+
+                    let repeated = repeated_vector_with_cache(
+                        &self.output_schema.column_schemas()[output_idx].data_type,
+                        &value,
+                        batch.num_rows(),
+                        cache_strategy,
+                    )?;
+                    array = repeated.to_arrow_array();
+                } else {
+                    let casted = datatypes::arrow::compute::cast(&array, value_type)
+                        .context(ArrowComputeSnafu)?;
+                    array = casted;
+                }
+            }
+            arrays.push(array);
+        }
+
+        let df_record_batch =
+            DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
+                .context(NewDfRecordBatchSnafu)?;
+        Ok(RecordBatch::from_df_record_batch(
+            self.output_schema.clone(),
+            df_record_batch,
+        ))
     }
 
     /// Projects columns from the input batch and converts them into vectors.
@@ -281,6 +327,28 @@ impl FlatProjectionMapper {
     }
 }
 
+/// Returns `array` as a `UInt32`-keyed dictionary if the dictionary values and the output
+/// column are both strings, there is exactly one dictionary value and the null count is zero.
+fn single_value_string_dictionary<'a>(
+    array: &'a Arc<dyn Array>,
+    output_type: &ConcreteDataType,
+    value_type: &ArrowDataType,
+) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
+    type StringDict =
+        datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>;
+
+    let has_string_values = matches!(
+        value_type,
+        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
+    );
+    if !(has_string_values && output_type.is_string()) {
+        return None;
+    }
+
+    let dict = array.as_any().downcast_ref::<StringDict>()?;
+    (dict.values().len() == 1 && dict.null_count() == 0).then_some(dict)
+}
+
 /// Returns ids and datatypes of columns of the output batch after applying the `projection`.
 ///
 /// It adds the time index column if it doesn't present in the projection.
diff --git a/src/mito2/src/read/last_row.rs b/src/mito2/src/read/last_row.rs
index c2336f218d..1dc4102311 100644
--- a/src/mito2/src/read/last_row.rs
+++ b/src/mito2/src/read/last_row.rs
@@ -21,6 +21,7 @@ use datatypes::arrow::array::{Array, BinaryArray};
 use datatypes::arrow::compute::concat_batches;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::vectors::UInt32Vector;
+use futures::{Stream, TryStreamExt};
 use snafu::ResultExt;
 use store_api::storage::{FileId, TimeSeriesRowSelector};
 
@@ -30,7 +31,7 @@ use crate::cache::{
 };
 use crate::error::{ComputeArrowSnafu, Result};
 use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice;
-use crate::read::{Batch, BatchReader, BoxedBatchReader};
+use crate::read::{Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream};
 use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
 use crate::sst::parquet::flat_format::{primary_key_column_index, time_index_column_index};
 use crate::sst::parquet::format::{PrimaryKeyArray, primary_key_offsets};
@@ -332,10 +333,10 @@ impl FlatRowGroupLastRowCachedReader {
     }
 
     /// Returns the next RecordBatch.
-    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
         match self {
             FlatRowGroupLastRowCachedReader::Hit(r) => r.next_batch(),
-            FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch(),
+            FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch().await,
         }
     }
 
@@ -465,12 +466,12 @@ impl FlatRowGroupLastRowReader {
         Ok(Some(merged))
     }
 
-    fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+    async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
         if self.pending.is_full() {
             return self.flush_pending();
         }
 
-        while let Some(batch) = self.reader.next_batch()? {
+        while let Some(batch) = self.reader.next_batch().await? {
             self.selector.on_next(batch, &mut self.pending)?;
             if self.pending.is_full() {
                 return self.flush_pending();
@@ -610,6 +611,41 @@ impl FlatLastTimestampSelector {
     }
 }
 
+/// Keeps only the last row of each time series from a flat RecordBatch stream.
+/// The input must be sorted and deduplicated, and must not contain delete operations.
+pub(crate) struct FlatLastRowReader {
+    stream: BoxedRecordBatchStream,
+    selector: FlatLastTimestampSelector,
+    pending: BatchBuffer,
+}
+
+impl FlatLastRowReader {
+    /// Creates a reader over `stream` with a default selector and a new pending buffer.
+    pub(crate) fn new(stream: BoxedRecordBatchStream) -> Self {
+        Self {
+            stream,
+            selector: FlatLastTimestampSelector::default(),
+            pending: BatchBuffer::new(),
+        }
+    }
+
+    /// Turns the reader into a stream that emits pending rows when the buffer fills or input ends.
+    pub(crate) fn into_stream(mut self) -> impl Stream<Item = Result<RecordBatch>> {
+        async_stream::try_stream! {
+            while let Some(batch) = self.stream.try_next().await? {
+                self.selector.on_next(batch, &mut self.pending)?;
+                if self.pending.is_full() {
+                    yield self.pending.concat()?;
+                }
+            }
+            self.selector.finish(&mut self.pending)?;
+            if !self.pending.is_empty() {
+                yield self.pending.concat()?;
+            }
+        }
+    }
+}
+
 /// Gets the primary key bytes at `index` from the primary key dictionary column.
 fn primary_key_bytes_at(batch: &RecordBatch, pk_col_idx: usize, index: usize) -> &[u8] {
     let pk_dict = batch
diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs
index 2c000e7bdc..b5b6904521 100644
--- a/src/mito2/src/read/projection.rs
+++ b/src/mito2/src/read/projection.rs
@@ -21,7 +21,7 @@ use std::sync::Arc;
 use api::v1::SemanticType;
 use common_error::ext::BoxedError;
 use common_recordbatch::RecordBatch;
-use common_recordbatch::error::ExternalSnafu;
+use common_recordbatch::error::{DataTypesSnafu, ExternalSnafu};
 use datatypes::prelude::{ConcreteDataType, DataType};
 use datatypes::schema::{Schema, SchemaRef};
 use datatypes::value::Value;
@@ -37,7 +37,7 @@ use crate::read::Batch;
 use crate::read::flat_projection::FlatProjectionMapper;
 
 /// Only cache vector when its length `<=` this value.
-const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
+pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
 
 /// Wrapper enum for different projection mapper implementations.
 pub enum ProjectionMapper {
@@ -423,7 +423,7 @@ enum BatchIndex {
 }
 
 /// Gets a vector with repeated values from specific cache or creates a new one.
-fn repeated_vector_with_cache(
+pub(crate) fn repeated_vector_with_cache(
     data_type: &ConcreteDataType,
     value: &Value,
     num_rows: usize,
@@ -450,7 +450,7 @@ fn repeated_vector_with_cache(
 }
 
 /// Returns a vector with repeated values.
-fn new_repeated_vector(
+pub(crate) fn new_repeated_vector(
     data_type: &ConcreteDataType,
     value: &Value,
     num_rows: usize,
@@ -458,8 +458,7 @@ fn new_repeated_vector(
     let mut mutable_vector = data_type.create_mutable_vector(1);
     mutable_vector
         .try_push_value_ref(&value.as_value_ref())
-        .map_err(BoxedError::new)
-        .context(ExternalSnafu)?;
+        .context(DataTypesSnafu)?;
     // This requires an additional allocation.
     let base_vector = mutable_vector.to_vector();
     Ok(base_vector.replicate(&[num_rows]))
@@ -809,6 +808,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         let mapper = ProjectionMapper::all(&metadata, true).unwrap();
         assert_eq!([0, 1, 2, 3, 4], mapper.column_ids());
         assert_eq!(
@@ -823,7 +823,7 @@ mod tests {
         );
 
         let batch = new_flat_batch(Some(0), &[(1, 1), (2, 2)], &[(3, 3), (4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
         let expect = "\
 +---------------------+----+----+----+----+
 | ts                  | k0 | k1 | v0 | v1 |
@@ -843,6 +843,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         // Columns v1, k0
         let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap();
         assert_eq!([4, 1], mapper.column_ids());
@@ -856,7 +857,7 @@ mod tests {
         );
 
         let batch = new_flat_batch(None, &[(1, 1)], &[(4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
         let expect = "\
 +----+----+
 | v1 | k0 |
@@ -876,6 +877,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         // Output columns v1, k0. Read also includes v0.
         let mapper = ProjectionMapper::new_with_read_columns(
             &metadata,
@@ -887,7 +889,7 @@ mod tests {
         assert_eq!([4, 1, 3], mapper.column_ids());
 
         let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3);
-        let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
+        let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
         let expect = "\
 +----+----+
 | v1 | k0 |
@@ -907,6 +909,7 @@ mod tests {
                 .num_fields(2)
                 .build(),
         );
+        let cache = CacheStrategy::Disabled;
         // Empty projection
         let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap();
         assert_eq!([0], mapper.column_ids()); // Should still read the time index column
@@ -918,7 +921,7 @@ mod tests {
         );
 
         let batch = new_flat_batch(Some(0), &[], &[], 3);
-        let record_batch = flat_mapper.convert(&batch).unwrap();
+        let record_batch = flat_mapper.convert(&batch, &cache).unwrap();
         assert_eq!(3, record_batch.num_rows());
         assert_eq!(0, record_batch.num_columns());
         assert!(record_batch.schema.is_empty());
diff --git a/src/mito2/src/read/prune.rs b/src/mito2/src/read/prune.rs
index 29ded3d49a..6766bf3f38 100644
--- a/src/mito2/src/read/prune.rs
+++ b/src/mito2/src/read/prune.rs
@@ -80,11 +80,6 @@ impl PruneReader {
         }
     }
 
-    pub(crate) fn reset_source(&mut self, source: Source, skip_fields: bool) {
-        self.source = source;
-        self.skip_fields = skip_fields;
-    }
-
     /// Merge metrics with the inner reader and return the merged metrics.
     pub(crate) fn metrics(&self) -> ReaderMetrics {
         let mut metrics = self.metrics.clone();
@@ -252,10 +247,10 @@ pub enum FlatSource {
 }
 
 impl FlatSource {
-    fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+    async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
         match self {
-            FlatSource::RowGroup(r) => r.next_batch(),
-            FlatSource::LastRow(r) => r.next_batch(),
+            FlatSource::RowGroup(r) => r.next_batch().await,
+            FlatSource::LastRow(r) => r.next_batch().await,
         }
     }
 }
@@ -302,13 +297,16 @@ impl FlatPruneReader {
         self.metrics.clone()
     }
 
-    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
-        while let Some(record_batch) = {
+    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+        loop {
             let start = std::time::Instant::now();
-            let batch = self.source.next_batch()?;
+            let batch = self.source.next_batch().await?;
             self.metrics.scan_cost += start.elapsed();
-            batch
-        } {
+
+            let Some(record_batch) = batch else {
+                return Ok(None);
+            };
+
             // Update metrics for the received batch
             self.metrics.num_rows += record_batch.num_rows();
             self.metrics.num_batches += 1;
@@ -322,8 +320,6 @@ impl FlatPruneReader {
                 }
             }
         }
-
-        Ok(None)
     }
 
     /// Prunes batches by the pushed down predicate and returns RecordBatch.
diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs
new file mode 100644
index 0000000000..5fc8931691
--- /dev/null
+++ b/src/mito2/src/read/range_cache.rs
@@ -0,0 +1,856 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Utilities for the partition range scan result cache.
+
+use std::mem;
+use std::sync::Arc;
+
+use async_stream::try_stream;
+use common_time::range::TimestampRange;
+use datatypes::arrow::array::{Array, AsArray, DictionaryArray};
+use datatypes::arrow::datatypes::UInt32Type;
+use datatypes::arrow::record_batch::RecordBatch;
+use datatypes::prelude::ConcreteDataType;
+use futures::TryStreamExt;
+use store_api::region_engine::PartitionRange;
+use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector};
+
+use crate::cache::CacheStrategy;
+use crate::read::BoxedRecordBatchStream;
+use crate::read::scan_region::StreamContext;
+use crate::read::scan_util::PartitionMetrics;
+use crate::region::options::MergeMode;
+use crate::sst::file::FileTimeRange;
+use crate::sst::parquet::flat_format::primary_key_column_index;
+
+/// Fingerprint of the scan request fields that affect partition range cache reuse.
+///
+/// It records a normalized view of the projected columns and filters, plus
+/// scan options that can change the returned rows. Schema-dependent metadata
+/// and the partition expression version are included so cached results are not
+/// reused across incompatible schema or partitioning changes.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct ScanRequestFingerprint {
+    /// Shared projection and filters, excluding time index filters and partition exprs.
+    inner: Arc<SharedScanRequestFingerprint>,
+    /// Filters on the time index column. `None` if there are none or they have been stripped.
+    time_filters: Option<Arc<Vec<String>>>,
+    series_row_selector: Option<TimeSeriesRowSelector>,
+    append_mode: bool,
+    filter_deleted: bool,
+    merge_mode: MergeMode,
+    /// Version of the partition expr, so the fingerprint is not reused after the expr changes.
+    /// We store the version instead of the whole partition expr or partition expr filters.
+    partition_expr_version: u64,
+}
+
+/// Plain-field builder for [`ScanRequestFingerprint`]. `read_column_ids`, `read_column_types`
+/// and `filters` go into the shared part; the other fields are stored on the fingerprint itself.
+#[derive(Debug)]
+pub(crate) struct ScanRequestFingerprintBuilder {
+    pub(crate) read_column_ids: Vec<ColumnId>,
+    pub(crate) read_column_types: Vec<Option<ConcreteDataType>>,
+    pub(crate) filters: Vec<String>,
+    /// An empty list is stored as `None` in the built fingerprint.
+    pub(crate) time_filters: Vec<String>,
+    pub(crate) series_row_selector: Option<TimeSeriesRowSelector>,
+    pub(crate) append_mode: bool,
+    pub(crate) filter_deleted: bool,
+    pub(crate) merge_mode: MergeMode,
+    pub(crate) partition_expr_version: u64,
+}
+
+impl ScanRequestFingerprintBuilder {
+    /// Consumes the builder and assembles the fingerprint.
+    pub(crate) fn build(self) -> ScanRequestFingerprint {
+        let shared = SharedScanRequestFingerprint {
+            read_column_ids: self.read_column_ids,
+            read_column_types: self.read_column_types,
+            filters: self.filters,
+        };
+        // Normalize "no time filters" to `None`, the same state `without_time_filters` produces.
+        let time_filters = if self.time_filters.is_empty() {
+            None
+        } else {
+            Some(Arc::new(self.time_filters))
+        };
+
+        ScanRequestFingerprint {
+            inner: Arc::new(shared),
+            time_filters,
+            series_row_selector: self.series_row_selector,
+            append_mode: self.append_mode,
+            filter_deleted: self.filter_deleted,
+            merge_mode: self.merge_mode,
+            partition_expr_version: self.partition_expr_version,
+        }
+    }
+}
+
+/// Part of the fingerprint that is shared through an `Arc` instead of being copied.
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct SharedScanRequestFingerprint {
+    /// Column ids of the projection.
+    read_column_ids: Vec<ColumnId>,
+    /// Column types of the projection.
+    /// We keep this to ensure we won't reuse the fingerprint after a schema change.
+    read_column_types: Vec<Option<ConcreteDataType>>,
+    /// Filters that involve neither the time index column nor the region partition exprs.
+    filters: Vec<String>,
+}
+
+impl ScanRequestFingerprint {
+    /// Test-only view of the projected column ids.
+    #[cfg(test)]
+    pub(crate) fn read_column_ids(&self) -> &[ColumnId] {
+        self.inner.read_column_ids.as_slice()
+    }
+
+    /// Test-only view of the projected column types.
+    #[cfg(test)]
+    pub(crate) fn read_column_types(&self) -> &[Option<ConcreteDataType>] {
+        self.inner.read_column_types.as_slice()
+    }
+
+    /// Test-only view of the filters in the shared part.
+    #[cfg(test)]
+    pub(crate) fn filters(&self) -> &[String] {
+        self.inner.filters.as_slice()
+    }
+
+    /// Test-only view of the time filters; empty when none are stored.
+    #[cfg(test)]
+    pub(crate) fn time_filters(&self) -> &[String] {
+        match &self.time_filters {
+            Some(filters) => filters.as_slice(),
+            None => &[],
+        }
+    }
+
+    /// Returns a copy that shares `inner` but carries no time filters.
+    pub(crate) fn without_time_filters(&self) -> Self {
+        Self {
+            time_filters: None,
+            ..self.clone()
+        }
+    }
+
+    /// Estimates the memory used by the fingerprint: the shared struct, the capacities of its
+    /// vectors and filter strings, and the time filters when present.
+    pub(crate) fn estimated_size(&self) -> usize {
+        /// Bytes held by a list of strings: the vector slots plus every string's buffer.
+        fn strings_size(strings: &Vec<String>) -> usize {
+            strings.capacity() * mem::size_of::<String>()
+                + strings.iter().map(String::capacity).sum::<usize>()
+        }
+
+        let shared = &*self.inner;
+        let time_filters_size = self.time_filters.as_ref().map_or(0, |filters| {
+            mem::size_of::<Vec<String>>() + strings_size(filters)
+        });
+
+        mem::size_of::<SharedScanRequestFingerprint>()
+            + shared.read_column_ids.capacity() * mem::size_of::<ColumnId>()
+            + shared.read_column_types.capacity() * mem::size_of::<Option<ConcreteDataType>>()
+            + strings_size(&shared.filters)
+            + time_filters_size
+    }
+}
+
+/// Key of one cached range scan result: the region, the row groups read and the scan fingerprint.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct RangeScanCacheKey {
+    pub(crate) region_id: RegionId,
+    /// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers.
+    pub(crate) row_groups: Vec<(FileId, i64)>,
+    pub(crate) scan: ScanRequestFingerprint,
+}
+
+impl RangeScanCacheKey {
+    /// Estimated bytes used by the key: the struct, the row group vector and the fingerprint.
+    pub(crate) fn estimated_size(&self) -> usize {
+        let row_groups_size = self.row_groups.capacity() * mem::size_of::<(FileId, i64)>();
+        mem::size_of::<Self>() + row_groups_size + self.scan.estimated_size()
+    }
+}
+
+/// Record batches produced by one range scan, kept for replay on a cache hit.
+pub(crate) struct RangeScanCacheValue {
+    pub(crate) batches: Vec<RecordBatch>,
+    /// Precomputed size of all batches, accounting for shared dictionary values.
+    estimated_batches_size: usize,
+}
+
+impl RangeScanCacheValue {
+    pub(crate) fn new(batches: Vec<RecordBatch>, estimated_batches_size: usize) -> Self {
+        Self {
+            batches,
+            estimated_batches_size,
+        }
+    }
+
+    /// Estimated bytes used by the value: the struct, the batch vector and the batch data.
+    pub(crate) fn estimated_size(&self) -> usize {
+        let slots_size = self.batches.capacity() * mem::size_of::<RecordBatch>();
+        mem::size_of::<Self>() + slots_size + self.estimated_batches_size
+    }
+}
+
+/// File row groups of a partition range, plus whether the range reads from files only.
+#[allow(dead_code)]
+pub(crate) struct PartitionRangeRowGroups {
+    /// (file_id, row_group_index) pairs, sorted by file id bytes and then by row group index.
+    pub(crate) row_groups: Vec<(FileId, i64)>,
+    pub(crate) only_file_sources: bool,
+}
+
+/// Gathers the sorted (file_id, row_group_index) pairs referenced by `part_range` and records
+/// whether every row group index of the range is a file range index.
+#[allow(dead_code)]
+pub(crate) fn collect_partition_range_row_groups(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> PartitionRangeRowGroups {
+    let indices = &stream_ctx.ranges[part_range.identifier].row_group_indices;
+    let mut only_file_sources = true;
+    let mut row_groups = Vec::new();
+    for index in indices {
+        if !stream_ctx.is_file_range_index(*index) {
+            only_file_sources = false;
+            continue;
+        }
+        let file = stream_ctx.input.file_from_index(*index);
+        row_groups.push((file.file_id().file_id(), index.row_group_index));
+    }
+    // Order by file id bytes first, then by row group index.
+    row_groups.sort_unstable_by(|a, b| (a.0.as_bytes(), a.1).cmp(&(b.0.as_bytes(), b.1)));
+
+    PartitionRangeRowGroups {
+        row_groups,
+        only_file_sources,
+    }
+}
+
+/// Builds the cache key for `part_range`, or returns `None` when the range cannot be cached:
+/// no scan fingerprint, dyn filters present, a non-file source in the range, or no row groups.
+#[allow(dead_code)]
+pub(crate) fn build_range_cache_key(
+    stream_ctx: &StreamContext,
+    part_range: &PartitionRange,
+) -> Option<RangeScanCacheKey> {
+    let fingerprint = stream_ctx.scan_fingerprint.as_ref()?;
+    let input = &stream_ctx.input;
+
+    // Dyn filters can change at runtime, so we can't cache when they're present.
+    if let Some(predicate) = input.predicate_group().predicate_without_region()
+        && !predicate.dyn_filters().is_empty()
+    {
+        return None;
+    }
+
+    let rg = collect_partition_range_row_groups(stream_ctx, part_range);
+    if rg.row_groups.is_empty() || !rg.only_file_sources {
+        return None;
+    }
+
+    // The time filters are left out of the key when the query time range covers the whole
+    // partition range.
+    let part_time_range = stream_ctx.ranges[part_range.identifier].time_range;
+    let covered =
+        query_time_range_covers_partition_range(input.time_range.as_ref(), part_time_range);
+    let scan = if covered {
+        fingerprint.without_time_filters()
+    } else {
+        fingerprint.clone()
+    };
+
+    Some(RangeScanCacheKey {
+        region_id: input.region_metadata().region_id,
+        row_groups: rg.row_groups,
+        scan,
+    })
+}
+
+/// Returns true if there is no query time range or it contains both ends of the partition range.
+#[allow(dead_code)]
+fn query_time_range_covers_partition_range(
+    query_time_range: Option<&TimestampRange>,
+    partition_time_range: FileTimeRange,
+) -> bool {
+    let (part_start, part_end) = partition_time_range;
+    match query_time_range {
+        Some(range) => range.contains(&part_start) && range.contains(&part_end),
+        None => true,
+    }
+}
+
+/// Replays the record batches stored in `value` as a stream of `Ok` items.
+#[allow(dead_code)]
+pub(crate) fn cached_flat_range_stream(value: Arc<RangeScanCacheValue>) -> BoxedRecordBatchStream {
+    // Clone the batches up front so the stream owns them independently of `value`.
+    let batches = value.batches.clone();
+    Box::pin(futures::stream::iter(batches.into_iter().map(Ok)))
+}
+
+/// Checks by pointer comparison whether two primary key dictionary arrays are backed by the
+/// same values: the data buffer, the offsets and the null buffer (if any) must all match.
+/// The primary key column is always `DictionaryArray<UInt32Type>` with `Binary` values.
+fn pk_values_ptr_eq(a: &DictionaryArray<UInt32Type>, b: &DictionaryArray<UInt32Type>) -> bool {
+    let (lhs, rhs) = (a.values().as_binary::<i32>(), b.values().as_binary::<i32>());
+    if !lhs.values().ptr_eq(rhs.values()) || !lhs.offsets().ptr_eq(rhs.offsets()) {
+        return false;
+    }
+    match (lhs.nulls(), rhs.nulls()) {
+        (Some(l), Some(r)) => l.inner().ptr_eq(r.inner()),
+        (None, None) => true,
+        _ => false,
+    }
+}
+
+/// Buffers record batches for caching, tracking memory size while deduplicating
+/// shared dictionary values across batches.
+///
+/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK
+/// column's dictionary values are pointer-equal across batches, we assume all
+/// dictionary columns share their values and deduct the total dictionary values size.
+struct CacheBatchBuffer {
+    batches: Vec<RecordBatch>,
+    /// Running estimate of the batches' memory, counting shared dictionary values once.
+    total_size: usize,
+    /// The first batch's PK dictionary array, for pointer comparison.
+    /// `None` if no dictionary PK column exists or no batch has been added yet.
+    first_pk_dict: Option<DictionaryArray<UInt32Type>>,
+    /// Sum of `get_array_memory_size()` of all dictionary value arrays from the first batch.
+    total_dict_values_size: usize,
+    /// Whether the PK dictionary is still shared across all batches seen so far.
+    shared: bool,
+}
+
+impl CacheBatchBuffer {
+    fn new() -> Self {
+        Self {
+            batches: Vec::new(),
+            total_size: 0,
+            first_pk_dict: None,
+            total_dict_values_size: 0,
+            shared: true,
+        }
+    }
+
+    fn push(&mut self, batch: RecordBatch) {
+        if self.batches.is_empty() {
+            self.init_first_batch(&batch);
+        } else {
+            self.add_subsequent_batch(&batch);
+        }
+        self.batches.push(batch);
+    }
+
+    /// Counts the first batch in full and records its dictionary values for later deduction.
+    fn init_first_batch(&mut self, batch: &RecordBatch) {
+        self.total_size += batch.get_array_memory_size();
+        let pk_col_idx = primary_key_column_index(batch.num_columns());
+        let mut total_dict_values_size = 0;
+        for col_idx in 0..batch.num_columns() {
+            let col = batch.column(col_idx);
+            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>() {
+                total_dict_values_size += dict.values().get_array_memory_size();
+                if col_idx == pk_col_idx {
+                    self.first_pk_dict = Some(dict.clone());
+                }
+            }
+        }
+        self.total_dict_values_size = total_dict_values_size;
+    }
+
+    /// Counts a later batch, deducting the dictionary values when they are still shared.
+    fn add_subsequent_batch(&mut self, batch: &RecordBatch) {
+        let batch_size = batch.get_array_memory_size();
+        if self.shared
+            && let Some(first_pk_dict) = &self.first_pk_dict
+        {
+            let pk_col_idx = primary_key_column_index(batch.num_columns());
+            let col = batch.column(pk_col_idx);
+            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>()
+                && pk_values_ptr_eq(first_pk_dict, dict)
+            {
+                // PK dict is shared, deduct all dict values sizes. Saturate: only the PK dict is
+                // verified, so this batch may be smaller than the first batch's dict values.
+                self.total_size += batch_size.saturating_sub(self.total_dict_values_size);
+                return;
+            }
+            // Dictionary diverged.
+            self.shared = false;
+        }
+        self.total_size += batch_size;
+    }
+
+    fn estimated_batches_size(&self) -> usize {
+        self.total_size
+    }
+
+    fn into_batches(self) -> Vec<RecordBatch> {
+        self.batches
+    }
+}
+
+/// Forwards `stream` and, once it ends without error, caches all forwarded batches under `key`.
+#[allow(dead_code)]
+pub(crate) fn cache_flat_range_stream(
+    mut stream: BoxedRecordBatchStream,
+    cache_strategy: CacheStrategy,
+    key: RangeScanCacheKey,
+    part_metrics: PartitionMetrics,
+) -> BoxedRecordBatchStream {
+    Box::pin(try_stream! {
+        let mut buffer = CacheBatchBuffer::new();
+        while let Some(batch) = stream.try_next().await? {
+            buffer.push(batch.clone());
+            yield batch;
+        }
+
+        let estimated_size = buffer.estimated_batches_size();
+        let batches = buffer.into_batches();
+        let value = Arc::new(RangeScanCacheValue::new(batches, estimated_size));
+        part_metrics.inc_range_cache_size(key.estimated_size() + value.estimated_size());
+        cache_strategy.put_range_result(key, value);
+    })
+}
+
+/// Creates a `cache_flat_range_stream` with dummy internals for benchmarking.
+///
+/// This avoids exposing `RangeScanCacheKey`, `ScanRequestFingerprint`, and
+/// `PartitionMetrics` publicly.
+#[cfg(feature = "test")]
+pub fn bench_cache_flat_range_stream(
+    stream: BoxedRecordBatchStream,
+    cache_size_bytes: u64,
+    region_id: RegionId,
+) -> BoxedRecordBatchStream {
+    use std::time::Instant;
+
+    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+
+    use crate::region::options::MergeMode;
+
+    let cache_manager = Arc::new(
+        crate::cache::CacheManager::builder()
+            .range_result_cache_size(cache_size_bytes) // only the range result cache size is configured here
+            .build(),
+    );
+    let cache_strategy = CacheStrategy::EnableAll(cache_manager);
+
+    let fingerprint = ScanRequestFingerprintBuilder { // placeholder fingerprint: empty columns and filters
+        read_column_ids: vec![],
+        read_column_types: vec![],
+        filters: vec![],
+        time_filters: vec![],
+        series_row_selector: None,
+        append_mode: false,
+        filter_deleted: false,
+        merge_mode: MergeMode::LastRow,
+        partition_expr_version: 0,
+    }
+    .build();
+
+    let key = RangeScanCacheKey { // every call for the same `region_id` builds an equal-looking key
+        region_id,
+        row_groups: vec![],
+        scan: fingerprint,
+    };
+
+    let metrics_set = ExecutionPlanMetricsSet::new();
+    let part_metrics = // NOTE(review): `0`, "bench" and `false` look like placeholder arguments — confirm against `PartitionMetrics::new`
+        PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set);
+
+    cache_flat_range_stream(stream, cache_strategy, key, part_metrics)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+    use std::time::Instant;
+
+    use common_time::Timestamp;
+    use common_time::range::TimestampRange;
+    use common_time::timestamp::TimeUnit;
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{Expr, col, lit};
+    use smallvec::smallvec;
+    use store_api::storage::FileId;
+
+    use super::*;
+    use crate::cache::CacheManager;
+    use crate::read::projection::ProjectionMapper;
+    use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex};
+    use crate::read::scan_region::{PredicateGroup, ScanInput};
+    use crate::test_util::memtable_util::metadata_with_primary_key;
+    use crate::test_util::scheduler_util::SchedulerEnv;
+    use crate::test_util::sst_util::sst_file_handle_with_file_id;
+
+    fn test_cache_strategy() -> CacheStrategy { // `EnableAll` strategy backed by a small range result cache
+        CacheStrategy::EnableAll(Arc::new(
+            CacheManager::builder()
+                .range_result_cache_size(1024) // presumably bytes (the bench helper passes `cache_size_bytes` here)
+                .build(),
+        ))
+    }
+
+    async fn new_stream_context( // builds a `StreamContext` with one file and one range, plus that range's `PartitionRange`
+        filters: Vec<Expr>,
+        query_time_range: Option<TimestampRange>,
+        partition_time_range: FileTimeRange,
+    ) -> (StreamContext, PartitionRange) {
+        let env = SchedulerEnv::new().await;
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap();
+        let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();
+        let file_id = FileId::random();
+        let file = sst_file_handle_with_file_id( // file handle whose time bounds equal the partition time range
+            file_id,
+            partition_time_range.0.value(),
+            partition_time_range.1.value(),
+        );
+        let input = ScanInput::new(env.access_layer.clone(), mapper)
+            .with_predicate(predicate)
+            .with_time_range(query_time_range)
+            .with_files(vec![file])
+            .with_cache(test_cache_strategy())
+            .with_flat_format(true); // flat format, a file and an `EnableAll` cache are what `build_scan_fingerprint` checks
+        let range_meta = RangeMeta { // single range pointing at source 0 / row group 0
+            time_range: partition_time_range,
+            indices: smallvec![SourceIndex {
+                index: 0,
+                num_row_groups: 1,
+            }],
+            row_group_indices: smallvec![RowGroupIndex {
+                index: 0,
+                row_group_index: 0,
+            }],
+            num_rows: 10,
+        };
+        let partition_range = range_meta.new_partition_range(0);
+        let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input); // `None` if the filters make the scan ineligible
+        let stream_ctx = StreamContext {
+            input,
+            ranges: vec![range_meta],
+            scan_fingerprint,
+            query_start: Instant::now(),
+        };
+
+        (stream_ctx, partition_range)
+    }
+
+    /// Helper to create a timestamp millisecond literal expression without a timezone.
+    fn ts_lit(val: i64) -> Expr {
+        lit(ScalarValue::TimestampMillisecond(Some(val), None)) // second field (`None`) is the timezone
+    }
+
+    #[tokio::test]
+    async fn strips_time_only_filters_when_query_covers_partition_range() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(1000)),
+                col("ts").lt(ts_lit(2001)),
+                col("ts").is_not_null(), // time-only predicate that the test expects to stay in `filters`
+                col("k0").eq(lit("foo")), // tag filter
+            ],
+            TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond), // query range 1000..2002
+            ( // partition range 1000..=2000, inside the query range
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Range-reducible time filters should be cleared when query covers partition range.
+        assert!(key.scan.time_filters().is_empty());
+        // Non-range time predicates stay in filters.
+        let mut expected_filters = [
+            col("k0").eq(lit("foo")).to_string(),
+            col("ts").is_not_null().to_string(),
+        ];
+        expected_filters.sort_unstable(); // the fingerprint stores its filters sorted
+        assert_eq!(key.scan.filters(), expected_filters.as_slice());
+    }
+
+    #[tokio::test]
+    async fn preserves_time_filters_when_query_does_not_cover_partition_range() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))],
+            TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond), // query range ends before the partition range does
+            ( // partition range 1000..=2000
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Time filters should be preserved when query does not cover partition range.
+        assert_eq!(
+            key.scan.time_filters(),
+            [col("ts").gt_eq(ts_lit(1000)).to_string()].as_slice()
+        );
+        assert_eq!( // the tag filter is kept separately in `filters`
+            key.scan.filters(),
+            [col("k0").eq(lit("foo")).to_string()].as_slice()
+        );
+    }
+
+    #[tokio::test]
+    async fn strips_time_only_filters_when_query_has_no_time_range_limit() {
+        let (stream_ctx, part_range) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(1000)),
+                col("ts").is_not_null(), // time-only predicate that the test expects to stay in `filters`
+                col("k0").eq(lit("foo")), // tag filter
+            ],
+            None, // no query time range is set on the scan input
+            ( // partition range 1000..=2000
+                Timestamp::new_millisecond(1000),
+                Timestamp::new_millisecond(2000),
+            ),
+        )
+        .await;
+
+        let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();
+
+        // Range-reducible time filters should be cleared when query has no time range limit.
+        assert!(key.scan.time_filters().is_empty());
+        // Non-range time predicates stay in filters.
+        let mut expected_filters = [
+            col("k0").eq(lit("foo")).to_string(),
+            col("ts").is_not_null().to_string(),
+        ];
+        expected_filters.sort_unstable(); // the fingerprint stores its filters sorted
+        assert_eq!(key.scan.filters(), expected_filters.as_slice());
+    }
+
+    #[test]
+    fn normalizes_and_clears_time_filters() {
+        let normalized = ScanRequestFingerprintBuilder { // case 1: built with no time filters at all
+            read_column_ids: vec![1, 2],
+            read_column_types: vec![None, None],
+            filters: vec!["k0 = 'foo'".to_string()],
+            time_filters: vec![],
+            series_row_selector: None,
+            append_mode: false,
+            filter_deleted: true,
+            merge_mode: MergeMode::LastRow,
+            partition_expr_version: 0,
+        }
+        .build();
+
+        assert!(normalized.time_filters().is_empty());
+
+        let fingerprint = ScanRequestFingerprintBuilder { // case 2: one time filter plus non-default selector and version
+            read_column_ids: vec![1, 2],
+            read_column_types: vec![None, None],
+            filters: vec!["k0 = 'foo'".to_string()],
+            time_filters: vec!["ts >= 1000".to_string()],
+            series_row_selector: Some(TimeSeriesRowSelector::LastRow),
+            append_mode: false,
+            filter_deleted: true,
+            merge_mode: MergeMode::LastRow,
+            partition_expr_version: 7,
+        }
+        .build();
+
+        let reset = fingerprint.without_time_filters();
+
+        assert_eq!(reset.read_column_ids(), fingerprint.read_column_ids()); // everything except the time filters must be unchanged
+        assert_eq!(reset.read_column_types(), fingerprint.read_column_types());
+        assert_eq!(reset.filters(), fingerprint.filters());
+        assert!(reset.time_filters().is_empty()); // the only field expected to differ
+        assert_eq!(reset.series_row_selector, fingerprint.series_row_selector);
+        assert_eq!(reset.append_mode, fingerprint.append_mode);
+        assert_eq!(reset.filter_deleted, fingerprint.filter_deleted);
+        assert_eq!(reset.merge_mode, fingerprint.merge_mode);
+        assert_eq!(
+            reset.partition_expr_version,
+            fingerprint.partition_expr_version
+        );
+    }
+
+    /// Creates a test schema with 5 non-nullable columns where the primary key dictionary column
+    /// is at index 2 (`num_columns - 3`), matching the flat format layout.
+    ///
+    /// Layout: `[field0: Int64, field1: Int64, pk: Dictionary<UInt32,Binary>, ts: Int64, seq: Int64]`
+    fn dict_test_schema() -> Arc<datatypes::arrow::datatypes::Schema> {
+        use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};
+        Arc::new(Schema::new(vec![
+            Field::new("field0", ArrowDataType::Int64, false),
+            Field::new("field1", ArrowDataType::Int64, false),
+            Field::new(
+                "pk",
+                ArrowDataType::Dictionary( // UInt32 keys into Binary values
+                    Box::new(ArrowDataType::UInt32),
+                    Box::new(ArrowDataType::Binary),
+                ),
+                false,
+            ),
+            Field::new("ts", ArrowDataType::Int64, false), // plain Int64 here, not a timestamp type
+            Field::new("seq", ArrowDataType::Int64, false),
+        ]))
+    }
+
+    /// Helper to create a record batch (panics if invalid) with a dictionary column at the primary key position.
+    fn make_dict_batch(
+        schema: Arc<datatypes::arrow::datatypes::Schema>,
+        dict_values: &datatypes::arrow::array::BinaryArray,
+        keys: &[u32],
+        int_values: &[i64],
+    ) -> RecordBatch {
+        use datatypes::arrow::array::{Int64Array, UInt32Array};
+
+        let key_array = UInt32Array::from(keys.to_vec());
+        let dict_array: DictionaryArray<UInt32Type> =
+            DictionaryArray::new(key_array, Arc::new(dict_values.clone())); // NOTE(review): clone presumably shares the value buffers — confirm
+        let int_array = Int64Array::from(int_values.to_vec());
+        let zeros = Int64Array::from(vec![0i64; int_values.len()]); // filler column sized like `int_values`
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(zeros.clone()), // column 0
+                Arc::new(int_array), // column 1
+                Arc::new(dict_array), // column 2: dictionary column
+                Arc::new(zeros.clone()), // column 3
+                Arc::new(zeros), // column 4
+            ],
+        )
+        .unwrap() // callers pass `keys` and `int_values` of equal length
+    }
+
+    /// Computes the total `get_array_memory_size()` of all `UInt32`-keyed dictionary value arrays in a batch.
+    fn compute_total_dict_values_size(batch: &RecordBatch) -> usize {
+        batch
+            .columns()
+            .iter()
+            .filter_map(|col| { // columns that are not `DictionaryArray<UInt32Type>` are skipped
+                col.as_any()
+                    .downcast_ref::<DictionaryArray<UInt32Type>>()
+                    .map(|dict| dict.values().get_array_memory_size())
+            })
+            .sum()
+    }
+
+    #[test]
+    fn cache_batch_buffer_empty() {
+        let buffer = CacheBatchBuffer::new();
+        assert_eq!(buffer.estimated_batches_size(), 0); // a fresh buffer reports zero size
+        assert!(buffer.into_batches().is_empty()); // and holds no batches
+    }
+
+    #[test]
+    fn cache_batch_buffer_single_batch() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
+        let batch = make_dict_batch(schema, &dict_values, &[0, 1, 2], &[10, 20, 30]);
+
+        let full_size = batch.get_array_memory_size(); // measured before the batch is moved into the buffer
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch);
+        assert_eq!(buffer.estimated_batches_size(), full_size); // the first batch is counted in full
+        assert_eq!(buffer.into_batches().len(), 1);
+    }
+
+    #[test]
+    fn cache_batch_buffer_shared_dictionary() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]);
+
+        // Two batches built from the same dictionary values array.
+        let batch1 = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]);
+        let batch2 = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]);
+
+        let batch1_full = batch1.get_array_memory_size(); // sizes are taken before the batches are moved
+        let batch2_full = batch2.get_array_memory_size();
+
+        // The total dictionary values size that should be deduplicated for the second batch.
+        let dict_values_size = compute_total_dict_values_size(&batch2);
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+
+        // Second batch's dict values should not be counted again.
+        assert_eq!(
+            buffer.estimated_batches_size(),
+            batch1_full + batch2_full - dict_values_size
+        );
+        assert_eq!(buffer.into_batches().len(), 2); // deduplication affects the estimate only; both batches are kept
+    }
+
+    #[test]
+    fn cache_batch_buffer_non_shared_dictionary() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let dict_values1 = BinaryArray::from_vec(vec![b"a", b"b"]); // two independently built value arrays
+        let dict_values2 = BinaryArray::from_vec(vec![b"x", b"y"]);
+
+        let batch1 = make_dict_batch(schema.clone(), &dict_values1, &[0, 1], &[10, 20]);
+        let batch2 = make_dict_batch(schema, &dict_values2, &[0, 1], &[30, 40]);
+
+        let batch1_full = batch1.get_array_memory_size(); // sizes are taken before the batches are moved
+        let batch2_full = batch2.get_array_memory_size();
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+
+        // Different dictionaries: full size for both.
+        assert_eq!(buffer.estimated_batches_size(), batch1_full + batch2_full);
+    }
+
+    #[test]
+    fn cache_batch_buffer_shared_then_diverged() {
+        use datatypes::arrow::array::BinaryArray;
+
+        let schema = dict_test_schema();
+        let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
+        let different_values = BinaryArray::from_vec(vec![b"x", b"y"]);
+
+        let batch1 = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]); // batch1 and batch2 use `shared_values`
+        let batch2 = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]);
+        let batch3 = make_dict_batch(schema, &different_values, &[0], &[3]); // batch3 brings its own dictionary
+
+        let size1 = batch1.get_array_memory_size();
+        let size2 = batch2.get_array_memory_size();
+        let size3 = batch3.get_array_memory_size();
+
+        let dict_values_size = compute_total_dict_values_size(&batch2); // amount expected to be deducted for batch2
+
+        let mut buffer = CacheBatchBuffer::new();
+        buffer.push(batch1);
+        buffer.push(batch2);
+        buffer.push(batch3);
+
+        // batch2 shares dict with batch1 (dedup), batch3 does not (full size).
+        assert_eq!(
+            buffer.estimated_batches_size(),
+            size1 + (size2 - dict_values_size) + size3
+        );
+    }
+}
diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs
index 5d934afd2d..e7cae7e7b8 100644
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -40,7 +40,7 @@ use store_api::region_engine::{PartitionRange, RegionScannerRef};
 use store_api::storage::{
     ColumnId, RegionId, ScanRequest, SequenceRange, TimeSeriesDistribution, TimeSeriesRowSelector,
 };
-use table::predicate::{Predicate, build_time_range_predicate};
+use table::predicate::{Predicate, build_time_range_predicate, extract_time_range_from_expr};
 use tokio::sync::{Semaphore, mpsc};
 use tokio_stream::wrappers::ReceiverStream;
 
@@ -55,6 +55,7 @@ use crate::metrics::READ_SST_COUNT;
 use crate::read::compat::{self, CompatBatch, FlatCompatBatch, PrimaryKeyCompatBatch};
 use crate::read::projection::ProjectionMapper;
 use crate::read::range::{FileRangeBuilder, MemRangeBuilder, RangeMeta, RowGroupIndex};
+use crate::read::range_cache::ScanRequestFingerprint;
 use crate::read::seq_scan::SeqScan;
 use crate::read::series_scan::SeriesScan;
 use crate::read::stream::ScanBatchStream;
@@ -815,7 +816,7 @@ pub struct ScanInput {
     /// But this read columns might also include non-projected columns needed for filtering.
     pub(crate) read_column_ids: Vec<ColumnId>,
     /// Time range filter for time index.
-    time_range: Option<TimestampRange>,
+    pub(crate) time_range: Option<TimestampRange>,
     /// Predicate to push down.
     pub(crate) predicate: PredicateGroup,
     /// Region partition expr applied at read time.
@@ -1417,6 +1418,105 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
     }
 }
 
+/// Builds the [ScanRequestFingerprint] for a [ScanInput], or returns `None` when the scan is
+/// not eligible for partition range caching or none of its filters references a tag column.
+pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
+    let eligible = input.flat_format // eligible: flat format, not compaction, at least one file, `EnableAll` cache
+        && !input.compaction
+        && !input.files.is_empty()
+        && matches!(input.cache_strategy, CacheStrategy::EnableAll(_));
+
+    if !eligible {
+        return None;
+    }
+
+    let metadata = input.region_metadata();
+    let tag_names: HashSet<&str> = metadata // names of all tag columns in the region
+        .column_metadatas
+        .iter()
+        .filter(|col| col.semantic_type == SemanticType::Tag)
+        .map(|col| col.column_schema.name.as_str())
+        .collect();
+
+    let time_index = metadata.time_index_column();
+    let time_index_name = time_index.column_schema.name.clone();
+    let ts_col_unit = time_index
+        .column_schema
+        .data_type
+        .as_timestamp()
+        .expect("Time index must have timestamp-compatible type") // panics otherwise
+        .unit();
+
+    let exprs = input
+        .predicate_group()
+        .predicate_without_region() // NOTE(review): presumably excludes the region partition expr — confirm
+        .map(|predicate| predicate.exprs())
+        .unwrap_or_default(); // no predicate means no exprs to classify
+
+    let mut filters = Vec::new();
+    let mut time_filters = Vec::new();
+    let mut has_tag_filter = false;
+    let mut columns = HashSet::new(); // scratch set, cleared and reused for every expr
+
+    for expr in exprs {
+        columns.clear();
+        let is_time_only = match expr_to_columns(expr, &mut columns) {
+            Ok(()) if !columns.is_empty() => {
+                has_tag_filter |= columns // remember whether any expr touches a tag column
+                    .iter()
+                    .any(|col| tag_names.contains(col.name.as_str()));
+                columns.iter().all(|col| col.name == time_index_name) // true only if the time index is the sole column used
+            }
+            _ => false, // column extraction failed or the expr references no column
+        };
+
+        if is_time_only
+            && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some()
+        {
+            // Range-reducible time predicates can be safely dropped from the
+            // cache key when the query time range covers the partition range.
+            time_filters.push(expr.to_string());
+        } else {
+            // Non-time filters and non-range time predicates (those that
+            // extract_time_range_from_expr cannot convert to a TimestampRange)
+            // always stay in the cache key.
+            filters.push(expr.to_string());
+        }
+    }
+
+    if !has_tag_filter {
+        // We only cache requests that have tag filters to avoid caching all series.
+        return None;
+    }
+
+    // Ensure the filters are sorted for consistent fingerprinting.
+    filters.sort_unstable();
+    time_filters.sort_unstable();
+
+    Some(
+        crate::read::range_cache::ScanRequestFingerprintBuilder {
+            read_column_ids: input.read_column_ids.clone(),
+            read_column_types: input // one entry per read column id, `None` when the id is not in the metadata
+                .read_column_ids
+                .iter()
+                .map(|id| {
+                    metadata
+                        .column_by_id(*id)
+                        .map(|col| col.column_schema.data_type.clone())
+                })
+                .collect(),
+            filters,
+            time_filters,
+            series_row_selector: input.series_row_selector,
+            append_mode: input.append_mode,
+            filter_deleted: input.filter_deleted,
+            merge_mode: input.merge_mode,
+            partition_expr_version: metadata.partition_expr_version,
+        }
+        .build(),
+    )
+}
+
 /// Context shared by different streams from a scanner.
 /// It contains the input and ranges to scan.
 pub struct StreamContext {
@@ -1424,6 +1524,10 @@ pub struct StreamContext {
     pub input: ScanInput,
     /// Metadata for partition ranges.
     pub(crate) ranges: Vec<RangeMeta>,
+    /// Precomputed scan fingerprint for partition range caching.
+    /// `None` when the scan is not eligible for caching.
+    #[allow(dead_code)]
+    pub(crate) scan_fingerprint: Option<ScanRequestFingerprint>,
 
     // Metrics:
     /// The start time of the query.
@@ -1436,10 +1540,12 @@ impl StreamContext {
         let query_start = input.query_start.unwrap_or_else(Instant::now);
         let ranges = RangeMeta::seq_scan_ranges(&input);
         READ_SST_COUNT.observe(input.num_files() as f64);
+        let scan_fingerprint = build_scan_fingerprint(&input);
 
         Self {
             input,
             ranges,
+            scan_fingerprint,
             query_start,
         }
     }
@@ -1449,10 +1555,12 @@ impl StreamContext {
         let query_start = input.query_start.unwrap_or_else(Instant::now);
         let ranges = RangeMeta::unordered_scan_ranges(&input);
         READ_SST_COUNT.observe(input.num_files() as f64);
+        let scan_fingerprint = build_scan_fingerprint(&input);
 
         Self {
             input,
             ranges,
+            scan_fingerprint,
             query_start,
         }
     }
@@ -1762,11 +1870,17 @@ mod tests {
     use std::sync::Arc;
 
     use datafusion::physical_plan::expressions::lit as physical_lit;
+    use datafusion_common::ScalarValue;
     use datafusion_expr::{col, lit};
-    use store_api::storage::ScanRequest;
+    use datatypes::value::Value;
+    use partition::expr::col as partition_col;
+    use store_api::metadata::RegionMetadataBuilder;
+    use store_api::storage::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
 
     use super::*;
+    use crate::cache::CacheManager;
     use crate::memtable::time_partition::TimePartitions;
+    use crate::read::range_cache::ScanRequestFingerprintBuilder;
     use crate::region::options::RegionOptions;
     use crate::region::version::VersionBuilder;
     use crate::sst::FormatType;
@@ -1804,6 +1918,26 @@ mod tests {
         )
     }
 
+    async fn new_scan_input(metadata: RegionMetadataRef, filters: Vec<Expr>) -> ScanInput { // cache-eligible input: flat format, one file, `EnableAll`
+        let env = SchedulerEnv::new().await;
+        let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap();
+        let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();
+        let file = FileHandle::new( // placeholder file built from default metadata and a no-op purger
+            crate::sst::file::FileMeta::default(),
+            Arc::new(crate::sst::file_purger::NoopFilePurger),
+        );
+
+        ScanInput::new(env.access_layer.clone(), mapper)
+            .with_predicate(predicate)
+            .with_cache(CacheStrategy::EnableAll(Arc::new(
+                CacheManager::builder()
+                    .range_result_cache_size(1024) // small range result cache; unit not visible here
+                    .build(),
+            )))
+            .with_flat_format(true)
+            .with_files(vec![file]) // `build_scan_fingerprint` rejects inputs without files
+    }
+
     #[tokio::test]
     async fn test_build_read_column_ids_includes_filters() {
         let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
@@ -1923,6 +2057,138 @@ mod tests {
         assert!(scan_region.use_flat_format());
     }
 
+    /// Helper to create a timestamp millisecond literal expression without a timezone.
+    fn ts_lit(val: i64) -> datafusion_expr::Expr {
+        lit(ScalarValue::TimestampMillisecond(Some(val), None)) // second field (`None`) is the timezone
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_for_eligible_scan() {
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let input = new_scan_input(
+            metadata.clone(),
+            vec![
+                col("ts").gt_eq(ts_lit(1000)), // expected to land in `time_filters`
+                col("k0").eq(lit("foo")), // tag filter, required for a fingerprint to be built
+                col("v0").gt(lit(1)), // expected to land in `filters`
+            ],
+        )
+        .await
+        .with_distribution(Some(TimeSeriesDistribution::PerSeries)) // not part of the expected fingerprint below
+        .with_series_row_selector(Some(TimeSeriesRowSelector::LastRow))
+        .with_merge_mode(MergeMode::LastNonNull)
+        .with_filter_deleted(false);
+
+        let fingerprint = build_scan_fingerprint(&input).unwrap();
+
+        let expected = ScanRequestFingerprintBuilder {
+            read_column_ids: input.read_column_ids.clone(),
+            read_column_types: vec![ // types of column ids 0, 2 and 3
+                metadata
+                    .column_by_id(0)
+                    .map(|col| col.column_schema.data_type.clone()),
+                metadata
+                    .column_by_id(2)
+                    .map(|col| col.column_schema.data_type.clone()),
+                metadata
+                    .column_by_id(3)
+                    .map(|col| col.column_schema.data_type.clone()),
+            ],
+            filters: vec![ // already in sorted order
+                col("k0").eq(lit("foo")).to_string(),
+                col("v0").gt(lit(1)).to_string(),
+            ],
+            time_filters: vec![col("ts").gt_eq(ts_lit(1000)).to_string()],
+            series_row_selector: Some(TimeSeriesRowSelector::LastRow),
+            append_mode: false,
+            filter_deleted: false,
+            merge_mode: MergeMode::LastNonNull,
+            partition_expr_version: 0,
+        }
+        .build();
+        assert_eq!(expected, fingerprint);
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_requires_tag_filter() {
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let input = new_scan_input(
+            metadata,
+            vec![col("ts").gt_eq(lit(1000)), col("v0").gt(lit(1))], // filters on `ts` and `v0` only, no `k0` tag filter
+        )
+        .await;
+
+        assert!(build_scan_fingerprint(&input).is_none()); // no tag filter means no fingerprint
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_respects_scan_eligibility() {
+        let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
+        let filters = vec![col("k0").eq(lit("foo"))]; // tag filter, so only the eligibility checks can reject
+
+        let disabled = ScanInput::new( // built without `with_cache` and without files
+            SchedulerEnv::new().await.access_layer.clone(),
+            ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap(),
+        )
+        .with_predicate(PredicateGroup::new(metadata.as_ref(), &filters).unwrap())
+        .with_flat_format(true);
+        assert!(build_scan_fingerprint(&disabled).is_none());
+
+        let non_flat = new_scan_input(metadata.clone(), filters.clone()) // flat format switched off
+            .await
+            .with_flat_format(false);
+        assert!(build_scan_fingerprint(&non_flat).is_none());
+
+        let compaction = new_scan_input(metadata.clone(), filters.clone()) // compaction scans are excluded
+            .await
+            .with_compaction(true);
+        assert!(build_scan_fingerprint(&compaction).is_none());
+
+        // No files to read.
+        let no_files = new_scan_input(metadata, filters).await.with_files(vec![]);
+        assert!(build_scan_fingerprint(&no_files).is_none());
+    }
+
+    #[tokio::test]
+    async fn test_build_scan_fingerprint_tracks_schema_and_partition_expr_changes() {
+        let base = metadata_with_primary_key(vec![0, 1], false);
+        let mut builder = RegionMetadataBuilder::from_existing(base);
+        let partition_expr = partition_col("k0") // partition expr `k0 >= "foo"` serialized as JSON
+            .gt_eq(Value::String("foo".into()))
+            .as_json_str()
+            .unwrap();
+        builder.partition_expr_json(Some(partition_expr));
+        let metadata = Arc::new(builder.build_without_validation().unwrap());
+
+        let input = new_scan_input(metadata.clone(), vec![col("k0").eq(lit("foo"))]).await;
+        let fingerprint = build_scan_fingerprint(&input).unwrap();
+
+        let expected = ScanRequestFingerprintBuilder {
+            read_column_ids: input.read_column_ids.clone(),
+            read_column_types: vec![ // types of column ids 0, 2 and 3
+                metadata
+                    .column_by_id(0)
+                    .map(|col| col.column_schema.data_type.clone()),
+                metadata
+                    .column_by_id(2)
+                    .map(|col| col.column_schema.data_type.clone()),
+                metadata
+                    .column_by_id(3)
+                    .map(|col| col.column_schema.data_type.clone()),
+            ],
+            filters: vec![col("k0").eq(lit("foo")).to_string()],
+            time_filters: vec![],
+            series_row_selector: None,
+            append_mode: false,
+            filter_deleted: true,
+            merge_mode: MergeMode::LastRow,
+            partition_expr_version: metadata.partition_expr_version, // taken from the metadata, not hard-coded
+        }
+        .build();
+        assert_eq!(expected, fingerprint);
+        assert_ne!(0, metadata.partition_expr_version); // guards against the version silently staying at 0
+    }
+
     #[test]
     fn test_update_dyn_filters_with_empty_base_predicates() {
         let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs
index 0ee6a4437d..9bf1c17276 100644
--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -247,6 +247,12 @@ pub(crate) struct ScanMetricsSet {
     num_range_builders: isize,
     /// Peak number of file range builders.
     num_peak_range_builders: isize,
+    /// Total bytes added to the range cache during this scan.
+    range_cache_size: usize,
+    /// Number of range cache hits during this scan.
+    range_cache_hit: usize,
+    /// Number of range cache misses during this scan.
+    range_cache_miss: usize,
 }
 
 /// Wrapper for file metrics that compares by total cost in reverse order.
@@ -345,6 +351,9 @@ impl fmt::Debug for ScanMetricsSet {
             build_ranges_peak_mem_size,
             num_range_builders: _,
             num_peak_range_builders,
+            range_cache_size,
+            range_cache_hit,
+            range_cache_miss,
         } = self;
 
         // Write core metrics
@@ -590,6 +599,16 @@ impl fmt::Debug for ScanMetricsSet {
             write!(f, "}}")?;
         }
 
+        if *range_cache_size > 0 {
+            write!(f, ", \"range_cache_size\":{range_cache_size}")?;
+        }
+        if *range_cache_hit > 0 {
+            write!(f, ", \"range_cache_hit\":{range_cache_hit}")?;
+        }
+        if *range_cache_miss > 0 {
+            write!(f, ", \"range_cache_miss\":{range_cache_miss}")?;
+        }
+
         write!(
             f,
             ", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \
@@ -1097,6 +1116,27 @@ impl PartitionMetrics {
     pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
         self.0.clone()
     }
+
+    /// Adds `size` bytes to the scan's `range_cache_size` total while holding the metrics lock.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_size(&self, size: usize) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_size += size;
+    }
+
+    /// Records one range cache hit by bumping `range_cache_hit` while holding the metrics lock.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_hit(&self) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_hit += 1;
+    }
+
+    /// Records one range cache miss by bumping `range_cache_miss` while holding the metrics lock.
+    #[allow(dead_code)]
+    pub(crate) fn inc_range_cache_miss(&self) {
+        let mut metrics = self.0.metrics.lock().unwrap();
+        metrics.range_cache_miss += 1;
+    }
 }
 
 impl fmt::Debug for PartitionMetrics {
@@ -1493,7 +1533,7 @@ pub fn build_flat_file_range_scan_stream(
                 .transpose()?;
 
             let mapper = range.compaction_projection_mapper();
-            while let Some(record_batch) = reader.next_batch()? {
+            while let Some(record_batch) = reader.next_batch().await? {
                 let record_batch = if let Some(mapper) = mapper {
                     let batch = mapper.project(record_batch)?;
                     batch
diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs
index c13b40d111..a1b3b8f350 100644
--- a/src/mito2/src/read/seq_scan.rs
+++ b/src/mito2/src/read/seq_scan.rs
@@ -39,7 +39,7 @@ use crate::error::{PartitionOutOfRangeSnafu, Result, TooManyFilesToReadSnafu, Un
 use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
 use crate::read::flat_dedup::{FlatDedupReader, FlatLastNonNull, FlatLastRow};
 use crate::read::flat_merge::FlatMergeReader;
-use crate::read::last_row::LastRowReader;
+use crate::read::last_row::{FlatLastRowReader, LastRowReader};
 use crate::read::merge::MergeReaderBuilder;
 use crate::read::pruner::{PartitionPruner, Pruner};
 use crate::read::range::RangeMeta;
@@ -128,28 +128,6 @@ impl SeqScan {
         Ok(Box::pin(futures::stream::iter(streams).flatten()))
     }
 
-    /// Builds a [BoxedBatchReader] from sequential scan for compaction.
-    ///
-    /// # Panics
-    /// Panics if the compaction flag is not set.
-    pub async fn build_reader_for_compaction(&self) -> Result<BoxedBatchReader> {
-        assert!(self.stream_ctx.input.compaction);
-
-        let metrics_set = ExecutionPlanMetricsSet::new();
-        let part_metrics = self.new_partition_metrics(false, &metrics_set, 0);
-        debug_assert_eq!(1, self.properties.partitions.len());
-        let partition_ranges = &self.properties.partitions[0];
-
-        let reader = Self::merge_all_ranges_for_compaction(
-            &self.stream_ctx,
-            partition_ranges,
-            &part_metrics,
-            self.pruner.clone(),
-        )
-        .await?;
-        Ok(Box::new(reader))
-    }
-
     /// Builds a [BoxedRecordBatchStream] from sequential scan for flat format compaction.
     ///
     /// # Panics
@@ -172,40 +150,6 @@ impl SeqScan {
         Ok(reader)
     }
 
-    /// Builds a merge reader that reads all ranges.
-    /// Callers MUST not split ranges before calling this method.
-    async fn merge_all_ranges_for_compaction(
-        stream_ctx: &Arc<StreamContext>,
-        partition_ranges: &[PartitionRange],
-        part_metrics: &PartitionMetrics,
-        pruner: Arc<Pruner>,
-    ) -> Result<BoxedBatchReader> {
-        pruner.add_partition_ranges(partition_ranges);
-        let partition_pruner = Arc::new(PartitionPruner::new(pruner, partition_ranges));
-
-        let mut sources = Vec::new();
-        for part_range in partition_ranges {
-            build_sources(
-                stream_ctx,
-                part_range,
-                true,
-                part_metrics,
-                partition_pruner.clone(),
-                &mut sources,
-                None,
-            )
-            .await?;
-        }
-
-        common_telemetry::debug!(
-            "Build reader to read all parts, region_id: {}, num_part_ranges: {}, num_sources: {}",
-            stream_ctx.input.mapper.metadata().region_id,
-            partition_ranges.len(),
-            sources.len()
-        );
-        Self::build_reader_from_sources(stream_ctx, sources, None, None).await
-    }
-
     /// Builds a merge reader that reads all flat ranges.
     /// Callers MUST not split ranges before calling this method.
     async fn merge_all_flat_ranges_for_compaction(
@@ -345,6 +289,13 @@ impl SeqScan {
             Box::pin(reader.into_stream()) as _
         };
 
+        let reader = match &stream_ctx.input.series_row_selector {
+            Some(TimeSeriesRowSelector::LastRow) => {
+                Box::pin(FlatLastRowReader::new(reader).into_stream()) as _
+            }
+            None => reader,
+        };
+
         Ok(reader)
     }
 
diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs
index dd85616241..80002147ea 100644
--- a/src/mito2/src/read/stream.rs
+++ b/src/mito2/src/read/stream.rs
@@ -99,7 +99,8 @@ impl ConvertBatchStream {
                         let mapper = self.projection_mapper.as_flat().unwrap();
 
                         for batch in flat_batch.batches {
-                            self.pending.push_back(mapper.convert(&batch)?);
+                            self.pending
+                                .push_back(mapper.convert(&batch, &self.cache_strategy)?);
                         }
                     }
                 }
@@ -114,7 +115,7 @@ impl ConvertBatchStream {
                 // Safety: Only flat format returns this batch.
                 let mapper = self.projection_mapper.as_flat().unwrap();
 
-                mapper.convert(&df_record_batch)
+                mapper.convert(&df_record_batch, &self.cache_strategy)
             }
         }
     }
diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs
index de8927c4de..3020c9ecf4 100644
--- a/src/mito2/src/region.rs
+++ b/src/mito2/src/region.rs
@@ -973,8 +973,23 @@ impl ManifestContext {
 
             // This is an edit from flush.
             if let Some(flushed_entry_id) = edit.flushed_entry_id {
+                // A flush edit is valid after truncate in two cases:
+                // 1. `flushed_entry_id` moves past `truncated_entry_id`, meaning it definitely
+                //    flushed data newer than the truncate point.
+                // 2. `flushed_entry_id` equals `truncated_entry_id`, but `flushed_sequence`
+                //    increases. This happens in skip-WAL tables where entry id can stay at 0,
+                //    while sequence still advances for post-truncate writes.
+                //
+                // We still reject stale flushes from before truncate: the entry id is older,
+                // or it is equal but the sequence is missing or does not advance.
+                let is_newer_entry = truncated_entry_id < flushed_entry_id;
+                let is_same_entry_with_newer_sequence = truncated_entry_id == flushed_entry_id
+                    && edit.flushed_sequence.is_some_and(|flushed_sequence| {
+                        manifest.flushed_sequence < flushed_sequence
+                    });
+
                 ensure!(
-                    truncated_entry_id < flushed_entry_id,
+                    is_newer_entry || is_same_entry_with_newer_sequence,
                     RegionTruncatedSnafu {
                         region_id: manifest.metadata.region_id,
                     }
diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs
index 014c50820f..d089493f81 100644
--- a/src/mito2/src/region/opener.rs
+++ b/src/mito2/src/region/opener.rs
@@ -1043,7 +1043,7 @@ async fn preload_parquet_meta_cache_for_files(
         let loader = MetadataLoader::new(object_store.clone(), &file_path, file_size);
         match loader.load(&mut cache_metrics).await {
             Ok(metadata) => {
-                cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata));
+                cache_manager.put_parquet_meta_data(file_id, Arc::new(metadata), None);
                 loaded += 1;
             }
             Err(err) => {
@@ -1153,6 +1153,8 @@ mod tests {
     use object_store::ObjectStore;
     use object_store::services::{Fs, Memory};
     use parquet::arrow::ArrowWriter;
+    use parquet::file::metadata::KeyValue;
+    use parquet::file::properties::WriterProperties;
     use store_api::region_request::PathType;
     use store_api::storage::{FileId, RegionId};
 
@@ -1161,7 +1163,27 @@ mod tests {
     use crate::cache::file_cache::{FileType, IndexKey};
     use crate::sst::file::{FileHandle, FileMeta};
     use crate::sst::file_purger::NoopFilePurger;
+    use crate::sst::parquet::PARQUET_METADATA_KEY;
     use crate::test_util::TestEnv;
+    use crate::test_util::sst_util::sst_region_metadata;
+    /// Encodes `batch` as in-memory parquet bytes tagged with the test region metadata JSON.
+    fn sst_parquet_bytes(batch: &RecordBatch) -> Vec<u8> {
+        let key_value_meta = KeyValue::new(
+            PARQUET_METADATA_KEY.to_string(),
+            sst_region_metadata().to_json().unwrap(),
+        );
+        let props = WriterProperties::builder()
+            .set_key_value_metadata(Some(vec![key_value_meta]))
+            .build();
+
+        let mut parquet_bytes = Vec::new();
+        let mut writer =
+            ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), Some(props)).unwrap();
+        writer.write(batch).unwrap();
+        writer.close().unwrap();
+
+        parquet_bytes
+    }
 
     #[tokio::test]
     async fn test_preload_parquet_meta_cache_uses_file_cache() {
@@ -1183,10 +1205,7 @@ mod tests {
 
         let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
         let batch = RecordBatch::try_from_iter([("col", col)]).unwrap();
-        let mut parquet_bytes = Vec::new();
-        let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap();
-        writer.write(&batch).unwrap();
-        writer.close().unwrap();
+        let parquet_bytes = sst_parquet_bytes(&batch);
         let file_size = parquet_bytes.len() as u64;
 
         let file_meta = FileMeta {
@@ -1334,10 +1353,7 @@ mod tests {
 
         let col = Arc::new(Int64Array::from_iter_values([1, 2, 3])) as ArrayRef;
         let batch = RecordBatch::try_from_iter([("col", col)]).unwrap();
-        let mut parquet_bytes = Vec::new();
-        let mut writer = ArrowWriter::try_new(&mut parquet_bytes, batch.schema(), None).unwrap();
-        writer.write(&batch).unwrap();
-        writer.close().unwrap();
+        let parquet_bytes = sst_parquet_bytes(&batch);
 
         // file_size is 0 when it's missing/defaulted in manifests; MetadataLoader::load will stat
         // the local filesystem to retrieve it.
diff --git a/src/mito2/src/region/options.rs b/src/mito2/src/region/options.rs
index 0fe0a8f12a..fcf68a9216 100644
--- a/src/mito2/src/region/options.rs
+++ b/src/mito2/src/region/options.rs
@@ -50,7 +50,7 @@ pub(crate) fn parse_wal_options(
 }
 
 /// Mode to handle duplicate rows while merging.
-#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, EnumString)]
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, EnumString)]
 #[serde(rename_all = "snake_case")]
 #[strum(serialize_all = "snake_case")]
 pub enum MergeMode {
diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs
index 78e4c563b1..94bc1feea8 100644
--- a/src/mito2/src/sst.rs
+++ b/src/mito2/src/sst.rs
@@ -31,7 +31,6 @@ use store_api::storage::consts::{
     OP_TYPE_COLUMN_NAME, PRIMARY_KEY_COLUMN_NAME, SEQUENCE_COLUMN_NAME,
 };
 
-use crate::read::Batch;
 use crate::sst::parquet::flat_format::time_index_column_index;
 
 pub mod file;
@@ -260,33 +259,6 @@ pub(crate) struct SeriesEstimator {
 }
 
 impl SeriesEstimator {
-    /// Updates the estimator with a new Batch.
-    ///
-    /// Since each Batch contains only one series, this increments the series count
-    /// and updates the last timestamp.
-    pub(crate) fn update(&mut self, batch: &Batch) {
-        let Some(last_ts) = batch.last_timestamp() else {
-            return;
-        };
-
-        // Checks if there's a boundary between the last batch and this batch
-        if let Some(prev_last_ts) = self.last_timestamp {
-            // If the first timestamp of this batch is less than the last timestamp
-            // we've seen, it indicates a new series
-            if let Some(first_ts) = batch.first_timestamp()
-                && first_ts.value() <= prev_last_ts
-            {
-                self.series_count += 1;
-            }
-        } else {
-            // First batch, counts as first series
-            self.series_count = 1;
-        }
-
-        // Updates the last timestamp
-        self.last_timestamp = Some(last_ts.value());
-    }
-
     /// Updates the estimator with a new record batch in flat format.
     ///
     /// This method examines the time index column to detect series boundaries.
@@ -340,43 +312,14 @@ impl SeriesEstimator {
 mod tests {
     use std::sync::Arc;
 
-    use api::v1::OpType;
     use datatypes::arrow::array::{
-        BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt8Builder,
-        UInt32Array, UInt64Array,
+        BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
+        UInt64Array,
     };
     use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
     use datatypes::arrow::record_batch::RecordBatch;
 
     use super::*;
-    use crate::read::{Batch, BatchBuilder};
-
-    fn new_batch(
-        primary_key: &[u8],
-        timestamps: &[i64],
-        sequences: &[u64],
-        op_types: &[OpType],
-    ) -> Batch {
-        let timestamps = Arc::new(TimestampMillisecondArray::from(timestamps.to_vec()));
-        let sequences = Arc::new(UInt64Array::from(sequences.to_vec()));
-        let mut op_type_builder = UInt8Builder::with_capacity(op_types.len());
-        for op_type in op_types {
-            op_type_builder.append_value(*op_type as u8);
-        }
-        let op_types = Arc::new(UInt8Array::from(
-            op_types.iter().map(|op| *op as u8).collect::<Vec<_>>(),
-        ));
-
-        let mut builder = BatchBuilder::new(primary_key.to_vec());
-        builder
-            .timestamps_array(timestamps)
-            .unwrap()
-            .sequences_array(sequences)
-            .unwrap()
-            .op_types_array(op_types)
-            .unwrap();
-        builder.build().unwrap()
-    }
 
     fn new_flat_record_batch(timestamps: &[i64]) -> RecordBatch {
         // Flat format has: [fields..., time_index, __primary_key, __sequence, __op_type]
@@ -411,128 +354,6 @@ mod tests {
         RecordBatch::try_new(schema, vec![time_array, pk_array, seq_array, op_array]).unwrap()
     }
 
-    #[test]
-    fn test_series_estimator_empty_batch() {
-        let mut estimator = SeriesEstimator::default();
-        let batch = new_batch(b"test", &[], &[], &[]);
-        estimator.update(&batch);
-        assert_eq!(0, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_single_batch() {
-        let mut estimator = SeriesEstimator::default();
-        let batch = new_batch(
-            b"test",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch);
-        assert_eq!(1, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_multiple_batches_same_series() {
-        let mut estimator = SeriesEstimator::default();
-
-        // First batch with timestamps 1, 2, 3
-        let batch1 = new_batch(
-            b"test",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        // Second batch with timestamps 4, 5, 6 (continuation)
-        let batch2 = new_batch(
-            b"test",
-            &[4, 5, 6],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(1, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_new_series_detected() {
-        let mut estimator = SeriesEstimator::default();
-
-        // First batch with timestamps 1, 2, 3
-        let batch1 = new_batch(
-            b"pk0",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        // Second batch with timestamps 2, 3, 4 (timestamp goes back, new series)
-        let batch2 = new_batch(
-            b"pk1",
-            &[2, 3, 4],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(2, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_equal_timestamp_boundary() {
-        let mut estimator = SeriesEstimator::default();
-
-        // First batch ending at timestamp 5
-        let batch1 = new_batch(
-            b"test",
-            &[1, 2, 5],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        // Second batch starting at timestamp 5 (equal, indicates new series)
-        let batch2 = new_batch(
-            b"test",
-            &[5, 6, 7],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(2, estimator.finish());
-    }
-
-    #[test]
-    fn test_series_estimator_finish_resets_state() {
-        let mut estimator = SeriesEstimator::default();
-
-        let batch1 = new_batch(
-            b"test",
-            &[1, 2, 3],
-            &[1, 2, 3],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch1);
-
-        assert_eq!(1, estimator.finish());
-
-        // After finish, state should be reset
-        let batch2 = new_batch(
-            b"test",
-            &[4, 5, 6],
-            &[4, 5, 6],
-            &[OpType::Put, OpType::Put, OpType::Put],
-        );
-        estimator.update(&batch2);
-
-        assert_eq!(1, estimator.finish());
-    }
-
     #[test]
     fn test_series_estimator_flat_empty_batch() {
         let mut estimator = SeriesEstimator::default();
diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs
index 0df3229e9c..88aebfc001 100644
--- a/src/mito2/src/sst/index.rs
+++ b/src/mito2/src/sst/index.rs
@@ -58,7 +58,7 @@ use crate::error::{
 };
 use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList};
 use crate::metrics::INDEX_CREATE_MEMORY_USAGE;
-use crate::read::{Batch, BatchReader};
+use crate::read::Batch;
 use crate::region::options::IndexOptions;
 use crate::region::version::VersionControlRef;
 use crate::region::{ManifestContextRef, RegionLeaderState};
@@ -802,9 +802,9 @@ impl IndexBuildTask {
         if let Some(mut parquet_reader) = parquet_reader {
             // TODO(SNC123): optimize index batch
             loop {
-                match parquet_reader.next_batch().await {
-                    Ok(Some(mut batch)) => {
-                        indexer.update(&mut batch).await;
+                match parquet_reader.next_record_batch().await {
+                    Ok(Some(batch)) => {
+                        indexer.update_flat(&batch).await;
                     }
                     Ok(None) => break,
                     Err(e) => {
@@ -1227,7 +1227,9 @@ mod tests {
     use crate::sst::parquet::WriteOptions;
     use crate::test_util::memtable_util::EmptyMemtableBuilder;
     use crate::test_util::scheduler_util::SchedulerEnv;
-    use crate::test_util::sst_util::{new_batch_by_range, new_source, sst_region_metadata};
+    use crate::test_util::sst_util::{
+        new_flat_source_from_record_batches, new_record_batch_by_range, sst_region_metadata,
+    };
 
     struct MetaConfig {
         with_inverted: bool,
@@ -1358,19 +1360,20 @@ mod tests {
         env: &SchedulerEnv,
         build_mode: IndexBuildMode,
     ) -> SstInfo {
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         let mut index_config = MitoConfig::default().index;
         index_config.build_mode = build_mode;
         let write_request = SstWriteRequest {
             op_type: OperationType::Flush,
             metadata: metadata.clone(),
-            source: either::Left(source),
+            source,
             storage: None,
             max_sequence: None,
+            sst_write_format: Default::default(),
             cache_manager: Default::default(),
             index_options: IndexOptions::default(),
             index_config,
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index aa98b69176..79a08a209d 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -24,11 +24,13 @@ use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
 use crate::sst::file::FileTimeRange;
 use crate::sst::index::IndexOutput;
 
+pub(crate) mod async_reader;
 pub mod file_range;
 pub mod flat_format;
 pub mod format;
 pub(crate) mod helper;
 pub(crate) mod metadata;
+pub mod prefilter;
 pub mod reader;
 pub mod row_group;
 pub mod row_selection;
@@ -110,6 +112,7 @@ mod tests {
         TimestampMillisecondArray, UInt8Array, UInt64Array,
     };
     use datatypes::arrow::datatypes::{DataType, Field, Schema, UInt32Type};
+    use datatypes::arrow::util::pretty::pretty_format_batches;
     use datatypes::prelude::ConcreteDataType;
     use datatypes::schema::{FulltextAnalyzer, FulltextBackend, FulltextOptions};
     use object_store::ObjectStore;
@@ -129,7 +132,7 @@ mod tests {
     use crate::cache::test_util::assert_parquet_metadata_equal;
     use crate::cache::{CacheManager, CacheStrategy, PageKey};
     use crate::config::IndexConfig;
-    use crate::read::{BatchBuilder, BatchReader, FlatSource};
+    use crate::read::FlatSource;
     use crate::region::options::{IndexOptions, InvertedIndexOptions};
     use crate::sst::file::{FileHandle, FileMeta, RegionFileId, RegionIndexId};
     use crate::sst::file_purger::NoopFilePurger;
@@ -137,19 +140,19 @@ mod tests {
     use crate::sst::index::fulltext_index::applier::builder::FulltextIndexApplierBuilder;
     use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
     use crate::sst::index::{IndexBuildType, Indexer, IndexerBuilder, IndexerBuilderImpl};
-    use crate::sst::parquet::format::PrimaryKeyWriteFormat;
+    use crate::sst::parquet::flat_format::FlatWriteFormat;
     use crate::sst::parquet::reader::{ParquetReader, ParquetReaderBuilder, ReaderMetrics};
     use crate::sst::parquet::writer::ParquetWriter;
     use crate::sst::{
         DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, location, to_flat_sst_arrow_schema,
     };
+    use crate::test_util::TestEnv;
     use crate::test_util::sst_util::{
-        build_test_binary_test_region_metadata, new_batch_by_range, new_batch_with_binary,
-        new_batch_with_custom_sequence, new_primary_key, new_source, new_sparse_primary_key,
-        sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata,
+        build_test_binary_test_region_metadata, new_flat_source_from_record_batches,
+        new_primary_key, new_record_batch_by_range, new_record_batch_with_custom_sequence,
+        new_sparse_primary_key, sst_file_handle, sst_file_handle_with_file_id, sst_region_metadata,
         sst_region_metadata_with_encoding,
     };
-    use crate::test_util::{TestEnv, check_reader_result};
 
     const FILE_DIR: &str = "/";
     const REGION_ID: RegionId = RegionId::new(0, 0);
@@ -191,10 +194,10 @@ mod tests {
             region_file_id: handle.file_id(),
         };
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -214,7 +217,7 @@ mod tests {
         .await;
 
         let info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -235,14 +238,14 @@ mod tests {
             object_store,
         );
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_by_range(&["a", "d"], 0, 50),
-                new_batch_by_range(&["a", "d"], 50, 60),
-                new_batch_by_range(&["b", "f"], 0, 40),
-                new_batch_by_range(&["b", "h"], 100, 150),
-                new_batch_by_range(&["b", "h"], 150, 200),
+                new_record_batch_by_range(&["a", "d"], 0, 50),
+                new_record_batch_by_range(&["a", "d"], 50, 60),
+                new_record_batch_by_range(&["b", "f"], 0, 40),
+                new_record_batch_by_range(&["b", "h"], 100, 150),
+                new_record_batch_by_range(&["b", "h"], 150, 200),
             ],
         )
         .await;
@@ -254,10 +257,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -279,7 +282,7 @@ mod tests {
         .await;
 
         let sst_info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -299,14 +302,14 @@ mod tests {
         .cache(cache.clone());
         for _ in 0..3 {
             let mut reader = builder.build().await.unwrap().unwrap();
-            check_reader_result(
+            check_record_batch_reader_result(
                 &mut reader,
                 &[
-                    new_batch_by_range(&["a", "d"], 0, 50),
-                    new_batch_by_range(&["a", "d"], 50, 60),
-                    new_batch_by_range(&["b", "f"], 0, 40),
-                    new_batch_by_range(&["b", "h"], 100, 150),
-                    new_batch_by_range(&["b", "h"], 150, 200),
+                    new_record_batch_by_range(&["a", "d"], 0, 50),
+                    new_record_batch_by_range(&["a", "d"], 50, 60),
+                    new_record_batch_by_range(&["b", "f"], 0, 40),
+                    new_record_batch_by_range(&["b", "h"], 100, 150),
+                    new_record_batch_by_range(&["b", "h"], 150, 200),
                 ],
             )
             .await;
@@ -340,10 +343,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         let write_opts = WriteOptions {
             row_group_size: 50,
@@ -366,7 +369,7 @@ mod tests {
         .await;
 
         let sst_info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -382,8 +385,12 @@ mod tests {
         .page_index_policy(PageIndexPolicy::Optional);
         let reader = builder.build().await.unwrap().unwrap();
         let reader_metadata = reader.parquet_metadata();
+        let cached_writer_metadata =
+            crate::cache::CachedSstMeta::try_new("test.sst", Arc::unwrap_or_clone(writer_metadata))
+                .unwrap()
+                .parquet_metadata();
 
-        assert_parquet_metadata_equal(writer_metadata, reader_metadata);
+        assert_parquet_metadata_equal(cached_writer_metadata, reader_metadata);
     }
 
     #[tokio::test]
@@ -392,10 +399,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -416,7 +423,7 @@ mod tests {
         )
         .await;
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -436,11 +443,11 @@ mod tests {
         )
         .predicate(predicate);
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_by_range(&["a", "d"], 0, 50),
-                new_batch_by_range(&["a", "d"], 50, 60),
+                new_record_batch_by_range(&["a", "d"], 0, 50),
+                new_record_batch_by_range(&["a", "d"], 50, 60),
             ],
         )
         .await;
@@ -452,10 +459,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "z"], 0, 0),
-            new_batch_by_range(&["a", "z"], 100, 100),
-            new_batch_by_range(&["a", "z"], 200, 230),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "z"], 0, 0),
+            new_record_batch_by_range(&["a", "z"], 100, 100),
+            new_record_batch_by_range(&["a", "z"], 200, 230),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -476,7 +483,7 @@ mod tests {
         )
         .await;
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -488,7 +495,11 @@ mod tests {
             object_store,
         );
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(&mut reader, &[new_batch_by_range(&["a", "z"], 200, 230)]).await;
+        check_record_batch_reader_result(
+            &mut reader,
+            &[new_record_batch_by_range(&["a", "z"], 200, 230)],
+        )
+        .await;
     }
 
     #[tokio::test]
@@ -497,10 +508,10 @@ mod tests {
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
         let metadata = Arc::new(sst_region_metadata());
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 60),
-            new_batch_by_range(&["b", "f"], 0, 40),
-            new_batch_by_range(&["b", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 60),
+            new_record_batch_by_range(&["b", "f"], 0, 40),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -522,7 +533,7 @@ mod tests {
         .await;
 
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -542,7 +553,11 @@ mod tests {
         )
         .predicate(predicate);
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(&mut reader, &[new_batch_by_range(&["b", "h"], 150, 200)]).await;
+        check_record_batch_reader_result(
+            &mut reader,
+            &[new_record_batch_by_range(&["b", "h"], 150, 200)],
+        )
+        .await;
     }
 
     #[tokio::test]
@@ -569,7 +584,7 @@ mod tests {
 
         let writer_props = props_builder.build();
 
-        let write_format = PrimaryKeyWriteFormat::new(metadata);
+        let write_format = FlatWriteFormat::new(metadata, &FlatSchemaOptions::default());
         let fields: Vec<_> = write_format
             .arrow_schema()
             .fields()
@@ -603,9 +618,8 @@ mod tests {
         )
         .unwrap();
 
-        let batch = new_batch_with_binary(&["a"], 0, 60);
-        let arrow_batch = write_format.convert_batch(&batch).unwrap();
-        let arrays: Vec<_> = arrow_batch
+        let batch = new_record_batch_with_binary(&["a"], 0, 60);
+        let arrays: Vec<_> = batch
             .columns()
             .iter()
             .map(|array| {
@@ -629,11 +643,11 @@ mod tests {
             object_store,
         );
         let mut reader = builder.build().await.unwrap().unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_with_binary(&["a"], 0, 50),
-                new_batch_with_binary(&["a"], 50, 60),
+                new_record_batch_with_binary(&["a"], 0, 50),
+                new_record_batch_with_binary(&["a"], 50, 60),
             ],
         )
         .await;
@@ -646,17 +660,17 @@ mod tests {
         let mut env = TestEnv::new().await;
         let object_store = env.init_object_store_manager();
         let metadata = Arc::new(sst_region_metadata());
-        let batches = &[
-            new_batch_by_range(&["a", "d"], 0, 1000),
-            new_batch_by_range(&["b", "f"], 0, 1000),
-            new_batch_by_range(&["c", "g"], 0, 1000),
-            new_batch_by_range(&["b", "h"], 100, 200),
-            new_batch_by_range(&["b", "h"], 200, 300),
-            new_batch_by_range(&["b", "h"], 300, 1000),
+        let batches = vec![
+            new_record_batch_by_range(&["a", "d"], 0, 1000),
+            new_record_batch_by_range(&["b", "f"], 0, 1000),
+            new_record_batch_by_range(&["c", "g"], 0, 1000),
+            new_record_batch_by_range(&["b", "h"], 100, 200),
+            new_record_batch_by_range(&["b", "h"], 200, 300),
+            new_record_batch_by_range(&["b", "h"], 300, 1000),
         ];
         let total_rows: usize = batches.iter().map(|batch| batch.num_rows()).sum();
 
-        let source = new_source(batches);
+        let source = new_flat_source_from_record_batches(batches);
         let write_opts = WriteOptions {
             row_group_size: 50,
             max_file_size: Some(1024 * 16),
@@ -678,7 +692,10 @@ mod tests {
         )
         .await;
 
-        let files = writer.write_all(source, None, &write_opts).await.unwrap();
+        let files = writer
+            .write_all_flat_as_primary_key(source, None, &write_opts)
+            .await
+            .unwrap();
         assert_eq!(2, files.len());
 
         let mut rows_read = 0;
@@ -695,7 +712,7 @@ mod tests {
                 object_store.clone(),
             );
             let mut reader = builder.build().await.unwrap().unwrap();
-            while let Some(batch) = reader.next_batch().await.unwrap() {
+            while let Some(batch) = reader.next_record_batch().await.unwrap() {
                 rows_read += batch.num_rows();
             }
         }
@@ -710,12 +727,12 @@ mod tests {
         let metadata = Arc::new(sst_region_metadata());
         let row_group_size = 50;
 
-        let source = new_source(&[
-            new_batch_by_range(&["a", "d"], 0, 20),
-            new_batch_by_range(&["b", "d"], 0, 20),
-            new_batch_by_range(&["c", "d"], 0, 20),
-            new_batch_by_range(&["c", "f"], 0, 40),
-            new_batch_by_range(&["c", "h"], 100, 200),
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_by_range(&["a", "d"], 0, 20),
+            new_record_batch_by_range(&["b", "d"], 0, 20),
+            new_record_batch_by_range(&["c", "d"], 0, 20),
+            new_record_batch_by_range(&["c", "f"], 0, 40),
+            new_record_batch_by_range(&["c", "h"], 100, 200),
         ]);
         // Use a small row group size for test.
         let write_opts = WriteOptions {
@@ -760,7 +777,7 @@ mod tests {
         .await;
 
         let info = writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -877,6 +894,7 @@ mod tests {
             handle.clone(),
             object_store.clone(),
         )
+        .flat_format(true)
         .predicate(Some(Predicate::new(preds)))
         .inverted_index_appliers([inverted_index_applier.clone(), None])
         .bloom_filter_index_appliers([bloom_filter_applier.clone(), None])
@@ -891,7 +909,11 @@ mod tests {
         let mut reader = ParquetReader::new(Arc::new(context), selection)
             .await
             .unwrap();
-        check_reader_result(&mut reader, &[new_batch_by_range(&["b", "d"], 0, 20)]).await;
+        check_record_batch_reader_result(
+            &mut reader,
+            &[new_record_batch_by_range(&["b", "d"], 0, 20)],
+        )
+        .await;
 
         assert_eq!(metrics.filter_metrics.rg_total, 4);
         assert_eq!(metrics.filter_metrics.rg_minmax_filtered, 3);
@@ -937,6 +959,7 @@ mod tests {
             handle.clone(),
             object_store.clone(),
         )
+        .flat_format(true)
         .predicate(Some(Predicate::new(preds)))
         .inverted_index_appliers([inverted_index_applier.clone(), None])
         .bloom_filter_index_appliers([bloom_filter_applier.clone(), None])
@@ -991,6 +1014,7 @@ mod tests {
             handle.clone(),
             object_store.clone(),
         )
+        .flat_format(true)
         .predicate(Some(Predicate::new(preds)))
         .inverted_index_appliers([inverted_index_applier.clone(), None])
         .bloom_filter_index_appliers([bloom_filter_applier.clone(), None])
@@ -1005,13 +1029,13 @@ mod tests {
         let mut reader = ParquetReader::new(Arc::new(context), selection)
             .await
             .unwrap();
-        check_reader_result(
+        check_record_batch_reader_result(
             &mut reader,
             &[
-                new_batch_by_range(&["a", "d"], 0, 20),
-                new_batch_by_range(&["b", "d"], 0, 20),
-                new_batch_by_range(&["c", "d"], 0, 10),
-                new_batch_by_range(&["c", "d"], 10, 20),
+                new_record_batch_by_range(&["a", "d"], 0, 20),
+                new_record_batch_by_range(&["b", "d"], 0, 20),
+                new_record_batch_by_range(&["c", "d"], 0, 10),
+                new_record_batch_by_range(&["c", "d"], 10, 20),
             ],
         )
         .await;
@@ -1032,37 +1056,32 @@ mod tests {
         assert!(cached.contains_row_group(3));
     }
 
-    /// Creates a flat format RecordBatch for testing.
-    /// Similar to `new_batch_by_range` but returns a RecordBatch in flat format.
-    fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch {
+    fn new_record_batch_with_binary(tags: &[&str], start: usize, end: usize) -> RecordBatch {
         assert!(end >= start);
-        let metadata = Arc::new(sst_region_metadata());
+        let metadata = build_test_binary_test_region_metadata();
         let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
 
         let num_rows = end - start;
         let mut columns = Vec::new();
 
-        // Add primary key columns (tag_0, tag_1) as dictionary arrays
         let mut tag_0_builder = StringDictionaryBuilder::<UInt32Type>::new();
-        let mut tag_1_builder = StringDictionaryBuilder::<UInt32Type>::new();
-
         for _ in 0..num_rows {
             tag_0_builder.append_value(tags[0]);
-            tag_1_builder.append_value(tags[1]);
         }
-
         columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef);
-        columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef);
 
-        // Add field column (field_0)
-        let field_values: Vec<u64> = (start..end).map(|v| v as u64).collect();
-        columns.push(Arc::new(UInt64Array::from(field_values)));
+        let values = (0..num_rows)
+            .map(|_| "some data".as_bytes())
+            .collect::<Vec<_>>();
+        columns.push(
+            Arc::new(datatypes::arrow::array::BinaryArray::from_iter_values(
+                values,
+            )) as ArrayRef,
+        );
 
-        // Add time index column (ts)
         let timestamps: Vec<i64> = (start..end).map(|v| v as i64).collect();
         columns.push(Arc::new(TimestampMillisecondArray::from(timestamps)));
 
-        // Add encoded primary key column
         let pk = new_primary_key(tags);
         let mut pk_builder = BinaryDictionaryBuilder::<UInt32Type>::new();
         for _ in 0..num_rows {
@@ -1070,10 +1089,7 @@ mod tests {
         }
         columns.push(Arc::new(pk_builder.finish()));
 
-        // Add sequence column
         columns.push(Arc::new(UInt64Array::from_value(1000, num_rows)));
-
-        // Add op_type column
         columns.push(Arc::new(UInt8Array::from_value(
             OpType::Put as u8,
             num_rows,
@@ -1082,9 +1098,19 @@ mod tests {
         RecordBatch::try_new(flat_schema, columns).unwrap()
     }
 
-    /// Creates a FlatSource from flat format RecordBatches.
-    fn new_flat_source_from_record_batches(batches: Vec<RecordBatch>) -> FlatSource {
-        FlatSource::Iter(Box::new(batches.into_iter().map(Ok)))
+    async fn check_record_batch_reader_result(
+        reader: &mut ParquetReader,
+        expected: &[RecordBatch],
+    ) {
+        let mut actual = Vec::new();
+        while let Some(batch) = reader.next_record_batch().await.unwrap() {
+            actual.push(batch);
+        }
+        assert_eq!(
+            pretty_format_batches(expected).unwrap().to_string(),
+            pretty_format_batches(&actual).unwrap().to_string()
+        );
+        assert!(reader.next_record_batch().await.unwrap().is_none());
     }
 
     /// Creates a flat format RecordBatch for testing with sparse primary key encoding.
@@ -1333,10 +1359,11 @@ mod tests {
         };
         let metadata = Arc::new(sst_region_metadata());
 
-        // Create batches with sequence 0 to trigger override functionality
-        let batch1 = new_batch_with_custom_sequence(&["a", "d"], 0, 60, 0);
-        let batch2 = new_batch_with_custom_sequence(&["b", "f"], 0, 40, 0);
-        let source = new_source(&[batch1, batch2]);
+        // Create batches with sequence 0 to trigger override functionality.
+        let source = new_flat_source_from_record_batches(vec![
+            new_record_batch_with_custom_sequence(&["a", "d"], 0, 60, 0),
+            new_record_batch_with_custom_sequence(&["b", "f"], 0, 40, 0),
+        ]);
 
         let write_opts = WriteOptions {
             row_group_size: 50,
@@ -1355,7 +1382,7 @@ mod tests {
         .await;
 
         writer
-            .write_all(source, None, &write_opts)
+            .write_all_flat_as_primary_key(source, None, &write_opts)
             .await
             .unwrap()
             .remove(0);
@@ -1369,7 +1396,7 @@ mod tests {
         );
         let mut reader = builder.build().await.unwrap().unwrap();
         let mut normal_batches = Vec::new();
-        while let Some(batch) = reader.next_batch().await.unwrap() {
+        while let Some(batch) = reader.next_record_batch().await.unwrap() {
             normal_batches.push(batch);
         }
 
@@ -1391,22 +1418,19 @@ mod tests {
         );
         let mut reader = builder.build().await.unwrap().unwrap();
         let mut override_batches = Vec::new();
-        while let Some(batch) = reader.next_batch().await.unwrap() {
+        while let Some(batch) = reader.next_record_batch().await.unwrap() {
             override_batches.push(batch);
         }
 
         // Compare the results
         assert_eq!(normal_batches.len(), override_batches.len());
         for (normal, override_batch) in normal_batches.into_iter().zip(override_batches.iter()) {
-            // Create expected batch with override sequence
             let expected_batch = {
-                let num_rows = normal.num_rows();
-                let mut builder = BatchBuilder::from(normal);
-                builder
-                    .sequences_array(Arc::new(UInt64Array::from_value(custom_sequence, num_rows)))
-                    .unwrap();
-
-                builder.build().unwrap()
+                let mut columns = normal.columns().to_vec();
+                let num_cols = columns.len();
+                columns[num_cols - 2] =
+                    Arc::new(UInt64Array::from_value(custom_sequence, normal.num_rows()));
+                RecordBatch::try_new(normal.schema(), columns).unwrap()
             };
 
             // Override batch should match expected batch
diff --git a/src/mito2/src/sst/parquet/async_reader.rs b/src/mito2/src/sst/parquet/async_reader.rs
new file mode 100644
index 0000000000..a060fd367d
--- /dev/null
+++ b/src/mito2/src/sst/parquet/async_reader.rs
@@ -0,0 +1,221 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Async file reader implementation for SST parquet files.
+
+use std::ops::Range;
+use std::sync::Arc;
+
+use bytes::Bytes;
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use object_store::ObjectStore;
+use parquet::arrow::async_reader::AsyncFileReader;
+use parquet::errors::{ParquetError, Result as ParquetResult};
+use parquet::file::metadata::ParquetMetaData;
+
+use crate::cache::file_cache::{FileType, IndexKey};
+use crate::cache::{CacheStrategy, PageKey, PageValue};
+use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
+use crate::sst::file::RegionFileId;
+use crate::sst::parquet::helper::fetch_byte_ranges;
+use crate::sst::parquet::row_group::{ParquetFetchMetrics, compute_total_range_size};
+
+/// An [AsyncFileReader] implementation for SST parquet files.
+///
+/// Byte ranges are served from the page cache first, then the write cache,
+/// and finally the object store; fetched ranges are put into the page cache.
+pub struct SstAsyncFileReader {
+    /// Region file ID; its parts build the page cache and write cache keys.
+    region_file_id: RegionFileId,
+    /// Path of the parquet file, used when reading from the object store.
+    file_path: String,
+    /// Object store to read from when both caches miss.
+    object_store: ObjectStore,
+    /// Cache strategy providing the page cache and the optional write cache.
+    cache_strategy: CacheStrategy,
+    /// Parquet metadata supplied at construction; `get_metadata` returns it as-is.
+    metadata: Arc<ParquetMetaData>,
+    /// Row group index; part of the page cache key.
+    row_group_idx: usize,
+    /// Optional metrics updated by every fetch; `None` disables tracking.
+    fetch_metrics: Option<ParquetFetchMetrics>,
+}
+
+impl SstAsyncFileReader {
+    /// Creates a new [SstAsyncFileReader] with fetch metrics disabled.
+    pub fn new(
+        region_file_id: RegionFileId,
+        file_path: String,
+        object_store: ObjectStore,
+        cache_strategy: CacheStrategy,
+        metadata: Arc<ParquetMetaData>,
+        row_group_idx: usize,
+    ) -> Self {
+        Self {
+            region_file_id,
+            file_path,
+            object_store,
+            cache_strategy,
+            metadata,
+            row_group_idx,
+            fetch_metrics: None,
+        }
+    }
+
+    /// Sets (or clears, with `None`) the fetch metrics and returns the reader.
+    pub fn with_fetch_metrics(mut self, metrics: Option<ParquetFetchMetrics>) -> Self {
+        self.fetch_metrics = metrics;
+        self
+    }
+
+    /// Fetches `ranges` from the page cache, then write cache, then object store.
+    async fn fetch_bytes_with_cache(&self, ranges: Vec<Range<u64>>) -> ParquetResult<Vec<Bytes>> {
+        let fetch_start = self
+            .fetch_metrics
+            .as_ref()
+            .map(|_| std::time::Instant::now());
+        let _timer = READ_STAGE_FETCH_PAGES.start_timer();
+
+        let page_key = PageKey::new(
+            self.region_file_id.file_id(),
+            self.row_group_idx,
+            ranges.clone(),
+        );
+
+        // Check page cache first; the key covers the whole `ranges` list.
+        if let Some(pages) = self.cache_strategy.get_pages(&page_key) {
+            if let Some(metrics) = &self.fetch_metrics {
+                let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+                let mut metrics_data = metrics.data.lock().unwrap();
+                metrics_data.page_cache_hit += 1;
+                metrics_data.pages_to_fetch_mem += ranges.len();
+                metrics_data.page_size_to_fetch_mem += total_size;
+                metrics_data.page_size_needed += total_size;
+                if let Some(start) = fetch_start {
+                    metrics_data.total_fetch_elapsed += start.elapsed();
+                }
+            }
+            return Ok(pages.compressed.clone());
+        }
+
+        // Total size is stored with the cached pages; unaligned size feeds metrics.
+        let (total_range_size, unaligned_size) = compute_total_range_size(&ranges);
+
+        // Check write cache, timing the lookup when metrics are enabled.
+        let key = IndexKey::new(
+            self.region_file_id.region_id(),
+            self.region_file_id.file_id(),
+            FileType::Parquet,
+        );
+        let fetch_write_cache_start = self
+            .fetch_metrics
+            .as_ref()
+            .map(|_| std::time::Instant::now());
+        let write_cache_result = self.fetch_ranges_from_write_cache(key, &ranges).await;
+
+        let pages = match write_cache_result {
+            Some(data) => {
+                if let Some(metrics) = &self.fetch_metrics {
+                    let elapsed = fetch_write_cache_start
+                        .map(|start| start.elapsed())
+                        .unwrap_or_default();
+                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+                    let mut metrics_data = metrics.data.lock().unwrap();
+                    metrics_data.write_cache_fetch_elapsed += elapsed;
+                    metrics_data.write_cache_hit += 1;
+                    metrics_data.pages_to_fetch_write_cache += ranges.len();
+                    metrics_data.page_size_to_fetch_write_cache += unaligned_size;
+                    metrics_data.page_size_needed += range_size_needed;
+                }
+                data
+            }
+            None => {
+                // Both caches missed: fetch data from object store.
+                let _timer = READ_STAGE_ELAPSED
+                    .with_label_values(&["cache_miss_read"])
+                    .start_timer();
+
+                let start = self
+                    .fetch_metrics
+                    .as_ref()
+                    .map(|_| std::time::Instant::now());
+                let data = fetch_byte_ranges(&self.file_path, self.object_store.clone(), &ranges)
+                    .await
+                    .map_err(|e| ParquetError::External(Box::new(e)))?;
+
+                if let Some(metrics) = &self.fetch_metrics {
+                    let elapsed = start.map(|start| start.elapsed()).unwrap_or_default();
+                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+                    let mut metrics_data = metrics.data.lock().unwrap();
+                    metrics_data.store_fetch_elapsed += elapsed;
+                    metrics_data.cache_miss += 1;
+                    metrics_data.pages_to_fetch_store += ranges.len();
+                    metrics_data.page_size_to_fetch_store += unaligned_size;
+                    metrics_data.page_size_needed += range_size_needed;
+                }
+                data
+            }
+        };
+
+        // Put the fetched pages into the page cache for later reads.
+        let page_value = PageValue::new(pages.clone(), total_range_size);
+        self.cache_strategy
+            .put_pages(page_key, Arc::new(page_value));
+
+        if let (Some(metrics), Some(start)) = (&self.fetch_metrics, fetch_start) {
+            metrics.data.lock().unwrap().total_fetch_elapsed += start.elapsed();
+        }
+
+        Ok(pages)
+    }
+
+    /// Reads `ranges` of the file identified by `key` from the write cache.
+    /// Returns `None` if no write cache is configured or the read yields `None`.
+    async fn fetch_ranges_from_write_cache(
+        &self,
+        key: IndexKey,
+        ranges: &[Range<u64>],
+    ) -> Option<Vec<Bytes>> {
+        if let Some(cache) = self.cache_strategy.write_cache() {
+            return cache.file_cache().read_ranges(key, ranges).await;
+        }
+        None
+    }
+}
+
+impl AsyncFileReader for SstAsyncFileReader {
+    fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, ParquetResult<Bytes>> {
+        async move {
+            let mut result = self.fetch_bytes_with_cache(vec![range]).await?;
+            Ok(result.pop().unwrap_or_default())
+        }
+        .boxed()
+    }
+
+    fn get_byte_ranges(
+        &mut self,
+        ranges: Vec<Range<u64>>,
+    ) -> BoxFuture<'_, ParquetResult<Vec<Bytes>>> {
+        async move { self.fetch_bytes_with_cache(ranges).await }.boxed()
+    }
+
+    fn get_metadata(
+        &mut self,
+        _options: Option<&parquet::arrow::arrow_reader::ArrowReaderOptions>,
+    ) -> BoxFuture<'_, ParquetResult<Arc<ParquetMetaData>>> {
+        // Metadata was supplied at construction, so resolve immediately without I/O.
+        std::future::ready(Ok(self.metadata.clone())).boxed()
+    }
+}
diff --git a/src/mito2/src/sst/parquet/flat_format.rs b/src/mito2/src/sst/parquet/flat_format.rs
index d6b061e468..8a59e9a97d 100644
--- a/src/mito2/src/sst/parquet/flat_format.rs
+++ b/src/mito2/src/sst/parquet/flat_format.rs
@@ -52,8 +52,8 @@ use crate::error::{
     NewRecordBatchSnafu, Result,
 };
 use crate::sst::parquet::format::{
-    FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray, PrimaryKeyReadFormat, ReadFormat,
-    StatValues,
+    FIXED_POS_COLUMN_NUM, FormatProjection, INTERNAL_COLUMN_NUM, PrimaryKeyArray,
+    PrimaryKeyReadFormat, ReadFormat, StatValues,
 };
 use crate::sst::{
     FlatSchemaOptions, flat_sst_arrow_schema_column_num, tag_maybe_to_dictionary_field,
@@ -127,6 +127,21 @@ pub(crate) fn op_type_column_index(num_columns: usize) -> usize {
     num_columns - 1
 }
 
+/// Returns the index of the first field column in a flat batch.
+///
+/// `num_columns` is the total number of columns in the flat batch schema:
+/// raw tag columns (if present), then field columns, then the fixed position
+/// columns (time index, primary key, sequence, op type).
+///
+/// Dense encoding (raw PK columns included): the result is `primary_key.len()`.
+/// Sparse encoding (no raw PK columns): the result is `0`.
+pub(crate) fn field_column_start(metadata: &RegionMetadata, num_columns: usize) -> usize {
+    // Field column count = metadata columns - 1 time index - primary key columns.
+    // Start index = total columns - fixed position columns - field columns.
+    let field_column_count = metadata.column_metadatas.len() - 1 - metadata.primary_key.len();
+    num_columns - FIXED_POS_COLUMN_NUM - field_column_count
+}
+
 // TODO(yingwen): Add an option to skip reading internal columns if the region is
 // append only and doesn't use sparse encoding (We need to check the table id under
 // sparse encoding).
@@ -765,3 +780,89 @@ impl FlatReadFormat {
         .unwrap()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use api::v1::SemanticType;
+    use datatypes::prelude::ConcreteDataType;
+    use datatypes::schema::ColumnSchema;
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
+    use store_api::storage::RegionId;
+
+    use super::field_column_start;
+    use crate::sst::{FlatSchemaOptions, flat_sst_arrow_schema_column_num};
+
+    /// Builds metadata with `num_tags` string tags, `num_fields` u64 fields and a `ts` time index.
+    fn build_metadata(
+        num_tags: usize,
+        num_fields: usize,
+        encoding: PrimaryKeyEncoding,
+    ) -> RegionMetadata {
+        let mut builder = RegionMetadataBuilder::new(RegionId::new(0, 0));
+        let mut col_id = 0u32;
+
+        for i in 0..num_tags {
+            builder.push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    format!("tag_{i}"),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: col_id,
+            });
+            col_id += 1;
+        }
+
+        for i in 0..num_fields {
+            builder.push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    format!("field_{i}"),
+                    ConcreteDataType::uint64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: col_id,
+            });
+            col_id += 1;
+        }
+
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(
+                "ts".to_string(),
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            ),
+            semantic_type: SemanticType::Timestamp,
+            column_id: col_id,
+        });
+
+        let primary_key: Vec<u32> = (0..num_tags as u32).collect();
+        builder.primary_key(primary_key);
+        builder.primary_key_encoding(encoding);
+        builder.build().unwrap()
+    }
+
+    #[test]
+    fn test_field_column_start() {
+        // Each case: (num_tags, num_fields, encoding, expected field column start).
+        let cases = [
+            (1, 1, PrimaryKeyEncoding::Dense, 1),
+            (2, 2, PrimaryKeyEncoding::Dense, 2),
+            (0, 2, PrimaryKeyEncoding::Dense, 0),
+            (2, 2, PrimaryKeyEncoding::Sparse, 0),
+        ];
+
+        for (num_tags, num_fields, encoding, expected) in cases {
+            let metadata = build_metadata(num_tags, num_fields, encoding);
+            let options = FlatSchemaOptions::from_encoding(encoding);
+            let num_columns = flat_sst_arrow_schema_column_num(&metadata, &options);
+            let result = field_column_start(&metadata, num_columns);
+            assert_eq!(
+                result, expected,
+                "num_tags={num_tags}, num_fields={num_fields}, encoding={encoding:?}"
+            );
+        }
+    }
+}
diff --git a/src/mito2/src/sst/parquet/format.rs b/src/mito2/src/sst/parquet/format.rs
index 70d026e6db..ba64eac78b 100644
--- a/src/mito2/src/sst/parquet/format.rs
+++ b/src/mito2/src/sst/parquet/format.rs
@@ -34,12 +34,12 @@ use api::v1::SemanticType;
 use common_time::Timestamp;
 use datafusion_common::ScalarValue;
 use datatypes::arrow::array::{
-    ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt32Array, UInt64Array,
+    ArrayRef, BinaryArray, BinaryDictionaryBuilder, DictionaryArray, UInt64Array,
 };
 use datatypes::arrow::datatypes::{SchemaRef, UInt32Type};
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::prelude::DataType;
-use datatypes::vectors::{Helper, Vector};
+use datatypes::vectors::Helper;
 use mito_codec::row_converter::{
     CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec,
     build_primary_key_codec_with_fields,
@@ -51,8 +51,7 @@ use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
 use store_api::storage::{ColumnId, SequenceNumber};
 
 use crate::error::{
-    ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu,
-    NewRecordBatchSnafu, Result,
+    ConvertVectorSnafu, DecodeSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result,
 };
 use crate::read::{Batch, BatchBuilder, BatchColumn};
 use crate::sst::file::{FileMeta, FileTimeRange};
@@ -73,7 +72,6 @@ pub(crate) const INTERNAL_COLUMN_NUM: usize = 3;
 
 /// Helper for writing the SST format with primary key.
 pub(crate) struct PrimaryKeyWriteFormat {
-    metadata: RegionMetadataRef,
     /// SST file schema.
     arrow_schema: SchemaRef,
     override_sequence: Option<SequenceNumber>,
@@ -84,7 +82,6 @@ impl PrimaryKeyWriteFormat {
     pub(crate) fn new(metadata: RegionMetadataRef) -> PrimaryKeyWriteFormat {
         let arrow_schema = to_sst_arrow_schema(&metadata);
         PrimaryKeyWriteFormat {
-            metadata,
             arrow_schema,
             override_sequence: None,
         }
@@ -104,40 +101,25 @@ impl PrimaryKeyWriteFormat {
         &self.arrow_schema
     }
 
-    /// Convert `batch` to a arrow record batch to store in parquet.
-    pub(crate) fn convert_batch(&self, batch: &Batch) -> Result<RecordBatch> {
-        debug_assert_eq!(
-            batch.fields().len() + FIXED_POS_COLUMN_NUM,
-            self.arrow_schema.fields().len()
-        );
-        let mut columns = Vec::with_capacity(batch.fields().len() + FIXED_POS_COLUMN_NUM);
-        // Store all fields first.
-        for (column, column_metadata) in batch.fields().iter().zip(self.metadata.field_columns()) {
-            ensure!(
-                column.column_id == column_metadata.column_id,
-                InvalidBatchSnafu {
-                    reason: format!(
-                        "Batch has column {} but metadata has column {}",
-                        column.column_id, column_metadata.column_id
-                    ),
-                }
-            );
-
-            columns.push(column.data.to_arrow_array());
-        }
-        // Add time index column.
-        columns.push(batch.timestamps().to_arrow_array());
-        // Add internal columns: primary key, sequences, op types.
-        columns.push(new_primary_key_array(batch.primary_key(), batch.num_rows()));
+    /// Converts a flat `RecordBatch` to the primary-key format by dropping the leading
+    /// `batch.num_columns() - num_fields - FIXED_POS_COLUMN_NUM` tag columns (`num_fields` is
+    /// the number of field columns) and, if an override sequence is set, replacing the
+    /// second-to-last column with it. Fails if the result does not fit the SST arrow schema.
+    /// Panics if `batch` has fewer than `num_fields + FIXED_POS_COLUMN_NUM` columns.
+    pub(crate) fn convert_flat_batch(
+        &self,
+        batch: &RecordBatch,
+        num_fields: usize,
+    ) -> Result<RecordBatch> {
+        let num_tag_columns = batch.num_columns() - num_fields - FIXED_POS_COLUMN_NUM;
+        let mut columns: Vec<ArrayRef> = batch.columns()[num_tag_columns..].to_vec();
 
         if let Some(override_sequence) = self.override_sequence {
-            let sequence_array =
+            let num_cols = columns.len();
+            // The sequence column is second to last, immediately before op_type.
+            columns[num_cols - 2] =
                 Arc::new(UInt64Array::from(vec![override_sequence; batch.num_rows()]));
-            columns.push(sequence_array);
-        } else {
-            columns.push(batch.sequences().to_arrow_array());
         }
-        columns.push(batch.op_types().to_arrow_array());
 
         RecordBatch::try_new(self.arrow_schema.clone(), columns).context(NewRecordBatchSnafu)
     }
@@ -926,15 +908,6 @@ pub(crate) fn primary_key_offsets(pk_dict_array: &PrimaryKeyArray) -> Result<Vec
     Ok(offsets)
 }
 
-/// Creates a new array for specific `primary_key`.
-fn new_primary_key_array(primary_key: &[u8], num_rows: usize) -> ArrayRef {
-    let values = Arc::new(BinaryArray::from_iter_values([primary_key]));
-    let keys = UInt32Array::from_value(0, num_rows);
-
-    // Safety: The key index is valid.
-    Arc::new(DictionaryArray::new(keys, values))
-}
-
 /// Gets the min/max time index of the row group from the parquet meta.
 /// It assumes the parquet is created by the mito engine.
 pub(crate) fn parquet_row_group_time_range(
@@ -1017,7 +990,7 @@ mod tests {
 
     use api::v1::OpType;
     use datatypes::arrow::array::{
-        Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt64Array,
+        Int64Array, StringArray, TimestampMillisecondArray, UInt8Array, UInt32Array, UInt64Array,
     };
     use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, TimeUnit};
     use datatypes::prelude::ConcreteDataType;
@@ -1145,13 +1118,6 @@ mod tests {
         assert_eq!(&build_test_arrow_schema(), write_format.arrow_schema());
     }
 
-    #[test]
-    fn test_new_primary_key_array() {
-        let array = new_primary_key_array(b"test", 3);
-        let expect = build_test_pk_array(&[(b"test".to_vec(), 3)]) as ArrayRef;
-        assert_eq!(&expect, &array);
-    }
-
     fn build_test_pk_array(pk_row_nums: &[(Vec<u8>, usize)]) -> Arc<PrimaryKeyArray> {
         let values = Arc::new(BinaryArray::from_iter_values(
             pk_row_nums.iter().map(|v| &v.0),
@@ -1164,49 +1130,6 @@ mod tests {
         Arc::new(DictionaryArray::new(keys, values))
     }
 
-    #[test]
-    fn test_convert_batch() {
-        let metadata = build_test_region_metadata();
-        let write_format = PrimaryKeyWriteFormat::new(metadata);
-
-        let num_rows = 4;
-        let batch = new_batch(b"test", 1, 2, num_rows);
-        let columns: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
-            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
-            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
-            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key
-            Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // sequence
-            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type
-        ];
-        let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap();
-
-        let actual = write_format.convert_batch(&batch).unwrap();
-        assert_eq!(expect_record, actual);
-    }
-
-    #[test]
-    fn test_convert_batch_with_override_sequence() {
-        let metadata = build_test_region_metadata();
-        let write_format =
-            PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(415411));
-
-        let num_rows = 4;
-        let batch = new_batch(b"test", 1, 2, num_rows);
-        let columns: Vec<ArrayRef> = vec![
-            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
-            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
-            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
-            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // primary key
-            Arc::new(UInt64Array::from(vec![415411; num_rows])), // sequence
-            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // op type
-        ];
-        let expect_record = RecordBatch::try_new(build_test_arrow_schema(), columns).unwrap();
-
-        let actual = write_format.convert_batch(&batch).unwrap();
-        assert_eq!(expect_record, actual);
-    }
-
     #[test]
     fn test_projection_indices() {
         let metadata = build_test_region_metadata();
@@ -1867,4 +1790,100 @@ mod tests {
         let result = format.convert_batch(record_batch.clone(), None).unwrap();
         assert_eq!(record_batch, result);
     }
+
+    #[test]
+    fn test_convert_flat_batch() {
+        let metadata = build_test_region_metadata();
+        let write_format = PrimaryKeyWriteFormat::new(metadata);
+
+        let num_rows = 4;
+        // Flat input columns: tag0, tag1, field1, field0, ts, __primary_key, __sequence, __op_type.
+        let flat_columns: Vec<ArrayRef> = input_columns_for_flat_batch(num_rows);
+        let flat_batch = RecordBatch::try_new(build_test_flat_sst_schema(), flat_columns).unwrap();
+
+        // num_fields = 2 (field1, field0), so the columns in front of field1 are dropped.
+        let result = write_format.convert_flat_batch(&flat_batch, 2).unwrap();
+
+        // Expected output: field1, field0, ts, __primary_key, __sequence, __op_type, values untouched.
+        let expected_columns: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
+            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
+            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
+            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key
+            Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // __sequence
+            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type
+        ];
+        let expected = RecordBatch::try_new(build_test_arrow_schema(), expected_columns).unwrap();
+
+        assert_eq!(expected, result);
+    }
+
+    #[test]
+    fn test_convert_flat_batch_with_override_sequence() {
+        let metadata = build_test_region_metadata();
+        let write_format = PrimaryKeyWriteFormat::new(metadata).with_override_sequence(Some(999));
+
+        let num_rows = 4;
+        let flat_columns: Vec<ArrayRef> = input_columns_for_flat_batch(num_rows);
+        let flat_batch = RecordBatch::try_new(build_test_flat_sst_schema(), flat_columns).unwrap();
+
+        let result = write_format.convert_flat_batch(&flat_batch, 2).unwrap();
+
+        let expected_columns: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![2; num_rows])), // field1
+            Arc::new(Int64Array::from(vec![3; num_rows])), // field0
+            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3, 4])), // ts
+            build_test_pk_array(&[(b"test".to_vec(), num_rows)]), // __primary_key
+            Arc::new(UInt64Array::from(vec![999; num_rows])), // __sequence, replaced by the override value
+            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type
+        ];
+        let expected = RecordBatch::try_new(build_test_arrow_schema(), expected_columns).unwrap();
+
+        assert_eq!(expected, result);
+    }
+
+    #[test]
+    fn test_convert_flat_batch_no_tags() {
+        // The region has no primary key columns, so the flat batch carries no tag columns to strip.
+        let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
+        builder
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "field0",
+                    ConcreteDataType::int64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: 1,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "ts",
+                    ConcreteDataType::timestamp_millisecond_datatype(),
+                    false,
+                ),
+                semantic_type: SemanticType::Timestamp,
+                column_id: 2,
+            });
+        let metadata = Arc::new(builder.build().unwrap());
+        let write_format = PrimaryKeyWriteFormat::new(metadata);
+
+        let num_rows = 3;
+        // Without tags the flat batch uses the SST schema: field0, ts, __primary_key, __sequence, __op_type.
+        let sst_schema = write_format.arrow_schema().clone();
+        let columns: Vec<ArrayRef> = vec![
+            Arc::new(Int64Array::from(vec![10; num_rows])), // field0
+            Arc::new(TimestampMillisecondArray::from(vec![1, 2, 3])), // ts
+            build_test_pk_array(&[(b"".to_vec(), num_rows)]), // __primary_key
+            Arc::new(UInt64Array::from(vec![TEST_SEQUENCE; num_rows])), // __sequence
+            Arc::new(UInt8Array::from(vec![TEST_OP_TYPE; num_rows])), // __op_type
+        ];
+        let flat_batch = RecordBatch::try_new(sst_schema.clone(), columns.clone()).unwrap();
+
+        // num_fields = 1 gives num_tag_columns = 5 - 1 - 4 = 0: the batch must come back unchanged.
+        let result = write_format.convert_flat_batch(&flat_batch, 1).unwrap();
+        let expected = RecordBatch::try_new(sst_schema, columns).unwrap();
+
+        assert_eq!(expected, result);
+    }
 }
diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs
new file mode 100644
index 0000000000..5de2e3512f
--- /dev/null
+++ b/src/mito2/src/sst/parquet/prefilter.rs
@@ -0,0 +1,528 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Helpers for parquet prefiltering.
+
+use std::ops::Range;
+
+use api::v1::SemanticType;
+use common_recordbatch::filter::SimpleFilterEvaluator;
+use datatypes::arrow::array::{BinaryArray, BooleanArray};
+use datatypes::arrow::record_batch::RecordBatch;
+use mito_codec::primary_key_filter::is_partition_column;
+use mito_codec::row_converter::PrimaryKeyFilter;
+use snafu::{OptionExt, ResultExt};
+use store_api::metadata::{RegionMetadata, RegionMetadataRef};
+
+use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu};
+use crate::sst::parquet::flat_format::primary_key_column_index;
+use crate::sst::parquet::format::PrimaryKeyArray;
+
+/// Returns the row ranges of `input` whose primary key is accepted by `pk_filter`.
+///
+/// Rows are grouped into runs of consecutive equal dictionary keys; adjacent matching
+/// runs are merged into a single range.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn matching_row_ranges_by_primary_key(
+    input: &RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<Vec<Range<usize>>> {
+    let pk_column = input.column(primary_key_column_index(input.num_columns()));
+    let dictionary = pk_column
+        .as_any()
+        .downcast_ref::<PrimaryKeyArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key column is not a dictionary array",
+        })?;
+    let encoded_keys = dictionary
+        .values()
+        .as_any()
+        .downcast_ref::<BinaryArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key values are not binary array",
+        })?;
+    let codes = dictionary.keys().values();
+
+    if codes.is_empty() {
+        return Ok(vec![0..input.num_rows()]);
+    }
+
+    let mut ranges: Vec<Range<usize>> = Vec::new();
+    let mut run_start = 0;
+    while run_start < codes.len() {
+        // Measure the run of rows that share the dictionary key found at `run_start`.
+        let code = codes[run_start];
+        let run_len = codes[run_start..].iter().take_while(|c| **c == code).count();
+        let run_end = run_start + run_len;
+
+        // The filter is evaluated once per run rather than once per row.
+        if pk_filter.matches(encoded_keys.value(code as usize)) {
+            match ranges.last_mut() {
+                // Grow the previous range when this run starts right where it ends.
+                Some(previous) if previous.end == run_start => previous.end = run_end,
+                _ => ranges.push(run_start..run_end),
+            }
+        }
+
+        run_start = run_end;
+    }
+
+    Ok(ranges)
+}
+
+/// Keeps only the rows of `input` whose primary key is accepted by `pk_filter`.
+///
+/// An empty batch, or one whose rows all match, is returned unchanged; a single
+/// contiguous match is returned as a slice and scattered matches as a filtered batch.
+/// Returns `None` when no row matches.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn prefilter_flat_batch_by_primary_key(
+    input: RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<Option<RecordBatch>> {
+    let total_rows = input.num_rows();
+    if total_rows == 0 {
+        return Ok(Some(input));
+    }
+
+    let ranges = matching_row_ranges_by_primary_key(&input, pk_filter)?;
+    match ranges.as_slice() {
+        // Nothing matched.
+        [] => return Ok(None),
+        // A single range covering the whole batch: nothing to drop.
+        [only] if only.start == 0 && only.end == total_rows => return Ok(Some(input)),
+        // A single partial range: slicing is enough.
+        [only] => return Ok(Some(input.slice(only.start, only.end - only.start))),
+        // Several ranges: fall through to the mask based filter below.
+        _ => {}
+    }
+
+    // Mark every matching row, then let arrow drop the unmarked ones.
+    let mut keep = vec![false; total_rows];
+    for range in ranges {
+        keep[range].fill(true);
+    }
+
+    let selection = BooleanArray::from(keep);
+    let filtered = datatypes::arrow::compute::filter_record_batch(&input, &selection)
+        .context(ComputeArrowSnafu)?;
+
+    // Report an empty result as `None`.
+    Ok((filtered.num_rows() > 0).then_some(filtered))
+}
+
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn retain_usable_primary_key_filters(
+    sst_metadata: &RegionMetadataRef,
+    expected_metadata: Option<&RegionMetadata>,
+    filters: &mut Vec<SimpleFilterEvaluator>,
+) { // Removes, in place, every filter that `is_usable_primary_key_filter` rejects.
+    filters.retain(|filter| is_usable_primary_key_filter(sst_metadata, expected_metadata, filter));
+}
+
+/// Returns whether `filter` targets a tag column that is part of the SST primary key.
+///
+/// A filter on a column for which `is_partition_column` is true is never usable. When
+/// `expected_metadata` is given, the filtered column is looked up there by name and then
+/// in `sst_metadata` by column id; the filter is rejected unless both columns agree on
+/// name, semantic type and data type. Without `expected_metadata`, the column is looked
+/// up in `sst_metadata` by name.
+/// A filter whose column cannot be resolved is not usable.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn is_usable_primary_key_filter(
+    sst_metadata: &RegionMetadataRef,
+    expected_metadata: Option<&RegionMetadata>,
+    filter: &SimpleFilterEvaluator,
+) -> bool {
+    let column_name = filter.column_name();
+
+    // TODO(yingwen): The primary key filter always skips the partition column. Consider using a flag
+    // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable.
+    if is_partition_column(column_name) {
+        return false;
+    }
+
+    let resolved = match expected_metadata {
+        // No expected metadata: the SST is searched by name directly.
+        None => sst_metadata.column_by_name(column_name),
+        // Otherwise the name is resolved in the expected metadata and its column id is
+        // followed into the SST, whose column must still describe the same column.
+        Some(expected) => expected.column_by_name(column_name).and_then(|wanted| {
+            sst_metadata.column_by_id(wanted.column_id).filter(|found| {
+                found.column_schema.name == wanted.column_schema.name
+                    && found.semantic_type == wanted.semantic_type
+                    && found.column_schema.data_type == wanted.column_schema.data_type
+            })
+        }),
+    };
+
+    let Some(column) = resolved else {
+        return false;
+    };
+
+    // Only tags that belong to the SST primary key qualify.
+    let is_tag = column.semantic_type == SemanticType::Tag;
+    is_tag && sst_metadata.primary_key_index(column.column_id).is_some()
+}
+
+/// A [`PrimaryKeyFilter`] wrapper that remembers the result for the last evaluated key.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) struct CachedPrimaryKeyFilter {
+    inner: Box<dyn PrimaryKeyFilter>, // the wrapped filter
+    last_primary_key: Vec<u8>, // key passed to the most recent `inner` evaluation
+    last_match: Option<bool>, // result of that evaluation; `None` until the first call
+}
+
+impl CachedPrimaryKeyFilter {
+    #[cfg_attr(not(test), allow(dead_code))]
+    pub(crate) fn new(inner: Box<dyn PrimaryKeyFilter>) -> Self {
+        Self {
+            inner,
+            last_primary_key: Vec::new(),
+            last_match: None, // nothing cached yet
+        }
+    }
+}
+
+impl PrimaryKeyFilter for CachedPrimaryKeyFilter {
+    /// Delegates to `inner` unless `pk` equals the key of the previous call.
+    fn matches(&mut self, pk: &[u8]) -> bool {
+        if let Some(cached) = self.last_match.filter(|_| self.last_primary_key == pk) {
+            return cached;
+        }
+
+        let verdict = self.inner.matches(pk);
+        self.last_primary_key.clear();
+        self.last_primary_key.extend_from_slice(pk);
+        self.last_match = Some(verdict);
+        verdict
+    }
+}
+
+/// Returns the encoded primary key of `batch` when its first and last rows reference the
+/// same dictionary key, and `None` when they differ or the batch has no rows.
+#[cfg_attr(not(test), allow(dead_code))]
+pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result<Option<&[u8]>> {
+    let pk_column = batch.column(primary_key_column_index(batch.num_columns()));
+    let dictionary = pk_column
+        .as_any()
+        .downcast_ref::<PrimaryKeyArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key column is not a dictionary array",
+        })?;
+    let encoded_keys = dictionary
+        .values()
+        .as_any()
+        .downcast_ref::<BinaryArray>()
+        .context(UnexpectedSnafu {
+            reason: "Primary key values are not binary array",
+        })?;
+
+    // Only the two boundary rows are compared.
+    // NOTE(review): presumably rows sharing a primary key are contiguous -- confirm with callers.
+    let codes = dictionary.keys().values();
+    let single_key = match (codes.first(), codes.last()) {
+        (Some(first), Some(last)) if first == last => Some(encoded_keys.value(*first as usize)),
+        _ => None,
+    };
+
+    Ok(single_key)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use api::v1::SemanticType;
+    use common_recordbatch::filter::SimpleFilterEvaluator;
+    use datafusion_expr::{col, lit};
+    use datatypes::arrow::array::{
+        ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
+        UInt64Array,
+    };
+    use datatypes::arrow::datatypes::{Schema, UInt32Type};
+    use datatypes::arrow::record_batch::RecordBatch;
+    use datatypes::prelude::ConcreteDataType;
+    use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec};
+    use store_api::codec::PrimaryKeyEncoding;
+    use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
+    use store_api::storage::ColumnSchema;
+
+    use super::*;
+    use crate::sst::internal_fields;
+    use crate::sst::parquet::format::ReadFormat;
+    use crate::test_util::sst_util::{
+        new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding,
+    };
+
+    fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec<SimpleFilterEvaluator> {
+        exprs
+            .iter()
+            .filter_map(SimpleFilterEvaluator::try_new) // expressions `try_new` rejects are dropped
+            .collect()
+    }
+
+    fn expected_metadata_with_reused_tag_name(
+        old_metadata: &RegionMetadata,
+    ) -> Arc<RegionMetadata> {
+        let mut builder = RegionMetadataBuilder::new(old_metadata.region_id); // same region as `old_metadata`
+        builder
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "tag_0".to_string(),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: 10, // NOTE(review): presumably not the id `tag_0` has in `old_metadata` -- confirm
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "tag_1".to_string(),
+                    ConcreteDataType::string_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Tag,
+                column_id: 1,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "field_0".to_string(),
+                    ConcreteDataType::uint64_datatype(),
+                    true,
+                ),
+                semantic_type: SemanticType::Field,
+                column_id: 2,
+            })
+            .push_column_metadata(ColumnMetadata {
+                column_schema: ColumnSchema::new(
+                    "ts".to_string(),
+                    ConcreteDataType::timestamp_millisecond_datatype(),
+                    false,
+                ),
+                semantic_type: SemanticType::Timestamp,
+                column_id: 3,
+            })
+            .primary_key(vec![10, 1]); // tag_0 (id 10), then tag_1 (id 1)
+
+        Arc::new(builder.build().unwrap())
+    }
+
+    fn new_raw_batch_with_metadata(
+        metadata: Arc<RegionMetadata>,
+        primary_keys: &[&[u8]],
+        field_values: &[u64],
+    ) -> RecordBatch {
+        assert_eq!(primary_keys.len(), field_values.len()); // one field value per row
+
+        let arrow_schema = metadata.schema.arrow_schema();
+        let field_column = arrow_schema
+            .field(arrow_schema.index_of("field_0").unwrap())
+            .clone();
+        let time_index_column = arrow_schema
+            .field(arrow_schema.index_of("ts").unwrap())
+            .clone();
+        let mut fields = vec![field_column, time_index_column]; // tag columns are not part of this layout
+        fields.extend(
+            internal_fields()
+                .into_iter()
+                .map(|field| field.as_ref().clone()),
+        );
+        let schema = Arc::new(Schema::new(fields));
+
+        let mut dict_values = Vec::new(); // distinct primary keys in first-seen order
+        let mut keys = Vec::with_capacity(primary_keys.len()); // per-row index into `dict_values`
+        for pk in primary_keys {
+            let key = dict_values
+                .iter()
+                .position(|existing: &&[u8]| existing == pk)
+                .unwrap_or_else(|| {
+                    dict_values.push(*pk);
+                    dict_values.len() - 1
+                });
+            keys.push(key as u32);
+        }
+
+        let pk_array: ArrayRef = Arc::new(DictionaryArray::<UInt32Type>::new(
+            UInt32Array::from(keys),
+            Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())),
+        ));
+
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(UInt64Array::from(field_values.to_vec())), // field_0
+                Arc::new(TimestampMillisecondArray::from_iter_values(
+                    0..primary_keys.len() as i64,
+                )), // ts: 0, 1, 2, ...
+                pk_array,
+                Arc::new(UInt64Array::from(vec![1; primary_keys.len()])), // presumably the sequence column
+                Arc::new(UInt8Array::from(vec![1; primary_keys.len()])), // presumably the op type column
+            ],
+        )
+        .unwrap()
+    }
+
+    fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch { // uses `sst_region_metadata()`
+        new_raw_batch_with_metadata(Arc::new(sst_region_metadata()), primary_keys, field_values)
+    }
+
+    fn field_values(batch: &RecordBatch) -> Vec<u64> {
+        batch
+            .column(0) // `field_0` is the first column in batches built by `new_raw_batch_with_metadata`
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap()
+            .values()
+            .to_vec()
+    }
+
+    #[test]
+    fn test_retain_usable_primary_key_filters_skips_non_tag_filters() {
+        let metadata = Arc::new(sst_region_metadata());
+        let mut filters =
+            new_test_filters(&[col("field_0").eq(lit(1_u64)), col("ts").gt(lit(0_i64))]);
+
+        retain_usable_primary_key_filters(&metadata, None, &mut filters);
+
+        assert!(filters.is_empty()); // neither the `field_0` nor the `ts` filter survives
+    }
+
+    #[test]
+    fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() {
+        let metadata = Arc::new(sst_region_metadata());
+        let expected_metadata = expected_metadata_with_reused_tag_name(&metadata);
+        let mut filters = new_test_filters(&[col("tag_0").eq(lit("b"))]);
+
+        retain_usable_primary_key_filters(
+            &metadata,
+            Some(expected_metadata.as_ref()),
+            &mut filters,
+        );
+
+        assert!(filters.is_empty()); // the `tag_0` filter must not be kept for this SST
+    }
+
+    #[test]
+    fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() {
+        let metadata = Arc::new(sst_region_metadata_with_encoding(
+            PrimaryKeyEncoding::Sparse,
+        ));
+        let read_format = ReadFormat::new_flat(
+            metadata.clone(),
+            metadata.column_metadatas.iter().map(|c| c.column_id),
+            None,
+            "test",
+            true,
+        )
+        .unwrap();
+        assert!(read_format.as_flat().is_some());
+
+        let filter = SimpleFilterEvaluator::try_new(&col("tag_0").eq(lit("b"))).unwrap();
+        assert!(is_usable_primary_key_filter(&metadata, None, &filter)); // NOTE(review): name says "skips" but the filter is expected to be usable -- confirm intent
+    }
+
+    #[test]
+    fn test_prefilter_primary_key_drops_single_dictionary_batch() {
+        let metadata = Arc::new(sst_region_metadata());
+        let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))]));
+        let mut primary_key_filter =
+            build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters);
+        let pk_a = new_primary_key(&["a", "x"]);
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); // every row uses `pk_a`
+
+        let filtered =
+            prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap();
+
+        assert!(filtered.is_none()); // no row matches, so the whole batch is dropped
+    }
+
+    #[test]
+    fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() {
+        let metadata = Arc::new(sst_region_metadata());
+        let filters = Arc::new(new_test_filters(&[col("tag_0")
+            .eq(lit("a"))
+            .or(col("tag_0").eq(lit("c")))]));
+        let mut primary_key_filter =
+            build_primary_key_codec(metadata.as_ref()).primary_key_filter(&metadata, filters);
+        let pk_a = new_primary_key(&["a", "x"]);
+        let pk_b = new_primary_key(&["b", "x"]);
+        let pk_c = new_primary_key(&["c", "x"]);
+        let pk_d = new_primary_key(&["d", "x"]);
+        let batch = new_raw_batch(
+            &[
+                pk_a.as_slice(),
+                pk_a.as_slice(),
+                pk_b.as_slice(),
+                pk_b.as_slice(),
+                pk_c.as_slice(),
+                pk_c.as_slice(),
+                pk_d.as_slice(),
+                pk_d.as_slice(),
+            ],
+            &[10, 11, 12, 13, 14, 15, 16, 17], // field values double as row identifiers
+        );
+
+        let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut())
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(filtered.num_rows(), 4); // two non-adjacent runs of two rows each
+        assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]); // the rows of `pk_a` and `pk_c`
+    }
+
+    struct CountingPrimaryKeyFilter {
+        hits: Arc<AtomicUsize>, // number of `matches` calls received
+        expected: Vec<u8>, // the only primary key this filter accepts
+    }
+
+    impl PrimaryKeyFilter for CountingPrimaryKeyFilter {
+        fn matches(&mut self, pk: &[u8]) -> bool {
+            self.hits.fetch_add(1, Ordering::Relaxed); // count every call that reaches this filter
+            pk == self.expected.as_slice()
+        }
+    }
+
+    #[test]
+    fn test_cached_primary_key_filter_reuses_previous_result() {
+        let expected = new_primary_key(&["a", "x"]);
+        let hits = Arc::new(AtomicUsize::new(0));
+        let mut filter = CachedPrimaryKeyFilter::new(Box::new(CountingPrimaryKeyFilter {
+            hits: Arc::clone(&hits),
+            expected: expected.clone(),
+        }));
+
+        assert!(filter.matches(expected.as_slice())); // first call reaches the inner filter
+        assert!(filter.matches(expected.as_slice())); // same key again: answered from the cache
+        assert!(!filter.matches(new_primary_key(&["b", "x"]).as_slice())); // new key: inner filter again
+
+        assert_eq!(hits.load(Ordering::Relaxed), 2); // three calls, two inner evaluations
+    }
+
+    #[test]
+    fn test_batch_single_primary_key() {
+        let pk_a = new_primary_key(&["a", "x"]);
+        let pk_b = new_primary_key(&["b", "x"]);
+
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]); // one key for all rows
+        assert_eq!(
+            batch_single_primary_key(&batch).unwrap(),
+            Some(pk_a.as_slice())
+        );
+
+        let batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]); // first and last keys differ
+        assert_eq!(batch_single_primary_key(&batch).unwrap(), None);
+    }
+}
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 500f32ae91..f152c97075 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -21,43 +21,40 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use api::v1::SemanticType;
-use async_trait::async_trait;
 use common_recordbatch::filter::SimpleFilterEvaluator;
-use common_telemetry::{debug, tracing, warn};
+use common_telemetry::{tracing, warn};
 use datafusion_expr::Expr;
 use datatypes::arrow::array::ArrayRef;
 use datatypes::arrow::datatypes::Field;
-use datatypes::arrow::error::ArrowError;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::data_type::ConcreteDataType;
 use datatypes::prelude::DataType;
+use futures::StreamExt;
 use mito_codec::row_converter::build_primary_key_codec;
 use object_store::ObjectStore;
-use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, RowSelection};
-use parquet::arrow::{FieldLevels, ProjectionMask, parquet_to_arrow_field_levels};
-use parquet::file::metadata::{KeyValue, PageIndexPolicy, ParquetMetaData};
+use parquet::arrow::ProjectionMask;
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions, RowSelection};
+use parquet::arrow::async_reader::{ParquetRecordBatchStream, ParquetRecordBatchStreamBuilder};
+use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData};
 use partition::expr::PartitionExpr;
-use snafu::{OptionExt, ResultExt};
+use snafu::ResultExt;
 use store_api::codec::PrimaryKeyEncoding;
 use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef};
 use store_api::region_request::PathType;
 use store_api::storage::{ColumnId, FileId};
 use table::predicate::Predicate;
 
-use crate::cache::CacheStrategy;
 use crate::cache::index::result_cache::PredicateKey;
+use crate::cache::{CacheStrategy, CachedSstMeta};
 #[cfg(feature = "vector_index")]
 use crate::error::ApplyVectorIndexSnafu;
-use crate::error::{
-    ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, ReadDataPartSnafu,
-    ReadParquetSnafu, Result, SerializePartitionExprSnafu,
-};
+use crate::error::{ReadDataPartSnafu, ReadParquetSnafu, Result, SerializePartitionExprSnafu};
 use crate::metrics::{
     PRECISE_FILTER_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL,
     READ_ROWS_TOTAL, READ_STAGE_ELAPSED,
 };
 use crate::read::flat_projection::CompactionProjectionMapper;
-use crate::read::prune::{PruneReader, Source};
+use crate::read::prune::FlatPruneReader;
 use crate::read::{Batch, BatchReader};
 use crate::sst::file::FileHandle;
 use crate::sst::index::bloom_filter::applier::{
@@ -71,16 +68,17 @@ use crate::sst::index::inverted_index::applier::{
 };
 #[cfg(feature = "vector_index")]
 use crate::sst::index::vector_index::applier::VectorIndexApplierRef;
+use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
+use crate::sst::parquet::async_reader::SstAsyncFileReader;
 use crate::sst::parquet::file_range::{
     FileRangeContext, FileRangeContextRef, PartitionFilterContext, PreFilterMode, RangeBase,
     row_group_contains_delete,
 };
 use crate::sst::parquet::format::{ReadFormat, need_override_sequence};
 use crate::sst::parquet::metadata::MetadataLoader;
-use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics};
+use crate::sst::parquet::row_group::ParquetFetchMetrics;
 use crate::sst::parquet::row_selection::RowGroupSelection;
 use crate::sst::parquet::stats::RowGroupPruningStats;
-use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY};
 use crate::sst::tag_maybe_to_dictionary_field;
 
 const INDEX_TYPE_FULLTEXT: &str = "fulltext";
@@ -303,7 +301,8 @@ impl ParquetReaderBuilder {
     pub async fn build(&self) -> Result<Option<ParquetReader>> {
         let mut metrics = ReaderMetrics::default();
 
-        let Some((context, selection)) = self.build_reader_input(&mut metrics).await? else {
+        let Some((context, selection)) = self.build_reader_input_inner(&mut metrics, true).await?
+        else {
             return Ok(None);
         };
         ParquetReader::new(Arc::new(context), selection)
@@ -325,12 +324,14 @@ impl ParquetReaderBuilder {
         &self,
         metrics: &mut ReaderMetrics,
     ) -> Result<Option<(FileRangeContext, RowGroupSelection)>> {
-        self.build_reader_input_inner(metrics).await
+        self.build_reader_input_inner(metrics, self.flat_format)
+            .await
     }
 
     async fn build_reader_input_inner(
         &self,
         metrics: &mut ReaderMetrics,
+        flat_format: bool,
     ) -> Result<Option<(FileRangeContext, RowGroupSelection)>> {
         let start = Instant::now();
 
@@ -338,7 +339,7 @@ impl ParquetReaderBuilder {
         let file_size = self.file_handle.meta_ref().file_size;
 
         // Loads parquet metadata of the file.
-        let (parquet_meta, cache_miss) = self
+        let (sst_meta, cache_miss) = self
             .read_parquet_metadata(
                 &file_path,
                 file_size,
@@ -346,9 +347,8 @@ impl ParquetReaderBuilder {
                 self.page_index_policy,
             )
             .await?;
-        // Decodes region metadata.
-        let key_value_meta = parquet_meta.file_metadata().key_value_metadata();
-        let region_meta = Arc::new(Self::get_region_metadata(&file_path, key_value_meta)?);
+        let parquet_meta = sst_meta.parquet_metadata();
+        let region_meta = sst_meta.region_metadata();
         let region_partition_expr_str = self
             .expected_metadata
             .as_ref()
@@ -373,7 +373,7 @@ impl ParquetReaderBuilder {
         // before compat handling.
         let compaction_projection_mapper = if self.compaction
             && !is_same_region_partition
-            && self.flat_format
+            && flat_format
             && region_meta.primary_key_encoding == PrimaryKeyEncoding::Sparse
         {
             Some(CompactionProjectionMapper::try_new(&region_meta)?)
@@ -385,7 +385,7 @@ impl ParquetReaderBuilder {
             ReadFormat::new(
                 region_meta.clone(),
                 Some(column_ids),
-                self.flat_format,
+                flat_format,
                 Some(parquet_meta.file_metadata().schema_descr().num_columns()),
                 &file_path,
                 skip_auto_convert,
@@ -401,7 +401,7 @@ impl ParquetReaderBuilder {
             ReadFormat::new(
                 region_meta.clone(),
                 Some(&column_ids),
-                self.flat_format,
+                flat_format,
                 Some(parquet_meta.file_metadata().schema_descr().num_columns()),
                 &file_path,
                 skip_auto_convert,
@@ -415,6 +415,12 @@ impl ParquetReaderBuilder {
                 .set_override_sequence(self.file_handle.meta_ref().sequence.map(|x| x.get()));
         }
 
+        // Computes the projection mask.
+        let parquet_schema_desc = parquet_meta.file_metadata().schema_descr();
+        let indices = read_format.projection_indices();
+        // Now we assume we don't have nested schemas.
+        // TODO(yingwen): Revisit this if we introduce nested types such as JSON type.
+        let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied());
         let selection = self
             .row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics)
             .await;
@@ -446,26 +452,20 @@ impl ParquetReaderBuilder {
             .map(|meta| meta.schema.clone())
             .unwrap_or_else(|| region_meta.schema.clone());
 
-        // Computes the projection mask.
-        let parquet_schema_desc = parquet_meta.file_metadata().schema_descr();
-        let indices = read_format.projection_indices();
-        // Now we assumes we don't have nested schemas.
-        // TODO(yingwen): Revisit this if we introduce nested types such as JSON type.
-        let projection_mask = ProjectionMask::roots(parquet_schema_desc, indices.iter().copied());
-
-        // Computes the field levels.
-        let hint = Some(read_format.arrow_schema().fields());
-        let field_levels =
-            parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint)
+        // Create ArrowReaderMetadata for async stream building.
+        let arrow_reader_options =
+            ArrowReaderOptions::new().with_schema(read_format.arrow_schema().clone());
+        let arrow_metadata =
+            ArrowReaderMetadata::try_new(parquet_meta.clone(), arrow_reader_options)
                 .context(ReadDataPartSnafu)?;
 
         let reader_builder = RowGroupReaderBuilder {
             file_handle: self.file_handle.clone(),
             file_path,
             parquet_meta,
+            arrow_metadata,
             object_store: self.object_store.clone(),
             projection: projection_mask,
-            field_levels,
             cache_strategy: self.cache_strategy.clone(),
         };
 
@@ -599,42 +599,15 @@ impl ParquetReaderBuilder {
         }))
     }
 
-    /// Decodes region metadata from key value.
-    fn get_region_metadata(
-        file_path: &str,
-        key_value_meta: Option<&Vec<KeyValue>>,
-    ) -> Result<RegionMetadata> {
-        let key_values = key_value_meta.context(InvalidParquetSnafu {
-            file: file_path,
-            reason: "missing key value meta",
-        })?;
-        let meta_value = key_values
-            .iter()
-            .find(|kv| kv.key == PARQUET_METADATA_KEY)
-            .with_context(|| InvalidParquetSnafu {
-                file: file_path,
-                reason: format!("key {} not found", PARQUET_METADATA_KEY),
-            })?;
-        let json = meta_value
-            .value
-            .as_ref()
-            .with_context(|| InvalidParquetSnafu {
-                file: file_path,
-                reason: format!("No value for key {}", PARQUET_METADATA_KEY),
-            })?;
-
-        RegionMetadata::from_json(json).context(InvalidMetadataSnafu)
-    }
-
     /// Reads parquet metadata of specific file.
-    /// Returns (metadata, cache_miss_flag).
+    /// Returns (fused metadata, cache_miss_flag).
     async fn read_parquet_metadata(
         &self,
         file_path: &str,
         file_size: u64,
         cache_metrics: &mut MetadataCacheMetrics,
         page_index_policy: PageIndexPolicy,
-    ) -> Result<(Arc<ParquetMetaData>, bool)> {
+    ) -> Result<(Arc<CachedSstMeta>, bool)> {
         let start = Instant::now();
         let _t = READ_STAGE_ELAPSED
             .with_label_values(&["read_parquet_metadata"])
@@ -644,7 +617,7 @@ impl ParquetReaderBuilder {
         // Tries to get from cache with metrics tracking.
         if let Some(metadata) = self
             .cache_strategy
-            .get_parquet_meta_data(file_id, cache_metrics, page_index_policy)
+            .get_sst_meta_data(file_id, cache_metrics, page_index_policy)
             .await
         {
             cache_metrics.metadata_load_cost += start.elapsed();
@@ -657,10 +630,10 @@ impl ParquetReaderBuilder {
         metadata_loader.with_page_index_policy(page_index_policy);
         let metadata = metadata_loader.load(cache_metrics).await?;
 
-        let metadata = Arc::new(metadata);
+        let metadata = Arc::new(CachedSstMeta::try_new(file_path, metadata)?);
         // Cache the metadata.
         self.cache_strategy
-            .put_parquet_meta_data(file_id, metadata.clone());
+            .put_sst_meta_data(file_id, metadata.clone());
 
         cache_metrics.metadata_load_cost += start.elapsed();
         Ok((metadata, true))
@@ -1667,7 +1640,7 @@ impl ReaderMetrics {
     }
 }
 
-/// Builder to build a [ParquetRecordBatchReader] for a row group.
+/// Builder to build a [ParquetRecordBatchStream] for a row group.
 pub(crate) struct RowGroupReaderBuilder {
     /// SST file to read.
     ///
@@ -1677,12 +1650,12 @@ pub(crate) struct RowGroupReaderBuilder {
     file_path: String,
     /// Metadata of the parquet file.
     parquet_meta: Arc<ParquetMetaData>,
+    /// Arrow reader metadata for building async stream.
+    arrow_metadata: ArrowReaderMetadata,
     /// Object store as an Operator.
     object_store: ObjectStore,
     /// Projection mask.
     projection: ProjectionMask,
-    /// Field levels to read.
-    field_levels: FieldLevels,
     /// Cache.
     cache_strategy: CacheStrategy,
 }
@@ -1706,66 +1679,43 @@ impl RowGroupReaderBuilder {
         &self.cache_strategy
     }
 
-    /// Builds a [ParquetRecordBatchReader] to read the row group at `row_group_idx`.
+    /// Builds a [ParquetRecordBatchStream] to read the row group at `row_group_idx`.
     pub(crate) async fn build(
         &self,
         row_group_idx: usize,
         row_selection: Option<RowSelection>,
         fetch_metrics: Option<&ParquetFetchMetrics>,
-    ) -> Result<ParquetRecordBatchReader> {
-        let fetch_start = Instant::now();
-
-        let mut row_group = InMemoryRowGroup::create(
-            self.file_handle.region_id(),
-            self.file_handle.file_id().file_id(),
-            &self.parquet_meta,
-            row_group_idx,
-            self.cache_strategy.clone(),
-            &self.file_path,
+    ) -> Result<ParquetRecordBatchStream<SstAsyncFileReader>> {
+        // Create async file reader with caching support.
+        let async_reader = SstAsyncFileReader::new(
+            self.file_handle.file_id(),
+            self.file_path.clone(),
             self.object_store.clone(),
-        );
-        // Fetches data into memory.
-        row_group
-            .fetch(&self.projection, row_selection.as_ref(), fetch_metrics)
-            .await
-            .context(ReadParquetSnafu {
-                path: &self.file_path,
-            })?;
-
-        // Record total fetch elapsed time.
-        if let Some(metrics) = fetch_metrics {
-            metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed();
-        }
-
-        // Builds the parquet reader.
-        // Now the row selection is None.
-        ParquetRecordBatchReader::try_new_with_row_groups(
-            &self.field_levels,
-            &row_group,
-            DEFAULT_READ_BATCH_SIZE,
-            row_selection,
+            self.cache_strategy.clone(),
+            self.parquet_meta.clone(),
+            row_group_idx,
         )
-        .context(ReadParquetSnafu {
-            path: &self.file_path,
-        })
-    }
-}
+        .with_fetch_metrics(fetch_metrics.cloned());
 
-/// The state of a [ParquetReader].
-enum ReaderState {
-    /// The reader is reading a row group.
-    Readable(PruneReader),
-    /// The reader is exhausted.
-    Exhausted(ReaderMetrics),
-}
+        // Build the async stream using ArrowReaderBuilder API.
+        let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
+            async_reader,
+            self.arrow_metadata.clone(),
+        );
+        builder = builder
+            .with_row_groups(vec![row_group_idx])
+            .with_projection(self.projection.clone())
+            .with_batch_size(DEFAULT_READ_BATCH_SIZE);
 
-impl ReaderState {
-    /// Returns the metrics of the reader.
-    fn metrics(&self) -> ReaderMetrics {
-        match self {
-            ReaderState::Readable(reader) => reader.metrics(),
-            ReaderState::Exhausted(m) => m.clone(),
+        if let Some(selection) = row_selection {
+            builder = builder.with_row_selection(selection);
         }
+
+        let stream = builder.build().context(ReadParquetSnafu {
+            path: &self.file_path,
+        })?;
+
+        Ok(stream)
     }
 }
 
@@ -1879,13 +1829,12 @@ pub struct ParquetReader {
     /// Row group selection to read.
     selection: RowGroupSelection,
     /// Reader of current row group.
-    reader_state: ReaderState,
+    reader: Option<FlatPruneReader>,
     /// Metrics for tracking row group fetch operations.
     fetch_metrics: ParquetFetchMetrics,
 }
 
-#[async_trait]
-impl BatchReader for ParquetReader {
+impl ParquetReader {
     #[tracing::instrument(
         skip_all,
         fields(
@@ -1893,18 +1842,20 @@ impl BatchReader for ParquetReader {
             file_id = %self.context.reader_builder().file_handle.file_id()
         )
     )]
-    async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        let ReaderState::Readable(reader) = &mut self.reader_state else {
-            return Ok(None);
-        };
+    pub async fn next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
+        loop {
+            if let Some(reader) = &mut self.reader {
+                if let Some(batch) = reader.next_batch().await? {
+                    return Ok(Some(batch));
+                }
+                self.reader = None;
+                continue;
+            }
 
-        // We don't collect the elapsed time if the reader returns an error.
-        if let Some(batch) = reader.next_batch().await? {
-            return Ok(Some(batch));
-        }
+            let Some((row_group_idx, row_selection)) = self.selection.pop_first() else {
+                return Ok(None);
+            };
 
-        // No more items in current row group, reads next row group.
-        while let Some((row_group_idx, row_selection)) = self.selection.pop_first() {
             let parquet_reader = self
                 .context
                 .reader_builder()
@@ -1915,54 +1866,14 @@ impl BatchReader for ParquetReader {
                 )
                 .await?;
 
-            // Resets the parquet reader.
-            // Compute skip_fields for this row group
             let skip_fields = self.context.should_skip_fields(row_group_idx);
-            reader.reset_source(
-                Source::RowGroup(RowGroupReader::new(self.context.clone(), parquet_reader)),
+            self.reader = Some(FlatPruneReader::new_with_row_group_reader(
+                self.context.clone(),
+                FlatRowGroupReader::new(self.context.clone(), parquet_reader),
                 skip_fields,
-            );
-            if let Some(batch) = reader.next_batch().await? {
-                return Ok(Some(batch));
-            }
+            ));
         }
-
-        // The reader is exhausted.
-        self.reader_state = ReaderState::Exhausted(reader.metrics().clone());
-        Ok(None)
     }
-}
-
-impl Drop for ParquetReader {
-    fn drop(&mut self) {
-        let metrics = self.reader_state.metrics();
-        debug!(
-            "Read parquet {} {}, range: {:?}, {}/{} row groups, metrics: {:?}",
-            self.context.reader_builder().file_handle.region_id(),
-            self.context.reader_builder().file_handle.file_id(),
-            self.context.reader_builder().file_handle.time_range(),
-            metrics.filter_metrics.rg_total
-                - metrics.filter_metrics.rg_inverted_filtered
-                - metrics.filter_metrics.rg_minmax_filtered
-                - metrics.filter_metrics.rg_fulltext_filtered
-                - metrics.filter_metrics.rg_bloom_filtered,
-            metrics.filter_metrics.rg_total,
-            metrics
-        );
-
-        // Report metrics.
-        READ_STAGE_ELAPSED
-            .with_label_values(&["build_parquet_reader"])
-            .observe(metrics.build_cost.as_secs_f64());
-        READ_STAGE_ELAPSED
-            .with_label_values(&["scan_row_groups"])
-            .observe(metrics.scan_cost.as_secs_f64());
-        metrics.observe_rows("parquet_reader");
-        metrics.filter_metrics.observe();
-    }
-}
-
-impl ParquetReader {
     /// Creates a new reader.
     #[tracing::instrument(
         skip_all,
@@ -1975,28 +1886,27 @@ impl ParquetReader {
         context: FileRangeContextRef,
         mut selection: RowGroupSelection,
     ) -> Result<Self> {
+        debug_assert!(context.read_format().as_flat().is_some());
         let fetch_metrics = ParquetFetchMetrics::default();
-        // No more items in current row group, reads next row group.
-        let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() {
+        let reader = if let Some((row_group_idx, row_selection)) = selection.pop_first() {
             let parquet_reader = context
                 .reader_builder()
                 .build(row_group_idx, Some(row_selection), Some(&fetch_metrics))
                 .await?;
-            // Compute skip_fields once for this row group
             let skip_fields = context.should_skip_fields(row_group_idx);
-            ReaderState::Readable(PruneReader::new_with_row_group_reader(
+            Some(FlatPruneReader::new_with_row_group_reader(
                 context.clone(),
-                RowGroupReader::new(context.clone(), parquet_reader),
+                FlatRowGroupReader::new(context.clone(), parquet_reader),
                 skip_fields,
             ))
         } else {
-            ReaderState::Exhausted(ReaderMetrics::default())
+            None
         };
 
         Ok(ParquetReader {
             context,
             selection,
-            reader_state,
+            reader,
             fetch_metrics,
         })
     }
@@ -2014,27 +1924,19 @@ impl ParquetReader {
 /// RowGroupReaderContext represents the fields that cannot be shared
 /// between different `RowGroupReader`s.
 pub(crate) trait RowGroupReaderContext: Send {
-    fn map_result(
-        &self,
-        result: std::result::Result<Option<RecordBatch>, ArrowError>,
-    ) -> Result<Option<RecordBatch>>;
-
     fn read_format(&self) -> &ReadFormat;
+
+    fn file_path(&self) -> &str;
 }
 
 impl RowGroupReaderContext for FileRangeContextRef {
-    fn map_result(
-        &self,
-        result: std::result::Result<Option<RecordBatch>, ArrowError>,
-    ) -> Result<Option<RecordBatch>> {
-        result.context(ArrowReaderSnafu {
-            path: self.file_path(),
-        })
-    }
-
     fn read_format(&self) -> &ReadFormat {
         self.as_ref().read_format()
     }
+
+    fn file_path(&self) -> &str {
+        self.as_ref().file_path()
+    }
 }
 
 /// [RowGroupReader] that reads from [FileRange].
@@ -2042,8 +1944,11 @@ pub(crate) type RowGroupReader = RowGroupReaderBase<FileRangeContextRef>;
 
 impl RowGroupReader {
     /// Creates a new reader from file range.
-    pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self {
-        Self::create(context, reader)
+    pub(crate) fn new(
+        context: FileRangeContextRef,
+        stream: ParquetRecordBatchStream<SstAsyncFileReader>,
+    ) -> Self {
+        Self::create(context, stream)
     }
 }
 
@@ -2051,8 +1956,8 @@ impl RowGroupReader {
 pub(crate) struct RowGroupReaderBase<T> {
     /// Context of [RowGroupReader] so adapts to different underlying implementation.
     context: T,
-    /// Inner parquet reader.
-    reader: ParquetRecordBatchReader,
+    /// Inner parquet record batch stream.
+    stream: ParquetRecordBatchStream<SstAsyncFileReader>,
     /// Buffered batches to return.
     batches: VecDeque<Batch>,
     /// Local scan metrics.
@@ -2066,7 +1971,7 @@ where
     T: RowGroupReaderContext,
 {
     /// Creates a new reader to read the primary key format.
-    pub(crate) fn create(context: T, reader: ParquetRecordBatchReader) -> Self {
+    pub(crate) fn create(context: T, stream: ParquetRecordBatchStream<SstAsyncFileReader>) -> Self {
         // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE.
         let override_sequence = context
             .read_format()
@@ -2075,7 +1980,7 @@ where
 
         Self {
             context,
-            reader,
+            stream,
             batches: VecDeque::new(),
             metrics: ReaderMetrics::default(),
             override_sequence,
@@ -2092,13 +1997,18 @@ where
         self.context.read_format()
     }
 
-    /// Tries to fetch next [RecordBatch] from the reader.
-    fn fetch_next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
-        self.context.map_result(self.reader.next().transpose())
+    /// Tries to fetch next [RecordBatch] from the stream asynchronously.
+    async fn fetch_next_record_batch(&mut self) -> Result<Option<RecordBatch>> {
+        match self.stream.next().await.transpose() {
+            Ok(batch) => Ok(batch),
+            Err(e) => Err(e).context(ReadParquetSnafu {
+                path: self.context.file_path(),
+            }),
+        }
     }
 
     /// Returns the next [Batch].
-    pub(crate) fn next_inner(&mut self) -> Result<Option<Batch>> {
+    pub(crate) async fn next_inner(&mut self) -> Result<Option<Batch>> {
         let scan_start = Instant::now();
         if let Some(batch) = self.batches.pop_front() {
             self.metrics.num_rows += batch.num_rows();
@@ -2108,7 +2018,7 @@ where
 
         // We need to fetch next record batch and convert it to batches.
         while self.batches.is_empty() {
-            let Some(record_batch) = self.fetch_next_record_batch()? else {
+            let Some(record_batch) = self.fetch_next_record_batch().await? else {
                 self.metrics.scan_cost += scan_start.elapsed();
                 return Ok(None);
             };
@@ -2136,10 +2046,10 @@ where
 #[async_trait::async_trait]
 impl<T> BatchReader for RowGroupReaderBase<T>
 where
-    T: RowGroupReaderContext,
+    T: RowGroupReaderContext + Send + Sync,
 {
     async fn next_batch(&mut self) -> Result<Option<Batch>> {
-        self.next_inner()
+        self.next_inner().await
     }
 }
 
@@ -2147,15 +2057,18 @@ where
 pub(crate) struct FlatRowGroupReader {
     /// Context for file ranges.
     context: FileRangeContextRef,
-    /// Inner parquet reader.
-    reader: ParquetRecordBatchReader,
+    /// Inner parquet record batch stream.
+    stream: ParquetRecordBatchStream<SstAsyncFileReader>,
     /// Cached sequence array to override sequences.
     override_sequence: Option<ArrayRef>,
 }
 
 impl FlatRowGroupReader {
     /// Creates a new flat reader from file range.
-    pub(crate) fn new(context: FileRangeContextRef, reader: ParquetRecordBatchReader) -> Self {
+    pub(crate) fn new(
+        context: FileRangeContextRef,
+        stream: ParquetRecordBatchStream<SstAsyncFileReader>,
+    ) -> Self {
         // The batch length from the reader should be less than or equal to DEFAULT_READ_BATCH_SIZE.
         let override_sequence = context
             .read_format()
@@ -2163,16 +2076,16 @@ impl FlatRowGroupReader {
 
         Self {
             context,
-            reader,
+            stream,
             override_sequence,
         }
     }
 
     /// Returns the next RecordBatch.
-    pub(crate) fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
-        match self.reader.next() {
+    pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
+        match self.stream.next().await {
             Some(batch_result) => {
-                let record_batch = batch_result.context(ArrowReaderSnafu {
+                let record_batch = batch_result.context(ReadParquetSnafu {
                     path: self.context.file_path(),
                 })?;
 
diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs
index 8f3f6c5f62..38ef62c6b8 100644
--- a/src/mito2/src/sst/parquet/row_group.rs
+++ b/src/mito2/src/sst/parquet/row_group.rs
@@ -12,28 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Ports private structs from [parquet crate](https://github.com/apache/arrow-rs/blob/7e134f4d277c0b62c27529fc15a4739de3ad0afd/parquet/src/arrow/async_reader/mod.rs#L644-L650).
+//! Parquet row group reading utilities.
 
 use std::ops::Range;
 use std::sync::Arc;
 
-use bytes::{Buf, Bytes};
-use object_store::ObjectStore;
-use parquet::arrow::ProjectionMask;
-use parquet::arrow::arrow_reader::{RowGroups, RowSelection};
-use parquet::column::page::{PageIterator, PageReader};
-use parquet::errors::{ParquetError, Result};
-use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
-use parquet::file::page_index::offset_index::OffsetIndexMetaData;
-use parquet::file::reader::{ChunkReader, Length};
-use parquet::file::serialized_reader::SerializedPageReader;
-use store_api::storage::{FileId, RegionId};
-use tokio::task::yield_now;
-
-use crate::cache::file_cache::{FileType, IndexKey};
-use crate::cache::{CacheStrategy, PageKey, PageValue};
-use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES};
-use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges};
+use crate::sst::parquet::helper::MERGE_GAP;
 
 /// Inner data for ParquetFetchMetrics.
 #[derive(Default, Debug, Clone)]
@@ -74,9 +58,9 @@ impl ParquetFetchMetricsData {
 }
 
 /// Metrics for tracking page/row group fetch operations.
-#[derive(Default)]
+#[derive(Default, Clone)]
 pub struct ParquetFetchMetrics {
-    pub data: std::sync::Mutex<ParquetFetchMetricsData>,
+    pub data: Arc<std::sync::Mutex<ParquetFetchMetricsData>>,
 }
 
 impl std::fmt::Debug for ParquetFetchMetrics {
@@ -204,363 +188,12 @@ impl ParquetFetchMetrics {
     }
 }
 
-pub(crate) struct RowGroupBase<'a> {
-    parquet_metadata: &'a ParquetMetaData,
-    row_group_idx: usize,
-    pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>,
-    /// Compressed page of each column.
-    column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
-    pub(crate) row_count: usize,
-}
-
-impl<'a> RowGroupBase<'a> {
-    pub(crate) fn new(parquet_meta: &'a ParquetMetaData, row_group_idx: usize) -> Self {
-        let metadata = parquet_meta.row_group(row_group_idx);
-        // `offset_index` is always `None` if we don't set
-        // [with_page_index()](https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index)
-        // to `true`.
-        let offset_index = parquet_meta
-            .offset_index()
-            // filter out empty offset indexes (old versions specified Some(vec![]) when no present)
-            .filter(|index| !index.is_empty())
-            .map(|x| x[row_group_idx].as_slice());
-
-        Self {
-            parquet_metadata: parquet_meta,
-            row_group_idx,
-            offset_index,
-            column_chunks: vec![None; metadata.columns().len()],
-            row_count: metadata.num_rows() as usize,
-        }
-    }
-
-    pub(crate) fn calc_sparse_read_ranges(
-        &self,
-        projection: &ProjectionMask,
-        offset_index: &[OffsetIndexMetaData],
-        selection: &RowSelection,
-    ) -> (Vec<Range<u64>>, Vec<Vec<usize>>) {
-        // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the
-        // `RowSelection`
-        let mut page_start_offsets: Vec<Vec<usize>> = vec![];
-        let ranges = self
-            .column_chunks
-            .iter()
-            .zip(self.row_group_metadata().columns())
-            .enumerate()
-            .filter(|&(idx, (chunk, _chunk_meta))| chunk.is_none() && projection.leaf_included(idx))
-            .flat_map(|(idx, (_chunk, chunk_meta))| {
-                // If the first page does not start at the beginning of the column,
-                // then we need to also fetch a dictionary page.
-                let mut ranges = vec![];
-                let (start, _len) = chunk_meta.byte_range();
-                match offset_index[idx].page_locations.first() {
-                    Some(first) if first.offset as u64 != start => {
-                        ranges.push(start..first.offset as u64);
-                    }
-                    _ => (),
-                }
-
-                ranges.extend(
-                    selection
-                        .scan_ranges(&offset_index[idx].page_locations)
-                        .iter()
-                        .map(|range| range.start..range.end),
-                );
-                page_start_offsets.push(ranges.iter().map(|range| range.start as usize).collect());
-
-                ranges
-            })
-            .collect::<Vec<_>>();
-        (ranges, page_start_offsets)
-    }
-
-    pub(crate) fn assign_sparse_chunk(
-        &mut self,
-        projection: &ProjectionMask,
-        data: Vec<Bytes>,
-        page_start_offsets: Vec<Vec<usize>>,
-    ) {
-        let mut page_start_offsets = page_start_offsets.into_iter();
-        let mut chunk_data = data.into_iter();
-
-        for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
-            if chunk.is_some() || !projection.leaf_included(idx) {
-                continue;
-            }
-
-            if let Some(offsets) = page_start_offsets.next() {
-                let mut chunks = Vec::with_capacity(offsets.len());
-                for _ in 0..offsets.len() {
-                    chunks.push(chunk_data.next().unwrap());
-                }
-
-                let column = self
-                    .parquet_metadata
-                    .row_group(self.row_group_idx)
-                    .column(idx);
-                *chunk = Some(Arc::new(ColumnChunkData::Sparse {
-                    length: column.byte_range().1 as usize,
-                    data: offsets.into_iter().zip(chunks).collect(),
-                }))
-            }
-        }
-    }
-
-    pub(crate) fn calc_dense_read_ranges(&self, projection: &ProjectionMask) -> Vec<Range<u64>> {
-        self.column_chunks
-            .iter()
-            .enumerate()
-            .filter(|&(idx, chunk)| chunk.is_none() && projection.leaf_included(idx))
-            .map(|(idx, _chunk)| {
-                let column = self.row_group_metadata().column(idx);
-                let (start, length) = column.byte_range();
-                start..(start + length)
-            })
-            .collect::<Vec<_>>()
-    }
-
-    /// Assigns compressed chunk binary data to [RowGroupBase::column_chunks]
-    /// and returns the chunk offset and binary data assigned.
-    pub(crate) fn assign_dense_chunk(
-        &mut self,
-        projection: &ProjectionMask,
-        chunk_data: Vec<Bytes>,
-    ) {
-        let mut chunk_data = chunk_data.into_iter();
-
-        for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
-            if chunk.is_some() || !projection.leaf_included(idx) {
-                continue;
-            }
-
-            // Get the fetched page.
-            let Some(data) = chunk_data.next() else {
-                continue;
-            };
-
-            let column = self
-                .parquet_metadata
-                .row_group(self.row_group_idx)
-                .column(idx);
-            *chunk = Some(Arc::new(ColumnChunkData::Dense {
-                offset: column.byte_range().0 as usize,
-                data,
-            }));
-        }
-    }
-
-    /// Create [PageReader] from [RowGroupBase::column_chunks]
-    pub(crate) fn column_reader(
-        &self,
-        col_idx: usize,
-    ) -> Result<SerializedPageReader<ColumnChunkData>> {
-        let page_reader = match &self.column_chunks[col_idx] {
-            None => {
-                return Err(ParquetError::General(format!(
-                    "Invalid column index {col_idx}, column was not fetched"
-                )));
-            }
-            Some(data) => {
-                let page_locations = self
-                    .offset_index
-                    // filter out empty offset indexes (old versions specified Some(vec![]) when no present)
-                    .filter(|index| !index.is_empty())
-                    .map(|index| index[col_idx].page_locations.clone());
-                SerializedPageReader::new(
-                    data.clone(),
-                    self.row_group_metadata().column(col_idx),
-                    self.row_count,
-                    page_locations,
-                )?
-            }
-        };
-
-        Ok(page_reader)
-    }
-
-    pub(crate) fn parquet_metadata(&self) -> &ParquetMetaData {
-        self.parquet_metadata
-    }
-
-    pub(crate) fn row_group_metadata(&self) -> &RowGroupMetaData {
-        self.parquet_metadata().row_group(self.row_group_idx)
-    }
-}
-
-/// An in-memory collection of column chunks
-pub struct InMemoryRowGroup<'a> {
-    region_id: RegionId,
-    file_id: FileId,
-    row_group_idx: usize,
-    cache_strategy: CacheStrategy,
-    file_path: &'a str,
-    /// Object store.
-    object_store: ObjectStore,
-    base: RowGroupBase<'a>,
-}
-
-impl<'a> InMemoryRowGroup<'a> {
-    /// Creates a new [InMemoryRowGroup] by `row_group_idx`.
-    ///
-    /// # Panics
-    /// Panics if the `row_group_idx` is invalid.
-    pub fn create(
-        region_id: RegionId,
-        file_id: FileId,
-        parquet_meta: &'a ParquetMetaData,
-        row_group_idx: usize,
-        cache_strategy: CacheStrategy,
-        file_path: &'a str,
-        object_store: ObjectStore,
-    ) -> Self {
-        Self {
-            region_id,
-            file_id,
-            row_group_idx,
-            cache_strategy,
-            file_path,
-            object_store,
-            base: RowGroupBase::new(parquet_meta, row_group_idx),
-        }
-    }
-
-    /// Fetches the necessary column data into memory
-    pub async fn fetch(
-        &mut self,
-        projection: &ProjectionMask,
-        selection: Option<&RowSelection>,
-        metrics: Option<&ParquetFetchMetrics>,
-    ) -> Result<()> {
-        if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) {
-            let (fetch_ranges, page_start_offsets) =
-                self.base
-                    .calc_sparse_read_ranges(projection, offset_index, selection);
-
-            let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
-            // Assign sparse chunk data to base.
-            self.base
-                .assign_sparse_chunk(projection, chunk_data, page_start_offsets);
-        } else {
-            // Release the CPU to avoid blocking the runtime. Since `fetch_pages_from_cache`
-            // is a synchronous, CPU-bound operation.
-            yield_now().await;
-
-            // Calculate ranges to read.
-            let fetch_ranges = self.base.calc_dense_read_ranges(projection);
-
-            if fetch_ranges.is_empty() {
-                // Nothing to fetch.
-                return Ok(());
-            }
-
-            // Fetch data with ranges
-            let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?;
-
-            // Assigns fetched data to base.
-            self.base.assign_dense_chunk(projection, chunk_data);
-        }
-
-        Ok(())
-    }
-
-    /// Try to fetch data from the memory cache or the WriteCache,
-    /// if not in WriteCache, fetch data from object store directly.
-    async fn fetch_bytes(
-        &self,
-        ranges: &[Range<u64>],
-        metrics: Option<&ParquetFetchMetrics>,
-    ) -> Result<Vec<Bytes>> {
-        // Now fetch page timer includes the whole time to read pages.
-        let _timer = READ_STAGE_FETCH_PAGES.start_timer();
-
-        let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec());
-        if let Some(pages) = self.cache_strategy.get_pages(&page_key) {
-            if let Some(metrics) = metrics {
-                let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum();
-                let mut metrics_data = metrics.data.lock().unwrap();
-                metrics_data.page_cache_hit += 1;
-                metrics_data.pages_to_fetch_mem += ranges.len();
-                metrics_data.page_size_to_fetch_mem += total_size;
-                metrics_data.page_size_needed += total_size;
-            }
-            return Ok(pages.compressed.clone());
-        }
-
-        // Calculate total range size for metrics.
-        let (total_range_size, unaligned_size) = compute_total_range_size(ranges);
-
-        let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet);
-        let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now());
-        let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await;
-        let pages = match write_cache_result {
-            Some(data) => {
-                if let Some(metrics) = metrics {
-                    let elapsed = fetch_write_cache_start
-                        .map(|start| start.elapsed())
-                        .unwrap_or_default();
-                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
-                    let mut metrics_data = metrics.data.lock().unwrap();
-                    metrics_data.write_cache_fetch_elapsed += elapsed;
-                    metrics_data.write_cache_hit += 1;
-                    metrics_data.pages_to_fetch_write_cache += ranges.len();
-                    metrics_data.page_size_to_fetch_write_cache += unaligned_size;
-                    metrics_data.page_size_needed += range_size_needed;
-                }
-                data
-            }
-            None => {
-                // Fetch data from object store.
-                let _timer = READ_STAGE_ELAPSED
-                    .with_label_values(&["cache_miss_read"])
-                    .start_timer();
-
-                let start = metrics.map(|_| std::time::Instant::now());
-                let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges)
-                    .await
-                    .map_err(|e| ParquetError::External(Box::new(e)))?;
-                if let Some(metrics) = metrics {
-                    let elapsed = start.map(|start| start.elapsed()).unwrap_or_default();
-                    let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum();
-                    let mut metrics_data = metrics.data.lock().unwrap();
-                    metrics_data.store_fetch_elapsed += elapsed;
-                    metrics_data.cache_miss += 1;
-                    metrics_data.pages_to_fetch_store += ranges.len();
-                    metrics_data.page_size_to_fetch_store += unaligned_size;
-                    metrics_data.page_size_needed += range_size_needed;
-                }
-                data
-            }
-        };
-
-        // Put pages back to the cache.
-        let page_value = PageValue::new(pages.clone(), total_range_size);
-        self.cache_strategy
-            .put_pages(page_key, Arc::new(page_value));
-
-        Ok(pages)
-    }
-
-    /// Fetches data from write cache.
-    /// Returns `None` if the data is not in the cache.
-    async fn fetch_ranges_from_write_cache(
-        &self,
-        key: IndexKey,
-        ranges: &[Range<u64>],
-    ) -> Option<Vec<Bytes>> {
-        if let Some(cache) = self.cache_strategy.write_cache() {
-            return cache.file_cache().read_ranges(key, ranges).await;
-        }
-        None
-    }
-}
-
 /// Computes the max possible buffer size to read the given `ranges`.
 /// Returns (aligned_size, unaligned_size) where:
 /// - aligned_size: total size aligned to pooled buffer size
 /// - unaligned_size: actual total size without alignment
 // See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192
-fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
+pub(crate) fn compute_total_range_size(ranges: &[Range<u64>]) -> (u64, u64) {
     if ranges.is_empty() {
         return (0, 0);
     }
@@ -602,96 +235,3 @@ fn align_to_pooled_buf_size(size: u64) -> u64 {
     const POOLED_BUF_SIZE: u64 = 2 * 1024 * 1024;
     size.div_ceil(POOLED_BUF_SIZE) * POOLED_BUF_SIZE
 }
-
-impl RowGroups for InMemoryRowGroup<'_> {
-    fn num_rows(&self) -> usize {
-        self.base.row_count
-    }
-
-    fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>> {
-        // Creates a page reader to read column at `i`.
-        let page_reader = self.base.column_reader(i)?;
-
-        Ok(Box::new(ColumnChunkIterator {
-            reader: Some(Ok(Box::new(page_reader))),
-        }))
-    }
-
-    fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_> {
-        Box::new(std::iter::once(self.base.row_group_metadata()))
-    }
-
-    fn metadata(&self) -> &ParquetMetaData {
-        self.base.parquet_metadata()
-    }
-}
-
-/// An in-memory column chunk
-#[derive(Clone)]
-pub(crate) enum ColumnChunkData {
-    /// Column chunk data representing only a subset of data pages
-    Sparse {
-        /// Length of the full column chunk
-        length: usize,
-        /// Set of data pages included in this sparse chunk. Each element is a tuple
-        /// of (page offset, page data)
-        data: Vec<(usize, Bytes)>,
-    },
-    /// Full column chunk and its offset
-    Dense { offset: usize, data: Bytes },
-}
-
-impl ColumnChunkData {
-    fn get(&self, start: u64) -> Result<Bytes> {
-        match &self {
-            ColumnChunkData::Sparse { data, .. } => data
-                .binary_search_by_key(&start, |(offset, _)| *offset as u64)
-                .map(|idx| data[idx].1.clone())
-                .map_err(|_| {
-                    ParquetError::General(format!(
-                        "Invalid offset in sparse column chunk data: {start}"
-                    ))
-                }),
-            ColumnChunkData::Dense { offset, data } => {
-                let start = start as usize - *offset;
-                Ok(data.slice(start..))
-            }
-        }
-    }
-}
-
-impl Length for ColumnChunkData {
-    fn len(&self) -> u64 {
-        match &self {
-            ColumnChunkData::Sparse { length, .. } => *length as u64,
-            ColumnChunkData::Dense { data, .. } => data.len() as u64,
-        }
-    }
-}
-
-impl ChunkReader for ColumnChunkData {
-    type T = bytes::buf::Reader<Bytes>;
-
-    fn get_read(&self, start: u64) -> Result<Self::T> {
-        Ok(self.get(start)?.reader())
-    }
-
-    fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
-        Ok(self.get(start)?.slice(..length))
-    }
-}
-
-/// Implements [`PageIterator`] for a single column chunk, yielding a single [`PageReader`]
-pub(crate) struct ColumnChunkIterator {
-    pub(crate) reader: Option<Result<Box<dyn PageReader>>>,
-}
-
-impl Iterator for ColumnChunkIterator {
-    type Item = Result<Box<dyn PageReader>>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.reader.take()
-    }
-}
-
-impl PageIterator for ColumnChunkIterator {}
diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs
index b207f11ef8..4e75073e26 100644
--- a/src/mito2/src/sst/parquet/writer.rs
+++ b/src/mito2/src/sst/parquet/writer.rs
@@ -50,7 +50,7 @@ use crate::config::{IndexBuildMode, IndexConfig};
 use crate::error::{
     InvalidMetadataSnafu, OpenDalSnafu, Result, UnexpectedSnafu, WriteParquetSnafu,
 };
-use crate::read::{Batch, FlatSource, Source};
+use crate::read::FlatSource;
 use crate::sst::file::RegionFileId;
 use crate::sst::index::{IndexOutput, Indexer, IndexerBuilder};
 use crate::sst::parquet::flat_format::{FlatWriteFormat, time_index_column_index};
@@ -60,6 +60,35 @@ use crate::sst::{
     DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY, FlatSchemaOptions, SeriesEstimator,
 };
 
+/// Selects how a flat RecordBatch is converted before it is written to parquet.
+enum FlatBatchConverter {
+    /// Keep the flat layout, converting via `FlatWriteFormat::convert_batch`.
+    Flat(FlatWriteFormat),
+    /// Convert flat batch to primary-key format by stripping tag columns.
+    PrimaryKey {
+        format: PrimaryKeyWriteFormat, // Performs the conversion and supplies the arrow schema.
+        num_fields: usize, // Count of field columns; forwarded to `convert_flat_batch`.
+    },
+}
+
+impl FlatBatchConverter {
+    /// Returns the arrow schema of the write format held by this variant.
+    fn arrow_schema(&self) -> &SchemaRef {
+        match self {
+            Self::Flat(flat) => flat.arrow_schema(),
+            Self::PrimaryKey { format: pk, .. } => pk.arrow_schema(),
+        }
+    }
+
+    /// Converts `batch` with the write format held by this variant.
+    fn convert_batch(&self, batch: &RecordBatch) -> Result<RecordBatch> {
+        match self {
+            Self::Flat(flat) => flat.convert_batch(batch),
+            Self::PrimaryKey { format: pk, num_fields: n } => pk.convert_flat_batch(batch, *n),
+        }
+    }
+}
+
 /// Parquet SST writer.
 pub struct ParquetWriter<'a, F: WriterFactory, I: IndexerBuilder, P: FilePathProvider> {
     /// Path provider that creates SST and index file paths according to file id.
@@ -240,81 +269,6 @@ where
         Ok(())
     }
 
-    /// Iterates source and writes all rows to Parquet file.
-    ///
-    /// Returns the [SstInfo] if the SST is written.
-    pub async fn write_all(
-        &mut self,
-        source: Source,
-        override_sequence: Option<SequenceNumber>, // override the `sequence` field from `Source`
-        opts: &WriteOptions,
-    ) -> Result<SstInfoArray> {
-        let res = self
-            .write_all_without_cleaning(source, override_sequence, opts)
-            .await;
-        if res.is_err() {
-            // Clean tmp files explicitly on failure.
-            let file_id = self.current_file;
-            if let Some(cleaner) = &self.file_cleaner {
-                cleaner.clean_by_file_id(file_id).await;
-            }
-        }
-        res
-    }
-
-    async fn write_all_without_cleaning(
-        &mut self,
-        mut source: Source,
-        override_sequence: Option<SequenceNumber>, // override the `sequence` field from `Source`
-        opts: &WriteOptions,
-    ) -> Result<SstInfoArray> {
-        let mut results = smallvec![];
-        let write_format = PrimaryKeyWriteFormat::new(self.metadata.clone())
-            .with_override_sequence(override_sequence);
-        let mut stats = SourceStats::default();
-
-        while let Some(res) = self
-            .write_next_batch(&mut source, &write_format, opts)
-            .await
-            .transpose()
-        {
-            match res {
-                Ok(mut batch) => {
-                    stats.update(&batch);
-                    let start = Instant::now();
-                    // safety: self.current_indexer must be set when first batch has been written.
-                    match self.index_config.build_mode {
-                        IndexBuildMode::Sync => {
-                            self.current_indexer
-                                .as_mut()
-                                .unwrap()
-                                .update(&mut batch)
-                                .await;
-                        }
-                        IndexBuildMode::Async => {}
-                    }
-                    self.metrics.update_index += start.elapsed();
-                    if let Some(max_file_size) = opts.max_file_size
-                        && self.bytes_written.load(Ordering::Relaxed) > max_file_size
-                    {
-                        self.finish_current_file(&mut results, &mut stats).await?;
-                    }
-                }
-                Err(e) => {
-                    if let Some(indexer) = &mut self.current_indexer {
-                        indexer.abort().await;
-                    }
-                    return Err(e);
-                }
-            }
-        }
-
-        self.finish_current_file(&mut results, &mut stats).await?;
-
-        // object_store.write will make sure all bytes are written or an error is raised.
-        Ok(results)
-    }
-
     /// Iterates FlatSource and writes all RecordBatch in flat format to Parquet file.
     ///
     /// Returns the [SstInfo] if the SST is written.
@@ -324,11 +278,15 @@ where
         override_sequence: Option<SequenceNumber>,
         opts: &WriteOptions,
     ) -> Result<SstInfoArray> {
-        let res = self
-            .write_all_flat_without_cleaning(source, override_sequence, opts)
-            .await;
+        let converter = FlatBatchConverter::Flat(
+            FlatWriteFormat::new(
+                self.metadata.clone(),
+                &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
+            )
+            .with_override_sequence(override_sequence),
+        );
+        let res = self.write_all_flat_inner(source, &converter, opts).await;
         if res.is_err() {
-            // Clean tmp files explicitly on failure.
             let file_id = self.current_file;
             if let Some(cleaner) = &self.file_cleaner {
                 cleaner.clean_by_file_id(file_id).await;
@@ -337,36 +295,58 @@ where
         res
     }
 
-    async fn write_all_flat_without_cleaning(
+    /// Converts every RecordBatch of a FlatSource to primary-key format and writes it to Parquet.
+    ///
+    /// Returns the [SstInfo] if written; on error, cleans up by file id via `file_cleaner` if set.
+    pub async fn write_all_flat_as_primary_key(
         &mut self,
-        mut source: FlatSource,
+        source: FlatSource,
         override_sequence: Option<SequenceNumber>,
         opts: &WriteOptions,
+    ) -> Result<SstInfoArray> {
+        let num_fields = self.metadata.field_columns().count();
+        let converter = FlatBatchConverter::PrimaryKey {
+            format: PrimaryKeyWriteFormat::new(self.metadata.clone())
+                .with_override_sequence(override_sequence),
+            num_fields,
+        };
+        let res = self.write_all_flat_inner(source, &converter, opts).await;
+        if res.is_err() {
+            let file_id = self.current_file;
+            if let Some(cleaner) = &self.file_cleaner {
+                cleaner.clean_by_file_id(file_id).await;
+            }
+        }
+        res
+    }
+
+    async fn write_all_flat_inner(
+        &mut self,
+        mut source: FlatSource,
+        converter: &FlatBatchConverter,
+        opts: &WriteOptions,
     ) -> Result<SstInfoArray> {
         let mut results = smallvec![];
-        let flat_format = FlatWriteFormat::new(
-            self.metadata.clone(),
-            &FlatSchemaOptions::from_encoding(self.metadata.primary_key_encoding),
-        )
-        .with_override_sequence(override_sequence);
         let mut stats = SourceStats::default();
 
         while let Some(record_batch) = self
-            .write_next_flat_batch(&mut source, &flat_format, opts)
+            .write_next_flat_batch(&mut source, converter, opts)
             .await
             .transpose()
         {
             match record_batch {
                 Ok(batch) => {
                     stats.update_flat(&batch)?;
-                    let start = Instant::now();
-                    // safety: self.current_indexer must be set when first batch has been written.
-                    self.current_indexer
-                        .as_mut()
-                        .unwrap()
-                        .update_flat(&batch)
-                        .await;
-                    self.metrics.update_index += start.elapsed();
+                    if matches!(self.index_config.build_mode, IndexBuildMode::Sync) {
+                        let start = Instant::now();
+                        // safety: self.current_indexer must be set when first batch has been written.
+                        self.current_indexer
+                            .as_mut()
+                            .unwrap()
+                            .update_flat(&batch)
+                            .await;
+                        self.metrics.update_index += start.elapsed();
+                    }
                     if let Some(max_file_size) = opts.max_file_size
                         && self.bytes_written.load(Ordering::Relaxed) > max_file_size
                     {
@@ -411,34 +391,10 @@ where
             .set_column_compression(op_type_col, Compression::UNCOMPRESSED)
     }
 
-    async fn write_next_batch(
-        &mut self,
-        source: &mut Source,
-        write_format: &PrimaryKeyWriteFormat,
-        opts: &WriteOptions,
-    ) -> Result<Option<Batch>> {
-        let start = Instant::now();
-        let Some(batch) = source.next_batch().await? else {
-            return Ok(None);
-        };
-        self.metrics.iter_source += start.elapsed();
-
-        let arrow_batch = write_format.convert_batch(&batch)?;
-
-        let start = Instant::now();
-        self.maybe_init_writer(write_format.arrow_schema(), opts)
-            .await?
-            .write(&arrow_batch)
-            .await
-            .context(WriteParquetSnafu)?;
-        self.metrics.write_batch += start.elapsed();
-        Ok(Some(batch))
-    }
-
     async fn write_next_flat_batch(
         &mut self,
         source: &mut FlatSource,
-        flat_format: &FlatWriteFormat,
+        converter: &FlatBatchConverter,
         opts: &WriteOptions,
     ) -> Result<Option<RecordBatch>> {
         let start = Instant::now();
@@ -447,15 +403,16 @@ where
         };
         self.metrics.iter_source += start.elapsed();
 
-        let arrow_batch = flat_format.convert_batch(&record_batch)?;
+        let arrow_batch = converter.convert_batch(&record_batch)?;
 
         let start = Instant::now();
-        self.maybe_init_writer(flat_format.arrow_schema(), opts)
+        self.maybe_init_writer(converter.arrow_schema(), opts)
             .await?
             .write(&arrow_batch)
             .await
             .context(WriteParquetSnafu)?;
         self.metrics.write_batch += start.elapsed();
+        // Return original flat batch for stats/indexer which use flat layout.
         Ok(Some(record_batch))
     }
 
@@ -515,26 +472,6 @@ struct SourceStats {
 }
 
 impl SourceStats {
-    fn update(&mut self, batch: &Batch) {
-        if batch.is_empty() {
-            return;
-        }
-
-        self.num_rows += batch.num_rows();
-        self.series_estimator.update(batch);
-        // Safety: batch is not empty.
-        let (min_in_batch, max_in_batch) = (
-            batch.first_timestamp().unwrap(),
-            batch.last_timestamp().unwrap(),
-        );
-        if let Some(time_range) = &mut self.time_range {
-            time_range.0 = time_range.0.min(min_in_batch);
-            time_range.1 = time_range.1.max(max_in_batch);
-        } else {
-            self.time_range = Some((min_in_batch, max_in_batch));
-        }
-    }
-
     fn update_flat(&mut self, record_batch: &RecordBatch) -> Result<()> {
         if record_batch.num_rows() == 0 {
             return Ok(());
diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs
index 842689bba6..350195bfa9 100644
--- a/src/mito2/src/test_util.rs
+++ b/src/mito2/src/test_util.rs
@@ -15,6 +15,7 @@
 //! Utilities for testing.
 
 pub mod batch_util;
+pub mod bench_util;
 pub mod memtable_util;
 pub mod scheduler_util;
 pub mod sst_util;
diff --git a/src/mito2/src/test_util/bench_util.rs b/src/mito2/src/test_util/bench_util.rs
new file mode 100644
index 0000000000..8f182e4157
--- /dev/null
+++ b/src/mito2/src/test_util/bench_util.rs
@@ -0,0 +1,271 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Shared utilities for mito2 benchmarks.
+//!
+//! Provides a TSBS cpu-like data generator ([`CpuDataGenerator`]) and schema
+//! ([`cpu_metadata`]) used by multiple benchmark binaries in this directory.
+
+use api::v1::value::ValueData;
+use api::v1::{Row, Rows, SemanticType};
+use datafusion_common::Column;
+use datafusion_expr::{Expr, lit};
+use datatypes::data_type::ConcreteDataType;
+use datatypes::schema::ColumnSchema;
+use rand::Rng;
+use rand::rngs::ThreadRng;
+use rand::seq::IndexedRandom;
+use store_api::metadata::{
+    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
+};
+use store_api::storage::RegionId;
+use table::predicate::Predicate;
+
+use crate::memtable::KeyValues;
+use crate::test_util::memtable_util::region_metadata_to_row_schema;
+
+/// Tag values describing one simulated host in the TSBS cpu-like data set.
+///
+/// Every field maps to the tag column of the same name in [`cpu_metadata`].
+pub struct Host {
+    pub hostname: String,
+    pub region: String,
+    pub datacenter: String,
+    pub rack: String,
+    pub os: String,
+    pub arch: String,
+    pub team: String,
+    pub service: String,
+    pub service_version: String,
+    pub service_environment: String,
+}
+
+impl Host {
+    /// Builds a host named `host_{id}`.
+    ///
+    /// `region`, `datacenter`, `rack`, `service` and `service_version` are drawn
+    /// from the thread-local RNG; the remaining tags are fixed constants.
+    pub fn random_with_id(id: usize) -> Host {
+        let mut rng = rand::rng();
+        let region = format!("ap-southeast-{}", rng.random_range(0..10));
+        // The datacenter is the region followed by a zone letter in `a..=e`.
+        let zone = ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap();
+        let datacenter = format!("{region}{zone}");
+        Host {
+            hostname: format!("host_{id}"),
+            region,
+            datacenter,
+            rack: rng.random_range(0..100).to_string(),
+            os: "Ubuntu16.04LTS".to_string(),
+            arch: "x86".to_string(),
+            team: "CHI".to_string(),
+            service: rng.random_range(0..100).to_string(),
+            service_version: rng.random_range(0..10).to_string(),
+            service_environment: "test".to_string(),
+        }
+    }
+
+    /// Appends this host's ten tag values to `values` as string values.
+    ///
+    /// The order is hostname, region, datacenter, rack, os, arch, team, service,
+    /// service_version, service_environment.
+    pub fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
+        let tags = [
+            &self.hostname,
+            &self.region,
+            &self.datacenter,
+            &self.rack,
+            &self.os,
+            &self.arch,
+            &self.team,
+            &self.service,
+            &self.service_version,
+            &self.service_environment,
+        ];
+        values.extend(tags.iter().map(|&tag| api::v1::Value {
+            value_data: Some(ValueData::StringValue(tag.clone())),
+        }));
+    }
+}
+
+/// Generates TSBS cpu-like rows for a fixed set of hosts.
+///
+/// [`CpuDataGenerator::iter`] covers the seconds in `start_sec..end_sec`.
+pub struct CpuDataGenerator {
+    pub metadata: RegionMetadataRef,
+    column_schemas: Vec<api::v1::ColumnSchema>,
+    hosts: Vec<Host>,
+    start_sec: i64,
+    end_sec: i64,
+}
+
+impl CpuDataGenerator {
+    /// Creates a generator with `num_hosts` random hosts; the row schema is
+    /// derived from `metadata`.
+    pub fn new(
+        metadata: RegionMetadataRef,
+        num_hosts: usize,
+        start_sec: i64,
+        end_sec: i64,
+    ) -> Self {
+        let column_schemas = region_metadata_to_row_schema(&metadata);
+        let hosts = Self::generate_hosts(num_hosts);
+        Self {
+            metadata,
+            column_schemas,
+            hosts,
+            start_sec,
+            end_sec,
+        }
+    }
+
+    /// Yields one [`KeyValues`] (one row per host) for every 10th second in
+    /// `start_sec..end_sec`; the mutation sequence is the 0-based batch index.
+    pub fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
+        let timestamps = (self.start_sec..self.end_sec).step_by(10);
+        timestamps
+            .enumerate()
+            .map(|(seq, ts)| self.build_key_values(seq, ts))
+    }
+
+    /// Builds one `Put` mutation with sequence `seq` that holds a row per host,
+    /// all timestamped `current_sec` (stored in milliseconds).
+    ///
+    /// # Panics
+    ///
+    /// Panics if unwrapping the result of `KeyValues::new` fails.
+    pub fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
+        let rows = self
+            .hosts
+            .iter()
+            .map(|host| {
+                let mut rng = rand::rng();
+                // 1 timestamp + 10 tags + 10 fields.
+                let mut values = Vec::with_capacity(21);
+                values.push(api::v1::Value {
+                    value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
+                });
+                host.fill_values(&mut values);
+                values.extend((0..10).map(|_| api::v1::Value {
+                    value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
+                }));
+                Row { values }
+            })
+            .collect();
+        let payload = Rows {
+            schema: self.column_schemas.clone(),
+            rows,
+        };
+        let mutation = api::v1::Mutation {
+            op_type: api::v1::OpType::Put as i32,
+            sequence: seq as u64,
+            rows: Some(payload),
+            write_hint: None,
+        };
+
+        KeyValues::new(&self.metadata, mutation).unwrap()
+    }
+
+    /// Returns a [`Predicate`] holding the filter `hostname = <random host>`.
+    pub fn random_host_filter(&self) -> Predicate {
+        Predicate::new(self.random_host_filter_exprs())
+    }
+
+    /// Returns the filter `hostname = <random host>` as a single-element list.
+    pub fn random_host_filter_exprs(&self) -> Vec<Expr> {
+        let hostname = Expr::Column(Column::from_name("hostname"));
+        vec![hostname.eq(lit(self.random_hostname()))]
+    }
+
+    /// Returns the hostname of a randomly chosen host.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the generator was created with zero hosts.
+    pub fn random_hostname(&self) -> String {
+        let mut rng = rand::rng();
+        let host = self.hosts.choose(&mut rng).unwrap();
+        host.hostname.clone()
+    }
+
+    /// Returns a random whole number in `[30, 95)` as an `f64`.
+    pub fn random_f64(rng: &mut ThreadRng) -> f64 {
+        f64::from(rng.random_range(30u32..95))
+    }
+
+    /// Creates `num_hosts` random hosts with ids `0..num_hosts`.
+    pub fn generate_hosts(num_hosts: usize) -> Vec<Host> {
+        (0..num_hosts).map(Host::random_with_id).collect()
+    }
+}
+
+/// Creates the region metadata of a TSBS cpu-like table.
+///
+/// Column ids: `ts` is 0 (time index), the ten string tags are 1..=10 and form
+/// the primary key, and the ten `f64` fields are 11..=20.
+pub fn cpu_metadata() -> RegionMetadata {
+    const TAGS: [&str; 10] = [
+        "hostname",
+        "region",
+        "datacenter",
+        "rack",
+        "os",
+        "arch",
+        "team",
+        "service",
+        "service_version",
+        "service_environment",
+    ];
+    const FIELDS: [&str; 10] = [
+        "usage_user",
+        "usage_system",
+        "usage_idle",
+        "usage_nice",
+        "usage_iowait",
+        "usage_irq",
+        "usage_softirq",
+        "usage_steal",
+        "usage_guest",
+        "usage_guest_nice",
+    ];
+
+    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
+    builder.push_column_metadata(ColumnMetadata {
+        column_schema: ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        ),
+        semantic_type: SemanticType::Timestamp,
+        column_id: 0,
+    });
+
+    // Tags come first, then fields; ids continue from 1 in that order.
+    let tag_columns = TAGS
+        .iter()
+        .map(|name| (*name, ConcreteDataType::string_datatype(), SemanticType::Tag));
+    let field_columns = FIELDS
+        .iter()
+        .map(|name| (*name, ConcreteDataType::float64_datatype(), SemanticType::Field));
+    let columns = tag_columns.chain(field_columns);
+    for (column_id, (name, data_type, semantic_type)) in (1..).zip(columns) {
+        builder.push_column_metadata(ColumnMetadata {
+            column_schema: ColumnSchema::new(name, data_type, true),
+            semantic_type,
+            column_id,
+        });
+    }
+    builder.primary_key((1..=10).collect());
+    builder.build().unwrap()
+}
diff --git a/src/mito2/src/test_util/memtable_util.rs b/src/mito2/src/test_util/memtable_util.rs
index 7ddac4ee0d..25ab9bb8b4 100644
--- a/src/mito2/src/test_util/memtable_util.rs
+++ b/src/mito2/src/test_util/memtable_util.rs
@@ -30,8 +30,7 @@ use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortFi
 use store_api::metadata::{
     ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
 };
-use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange};
-use table::predicate::Predicate;
+use store_api::storage::{ColumnId, RegionId, SequenceNumber};
 
 use crate::error::Result;
 use crate::memtable::bulk::part::BulkPart;
@@ -83,16 +82,6 @@ impl Memtable for EmptyMemtable {
         Ok(())
     }
 
-    #[cfg(any(test, feature = "test"))]
-    fn iter(
-        &self,
-        _projection: Option<&[ColumnId]>,
-        _filters: Option<Predicate>,
-        _sequence: Option<SequenceRange>,
-    ) -> Result<BoxedBatchIterator> {
-        Ok(Box::new(std::iter::empty()))
-    }
-
     fn ranges(
         &self,
         _projection: Option<&[ColumnId]>,
diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs
index 389d9bf107..e9515030c0 100644
--- a/src/mito2/src/test_util/sst_util.rs
+++ b/src/mito2/src/test_util/sst_util.rs
@@ -18,7 +18,11 @@ use std::sync::Arc;
 
 use api::v1::{OpType, SemanticType};
 use common_time::Timestamp;
-use datatypes::arrow::array::{BinaryArray, TimestampMillisecondArray, UInt8Array, UInt64Array};
+use datatypes::arrow::array::{
+    ArrayRef, BinaryDictionaryBuilder, RecordBatch, StringDictionaryBuilder,
+    TimestampMillisecondArray, UInt8Array, UInt64Array,
+};
+use datatypes::arrow::datatypes::UInt32Type;
 use datatypes::prelude::ConcreteDataType;
 use datatypes::schema::{ColumnSchema, SkippingIndexOptions};
 use datatypes::value::ValueRef;
@@ -32,8 +36,9 @@ use store_api::metric_engine_consts::{
 use store_api::storage::consts::ReservedColumnId;
 use store_api::storage::{FileId, RegionId};
 
-use crate::read::{Batch, BatchBuilder, Source};
+use crate::read::{Batch, FlatSource, Source};
 use crate::sst::file::{FileHandle, FileMeta};
+use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
 use crate::test_util::{VecBatchReader, new_batch_builder, new_noop_file_purger};
 
 /// Test region id.
@@ -246,34 +251,68 @@ pub fn new_batch_by_range(tags: &[&str], start: usize, end: usize) -> Batch {
     new_batch_with_custom_sequence(tags, start, end, 1000)
 }
 
-pub fn new_batch_with_binary(tags: &[&str], start: usize, end: usize) -> Batch {
+/// Creates a flat format RecordBatch for rows `start..end` with sequence 1000.
+/// Similar to `new_batch_by_range` but returns a RecordBatch in flat format.
+pub fn new_record_batch_by_range(tags: &[&str], start: usize, end: usize) -> RecordBatch {
+    new_record_batch_with_custom_sequence(tags, start, end, 1000)
+}
+
+/// Creates a flat format RecordBatch for rows `start..end` where every row carries `sequence`.
+pub fn new_record_batch_with_custom_sequence(
+    tags: &[&str],
+    start: usize,
+    end: usize,
+    sequence: u64,
+) -> RecordBatch {
     assert!(end >= start);
+    let metadata = Arc::new(sst_region_metadata());
+    let flat_schema = to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default());
+
+    let num_rows = end - start;
+    let mut columns = Vec::new();
+
+    // Add primary key columns (tag_0, tag_1) as dictionary arrays; `tags` needs two entries
+    let mut tag_0_builder = StringDictionaryBuilder::<UInt32Type>::new();
+    let mut tag_1_builder = StringDictionaryBuilder::<UInt32Type>::new();
+
+    for _ in 0..num_rows {
+        tag_0_builder.append_value(tags[0]);
+        tag_1_builder.append_value(tags[1]);
+    }
+
+    columns.push(Arc::new(tag_0_builder.finish()) as ArrayRef);
+    columns.push(Arc::new(tag_1_builder.finish()) as ArrayRef);
+
+    // Add field column (field_0): the row indices `start..end` as u64
+    let field_values: Vec<u64> = (start..end).map(|v| v as u64).collect();
+    columns.push(Arc::new(UInt64Array::from(field_values)));
+
+    // Add time index column (ts): the row indices `start..end` as millisecond timestamps
+    let timestamps: Vec<i64> = (start..end).map(|v| v as i64).collect();
+    columns.push(Arc::new(TimestampMillisecondArray::from(timestamps)));
+
+    // Add encoded primary key column, repeated for every row
     let pk = new_primary_key(tags);
-    let timestamps: Vec<_> = (start..end).map(|v| v as i64).collect();
-    let sequences = vec![1000; end - start];
-    let op_types = vec![OpType::Put; end - start];
+    let mut pk_builder = BinaryDictionaryBuilder::<UInt32Type>::new();
+    for _ in 0..num_rows {
+        pk_builder.append(&pk).unwrap();
+    }
+    columns.push(Arc::new(pk_builder.finish()));
 
-    let field: Vec<_> = (start..end)
-        .map(|_v| "some data".as_bytes().to_vec())
-        .collect();
+    // Add sequence column filled with `sequence`
+    columns.push(Arc::new(UInt64Array::from_value(sequence, num_rows)));
 
-    let mut builder = BatchBuilder::new(pk);
-    builder
-        .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
-            timestamps.iter().copied(),
-        )))
-        .unwrap()
-        .sequences_array(Arc::new(UInt64Array::from_iter_values(
-            sequences.iter().copied(),
-        )))
-        .unwrap()
-        .op_types_array(Arc::new(UInt8Array::from_iter_values(
-            op_types.iter().map(|v| *v as u8),
-        )))
-        .unwrap()
-        .push_field_array(1, Arc::new(BinaryArray::from_iter_values(field)))
-        .unwrap();
-    builder.build().unwrap()
+    // Add op_type column filled with `OpType::Put`
+    columns.push(Arc::new(UInt8Array::from_value(
+        OpType::Put as u8,
+        num_rows,
+    )));
+    RecordBatch::try_new(flat_schema, columns).unwrap()
+}
+
+/// Creates a FlatSource that yields the given flat format RecordBatches in order, each as `Ok`.
+pub fn new_flat_source_from_record_batches(batches: Vec<RecordBatch>) -> FlatSource {
+    FlatSource::Iter(Box::new(batches.into_iter().map(Ok)))
 }
 
 /// Creates a new region metadata for testing SSTs with binary datatype.
diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs
index 71896b3d5d..fd5ad82f3f 100644
--- a/src/mito2/src/worker.rs
+++ b/src/mito2/src/worker.rs
@@ -207,6 +207,7 @@ impl WorkerGroup {
                 .vector_cache_size(config.vector_cache_size.as_bytes())
                 .page_cache_size(config.page_cache_size.as_bytes())
                 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                 .index_metadata_size(config.index.metadata_cache_size.as_bytes())
                 .index_content_size(config.index.content_cache_size.as_bytes())
                 .index_content_page_size(config.index.content_cache_page_size.as_bytes())
@@ -421,6 +422,7 @@ impl WorkerGroup {
                 .vector_cache_size(config.vector_cache_size.as_bytes())
                 .page_cache_size(config.page_cache_size.as_bytes())
                 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                 .write_cache(write_cache)
                 .build(),
         );
diff --git a/src/partition/src/cache.rs b/src/partition/src/cache.rs
index a886e1e08d..4066b69aa3 100644
--- a/src/partition/src/cache.rs
+++ b/src/partition/src/cache.rs
@@ -121,10 +121,12 @@ pub fn new_partition_info_cache(
     CacheContainer::new(
         name,
         cache,
-        Box::new(|cache, ident| {
+        Box::new(|cache, idents| {
             Box::pin(async move {
-                if let CacheIdent::TableId(table_id) = ident {
-                    cache.invalidate(table_id).await
+                for ident in idents {
+                    if let CacheIdent::TableId(table_id) = ident {
+                        cache.invalidate(table_id).await
+                    }
                 }
                 Ok(())
             })
diff --git a/src/pipeline/src/manager/pipeline_operator.rs b/src/pipeline/src/manager/pipeline_operator.rs
index 77ef8ade23..6c4256db69 100644
--- a/src/pipeline/src/manager/pipeline_operator.rs
+++ b/src/pipeline/src/manager/pipeline_operator.rs
@@ -20,6 +20,7 @@ use api::v1::CreateTableExpr;
 use catalog::{CatalogManagerRef, RegisterSystemTableRequest};
 use common_catalog::consts::{DEFAULT_PRIVATE_SCHEMA_NAME, default_engine};
 use common_telemetry::info;
+use common_time::FOREVER;
 use datatypes::timestamp::TimestampNanosecond;
 use futures::FutureExt;
 use operator::insert::InserterRef;
@@ -28,6 +29,7 @@ use query::QueryEngineRef;
 use session::context::QueryContextRef;
 use snafu::{OptionExt, ResultExt};
 use table::TableRef;
+use table::requests::TTL_KEY;
 
 use crate::Pipeline;
 use crate::error::{CatalogSnafu, CreateTableSnafu, PipelineTableNotFoundSnafu, Result};
@@ -59,6 +61,9 @@ impl PipelineOperator {
     fn create_table_request(&self, catalog: &str) -> RegisterSystemTableRequest {
         let (time_index, primary_keys, column_defs) = PipelineTable::build_pipeline_schema();
 
+        let mut table_options = HashMap::new();
+        table_options.insert(TTL_KEY.to_string(), FOREVER.to_string());
+
         let create_table_expr = CreateTableExpr {
             catalog_name: catalog.to_string(),
             schema_name: DEFAULT_PRIVATE_SCHEMA_NAME.to_string(),
@@ -68,7 +73,7 @@ impl PipelineOperator {
             time_index,
             primary_keys,
             create_if_not_exists: true,
-            table_options: Default::default(),
+            table_options,
             table_id: None, // Should and will be assigned by Meta.
             engine: default_engine().to_string(),
         };
diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs
index dc84c4afac..e2e577debf 100644
--- a/src/query/src/datafusion.rs
+++ b/src/query/src/datafusion.rs
@@ -354,25 +354,6 @@ impl DatafusionQueryEngine {
         Ok(physical_plan)
     }
 
-    #[tracing::instrument(skip_all)]
-    pub fn optimize(
-        &self,
-        context: &QueryEngineContext,
-        plan: &LogicalPlan,
-    ) -> Result<LogicalPlan> {
-        let _timer = metrics::OPTIMIZE_LOGICAL_ELAPSED.start_timer();
-
-        // Optimized by extension rules
-        let optimized_plan = self
-            .state
-            .optimize_by_extension_rules(plan.clone(), context)?;
-
-        // Optimized by datafusion optimizer
-        let optimized_plan = self.state.session_state().optimize(&optimized_plan)?;
-
-        Ok(optimized_plan)
-    }
-
     #[tracing::instrument(skip_all)]
     fn optimize_physical_plan(
         &self,
@@ -444,32 +425,17 @@ impl QueryEngine for DatafusionQueryEngine {
     async fn describe(
         &self,
         plan: LogicalPlan,
-        query_ctx: QueryContextRef,
+        _query_ctx: QueryContextRef, // unused: the schema is read from `plan` as is
     ) -> Result<DescribeResult> {
-        let ctx = self.engine_context(query_ctx);
-        if let Ok(optimised_plan) = self.optimize(&ctx, &plan) {
-            let schema = optimised_plan
-                .schema()
-                .clone()
-                .try_into()
-                .context(ConvertSchemaSnafu)?;
-            Ok(DescribeResult {
-                schema,
-                logical_plan: optimised_plan,
-            })
-        } else {
-            // Table's like those in information_schema cannot be optimized when
-            // it contains parameters. So we fallback to original plans.
-            let schema = plan
-                .schema()
-                .clone()
-                .try_into()
-                .context(ConvertSchemaSnafu)?;
-            Ok(DescribeResult {
-                schema,
-                logical_plan: plan,
-            })
-        }
+        let schema = plan
+            .schema()
+            .clone()
+            .try_into()
+            .context(ConvertSchemaSnafu)?;
+        Ok(DescribeResult {
+            schema,
+            logical_plan: plan, // the input plan is handed back unchanged
+        })
     }
 
     async fn execute(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result<Output> {
@@ -924,7 +890,7 @@ mod tests {
             )
         );
         assert_eq!(
-            "Limit: skip=0, fetch=20\n  Aggregate: groupBy=[[]], aggr=[[sum(CAST(numbers.number AS UInt64))]]\n    TableScan: numbers projection=[number]",
+            "Limit: skip=0, fetch=20\n  Projection: sum(numbers.number)\n    Aggregate: groupBy=[[]], aggr=[[sum(numbers.number)]]\n      TableScan: numbers",
             format!("{}", logical_plan.display_indent())
         );
     }
diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs
index 4259b587ba..aaac1e3124 100644
--- a/src/query/src/optimizer.rs
+++ b/src/query/src/optimizer.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod constant_term;
+pub mod count_nest_aggr;
 pub mod count_wildcard;
 pub mod parallelize_scan;
 pub mod pass_distribution;
diff --git a/src/query/src/optimizer/count_nest_aggr.rs b/src/query/src/optimizer/count_nest_aggr.rs
new file mode 100644
index 0000000000..89ba426074
--- /dev/null
+++ b/src/query/src/optimizer/count_nest_aggr.rs
@@ -0,0 +1,385 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use datafusion::config::ConfigOptions;
+use datafusion::functions_aggregate::count::count_udaf;
+use datafusion::logical_expr::{Extension, LogicalPlan, LogicalPlanBuilder, Sort};
+use datafusion_common::Result;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_expr::{Expr, UserDefinedLogicalNodeCore, lit};
+use promql::extension_plan::{InstantManipulate, SeriesDivide, SeriesNormalize};
+use store_api::metric_engine_consts::DATA_SCHEMA_TSID_COLUMN_NAME;
+
+use crate::QueryEngineContext;
+use crate::optimizer::ExtensionAnalyzerRule;
+
+/// Rewrites `count(<presence-preserving-agg>(<vector_selector>) by (...))` into a presence-based
+/// group count.
+///
+/// This stays intentionally narrow:
+/// - the outer aggregate must be plain `count`
+/// - the inner aggregate must be a plain aggregate whose result existence is equivalent to input
+///   group existence
+/// - the inner input must be the direct instant-vector-selector plan
+/// - the outer count must only group by the evaluation timestamp
+#[derive(Debug)]
+pub struct CountNestAggrRule;
+
+impl ExtensionAnalyzerRule for CountNestAggrRule {
+    /// Walks the plan top-down and applies `rewrite_plan` to every node.
+    fn analyze(
+        &self,
+        plan: LogicalPlan,
+        _ctx: &QueryEngineContext,
+        _config: &ConfigOptions,
+    ) -> Result<LogicalPlan> {
+        let transformed = plan.transform_down(&Self::rewrite_plan)?;
+        Ok(transformed.data)
+    }
+}
+
+impl CountNestAggrRule {
+    /// Per-node callback of the top-down traversal: only `Sort` nodes are rewrite
+    /// candidates, every other node is handed back unchanged.
+    fn rewrite_plan(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
+        match plan {
+            LogicalPlan::Sort(sort) => match Self::try_rewrite_sort(&sort)? {
+                Some(rewritten) => Ok(Transformed::yes(rewritten)),
+                None => Ok(Transformed::no(LogicalPlan::Sort(sort))),
+            },
+            other => Ok(Transformed::no(other)),
+        }
+    }
+
+    /// Rewrites `sort` when it is the root of a supported nested count.
+    ///
+    /// The expected shape, from top to bottom, is `Sort`, `Aggregate` (plain
+    /// `count`), `Sort`, `Aggregate` (supported inner aggregate), zero or more
+    /// `Projection`s and an `InstantManipulate` extension node. `Ok(None)` is
+    /// returned as soon as the plan deviates from that shape.
+    ///
+    /// On a match the result is built from the instant input pruned to the required
+    /// columns: an `IS NOT NULL` filter on the value column (skipped when the inner
+    /// aggregate is `count`), a distinct projection onto the inner group keys, then
+    /// `count(1)` grouped by the outer group expression and sorted like `sort`.
+    fn try_rewrite_sort(sort: &Sort) -> Result<Option<LogicalPlan>> {
+        if sort.fetch.is_some() {
+            return Ok(None);
+        }
+
+        // Outer aggregate: exactly one group expression and one plain `count`.
+        let LogicalPlan::Aggregate(outer_agg) = sort.input.as_ref() else {
+            return Ok(None);
+        };
+        if outer_agg.group_expr.len() != 1 || outer_agg.aggr_expr.len() != 1 {
+            return Ok(None);
+        }
+        let outer_time_expr = outer_agg.group_expr[0].clone();
+        let Some((_, outer_count_arg)) =
+            Self::aggregate_if(&outer_agg.aggr_expr[0], |name| name == "count")
+        else {
+            return Ok(None);
+        };
+
+        let LogicalPlan::Sort(inner_sort) = outer_agg.input.as_ref() else {
+            return Ok(None);
+        };
+        if inner_sort.fetch.is_some() {
+            return Ok(None);
+        }
+
+        // Inner aggregate: one supported aggregate over a bare column, with at least
+        // one group expression.
+        let LogicalPlan::Aggregate(inner_agg) = inner_sort.input.as_ref() else {
+            return Ok(None);
+        };
+        if inner_agg.aggr_expr.len() != 1 || inner_agg.group_expr.is_empty() {
+            return Ok(None);
+        }
+        let Some((inner_name, inner_value_expr)) = Self::aggregate_if(
+            &inner_agg.aggr_expr[0],
+            Self::is_supported_inner_aggregate,
+        ) else {
+            return Ok(None);
+        };
+        let inner_is_count = inner_name == "count";
+        if !matches!(inner_value_expr, Expr::Column(_)) {
+            return Ok(None);
+        }
+
+        // The outer `count` must consume the inner aggregate's output column, which
+        // is the schema field right after the group columns.
+        let Expr::Column(outer_count_column) = outer_count_arg else {
+            return Ok(None);
+        };
+        let inner_output_field = inner_agg.schema.field(inner_agg.group_expr.len());
+        if outer_count_column.name != *inner_output_field.name() {
+            return Ok(None);
+        }
+
+        if !Self::is_projection_chain_to_instant(inner_agg.input.as_ref()) {
+            return Ok(None);
+        }
+
+        let groups_by_columns_only = inner_agg
+            .group_expr
+            .iter()
+            .all(|expr| matches!(expr, Expr::Column(_)));
+        if !groups_by_columns_only {
+            return Ok(None);
+        }
+
+        // The outer group expression must be one of the inner group keys.
+        let Some(time_expr_pos) = inner_agg
+            .group_expr
+            .iter()
+            .position(|expr| expr == &outer_time_expr)
+        else {
+            return Ok(None);
+        };
+
+        // Presence keys: the outer group expression first, then the remaining inner
+        // group keys in their original order.
+        let mut presence_group_exprs = Vec::with_capacity(inner_agg.group_expr.len());
+        presence_group_exprs.push(outer_time_expr.clone());
+        presence_group_exprs.extend(
+            inner_agg
+                .group_expr
+                .iter()
+                .enumerate()
+                .filter(|(idx, _)| *idx != time_expr_pos)
+                .map(|(_, expr)| expr.clone()),
+        );
+
+        let mut required_input_columns =
+            Self::collect_required_input_columns(&presence_group_exprs, inner_value_expr);
+        required_input_columns.extend(Self::collect_required_instant_columns(
+            inner_agg.input.as_ref(),
+        ));
+        let presence_source = Self::rebuild_projection_chain_to_instant(
+            inner_agg.input.as_ref(),
+            &required_input_columns,
+        )?;
+
+        // The rewritten count is aliased to the outer aggregate's original output name.
+        let outer_value_name = outer_agg
+            .schema
+            .field(outer_agg.group_expr.len())
+            .name()
+            .clone();
+        let mut presence_input = LogicalPlanBuilder::from(presence_source);
+        if !inner_is_count {
+            presence_input = presence_input.filter(inner_value_expr.clone().is_not_null())?;
+        }
+        let presence_input = presence_input
+            .project(presence_group_exprs.clone())?
+            .distinct()?
+            .build()?;
+
+        let rewritten = LogicalPlanBuilder::from(presence_input)
+            .aggregate(
+                outer_agg.group_expr.clone(),
+                vec![count_udaf().call(vec![lit(1_i64)]).alias(outer_value_name)],
+            )?
+            .sort(sort.expr.clone())?
+            .build()?;
+
+        Ok(Some(rewritten))
+    }
+
+    /// Collects the names of all bare-column expressions among `group_exprs` and
+    /// `value_expr`; other expressions contribute nothing.
+    fn collect_required_input_columns(group_exprs: &[Expr], value_expr: &Expr) -> HashSet<String> {
+        // The value column is kept in the pruned instant input so `InstantManipulate`
+        // can still perform stale-NaN filtering before we project down to keys.
+        group_exprs
+            .iter()
+            .chain(std::iter::once(value_expr))
+            .filter_map(|expr| match expr {
+                Expr::Column(column) => Some(column.name.clone()),
+                _ => None,
+            })
+            .collect()
+    }
+
+    /// Returns the names of the bare columns referenced by the expressions of the
+    /// extension nodes reachable from `plan`.
+    fn collect_required_instant_columns(plan: &LogicalPlan) -> HashSet<String> {
+        let mut required = HashSet::new();
+        Self::collect_required_instant_columns_into(plan, &mut required);
+        required
+    }
+
+    /// Recursive worker of `collect_required_instant_columns`.
+    ///
+    /// Descends through projections and through the first input of every extension
+    /// node, recording the bare columns among each extension node's expressions.
+    fn collect_required_instant_columns_into(plan: &LogicalPlan, required: &mut HashSet<String>) {
+        match plan {
+            LogicalPlan::Projection(projection) => {
+                Self::collect_required_instant_columns_into(projection.input.as_ref(), required);
+            }
+            LogicalPlan::Extension(extension) => {
+                let node = &extension.node;
+                for expr in node.expressions() {
+                    if let Expr::Column(column) = expr {
+                        required.insert(column.name);
+                    }
+                }
+
+                // Keep the tsid column for `SeriesDivide` whenever its input has one.
+                let inputs = node.inputs();
+                if node.as_any().is::<SeriesDivide>()
+                    && inputs[0]
+                        .schema()
+                        .fields()
+                        .iter()
+                        .any(|field| field.name() == DATA_SCHEMA_TSID_COLUMN_NAME)
+                {
+                    required.insert(DATA_SCHEMA_TSID_COLUMN_NAME.to_string());
+                }
+
+                if let Some(input) = inputs.into_iter().next() {
+                    Self::collect_required_instant_columns_into(input, required);
+                }
+            }
+            _ => {}
+        }
+    }
+
+    /// Returns the aggregate's name and single argument when `expr` is a plain
+    /// aggregate call (no filter, no `DISTINCT`, no ordering, exactly one argument)
+    /// whose name satisfies `accept_name`.
+    fn aggregate_if<F>(expr: &Expr, accept_name: F) -> Option<(&str, &Expr)>
+    where
+        F: FnOnce(&str) -> bool,
+    {
+        let Expr::AggregateFunction(func) = expr else {
+            return None;
+        };
+        let name = func.func.name();
+        let params = &func.params;
+        let is_plain = params.filter.is_none()
+            && !params.distinct
+            && params.order_by.is_empty()
+            && params.args.len() == 1;
+        if !(accept_name(name) && is_plain) {
+            return None;
+        }
+
+        Some((name, &params.args[0]))
+    }
+
+    /// Inner aggregates eligible for the rewrite, matched by function name.
+    fn is_supported_inner_aggregate(name: &str) -> bool {
+        matches!(
+            name,
+            "count" | "sum" | "avg" | "min" | "max" | "stddev_pop" | "var_pop"
+        )
+    }
+
+    /// Returns `true` when `plan` is zero or more projections on top of an
+    /// `InstantManipulate` extension node.
+    fn is_projection_chain_to_instant(plan: &LogicalPlan) -> bool {
+        let mut current = plan;
+        while let LogicalPlan::Projection(projection) = current {
+            current = projection.input.as_ref();
+        }
+        match current {
+            LogicalPlan::Extension(ext) => ext.node.as_any().is::<InstantManipulate>(),
+            _ => false,
+        }
+    }
+
+    /// Rebuilds the projection chain above `InstantManipulate` on top of an instant
+    /// input pruned to `required_columns`.
+    ///
+    /// Projections keep their original expressions; any other node is cloned as is.
+    fn rebuild_projection_chain_to_instant(
+        plan: &LogicalPlan,
+        required_columns: &HashSet<String>,
+    ) -> Result<LogicalPlan> {
+        match plan {
+            LogicalPlan::Projection(projection) => {
+                let input = Self::rebuild_projection_chain_to_instant(
+                    projection.input.as_ref(),
+                    required_columns,
+                )?;
+                LogicalPlanBuilder::from(input)
+                    .project(projection.expr.clone())?
+                    .build()
+            }
+            LogicalPlan::Extension(extension) => {
+                let Some(instant) = extension.node.as_any().downcast_ref::<InstantManipulate>()
+                else {
+                    return Ok(plan.clone());
+                };
+                let input =
+                    Self::prune_instant_input(extension.node.inputs()[0], required_columns)?;
+                Ok(LogicalPlan::Extension(Extension {
+                    node: Arc::new(instant.with_exprs_and_inputs(vec![], vec![input])?),
+                }))
+            }
+            _ => Ok(plan.clone()),
+        }
+    }
+
+    /// Prunes the plan below `InstantManipulate`.
+    ///
+    /// `SeriesNormalize` is rebuilt over its pruned input; the input of
+    /// `SeriesDivide` is wrapped in a projection that keeps only the fields named in
+    /// `required_columns`. Any other plan is cloned unchanged.
+    fn prune_instant_input(
+        plan: &LogicalPlan,
+        required_columns: &HashSet<String>,
+    ) -> Result<LogicalPlan> {
+        let LogicalPlan::Extension(extension) = plan else {
+            return Ok(plan.clone());
+        };
+        let node = extension.node.as_any();
+
+        if let Some(normalize) = node.downcast_ref::<SeriesNormalize>() {
+            let input = Self::prune_instant_input(extension.node.inputs()[0], required_columns)?;
+            return Ok(LogicalPlan::Extension(Extension {
+                node: Arc::new(normalize.with_exprs_and_inputs(vec![], vec![input])?),
+            }));
+        }
+
+        if let Some(divide) = node.downcast_ref::<SeriesDivide>() {
+            let divide_input = extension.node.inputs()[0].clone();
+
+            let projection_exprs = divide_input
+                .schema()
+                .fields()
+                .iter()
+                .filter(|field| required_columns.contains(field.name()))
+                .map(|field| {
+                    Expr::Column(datafusion_common::Column::from_name(field.name().clone()))
+                })
+                .collect::<Vec<_>>();
+            let projected_input = LogicalPlanBuilder::from(divide_input)
+                .project(projection_exprs)?
+                .build()?;
+
+            return Ok(LogicalPlan::Extension(Extension {
+                node: Arc::new(divide.with_exprs_and_inputs(vec![], vec![projected_input])?),
+            }));
+        }
+
+        Ok(plan.clone())
+    }
+}
diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs
index 44c9bc3956..6b206b9d8d 100644
--- a/src/query/src/planner.rs
+++ b/src/query/src/planner.rs
@@ -28,6 +28,7 @@ use datafusion::execution::context::SessionState;
 use datafusion::sql::planner::PlannerContext;
 use datafusion_common::ToDFSchema;
 use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_expr::expr::{Exists, InSubquery};
 use datafusion_expr::{
     Analyze, Explain, ExplainFormat, Expr as DfExpr, LogicalPlan, LogicalPlanBuilder, PlanType,
     ToStringifiedPlan, col,
@@ -277,17 +278,22 @@ impl DfLogicalPlanner {
         let table_provider = DfTableSourceProvider::new(
             self.engine_state.catalog_manager().clone(),
             self.engine_state.disallow_cross_catalog_query(),
-            query_ctx,
+            query_ctx.clone(),
             plan_decoder,
             self.session_state
                 .config_options()
                 .sql_parser
                 .enable_ident_normalization,
         );
-        PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state)
+        let plan = PromPlanner::stmt_to_plan(table_provider, stmt, &self.engine_state)
             .await
             .map_err(BoxedError::new)
-            .context(QueryPlanSnafu)
+            .context(QueryPlanSnafu)?;
+
+        let context = QueryEngineContext::new(self.session_state.clone(), query_ctx);
+        Ok(self
+            .engine_state
+            .optimize_by_extension_rules(plan, &context)?)
     }
 
     #[tracing::instrument(skip_all)]
@@ -424,9 +430,20 @@ impl DfLogicalPlanner {
         let mut placeholder_types = HashMap::new();
         let mut casted_placeholders = HashSet::new();
 
+        Self::extract_from_plan(plan, &mut placeholder_types, &mut casted_placeholders)?;
+
+        Ok(placeholder_types)
+    }
+
+    fn extract_from_plan(
+        plan: &LogicalPlan,
+        placeholder_types: &mut HashMap<String, Option<DataType>>,
+        casted_placeholders: &mut HashSet<String>,
+    ) -> Result<()> {
         plan.apply(|node| {
             for expr in node.expressions() {
                 let _ = expr.apply(|e| {
+                    // Handle casted placeholders
                     if let DfExpr::Cast(cast) = e
                         && let DfExpr::Placeholder(ph) = &*cast.expr
                     {
@@ -434,6 +451,7 @@ impl DfLogicalPlanner {
                         casted_placeholders.insert(ph.id.clone());
                     }
 
+                    // Handle bare (non-casted) placeholders
                     if let DfExpr::Placeholder(ph) = e
                         && !casted_placeholders.contains(&ph.id)
                         && !placeholder_types.contains_key(&ph.id)
@@ -441,13 +459,26 @@ impl DfLogicalPlanner {
                         placeholder_types.insert(ph.id.clone(), None);
                     }
 
+                    // Recurse into subquery plans embedded in expressions
+                    match e {
+                        DfExpr::Exists(Exists { subquery, .. })
+                        | DfExpr::InSubquery(InSubquery { subquery, .. })
+                        | DfExpr::ScalarSubquery(subquery) => {
+                            Self::extract_from_plan(
+                                &subquery.subquery,
+                                placeholder_types,
+                                casted_placeholders,
+                            )?;
+                        }
+                        _ => {}
+                    }
+
                     Ok(TreeNodeRecursion::Continue)
                 });
             }
             Ok(TreeNodeRecursion::Continue)
         })?;
-
-        Ok(placeholder_types)
+        Ok(())
     }
 
     /// Gets inferred parameter types from a logical plan.
@@ -545,15 +576,22 @@ mod tests {
     use std::sync::Arc;
 
     use arrow_schema::DataType;
+    use catalog::RegisterTableRequest;
+    use catalog::memory::MemoryCatalogManager;
+    use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
     use datatypes::prelude::ConcreteDataType;
     use datatypes::schema::{ColumnSchema, Schema};
     use session::context::QueryContext;
+    use store_api::metric_engine_consts::{
+        DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME, LOGICAL_TABLE_METADATA_KEY,
+        METRIC_ENGINE_NAME,
+    };
     use table::metadata::{TableInfoBuilder, TableMetaBuilder};
     use table::test_util::EmptyTable;
 
     use super::*;
-    use crate::QueryEngineRef;
-    use crate::parser::QueryLanguageParser;
+    use crate::parser::{PromQuery, QueryLanguageParser};
+    use crate::{QueryEngineFactory, QueryEngineRef};
 
     async fn create_test_engine() -> QueryEngineRef {
         let columns = vec![
@@ -574,6 +612,109 @@ mod tests {
         crate::tests::new_query_engine_with_table(table)
     }
 
+    fn create_promql_test_engine() -> QueryEngineRef {
+        // Builds an engine over an in-memory catalog holding a metric-engine physical
+        // table (`phy`) and a logical table (`some_metric`) whose options name that
+        // physical table.
+        const PHYSICAL_TABLE: &str = "phy";
+        const PHYSICAL_TABLE_ID: u32 = 999;
+        const LOGICAL_TABLE: &str = "some_metric";
+        const LOGICAL_TABLE_ID: u32 = 1024;
+
+        // Column builders shared by both schemas.
+        let tag = |name: &str| ColumnSchema::new(name, ConcreteDataType::string_datatype(), false);
+        let time_index = || {
+            let data_type = ConcreteDataType::timestamp_millisecond_datatype();
+            ColumnSchema::new("timestamp", data_type, false).with_time_index(true)
+        };
+        let value = || ColumnSchema::new("field_0", ConcreteDataType::float64_datatype(), true);
+
+        let catalog = MemoryCatalogManager::with_default_setup();
+
+        // Physical table: the table-id and tsid columns precede the tags.
+        let physical_columns = vec![
+            ColumnSchema::new(
+                DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(),
+                ConcreteDataType::uint32_datatype(),
+                false,
+            ),
+            ColumnSchema::new(
+                DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
+                ConcreteDataType::uint64_datatype(),
+                false,
+            ),
+            tag("tag_0"),
+            tag("tag_1"),
+            time_index(),
+            value(),
+        ];
+        let physical_meta = TableMetaBuilder::empty()
+            .schema(Arc::new(Schema::new(physical_columns)))
+            .primary_key_indices(vec![0, 1, 2, 3])
+            .value_indices(vec![4, 5])
+            .engine(METRIC_ENGINE_NAME.to_string())
+            .next_column_id(1024)
+            .build()
+            .unwrap();
+        let physical_info = TableInfoBuilder::default()
+            .table_id(PHYSICAL_TABLE_ID)
+            .name(PHYSICAL_TABLE)
+            .meta(physical_meta)
+            .build()
+            .unwrap();
+        let physical_request = RegisterTableRequest {
+            catalog: DEFAULT_CATALOG_NAME.to_string(),
+            schema: DEFAULT_SCHEMA_NAME.to_string(),
+            table_name: PHYSICAL_TABLE.to_string(),
+            table_id: PHYSICAL_TABLE_ID,
+            table: EmptyTable::from_table_info(&physical_info),
+        };
+        catalog.register_table_sync(physical_request).unwrap();
+
+        // Logical table: its `LOGICAL_TABLE_METADATA_KEY` option names the physical table.
+        let mut logical_options = table::requests::TableOptions::default();
+        logical_options.extra_options.insert(
+            LOGICAL_TABLE_METADATA_KEY.to_string(),
+            PHYSICAL_TABLE.to_string(),
+        );
+        let logical_columns = vec![tag("tag_0"), tag("tag_1"), time_index(), value()];
+        let logical_meta = TableMetaBuilder::empty()
+            .schema(Arc::new(Schema::new(logical_columns)))
+            .primary_key_indices(vec![0, 1])
+            .value_indices(vec![3])
+            .engine(METRIC_ENGINE_NAME.to_string())
+            .options(logical_options)
+            .next_column_id(1024)
+            .build()
+            .unwrap();
+        let logical_info = TableInfoBuilder::default()
+            .table_id(LOGICAL_TABLE_ID)
+            .name(LOGICAL_TABLE)
+            .meta(logical_meta)
+            .build()
+            .unwrap();
+        let logical_request = RegisterTableRequest {
+            catalog: DEFAULT_CATALOG_NAME.to_string(),
+            schema: DEFAULT_SCHEMA_NAME.to_string(),
+            table_name: LOGICAL_TABLE.to_string(),
+            table_id: LOGICAL_TABLE_ID,
+            table: EmptyTable::from_table_info(&logical_info),
+        };
+        catalog.register_table_sync(logical_request).unwrap();
+
+        // Default query options; the four optional components are left unset.
+        let factory = QueryEngineFactory::new(
+            catalog,
+            None,
+            None,
+            None,
+            None,
+            false,
+            crate::options::QueryOptions::default(),
+        );
+        factory.query_engine()
+    }
+
     async fn parse_sql_to_plan(sql: &str) -> LogicalPlan {
         let stmt = QueryLanguageParser::parse_sql(sql, &QueryContext::arc()).unwrap();
         let engine = create_test_engine().await;
@@ -584,6 +725,25 @@ mod tests {
             .unwrap()
     }
 
+    async fn parse_promql_to_plan(query: &str) -> LogicalPlan {
+        // Fixed query parameters: start "0", end "10", step "5s", lookback "300s", no alias.
+        let prom_query = PromQuery {
+            query: query.to_string(),
+            start: "0".to_string(),
+            end: "10".to_string(),
+            step: "5s".to_string(),
+            lookback: "300s".to_string(),
+            alias: None,
+        };
+        let ctx = QueryContext::arc();
+        let statement = QueryLanguageParser::parse_promql(&prom_query, &ctx).unwrap();
+
+        // Plan the statement with the PromQL test engine's planner.
+        let test_engine = create_promql_test_engine();
+        let planner = test_engine.planner();
+        planner.plan(&statement, ctx).await.unwrap()
+    }
+
     #[tokio::test]
     async fn test_extract_placeholder_cast_types_multiple() {
         let plan = parse_sql_to_plan(
@@ -619,4 +779,82 @@ mod tests {
         assert_eq!(type_2, &Some(DataType::Utf8));
         assert_eq!(type_3, &Some(DataType::Int32));
     }
+
+    #[tokio::test]
+    async fn test_plan_pql_applies_extension_rules() {
+        for inner_agg in ["count", "sum", "avg", "min", "max", "stddev", "stdvar"] {
+            let query = format!(
+                "sum(irate(some_metric[1h])) / scalar(count({inner_agg}(some_metric) by (tag_0)))"
+            );
+            let plan = parse_promql_to_plan(&query).await;
+            let plan_str = plan.display_indent_schema().to_string();
+            assert!(plan_str.contains("Distinct:"), "{inner_agg}: {plan_str}");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_plan_pql_filters_null_only_groups_for_non_count_inner_aggs() {
+        const NULL_FILTER: &str = "field_0 IS NOT NULL";
+        // `count` as the inner aggregation: the plan must not contain the null filter.
+        let count_query = "scalar(count(count(some_metric) by (tag_0)))";
+        let count_plan_str = parse_promql_to_plan(count_query)
+            .await
+            .display_indent_schema()
+            .to_string();
+        assert!(!count_plan_str.contains(NULL_FILTER), "{count_plan_str}");
+
+        // Every other listed inner aggregation: the plan must contain the null filter.
+        for inner_agg in ["sum", "avg", "min", "max", "stddev", "stdvar"] {
+            let query = format!("scalar(count({inner_agg}(some_metric) by (tag_0)))");
+            let plan_str = parse_promql_to_plan(&query)
+                .await
+                .display_indent_schema()
+                .to_string();
+            assert!(plan_str.contains(NULL_FILTER), "{inner_agg}: {plan_str}");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_plan_pql_skips_extension_rules_for_non_direct_or_unsupported_inner_agg() {
+        let non_direct = "sum(irate(some_metric[1h])) by (tag_0)";
+        let unsupported = "group(some_metric) by (tag_0)";
+        for inner in [non_direct, unsupported] {
+            let query = format!("sum(irate(some_metric[1h])) / scalar(count({inner}))");
+            let plan = parse_promql_to_plan(&query).await;
+            let plan_str = plan.display_indent_schema().to_string();
+            assert!(!plan_str.contains("Distinct:"), "{query}: {plan_str}");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_plan_sql_does_not_apply_nested_count_rule() {
+        // A nested `count` written in SQL instead of PromQL.
+        let sql = "SELECT id, count(inner_count) \
+                   FROM ( \
+                       SELECT id, count(name) AS inner_count \
+                       FROM test \
+                       GROUP BY id \
+                       ORDER BY id \
+                       LIMIT 1000000 \
+                   ) t \
+                   GROUP BY id \
+                   ORDER BY id";
+
+        // The rendered SQL plan must not contain a `Distinct:` node.
+        let plan = parse_sql_to_plan(sql).await;
+        let plan_str = plan.display_indent_schema().to_string();
+        assert!(!plan_str.contains("Distinct:"), "{plan_str}");
+    }
+
+    #[tokio::test]
+    async fn test_get_inferred_parameter_types_subquery() {
+        // `$1` is cast to TEXT inside a derived table nested in a scalar subquery.
+        let sql = r#"SELECT * FROM test WHERE id = (SELECT id FROM test CROSS JOIN (SELECT parse_ident($1::TEXT) AS parts) p LIMIT 1)"#;
+        let plan = parse_sql_to_plan(sql).await;
+        let inferred = DfLogicalPlanner::get_inferred_parameter_types(&plan).unwrap();
+
+        // Exactly one placeholder is reported, typed as Utf8.
+        assert_eq!(inferred.len(), 1);
+        assert_eq!(inferred.get("$1").unwrap(), &Some(DataType::Utf8));
+    }
 }
diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs
index 427644e26a..23d654d2b6 100644
--- a/src/query/src/promql/planner.rs
+++ b/src/query/src/promql/planner.rs
@@ -3323,28 +3323,55 @@ impl PromPlanner {
     fn prom_token_to_binary_expr_builder(
         token: TokenType,
     ) -> Result<Box<dyn Fn(DfExpr, DfExpr) -> Result<DfExpr>>> {
+        let cast_float = |expr| {
+            if matches!(
+                &expr,
+                DfExpr::Cast(Cast {
+                    data_type: ArrowDataType::Float64,
+                    ..
+                })
+            ) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _))
+            {
+                expr
+            } else {
+                DfExpr::Cast(Cast {
+                    expr: Box::new(expr),
+                    data_type: ArrowDataType::Float64,
+                })
+            }
+        };
         match token.id() {
-            token::T_ADD => Ok(Box::new(|lhs, rhs| Ok(lhs + rhs))),
-            token::T_SUB => Ok(Box::new(|lhs, rhs| Ok(lhs - rhs))),
-            token::T_MUL => Ok(Box::new(|lhs, rhs| Ok(lhs * rhs))),
-            token::T_DIV => Ok(Box::new(|lhs, rhs| Ok(lhs / rhs))),
-            token::T_MOD => Ok(Box::new(|lhs: DfExpr, rhs| Ok(lhs % rhs))),
+            token::T_ADD => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) + cast_float(rhs))
+            })),
+            token::T_SUB => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) - cast_float(rhs))
+            })),
+            token::T_MUL => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) * cast_float(rhs))
+            })),
+            token::T_DIV => Ok(Box::new(move |lhs, rhs| {
+                Ok(cast_float(lhs) / cast_float(rhs))
+            })),
+            token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| {
+                Ok(cast_float(lhs) % cast_float(rhs))
+            })),
             token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))),
             token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))),
             token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))),
             token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))),
             token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))),
             token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))),
-            token::T_POW => Ok(Box::new(|lhs, rhs| {
+            token::T_POW => Ok(Box::new(move |lhs, rhs| {
                 Ok(DfExpr::ScalarFunction(ScalarFunction {
                     func: datafusion_functions::math::power(),
-                    args: vec![lhs, rhs],
+                    args: vec![cast_float(lhs), cast_float(rhs)],
                 }))
             })),
-            token::T_ATAN2 => Ok(Box::new(|lhs, rhs| {
+            token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| {
                 Ok(DfExpr::ScalarFunction(ScalarFunction {
                     func: datafusion_functions::math::atan2(),
-                    args: vec![lhs, rhs],
+                    args: vec![cast_float(lhs), cast_float(rhs)],
                 }))
             })),
             _ => UnexpectedTokenSnafu { token }.fail(),
@@ -4029,6 +4056,7 @@ mod test {
     use table::test_util::EmptyTable;
 
     use super::*;
+    use crate::QueryEngineContext;
     use crate::options::QueryOptions;
     use crate::parser::QueryLanguageParser;
 
@@ -4046,6 +4074,64 @@ mod test {
         )
     }
 
+    async fn build_optimized_promql_plan(
+        table_provider: DfTableSourceProvider,
+        eval_stmt: &EvalStmt,
+    ) -> LogicalPlan {
+        // Plan the statement, then apply the engine state's extension rules to the raw plan.
+        let qe_state = build_query_engine_state();
+        let unoptimized = PromPlanner::stmt_to_plan(table_provider, eval_stmt, &qe_state)
+            .await
+            .unwrap();
+        let engine_ctx = QueryEngineContext::new(qe_state.session_state(), QueryContext::arc());
+        let optimized = qe_state.optimize_by_extension_rules(unoptimized, &engine_ctx);
+        optimized.unwrap()
+    }
+
+    async fn build_optimized_tsid_plan(
+        query: &str,
+        num_tag: usize,
+        num_field: usize,
+        end_secs: u64,
+        lookback_secs: u64,
+    ) -> String {
+        // Evaluate `query` from the Unix epoch to `end_secs` seconds later, stepping every 5s.
+        let expr = parser::parse(query).unwrap();
+        let range_end = UNIX_EPOCH
+            .checked_add(Duration::from_secs(end_secs))
+            .unwrap();
+        let eval_stmt = EvalStmt {
+            expr,
+            start: UNIX_EPOCH,
+            end: range_end,
+            interval: Duration::from_secs(5),
+            lookback_delta: Duration::from_secs(lookback_secs),
+        };
+
+        // The only table offered to the planner is `some_metric` in the default schema.
+        let tables = [(DEFAULT_SCHEMA_NAME.to_string(), "some_metric".to_string())];
+        let provider = build_test_table_provider_with_tsid(&tables, num_tag, num_field).await;
+
+        // Render the optimized plan, schemas included, as indented text.
+        let optimized = build_optimized_promql_plan(provider, &eval_stmt).await;
+        optimized.display_indent_schema().to_string()
+    }
+
+    async fn assert_nested_count_rewrite_applies(query: &str, expected_outer_agg: &str) {
+        let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
+        let series_key = "PromSeriesDivide: tags=[\"__tsid\"]";
+        let projection = "Projection: some_metric.timestamp, some_metric.tag_0";
+        for needle in [series_key, projection, "Distinct:", expected_outer_agg] {
+            assert!(plan_str.contains(needle), "missing `{needle}`: {plan_str}");
+        }
+        assert!(!plan_str.contains("PromSeriesDivide: tags=[\"tag_0\"]"), "{plan_str}");
+    }
+
+    async fn assert_nested_count_rewrite_missing(query: &str, num_tag: usize, lookback_secs: u64) {
+        let plan_str = build_optimized_tsid_plan(query, num_tag, 1, 100_000, lookback_secs).await;
+        assert!(!plan_str.contains("Distinct:"), "{plan_str}");
+    }
+
     async fn build_test_table_provider(
         table_name_tuples: &[(String, String)],
         num_tag: usize,
@@ -4658,6 +4744,117 @@ mod test {
         );
     }
 
+    #[tokio::test]
+    async fn scalar_count_count_range_keeps_full_window() {
+        let query = "scalar(count(count(some_metric) by (tag_0)))";
+        let plan_str = build_optimized_tsid_plan(query, 1, 1, 100_000, 1).await;
+
+        // The instant manipulation must cover the whole 0..100000000 range rather than
+        // only the single instant 99999000..99999000. Every assertion prints the plan
+        // so that a failure can be diagnosed from the test output alone.
+        let full_window = "PromInstantManipulate: range=[0..100000000]";
+        let last_instant_only = "PromInstantManipulate: range=[99999000..99999000]";
+        assert!(plan_str.contains("ScalarCalculate: tags=[]"), "{plan_str}");
+        assert!(plan_str.contains(full_window), "{plan_str}");
+        assert!(!plan_str.contains(last_instant_only), "{plan_str}");
+    }
+
+    #[tokio::test]
+    async fn scalar_count_count_rewrite_applies_inside_binary_expr_for_tsid_input() {
+        // The nested count is the divisor of a binary expression; the optimized plan
+        // must still contain a `Distinct:` node.
+        let divisor = "scalar(count(count(some_metric) by (tag_0)))";
+        let query = format!("sum(irate(some_metric[1h])) / {divisor}");
+        let (num_tag, num_field, end_secs, lookback_secs) = (2, 1, 10, 300);
+        let plan_str =
+            build_optimized_tsid_plan(&query, num_tag, num_field, end_secs, lookback_secs).await;
+
+        assert!(plan_str.contains("Distinct:"), "{plan_str}");
+    }
+
+    #[tokio::test]
+    async fn nested_count_rewrite_keeps_full_series_key_with_tsid_input() {
+        // Inner `count`: the outer aggregate is expected to be a row count grouped by
+        // timestamp, still aliased as the original nested expression.
+        let query = "count(count(some_metric) by (tag_0))";
+        let expected_outer_agg = "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(count(some_metric.field_0))]]";
+        assert_nested_count_rewrite_applies(query, expected_outer_agg).await;
+    }
+
+    #[tokio::test]
+    async fn nested_sum_count_rewrite_keeps_full_series_key_with_tsid_input() {
+        // Inner `sum`: the outer aggregate is expected to be a row count grouped by
+        // timestamp, still aliased as the original nested expression.
+        let query = "count(sum(some_metric) by (tag_0))";
+        let expected_outer_agg = "Aggregate: groupBy=[[some_metric.timestamp]], aggr=[[count(Int64(1)) AS count(sum(some_metric.field_0))]]";
+        assert_nested_count_rewrite_applies(query, expected_outer_agg).await;
+    }
+
+    #[tokio::test]
+    async fn nested_supported_inner_aggs_rewrite_apply_for_tsid_input() {
+        // Each pair is (PromQL inner aggregation, function name printed in the plan).
+        // `stddev` and `stdvar` are rendered as `stddev_pop` and `var_pop`; the other
+        // three keep their PromQL name.
+        let inner_aggs = [
+            ("avg", "avg"),
+            ("min", "min"),
+            ("max", "max"),
+            ("stddev", "stddev_pop"),
+            ("stdvar", "var_pop"),
+        ];
+
+        for (promql_agg, plan_agg) in inner_aggs {
+            let query = format!("count({promql_agg}(some_metric) by (tag_0))");
+
+            // Expected outer aggregate: a plain row count grouped by timestamp only,
+            // aliased with the name of the original nested expression.
+            let expected_outer_agg = format!(
+                "Aggregate: groupBy=[[some_metric.timestamp]], \
+                 aggr=[[count(Int64(1)) AS count({plan_agg}(some_metric.field_0))]]"
+            );
+
+            // The shared helper also checks the tsid series key, projection and `Distinct:`.
+            assert_nested_count_rewrite_applies(&query, &expected_outer_agg).await;
+        }
+    }
+
+    #[tokio::test]
+    async fn nested_non_count_inner_aggs_rewrite_filter_null_values_for_tsid_input() {
+        // `count` as the inner aggregation: the plan is expected to carry no
+        // `IS NOT NULL` predicate on the value column.
+        let count_query = "count(count(some_metric) by (tag_0))";
+        let count_plan = build_optimized_tsid_plan(count_query, 2, 1, 100_000, 1).await;
+        let count_has_null_check = count_plan.contains("some_metric.field_0 IS NOT NULL");
+        assert!(!count_has_null_check, "{count_plan}");
+
+        // Every other listed inner aggregation: a null filter on the value column is expected.
+        let non_count_queries = [
+            "count(sum(some_metric) by (tag_0))",
+            "count(avg(some_metric) by (tag_0))",
+            "count(min(some_metric) by (tag_0))",
+            "count(max(some_metric) by (tag_0))",
+            "count(stddev(some_metric) by (tag_0))",
+            "count(stdvar(some_metric) by (tag_0))",
+        ];
+
+        for query in non_count_queries {
+            let plan_str = build_optimized_tsid_plan(query, 2, 1, 100_000, 1).await;
+            let has_null_filter = plan_str.contains("Filter: some_metric.field_0 IS NOT NULL");
+            assert!(has_null_filter, "{query}: {plan_str}");
+        }
+    }
+
+    #[tokio::test]
+    async fn nested_unsupported_or_non_direct_inner_aggs_do_not_rewrite() {
+        // (query, lookback secs): a `group` inner aggregation, then one applied to `irate(...)`.
+        for (query, lookback_secs) in [
+            ("count(group(some_metric) by (tag_0))", 1),
+            ("count(sum(irate(some_metric[1h])) by (tag_0))", 300),
+        ] {
+            assert_nested_count_rewrite_missing(query, 2, lookback_secs).await;
+        }
+    }
+
     #[tokio::test]
     async fn physical_table_name_is_not_leaked_in_plan() {
         let prom_expr = parser::parse("some_metric").unwrap();
@@ -5169,7 +5366,7 @@ mod test {
                 .unwrap();
 
         let expected = String::from(
-            "Projection: rhs.tag_0, rhs.timestamp, lhs.field_0 + rhs.field_0 AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
+            "Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
             \n  Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5224,7 +5421,7 @@ mod test {
     async fn binary_op_literal_column() {
         let query = r#"1 + some_metric{tag_0="bar"}"#;
         let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + some_metric.field_0 AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
             \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5262,7 +5459,7 @@ mod test {
     async fn bool_with_additional_arithmetic() {
         let query = "some_metric + (1 == bool 2)";
         let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, some_metric.field_0 + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
             \n  PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5372,7 +5569,7 @@ mod test {
             PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
                 .await
                 .unwrap();
-        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
+        let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
             \n  Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\
             \n    SubqueryAlias: http_server_requests_seconds_sum\
             \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
@@ -5763,7 +5960,7 @@ mod test {
 
         let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric";
         let expected = String::from(
-            "Projection: some_metric.tag_0, some_metric.timestamp, greptime_private.some_alt_metric.field_0 / some_metric.field_0 AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
+            "Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
             \n  Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n    SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
             \n      PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs
index a45fc4c896..f696c8b53e 100644
--- a/src/query/src/query_engine/state.rs
+++ b/src/query/src/query_engine/state.rs
@@ -60,6 +60,7 @@ use crate::dist_plan::{
 use crate::metrics::{QUERY_MEMORY_POOL_REJECTED_TOTAL, QUERY_MEMORY_POOL_USAGE_BYTES};
 use crate::optimizer::ExtensionAnalyzerRule;
 use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
+use crate::optimizer::count_nest_aggr::CountNestAggrRule;
 use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
 use crate::optimizer::parallelize_scan::ParallelizeScan;
 use crate::optimizer::pass_distribution::PassDistribution;
@@ -146,6 +147,7 @@ impl QueryEngineState {
 
         // The [`TypeConversionRule`] must be at first
         extension_rules.insert(0, Arc::new(TypeConversionRule) as _);
+        extension_rules.push(Arc::new(CountNestAggrRule) as _);
 
         // Apply the datafusion rules
         let mut analyzer = Analyzer::new();
diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml
index e75192c9ba..8b64a256e7 100644
--- a/src/servers/Cargo.toml
+++ b/src/servers/Cargo.toml
@@ -89,7 +89,7 @@ operator.workspace = true
 otel-arrow-rust.workspace = true
 parking_lot.workspace = true
 pg_interval = { version = "0.5.2", package = "pg_interval_2" }
-pgwire = { version = "0.38", default-features = false, features = [
+pgwire = { version = "0.38.1", default-features = false, features = [
     "server-api-ring",
     "pg-ext-types",
 ] }
diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION
index 03ee1a5314..87a1cf595a 100644
--- a/src/servers/dashboard/VERSION
+++ b/src/servers/dashboard/VERSION
@@ -1 +1 @@
-v0.11.13
+v0.12.0
diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs
index ca6a77a077..506a240cac 100644
--- a/src/servers/src/http.rs
+++ b/src/servers/src/http.rs
@@ -78,7 +78,7 @@ use crate::metrics_handler::MetricsHandler;
 use crate::prometheus_handler::PrometheusHandlerRef;
 use crate::query_handler::sql::ServerSqlQueryHandlerRef;
 use crate::query_handler::{
-    InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef,
+    DashboardHandlerRef, InfluxdbLineProtocolHandlerRef, JaegerQueryHandlerRef, LogQueryHandlerRef,
     OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef, PipelineHandlerRef,
     PromStoreProtocolHandlerRef,
 };
@@ -112,8 +112,8 @@ pub mod utils;
 use result::HttpOutputWriter;
 pub(crate) use timeout::DynamicTimeoutLayer;
 
+mod client_ip;
 use crate::prom_remote_write::validation::PromValidationMode;
-
 mod hints;
 mod read_preference;
 #[cfg(any(test, feature = "testing"))]
@@ -507,6 +507,11 @@ pub struct GreptimeOptionsConfigState {
     pub greptime_config_options: String,
 }
 
+#[derive(Clone)]
+pub struct DashboardState {
+    pub handler: DashboardHandlerRef,
+}
+
 pub struct HttpServerBuilder {
     options: HttpOptions,
     plugins: Plugins,
@@ -703,6 +708,16 @@ impl HttpServerBuilder {
         }
     }
 
+    /// Nests the dashboard management routes under `/{HTTP_API_VERSION}/dashboards`.
+    pub fn with_dashboard_handler(self, handler: DashboardHandlerRef) -> Self {
+        let prefix = format!("/{HTTP_API_VERSION}/dashboards");
+        let dashboard_routes = HttpServer::route_dashboard(handler);
+        Self {
+            router: self.router.nest(&prefix, dashboard_routes),
+            ..self
+        }
+    }
+
     pub fn with_extra_router(self, router: Router) -> Self {
         Self {
             router: self.router.merge(router),
@@ -868,6 +883,7 @@ impl HttpServer {
                         authorize::check_http_auth,
                     ))
                     .layer(middleware::from_fn(hints::extract_hints))
+                    .layer(middleware::from_fn(client_ip::log_error_with_client_ip))
                     .layer(middleware::from_fn(
                         read_preference::extract_read_preference,
                     )),
@@ -1169,6 +1185,26 @@ impl HttpServer {
             )
             .with_state(handler)
     }
+
+    /// Builds the dashboard router: `GET /` lists dashboards and
+    /// `POST`/`DELETE /{dashboard_name}` save or delete a single one.
+    #[cfg(feature = "dashboard")]
+    fn route_dashboard<S>(handler: DashboardHandlerRef) -> Router<S> {
+        use crate::http::dashboard::{add_dashboard, delete_dashboard, list_dashboards};
+
+        let decompression = RequestDecompressionLayer::new().pass_through_unaccepted(true);
+        Router::new()
+            .route("/", routing::get(list_dashboards))
+            .route("/{dashboard_name}", routing::post(add_dashboard).delete(delete_dashboard))
+            .layer(ServiceBuilder::new().layer(decompression))
+            .with_state(DashboardState { handler })
+    }
+
+    /// Without the `dashboard` feature no dashboard routes are registered.
+    #[cfg(not(feature = "dashboard"))]
+    fn route_dashboard<S>(handler: DashboardHandlerRef) -> Router<S> {
+        Router::new().with_state(DashboardState { handler })
+    }
 }
 
 pub const HTTP_SERVER: &str = "HTTP_SERVER";
@@ -1212,7 +1248,10 @@ impl Server for HttpServer {
                         error!(e; "Failed to set TCP_NODELAY on incoming connection");
                     }
                 });
-            let serve = axum::serve(listener, app.into_make_service());
+            let serve = axum::serve(
+                listener,
+                app.into_make_service_with_connect_info::<SocketAddr>(),
+            );
 
             // FIXME(yingwen): Support keepalive.
             // See:
diff --git a/src/servers/src/http/client_ip.rs b/src/servers/src/http/client_ip.rs
new file mode 100644
index 0000000000..70df554ebb
--- /dev/null
+++ b/src/servers/src/http/client_ip.rs
@@ -0,0 +1,109 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::net::SocketAddr;
+
+use axum::body::Body;
+use axum::extract::{ConnectInfo, MatchedPath};
+use axum::http::Request;
+use axum::middleware::Next;
+use axum::response::Response;
+use common_telemetry::warn;
+
+/// Middleware that logs 4xx/5xx HTTP responses together with the client socket address.
+///
+/// The address is read from [`ConnectInfo`]; when it is absent nothing is logged.
+pub async fn log_error_with_client_ip(req: Request<Body>, next: Next) -> Response {
+    let request_info = req
+        .extensions()
+        .get::<ConnectInfo<SocketAddr>>()
+        .map(|c| c.0)
+        .map(|addr| {
+            let method = req.method().clone();
+            let uri = req.uri().clone();
+            let matched_path = req.extensions().get::<MatchedPath>().cloned();
+            (addr, method, uri, matched_path)
+        });
+    // `req` is moved into `next.run`, which is why the request details are cloned above.
+    let response = next.run(req).await;
+
+    if (response.status().is_client_error() || response.status().is_server_error())
+        && let Some((addr, method, uri, matched_path)) = request_info
+    {
+        warn!(
+            "HTTP error response {} for {} {} (matched: {}) from client {}",
+            response.status(),
+            method,
+            uri,
+            matched_path
+                .as_ref()
+                .map(|p| p.as_str())
+                .unwrap_or("<unknown>"),
+            addr
+        );
+    }
+    // The response itself is passed through unchanged.
+    response
+}
+
+#[cfg(test)]
+mod tests {
+    use axum::Router;
+    use axum::routing::get;
+    use http::StatusCode;
+    use tower::ServiceExt;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_middleware_passes_error_response() {
+        async fn not_found_handler() -> StatusCode {
+            StatusCode::NOT_FOUND
+        }
+
+        let app = Router::new()
+            .route("/not-found", get(not_found_handler))
+            .layer(axum::middleware::from_fn(log_error_with_client_ip));
+
+        let response = app
+            .oneshot(
+                Request::builder()
+                    .uri("/not-found")
+                    .body(Body::empty())
+                    .unwrap(),
+            )
+            .await
+            .unwrap();
+
+        assert_eq!(response.status(), StatusCode::NOT_FOUND);
+    }
+
+    #[tokio::test]
+    async fn test_middleware_passes_success_response() {
+        async fn ok_handler() -> StatusCode {
+            StatusCode::OK
+        }
+
+        let app = Router::new()
+            .route("/ok", get(ok_handler))
+            .layer(axum::middleware::from_fn(log_error_with_client_ip));
+
+        let response = app
+            .oneshot(Request::builder().uri("/ok").body(Body::empty()).unwrap())
+            .await
+            .unwrap();
+
+        assert_eq!(response.status(), StatusCode::OK);
+    }
+}
diff --git a/src/servers/src/http/dashboard.rs b/src/servers/src/http/dashboard.rs
index bdb98490f0..ea894ca7d0 100644
--- a/src/servers/src/http/dashboard.rs
+++ b/src/servers/src/http/dashboard.rs
@@ -12,14 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use axum::body::Body;
+use std::sync::Arc;
+use std::time::Instant;
+
+use axum::body::{Body, Bytes};
+use axum::extract::{Extension, Path, State};
 use axum::http::{StatusCode, Uri, header};
 use axum::response::Response;
-use common_telemetry::debug;
+use common_telemetry::{debug, error};
 use rust_embed::RustEmbed;
-use snafu::ResultExt;
+use session::context::{Channel, QueryContext};
+use snafu::{ResultExt, ensure};
 
-use crate::error::{BuildHttpResponseSnafu, Result};
+use crate::error::{BuildHttpResponseSnafu, InvalidParameterSnafu, Result};
+use crate::http::DashboardState;
+use crate::http::result::greptime_manage_resp::{DashboardOutput, GreptimedbManageResponse};
 
 #[derive(RustEmbed)]
 #[folder = "dashboard/dist/"]
@@ -61,3 +68,102 @@ fn get_assets(path: &str) -> Result<Response> {
     }
     .context(BuildHttpResponseSnafu)
 }
+
+/// Saves the request body as the definition of the dashboard named in the path.
+///
+/// Returns an invalid-parameter error for an empty name or a non-UTF-8 body.
+#[axum_macros::debug_handler]
+pub async fn add_dashboard(
+    State(state): State<DashboardState>,
+    Path(dashboard_name): Path<String>,
+    Extension(mut query_ctx): Extension<QueryContext>,
+    payload: Bytes,
+) -> Result<GreptimedbManageResponse> {
+    let start = Instant::now();
+    ensure!(
+        !dashboard_name.is_empty(),
+        InvalidParameterSnafu {
+            reason: "dashboard_name is required in path",
+        }
+    );
+
+    // A lossy conversion would silently store U+FFFD in place of invalid bytes.
+    let Ok(definition) = std::str::from_utf8(&payload) else {
+        let reason = "dashboard definition must be valid UTF-8";
+        return InvalidParameterSnafu { reason }.fail();
+    };
+
+    // Mark the context as originating from the HTTP SQL channel before handing it over.
+    query_ctx.set_channel(Channel::HttpSql);
+    let query_ctx = Arc::new(query_ctx);
+    if let Err(e) = state.handler.save(&dashboard_name, definition, query_ctx).await {
+        error!(e; "failed to save dashboard");
+        return Err(e);
+    }
+
+    let elapsed_ms = start.elapsed().as_millis() as u64;
+    Ok(GreptimedbManageResponse::from_dashboard(dashboard_name, elapsed_ms))
+}
+
+/// Lists every dashboard returned by the handler, including its definition.
+#[axum_macros::debug_handler]
+pub async fn list_dashboards(
+    State(state): State<DashboardState>,
+    Extension(mut query_ctx): Extension<QueryContext>,
+) -> Result<GreptimedbManageResponse> {
+    let start = Instant::now();
+    query_ctx.set_channel(Channel::HttpSql);
+
+    let listed = match state.handler.list(Arc::new(query_ctx)).await {
+        Ok(listed) => listed,
+        Err(e) => {
+            error!(e; "failed to list dashboards");
+            return Err(e);
+        }
+    };
+
+    // Convert handler-level definitions into the HTTP response representation.
+    let mut outputs = Vec::with_capacity(listed.len());
+    for dashboard in listed {
+        outputs.push(DashboardOutput {
+            name: dashboard.name,
+            definition: dashboard.definition,
+        });
+    }
+
+    let elapsed_ms = start.elapsed().as_millis() as u64;
+    Ok(GreptimedbManageResponse::from_dashboards(outputs, elapsed_ms))
+}
+
+/// Deletes the dashboard named in the path.
+///
+/// On success the response carries the deleted dashboard's name.
+#[axum_macros::debug_handler]
+pub async fn delete_dashboard(
+    State(state): State<DashboardState>,
+    Extension(mut query_ctx): Extension<QueryContext>,
+    Path(dashboard_name): Path<String>,
+) -> Result<GreptimedbManageResponse> {
+    let start = Instant::now();
+    ensure!(
+        !dashboard_name.is_empty(),
+        InvalidParameterSnafu {
+            reason: "dashboard_name is required",
+        }
+    );
+
+    query_ctx.set_channel(Channel::HttpSql);
+    let query_ctx = Arc::new(query_ctx);
+
+    let deleted = state.handler.delete(&dashboard_name, query_ctx).await;
+    match deleted {
+        Ok(_) => {
+            let elapsed_ms = start.elapsed().as_millis() as u64;
+            Ok(GreptimedbManageResponse::from_dashboard(dashboard_name, elapsed_ms))
+        }
+        Err(e) => {
+            error!(e; "failed to delete dashboard");
+            Err(e)
+        }
+    }
+}
diff --git a/src/servers/src/http/result/error_result.rs b/src/servers/src/http/result/error_result.rs
index 7b70066b68..9bd6e1a7a3 100644
--- a/src/servers/src/http/result/error_result.rs
+++ b/src/servers/src/http/result/error_result.rs
@@ -32,17 +32,24 @@ pub struct ErrorResponse {
 impl ErrorResponse {
     pub fn from_error(error: impl ErrorExt) -> Self {
         let code = error.status_code();
-
         if code.should_log_error() {
             error!(error; "Failed to handle HTTP request");
         } else {
             debug!("Failed to handle HTTP request, err: {:?}", error);
         }
-
-        Self::from_error_message(code, error.output_msg())
+        ErrorResponse {
+            code: code as u32,
+            error: error.output_msg(),
+            execution_time_ms: 0,
+        }
     }
 
     pub fn from_error_message(code: StatusCode, msg: String) -> Self {
+        if code.should_log_error() {
+            error!("Failed to handle HTTP request: {}", msg);
+        } else {
+            debug!("Failed to handle HTTP request: {}", msg);
+        }
         ErrorResponse {
             code: code as u32,
             error: msg,
diff --git a/src/servers/src/http/result/greptime_manage_resp.rs b/src/servers/src/http/result/greptime_manage_resp.rs
index 3f7f3c6eec..2b3a5d455c 100644
--- a/src/servers/src/http/result/greptime_manage_resp.rs
+++ b/src/servers/src/http/result/greptime_manage_resp.rs
@@ -62,6 +62,25 @@ impl GreptimedbManageResponse {
         }
     }
 
+    /// Builds a response that names a single dashboard.
+    ///
+    /// The definition is left empty; `DashboardOutput` skips empty definitions when
+    /// serializing, so only the name appears in the output.
+    pub fn from_dashboard(name: String, execution_time_ms: u64) -> Self {
+        let dashboard = DashboardOutput {
+            name,
+            definition: String::new(),
+        };
+        Self::from_dashboards(vec![dashboard], execution_time_ms)
+    }
+
+    pub fn from_dashboards(dashboards: Vec<DashboardOutput>, execution_time_ms: u64) -> Self {
+        GreptimedbManageResponse {
+            manage_result: ManageResult::Dashboards { dashboards },
+            execution_time_ms,
+        }
+    }
+
     pub fn with_execution_time(mut self, execution_time: u64) -> Self {
         self.execution_time_ms = execution_time;
         self
@@ -77,6 +96,7 @@ impl GreptimedbManageResponse {
 pub enum ManageResult {
     Pipelines { pipelines: Vec<PipelineOutput> },
     Sql { sql: SqlOutput },
+    Dashboards { dashboards: Vec<DashboardOutput> },
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -87,6 +107,13 @@ pub struct PipelineOutput {
     pipeline: Option<String>,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct DashboardOutput {
+    pub name: String,
+    #[serde(skip_serializing_if = "String::is_empty")]
+    pub definition: String,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct SqlOutput {
     pub(crate) sql: String,
diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs
index a95890e78c..d4d15ef64a 100644
--- a/src/servers/src/postgres/types.rs
+++ b/src/servers/src/postgres/types.rs
@@ -235,7 +235,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
     match origin {
         &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
         &ConcreteDataType::Boolean(_) => Ok(Type::BOOL),
-        &ConcreteDataType::Int8(_) => Ok(Type::CHAR),
+        &ConcreteDataType::Int8(_) => Ok(Type::INT2),
         &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2),
         &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4),
         &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8),
@@ -253,7 +253,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
         ConcreteDataType::List(list) => match list.item_type() {
             &ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
             &ConcreteDataType::Boolean(_) => Ok(Type::BOOL_ARRAY),
-            &ConcreteDataType::Int8(_) => Ok(Type::CHAR_ARRAY),
+            &ConcreteDataType::Int8(_) => Ok(Type::INT2_ARRAY),
             &ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2_ARRAY),
             &ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4_ARRAY),
             &ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8_ARRAY),
@@ -1151,7 +1151,7 @@ mod test {
         let pg_field_info = vec![
             FieldInfo::new("nulls".into(), None, None, Type::UNKNOWN, FieldFormat::Text),
             FieldInfo::new("bools".into(), None, None, Type::BOOL, FieldFormat::Text),
-            FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
+            FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
             FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),
@@ -1230,7 +1230,7 @@ mod test {
                 Type::NUMERIC,
                 FieldFormat::Text,
             ),
-            FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
+            FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
             FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
             FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),
diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs
index 60efe69faa..21c7646560 100644
--- a/src/servers/src/query_handler.rs
+++ b/src/servers/src/query_handler.rs
@@ -44,6 +44,12 @@ use pipeline::{GreptimePipelineParams, Pipeline, PipelineInfo, PipelineVersion,
 use serde_json::Value;
 use session::context::{QueryContext, QueryContextRef};
 
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct DashboardDefinition {
+    pub name: String,
+    pub definition: String,
+}
+
 use crate::error::Result;
 use crate::http::jaeger::QueryTraceParams;
 use crate::influxdb::InfluxdbRequest;
@@ -176,6 +182,18 @@ pub trait PipelineHandler {
     ) -> Result<(String, TimestampNanosecond)>;
 }
 
+/// Handling dashboard as code CRUD
+pub type DashboardHandlerRef = Arc<dyn DashboardHandler + Send + Sync>;
+
+#[async_trait]
+pub trait DashboardHandler {
+    async fn save(&self, name: &str, definition: &str, ctx: QueryContextRef) -> Result<()>;
+
+    async fn list(&self, ctx: QueryContextRef) -> Result<Vec<DashboardDefinition>>;
+
+    async fn delete(&self, name: &str, ctx: QueryContextRef) -> Result<()>;
+}
+
 /// Handle log query requests.
 #[async_trait]
 pub trait LogQueryHandler {
diff --git a/src/servers/src/query_handler/grpc.rs b/src/servers/src/query_handler/grpc.rs
index 67d8b3890e..d66a76464e 100644
--- a/src/servers/src/query_handler/grpc.rs
+++ b/src/servers/src/query_handler/grpc.rs
@@ -17,15 +17,13 @@ use std::sync::Arc;
 
 use api::v1::greptime_request::Request;
 use async_trait::async_trait;
-use common_base::AffectedRows;
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
 use futures::Stream;
 use session::context::QueryContextRef;
-use table::TableRef;
 
 use crate::error::Result;
-use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream};
+use crate::grpc::flight::PutRecordBatchRequestStream;
 
 pub type ServerGrpcQueryHandlerRef = Arc<dyn GrpcQueryHandler + Send + Sync>;
 
@@ -35,13 +33,6 @@ pub type RawRecordBatch = bytes::Bytes;
 pub trait GrpcQueryHandler {
     async fn do_query(&self, query: Request, ctx: QueryContextRef) -> Result<Output>;
 
-    async fn put_record_batch(
-        &self,
-        request: PutRecordBatchRequest,
-        table_ref: &mut Option<TableRef>,
-        ctx: QueryContextRef,
-    ) -> Result<AffectedRows>;
-
     fn handle_put_record_batch_stream(
         &self,
         stream: PutRecordBatchRequestStream,
diff --git a/src/servers/tests/mod.rs b/src/servers/tests/mod.rs
index e3f8f8fc79..c4f83c5e6c 100644
--- a/src/servers/tests/mod.rs
+++ b/src/servers/tests/mod.rs
@@ -18,7 +18,6 @@ use api::v1::greptime_request::Request;
 use api::v1::query_request::Query;
 use async_trait::async_trait;
 use catalog::memory::MemoryCatalogManager;
-use common_base::AffectedRows;
 use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
 use common_grpc::flight::do_put::DoPutResponse;
 use common_query::Output;
@@ -149,15 +148,6 @@ impl GrpcQueryHandler for DummyInstance {
         Ok(output)
     }
 
-    async fn put_record_batch(
-        &self,
-        _request: servers::grpc::flight::PutRecordBatchRequest,
-        _table_ref: &mut Option<TableRef>,
-        _ctx: QueryContextRef,
-    ) -> Result<AffectedRows> {
-        unimplemented!()
-    }
-
     fn handle_put_record_batch_stream(
         &self,
         _stream: servers::grpc::flight::PutRecordBatchRequestStream,
diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs
index d571a5392f..0c663bccc0 100644
--- a/src/store-api/src/metadata.rs
+++ b/src/store-api/src/metadata.rs
@@ -18,8 +18,8 @@
 
 use std::any::Any;
 use std::collections::{HashMap, HashSet};
-use std::fmt;
 use std::sync::Arc;
+use std::{fmt, mem};
 
 use api::v1::SemanticType;
 use api::v1::column_def::try_as_column_schema;
@@ -99,6 +99,12 @@ impl ColumnMetadata {
     pub fn is_same_datatype(&self, other: &Self) -> bool {
         self.column_schema.data_type == other.column_schema.data_type
     }
+
+    /// Returns the estimated memory footprint of this metadata.
+    pub fn estimated_size(&self) -> usize {
+        mem::size_of_val(self) - mem::size_of_val(&self.column_schema)
+            + self.column_schema.estimated_size()
+    }
 }
 
 #[cfg_attr(doc, aquamarine::aquamarine)]
@@ -226,6 +232,25 @@ impl RegionMetadata {
         serde_json::from_str(s).context(SerdeJsonSnafu)
     }
 
+    /// Returns the estimated memory footprint of this metadata.
+    pub fn estimated_size(&self) -> usize {
+        mem::size_of_val(self)
+            + mem::size_of::<ColumnMetadata>() * self.column_metadatas.capacity()
+            + self
+                .column_metadatas
+                .iter()
+                .map(|column| column.estimated_size() - mem::size_of::<ColumnMetadata>())
+                .sum::<usize>()
+            + mem::size_of::<ColumnId>() * self.primary_key.capacity()
+            + mem::size_of::<(ColumnId, usize)>() * self.id_to_index.capacity()
+            + self.schema.estimated_size()
+            + self
+                .partition_expr
+                .as_ref()
+                .map(|expr| expr.capacity())
+                .unwrap_or_default()
+    }
+
     /// Encode the metadata to a JSON string.
     pub fn to_json(&self) -> Result<String> {
         serde_json::to_string(&self).context(SerdeJsonSnafu)
diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs
index f9be7be16e..2c9ac41560 100644
--- a/src/table/src/predicate.rs
+++ b/src/table/src/predicate.rs
@@ -203,7 +203,7 @@ pub fn build_time_range_predicate(
 
 /// Extract time range filter from `WHERE`/`IN (...)`/`BETWEEN` clauses.
 /// Return None if no time range can be found in expr.
-fn extract_time_range_from_expr(
+pub fn extract_time_range_from_expr(
     ts_col_name: &str,
     ts_col_unit: TimeUnit,
     expr: &Expr,
diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs
index 43fc36644b..15b4278f51 100644
--- a/src/table/src/requests.rs
+++ b/src/table/src/requests.rs
@@ -36,8 +36,9 @@ use store_api::metric_engine_consts::{
     LOGICAL_TABLE_METADATA_KEY, PHYSICAL_TABLE_METADATA_KEY, is_metric_engine_option_key,
 };
 use store_api::mito_engine_options::{
-    APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, TWCS_FALLBACK_TO_LOCAL,
-    TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, is_mito_engine_option_key,
+    APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, SST_FORMAT_KEY,
+    TWCS_FALLBACK_TO_LOCAL, TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM,
+    is_mito_engine_option_key,
 };
 use store_api::region_request::{SetRegionOption, UnsetRegionOption};
 
@@ -56,13 +57,14 @@ pub const TABLE_DATA_MODEL_TRACE_V1: &str = "greptime_trace_v1";
 pub const OTLP_METRIC_COMPAT_KEY: &str = "otlp_metric_compat";
 pub const OTLP_METRIC_COMPAT_PROM: &str = "prom";
 
-pub const VALID_TABLE_OPTION_KEYS: [&str; 12] = [
+pub const VALID_TABLE_OPTION_KEYS: [&str; 13] = [
     // common keys:
     WRITE_BUFFER_SIZE_KEY,
     TTL_KEY,
     STORAGE_KEY,
     COMMENT_KEY,
     SKIP_WAL_KEY,
+    SST_FORMAT_KEY,
     // file engine keys:
     FILE_TABLE_LOCATION_KEY,
     FILE_TABLE_FORMAT_KEY,
@@ -94,6 +96,7 @@ static VALID_DB_OPT_KEYS: Lazy<HashSet<&str>> = Lazy::new(|| {
     set.insert(TWCS_TIME_WINDOW);
     set.insert(TWCS_TRIGGER_FILE_NUM);
     set.insert(TWCS_MAX_OUTPUT_FILE_SIZE);
+    set.insert(SST_FORMAT_KEY);
     set
 });
 
diff --git a/tests-fuzz/Cargo.toml b/tests-fuzz/Cargo.toml
index a537ca0687..bc687092c0 100644
--- a/tests-fuzz/Cargo.toml
+++ b/tests-fuzz/Cargo.toml
@@ -100,6 +100,13 @@ test = false
 bench = false
 doc = false
 
+[[bin]]
+name = "fuzz_repartition_metric_table"
+path = "targets/ddl/fuzz_repartition_metric_table.rs"
+test = false
+bench = false
+doc = false
+
 [[bin]]
 name = "fuzz_alter_table"
 path = "targets/ddl/fuzz_alter_table.rs"
diff --git a/tests-fuzz/README.md b/tests-fuzz/README.md
index 6807e19a1c..cc9d7eb84e 100644
--- a/tests-fuzz/README.md
+++ b/tests-fuzz/README.md
@@ -66,3 +66,23 @@ GT_FUZZ_OVERRIDE_SEED=6666 GT_FUZZ_OVERRIDE_ACTIONS=175 cargo fuzz run fuzz_targ
 ```
 
 For more details, visit [cargo fuzz](https://rust-fuzz.github.io/book/cargo-fuzz/tutorial.html) or run the command `cargo fuzz --help`.
+
+## Repartition Metric Dump Artifacts
+
+For `fuzz_repartition_metric_table`, dump artifacts are written under one run directory.
+
+- Table data snapshots: `<logical_table>.table-data.csv`
+- SQL traces per logical table: `<logical_table>.trace.sql`
+- Seed metadata: `seed.meta`
+
+SQL trace behavior:
+
+- Insert SQL is appended after successful execution with comment fields including
+  `started_at_ms` and `elapsed_ms`.
+- Repartition events are broadcast to all logical table trace files with comment fields including
+  `action_idx`, `started_at_ms`, `elapsed_ms`, and SQL text.
+
+Run directory lifecycle:
+
+- On success, the run directory is cleaned up.
+- On failure, the run directory is retained for CI/local diffing.
diff --git a/tests-fuzz/src/fake.rs b/tests-fuzz/src/fake.rs
index aa92e0293a..8910a39206 100644
--- a/tests-fuzz/src/fake.rs
+++ b/tests-fuzz/src/fake.rs
@@ -65,6 +65,26 @@ where
     _v: PhantomData<V>,
 }
 
+/// A [`Random`] implementation that always produces clones of one fixed value.
+pub struct ConstGenerator<V> {
+    value: V,
+}
+
+impl<V> ConstGenerator<V> {
+    /// Creates a generator that yields `value` every time.
+    pub fn new(value: V) -> Self {
+        Self { value }
+    }
+}
+
+impl<R: Rng, V: Clone> Random<V, R> for ConstGenerator<V> {
+    /// Returns `amount` clones of the stored value; `_rng` is never used.
+    fn choose(&self, _rng: &mut R, amount: usize) -> Vec<V> {
+        let template = self.value.clone();
+        vec![template; amount]
+    }
+}
+
 pub fn random_capitalize_map<R: Rng + 'static>(rng: &mut R, s: Ident) -> Ident {
     let mut v = s.value.chars().collect::<Vec<_>>();
 
diff --git a/tests-fuzz/src/generator/create_expr.rs b/tests-fuzz/src/generator/create_expr.rs
index fae6a95eda..261a310db2 100644
--- a/tests-fuzz/src/generator/create_expr.rs
+++ b/tests-fuzz/src/generator/create_expr.rs
@@ -193,6 +193,26 @@ fn generate_partition_def(
     }
 }
 
+/// Builds the `host` string primary-key column and a [`PartitionDef`] over it from
+/// `partitions - 1` generated bounds; returns `None` when `partitions <= 1`.
+fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> {
+    if partitions <= 1 {
+        return None;
+    }
+    let host_column = Column {
+        name: Ident::new("host"),
+        column_type: ConcreteDataType::string_datatype(),
+        options: vec![ColumnOption::PrimaryKey],
+    };
+    let bounds = generate_partition_bounds(&host_column.column_type, partitions - 1);
+    let simple = SimplePartitions::new(host_column.name.clone(), bounds);
+    let partition_def = PartitionDef {
+        columns: vec![simple.column_name.clone()],
+        exprs: simple.generate().unwrap(),
+    };
+    Some((host_column, partition_def))
+}
+
 /// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type.
 #[derive(Builder)]
 #[builder(pattern = "owned")]
@@ -201,6 +221,8 @@ pub struct CreatePhysicalTableExprGenerator<R: Rng + 'static> {
     name_generator: Box<dyn Random<Ident, R>>,
     #[builder(default = "false")]
     if_not_exists: bool,
+    #[builder(default = "0")]
+    partition: usize,
     #[builder(default, setter(into))]
     with_clause: HashMap<String, String>,
 }
@@ -215,25 +237,35 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreatePhysicalTableExpr
             options.insert(key.clone(), Value::from(value.clone()));
         }
 
+        let mut columns = vec![
+            Column {
+                name: Ident::new("ts"),
+                column_type: ConcreteDataType::timestamp_millisecond_datatype(),
+                options: vec![ColumnOption::TimeIndex],
+            },
+            Column {
+                name: Ident::new("val"),
+                column_type: ConcreteDataType::float64_datatype(),
+                options: vec![],
+            },
+        ];
+
+        let mut partition = None;
+        let mut primary_keys = vec![];
+        if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) {
+            columns.push(partition_column);
+            partition = Some(partition_def);
+            primary_keys.push(columns.len() - 1);
+        }
+
         Ok(CreateTableExpr {
             table_name: self.name_generator.generate(rng),
-            columns: vec![
-                Column {
-                    name: Ident::new("ts"),
-                    column_type: ConcreteDataType::timestamp_millisecond_datatype(),
-                    options: vec![ColumnOption::TimeIndex],
-                },
-                Column {
-                    name: Ident::new("val"),
-                    column_type: ConcreteDataType::float64_datatype(),
-                    options: vec![],
-                },
-            ],
+            columns,
             if_not_exists: self.if_not_exists,
-            partition: None,
+            partition,
             engine: "metric".to_string(),
             options,
-            primary_keys: vec![],
+            primary_keys,
         })
     }
 }
@@ -245,6 +277,8 @@ pub struct CreateLogicalTableExprGenerator<R: Rng + 'static> {
     physical_table_ctx: TableContextRef,
     labels: usize,
     if_not_exists: bool,
+    #[builder(default = "true")]
+    include_partition_column: bool,
     #[builder(default = "Box::new(WordGenerator)")]
     name_generator: Box<dyn Random<Ident, R>>,
 }
@@ -253,11 +287,11 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateLogicalTableExprG
     type Error = Error;
 
     fn generate(&self, rng: &mut R) -> Result<CreateTableExpr> {
-        // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical table must have two columns.
+        // Currently we mock the usage of GreptimeDB as Prometheus' backend, the physical table must have ts and val.
         ensure!(
-            self.physical_table_ctx.columns.len() == 2,
+            self.physical_table_ctx.columns.len() >= 2,
             error::UnexpectedSnafu {
-                violated: "The physical table must have two columns"
+                violated: "The physical table must have at least two columns"
             }
         );
 
@@ -265,9 +299,16 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateLogicalTableExprG
         let logical_table_name = self
             .physical_table_ctx
             .generate_unique_table_name(rng, self.name_generator.as_ref());
+        let mut physical_columns = self.physical_table_ctx.columns.clone();
+        if !self.include_partition_column
+            && let Some(partition_def) = &self.physical_table_ctx.partition
+        {
+            physical_columns.retain(|column| !partition_def.columns.contains(&column.name));
+        }
+
         let mut logical_table = CreateTableExpr {
             table_name: logical_table_name,
-            columns: self.physical_table_ctx.columns.clone(),
+            columns: physical_columns,
             if_not_exists: self.if_not_exists,
             partition: None,
             engine: "metric".to_string(),
@@ -459,6 +500,58 @@ mod tests {
         }));
     }
 
+    #[test]
+    fn test_create_physical_table_expr_generator_with_partition() {
+        // Request three partition expressions from the physical-table generator.
+        let generator = CreatePhysicalTableExprGeneratorBuilder::default()
+            .partition(3)
+            .if_not_exists(false)
+            .build()
+            .unwrap();
+        let mut rng = rand::rng();
+        let expr = generator.generate(&mut rng).unwrap();
+
+        assert_eq!(expr.engine, "metric");
+        assert!(expr.partition.is_some());
+        assert_eq!(expr.partition.unwrap().exprs.len(), 3);
+    }
+
+    #[test]
+    fn test_create_logical_table_expr_generator_without_partition_column() {
+        let mut rng = rand::rng();
+
+        // Build a partitioned physical table and remember which columns it is
+        // partitioned on; the logical table below is derived from this
+        // physical table's context.
+        let physical_generator = CreatePhysicalTableExprGeneratorBuilder::default()
+            .partition(3)
+            .if_not_exists(false)
+            .build()
+            .unwrap();
+        let physical = physical_generator.generate(&mut rng).unwrap();
+        let excluded = physical.partition.as_ref().unwrap().columns.clone();
+        let ctx = Arc::new(TableContext::from(&physical));
+
+        // Derive a logical table with `include_partition_column(false)`, which
+        // should leave those columns out.
+        let logical_generator = CreateLogicalTableExprGeneratorBuilder::default()
+            .physical_table_ctx(ctx)
+            .labels(3)
+            .include_partition_column(false)
+            .if_not_exists(false)
+            .build()
+            .unwrap();
+        let logical = logical_generator.generate(&mut rng).unwrap();
+
+        // No logical column may carry the name of a partition column.
+        // `!any(contains)` is the De Morgan form of the usual `all(!contains)`.
+        let leaked = logical
+            .columns
+            .iter()
+            .any(|column| excluded.contains(&column.name));
+        assert!(!leaked);
+    }
+
     #[test]
     fn test_create_logical_table_expr_generator_deterministic() {
         let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(0);
diff --git a/tests-fuzz/src/ir.rs b/tests-fuzz/src/ir.rs
index e8c15dcf95..ce1628cd61 100644
--- a/tests-fuzz/src/ir.rs
+++ b/tests-fuzz/src/ir.rs
@@ -20,6 +20,7 @@ pub(crate) mod insert_expr;
 pub(crate) mod partition_expr;
 pub(crate) mod repartition_expr;
 pub(crate) mod select_expr;
+pub(crate) mod string_value;
 
 use core::fmt;
 use std::collections::HashMap;
@@ -126,20 +127,7 @@ pub fn generate_partition_bounds(datatype: &ConcreteDataType, bounds: usize) ->
         ConcreteDataType::Int64(_) => generate_values!(i64, bounds),
         ConcreteDataType::Float32(_) => generate_values!(f32, bounds),
         ConcreteDataType::Float64(_) => generate_values!(f64, bounds),
-        ConcreteDataType::String(_) => {
-            let base = b'A';
-            let range = b'z' - b'A';
-            let step = range / (bounds as u8 + 1);
-            (1..=bounds)
-                .map(|i| {
-                    Value::from(
-                        char::from(base + step * i as u8)
-                            .escape_default()
-                            .to_string(),
-                    )
-                })
-                .collect()
-        }
+        ConcreteDataType::String(_) => string_value::generate_partition_bounds(bounds),
         _ => unimplemented!("unsupported type: {datatype}"),
     }
 }
@@ -157,10 +145,7 @@ pub fn generate_random_value<R: Rng>(
         ConcreteDataType::Int64(_) => Value::from(rng.random::<i64>()),
         ConcreteDataType::Float32(_) => Value::from(rng.random::<f32>()),
         ConcreteDataType::Float64(_) => Value::from(rng.random::<f64>()),
-        ConcreteDataType::String(_) => match random_str {
-            Some(random) => Value::from(random.generate(rng).value),
-            None => Value::from(rng.random::<char>().to_string()),
-        },
+        ConcreteDataType::String(_) => string_value::generate_data_string_value(rng, random_str),
         ConcreteDataType::Date(_) => generate_random_date(rng),
 
         _ => unimplemented!("unsupported type: {datatype}"),
@@ -341,21 +326,7 @@ pub fn generate_partition_value<R: Rng + 'static>(
             }
         }
         datatypes::data_type::ConcreteDataType::String(_) => {
-            let upper = match first {
-                datatypes::value::Value::String(v) => v.as_utf8(),
-                _ => "",
-            };
-            if bound_idx == 0 {
-                if upper <= "A" {
-                    datatypes::value::Value::from("")
-                } else {
-                    datatypes::value::Value::from("A")
-                }
-            } else if bound_idx < bounds.len() {
-                bounds[bound_idx - 1].clone()
-            } else {
-                last.clone()
-            }
+            string_value::generate_partition_value(bounds, bound_idx)
         }
         _ => unimplemented!("unsupported partition column type: {column_type}"),
     }
diff --git a/tests-fuzz/src/ir/partition_expr.rs b/tests-fuzz/src/ir/partition_expr.rs
index c91dd487ae..908223366c 100644
--- a/tests-fuzz/src/ir/partition_expr.rs
+++ b/tests-fuzz/src/ir/partition_expr.rs
@@ -20,7 +20,7 @@ use snafu::ensure;
 
 use crate::context::TableContext;
 use crate::error::{self, Result};
-use crate::ir::{Ident, generate_random_value};
+use crate::ir::{Ident, generate_random_value, string_value};
 
 /// A partitioning scheme that divides a single column into multiple ranges based on provided bounds.
 ///
@@ -245,6 +245,10 @@ pub fn generate_unique_bound<R: Rng + 'static>(
     datatype: &ConcreteDataType,
     bounds: &[Value],
 ) -> Result<Value> {
+    if matches!(datatype, ConcreteDataType::String(_)) {
+        return string_value::generate_unique_partition_bound(rng, bounds);
+    }
+
     for _ in 0..16 {
         let candidate = generate_random_value(rng, datatype, None);
         if !bounds.contains(&candidate) {
diff --git a/tests-fuzz/src/ir/string_value.rs b/tests-fuzz/src/ir/string_value.rs
new file mode 100644
index 0000000000..6a53aa69de
--- /dev/null
+++ b/tests-fuzz/src/ir/string_value.rs
@@ -0,0 +1,162 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use datatypes::value::Value;
+use rand::Rng;
+
+use crate::error::{self, Result};
+use crate::generator::Random;
+use crate::ir::Ident;
+
+const READABLE_CHARSET: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+fn readable_token(index: usize) -> String {
+    let radix = READABLE_CHARSET.len();
+    let mut digits: Vec<char> = Vec::new();
+    let mut remaining = index + 1;
+
+    // Bijective base-`radix` numeration: emit the least-significant digit first.
+    while remaining != 0 {
+        remaining -= 1;
+        digits.push(char::from(READABLE_CHARSET[remaining % radix]));
+        remaining /= radix;
+    }
+    digits.into_iter().rev().collect()
+}
+
+pub fn generate_data_string_value<R: Rng>(
+    rng: &mut R,
+    random_str: Option<&dyn Random<Ident, R>>,
+) -> Value {
+    // Prefer the caller-supplied generator; otherwise draw a short readable token.
+    if let Some(generator) = random_str {
+        return Value::from(generator.generate(rng).value);
+    }
+    let token_count = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 4;
+    let picked = rng.random_range(0..token_count);
+    Value::from(readable_token(picked))
+}
+
+/// Generates ascending readable string bounds; every token is four characters wide.
+pub fn generate_partition_bounds(bounds: usize) -> Vec<Value> {
+    let base = READABLE_CHARSET.len();
+    // `readable_token` is shortlex-ordered ("z" precedes "00"), which disagrees with string
+    // comparison across lengths, so start at the first 4-char token and stay in that width.
+    let (first, span) = (base + base * base + base * base * base, base * base * 1024);
+    (1..=bounds)
+        .map(|i| Value::from(readable_token(first + i * span / (bounds + 1))))
+        .collect()
+}
+
+/// Returns a representative string value for the partition range selected by `bound_idx`.
+///
+/// Index 0 is the range below the first bound; index `i > 0` yields the bound at `i - 1`,
+/// and any index at or beyond `bounds.len()` yields the last bound.
+///
+/// # Panics
+/// Panics when `bounds` is empty.
+pub fn generate_partition_value(bounds: &[Value], bound_idx: usize) -> Value {
+    let (first, last) = (bounds.first().unwrap(), bounds.last().unwrap());
+    if bound_idx >= bounds.len() {
+        return last.clone();
+    }
+    if bound_idx > 0 {
+        return bounds[bound_idx - 1].clone();
+    }
+    // Below the first bound: use "0" unless the first bound is itself <= "0".
+    match first {
+        Value::String(v) if v.as_utf8() > "0" => Value::from("0"),
+        _ => Value::from(""),
+    }
+}
+
+/// Returns a readable token absent from `bounds`, probing consecutive token indices
+/// from a random start; fails only if every probed token is already taken.
+pub fn generate_unique_partition_bound<R: Rng>(rng: &mut R, bounds: &[Value]) -> Result<Value> {
+    let window = READABLE_CHARSET.len() * READABLE_CHARSET.len() * 1024;
+    let origin = rng.random_range(0..window);
+    let unused = (origin..origin + window)
+        .map(|idx| Value::from(readable_token(idx)))
+        .find(|candidate| !bounds.contains(candidate));
+
+    match unused {
+        Some(candidate) => Ok(candidate),
+        None => error::UnexpectedSnafu {
+            violated: "unable to generate unique string partition bound".to_string(),
+        }
+        .fail(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use rand::SeedableRng;
+    use rand_chacha::ChaCha8Rng;
+
+    use super::*;
+
+    #[test]
+    fn test_readable_token_grows_length() {
+        assert_eq!("0", readable_token(0)); // first charset entry
+        assert_eq!("9", readable_token(9));
+        assert_eq!("A", readable_token(10)); // uppercase letters follow the digits
+        assert_eq!("z", readable_token(61)); // last single-character token
+        assert_eq!("00", readable_token(62)); // first two-character token
+    }
+
+    #[test]
+    fn test_generate_partition_bounds_are_readable_and_unique() {
+        let bounds = generate_partition_bounds(8);
+        assert_eq!(8, bounds.len());
+
+        let mut values = bounds
+            .iter()
+            .map(|v| match v {
+                Value::String(s) => s.as_utf8().to_string(),
+                _ => panic!("expected string value"),
+            })
+            .collect::<Vec<_>>();
+        let mut dedup = values.clone(); // sorted + deduplicated copy for the uniqueness check
+        dedup.sort();
+        dedup.dedup();
+        assert_eq!(values.len(), dedup.len()); // equal lengths => no duplicate bounds
+
+        for s in values.drain(..) {
+            assert!(s.chars().all(|c| c.is_ascii_alphanumeric()));
+        }
+    }
+
+    #[test]
+    fn test_generate_partition_value_for_string_bounds() {
+        let bounds = vec![Value::from("A"), Value::from("M")];
+        assert_eq!(Value::from("0"), generate_partition_value(&bounds, 0)); // first bound > "0"
+        assert_eq!(Value::from("A"), generate_partition_value(&bounds, 1)); // bounds[0]
+        assert_eq!(Value::from("M"), generate_partition_value(&bounds, 2)); // last bound
+    }
+
+    #[test]
+    fn test_generate_unique_partition_bound_not_in_existing() {
+        let mut rng = ChaCha8Rng::seed_from_u64(42); // fixed seed keeps the test deterministic
+        let bounds = vec![Value::from("0"), Value::from("1"), Value::from("2")];
+        let candidate = generate_unique_partition_bound(&mut rng, &bounds).unwrap();
+        assert!(!bounds.contains(&candidate));
+        match candidate {
+            Value::String(s) => {
+                assert!(!s.as_utf8().is_empty());
+                assert!(s.as_utf8().chars().all(|c| c.is_ascii_alphanumeric()));
+            }
+            _ => panic!("expected string value"),
+        }
+    }
+}
diff --git a/tests-fuzz/src/translator.rs b/tests-fuzz/src/translator.rs
index 673b543f2c..4c5e0bb6a4 100644
--- a/tests-fuzz/src/translator.rs
+++ b/tests-fuzz/src/translator.rs
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 mod common;
+/// Translator that converts insert expressions into CSV records.
+pub mod csv;
 pub mod mysql;
 pub mod postgres;
 
diff --git a/tests-fuzz/src/translator/csv.rs b/tests-fuzz/src/translator/csv.rs
new file mode 100644
index 0000000000..e95956862c
--- /dev/null
+++ b/tests-fuzz/src/translator/csv.rs
@@ -0,0 +1,121 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::error::Error;
+use crate::ir::insert_expr::{InsertIntoExpr, RowValue};
+use crate::translator::DslTranslator;
+
+/// One CSV row: the string-rendered cells of a single insert row.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CsvRecord {
+    /// Cell values, in the same order as the values of the source insert row.
+    pub values: Vec<String>,
+}
+
+/// Header plus rows rendered as strings from one insert expression.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct CsvRecords {
+    /// Name of the table targeted by the insert expression.
+    pub table_name: String,
+    /// Column names of the insert expression, in column order.
+    pub headers: Vec<String>,
+    /// One record per row of the insert expression's value list.
+    pub records: Vec<CsvRecord>,
+}
+
+/// Converts an `InsertIntoExpr` into header and row strings ready for a CSV writer.
+pub struct InsertExprToCsvRecordsTranslator;
+
+impl DslTranslator<InsertIntoExpr, CsvRecords> for InsertExprToCsvRecordsTranslator {
+    type Error = Error;
+
+    /// Renders every column name and every row cell as a string, preserving input order.
+    fn translate(&self, input: &InsertIntoExpr) -> Result<CsvRecords, Self::Error> {
+        let mut headers = Vec::with_capacity(input.columns.len());
+        for column in &input.columns {
+            headers.push(column.name.to_string());
+        }
+        let mut records = Vec::with_capacity(input.values_list.len());
+        for row in &input.values_list {
+            let values = row.iter().map(Self::format_row_value).collect();
+            records.push(CsvRecord { values });
+        }
+        Ok(CsvRecords {
+            table_name: input.table_name.to_string(),
+            headers,
+            records,
+        })
+    }
+}
+
+impl InsertExprToCsvRecordsTranslator {
+    /// Formats one cell: NULL becomes an empty cell, a default marker becomes `DEFAULT`.
+    fn format_row_value(value: &RowValue) -> String {
+        let RowValue::Value(inner) = value else {
+            return "DEFAULT".to_string();
+        };
+        if matches!(inner, datatypes::value::Value::Null) {
+            return String::new();
+        }
+        inner.to_string()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datatypes::data_type::ConcreteDataType;
+
+    use super::InsertExprToCsvRecordsTranslator;
+    use crate::ir::create_expr::ColumnOption;
+    use crate::ir::insert_expr::{InsertIntoExpr, RowValue};
+    use crate::ir::{Column, Ident};
+    use crate::translator::DslTranslator;
+
+    #[test]
+    fn test_translate_insert_expr_to_csv_records() {
+        let input = InsertIntoExpr {
+            table_name: Ident::new("metric_a"),
+            omit_column_list: false,
+            columns: vec![
+                Column {
+                    name: "host".into(),
+                    column_type: ConcreteDataType::string_datatype(),
+                    options: vec![ColumnOption::PrimaryKey],
+                },
+                Column {
+                    name: "value".into(),
+                    column_type: ConcreteDataType::float64_datatype(),
+                    options: vec![],
+                },
+            ],
+            values_list: vec![
+                vec![
+                    RowValue::Value(datatypes::value::Value::String("web-1".into())),
+                    RowValue::Value(datatypes::value::Value::Int32(15)), // expected cell: "15"
+                ],
+                vec![
+                    RowValue::Value(datatypes::value::Value::Null), // expected cell: ""
+                    RowValue::Default, // expected cell: "DEFAULT"
+                ],
+            ],
+        };
+
+        let output = InsertExprToCsvRecordsTranslator.translate(&input).unwrap();
+        assert_eq!(output.table_name, "metric_a");
+        assert_eq!(output.headers, vec!["host", "value"]); // headers follow column order
+        assert_eq!(output.records.len(), 2); // one record per input row
+        assert_eq!(output.records[0].values, vec!["web-1", "15"]);
+        assert_eq!(output.records[1].values, vec!["", "DEFAULT"]);
+    }
+}
diff --git a/tests-fuzz/src/utils.rs b/tests-fuzz/src/utils.rs
index 0780f6c93d..d55abab3c2 100644
--- a/tests-fuzz/src/utils.rs
+++ b/tests-fuzz/src/utils.rs
@@ -15,6 +15,8 @@
 pub mod cluster_info;
 pub mod config;
 pub mod crd;
+/// CSV dump writer utilities for fuzz tests.
+pub mod csv_dump_writer;
 pub mod health;
 pub mod migration;
 pub mod partition;
@@ -22,10 +24,15 @@ pub mod pod_failure;
 pub mod procedure;
 #[cfg(feature = "unstable")]
 pub mod process;
+pub mod retry;
+/// SQL dump writer utilities for fuzz tests.
+pub mod sql_dump_writer;
 pub mod wait;
 
 use std::env;
+use std::str::FromStr;
 
+use common_base::readable_size::ReadableSize;
 use common_telemetry::info;
 use common_telemetry::tracing::log::LevelFilter;
 use paste::paste;
@@ -126,6 +133,14 @@ pub const GT_FUZZ_INPUT_MAX_COLUMNS: &str = "GT_FUZZ_INPUT_MAX_COLUMNS";
 pub const GT_FUZZ_INPUT_MAX_ALTER_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_ALTER_ACTIONS";
 pub const GT_FUZZ_INPUT_MAX_INSERT_ACTIONS: &str = "GT_FUZZ_INPUT_MAX_INSERT_ACTIONS";
 pub const FUZZ_OVERRIDE_PREFIX: &str = "GT_FUZZ_OVERRIDE_";
+/// Env var name; presumably toggles CSV dump generation for fuzz runs (reader not shown here).
+pub const GT_FUZZ_DUMP_TABLE_CSV: &str = "GT_FUZZ_DUMP_TABLE_CSV";
+/// Env var name holding the base directory for CSV dump sessions.
+pub const GT_FUZZ_DUMP_DIR: &str = "GT_FUZZ_DUMP_DIR";
+/// Env var name holding the suffix appended to a dump session's directory name.
+pub const GT_FUZZ_DUMP_SUFFIX: &str = "GT_FUZZ_DUMP_SUFFIX";
+/// Env var name holding the max in-memory CSV buffer size before auto flush.
+pub const GT_FUZZ_DUMP_BUFFER_MAX_BYTES: &str = "GT_FUZZ_DUMP_BUFFER_MAX_BYTES";
 
 /// Reads an override value for a fuzz parameter from env `GT_FUZZ_OVERRIDE_<NAME>`.
 pub fn get_fuzz_override<T>(name: &str) -> Option<T>
@@ -137,6 +152,33 @@ where
     env::var(&key).ok().and_then(|v| v.parse().ok())
 }
 
+/// Returns the CSV dump base directory from `GT_FUZZ_DUMP_DIR`, or a `/tmp` default.
+pub fn get_gt_fuzz_dump_dir() -> String {
+    let _ = dotenv::dotenv();
+    env::var(GT_FUZZ_DUMP_DIR).unwrap_or_else(|_unset| "/tmp/greptime-fuzz-dumps".into())
+}
+
+/// Returns the dump directory-name suffix from `GT_FUZZ_DUMP_SUFFIX`, or a built-in default.
+pub fn get_gt_fuzz_dump_suffix() -> String {
+    let _ = dotenv::dotenv();
+    env::var(GT_FUZZ_DUMP_SUFFIX).unwrap_or_else(|_unset| ".repartition-metric-csv".into())
+}
+
+/// Returns the max in-memory CSV buffer size in bytes from `GT_FUZZ_DUMP_BUFFER_MAX_BYTES`.
+/// Accepts a plain byte count or a `ReadableSize` string; anything else falls back to 8 MiB.
+pub fn get_gt_fuzz_dump_buffer_max_bytes() -> usize {
+    const DEFAULT_MAX_BYTES: usize = 8 * 1024 * 1024;
+    let _ = dotenv::dotenv();
+    let Ok(raw) = env::var(GT_FUZZ_DUMP_BUFFER_MAX_BYTES) else {
+        return DEFAULT_MAX_BYTES;
+    };
+    match (raw.parse::<usize>(), ReadableSize::from_str(&raw)) {
+        (Ok(bytes), _) => bytes,
+        (Err(_), Ok(size)) => size.as_bytes() as usize,
+        (Err(_), Err(_)) => DEFAULT_MAX_BYTES,
+    }
+}
+
 macro_rules! make_get_from_env_helper {
     ($key:expr, $default: expr) => {
         paste! {
diff --git a/tests-fuzz/src/utils/csv_dump_writer.rs b/tests-fuzz/src/utils/csv_dump_writer.rs
new file mode 100644
index 0000000000..de16a23c24
--- /dev/null
+++ b/tests-fuzz/src/utils/csv_dump_writer.rs
@@ -0,0 +1,383 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::{HashMap, HashSet};
+use std::fs::{File, OpenOptions, create_dir_all, remove_dir_all};
+use std::io::Write;
+use std::path::{Path, PathBuf};
+
+use common_telemetry::{info, warn};
+use common_time::util::current_time_millis;
+use snafu::ResultExt;
+
+use crate::error::{self, Result};
+use crate::translator::csv::CsvRecords;
+use crate::utils::{
+    get_gt_fuzz_dump_buffer_max_bytes, get_gt_fuzz_dump_dir, get_gt_fuzz_dump_suffix,
+};
+
+/// Metadata describing one CSV dump session; persisted to `seed.meta` in the run directory.
+#[derive(Debug, Clone)]
+pub struct CsvDumpMetadata {
+    /// Fuzz target name; leading component of the run directory name.
+    pub target: String,
+    /// Seed used by current fuzz input; part of the run directory name.
+    pub seed: u64,
+    /// Repartition action count; part of the run directory name.
+    pub actions: usize,
+    /// Initial partition count.
+    pub partitions: usize,
+    /// Logical table count.
+    pub tables: usize,
+    /// Session start time in unix milliseconds; part of the run directory name.
+    pub started_at_unix_ms: i64,
+}
+
+impl CsvDumpMetadata {
+    /// Builds dump metadata, stamping `started_at_unix_ms` with `current_time_millis()`.
+    pub fn new(
+        target: impl Into<String>,
+        seed: u64,
+        actions: usize,
+        partitions: usize,
+        tables: usize,
+    ) -> Self {
+        Self {
+            target: target.into(),
+            seed,
+            actions,
+            partitions,
+            tables,
+            started_at_unix_ms: current_time_millis(),
+        }
+    }
+}
+
+/// Buffers CSV record batches in memory and appends them to per-table files in `run_dir`.
+#[derive(Debug)]
+pub struct CsvDumpSession {
+    /// Session metadata.
+    pub metadata: CsvDumpMetadata,
+    /// Session directory path; created when the session is constructed.
+    pub run_dir: PathBuf,
+    /// Buffered-size threshold at which `append` flushes automatically.
+    pub max_buffer_bytes: usize,
+    records: Vec<CsvRecords>, // batches appended since the last flush
+    buffered_bytes: usize, // estimated size of `records`; reset to 0 by a flush
+    written_tables: HashSet<String>, // tables whose header row was already written
+    full_headers_by_table: HashMap<String, Vec<String>>, // first full header list per table
+}
+
+impl CsvDumpSession {
+    /// Creates the session with the buffer limit from `get_gt_fuzz_dump_buffer_max_bytes()`.
+    pub fn new(metadata: CsvDumpMetadata) -> Result<Self> {
+        Self::new_with_buffer_limit(metadata, get_gt_fuzz_dump_buffer_max_bytes())
+    }
+
+    /// Creates the run directory, writes `seed.meta`, and returns an empty session.
+    pub fn new_with_buffer_limit(
+        metadata: CsvDumpMetadata,
+        max_buffer_bytes: usize,
+    ) -> Result<Self> {
+        let run_dir = build_run_dir(&metadata);
+        create_dir_all(&run_dir).context(error::CreateFileSnafu {
+            path: run_dir.to_string_lossy().to_string(),
+        })?;
+        write_seed_meta(&run_dir, &metadata)?;
+        info!(
+            "Create csv dump session, target: {}, run_dir: {}, max_buffer_bytes: {}",
+            metadata.target,
+            run_dir.display(),
+            max_buffer_bytes
+        );
+
+        Ok(Self {
+            metadata,
+            run_dir,
+            max_buffer_bytes,
+            records: Vec::new(),
+            buffered_bytes: 0,
+            written_tables: HashSet::new(),
+            full_headers_by_table: HashMap::new(),
+        })
+    }
+
+    /// Buffers one batch; flushes at once when the estimated size reaches the limit.
+    pub fn append(&mut self, records: CsvRecords, full_headers: Vec<String>) -> Result<()> {
+        self.full_headers_by_table
+            .entry(records.table_name.clone())
+            .or_insert(full_headers); // first registration wins; later lists are ignored
+        self.buffered_bytes += estimate_csv_records_size(&records);
+        self.records.push(records);
+        if self.buffered_bytes >= self.max_buffer_bytes {
+            self.flush_buffered_records()?;
+        }
+        Ok(())
+    }
+
+    /// Writes every buffered batch to its table's CSV file and empties the buffer.
+    pub fn flush_all(&mut self) -> Result<()> {
+        self.flush_buffered_records()
+    }
+
+    /// Removes the run directory recursively, logging the outcome either way.
+    pub fn cleanup_on_success(&self) -> std::io::Result<()> {
+        match remove_dir_all(&self.run_dir) {
+            Ok(_) => {
+                info!(
+                    "Cleanup csv dump directory on success: {}",
+                    self.run_dir.display()
+                );
+                Ok(())
+            }
+            Err(err) => {
+                warn!(
+                    "Cleanup csv dump directory failed: {}, error: {:?}",
+                    self.run_dir.display(),
+                    err
+                );
+                Err(err)
+            }
+        }
+    }
+
+    fn flush_buffered_records(&mut self) -> Result<()> {
+        if self.records.is_empty() {
+            return Ok(());
+        }
+        for batch in &self.records {
+            write_batch_csv(
+                &self.run_dir,
+                batch,
+                &mut self.written_tables,
+                &self.full_headers_by_table,
+            )?; // an error returns here, leaving the whole buffer (and byte count) untouched
+        }
+        self.records.clear();
+        self.buffered_bytes = 0;
+        Ok(())
+    }
+}
+
+/// Writes `seed.meta` into `run_dir` with one `key=value` line per metadata field.
+fn write_seed_meta(run_dir: &Path, metadata: &CsvDumpMetadata) -> Result<()> {
+    let meta_path = run_dir.join("seed.meta");
+    let path_text = meta_path.to_string_lossy().to_string();
+    let body = format!(
+        "target={}\nseed={}\nactions={}\npartitions={}\ntables={}\nstarted_at_unix_ms={}\n",
+        metadata.target,
+        metadata.seed,
+        metadata.actions,
+        metadata.partitions,
+        metadata.tables,
+        metadata.started_at_unix_ms,
+    );
+    let mut meta_file = File::create(&meta_path).context(error::CreateFileSnafu {
+        path: path_text.clone(),
+    })?;
+    meta_file
+        .write_all(body.as_bytes())
+        .context(error::WriteFileSnafu { path: path_text })
+}
+
+/// Appends one batch to `<run_dir>/<sanitized table name>.table-data.csv`.
+/// The header row is emitted the first time a table is seen. Every record is re-aligned to
+/// the table's full header list (missing columns become empty cells), and the batch is
+/// rendered in memory so the unbuffered file receives a single `write_all`.
+fn write_batch_csv(
+    run_dir: &Path,
+    batch: &CsvRecords,
+    written_tables: &mut HashSet<String>,
+    full_headers_by_table: &HashMap<String, Vec<String>>,
+) -> Result<()> {
+    let output_headers = full_headers_by_table
+        .get(&batch.table_name)
+        .unwrap_or(&batch.headers);
+    let file_name = format!("{}.table-data.csv", sanitize_file_name(&batch.table_name));
+    let path = run_dir.join(file_name);
+    let mut file = OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(&path)
+        .context(error::CreateFileSnafu {
+            path: path.to_string_lossy().to_string(),
+        })?;
+
+    let needs_header = !written_tables.contains(&batch.table_name);
+    let mut content = String::new();
+    if needs_header {
+        content.push_str(&join_line(output_headers));
+        content.push('\n');
+    }
+
+    let header_index = batch
+        .headers
+        .iter()
+        .enumerate()
+        .map(|(idx, header)| (header.as_str(), idx))
+        .collect::<HashMap<_, _>>();
+    for record in &batch.records {
+        let aligned_values = output_headers
+            .iter()
+            .map(|header| {
+                header_index
+                    .get(header.as_str())
+                    .and_then(|idx| record.values.get(*idx))
+                    .cloned()
+                    .unwrap_or_default()
+            })
+            .collect::<Vec<_>>();
+        content.push_str(&join_line(&aligned_values));
+        content.push('\n');
+    }
+
+    file.write_all(content.as_bytes())
+        .context(error::WriteFileSnafu {
+            path: path.to_string_lossy().to_string(),
+        })?;
+    // Mark the header as written only after it actually reached the file.
+    written_tables.insert(batch.table_name.clone());
+    Ok(())
+}
+
+/// Estimates buffered size as the total byte length of all header and cell strings.
+fn estimate_csv_records_size(records: &CsvRecords) -> usize {
+    let mut total: usize = records.headers.iter().map(|header| header.len()).sum();
+    for record in &records.records {
+        for cell in &record.values {
+            total += cell.len();
+        }
+    }
+    total
+}
+
+/// Renders one CSV line from `cells`.
+///
+/// Each cell is passed through `escape_csv_cell`; the result has no trailing newline.
+fn join_line(cells: &[String]) -> String {
+    let escaped: Vec<String> = cells.iter().map(|cell| escape_csv_cell(cell)).collect();
+    escaped.join(",")
+}
+
+/// Quotes a cell containing `,`, `"`, CR or LF (doubling inner quotes); others pass through.
+fn escape_csv_cell(value: &str) -> String {
+    match value.find(|ch: char| matches!(ch, ',' | '"' | '\n' | '\r')) {
+        Some(_) => format!("\"{}\"", value.replace('"', "\"\"")),
+        None => value.to_string(),
+    }
+}
+
+/// Makes `raw` usable as a file-name component.
+///
+/// ASCII letters, ASCII digits, `_` and `-` are kept; every other character becomes `_`.
+fn sanitize_file_name(raw: &str) -> String {
+    let mut sanitized = String::with_capacity(raw.len());
+    for ch in raw.chars() {
+        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-');
+        sanitized.push(if keep { ch } else { '_' });
+    }
+    sanitized
+}
+
+/// Builds `<dump dir>/<target>_seed_<seed>_actions_<actions>_ts_<start ms><suffix>`.
+fn build_run_dir(metadata: &CsvDumpMetadata) -> PathBuf {
+    let root = get_gt_fuzz_dump_dir();
+    let (m, tail) = (metadata, get_gt_fuzz_dump_suffix());
+    Path::new(&root).join(format!(
+        "{}_seed_{}_actions_{}_ts_{}{}",
+        m.target, m.seed, m.actions, m.started_at_unix_ms, tail
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{CsvDumpMetadata, CsvDumpSession};
+    use crate::translator::csv::{CsvRecord, CsvRecords};
+
+    #[test]
+    fn test_create_session_and_flush() {
+        let mut session = CsvDumpSession::new_with_buffer_limit(
+            CsvDumpMetadata::new("fuzz_case", 1, 2, 3, 4),
+            1024, // well above this batch's estimated size, so append() does not auto-flush
+        )
+        .unwrap();
+        session
+            .append(
+                CsvRecords {
+                    table_name: "metric-a".to_string(),
+                    headers: vec!["host".to_string(), "value".to_string()],
+                    records: vec![CsvRecord {
+                        values: vec!["web-1".to_string(), "10".to_string()],
+                    }],
+                },
+                vec!["host".to_string(), "value".to_string()],
+            )
+            .unwrap();
+        session.flush_all().unwrap(); // explicit flush writes the table file
+
+        assert!(session.run_dir.exists());
+        assert!(session.run_dir.join("seed.meta").exists());
+        assert!(session.run_dir.join("metric-a.table-data.csv").exists());
+    }
+
+    #[test]
+    fn test_auto_flush_on_buffer_limit() {
+        let mut session =
+            CsvDumpSession::new_with_buffer_limit(CsvDumpMetadata::new("fuzz_case", 5, 2, 3, 4), 1)
+                .unwrap();
+        session
+            .append(
+                CsvRecords {
+                    table_name: "metric-b".to_string(),
+                    headers: vec!["host".to_string()],
+                    records: vec![CsvRecord {
+                        values: vec!["web-2".to_string()],
+                    }],
+                },
+                vec!["host".to_string()],
+            )
+            .unwrap(); // 1-byte limit: this append flushes on its own
+
+        assert!(session.run_dir.join("metric-b.table-data.csv").exists());
+        assert_eq!(session.buffered_bytes, 0); // the flush resets the byte counter
+    }
+
+    #[test]
+    fn test_flush_with_partial_headers_uses_full_headers() {
+        let mut session = CsvDumpSession::new_with_buffer_limit(
+            CsvDumpMetadata::new("fuzz_case", 7, 2, 3, 4),
+            1024,
+        )
+        .unwrap();
+        session
+            .append(
+                CsvRecords {
+                    table_name: "metric-c".to_string(),
+                    headers: vec!["host".to_string(), "value".to_string()],
+                    records: vec![CsvRecord {
+                        values: vec!["web-3".to_string(), "12".to_string()],
+                    }],
+                },
+                vec!["host".to_string(), "idc".to_string(), "value".to_string()],
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let file =
+            std::fs::read_to_string(session.run_dir.join("metric-c.table-data.csv")).unwrap();
+        let mut lines = file.lines();
+        assert_eq!(lines.next().unwrap(), "host,idc,value"); // full header list, not the batch's
+        assert_eq!(lines.next().unwrap(), "web-3,,12"); // "idc" is absent from the batch
+    }
+}
diff --git a/tests-fuzz/src/utils/partition.rs b/tests-fuzz/src/utils/partition.rs
index d3dc30061d..89a684326b 100644
--- a/tests-fuzz/src/utils/partition.rs
+++ b/tests-fuzz/src/utils/partition.rs
@@ -36,7 +36,7 @@ pub struct PartitionCount {
 }
 
 pub async fn count_partitions(db: &MySqlPool, datanode_id: u64) -> Result<PartitionCount> {
-    let sql = "select count(1) as count from information_schema.region_peers where peer_id == ?";
+    let sql = "select count(1) as count from information_schema.region_peers where peer_id = ?";
     sqlx::query_as::<_, PartitionCount>(sql)
         .bind(datanode_id)
         .fetch_one(db)
diff --git a/tests-fuzz/src/utils/retry.rs b/tests-fuzz/src/utils/retry.rs
new file mode 100644
index 0000000000..06d1ede54f
--- /dev/null
+++ b/tests-fuzz/src/utils/retry.rs
@@ -0,0 +1,49 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::future::Future;
+use std::time::Duration;
+
+use common_telemetry::warn;
+
+/// Runs `operation` until it succeeds or `max_attempts` attempts have failed.
+///
+/// After every failed attempt except the last one, a warning is logged and the task sleeps for
+/// the current backoff. The backoff starts at `init_backoff` and doubles after each sleep, capped
+/// at `max_backoff`.
+///
+/// `max_attempts == 0` is treated as `1`: the operation always runs at least once, so the
+/// function can always return the operation's own result instead of panicking.
+///
+/// Returns the first `Ok` value, or the error of the final attempt.
+pub async fn retry_with_backoff<T, E, Fut, F>(
+    mut operation: F,
+    max_attempts: usize,
+    init_backoff: Duration,
+    max_backoff: Duration,
+) -> Result<T, E>
+where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = Result<T, E>>,
+    E: std::fmt::Debug,
+{
+    // Without at least one attempt there is neither a value nor an error to hand back.
+    let max_attempts = max_attempts.max(1);
+    let mut backoff = init_backoff;
+    // 1-based number of the attempt currently being executed.
+    let mut attempt = 1;
+    loop {
+        match operation().await {
+            Ok(result) => return Ok(result),
+            Err(err) if attempt >= max_attempts => return Err(err),
+            Err(err) => {
+                warn!(
+                    "Retryable operation failed, attempt: {}, max_attempts: {}, backoff: {:?}, error: {:?}",
+                    attempt, max_attempts, backoff, err
+                );
+                tokio::time::sleep(backoff).await;
+                // `Duration * 2` panics on overflow; saturate before applying the cap.
+                backoff = std::cmp::min(backoff.saturating_mul(2), max_backoff);
+                attempt += 1;
+            }
+        }
+    }
+}
diff --git a/tests-fuzz/src/utils/sql_dump_writer.rs b/tests-fuzz/src/utils/sql_dump_writer.rs
new file mode 100644
index 0000000000..6f098d9584
--- /dev/null
+++ b/tests-fuzz/src/utils/sql_dump_writer.rs
@@ -0,0 +1,267 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::fs::{OpenOptions, create_dir_all};
+use std::io::Write;
+use std::path::PathBuf;
+
+use snafu::ResultExt;
+
+use crate::error::{self, Result};
+use crate::utils::get_gt_fuzz_dump_buffer_max_bytes;
+
+/// Session writer for table-scoped SQL trace files.
+///
+/// Entries are staged in memory per table and appended to
+/// `<run_dir>/<sanitized table name>.trace.sql` when the session is flushed.
+#[derive(Debug)]
+pub struct SqlDumpSession {
+    /// Session directory path.
+    pub run_dir: PathBuf,
+    /// Max in-memory buffer size before auto flush.
+    pub max_buffer_bytes: usize,
+    // Sum of the byte lengths of all currently staged entries.
+    buffered_bytes: usize,
+    // Staged entries keyed by table name; each `Vec` keeps the order in which entries were pushed.
+    entries_by_table: HashMap<String, Vec<String>>,
+}
+
+impl SqlDumpSession {
+    /// Creates a SQL dump session whose buffer limit is taken from
+    /// `get_gt_fuzz_dump_buffer_max_bytes()`.
+    pub fn new(run_dir: PathBuf) -> Result<Self> {
+        Self::new_with_buffer_limit(run_dir, get_gt_fuzz_dump_buffer_max_bytes())
+    }
+
+    /// Creates a SQL dump session with a custom buffer limit.
+    ///
+    /// `run_dir` (including missing parents) is created eagerly; a failure to do so is returned
+    /// through `error::CreateFileSnafu`.
+    pub fn new_with_buffer_limit(run_dir: PathBuf, max_buffer_bytes: usize) -> Result<Self> {
+        create_dir_all(&run_dir).context(error::CreateFileSnafu {
+            path: run_dir.to_string_lossy().to_string(),
+        })?;
+
+        Ok(Self {
+            run_dir,
+            max_buffer_bytes,
+            buffered_bytes: 0,
+            entries_by_table: HashMap::new(),
+        })
+    }
+
+    /// Appends one SQL statement for a logical table.
+    ///
+    /// The statement is normalized to end with `;`. When `comment` is given, every line of it is
+    /// written as a `-- ` comment above the statement.
+    pub fn append_sql(&mut self, table: &str, sql: &str, comment: Option<&str>) -> Result<()> {
+        self.push_entry(table, format_sql_entry(sql, comment))
+    }
+
+    /// Broadcasts one comment event to all table trace files.
+    ///
+    /// The same entry (`event` as comment, followed by `sql`) is staged once for every table in
+    /// `tables`.
+    pub fn broadcast_event<I, T>(&mut self, tables: I, event: &str, sql: &str) -> Result<()>
+    where
+        I: IntoIterator<Item = T>,
+        T: AsRef<str>,
+    {
+        let entry = format_sql_entry(sql, Some(event));
+        for table in tables {
+            self.push_entry(table.as_ref(), entry.clone())?;
+        }
+        Ok(())
+    }
+
+    /// Flushes all staged SQL traces to table-scoped files.
+    pub fn flush_all(&mut self) -> Result<()> {
+        self.flush_buffered_entries()
+    }
+
+    /// Stages `entry` for `table` and flushes everything once the staged bytes reach
+    /// `max_buffer_bytes`.
+    fn push_entry(&mut self, table: &str, entry: String) -> Result<()> {
+        self.buffered_bytes += entry.len();
+        self.entries_by_table
+            .entry(table.to_string())
+            .or_default()
+            .push(entry);
+
+        if self.buffered_bytes >= self.max_buffer_bytes {
+            self.flush_buffered_entries()?;
+        }
+        Ok(())
+    }
+
+    /// Appends the staged entries of every table to its trace file.
+    ///
+    /// Tables are written one at a time. If writing a table fails, only that table and the
+    /// tables not yet visited are put back into the buffer, so a later flush does not append
+    /// the already persisted tables a second time.
+    fn flush_buffered_entries(&mut self) -> Result<()> {
+        let mut pending = std::mem::take(&mut self.entries_by_table).into_iter();
+        // `while let` instead of `for`: the iterator itself is moved back on the error path.
+        while let Some((table, entries)) = pending.next() {
+            if let Err(err) = self.write_table_entries(&table, &entries) {
+                self.entries_by_table.insert(table, entries);
+                self.entries_by_table.extend(pending);
+                self.buffered_bytes = self
+                    .entries_by_table
+                    .values()
+                    .flatten()
+                    .map(String::len)
+                    .sum();
+                return Err(err);
+            }
+        }
+
+        self.buffered_bytes = 0;
+        Ok(())
+    }
+
+    /// Appends `entries`, each followed by a newline, to the trace file of `table`.
+    ///
+    /// The file is created when missing. All entries are joined into one buffer and written
+    /// with a single `write_all` call.
+    fn write_table_entries(&self, table: &str, entries: &[String]) -> Result<()> {
+        let path = self
+            .run_dir
+            .join(format!("{}.trace.sql", sanitize_file_name(table)));
+        let mut file = OpenOptions::new()
+            .create(true)
+            .append(true)
+            .open(&path)
+            .context(error::CreateFileSnafu {
+                path: path.to_string_lossy().to_string(),
+            })?;
+
+        let capacity = entries.iter().map(|entry| entry.len() + 1).sum();
+        let mut payload = String::with_capacity(capacity);
+        for entry in entries {
+            payload.push_str(entry);
+            payload.push('\n');
+        }
+
+        file.write_all(payload.as_bytes())
+            .context(error::WriteFileSnafu {
+                path: path.to_string_lossy().to_string(),
+            })
+    }
+}
+
+/// Renders one trace entry: the normalized statement, preceded by the rendered comment lines
+/// when a comment is supplied.
+fn format_sql_entry(sql: &str, comment: Option<&str>) -> String {
+    let statement = normalize_sql(sql);
+    match comment {
+        Some(text) => format!("{}\n{statement}", format_comment(text)),
+        None => statement,
+    }
+}
+
+/// Prefixes every line of `comment` with `-- ` and joins the lines with `\n`.
+///
+/// An empty `comment` yields an empty string.
+fn format_comment(comment: &str) -> String {
+    let mut rendered = String::new();
+    for (idx, line) in comment.lines().enumerate() {
+        if idx > 0 {
+            rendered.push('\n');
+        }
+        rendered.push_str("-- ");
+        rendered.push_str(line);
+    }
+    rendered
+}
+
+/// Strips trailing whitespace from `sql` and appends `;` unless the statement already ends
+/// with one.
+fn normalize_sql(sql: &str) -> String {
+    let mut statement = sql.trim_end().to_string();
+    if !statement.ends_with(';') {
+        statement.push(';');
+    }
+    statement
+}
+
+/// Maps `raw` to a file-name-safe string: ASCII letters, ASCII digits, `_` and `-` are kept,
+/// every other character is replaced by `_`.
+fn sanitize_file_name(raw: &str) -> String {
+    let mut sanitized = String::with_capacity(raw.len());
+    for ch in raw.chars() {
+        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-');
+        sanitized.push(if keep { ch } else { '_' });
+    }
+    sanitized
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::{SystemTime, UNIX_EPOCH};
+
+    use super::SqlDumpSession;
+
+    /// Returns `<temp dir>/<prefix>-<current Unix time in milliseconds>`.
+    fn unique_run_dir(prefix: &str) -> std::path::PathBuf {
+        let millis = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+        std::env::temp_dir().join(format!("{prefix}-{millis}"))
+    }
+
+    /// Reads the trace file written for `table` inside `run_dir`.
+    fn read_trace(run_dir: &std::path::Path, table: &str) -> String {
+        std::fs::read_to_string(run_dir.join(format!("{table}.trace.sql"))).unwrap()
+    }
+
+    #[test]
+    fn test_append_sql_writes_table_trace_file() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-dump");
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap();
+
+        let comment = Some("kind=insert elapsed_ms=10");
+        session
+            .append_sql("metric-a", "INSERT INTO t VALUES(1)", comment)
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let content = read_trace(&run_dir, "metric-a");
+        assert!(content.contains("-- kind=insert elapsed_ms=10"));
+        assert!(content.contains("INSERT INTO t VALUES(1);"));
+    }
+
+    #[test]
+    fn test_broadcast_event_writes_to_all_tables() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-broadcast");
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap();
+
+        let tables = ["metric-a", "metric-b"];
+        session
+            .broadcast_event(
+                tables,
+                "repartition action_idx=3",
+                "ALTER TABLE t REPARTITION",
+            )
+            .unwrap();
+        session.flush_all().unwrap();
+
+        // Both tables must have received the identical comment and statement.
+        let content_a = read_trace(&run_dir, "metric-a");
+        let content_b = read_trace(&run_dir, "metric-b");
+        assert!(content_a.contains("-- repartition action_idx=3"));
+        assert!(content_a.contains("ALTER TABLE t REPARTITION;"));
+        assert!(content_b.contains("-- repartition action_idx=3"));
+        assert!(content_b.contains("ALTER TABLE t REPARTITION;"));
+    }
+
+    #[test]
+    fn test_multiline_comment_is_prefixed_per_line() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-dump-comment");
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1024).unwrap();
+
+        let comment = Some("kind=insert\nstarted_at_ms=1 elapsed_ms=2");
+        session
+            .append_sql("metric-a", "INSERT INTO t VALUES(1)", comment)
+            .unwrap();
+        session.flush_all().unwrap();
+
+        let content = read_trace(&run_dir, "metric-a");
+        assert!(content.contains("-- kind=insert\n-- started_at_ms=1 elapsed_ms=2"));
+    }
+
+    #[test]
+    fn test_auto_flush_on_buffer_limit() {
+        let run_dir = unique_run_dir("tests-fuzz-sql-dump-limit");
+        // A 1-byte limit is reached by the first entry, so no explicit flush is issued here.
+        let mut session = SqlDumpSession::new_with_buffer_limit(run_dir.clone(), 1).unwrap();
+
+        session
+            .append_sql("metric-a", "INSERT INTO t VALUES(1)", None)
+            .unwrap();
+
+        let trace_path = run_dir.join("metric-a.trace.sql");
+        assert!(trace_path.exists());
+        assert_eq!(session.buffered_bytes, 0);
+    }
+}
diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs
new file mode 100644
index 0000000000..7932bc7759
--- /dev/null
+++ b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs
@@ -0,0 +1,684 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#![no_main]
+
+use std::collections::{BTreeMap, HashMap};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+use arbitrary::{Arbitrary, Unstructured};
+use common_telemetry::{info, warn};
+use common_time::Timestamp;
+use common_time::util::current_time_millis;
+use libfuzzer_sys::fuzz_target;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaChaRng;
+use snafu::{ResultExt, ensure};
+use sqlx::{MySql, Pool};
+use tests_fuzz::context::{TableContext, TableContextRef};
+use tests_fuzz::error::{self, Result};
+use tests_fuzz::fake::{
+    ConstGenerator, MappedGenerator, WordGenerator, merge_two_word_map_fn, random_capitalize_map,
+    uppercase_and_keyword_backtick_map,
+};
+use tests_fuzz::generator::Generator;
+use tests_fuzz::generator::create_expr::{
+    CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder,
+};
+use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
+use tests_fuzz::generator::repartition_expr::{
+    MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
+};
+use tests_fuzz::ir::{
+    CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value,
+    generate_unique_timestamp_for_mysql_with_clock,
+};
+use tests_fuzz::translator::DslTranslator;
+use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator;
+use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
+use tests_fuzz::translator::mysql::insert_expr::InsertIntoExprTranslator;
+use tests_fuzz::translator::mysql::repartition_expr::RepartitionExprTranslator;
+use tests_fuzz::utils::csv_dump_writer::{CsvDumpMetadata, CsvDumpSession};
+use tests_fuzz::utils::retry::retry_with_backoff;
+use tests_fuzz::utils::sql_dump_writer::SqlDumpSession;
+use tests_fuzz::utils::{
+    Connections, get_fuzz_override, get_gt_fuzz_input_max_alter_actions,
+    get_gt_fuzz_input_max_tables, init_greptime_connections_via_env,
+};
+use tests_fuzz::validator::row::count_values;
+use tokio::sync::{mpsc, oneshot};
+
+// Seconds to wait for the writer task to acknowledge a barrier before the target panics.
+const BARRIER_ACK_TIMEOUT_SECS: u64 = 10;
+// Retry policy handed to `retry_with_backoff` for the row-count validation queries:
+// number of attempts, first backoff, and the cap on the backoff.
+const VALIDATE_QUERY_MAX_ATTEMPTS: usize = 6;
+const VALIDATE_QUERY_INIT_BACKOFF: Duration = Duration::from_millis(50);
+const VALIDATE_QUERY_MAX_BACKOFF: Duration = Duration::from_millis(800);
+
+/// Handles shared by the fuzz target and its writer task.
+#[derive(Clone)]
+struct FuzzContext {
+    // MySQL connection pool on which every statement of this target is executed.
+    greptime: Pool<MySql>,
+}
+
+impl FuzzContext {
+    /// Consumes the context and closes its connection pool.
+    async fn close(self) {
+        self.greptime.close().await;
+    }
+}
+
+/// Parameters of one fuzz run, produced by the `Arbitrary` implementation.
+#[derive(Clone, Debug)]
+struct FuzzInput {
+    // Seed for the `ChaChaRng` instances that drive the run.
+    seed: u64,
+    // Number of repartition actions to execute.
+    actions: usize,
+    // Partition count requested for the physical table at creation.
+    partitions: usize,
+    // Target number of logical tables to create.
+    tables: usize,
+}
+
+/// Generates the `CREATE TABLE` expression for the physical metric table.
+///
+/// The table is always named `fuzz_repartition_metric_physical`, gets `partitions` partitions,
+/// and uses `IF NOT EXISTS` with probability 0.5.
+fn generate_create_physical_table_expr<R: Rng + 'static>(
+    partitions: usize,
+    rng: &mut R,
+) -> Result<CreateTableExpr> {
+    let table_name = Ident::new("fuzz_repartition_metric_physical");
+    let if_not_exists = rng.random_bool(0.5);
+
+    let generator = CreatePhysicalTableExprGeneratorBuilder::default()
+        .name_generator(Box::new(ConstGenerator::new(table_name)))
+        .if_not_exists(if_not_exists)
+        .partition(partitions)
+        .build()
+        .unwrap();
+    generator.generate(rng)
+}
+
+/// Generates the `CREATE TABLE` expression for one logical table on top of
+/// `physical_table_ctx`.
+///
+/// The name comes from a word generator with random capitalization / keyword back-ticking,
+/// the label count is drawn from `1..=5`, and `IF NOT EXISTS` is used with probability 0.5.
+fn generate_create_logical_table_expr<R: Rng + 'static>(
+    physical_table_ctx: TableContextRef,
+    include_partition_column: bool,
+    rng: &mut R,
+) -> Result<CreateTableExpr> {
+    let name_map_fn =
+        merge_two_word_map_fn(random_capitalize_map, uppercase_and_keyword_backtick_map);
+    let name_generator = Box::new(MappedGenerator::new(WordGenerator, name_map_fn));
+
+    // The two `rng` draws stay in their original order so seeded runs remain reproducible.
+    let generator = CreateLogicalTableExprGeneratorBuilder::default()
+        .name_generator(name_generator)
+        .physical_table_ctx(physical_table_ctx)
+        .labels(rng.random_range(1..=5))
+        .if_not_exists(rng.random_bool(0.5))
+        .include_partition_column(include_partition_column)
+        .build()
+        .unwrap();
+    generator.generate(rng)
+}
+
+/// Generates an `INSERT` expression with `rows` rows for `table_ctx`.
+///
+/// The column list is always spelled out. Timestamp values come from the generator built on
+/// the shared `clock`; all other values come from `generate_random_value`.
+fn generate_insert_expr<R: Rng + 'static>(
+    rows: usize,
+    rng: &mut R,
+    table_ctx: TableContextRef,
+    clock: Arc<Mutex<Timestamp>>,
+) -> Result<InsertIntoExpr> {
+    let generator = InsertExprGeneratorBuilder::default()
+        .omit_column_list(false)
+        .table_ctx(table_ctx)
+        .rows(rows)
+        .value_generator(Box::new(generate_random_value))
+        .ts_value_generator(generate_unique_timestamp_for_mysql_with_clock(clock))
+        .build()
+        .unwrap();
+    generator.generate(rng)
+}
+
+/// Creates the physical metric table and up to `table_count` logical tables on top of it.
+///
+/// Logical table generation is attempted at most `table_count * 3` times; a generated table
+/// whose name is already taken is skipped without executing any SQL.
+///
+/// Returns, in order: the physical table context, the logical table contexts keyed by name,
+/// the `CREATE` SQL of every logical table keyed by table name, and the `CREATE` SQL of the
+/// physical table.
+///
+/// Fails when a statement cannot be generated, translated or executed, when the physical
+/// table has no partition, or when no logical table was created.
+async fn create_metric_tables<R: Rng + 'static>(
+    ctx: &FuzzContext,
+    rng: &mut R,
+    partitions: usize,
+    table_count: usize,
+) -> Result<(
+    TableContextRef,
+    BTreeMap<Ident, TableContextRef>,
+    HashMap<String, String>,
+    String,
+)> {
+    let translator = CreateTableExprTranslator;
+
+    // Physical table first: every logical table is generated against its context.
+    let physical_expr = generate_create_physical_table_expr(partitions, rng)?;
+    let create_physical_sql = translator.translate(&physical_expr)?;
+    let result = sqlx::query(&create_physical_sql)
+        .execute(&ctx.greptime)
+        .await
+        .context(error::ExecuteQuerySnafu {
+            sql: &create_physical_sql,
+        })?;
+    info!("Create physical table: {create_physical_sql}, result: {result:?}");
+    let physical_table_ctx = Arc::new(TableContext::from(&physical_expr));
+    ensure!(
+        physical_table_ctx.partition.is_some(),
+        error::AssertSnafu {
+            reason: "Physical metric table must have partition".to_string()
+        }
+    );
+
+    let mut logical_tables = BTreeMap::new();
+    let mut create_logical_sqls = HashMap::new();
+    let mut attempts_left = table_count * 3;
+    while attempts_left > 0 && logical_tables.len() < table_count {
+        attempts_left -= 1;
+
+        let include_partition_column = rng.random_bool(0.5);
+        let logical_expr = generate_create_logical_table_expr(
+            physical_table_ctx.clone(),
+            include_partition_column,
+            rng,
+        )?;
+        // A duplicate name only costs one attempt.
+        if logical_tables.contains_key(&logical_expr.table_name) {
+            continue;
+        }
+
+        let create_logical_sql = translator.translate(&logical_expr)?;
+        let result = sqlx::query(&create_logical_sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu {
+                sql: &create_logical_sql,
+            })?;
+        info!("Create logical table: {create_logical_sql}, result: {result:?}");
+
+        let logical_ctx = Arc::new(TableContext::from(&logical_expr));
+        create_logical_sqls.insert(logical_ctx.name.to_string(), create_logical_sql);
+        logical_tables.insert(logical_ctx.name.clone(), logical_ctx);
+    }
+
+    ensure!(
+        !logical_tables.is_empty(),
+        error::AssertSnafu {
+            reason: "No logical table created".to_string()
+        }
+    );
+
+    Ok((
+        physical_table_ctx,
+        logical_tables,
+        create_logical_sqls,
+        create_physical_sql,
+    ))
+}
+
+async fn execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> {
+    let mut delay = Duration::from_millis(100);
+    let mut attempt = 0;
+    let max_attempts = 10;
+    loop {
+        match sqlx::query(sql)
+            .persistent(false)
+            .execute(&ctx.greptime)
+            .await
+        {
+            Ok(_) => return Ok(()),
+            Err(err) => {
+                tokio::time::sleep(delay).await;
+                delay = std::cmp::min(delay * 2, Duration::from_secs(1));
+                attempt += 1;
+                warn!("Execute insert with retry: {sql}, attempt: {attempt}, error: {err:?}");
+                if attempt >= max_attempts {
+                    return Err(err).context(error::ExecuteQuerySnafu { sql });
+                }
+            }
+        }
+    }
+}
+
+/// State shared, behind a mutex, between the repartition driver and the writer task.
+struct SharedState {
+    // Clock handed to the insert generator for producing timestamp values.
+    clock: Arc<Mutex<Timestamp>>,
+    // Rows inserted so far, keyed by logical table name.
+    inserted_rows: HashMap<String, u64>,
+    // CSV dump of inserted records; nothing is recorded when `None`.
+    csv_dump_session: Option<CsvDumpSession>,
+    // SQL trace dump; nothing is recorded when `None`.
+    sql_dump_session: Option<SqlDumpSession>,
+    // The writer loop exits once it observes `false` here.
+    running: bool,
+}
+
+/// Control messages sent from the repartition driver to the writer task.
+enum WriterControl {
+    /// Pause writing and confirm through `ack`; `epoch` is only used for logging.
+    Barrier {
+        epoch: usize,
+        ack: oneshot::Sender<()>,
+    },
+    /// Continue writing after a barrier; `epoch` is only used for logging.
+    Resume {
+        epoch: usize,
+    },
+    /// Terminate the writer loop.
+    Stop,
+}
+
+/// Applies one control message to the writer's `paused` flag.
+///
+/// Returns `true` only for `WriterControl::Stop`, i.e. when the writer loop must exit.
+fn handle_writer_control(control: WriterControl, paused: &mut bool) -> bool {
+    match control {
+        WriterControl::Stop => {
+            info!("Writer received stop control");
+            return true;
+        }
+        WriterControl::Barrier { epoch, ack } => {
+            info!("Writer received barrier control, epoch: {epoch}");
+            *paused = true;
+            // A failed send means the receiver is gone; that is deliberately ignored.
+            let _ = ack.send(());
+        }
+        WriterControl::Resume { epoch } => {
+            info!("Writer received resume control, epoch: {epoch}");
+            *paused = false;
+        }
+    }
+    false
+}
+
+/// Background writer: repeatedly inserts 1 to 3 random rows into every logical table until it
+/// is stopped.
+///
+/// The loop ends when a `Stop` control arrives, when the control channel is closed while the
+/// writer is paused, or when `shared_state.running` is observed as `false`. Control messages
+/// are only examined at the top of each round, so a round over all tables is never
+/// interrupted by a barrier.
+///
+/// Every successful insert is recorded in the dump sessions (when present) and added to
+/// `inserted_rows`. Any generation, translation, insert or dump error aborts the loop.
+async fn write_loop<R: Rng + 'static>(
+    mut rng: R,
+    ctx: FuzzContext,
+    logical_tables: BTreeMap<Ident, TableContextRef>,
+    shared_state: Arc<Mutex<SharedState>>,
+    mut control_rx: mpsc::UnboundedReceiver<WriterControl>,
+) -> Result<()> {
+    info!("Start write loop");
+    let mut paused = false;
+    loop {
+        // Drain every control message that is already queued, without blocking.
+        while let Ok(control) = control_rx.try_recv() {
+            if handle_writer_control(control, &mut paused) {
+                return Ok(());
+            }
+        }
+
+        // While paused, block on the channel instead of writing; a closed channel ends the loop.
+        if paused {
+            match control_rx.recv().await {
+                Some(control) => {
+                    if handle_writer_control(control, &mut paused) {
+                        return Ok(());
+                    }
+                }
+                None => return Ok(()),
+            }
+            continue;
+        }
+
+        // Snapshot what is needed and release the lock before any await below.
+        let (running, clock) = {
+            let state = shared_state.lock().unwrap();
+            (state.running, state.clock.clone())
+        };
+        if !running {
+            break;
+        }
+
+        for table_ctx in logical_tables.values() {
+            let rows = rng.random_range(1..=3);
+            let insert_expr =
+                generate_insert_expr(rows, &mut rng, table_ctx.clone(), clock.clone())?;
+            let translator = InsertIntoExprTranslator;
+            let sql = translator.translate(&insert_expr)?;
+            let inserted = insert_expr.values_list.len() as u64;
+            let csv_records = InsertExprToCsvRecordsTranslator.translate(&insert_expr)?;
+            let table_name = table_ctx.name.to_string();
+            // All column names of the table, passed to the CSV dump as the full header list.
+            let full_headers = table_ctx
+                .columns
+                .iter()
+                .map(|column| column.name.value.clone())
+                .collect::<Vec<_>>();
+
+            let started_at_ms = current_time_millis();
+            let now = Instant::now();
+            execute_insert_with_retry(&ctx, &sql).await?;
+            let elapsed = now.elapsed();
+            info!("Execute insert sql: {sql}, elapsed: {elapsed:?}");
+
+            // The lock is taken only after the insert succeeded and is dropped at the end of
+            // this iteration, i.e. before the next insert is awaited.
+            let mut state = shared_state.lock().unwrap();
+            if let Some(csv_dump_session) = state.csv_dump_session.as_mut() {
+                csv_dump_session.append(csv_records, full_headers)?;
+            }
+            if let Some(sql_dump_session) = state.sql_dump_session.as_mut() {
+                let comment = format!(
+                    "kind=insert table={} started_at_ms={} elapsed_ms={}",
+                    table_name,
+                    started_at_ms,
+                    elapsed.as_millis()
+                );
+                sql_dump_session.append_sql(&table_name, &sql, Some(&comment))?;
+            }
+            *state.inserted_rows.entry(table_name).or_insert(0) += inserted;
+        }
+
+        // Short pause between rounds.
+        tokio::time::sleep(Duration::from_millis(100)).await;
+    }
+    info!("Write loop ended");
+
+    Ok(())
+}
+
+/// Checks every logical table against the expected number of inserted rows.
+///
+/// For each table, both `COUNT(1)` and `COUNT(DISTINCT <timestamp column>)` must equal the
+/// value recorded in `inserted_rows` (0 when the table has no entry). Each query is retried
+/// with backoff; a mismatch panics through `assert_eq!`.
+async fn validate_rows(
+    ctx: &FuzzContext,
+    logical_tables: &BTreeMap<Ident, TableContextRef>,
+    inserted_rows: &HashMap<String, u64>,
+) -> Result<()> {
+    for table_ctx in logical_tables.values() {
+        let table_key = table_ctx.name.to_string();
+        let expected = inserted_rows.get(&table_key).copied().unwrap_or(0) as usize;
+
+        // Total number of rows.
+        let count_sql = format!("SELECT COUNT(1) AS count FROM {}", table_ctx.name);
+        let run_count = || count_values(&ctx.greptime, &count_sql);
+        let count = retry_with_backoff(
+            run_count,
+            VALIDATE_QUERY_MAX_ATTEMPTS,
+            VALIDATE_QUERY_INIT_BACKOFF,
+            VALIDATE_QUERY_MAX_BACKOFF,
+        )
+        .await?;
+
+        // Number of distinct timestamps.
+        let ts_column = table_ctx.timestamp_column().unwrap();
+        let distinct_count_sql = format!(
+            "SELECT COUNT(DISTINCT {}) AS count FROM {}",
+            ts_column.name, table_ctx.name
+        );
+        let run_distinct_count = || count_values(&ctx.greptime, &distinct_count_sql);
+        let distinct_count = retry_with_backoff(
+            run_distinct_count,
+            VALIDATE_QUERY_MAX_ATTEMPTS,
+            VALIDATE_QUERY_INIT_BACKOFF,
+            VALIDATE_QUERY_MAX_BACKOFF,
+        )
+        .await?;
+
+        info!(
+            "Validate rows for table: {}, expected: {}, count: {}, distinct_count: {}",
+            table_ctx.name, expected, count.count as usize, distinct_count.count as usize
+        );
+        assert_eq!(count.count as usize, expected);
+        assert_eq!(distinct_count.count as usize, expected);
+    }
+    Ok(())
+}
+
+/// Flushes the CSV and SQL dump sessions (when present) and returns a copy of the per-table
+/// inserted-row counters, all under a single lock acquisition.
+fn flush_dump_sessions_and_snapshot(
+    shared_state: &Arc<Mutex<SharedState>>,
+) -> Result<HashMap<String, u64>> {
+    let mut guard = shared_state.lock().unwrap();
+
+    if let Some(csv_session) = &mut guard.csv_dump_session {
+        csv_session.flush_all()?;
+    }
+    if let Some(sql_session) = &mut guard.sql_dump_session {
+        sql_session.flush_all()?;
+    }
+
+    let snapshot = guard.inserted_rows.clone();
+    Ok(snapshot)
+}
+
+/// Drops every logical table and then the physical table.
+///
+/// The first failing `DROP TABLE` aborts the cleanup and is returned through
+/// `error::ExecuteQuerySnafu`.
+async fn cleanup_tables(
+    ctx: &FuzzContext,
+    physical_table_ctx: &TableContextRef,
+    logical_tables: &BTreeMap<Ident, TableContextRef>,
+) -> Result<()> {
+    let logical_drop_sqls = logical_tables
+        .values()
+        .map(|table_ctx| format!("DROP TABLE {}", table_ctx.name));
+    for drop_logical_sql in logical_drop_sqls {
+        let result = sqlx::query(&drop_logical_sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu {
+                sql: &drop_logical_sql,
+            })?;
+        info!("Drop logical table: {drop_logical_sql}, result: {result:?}");
+    }
+
+    // The physical table goes last, after all logical tables have been dropped.
+    let drop_physical_sql = format!("DROP TABLE {}", physical_table_ctx.name);
+    let result = sqlx::query(&drop_physical_sql)
+        .execute(&ctx.greptime)
+        .await
+        .context(error::ExecuteQuerySnafu {
+            sql: &drop_physical_sql,
+        })?;
+    info!("Drop physical table: {drop_physical_sql}, result: {result:?}");
+
+    Ok(())
+}
+
+/// Generates the next repartition expression for `table_ctx`.
+///
+/// A split is generated when the table has at most two partition expressions or when a fair
+/// coin flip selects it; otherwise a merge is generated. The coin is always flipped first,
+/// so the `rng` stream does not depend on the partition count.
+///
+/// Panics when `table_ctx` has no partition definition.
+fn repartition_operation<R: Rng + 'static>(
+    table_ctx: &TableContextRef,
+    rng: &mut R,
+) -> Result<RepartitionExpr> {
+    let prefer_split = rng.random_bool(0.5);
+    let partition_count = table_ctx.partition.as_ref().unwrap().exprs.len();
+    let can_merge = partition_count > 2;
+
+    if can_merge && !prefer_split {
+        let merge_generator = MergePartitionExprGeneratorBuilder::default()
+            .table_ctx(table_ctx.clone())
+            .build()
+            .unwrap();
+        return Ok(RepartitionExpr::Merge(merge_generator.generate(rng)?));
+    }
+
+    let split_generator = SplitPartitionExprGeneratorBuilder::default()
+        .table_ctx(table_ctx.clone())
+        .build()
+        .unwrap();
+    Ok(RepartitionExpr::Split(split_generator.generate(rng)?))
+}
+
+impl Arbitrary<'_> for FuzzInput {
+    /// Builds a `FuzzInput` from the fuzzer-provided bytes and the fuzz overrides.
+    ///
+    /// Each field can be pinned through `get_fuzz_override` (`SEED`, `PARTITIONS`, `TABLES`,
+    /// `ACTIONS`). Without an override, the seed is read from `u` and the other fields are
+    /// derived from a `ChaChaRng` seeded with it: partitions in `2..8`, tables in
+    /// `1..=max(1, max_tables)`, actions in `1..max(2, max_actions)`.
+    fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
+        // Only consume fuzzer input when the seed is not overridden.
+        let seed = match get_fuzz_override::<u64>("SEED") {
+            Some(seed) => seed,
+            None => u.int_in_range(u64::MIN..=u64::MAX)?,
+        };
+        let mut rng = ChaChaRng::seed_from_u64(seed);
+        let partitions =
+            get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
+        let max_tables = get_gt_fuzz_input_max_tables();
+        let tables = get_fuzz_override::<usize>("TABLES")
+            .unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables)));
+        let max_actions = get_gt_fuzz_input_max_alter_actions();
+        // `random_range` panics on an empty range. Clamping the exclusive upper bound to 2
+        // keeps the range non-empty for `max_actions <= 1` and unchanged for larger limits.
+        let actions = get_fuzz_override::<usize>("ACTIONS")
+            .unwrap_or_else(|| rng.random_range(1..std::cmp::max(2, max_actions)));
+
+        Ok(FuzzInput {
+            seed,
+            actions,
+            partitions,
+            tables,
+        })
+    }
+}
+
+async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) -> Result<()> {
+    info!("input: {input:?}");
+    let mut rng = ChaChaRng::seed_from_u64(input.seed);
+    let clock = Arc::new(Mutex::new(Timestamp::current_millis()));
+
+    let (mut physical_table_ctx, logical_tables, create_logical_sqls, create_physical_sql) =
+        create_metric_tables(&ctx, &mut rng, input.partitions, input.tables).await?;
+
+    let mut inserted_rows = HashMap::with_capacity(logical_tables.len());
+    for table_ctx in logical_tables.values() {
+        inserted_rows.insert(table_ctx.name.to_string(), 0);
+    }
+    let csv_dump_session = CsvDumpSession::new(CsvDumpMetadata::new(
+        "fuzz_repartition_metric_table",
+        input.seed,
+        input.actions,
+        input.partitions,
+        input.tables,
+    ))?;
+    let sql_dump_session = SqlDumpSession::new(csv_dump_session.run_dir.clone())?;
+    let logical_table_names = logical_tables
+        .values()
+        .map(|table_ctx| table_ctx.name.to_string())
+        .collect::<Vec<_>>();
+
+    let mut sql_dump_session = sql_dump_session;
+    sql_dump_session.append_sql(
+        &physical_table_ctx.name.to_string(),
+        &create_physical_sql,
+        Some("kind=create_physical_table"),
+    )?;
+    for table_name in &logical_table_names {
+        if let Some(create_sql) = create_logical_sqls.get(table_name) {
+            sql_dump_session.append_sql(
+                table_name,
+                create_sql,
+                Some("kind=create_logical_table"),
+            )?;
+        }
+    }
+
+    let shared_state = Arc::new(Mutex::new(SharedState {
+        clock,
+        inserted_rows,
+        csv_dump_session: Some(csv_dump_session),
+        sql_dump_session: Some(sql_dump_session),
+        running: true,
+    }));
+    let writer_rng = ChaChaRng::seed_from_u64(input.seed ^ 0xA5A5_A5A5_A5A5_A5A5);
+    let (control_tx, control_rx) = mpsc::unbounded_channel::<WriterControl>();
+    let writer_task = tokio::spawn(write_loop(
+        writer_rng,
+        ctx.clone(),
+        logical_tables.clone(),
+        shared_state.clone(),
+        control_rx,
+    ));
+    tokio::time::sleep(Duration::from_millis(100)).await;
+
+    for i in 0..input.actions {
+        let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len();
+        info!(
+            "partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}",
+            i + 1,
+            input.actions,
+            physical_table_ctx.name,
+            logical_tables.len()
+        );
+
+        let repartition_expr = repartition_operation(&physical_table_ctx, &mut rng)?;
+        let translator = RepartitionExprTranslator;
+        let sql = translator.translate(&repartition_expr)?;
+        info!("Repartition sql: {sql}");
+        let started_at_ms = current_time_millis();
+        let now = Instant::now();
+        let result = sqlx::query(&sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu { sql: &sql })?;
+        let elapsed = now.elapsed();
+        info!("Repartition result: {result:?}, elapsed: {elapsed:?}");
+
+        physical_table_ctx = Arc::new(
+            Arc::unwrap_or_clone(physical_table_ctx)
+                .repartition(repartition_expr)
+                .unwrap(),
+        );
+
+        let partition_entries = tests_fuzz::validator::partition::fetch_partitions_info_schema(
+            &ctx.greptime,
+            "public".into(),
+            &physical_table_ctx.name,
+        )
+        .await?;
+        tests_fuzz::validator::partition::assert_partitions(
+            physical_table_ctx.partition.as_ref().unwrap(),
+            &partition_entries,
+        )?;
+
+        {
+            let mut state = shared_state.lock().unwrap();
+            if let Some(sql_dump_session) = state.sql_dump_session.as_mut() {
+                let repartition_comment = format!(
+                    "kind=repartition table={} action_idx={} started_at_ms={} elapsed_ms={}",
+                    physical_table_ctx.name,
+                    i + 1,
+                    started_at_ms,
+                    elapsed.as_millis()
+                );
+                sql_dump_session.append_sql(
+                    &physical_table_ctx.name.to_string(),
+                    &sql,
+                    Some(&repartition_comment),
+                )?;
+                let event = format!(
+                    "repartition action_idx={} started_at_ms={} elapsed_ms={} sql={}",
+                    i + 1,
+                    started_at_ms,
+                    elapsed.as_millis(),
+                    sql
+                );
+                sql_dump_session.broadcast_event(logical_table_names.iter(), &event, &sql)?;
+            }
+        }
+
+        let (ack_tx, ack_rx) = oneshot::channel();
+        control_tx
+            .send(WriterControl::Barrier {
+                epoch: i + 1,
+                ack: ack_tx,
+            })
+            .expect("barrier control send must succeed");
+        tokio::time::timeout(Duration::from_secs(BARRIER_ACK_TIMEOUT_SECS), ack_rx)
+            .await
+            .expect("barrier ack timeout")
+            .expect("barrier ack dropped");
+
+        let inserted_rows_snapshot = flush_dump_sessions_and_snapshot(&shared_state)?;
+        info!("validate rows, epoch: {}", i + 1);
+        validate_rows(&ctx, &logical_tables, &inserted_rows_snapshot).await?;
+
+        control_tx
+            .send(WriterControl::Resume { epoch: i + 1 })
+            .expect("resume control send must succeed");
+    }
+
+    let _ = control_tx.send(WriterControl::Stop);
+    shared_state.lock().unwrap().running = false;
+    writer_task.await.unwrap().unwrap();
+    let inserted_rows = flush_dump_sessions_and_snapshot(&shared_state)?;
+    let (mut csv_dump_session, mut sql_dump_session) = {
+        let mut state = shared_state.lock().unwrap();
+        (state.csv_dump_session.take(), state.sql_dump_session.take())
+    };
+
+    let run_result = async {
+        validate_rows(&ctx, &logical_tables, &inserted_rows).await?;
+        cleanup_tables(&ctx, &physical_table_ctx, &logical_tables).await?;
+        Ok(())
+    }
+    .await;
+
+    if let Some(csv_dump_session) = csv_dump_session.take() {
+        match &run_result {
+            Ok(_) => {
+                if let Err(err) = csv_dump_session.cleanup_on_success() {
+                    warn!(
+                        "Cleanup csv dump directory failed, path: {}, error: {:?}",
+                        csv_dump_session.run_dir.display(),
+                        err
+                    );
+                }
+            }
+            Err(_) => {
+                warn!(
+                    "Keep csv dump directory for failure analysis, path: {}",
+                    csv_dump_session.run_dir.display()
+                );
+            }
+        }
+    }
+    if let Some(sql_dump_session) = sql_dump_session.take()
+        && run_result.is_err()
+    {
+        warn!(
+            "Keep sql dump directory for failure analysis, path: {}",
+            sql_dump_session.run_dir.display()
+        );
+    }
+
+    ctx.close().await;
+    run_result
+}
+
+fuzz_target!(|input: FuzzInput| {
+    common_telemetry::init_default_ut_logging();
+    common_runtime::block_on_global(async {
+        // Panics (failing the fuzz run) if the connection is missing or the scenario errors.
+        let Connections { mysql } = init_greptime_connections_via_env().await;
+        let greptime = mysql.expect("mysql connection init must be succeed");
+        let ctx = FuzzContext { greptime };
+        if let Err(err) = execute_repartition_metric_table(ctx, input).await {
+            panic!("fuzz test must be succeed: {err:?}");
+        }
+    })
+});
diff --git a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs
index c8ebbb54af..17cbfb9251 100644
--- a/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs
+++ b/tests-fuzz/targets/migration/fuzz_migrate_mito_regions.rs
@@ -261,13 +261,18 @@ async fn migrate_regions(ctx: &FuzzContext, migrations: &[Migration]) -> Result<
                     {
                         let output = procedure_state(&greptime, &procedure_id).await;
                         info!("Checking procedure: {procedure_id}, output: {output}");
-                        (fetch_partition(&greptime, region_id).await.unwrap(), output)
+                        (fetch_partition(&greptime, region_id).await.ok(), output)
                     }
                 })
             },
             |(partition, output)| {
-                info!("Region: {region_id},  datanode: {}", partition.datanode_id);
-                partition.datanode_id == migration.to_peer && output.contains("Done")
+                if let Some(partition) = partition {
+                    info!("Region: {region_id},  datanode: {}", partition.datanode_id);
+                    partition.datanode_id == migration.to_peer && output.contains("Done")
+                } else {
+                    info!("Region: {region_id}, partition not found yet");
+                    false
+                }
             },
             Duration::from_secs(5),
         )
diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml
index 0c6b965fd3..ec35205a55 100644
--- a/tests-integration/Cargo.toml
+++ b/tests-integration/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 license.workspace = true
 
 [features]
-dashboard = []
+dashboard = ["servers/dashboard"]
 vector_index = []
 
 [lints]
diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs
index fd0d1ef3c4..2bf6e812c7 100644
--- a/tests-integration/src/test_util.rs
+++ b/tests-integration/src/test_util.rs
@@ -534,6 +534,7 @@ pub async fn setup_test_http_app_with_frontend_and_custom_options(
         .with_influxdb_handler(instance.fe_instance().clone())
         .with_otlp_handler(instance.fe_instance().clone(), true)
         .with_jaeger_handler(instance.fe_instance().clone())
+        .with_dashboard_handler(instance.fe_instance().clone())
         .with_greptime_config_options(instance.opts.to_toml().unwrap());
 
     if let Some(user_provider) = user_provider {
diff --git a/tests-integration/src/tests/promql_test.rs b/tests-integration/src/tests/promql_test.rs
index 7fbce91ea6..ede4663118 100644
--- a/tests-integration/src/tests/promql_test.rs
+++ b/tests-integration/src/tests/promql_test.rs
@@ -15,7 +15,9 @@
 use std::sync::Arc;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::util::collect_batches;
+use datatypes::arrow::array::{Float64Array, Int64Array};
 use frontend::instance::Instance;
 use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
 use rstest::rstest;
@@ -151,6 +153,103 @@ async fn create_insert_tql_assert(
     check_unordered_output_stream(query_output, expected).await;
 }
 
+/// Runs `sql` through `do_query` and unwraps every returned result, panicking
+/// on the first one that is an error.
+async fn execute_all(instance: &Arc<Instance>, sql: &str, query_ctx: Arc<QueryContext>) {
+    let results = instance.do_query(sql, query_ctx).await;
+    for result in results {
+        // The successful output itself is not needed; it is dropped right away.
+        let _ = result.unwrap();
+    }
+}
+
+/// Evaluates `promql` via `promql_query` and returns the output as record batches,
+/// collecting a streamed output first. Panics on query or collection errors.
+#[allow(clippy::too_many_arguments)]
+async fn promql_query_as_batches(
+    ins: Arc<Instance>,
+    promql: &str,
+    alias: Option<String>,
+    query_ctx: Arc<QueryContext>,
+    start: SystemTime,
+    end: SystemTime,
+    interval: Duration,
+    lookback: Duration,
+) -> common_recordbatch::RecordBatches {
+    let pending = promql_query(
+        ins, promql, alias, query_ctx, start, end, interval, lookback,
+    );
+    match pending.await.unwrap().data {
+        OutputData::RecordBatches(batches) => batches,
+        OutputData::Stream(stream) => collect_batches(stream).await.unwrap(),
+        _ => unreachable!(),
+    }
+}
+
+const ANON_PROMQL_RATIO_REPRO_DB: &str = "repro_db"; // database referenced by `__schema__` in the queries below
+
+const ANON_PROMQL_RATIO_REPRO_CREATE: &str = r#"
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+"#; // DDL: physical metric table `phy` plus logical tables `metric_a` and `metric_b` on it
+
+const ANON_PROMQL_RATIO_REPRO_INSERT: &str = r#"
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+"#; // every series gets samples at t = 1, 180000 and 360000
+
+const ANON_PROMQL_RATIO_REPRO_NUMERATOR: &str = r#"count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50))"#; // count of rate(metric_a) / metric_b series above 0.50
+
+const ANON_PROMQL_RATIO_REPRO_DENOMINATOR: &str =
+    r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))"#; // count of all selected rate(metric_a) series
+
+const ANON_PROMQL_RATIO_REPRO_WHOLE: &str = r#"(count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))) * 100"#; // (numerator / denominator) * 100 as one expression
+
+const ANON_PROMQL_RATIO_REPRO_SCALAR_DIV: &str =
+    r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m])) / 2"#; // the denominator expression divided by the scalar 2
+
 #[apply(both_instances_cases)]
 async fn sql_insert_tql_query_ceil(instance: Arc<dyn MockInstance>) {
     let instance = instance.frontend();
@@ -709,3 +808,140 @@ async fn cross_schema_query(instance: Arc<dyn MockInstance>) {
 
     check_unordered_output_stream(query_output, expected).await;
 }
+
+#[apply(both_instances_cases)]
+async fn anon_promql_ratio_repro(instance: Arc<dyn MockInstance>) { // checks a PromQL count ratio against its separately evaluated parts
+    let ins = instance.frontend();
+
+    execute_all(
+        &ins,
+        &format!("CREATE DATABASE {ANON_PROMQL_RATIO_REPRO_DB}"),
+        QueryContext::arc(),
+    )
+    .await;
+
+    let repro_ctx: Arc<QueryContext> =
+        QueryContext::with_db_name(Some(ANON_PROMQL_RATIO_REPRO_DB)).into(); // context bound to the repro database
+    execute_all(&ins, ANON_PROMQL_RATIO_REPRO_CREATE, repro_ctx.clone()).await;
+    execute_all(&ins, ANON_PROMQL_RATIO_REPRO_INSERT, repro_ctx).await;
+
+    let start = UNIX_EPOCH.checked_add(Duration::from_secs(180)).unwrap(); // 180s after the epoch
+    let end = UNIX_EPOCH.checked_add(Duration::from_secs(360)).unwrap(); // 360s after the epoch
+    let interval = Duration::from_secs(180);
+    let lookback = Duration::from_secs(1);
+
+    let numerator = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_NUMERATOR,
+        Some("num".to_string()), // alias; read back below as column "num"
+        QueryContext::arc(), // default context; the query text names the database via `__schema__`
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let denominator = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_DENOMINATOR,
+        Some("den".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let whole = promql_query_as_batches(
+        ins.clone(),
+        ANON_PROMQL_RATIO_REPRO_WHOLE,
+        Some("pct".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+    let scalar_div = promql_query_as_batches(
+        ins,
+        ANON_PROMQL_RATIO_REPRO_SCALAR_DIV,
+        Some("half_den".to_string()),
+        QueryContext::arc(),
+        start,
+        end,
+        interval,
+        lookback,
+    )
+    .await;
+
+    let numerator = numerator.iter().collect::<Vec<_>>(); // shadowed by a Vec of batches so `[0]` can be used below
+    let denominator = denominator.iter().collect::<Vec<_>>();
+    let whole = whole.iter().collect::<Vec<_>>();
+    let scalar_div = scalar_div.iter().collect::<Vec<_>>();
+
+    let numerator_values = numerator[0]
+        .column_by_name("num")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Int64Array>() // the bare counts are read as Int64
+        .unwrap();
+    let denominator_values = denominator[0]
+        .column_by_name("den")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Int64Array>()
+        .unwrap();
+    let percentage_values = whole[0]
+        .column_by_name("pct")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Float64Array>() // the divided results are read as Float64
+        .unwrap();
+    let scalar_div_values = scalar_div[0]
+        .column_by_name("half_den")
+        .unwrap()
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .unwrap();
+
+    assert_eq!(numerator_values.len(), 1, "{}", numerator[0].pretty_print());
+    assert_eq!(
+        denominator_values.len(),
+        1,
+        "{}",
+        denominator[0].pretty_print()
+    );
+    assert_eq!(percentage_values.len(), 1, "{}", whole[0].pretty_print());
+    assert_eq!(
+        scalar_div_values.len(),
+        1,
+        "{}",
+        scalar_div[0].pretty_print()
+    );
+
+    assert_eq!(
+        numerator_values.value(0),
+        1,
+        "{}",
+        numerator[0].pretty_print()
+    );
+    assert_eq!(
+        denominator_values.value(0),
+        3,
+        "{}",
+        denominator[0].pretty_print()
+    );
+    assert!(
+        (scalar_div_values.value(0) - 1.5).abs() < 1e-9, // 3 / 2
+        "{}",
+        scalar_div[0].pretty_print()
+    );
+
+    let expected = 100.0 / 3.0; // numerator (1) / denominator (3) * 100
+    assert!(
+        (percentage_values.value(0) - expected).abs() < 1e-9,
+        "{}",
+        whole[0].pretty_print()
+    );
+}
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index 68fa2a228d..7ae59ae9fc 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -106,6 +106,7 @@ macro_rules! http_tests {
                 test_config_api,
                 test_dynamic_tracer_toggle,
                 test_dashboard_path,
+                test_dashboard_api,
                 test_prometheus_remote_write,
                 test_prometheus_remote_special_labels,
                 test_prometheus_remote_schema_labels,
@@ -147,6 +148,7 @@ macro_rules! http_tests {
                 test_jaeger_query_api_for_trace_v1,
 
                 test_influxdb_write,
+                test_influxdb_write_with_hints,
                 test_http_memory_limit,
             );
         )*
@@ -1640,6 +1642,7 @@ fn drop_lines_with_inconsistent_results(input: String) -> String {
         "metadata_cache_size =",
         "content_cache_size =",
         "result_cache_size =",
+        "range_result_cache_size =",
         "name =",
         "recovery_parallelism =",
         "max_background_index_builds =",
@@ -1720,6 +1723,121 @@ pub async fn test_dashboard_path(store_type: StorageType) {
 #[cfg(not(feature = "dashboard"))]
 pub async fn test_dashboard_path(_: StorageType) {}
 
+#[cfg(feature = "dashboard")]
+pub async fn test_dashboard_api(store_type: StorageType) { // save/list/update/delete round trip over /v1/dashboards
+    common_telemetry::init_default_ut_logging();
+    let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "dashboard_api").await;
+    let client = TestClient::new(app).await;
+
+    // 1. List dashboards - should be empty initially
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert!(dashboards.is_empty());
+
+    // 2. Save a dashboard - the response is expected to list only the saved entry
+    let dashboard_definition = r#"{"title": "My Dashboard", "panels": []}"#;
+    let res = client
+        .post("/v1/dashboards/test_dashboard")
+        .body(dashboard_definition)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard");
+
+    // 3. Save another dashboard
+    let res = client
+        .post("/v1/dashboards/another_dashboard")
+        .body(r#"{"title": "Another Dashboard"}"#)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    // 4. List dashboards - should have 2
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 2);
+
+    let names: Vec<&str> = dashboards
+        .iter()
+        .map(|d| d.get("name").unwrap().as_str().unwrap())
+        .collect();
+    assert!(names.contains(&"test_dashboard")); // membership checks: no listing order is assumed
+    assert!(names.contains(&"another_dashboard"));
+
+    // 5. Update a dashboard by posting again under the same name with a new definition
+    let updated_definition = r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"#;
+    let res = client
+        .post("/v1/dashboards/test_dashboard")
+        .body(updated_definition)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1); // the response carries just the updated entry, not the full list
+    assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard");
+
+    // Verify the definition was updated by listing again (still 2 dashboards in total)
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 2);
+
+    // Find test_dashboard and verify it has updated definition
+    let test_db = dashboards
+        .iter()
+        .find(|d| d.get("name").unwrap() == "test_dashboard")
+        .unwrap();
+    assert_eq!(
+        test_db.get("definition").unwrap(),
+        r#"{"title": "Updated Dashboard", "panels": [{"id": 1}]}"# // compared as the exact string that was posted
+    );
+
+    // 6. Delete one dashboard - the response is expected to echo the deleted entry
+    let res = client.delete("/v1/dashboards/test_dashboard").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "test_dashboard");
+
+    // 7. List dashboards - should have 1
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert_eq!(dashboards.len(), 1);
+    assert_eq!(dashboards[0].get("name").unwrap(), "another_dashboard");
+
+    // 8. Delete the remaining dashboard
+    let res = client
+        .delete("/v1/dashboards/another_dashboard")
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    // 9. List dashboards - should be empty
+    let res = client.get("/v1/dashboards").send().await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let body: Value = res.json().await;
+    let dashboards = body.get("dashboards").unwrap().as_array().unwrap();
+    assert!(dashboards.is_empty());
+
+    guard.remove_all().await;
+}
+
+#[cfg(not(feature = "dashboard"))]
+pub async fn test_dashboard_api(_: StorageType) {} // no-op stand-in when the `dashboard` feature is disabled
+
 pub async fn test_prometheus_remote_write(store_type: StorageType) {
     common_telemetry::init_default_ut_logging();
     let (app, mut guard) =
@@ -3522,6 +3640,43 @@ transform:
     guard.remove_all().await;
 }
 
+pub async fn test_influxdb_write_with_hints(storage_type: StorageType) { // hints sent on an InfluxDB write must show up as table options
+    common_telemetry::init_default_ut_logging();
+    let (app, mut guard) =
+        setup_test_http_app_with_frontend(storage_type, "test_influxdb_write_with_hints").await;
+
+    let client = TestClient::new(app).await;
+
+    let result = client
+        .post("/v1/influxdb/write?db=public")
+        .header("x-greptime-hints", "sst_format=flat,ttl=30d,skip_wal=true") // comma-separated key=value hints
+        .body("sst_fmt_table,host=host1 cpu=1.2 1664370459457010101")
+        .send()
+        .await;
+    assert_eq!(result.status(), 204); // HTTP 204 No Content
+
+    let res = client
+        .get("/v1/sql?sql=show create table sst_fmt_table") // the table is never created explicitly in this test
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+    let resp = res.text().await;
+    assert!(
+        resp.contains("sst_format = 'flat'"),
+        "expected sst_format = 'flat' in SHOW CREATE TABLE output, got: {resp}"
+    );
+    assert!(
+        resp.contains("ttl = '30days'"), // the `30d` hint is expected to be rendered as `30days`
+        "expected ttl = '30days' in SHOW CREATE TABLE output, got: {resp}"
+    );
+    assert!(
+        resp.contains("skip_wal = 'true'"),
+        "expected skip_wal = 'true' in SHOW CREATE TABLE output, got: {resp}"
+    );
+
+    guard.remove_all().await;
+}
+
 /// Test one-to-many VRL pipeline expansion.
 /// This test verifies that a VRL processor can return an array, which results in
 /// multiple output rows from a single input row.
diff --git a/tests/cases/distributed/explain/step_aggr_advance.result b/tests/cases/distributed/explain/step_aggr_advance.result
index 4bd83b7afa..5938fa202d 100644
--- a/tests/cases/distributed/explain/step_aggr_advance.result
+++ b/tests/cases/distributed/explain/step_aggr_advance.result
@@ -442,54 +442,54 @@ Affected Rows: 0
 -- SQLNESS REPLACE (Hash.*) REDACTED
 tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
 
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan  | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
-|               |   Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp                                                                                                                                                                                                                                                                                                                                                      |
-|               |     MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                          |
-|               |       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               | SubqueryAlias: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               |   Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                 |
-|               |     Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                       |
-|               |       Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-|               |         Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d                                                                                                                                                                                                                                                           |
-|               |           PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |             PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               |               PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
-|               |                 Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                          |
-|               |                   Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                       |
-|               |                     TableScan: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
-|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|               |     SubqueryAlias: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |       Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                     |
-|               |         Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                           |
-|               |           Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
-|               |             Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c                                                                                                                                                                                                                                              |
-|               |               PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                          |
-|               |                 PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-|               |                   PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
-|               |                     Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                        |
-|               |                       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                             |
-|               |   TableScan: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
-|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                               |
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan  | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, CAST(aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) / CAST(aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
+|               |   Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |     MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               | SubqueryAlias: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               |   Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|               |     Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |       Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+|               |         Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d                                                                                                                                                                                                                                                                                             |
+|               |           PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |             PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               |               PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 |
+|               |                 Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |                   Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |                     TableScan: aggr_optimize_not                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
+|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |     SubqueryAlias: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |       Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+|               |         Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]]                                                                                                                                                                                                                                                                                                                                                                             |
+|               |           Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
+|               |             Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c                                                                                                                                                                                                                                                                                |
+|               |               PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"]                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
+|               |                 PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+|               |                   PromSeriesDivide: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+|               |                     Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST                                                                                                                                                                                                                                                                                                                                                                          |
+|               |                       MergeScan [is_placeholder=false, remote_input=[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None)                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+|               |   TableScan: aggr_optimize_not_count                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
+|               | ]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                 |
 |               |   REDACTED
-|               |     CoalescePartitionsExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
-|               |       AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                                                                                                                                                                                                                                                                            |
-|               |         FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               |           ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c]                                                                                                                                                                                                                                                                                                                 |
-|               |             PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp]                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
-|               |               PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-|               |                 PromSeriesDivideExec: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-|               |                   SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+|               |     CoalescePartitionsExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
+|               |       AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]                                                                                                                                                                                                                                                                                                                                                                                                              |
+|               |         FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               |           ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c]                                                                                                                                                                                                                                                                                                                                                   |
+|               |             PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
+|               |               PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
+|               |                 PromSeriesDivideExec: tags=["a", "b", "c", "d"]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
+|               |                   SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
 |               |                     MergeScanExec: REDACTED
-|               |     SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-|               |       CooperativeExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|               |     SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        |
+|               |       CooperativeExec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
 |               |         MergeScanExec: REDACTED
-|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         |
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           |
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 
 -- SQLNESS REPLACE (metrics.*) REDACTED
 -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
diff --git a/tests/cases/standalone/common/alter/alter_database.result b/tests/cases/standalone/common/alter/alter_database.result
index 911ef5ddfc..2fccce10de 100644
--- a/tests/cases/standalone/common/alter/alter_database.result
+++ b/tests/cases/standalone/common/alter/alter_database.result
@@ -314,6 +314,85 @@ SHOW CREATE DATABASE alter_database;
 |                | )                                            |
 +----------------+----------------------------------------------+
 
+-- Test sst_format option
+ALTER DATABASE alter_database SET 'sst_format'='flat';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
++----------------+----------------------------------------------+
+| Database       | Create Database                              |
++----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs',                |
+|                |   sst_format = 'flat'                        |
+|                | )                                            |
++----------------+----------------------------------------------+
+
+USE alter_database;
+
+Affected Rows: 0
+
+CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
+
+Affected Rows: 0
+
+SHOW CREATE TABLE monitor;
+
++---------+----------------------------------------+
+| Table   | Create Table                           |
++---------+----------------------------------------+
+| monitor | CREATE TABLE IF NOT EXISTS "monitor" ( |
+|         |   "ts" TIMESTAMP(3) NOT NULL,          |
+|         |   TIME INDEX ("ts")                    |
+|         | )                                      |
+|         |                                        |
+|         | ENGINE=mito                            |
+|         | WITH(                                  |
+|         |   sst_format = 'flat'                  |
+|         | )                                      |
++---------+----------------------------------------+
+
+USE public;
+
+Affected Rows: 0
+
+ALTER DATABASE alter_database SET 'sst_format'='primary_key';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
++----------------+----------------------------------------------+
+| Database       | Create Database                              |
++----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs',                |
+|                |   sst_format = 'primary_key'                 |
+|                | )                                            |
++----------------+----------------------------------------------+
+
+ALTER DATABASE alter_database UNSET 'sst_format';
+
+Affected Rows: 0
+
+SHOW CREATE DATABASE alter_database;
+
++----------------+----------------------------------------------+
+| Database       | Create Database                              |
++----------------+----------------------------------------------+
+| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
+|                | WITH(                                        |
+|                |   'compaction.twcs.time_window' = '30m',     |
+|                |   'compaction.type' = 'twcs'                 |
+|                | )                                            |
++----------------+----------------------------------------------+
+
 DROP DATABASE alter_database;
 
 Affected Rows: 0
diff --git a/tests/cases/standalone/common/alter/alter_database.sql b/tests/cases/standalone/common/alter/alter_database.sql
index 1b2f75637a..33b309153e 100644
--- a/tests/cases/standalone/common/alter/alter_database.sql
+++ b/tests/cases/standalone/common/alter/alter_database.sql
@@ -90,5 +90,25 @@ ALTER DATABASE alter_database UNSET 'ttl';
 
 SHOW CREATE DATABASE alter_database;
 
-DROP DATABASE alter_database;
+-- Test sst_format option
+ALTER DATABASE alter_database SET 'sst_format'='flat';
 
+SHOW CREATE DATABASE alter_database;
+
+USE alter_database;
+
+CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
+
+SHOW CREATE TABLE monitor;
+
+USE public;
+
+ALTER DATABASE alter_database SET 'sst_format'='primary_key';
+
+SHOW CREATE DATABASE alter_database;
+
+ALTER DATABASE alter_database UNSET 'sst_format';
+
+SHOW CREATE DATABASE alter_database;
+
+DROP DATABASE alter_database;
diff --git a/tests/cases/standalone/common/flow/flow_tql_avg.result b/tests/cases/standalone/common/flow/flow_tql_avg.result
new file mode 100644
index 0000000000..8438f41eb6
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_tql_avg.result
@@ -0,0 +1,126 @@
+CREATE TABLE sensor_readings (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    loc STRING,
+    PRIMARY KEY (sensor, loc)
+);
+
+Affected Rows: 0
+
+CREATE TABLE sensor_readings_avg (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    PRIMARY KEY (sensor)
+);
+
+Affected Rows: 0
+
+INSERT INTO sensor_readings VALUES
+    (20, now() - '30s'::interval, 'test', 'A');
+
+Affected Rows: 1
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
++-------+--------+---------------------+
+| value | sensor | ts                  |
++-------+--------+---------------------+
+| 20.0  | test   | TS |
++-------+--------+---------------------+
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
++-------+--------+---------------------+
+| value | sensor | ts                  |
++-------+--------+---------------------+
+| 20.0  | test   | TS |
++-------+--------+---------------------+
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+Affected Rows: 0
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
++----------------------------------------------+
+| ADMIN FLUSH_FLOW('sensor_readings_avg_flow') |
++----------------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------------+
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
++-------+---------------------+--------+
+| value | ts                  | sensor |
++-------+---------------------+--------+
+| 20.0  | TS | test   |
++-------+---------------------+--------+
+
+DROP FLOW sensor_readings_avg_flow;
+
+Affected Rows: 0
+
+-- SQLNESS SLEEP 1s
+INSERT INTO sensor_readings VALUES
+    (30, now() - '40s'::interval, 'test', 'B');
+
+Affected Rows: 1
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
++-------+--------+---------------------+
+| value | sensor | ts                  |
++-------+--------+---------------------+
+| 25.0  | test   | TS |
++-------+--------+---------------------+
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
+Affected Rows: 0
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
++----------------------------------------------+
+| ADMIN FLUSH_FLOW('sensor_readings_avg_flow') |
++----------------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------------+
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
++-------+---------------------+--------+
+| value | ts                  | sensor |
++-------+---------------------+--------+
+| 25.0  | TS | test   |
++-------+---------------------+--------+
+
+DROP FLOW sensor_readings_avg_flow;
+
+Affected Rows: 0
+
+DROP TABLE sensor_readings_avg;
+
+Affected Rows: 0
+
+DROP TABLE sensor_readings;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/flow/flow_tql_avg.sql b/tests/cases/standalone/common/flow/flow_tql_avg.sql
new file mode 100644
index 0000000000..a5d6ab9d2b
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_tql_avg.sql
@@ -0,0 +1,63 @@
+CREATE TABLE sensor_readings (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    loc STRING,
+    PRIMARY KEY (sensor, loc)
+);
+
+CREATE TABLE sensor_readings_avg (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    PRIMARY KEY (sensor)
+);
+
+INSERT INTO sensor_readings VALUES
+    (20, now() - '30s'::interval, 'test', 'A');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
+DROP FLOW sensor_readings_avg_flow;
+
+-- SQLNESS SLEEP 1s
+INSERT INTO sensor_readings VALUES
+    (30, now() - '40s'::interval, 'test', 'B');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (sensor_readings) AS value;
+
+
+CREATE FLOW sensor_readings_avg_flow
+SINK TO sensor_readings_avg
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m') (sum by(sensor) (sensor_readings) / count by(sensor) (sensor_readings)) AS value;
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('sensor_readings_avg_flow');
+
+-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) TS
+SELECT * FROM sensor_readings_avg ORDER BY ts DESC LIMIT 1;
+
+DROP FLOW sensor_readings_avg_flow;
+
+DROP TABLE sensor_readings_avg;
+DROP TABLE sensor_readings;
diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.result b/tests/cases/standalone/common/prepare/mysql_prepare.result
index abc267b50e..5ef242a891 100644
--- a/tests/cases/standalone/common/prepare/mysql_prepare.result
+++ b/tests/cases/standalone/common/prepare/mysql_prepare.result
@@ -42,7 +42,7 @@ affected_rows: 0
 -- SQLNESS PROTOCOL MYSQL
 EXECUTE stmt USING 'a';
 
-Failed to execute query, err: MySqlError { ERROR 1815 (HY000): (EngineExecuteQuery): Cast error: Cannot cast string 'a' to value of Int32 type }
+Failed to execute query, err: MySqlError { ERROR 1210 (HY000): (InvalidArguments): Invalid request parameter: Unable to convert a to datatype Int32(Int32Type) }
 
 -- SQLNESS PROTOCOL MYSQL
 DEALLOCATE stmt;
@@ -124,6 +124,25 @@ DEALLOCATE stmt;
 
 affected_rows: 0
 
+-- SQLNESS PROTOCOL MYSQL
+PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?';
+
+affected_rows: 0
+
+-- SQLNESS PROTOCOL MYSQL
+EXECUTE stmt USING 'cake';
+
++------------+--------------+
+| table_name | table_schema |
++------------+--------------+
+| cake       | public       |
++------------+--------------+
+
+-- SQLNESS PROTOCOL MYSQL
+DEALLOCATE stmt;
+
+affected_rows: 0
+
 -- SQLNESS PROTOCOL MYSQL
 DROP TABLE cake;
 
diff --git a/tests/cases/standalone/common/prepare/mysql_prepare.sql b/tests/cases/standalone/common/prepare/mysql_prepare.sql
index 8e80a0a867..e96e945f88 100644
--- a/tests/cases/standalone/common/prepare/mysql_prepare.sql
+++ b/tests/cases/standalone/common/prepare/mysql_prepare.sql
@@ -72,5 +72,14 @@ EXECUTE stmt USING 'happy', 42, 0;
 -- SQLNESS PROTOCOL MYSQL
 DEALLOCATE stmt;
 
+-- SQLNESS PROTOCOL MYSQL
+PREPARE stmt FROM 'SELECT table_name, table_schema FROM information_schema.tables WHERE table_name = ?';
+
+-- SQLNESS PROTOCOL MYSQL
+EXECUTE stmt USING 'cake';
+
+-- SQLNESS PROTOCOL MYSQL
+DEALLOCATE stmt;
+
 -- SQLNESS PROTOCOL MYSQL
 DROP TABLE cake;
diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
new file mode 100644
index 0000000000..ab3c4db715
--- /dev/null
+++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.result
@@ -0,0 +1,106 @@
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+Affected Rows: 0
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+Affected Rows: 0
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+Affected Rows: 0
+
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+Affected Rows: 9
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+
+Affected Rows: 6
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+
++---------------------+-------------------------------------------------------------------+
+| t                   | count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) |
++---------------------+-------------------------------------------------------------------+
+| 1970-01-01T00:03:00 | 1                                                                 |
++---------------------+-------------------------------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+
++---------------------+---------------------------------------------+
+| t                   | count(prom_rate(t_range,v,t,Int64(180000))) |
++---------------------+---------------------------------------------+
+| 1970-01-01T00:03:00 | 3                                           |
++---------------------+---------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+
++---------------------+----------------------------------------------------------+
+| t                   | count(prom_rate(t_range,v,t,Int64(180000))) / Float64(2) |
++---------------------+----------------------------------------------------------+
+| 1970-01-01T00:03:00 | 1.5                                                      |
++---------------------+----------------------------------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+
++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+| t                   | metric_b.count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) / metric_a.count(prom_rate(t_range,v,t,Int64(180000))) * Float64(100) |
++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+| 1970-01-01T00:03:00 | 33.33333333333333                                                                                                                                |
++---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+
+DROP TABLE metric_a;
+
+Affected Rows: 0
+
+DROP TABLE metric_b;
+
+Affected Rows: 0
+
+DROP TABLE phy;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql
new file mode 100644
index 0000000000..946d4f93a1
--- /dev/null
+++ b/tests/cases/standalone/common/promql/anon_promql_ratio_repro.sql
@@ -0,0 +1,63 @@
+CREATE TABLE phy (
+    t TIMESTAMP TIME INDEX,
+    v DOUBLE
+) ENGINE=metric WITH ("physical_metric_table" = "");
+
+CREATE TABLE metric_a (
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    l5 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l1, l2, l3, l4, l5)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+CREATE TABLE metric_b (
+    l6 STRING NULL,
+    l1 STRING NULL,
+    l2 STRING NULL,
+    l3 STRING NULL,
+    l4 STRING NULL,
+    t TIMESTAMP NOT NULL,
+    v DOUBLE NULL,
+    TIME INDEX (t),
+    PRIMARY KEY (l6, l1, l2, l3, l4)
+) ENGINE=metric WITH (on_physical_table = 'phy');
+
+INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
+    ('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
+    ('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
+    ('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
+
+INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
+    ('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
+    ('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+
+DROP TABLE metric_a;
+DROP TABLE metric_b;
+DROP TABLE phy;
diff --git a/tests/cases/standalone/common/promql/scalar.result b/tests/cases/standalone/common/promql/scalar.result
index c5c3e5ebd1..c3292b4f5c 100644
--- a/tests/cases/standalone/common/promql/scalar.result
+++ b/tests/cases/standalone/common/promql/scalar.result
@@ -136,6 +136,42 @@ TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host)));
 | 1970-01-01T00:00:15 | 2.0                            |
 +---------------------+--------------------------------+
 
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host)));
+
++---------------------+------------------------------+
+| ts                  | scalar(count(sum(host.val))) |
++---------------------+------------------------------+
+| 1970-01-01T00:00:00 | 2.0                          |
+| 1970-01-01T00:00:05 | 2.0                          |
+| 1970-01-01T00:00:10 | 2.0                          |
+| 1970-01-01T00:00:15 | 2.0                          |
++---------------------+------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host)));
+
++---------------------+------------------------------+
+| ts                  | scalar(count(avg(host.val))) |
++---------------------+------------------------------+
+| 1970-01-01T00:00:00 | 2.0                          |
+| 1970-01-01T00:00:05 | 2.0                          |
+| 1970-01-01T00:00:10 | 2.0                          |
+| 1970-01-01T00:00:15 | 2.0                          |
++---------------------+------------------------------+
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host)));
+
++---------------------+-------------------------------------+
+| ts                  | scalar(count(stddev_pop(host.val))) |
++---------------------+-------------------------------------+
+| 1970-01-01T00:00:00 | 2.0                                 |
+| 1970-01-01T00:00:05 | 2.0                                 |
+| 1970-01-01T00:00:10 | 2.0                                 |
+| 1970-01-01T00:00:15 | 2.0                                 |
++---------------------+-------------------------------------+
+
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"}));
 
@@ -516,7 +552,99 @@ TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6);
 | 1970-01-01T00:00:15 | 6.0                                                     | host1 |
 +---------------------+---------------------------------------------------------+-------+
 
-Drop table host;
+DROP TABLE host;
+
+Affected Rows: 0
+
+CREATE TABLE presence_metric (
+  ts timestamp(3) time index,
+  instance STRING,
+  cpu STRING,
+  shard STRING,
+  val DOUBLE,
+  PRIMARY KEY (instance, cpu, shard),
+);
+
+Affected Rows: 0
+
+INSERT INTO TABLE presence_metric VALUES
+    (0,      'i1', 'cpu0', 'a', 1.0),
+    (0,      'i1', 'cpu0', 'b', 2.0),
+    (0,      'i1', 'cpu1', 'a', 10.0),
+    (0,      'i1', 'cpu2', 'a', 20.0),
+    (0,      'i2', 'cpu9', 'a', 100.0),
+    (200000, 'i1', 'cpu0', 'a', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu1', 'a', 11.0),
+    (200000, 'i1', 'cpu2', 'a', NULL),
+    (200000, 'i2', 'cpu9', 'a', 101.0),
+    (400000, 'i1', 'cpu1', 'a', 12.0),
+    (400000, 'i2', 'cpu9', 'a', 102.0),
+    (600000, 'i1', 'cpu0', 'a', 7.0),
+    (600000, 'i1', 'cpu0', 'b', 8.0),
+    (600000, 'i2', 'cpu9', 'a', 103.0);
+
+Affected Rows: 15
+
+-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2`
+-- still leaves a zero-valued row in `count(...) by (cpu)`.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu);
+
++------+---------------------+----------------------------+
+| cpu  | ts                  | count(presence_metric.val) |
++------+---------------------+----------------------------+
+| cpu0 | 1970-01-01T00:00:00 | 2                          |
+| cpu0 | 1970-01-01T00:10:00 | 2                          |
+| cpu1 | 1970-01-01T00:00:00 | 1                          |
+| cpu1 | 1970-01-01T00:03:20 | 1                          |
+| cpu1 | 1970-01-01T00:06:40 | 1                          |
+| cpu1 | 1970-01-01T00:10:00 | 1                          |
+| cpu2 | 1970-01-01T00:00:00 | 1                          |
+| cpu2 | 1970-01-01T00:03:20 | 0                          |
+| cpu2 | 1970-01-01T00:06:40 | 0                          |
++------+---------------------+----------------------------+
+
+-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu)));
+
++---------------------+-------------------------------------------+
+| ts                  | scalar(count(count(presence_metric.val))) |
++---------------------+-------------------------------------------+
+| 1970-01-01T00:00:00 | 3.0                                       |
+| 1970-01-01T00:03:20 | 2.0                                       |
+| 1970-01-01T00:06:40 | 2.0                                       |
+| 1970-01-01T00:10:00 | 2.0                                       |
++---------------------+-------------------------------------------+
+
+-- Non-count inner aggregates must drop NULL-only groups before the outer count.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu)));
+
++---------------------+-----------------------------------------+
+| ts                  | scalar(count(sum(presence_metric.val))) |
++---------------------+-----------------------------------------+
+| 1970-01-01T00:00:00 | 3.0                                     |
+| 1970-01-01T00:03:20 | 1.0                                     |
+| 1970-01-01T00:06:40 | 1.0                                     |
+| 1970-01-01T00:10:00 | 2.0                                     |
++---------------------+-----------------------------------------+
+
+-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance));
+
++---------------------+-------------------------------------------+
+| ts                  | scalar(count(count(presence_metric.val))) |
++---------------------+-------------------------------------------+
+| 1970-01-01T00:00:00 | NaN                                       |
+| 1970-01-01T00:03:20 | NaN                                       |
+| 1970-01-01T00:06:40 | NaN                                       |
+| 1970-01-01T00:10:00 | NaN                                       |
++---------------------+-------------------------------------------+
+
+DROP TABLE presence_metric;
 
 Affected Rows: 0
 
diff --git a/tests/cases/standalone/common/promql/scalar.sql b/tests/cases/standalone/common/promql/scalar.sql
index b4007bbf15..662f9665fe 100644
--- a/tests/cases/standalone/common/promql/scalar.sql
+++ b/tests/cases/standalone/common/promql/scalar.sql
@@ -43,6 +43,15 @@ TQL EVAL (0, 15, '5s') scalar(host{host="host1"}) + host;
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') scalar(count(count(host) by (host)));
 
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(sum(host) by (host)));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(avg(host) by (host)));
+
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 15, '5s') scalar(count(stddev(host) by (host)));
+
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') scalar(host{host="host1"} + scalar(host{host="host2"}));
 
@@ -149,4 +158,49 @@ TQL EVAL (0, 15, '5s') clamp(clamp_min(host{host="host1"}, 1), 0, 12);
 -- SQLNESS SORT_RESULT 3 1
 TQL EVAL (0, 15, '5s') clamp_max(clamp(host{host="host1"}, 0, 15), 6);
 
-Drop table host;
+DROP TABLE host;
+
+CREATE TABLE presence_metric (
+  ts timestamp(3) time index,
+  instance STRING,
+  cpu STRING,
+  shard STRING,
+  val DOUBLE,
+  PRIMARY KEY (instance, cpu, shard),
+);
+
+INSERT INTO TABLE presence_metric VALUES
+    (0,      'i1', 'cpu0', 'a', 1.0),
+    (0,      'i1', 'cpu0', 'b', 2.0),
+    (0,      'i1', 'cpu1', 'a', 10.0),
+    (0,      'i1', 'cpu2', 'a', 20.0),
+    (0,      'i2', 'cpu9', 'a', 100.0),
+    (200000, 'i1', 'cpu0', 'a', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu0', 'b', 'NAN'::DOUBLE),
+    (200000, 'i1', 'cpu1', 'a', 11.0),
+    (200000, 'i1', 'cpu2', 'a', NULL),
+    (200000, 'i2', 'cpu9', 'a', 101.0),
+    (400000, 'i1', 'cpu1', 'a', 12.0),
+    (400000, 'i2', 'cpu9', 'a', 102.0),
+    (600000, 'i1', 'cpu0', 'a', 7.0),
+    (600000, 'i1', 'cpu0', 'b', 8.0),
+    (600000, 'i2', 'cpu9', 'a', 103.0);
+
+-- NaN drops `cpu0` from the grouped count, while the NULL sample on `cpu2`
+-- still leaves a zero-valued row in `count(...) by (cpu)`.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') count(presence_metric{instance="i1"}) by (cpu);
+
+-- Nested-count rewrite should preserve grouped presence after stale-NaN filtering and null-value pruning.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric{instance="i1"}) by (cpu)));
+
+-- Non-count inner aggregates must drop NULL-only groups before the outer count.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(sum(presence_metric{instance="i1"}) by (cpu)));
+
+-- False case: outer `by (instance)` keeps multiple series at the scalar input, so scalar should still yield NaN.
+-- SQLNESS SORT_RESULT 3 1
+TQL EVAL (0, 600, '200s') scalar(count(count(presence_metric) by (instance, cpu)) by (instance));
+
+DROP TABLE presence_metric;
diff --git a/tests/cases/standalone/common/tql/tql-cte.result b/tests/cases/standalone/common/tql/tql-cte.result
index a8c0c45d5d..e8278e80bd 100644
--- a/tests/cases/standalone/common/tql/tql-cte.result
+++ b/tests/cases/standalone/common/tql/tql-cte.result
@@ -427,8 +427,8 @@ SELECT min(val) as min_computed, max(val) as max_computed FROM computed;
 |               |   Aggregate: groupBy=[[]], aggr=[[min(computed.val), max(computed.val)]]                                                    |
 |               |     SubqueryAlias: computed                                                                                                 |
 |               |       Projection: metric.ts AS ts, val * Float64(2) + Float64(1) AS val                                                     |
-|               |         Projection: metric.ts, val * Float64(2) + Float64(1) AS val * Float64(2) + Float64(1)                               |
-|               |           Projection: metric.ts, metric.val * Float64(2) AS val * Float64(2)                                                |
+|               |         Projection: metric.ts, CAST(val * Float64(2) AS Float64) + Float64(1) AS val * Float64(2) + Float64(1)              |
+|               |           Projection: metric.ts, CAST(metric.val AS Float64) * Float64(2) AS val * Float64(2)                               |
 |               |             PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts]                   |
 |               |               PromSeriesDivide: tags=[]                                                                                     |
 |               |                 Filter: metric.ts >= TimestampMillisecond(-299999, None) AND metric.ts <= TimestampMillisecond(40000, None) |
diff --git a/tests/cases/standalone/common/types/json/json.result b/tests/cases/standalone/common/types/json/json.result
index 8c4755f4ae..8fad9632b1 100644
--- a/tests/cases/standalone/common/types/json/json.result
+++ b/tests/cases/standalone/common/types/json/json.result
@@ -37,22 +37,23 @@ INSERT INTO jsons VALUES('[null]', 0),
             }
         ]
     }
-}}', 11);
+}}', 11),
+('{"a":"abc\u2028tom"}', 12);
 
-Affected Rows: 12
+Affected Rows: 13
 
-INSERT INTO jsons VALUES(parse_json('[null]'), 12),
-(parse_json('[true]'), 13),
-(parse_json('[false]'), 14),
-(parse_json('[0]'), 15),
-(parse_json('["foo"]'), 16),
-(parse_json('[]'), 17),
-(parse_json('{}'), 18),
-(parse_json('[0,1]'), 19),
-(parse_json('{"foo":"bar"}'), 20),
-(parse_json('{"a":null,"foo":"bar"}'), 21),
-(parse_json('[-1]'), 22),
-(parse_json('[-2147483648]'), 23),
+INSERT INTO jsons VALUES(parse_json('[null]'), 1000),
+(parse_json('[true]'), 1001),
+(parse_json('[false]'), 1002),
+(parse_json('[0]'), 1003),
+(parse_json('["foo"]'), 1004),
+(parse_json('[]'), 1005),
+(parse_json('{}'), 1006),
+(parse_json('[0,1]'), 1007),
+(parse_json('{"foo":"bar"}'), 1008),
+(parse_json('{"a":null,"foo":"bar"}'), 1009),
+(parse_json('[-1]'), 1010),
+(parse_json('[-2147483648]'), 1011),
 (parse_json('{"entities": {
             "description": {
                 "urls": [
@@ -76,9 +77,10 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12),
                     }
                 ]
             }
-        }}'), 24);
+        }}'), 1012),
+(parse_json('{"a":"abc\u2028tom"}'), 1013);
 
-Affected Rows: 13
+Affected Rows: 14
 
 SELECT json_to_string(j), t FROM jsons;
 
@@ -97,25 +99,27 @@ SELECT json_to_string(j), t FROM jsons;
 | {"a":null,"foo":"bar"}                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.009 |
 | [-1]                                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:00.010 |
 | {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.011 |
-| [null]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.012 |
-| [true]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.013 |
-| [false]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:00.014 |
-| [0]                                                                                                                                                                                                                                                                                                                       | 1970-01-01T00:00:00.015 |
-| ["foo"]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:00.016 |
-| []                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:00.017 |
-| {}                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:00.018 |
-| [0,1]                                                                                                                                                                                                                                                                                                                     | 1970-01-01T00:00:00.019 |
-| {"foo":"bar"}                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:00.020 |
-| {"a":null,"foo":"bar"}                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:00.021 |
-| [-1]                                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:00.022 |
-| [-2147483648]                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:00.023 |
-| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:00.024 |
+| {"a":"abc\u2028tom"}                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:00.012 |
+| [null]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:01     |
+| [true]                                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:01.001 |
+| [false]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:01.002 |
+| [0]                                                                                                                                                                                                                                                                                                                       | 1970-01-01T00:00:01.003 |
+| ["foo"]                                                                                                                                                                                                                                                                                                                   | 1970-01-01T00:00:01.004 |
+| []                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:01.005 |
+| {}                                                                                                                                                                                                                                                                                                                        | 1970-01-01T00:00:01.006 |
+| [0,1]                                                                                                                                                                                                                                                                                                                     | 1970-01-01T00:00:01.007 |
+| {"foo":"bar"}                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:01.008 |
+| {"a":null,"foo":"bar"}                                                                                                                                                                                                                                                                                                    | 1970-01-01T00:00:01.009 |
+| [-1]                                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:01.010 |
+| [-2147483648]                                                                                                                                                                                                                                                                                                             | 1970-01-01T00:00:01.011 |
+| {"entities":{"description":{"urls":[{"display_url":"pixiv.net/member.php?id=…","expanded_url":"http://www.pixiv.net/member.php?id=4776","indices":[58,80],"url":"http://t.co/QMLJeFmfMT"},{"display_url":"ask.fm/KATANA77","expanded_url":"http://ask.fm/KATANA77","indices":[95,117],"url":"http://t.co/LU8T7vmU3h"}]}}} | 1970-01-01T00:00:01.012 |
+| {"a":"abc\u2028tom"}                                                                                                                                                                                                                                                                                                      | 1970-01-01T00:00:01.013 |
 +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
 
 --Insert invalid json strings--
 DELETE FROM jsons;
 
-Affected Rows: 25
+Affected Rows: 27
 
 INSERT INTO jsons VALUES(parse_json('{"a":1, "b":2, "c":3'), 4);
 
diff --git a/tests/cases/standalone/common/types/json/json.sql b/tests/cases/standalone/common/types/json/json.sql
index 868edc59e8..5a521ee1c6 100644
--- a/tests/cases/standalone/common/types/json/json.sql
+++ b/tests/cases/standalone/common/types/json/json.sql
@@ -35,20 +35,21 @@ INSERT INTO jsons VALUES('[null]', 0),
             }
         ]
     }
-}}', 11);
+}}', 11),
+('{"a":"abc\u2028tom"}', 12);
 
-INSERT INTO jsons VALUES(parse_json('[null]'), 12),
-(parse_json('[true]'), 13),
-(parse_json('[false]'), 14),
-(parse_json('[0]'), 15),
-(parse_json('["foo"]'), 16),
-(parse_json('[]'), 17),
-(parse_json('{}'), 18),
-(parse_json('[0,1]'), 19),
-(parse_json('{"foo":"bar"}'), 20),
-(parse_json('{"a":null,"foo":"bar"}'), 21),
-(parse_json('[-1]'), 22),
-(parse_json('[-2147483648]'), 23),
+INSERT INTO jsons VALUES(parse_json('[null]'), 1000),
+(parse_json('[true]'), 1001),
+(parse_json('[false]'), 1002),
+(parse_json('[0]'), 1003),
+(parse_json('["foo"]'), 1004),
+(parse_json('[]'), 1005),
+(parse_json('{}'), 1006),
+(parse_json('[0,1]'), 1007),
+(parse_json('{"foo":"bar"}'), 1008),
+(parse_json('{"a":null,"foo":"bar"}'), 1009),
+(parse_json('[-1]'), 1010),
+(parse_json('[-2147483648]'), 1011),
 (parse_json('{"entities": {
             "description": {
                 "urls": [
@@ -72,7 +73,8 @@ INSERT INTO jsons VALUES(parse_json('[null]'), 12),
                     }
                 ]
             }
-        }}'), 24);
+        }}'), 1012),
+(parse_json('{"a":"abc\u2028tom"}'), 1013);
 
 SELECT json_to_string(j), t FROM jsons;
 
diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result
index 1c6e0ee50b..76b9838628 100644
--- a/tests/cases/standalone/common/view/create.result
+++ b/tests/cases/standalone/common/view/create.result
@@ -30,6 +30,10 @@ CREATE VIEW test_view as SELECT * FROM public.numbers;
 
 Affected Rows: 0
 
+CREATE VIEW test_view2 as SELECT * FROM test_view;
+
+Affected Rows: 0
+
 --- View already exists ----
 CREATE VIEW test_view as SELECT * FROM public.numbers;
 
@@ -51,6 +55,7 @@ SHOW TABLES;
 | numbers          |
 | test_table       |
 | test_view        |
+| test_view2       |
 +------------------+
 
 SHOW FULL TABLES;
@@ -61,6 +66,7 @@ SHOW FULL TABLES;
 | numbers          | LOCAL TEMPORARY |
 | test_table       | BASE TABLE      |
 | test_view        | VIEW            |
+| test_view2       | VIEW            |
 +------------------+-----------------+
 
 -- psql: \dv
@@ -124,17 +130,19 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
 |greptime|information_schema|tables|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|public|test_table|BASETABLE|ID|ID|ID|ID|ID|ID|mito|ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N|
 |greptime|public|test_view|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N|
+|greptime|public|test_view2|VIEW|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||N|
 |greptime|information_schema|views|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 +++++++++++++++++++++++++
 
 -- SQLNESS REPLACE (\s\d+\s) ID
 -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME
-SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW';
+SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME;
 
 +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+
 | table_catalog | table_schema | table_name | table_type | table_id | data_length | max_data_length | index_length | max_index_length | avg_row_length | engine | version | row_format | table_rows | data_free | auto_increment | create_time         | update_time         | check_time | table_collation | checksum | create_options | table_comment | temporary |
 +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+
 | greptime      | public       | test_view  | VIEW       |ID    |ID          |ID              |ID           |ID               |ID             |        |ID     | Fixed      |ID         |ID        |ID             |DATETIME |DATETIME |            | utf8_bin        |ID       |                |               | N         |
+| greptime      | public       | test_view2 | VIEW       |ID    |ID          |ID              |ID           |ID               |ID             |        |ID     | Fixed      |ID         |ID        |ID             |DATETIME |DATETIME |            | utf8_bin        |ID       |                |               | N         |
 +---------------+--------------+------------+------------+----------+-------------+-----------------+--------------+------------------+----------------+--------+---------+------------+------------+-----------+----------------+---------------------+---------------------+------------+-----------------+----------+----------------+---------------+-----------+
 
 SHOW COLUMNS FROM test_view;
@@ -169,10 +177,31 @@ SELECT * FROM test_view LIMIT 10;
 | 9      |
 +--------+
 
+SELECT * FROM test_view2 LIMIT 10;
+
++--------+
+| number |
++--------+
+| 0      |
+| 1      |
+| 2      |
+| 3      |
+| 4      |
+| 5      |
+| 6      |
+| 7      |
+| 8      |
+| 9      |
++--------+
+
 DROP VIEW test_view;
 
 Affected Rows: 0
 
+DROP VIEW test_view2;
+
+Affected Rows: 0
+
 DROP TABLE test_table;
 
 Affected Rows: 0
diff --git a/tests/cases/standalone/common/view/create.sql b/tests/cases/standalone/common/view/create.sql
index b82704d3a9..91149f44f4 100644
--- a/tests/cases/standalone/common/view/create.sql
+++ b/tests/cases/standalone/common/view/create.sql
@@ -16,6 +16,8 @@ CREATE OR REPLACE VIEW test_table as SELECT * FROM public.numbers;
 
 CREATE VIEW test_view as SELECT * FROM public.numbers;
 
+CREATE VIEW test_view2 as SELECT * FROM test_view;
+
 --- View already exists ----
 CREATE VIEW test_view as SELECT * FROM public.numbers;
 
@@ -48,7 +50,7 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
 
 -- SQLNESS REPLACE (\s\d+\s) ID
 -- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME
-SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW';
+SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW' ORDER BY TABLE_NAME;
 
 SHOW COLUMNS FROM test_view;
 
@@ -58,8 +60,12 @@ SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'test_view';
 
 SELECT * FROM test_view LIMIT 10;
 
+SELECT * FROM test_view2 LIMIT 10;
+
 DROP VIEW test_view;
 
+DROP VIEW test_view2;
+
 DROP TABLE test_table;
 
 SELECT * FROM test_view LIMIT 10;
diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.result b/tests/cases/standalone/tql-explain-analyze/tsid_column.result
index 84544b1655..4a7a875060 100644
--- a/tests/cases/standalone/tql-explain-analyze/tsid_column.result
+++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.result
@@ -112,10 +112,63 @@ TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(count(tsid
 |_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED
 |_|_|_RepartitionExec: partitioning=REDACTED
 |_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(count(tsid_metric.val))] REDACTED
-|_|_|_ProjectionExec: expr=[ts@1 as ts, count(tsid_metric.val)@2 as count(tsid_metric.val)] REDACTED
-|_|_|_AggregateExec: mode=FinalPartitioned, gby=[job@0 as job, ts@1 as ts], aggr=[count(tsid_metric.val)] REDACTED
+|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
 |_|_|_RepartitionExec: partitioning=REDACTED
-|_|_|_AggregateExec: mode=Partial, gby=[job@1 as job, ts@2 as ts], aggr=[count(tsid_metric.val)] REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
+|_|_|_ProjectionExec: expr=[ts@3 as ts, job@1 as job] REDACTED
+|_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED
+|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED
+|_|_|_ProjectionExec: expr=[val@1 as val, job@3 as job, __tsid@2 as __tsid, ts@0 as ts] REDACTED
+|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
+|_|_|_|
+| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED
+|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED
+|_|_|_RepartitionExec: partitioning=REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[sum(prom_irate(ts_range,val))] REDACTED
+|_|_|_FilterExec: prom_irate(ts_range,val)@1 IS NOT NULL REDACTED
+|_|_|_ProjectionExec: expr=[ts@2 as ts, prom_irate(ts_range@3, val@0) as prom_irate(ts_range,val)] REDACTED
+|_|_|_PromRangeManipulateExec: req range=[0..10000], interval=[5000], eval range=[3600000], time index=[ts] REDACTED
+|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[ts], filter NaN: [true] REDACTED
+|_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED
+|_|_|_ProjectionExec: expr=[val@1 as val, __tsid@2 as __tsid, ts@0 as ts] REDACTED
+|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED
+|_|_|_|
+|_|_| Total rows: 2_|
++-+-+-+
+
+-- SQLNESS REPLACE (metrics.*) REDACTED
+-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
+-- SQLNESS REPLACE (-+) -
+-- SQLNESS REPLACE (\s\s+) _
+-- SQLNESS REPLACE (peers.*) REDACTED
+-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
+-- SQLNESS REPLACE (Hash.*) REDACTED
+TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job)));
+
++-+-+-+
+| stage | node | plan_|
++-+-+-+
+| 0_| 0_|_ProjectionExec: expr=[ts@1 as ts, sum(prom_irate(ts_range,val))@2 / scalar(count(sum(tsid_metric.val)))@0 as lhs.sum(prom_irate(ts_range,val)) / rhs.scalar(count(sum(tsid_metric.val)))] REDACTED
+|_|_|_REDACTED
+|_|_|_ScalarCalculateExec: tags=[] REDACTED
+|_|_|_CoalescePartitionsExec REDACTED
+|_|_|_MergeScanExec: REDACTED
+|_|_|_CooperativeExec REDACTED
+|_|_|_MergeScanExec: REDACTED
+|_|_|_|
+| 1_| 0_|_SortPreservingMergeExec: [ts@0 ASC NULLS LAST] REDACTED
+|_|_|_SortExec: expr=[ts@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED
+|_|_|_RepartitionExec: partitioning=REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts], aggr=[count(sum(tsid_metric.val))] REDACTED
+|_|_|_ProjectionExec: expr=[ts@0 as ts] REDACTED
+|_|_|_AggregateExec: mode=FinalPartitioned, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
+|_|_|_RepartitionExec: partitioning=REDACTED
+|_|_|_AggregateExec: mode=Partial, gby=[ts@0 as ts, job@1 as job], aggr=[] REDACTED
+|_|_|_ProjectionExec: expr=[ts@1 as ts, job@0 as job] REDACTED
+|_|_|_FilterExec: val@0 IS NOT NULL, projection=[job@1, ts@2] REDACTED
 |_|_|_ProjectionExec: expr=[val@0 as val, job@1 as job, ts@3 as ts] REDACTED
 |_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[ts] REDACTED
 |_|_|_PromSeriesDivideExec: tags=["__tsid"] REDACTED
diff --git a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql
index 7b3de23f33..dedce2dfb1 100644
--- a/tests/cases/standalone/tql-explain-analyze/tsid_column.sql
+++ b/tests/cases/standalone/tql-explain-analyze/tsid_column.sql
@@ -51,6 +51,14 @@ TQL ANALYZE (0, 10, '5s') sum by (job, instance) (tsid_metric);
 -- SQLNESS REPLACE (Hash.*) REDACTED
 TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(count(tsid_metric) by (job)));
 
+-- SQLNESS REPLACE (metrics.*) REDACTED
+-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED
+-- SQLNESS REPLACE (-+) -
+-- SQLNESS REPLACE (\s\s+) _
+-- SQLNESS REPLACE (peers.*) REDACTED
+-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED
+-- SQLNESS REPLACE (Hash.*) REDACTED
+TQL ANALYZE (0, 10, '5s')  sum(irate(tsid_metric[1h])) / scalar(count(sum(tsid_metric) by (job)));
+
 DROP TABLE tsid_metric;
 DROP TABLE tsid_physical;
-
diff --git a/tests/conf/datanode-test.toml.template b/tests/conf/datanode-test.toml.template
index 4cb0423c72..3ec8a2f695 100644
--- a/tests/conf/datanode-test.toml.template
+++ b/tests/conf/datanode-test.toml.template
@@ -28,7 +28,7 @@ type = 'File'
 data_home = '{data_home}'
 
 [meta_client_options]
-metasrv_addrs = ['{metasrv_addr}']
+metasrv_addrs = ['{addrs.metasrv_addr}']
 timeout_millis = 3000
 connect_timeout_millis = 5000
 tcp_nodelay = false
diff --git a/tests/conf/frontend-test.toml.template b/tests/conf/frontend-test.toml.template
index de4ce86adc..25d44ff6e4 100644
--- a/tests/conf/frontend-test.toml.template
+++ b/tests/conf/frontend-test.toml.template
@@ -1,3 +1,3 @@
 [grpc]
-bind_addr = "{grpc_addr}"
-server_addr = "{grpc_addr}"
+bind_addr = "{addrs.grpc_addr}"
+server_addr = "{addrs.grpc_addr}"
diff --git a/tests/conf/standalone-test.toml.template b/tests/conf/standalone-test.toml.template
index 509eac7ca6..50c014e991 100644
--- a/tests/conf/standalone-test.toml.template
+++ b/tests/conf/standalone-test.toml.template
@@ -26,12 +26,12 @@ type = 'File'
 data_home = '{data_home}'
 
 [grpc]
-bind_addr = '{grpc_addr}'
+bind_addr = '{addrs.grpc_addr}'
 runtime_size = 8
 
 [mysql]
 enable = true
-addr = "{mysql_addr}"
+addr = "{addrs.mysql_addr}"
 runtime_size = 2
 prepared_stmt_cache_size= 10000
 
@@ -40,7 +40,7 @@ mode = "disable"
 
 [postgres]
 enable = true
-addr = "{postgres_addr}"
+addr = "{addrs.postgres_addr}"
 runtime_size = 2
 
 [procedure]
diff --git a/tests/runner/src/server_mode.rs b/tests/runner/src/server_mode.rs
index 172baf32ff..1f7cb72bf4 100644
--- a/tests/runner/src/server_mode.rs
+++ b/tests/runner/src/server_mode.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 use std::path::Path;
 use std::sync::{Mutex, OnceLock};
 
@@ -96,15 +96,7 @@ struct ConfigContext {
     use_etcd: bool,
     store_addrs: String,
     instance_id: usize,
-    // for following addrs, leave it empty if not needed
-    // required for datanode
-    metasrv_addr: String,
-    // for frontend and standalone
-    grpc_addr: String,
-    // for standalone
-    mysql_addr: String,
-    // for standalone
-    postgres_addr: String,
+    addrs: HashMap<String, String>,
     // enable flat format for storage engine
     enable_flat_format: bool,
 }
@@ -275,40 +267,26 @@ impl ServerMode {
         let procedure_dir = data_home.join("procedure").display().to_string();
 
         // Get the required addresses based on server mode
-        let (metasrv_addr, grpc_addr, mysql_addr, postgres_addr) = match self {
+        let addrs: HashMap<String, String> = match self {
             ServerMode::Standalone {
                 rpc_bind_addr,
                 mysql_addr,
                 postgres_addr,
-                ..
-            } => (
-                String::new(),
-                rpc_bind_addr.clone(),
-                mysql_addr.clone(),
-                postgres_addr.clone(),
-            ),
-            ServerMode::Frontend {
-                rpc_bind_addr,
-                mysql_addr,
-                postgres_addr,
-                ..
-            } => (
-                String::new(),
-                rpc_bind_addr.clone(),
-                mysql_addr.clone(),
-                postgres_addr.clone(),
-            ),
-            ServerMode::Datanode {
-                rpc_bind_addr,
-                metasrv_addr,
-                ..
-            } => (
-                metasrv_addr.clone(),
-                rpc_bind_addr.clone(),
-                String::new(),
-                String::new(),
-            ),
-            _ => (String::new(), String::new(), String::new(), String::new()),
+                http_addr,
+            } => [
+                ("http_addr".to_string(), http_addr.clone()),
+                ("grpc_addr".to_string(), rpc_bind_addr.clone()),
+                ("mysql_addr".to_string(), mysql_addr.clone()),
+                ("postgres_addr".to_string(), postgres_addr.clone()),
+            ]
+            .into(),
+            ServerMode::Frontend { rpc_bind_addr, .. } => {
+                [("grpc_addr".to_string(), rpc_bind_addr.clone())].into()
+            }
+            ServerMode::Datanode { metasrv_addr, .. } => {
+                [("metasrv_addr".to_string(), metasrv_addr.clone())].into()
+            }
+            _ => HashMap::new(),
         };
 
         let ctx = ConfigContext {
@@ -326,10 +304,7 @@ impl ServerMode {
                 .collect::<Vec<_>>()
                 .join(","),
             instance_id: id,
-            metasrv_addr,
-            grpc_addr,
-            mysql_addr,
-            postgres_addr,
+            addrs,
             enable_flat_format: db_ctx.store_config().enable_flat_format,
         };