diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml
index d0d2804c6a..65546dcc25 100644
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -669,18 +669,28 @@ jobs:
           - name: "Basic"
             opts: ""
             kafka: false
+            postgres: false
+            mysql: false
           - name: "Remote WAL"
             opts: "-w kafka -k 127.0.0.1:9092"
             kafka: true
+            postgres: false
+            mysql: false
           - name: "PostgreSQL KvBackend"
-            opts: "--setup-pg"
+            opts: "--setup-pg postgresql://greptimedb:admin@127.0.0.1:5432/postgres"
             kafka: false
-          - name: "MySQL Kvbackend"
-            opts: "--setup-mysql"
+            postgres: true
+            mysql: false
+          - name: "MySQL KvBackend"
+            opts: "--setup-mysql mysql://greptimedb:admin@127.0.0.1:3306/mysql"
             kafka: false
+            postgres: false
+            mysql: true
           - name: "Flat format"
             opts: "--enable-flat-format"
             kafka: false
+            postgres: false
+            mysql: false
     timeout-minutes: 60
     steps:
       - uses: actions/checkout@v4
@@ -688,9 +698,19 @@ jobs:
           persist-credentials: false
 
       - if: matrix.mode.kafka
-        name: Setup kafka server
+        name: Setup Kafka
         working-directory: tests-integration/fixtures
-        run:  ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
+        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
+
+      - if: matrix.mode.postgres  # matrix flag; true for the "PostgreSQL KvBackend" mode
+        name: Setup PostgreSQL
+        working-directory: tests-integration/fixtures
+        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait postgres
+
+      - if: matrix.mode.mysql  # matrix flag; true for the "MySQL KvBackend" mode
+        name: Setup MySQL
+        working-directory: tests-integration/fixtures
+        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait mysql
 
       - name: Download pre-built binaries
         uses: actions/download-artifact@v4
diff --git a/.github/workflows/nightly-jsonbench.yaml b/.github/workflows/nightly-jsonbench.yaml
new file mode 100644
index 0000000000..3667ee26a6
--- /dev/null
+++ b/.github/workflows/nightly-jsonbench.yaml
@@ -0,0 +1,162 @@
+name: Nightly JSONBench
+
+on:
+  schedule:
+    # Runs at 16:00 UTC Sunday through Thursday, i.e. 00:00 Asia/Shanghai (UTC+8) Monday through Friday.
+    - cron: "0 16 * * 0-4"
+  workflow_dispatch:  # also allow manual runs
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}  # head_ref is only set for pull_request events, so these triggers fall back to run_id
+  cancel-in-progress: true
+
+jobs:
+  allocate-runner:
+    name: Allocate runner
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}  # only run in the upstream repository
+    runs-on: ubuntu-latest
+    outputs:
+      linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}  # consumed by the jsonbench job's runs-on
+
+      # The runner label and EC2 instance id below are passed to the stop-runner action to release the runner.
+      linux-arm64-ec2-runner-label: ${{ steps.start-linux-arm64-runner.outputs.label }}
+      linux-arm64-ec2-runner-instance-id: ${{ steps.start-linux-arm64-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Allocate Linux ARM64 runner
+        uses: ./.github/actions/start-runner  # local composite action; needs the checkout above
+        id: start-linux-arm64-runner
+        with:
+          runner: ${{ vars.DEFAULT_ARM64_RUNNER }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.EC2_RUNNER_REGION }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          image-id: ${{ vars.EC2_RUNNER_LINUX_ARM64_IMAGE_ID }}
+          security-group-id: ${{ vars.EC2_RUNNER_SECURITY_GROUP_ID }}
+          subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }}
+
+  jsonbench:
+    name: Run JSONBench
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}  # only run in the upstream repository
+    needs: [ allocate-runner ]
+    runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }}  # label of the runner started by allocate-runner
+    timeout-minutes: 120
+    env:
+      JSONBENCH_DATA_DIR: /home/runner/data/bluesky  # passed to main.sh; NOTE(review): presumably pre-provisioned on the runner image - confirm
+      JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest  # passed to main.sh as its output prefix argument
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - uses: arduino/setup-protoc@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: "nightly-jsonbench"
+          cache-all-crates: "true"
+          save-if: ${{ github.ref == 'refs/heads/main' }}  # only runs on main write the cache
+
+      - name: Build GreptimeDB  # built with the "nightly" cargo profile; the binary lands in target/nightly
+        run: cargo build --profile nightly --bin greptime
+
+      - name: Reclaim disk space  # stash the binary under RUNNER_TEMP, then delete ./target
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          mkdir -p "${RUNNER_TEMP}/greptimedb-bin"
+          cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime"
+          chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime"
+
+          rm -rf ./target
+
+      - name: Run JSONBench  # start a standalone server, wait for /health, then run JSONBench's main.sh
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          cd "${RUNNER_TEMP}"
+          cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime
+          chmod +x ./greptime
+
+          export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal
+          export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data
+          export GREPTIMEDB_STANDALONE__LOGGING__DIR=greptimedb_data/logs
+          export GREPTIMEDB_STANDALONE__LOGGING__APPEND_STDOUT=false
+          export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB
+          export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s
+
+          ./greptime standalone start > greptimedb.log 2>&1 &
+          greptime_pid=$!
+          trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT
+
+          until curl -s --fail -o /dev/null http://localhost:4000/health; do
+            if ! kill -0 "${greptime_pid}" 2>/dev/null; then
+              cat greptimedb.log
+              exit 1
+            fi
+            sleep 1
+          done
+
+          git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
+          cp ./greptime JSONBench/greptimedb/greptime
+
+          cd JSONBench/greptimedb
+          ./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
+
+      - name: Upload JSONBench results
+        if: always()  # collect logs and results even if an earlier step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: jsonbench-results
+          path: |
+            ${{ runner.temp }}/greptimedb.log
+            ${{ runner.temp }}/JSONBench/greptimedb/*.log
+            ${{ runner.temp }}/JSONBench/greptimedb/*.total_size
+            ${{ runner.temp }}/JSONBench/greptimedb/*.data_size
+            ${{ runner.temp }}/JSONBench/greptimedb/*.index_size
+            ${{ runner.temp }}/JSONBench/greptimedb/*.count
+            ${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime
+            ${{ runner.temp }}/JSONBench/greptimedb/*.query_results
+          if-no-files-found: ignore  # missing result files do not fail the upload
+          retention-days: 7
+
+  stop-linux-arm64-runner:
+    name: Stop Linux ARM64 runner
+    # Runs last, even when earlier jobs fail or are cancelled, so the EC2 runner is always released; skipped outside the upstream repository, where no runner is allocated.
+    if: ${{ always() && github.repository == 'GreptimeTeam/greptimedb' }}
+    runs-on: ubuntu-latest
+    needs: [
+      allocate-runner,
+      jsonbench,
+    ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Stop Linux ARM64 runner
+        uses: ./.github/actions/stop-runner  # local composite action; needs the checkout above
+        with:
+          label: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-label }}
+          ec2-instance-id: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.EC2_RUNNER_REGION }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
diff --git a/Cargo.lock b/Cargo.lock
index aafa225b4b..63ba289947 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1321,9 +1321,9 @@ dependencies = [
 
 [[package]]
 name = "bitpacking"
-version = "0.9.2"
+version = "0.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
+checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019"
 dependencies = [
  "crunchy",
 ]
@@ -1832,7 +1832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7d8d1efd5109b9c1cd3b7966bd071cdfb53bb6eb0b22a473a68c2f70a11a1eb"
 dependencies = [
  "parse-zoneinfo",
- "phf_codegen",
+ "phf_codegen 0.12.1",
  "phf_shared 0.12.1",
  "uncased",
 ]
@@ -4380,6 +4380,12 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "datasketches"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745"
+
 [[package]]
 name = "datatypes"
 version = "1.1.0"
@@ -5486,12 +5492,12 @@ dependencies = [
 
 [[package]]
 name = "fs4"
-version = "0.8.4"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8"
+checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4"
 dependencies = [
- "rustix 0.38.44",
- "windows-sys 0.52.0",
+ "rustix 1.0.7",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -5820,7 +5826,7 @@ dependencies = [
 [[package]]
 name = "greptime-proto"
 version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=dfd2a6d7d3d9c718cb159fcf9abae144b74fc503#dfd2a6d7d3d9c718cb159fcf9abae144b74fc503"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=7224c2ad6d11db612fbdb621c36135fc37ffce35#7224c2ad6d11db612fbdb621c36135fc37ffce35"
 dependencies = [
  "prost 0.14.1",
  "prost-types 0.14.1",
@@ -6564,27 +6570,37 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed"
 
 [[package]]
 name = "include-flate"
-version = "0.3.0"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e"
+checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347"
 dependencies = [
  "include-flate-codegen",
- "lazy_static",
- "libflate",
+ "include-flate-compress",
 ]
 
 [[package]]
 name = "include-flate-codegen"
-version = "0.2.0"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7"
+checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969"
 dependencies = [
- "libflate",
+ "include-flate-compress",
+ "proc-macro-error2",
  "proc-macro2",
  "quote",
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "include-flate-compress"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff"
+dependencies = [
+ "libflate",
+ "zstd",
+]
+
 [[package]]
 name = "include_dir"
 version = "0.7.4"
@@ -6918,25 +6934,25 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
 
 [[package]]
 name = "jieba-macros"
-version = "0.8.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6105f38f083bb1a79ad523bd32fa0d8ffcb6abd2fc4da9da203c32bca5b6ace3"
+checksum = "661344b2412fb00aee1841d2405c9a31f7c91cf6e578a8e953647c43dd1a8b0a"
 dependencies = [
- "phf_codegen",
+ "phf_codegen 0.13.1",
 ]
 
 [[package]]
 name = "jieba-rs"
-version = "0.8.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47982a320106da83b0c5d6aec0fb83e109f0132b69670b063adaa6fa5b4f3f4a"
+checksum = "d7ef90d6209fcff084a01b488c4199d882e3764b15ff0e7a6b5d7efaa46e1e4f"
 dependencies = [
  "cedarwood",
- "fxhash",
  "include-flate",
  "jieba-macros",
- "phf 0.12.1",
+ "phf 0.13.1",
  "regex",
+ "rustc-hash 2.1.1",
 ]
 
 [[package]]
@@ -7483,25 +7499,25 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"
 
 [[package]]
 name = "libflate"
-version = "2.1.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
+checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df"
 dependencies = [
  "adler32",
- "core2",
  "crc32fast",
  "dary_heap",
  "libflate_lz77",
+ "no_std_io2",
 ]
 
 [[package]]
 name = "libflate_lz77"
-version = "2.1.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
+checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd"
 dependencies = [
- "core2",
- "hashbrown 0.14.5",
+ "hashbrown 0.16.1",
+ "no_std_io2",
  "rle-decode-fast",
 ]
 
@@ -7816,6 +7832,15 @@ dependencies = [
  "hashbrown 0.15.4",
 ]
 
+[[package]]
+name = "lru"
+version = "0.16.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
 [[package]]
 name = "lru-slab"
 version = "0.1.2"
@@ -8299,6 +8324,7 @@ dependencies = [
  "either",
  "futures",
  "greptime-proto",
+ "humantime",
  "humantime-serde",
  "index",
  "itertools 0.14.0",
@@ -8434,7 +8460,7 @@ dependencies = [
  "flate2",
  "io-enum",
  "libc",
- "lru",
+ "lru 0.12.5",
  "mysql_common 0.34.1",
  "named_pipe",
  "pem",
@@ -8497,7 +8523,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "keyed_priority_queue",
- "lru",
+ "lru 0.12.5",
  "mysql_common 0.34.1",
  "pem",
  "percent-encoding",
@@ -8695,6 +8721,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "no_std_io2"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "nohash"
 version = "0.2.0"
@@ -9635,6 +9670,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "ordered-float"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.4.3"
@@ -10122,6 +10166,15 @@ checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
 dependencies = [
  "phf_macros",
  "phf_shared 0.12.1",
+]
+
+[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_shared 0.13.1",
  "serde",
 ]
 
@@ -10131,10 +10184,20 @@ version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61"
 dependencies = [
- "phf_generator",
+ "phf_generator 0.12.1",
  "phf_shared 0.12.1",
 ]
 
+[[package]]
+name = "phf_codegen"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+]
+
 [[package]]
 name = "phf_generator"
 version = "0.12.1"
@@ -10145,13 +10208,23 @@ dependencies = [
  "phf_shared 0.12.1",
 ]
 
+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared 0.13.1",
+]
+
 [[package]]
 name = "phf_macros"
 version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368"
 dependencies = [
- "phf_generator",
+ "phf_generator 0.12.1",
  "phf_shared 0.12.1",
  "proc-macro2",
  "quote",
@@ -10178,6 +10251,15 @@ dependencies = [
  "uncased",
 ]
 
+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
 [[package]]
 name = "pin-project"
 version = "1.1.10"
@@ -11415,16 +11497,6 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"
 
-[[package]]
-name = "rand_distr"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
-dependencies = [
- "num-traits",
- "rand 0.8.5",
-]
-
 [[package]]
 name = "rand_xorshift"
 version = "0.4.0"
@@ -12705,6 +12777,7 @@ dependencies = [
  "metric-engine",
  "mime_guess",
  "mysql_async",
+ "mysql_common 0.34.1",
  "notify",
  "object-pool",
  "once_cell",
@@ -12960,9 +13033,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
 
 [[package]]
 name = "sketches-ddsketch"
-version = "0.3.0"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a"
+checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea"
 dependencies = [
  "serde",
 ]
@@ -13863,9 +13936,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
 
 [[package]]
 name = "tantivy"
-version = "0.24.2"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43"
+checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07"
 dependencies = [
  "aho-corasick",
  "arc-swap",
@@ -13876,17 +13949,17 @@ dependencies = [
  "census",
  "crc32fast",
  "crossbeam-channel",
+ "datasketches",
  "downcast-rs",
  "fastdivide",
  "fnv",
  "fs4",
  "htmlescape",
- "hyperloglogplus",
  "itertools 0.14.0",
  "levenshtein_automata",
  "log",
- "lru",
- "lz4_flex 0.11.6",
+ "lru 0.16.4",
+ "lz4_flex 0.13.1",
  "measure_time",
  "memmap2",
  "once_cell",
@@ -13909,6 +13982,7 @@ dependencies = [
  "tempfile",
  "thiserror 2.0.17",
  "time",
+ "typetag",
  "uuid",
  "winapi",
  "zstd",
@@ -13916,18 +13990,18 @@ dependencies = [
 
 [[package]]
 name = "tantivy-bitpacker"
-version = "0.8.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494"
+checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4"
 dependencies = [
  "bitpacking",
 ]
 
 [[package]]
 name = "tantivy-columnar"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344"
+checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc"
 dependencies = [
  "downcast-rs",
  "fastdivide",
@@ -13941,9 +14015,9 @@ dependencies = [
 
 [[package]]
 name = "tantivy-common"
-version = "0.9.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f"
+checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5"
 dependencies = [
  "async-trait",
  "byteorder",
@@ -13965,9 +14039,9 @@ dependencies = [
 
 [[package]]
 name = "tantivy-jieba"
-version = "0.16.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b08147cc130e323ecc522117927b198bec617fe1df562a0b6449905858d0363"
+checksum = "3392170e86f1c387170aba7d171a466ffdc98a8b55b006e19ac64b123a7b690a"
 dependencies = [
  "jieba-rs",
  "lazy_static",
@@ -13976,20 +14050,22 @@ dependencies = [
 
 [[package]]
 name = "tantivy-query-grammar"
-version = "0.24.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a"
+checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82"
 dependencies = [
+ "fnv",
  "nom 7.1.3",
+ "ordered-float 5.3.0",
  "serde",
  "serde_json",
 ]
 
 [[package]]
 name = "tantivy-sstable"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416"
+checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606"
 dependencies = [
  "futures-util",
  "itertools 0.14.0",
@@ -14001,20 +14077,19 @@ dependencies = [
 
 [[package]]
 name = "tantivy-stacker"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1"
+checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951"
 dependencies = [
  "murmurhash32",
- "rand_distr",
  "tantivy-common",
 ]
 
 [[package]]
 name = "tantivy-tokenizer-api"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d"
+checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98"
 dependencies = [
  "serde",
 ]
@@ -15017,9 +15092,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
 
 [[package]]
 name = "typetag"
-version = "0.2.20"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f"
+checksum = "c5a897b12c6c1151ad0b138b8db50252dc301f93bc3b027db05eec82aeed298c"
 dependencies = [
  "erased-serde",
  "inventory",
@@ -15030,9 +15105,9 @@ dependencies = [
 
 [[package]]
 name = "typetag-impl"
-version = "0.2.20"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952"
+checksum = "cf808357c6ed7e13ba0f3277ec8d8f21b2d501274895104263985330c726c1c5"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/Cargo.toml b/Cargo.toml
index eeddc7099f..32407f31cf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -158,7 +158,7 @@ fs2 = "0.4"
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "dfd2a6d7d3d9c718cb159fcf9abae144b74fc503" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "7224c2ad6d11db612fbdb621c36135fc37ffce35" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
diff --git a/README.md b/README.md
index 4ed99fa306..127dd1ba85 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 <h2 align="center">One database for metrics, logs, and traces<br/>
 replacing Prometheus, Loki, and Elasticsearch</h2>
 
->  The unified OpenTelemetry backend — with SQL + PromQL on object storage.
+> The unified OpenTelemetry backend — with SQL + PromQL on object storage.
 
 <div align="center">
 <h3 align="center">
@@ -30,11 +30,11 @@ replacing Prometheus, Loki, and Elasticsearch</h2>
 <a href="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml">
 <img src="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml/badge.svg" alt="GitHub Actions"/>
 </a>
-<a href="https://codecov.io/gh/GrepTimeTeam/greptimedb">
-<img src="https://codecov.io/gh/GrepTimeTeam/greptimedb/branch/main/graph/badge.svg?token=FITFDI3J3C" alt="Codecov"/>
+<a href="https://codecov.io/gh/GreptimeTeam/greptimedb">
+<img src="https://codecov.io/gh/GreptimeTeam/greptimedb/branch/main/graph/badge.svg?token=FITFDI3J3C" alt="Codecov"/>
 </a>
-<a href="https://github.com/greptimeTeam/greptimedb/blob/main/LICENSE">
-<img src="https://img.shields.io/github/license/greptimeTeam/greptimedb" alt="License"/>
+<a href="https://github.com/GreptimeTeam/greptimedb/blob/main/LICENSE">
+<img src="https://img.shields.io/github/license/GreptimeTeam/greptimedb" alt="License"/>
 </a>
 
 <br/>
@@ -51,7 +51,8 @@ replacing Prometheus, Loki, and Elasticsearch</h2>
 </div>
 
 - [Introduction](#introduction)
-- [⭐ Key Features](#features)
+- [Overview](#overview)
+- [Features](#features)
 - [How GreptimeDB Compares](#how-greptimedb-compares)
 - [Architecture](#architecture)
 - [Try GreptimeDB](#try-greptimedb)
@@ -69,37 +70,47 @@ replacing Prometheus, Loki, and Elasticsearch</h2>
 
 **GreptimeDB** is an open-source observability database built for [Observability 2.0](https://docs.greptime.com/user-guide/concepts/observability-2/) — treating metrics, logs, and traces as one unified data model (wide events) instead of three separate pillars.
 
-Use it as the single OpenTelemetry backend — replacing Prometheus, Loki, and Elasticsearch with one database built on object storage. Query with SQL and PromQL, scale without pain, cut costs up to 50x.
+Use it as the single OpenTelemetry backend — replacing Prometheus, Loki, and Elasticsearch with one database built on object storage. Query with SQL and PromQL, scale without pain, cut costs by up to 50×.
+
+## Overview
+
+A quick overview of what GreptimeDB ingests, how it connects to other systems, and what its distributed engine lets you do.
+
+<p align="center">
+  <a href="https://github.com/GreptimeTeam/greptimedb/raw/main/docs/overview.png" target="_blank" rel="noopener">
+    <img alt="GreptimeDB Overview" src="docs/overview.png" width="900px">
+  </a>
+</p>
 
 ## Features
 
-|   Feature  | Description |
-| --------- | ----------- |
-| Drop-in replacement | [PromQL](https://docs.greptime.com/user-guide/query-data/promql/), [Prometheus remote write](https://docs.greptime.com/user-guide/ingest-data/for-observability/prometheus/), [Jaeger](https://docs.greptime.com/user-guide/query-data/jaeger/), and [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) native. Use as your single backend for all three signals, or migrate one at a time.|
-| 50x lower cost | Object storage (S3, GCS, Azure Blob etc.) as [primary storage](https://docs.greptime.com/user-guide/deployments-administration/configuration/#storage-options). Compute-storage separation scales without pain.|
-| SQL + PromQL | Monitor with [PromQL](https://docs.greptime.com/user-guide/query-data/promql), analyze with [SQL](https://docs.greptime.com/user-guide/query-data/sql). One database replaces Prometheus + your data warehouse.|
-| Sub-second at PB-EB scale | Columnar engine with [fulltext, inverted, and skipping indexes](https://docs.greptime.com/user-guide/manage-data/data-index). Written in Rust.|
+| Feature | Description |
+|---------|-------------|
+| **Observability 2.0 native** | Logs, metrics, and traces in one engine with [SQL + PromQL](https://docs.greptime.com/user-guide/query-data/overview/). Native [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/), [Prometheus remote write](https://docs.greptime.com/user-guide/ingest-data/for-observability/prometheus/), and [Jaeger](https://docs.greptime.com/user-guide/query-data/jaeger/). Migrate one signal at a time, or use as a single backend. |
+| **Elastic compute-storage separation** | Scale reads independently with horizontal replicas. Serve high-concurrency workloads from dashboards, alerting, and AI agents — without resharding or data migration. |
+| **Sub-second on PB–EB-scale data** | Columnar engine with [fulltext, inverted, and skipping indexes](https://docs.greptime.com/user-guide/manage-data/data-index). Written in Rust. Designed for high-concurrency point queries, not just analytical scans. |
+| **50× lower cost** | Object storage (S3, GCS, Azure Blob) as [primary storage](https://docs.greptime.com/user-guide/deployments-administration/configuration/#storage-options), with a tiered cache (memory + local disk) to keep writes and queries fast. |
 
-  ✅ **Perfect for:**
-  * Replacing Prometheus + Loki + Elasticsearch with one database
+**Perfect for:**
+  * Replacing Prometheus + Loki + Elasticsearch with a single observability backend
   * Scaling past Prometheus — high cardinality, long-term storage, no Thanos/Mimir overhead
-  * Cutting observability costs with object storage (up to 50x savings on traces, 30% on logs)
-  * AI/LLM observability — store and analyze high-volume conversation data, agent traces, and token metrics via [OpenTelemetry GenAI conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)
+  * AI/agent workloads — store GenAI telemetry ([OTel GenAI conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)) and serve high-concurrency reads from SRE/developer agents via horizontal read replicas
+  * Cutting observability costs with object storage (up to 50× savings on traces, 30% on logs)
   * Edge-to-cloud observability with unified APIs on resource-constrained devices
 
-> **Why Observability 2.0?** The three-pillar model (separate databases for metrics, logs, traces) creates data silos and operational complexity. GreptimeDB treats all observability data as timestamped wide events in a single columnar engine — enabling cross-signal SQL JOINs, eliminating redundant infrastructure, and naturally supporting emerging workloads like AI agent observability. Read more: [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).
+> **Why Observability 2.0?** Three separate databases for metrics, logs, and traces means three storage layers, three query languages, and three sets of dashboards. GreptimeDB stores all three as timestamped wide events in one columnar engine — JOIN across signals in SQL, run one stack instead of three, and ingest AI agent telemetry the same way. Read more: [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).
 
 Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb).
 
 ## How GreptimeDB Compares
 
-| Feature | GreptimeDB | Prometheus / Thanos / Mimir | Grafana Loki | Elasticsearch |
+| Capability | GreptimeDB | Prometheus / Thanos / Mimir | Grafana Loki | Elasticsearch |
 |---|---|---|---|---|
 | Data types | Metrics, logs, traces | Metrics only | Logs only | Logs, traces |
 | Query language | SQL + PromQL | PromQL | LogQL | Query DSL |
 | Storage | Native object storage (S3, etc.) | Local disk + object storage (Thanos/Mimir) | Object storage (chunks) | Local disk |
 | Scaling | Compute-storage separation, stateless nodes | Federation / Thanos / Mimir — multi-component, ops heavy | Stateless + object storage | Shard-based, ops heavy |
-| Cost efficiency | Up to 50x lower storage | High at scale | Moderate | High (inverted index overhead) |
+| Cost efficiency | Up to 50× lower storage cost | High at scale | Moderate | High (inverted index overhead) |
 | OpenTelemetry | Native (metrics + logs + traces) | Partial (metrics only) | Partial (logs only) | Via instrumentation |
 
 **Benchmarks:**
@@ -110,19 +121,26 @@ Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why
 ## Architecture
 
 GreptimeDB can run in two modes:
-* **Standalone Mode** - Single binary for development and small deployments
-* **Distributed Mode** - Separate components for production scale:
-  - Frontend: Query processing and protocol handling
-  - Datanode: Data storage and retrieval
-  - Metasrv: Metadata management and coordination
-  
-Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB:
-  <img alt="GreptimeDB System Overview" src="docs/architecture.png">
+* **Standalone** — single binary for development and small deployments.
+* **Distributed** — four components, each independently scalable:
+  - **Frontend** — protocol entry (OTel, Prometheus, MySQL/PostgreSQL, gRPC, ingestion APIs for Elasticsearch/InfluxDB/Loki) and the distributed query engine. Stateless, scales horizontally.
+  - **Datanode** — region engine with WAL, memtable, SST, cache, compaction, and indexes. Persists data to object storage and scales elastically.
+  - **Metasrv** — metadata, routing, repartitioning, autopilot, and security. Backed by a pluggable KV layer (etcd, or a relational database such as MySQL or PostgreSQL).
+  - **Flownode** (optional) — continuous flow computation (streaming and materialized views).
+
+For deeper coverage, see the [architecture doc](https://docs.greptime.com/contributor-guide/overview/#architecture) or [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview).
+
+<a href="https://github.com/GreptimeTeam/greptimedb/raw/main/docs/architecture.png" target="_blank" rel="noopener">
+  <img alt="GreptimeDB System Overview" src="https://cdn.jsdelivr.net/gh/GreptimeTeam/greptimedb@main/docs/architecture.png">
+</a>
 
 ## Try GreptimeDB
 
-```shell
-docker pull greptime/greptimedb
+**For AI agents** — paste this prompt into your agent:
+
+```text
+Read https://docs.greptime.com/SKILL.md and follow the instructions
+to deploy, configure, ingest, and query GreptimeDB.
 ```
 
 ```shell
@@ -131,7 +149,7 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \
   --name greptime --rm \
   greptime/greptimedb:latest standalone start \
   --http-addr 0.0.0.0:4000 \
-  --grpc-bind-addr 0.0.0.0:4001 \
+  --rpc-bind-addr 0.0.0.0:4001 \
   --mysql-addr 0.0.0.0:4002 \
   --postgres-addr 0.0.0.0:4003
 ```
@@ -153,20 +171,30 @@ Read more in the [full Install Guide](https://docs.greptime.com/getting-started/
 ## Build From Source
 
 **Prerequisites:**
-* [Rust toolchain](https://www.rust-lang.org/tools/install) (nightly)
+* [Rust toolchain](https://www.rust-lang.org/tools/install) — nightly, pinned by [`rust-toolchain.toml`](https://github.com/GreptimeTeam/greptimedb/blob/main/rust-toolchain.toml)
 * [Protobuf compiler](https://grpc.io/docs/protoc-installation/) (>= 3.15)
-* C/C++ building essentials, including `gcc`/`g++`/`autoconf` and glibc library (eg. `libc6-dev` on Ubuntu and `glibc-devel` on Fedora)
-* Python toolchain (optional): Required only if using some test scripts.
+* C/C++ build essentials: `gcc` / `g++` / `autoconf` and the glibc development package (`libc6-dev` on Ubuntu, `glibc-devel` on Fedora)
+* Python toolchain (optional, only for some test scripts)
 
-**Build and Run:**
+**Build and run:**
 ```bash
-make
-cargo run -- standalone start
+make                          # build greptime binary
+cargo run -- standalone start # start in standalone mode
 ```
 
+**Common dev commands:**
+```bash
+make fmt            # format Rust code
+make clippy         # lint (fails on warnings)
+make test           # unit + integration tests (uses cargo-nextest)
+make sqlness-test   # SQL regression tests
+```
+
+See the [Contribution Guidelines](CONTRIBUTING.md) for the full developer workflow.
+
 ## Tools & Extensions
 
-- **Kubernetes**: [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator)
+- **Kubernetes**: [GreptimeDB Operator](https://github.com/GreptimeTeam/greptimedb-operator)
 - **Helm Charts**: [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts)
 - **Dashboard**: [Web UI](https://github.com/GreptimeTeam/dashboard)
 - **gRPC Ingester**: [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust), [.NET](https://github.com/GreptimeTeam/greptimedb-ingester-dotnet)
@@ -175,18 +203,11 @@ cargo run -- standalone start
 
 ## Project Status
 
-> **Status:** [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) — generally available and production-ready! 🎉
+GreptimeDB is at [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) with stable APIs and regular releases. It runs in production at scale — [OceanBase Cloud](https://greptime.com/blogs/2025-07-22-user-case-obcloud-log-management-greptimedb) operates 80+ GreptimeDB clusters managing 300 TB of logs, cutting log storage cost by 60% after migrating from Grafana Loki. See more in [case studies](https://greptime.com/blogs/?category=Use%20Case).
 
-- Deployed in production handling billions of data points daily
-- Stable APIs, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version))
+Read the [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026), or browse the [version reference](https://docs.greptime.com/nightly/reference/about-greptimedb-version).
 
-GreptimeDB v1.0 marks a major milestone — stable APIs, production readiness, and proven performance at scale.
-
-**Learn more:** [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026).
-
-For production use, we recommend v1.0 or later.
-
-If you find this project useful, a ⭐ would mean a lot to us!
+If GreptimeDB is useful to you, please star the repo.
 
 [![Star History Chart](https://api.star-history.com/svg?repos=GreptimeTeam/GreptimeDB&type=Date)](https://www.star-history.com/#GreptimeTeam/GreptimeDB&Date)
 
@@ -216,15 +237,19 @@ We offer enterprise add-ons, services, training, and consulting.
 
 ## Contributing
 
-- Read our [Contribution Guidelines](https://github.com/GreptimeTeam/greptimedb/blob/main/CONTRIBUTING.md).
+- Read our [Contribution Guidelines](CONTRIBUTING.md).
 - Explore [Internal Concepts](https://docs.greptime.com/contributor-guide/overview.html) and [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb).
 - Pick up a [good first issue](https://github.com/GreptimeTeam/greptimedb/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and join the #contributors [Slack](https://greptime.com/slack) channel.
 
 ## Acknowledgement
 
-Special thanks to all contributors! See [AUTHORS.md](https://github.com/GreptimeTeam/greptimedb/blob/main/AUTHOR.md).
+Special thanks to all contributors! See [AUTHOR.md](AUTHOR.md).
 
 - Uses [Apache Arrow™](https://arrow.apache.org/) (memory model)
 - [Apache Parquet™](https://parquet.apache.org/) (file storage)
-- [Apache DataFusion™](https://arrow.apache.org/datafusion/) (query engine)
+- [Apache DataFusion™](https://datafusion.apache.org/) (query engine)
 - [Apache OpenDAL™](https://opendal.apache.org/) (data access abstraction)
+
+---
+
+*All trademarks, logos, and brand names referenced in this README and in the Overview diagram are the property of their respective owners. Their use is for identification purposes only and does not imply endorsement or affiliation.*
diff --git a/config/config.md b/config/config.md
index b1630d97ad..0fae0caaa4 100644
--- a/config/config.md
+++ b/config/config.md
@@ -155,6 +155,8 @@
 | `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
 | `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
 | `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.range_result_cache_size` | String | Auto | Cache size for flat range scan results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.prefilter_result_cache_size` | String | Auto | Cache size for prefilter results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
 | `region_engine.mito.enable_write_cache` | Bool | `false` | Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. |
 | `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
 | `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
@@ -543,6 +545,8 @@
 | `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
 | `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
 | `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.range_result_cache_size` | String | Auto | Cache size for flat range scan results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.prefilter_result_cache_size` | String | Auto | Cache size for prefilter results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
 | `region_engine.mito.enable_write_cache` | Bool | `false` | Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. |
 | `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
 | `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
diff --git a/config/datanode.example.toml b/config/datanode.example.toml
index 170045a090..d558918daf 100644
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -480,6 +480,16 @@ auto_flush_interval = "1h"
 ## @toml2docs:none-default="Auto"
 #+ selector_result_cache_size = "512MB"
 
+## Cache size for flat range scan results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
+## @toml2docs:none-default="Auto"
+#+ range_result_cache_size = "512MB"
+
+## Cache size for prefilter results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
+## @toml2docs:none-default="Auto"
+#+ prefilter_result_cache_size = "128MB"
+
 ## Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance.
 enable_write_cache = false
 
diff --git a/config/standalone.example.toml b/config/standalone.example.toml
index 24249270b2..d5c42e744c 100644
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -599,6 +599,16 @@ auto_flush_interval = "1h"
 ## @toml2docs:none-default="Auto"
 #+ selector_result_cache_size = "512MB"
 
+## Cache size for flat range scan results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
+## @toml2docs:none-default="Auto"
+#+ range_result_cache_size = "512MB"
+
+## Cache size for prefilter results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
+## @toml2docs:none-default="Auto"
+#+ prefilter_result_cache_size = "128MB"
+
 ## Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance.
 enable_write_cache = false
 
diff --git a/docs/architecture.png b/docs/architecture.png
index 992b6c856d..697292ef2f 100644
Binary files a/docs/architecture.png and b/docs/architecture.png differ
diff --git a/docs/overview.png b/docs/overview.png
new file mode 100644
index 0000000000..5ab20834a4
Binary files /dev/null and b/docs/overview.png differ
diff --git a/src/auth/src/permission.rs b/src/auth/src/permission.rs
index 88adfda633..8914635290 100644
--- a/src/auth/src/permission.rs
+++ b/src/auth/src/permission.rs
@@ -16,6 +16,7 @@ use std::fmt::Debug;
 use std::sync::Arc;
 
 use api::v1::greptime_request::Request;
+use api::v1::query_request::Query;
 use common_telemetry::debug;
 use sql::statements::statement::Statement;
 
@@ -42,10 +43,12 @@ impl<'a> PermissionReq<'a> {
     /// Returns true if the permission request is for read operations.
     pub fn is_readonly(&self) -> bool {
         match self {
-            PermissionReq::GrpcRequest(Request::Query(_))
-            | PermissionReq::PromQuery
-            | PermissionReq::LogQuery
-            | PermissionReq::PromStoreRead => true,
+            PermissionReq::GrpcRequest(Request::Query(query_request)) => {
+                !matches!(query_request.query, Some(Query::InsertIntoPlan(_)))
+            }
+            PermissionReq::PromQuery | PermissionReq::LogQuery | PermissionReq::PromStoreRead => {
+                true
+            }
             PermissionReq::SqlStatement(stmt) => stmt.is_readonly(),
 
             PermissionReq::GrpcRequest(_)
@@ -196,4 +199,14 @@ mod tests {
         assert!(matches!(read_result, PermissionResp::Reject));
         assert!(matches!(write_result, PermissionResp::Allow));
     }
+
+    #[test]
+    fn test_grpc_insert_into_plan_is_write_request() {
+        let request = Request::Query(api::v1::QueryRequest {
+            query: Some(Query::InsertIntoPlan(api::v1::InsertIntoPlan::default())),
+        });
+        let req = PermissionReq::GrpcRequest(&request);
+
+        assert!(req.is_write());
+    }
 }
diff --git a/src/catalog/src/system_schema/information_schema.rs b/src/catalog/src/system_schema/information_schema.rs
index 9715aa9402..a35950194c 100644
--- a/src/catalog/src/system_schema/information_schema.rs
+++ b/src/catalog/src/system_schema/information_schema.rs
@@ -20,6 +20,7 @@ pub mod key_column_usage;
 mod partitions;
 mod procedure_info;
 pub mod process_list;
+mod region_info;
 pub mod region_peers;
 mod region_statistics;
 pub mod schemata;
@@ -47,6 +48,8 @@ use datatypes::schema::SchemaRef;
 use lazy_static::lazy_static;
 use paste::paste;
 use process_list::InformationSchemaProcessList;
+use region_info::InformationSchemaRegionInfo;
+use store_api::region_info::RegionInfoEntry;
 use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry};
 use store_api::storage::{ScanRequest, TableId};
 use table::TableRef;
@@ -242,6 +245,9 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
                     self.catalog_manager.clone(),
                 ),
             ) as _),
+            REGION_INFO => Some(Arc::new(InformationSchemaRegionInfo::new(
+                self.catalog_manager.clone(),
+            )) as _),
             PROCESS_LIST => self
                 .process_manager
                 .as_ref()
@@ -320,6 +326,10 @@ impl InformationSchemaProvider {
                 REGION_STATISTICS.to_string(),
                 self.build_table(REGION_STATISTICS).unwrap(),
             );
+            tables.insert(
+                REGION_INFO.to_string(),
+                self.build_table(REGION_INFO).unwrap(),
+            );
             tables.insert(
                 SSTS_MANIFEST.to_string(),
                 self.build_table(SSTS_MANIFEST).unwrap(),
@@ -447,6 +457,8 @@ pub enum DatanodeInspectKind {
     SstStorage,
     /// List index metadata collected from manifest
     SstIndexMeta,
+    /// List region runtime and manifest info
+    RegionInfo,
 }
 
 impl DatanodeInspectRequest {
@@ -456,6 +468,7 @@ impl DatanodeInspectRequest {
             DatanodeInspectKind::SstManifest => ManifestSstEntry::build_plan(self.scan),
             DatanodeInspectKind::SstStorage => StorageSstEntry::build_plan(self.scan),
             DatanodeInspectKind::SstIndexMeta => PuffinIndexMetaEntry::build_plan(self.scan),
+            DatanodeInspectKind::RegionInfo => RegionInfoEntry::build_plan(self.scan),
         }
     }
 }
@@ -488,3 +501,28 @@ impl InformationExtension for NoopInformationExtension {
         Ok(common_recordbatch::RecordBatches::empty().as_stream())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use store_api::region_info::RegionInfoEntry;
+
+    use super::*;
+
+    #[test]
+    fn test_datanode_inspect_region_info_build_plan() {
+        let plan = DatanodeInspectRequest {
+            kind: DatanodeInspectKind::RegionInfo,
+            scan: ScanRequest::default(),
+        }
+        .build_plan()
+        .unwrap();
+
+        let LogicalPlan::TableScan(scan) = plan else {
+            panic!("expected table scan");
+        };
+        assert_eq!(
+            scan.table_name.to_string(),
+            RegionInfoEntry::reserved_table_name_for_inspection()
+        );
+    }
+}
diff --git a/src/catalog/src/system_schema/information_schema/region_info.rs b/src/catalog/src/system_schema/information_schema/region_info.rs
new file mode 100644
index 0000000000..ffc9dfc7ae
--- /dev/null
+++ b/src/catalog/src/system_schema/information_schema/region_info.rs
@@ -0,0 +1,86 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::{Arc, Weak};
+
+use common_catalog::consts::INFORMATION_SCHEMA_REGION_INFO_TABLE_ID;
+use common_error::ext::BoxedError;
+use common_recordbatch::SendableRecordBatchStream;
+use common_recordbatch::adapter::AsyncRecordBatchStreamAdapter;
+use datatypes::schema::SchemaRef;
+use snafu::ResultExt;
+use store_api::region_info::RegionInfoEntry;
+use store_api::storage::{ScanRequest, TableId};
+
+use crate::CatalogManager;
+use crate::error::{ProjectSchemaSnafu, Result};
+use crate::information_schema::{
+    DatanodeInspectKind, DatanodeInspectRequest, InformationTable, REGION_INFO,
+};
+use crate::system_schema::utils;
+
+/// Information schema table for region info.
+pub struct InformationSchemaRegionInfo {
+    schema: SchemaRef,
+    catalog_manager: Weak<dyn CatalogManager>,
+}
+
+impl InformationSchemaRegionInfo {
+    pub(super) fn new(catalog_manager: Weak<dyn CatalogManager>) -> Self {
+        Self {
+            schema: RegionInfoEntry::schema(),
+            catalog_manager,
+        }
+    }
+}
+
+impl InformationTable for InformationSchemaRegionInfo {
+    fn table_id(&self) -> TableId {
+        INFORMATION_SCHEMA_REGION_INFO_TABLE_ID
+    }
+
+    fn table_name(&self) -> &'static str {
+        REGION_INFO
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
+        let schema = if let Some(p) = request.projection_indices() {
+            Arc::new(self.schema.try_project(p).context(ProjectSchemaSnafu)?)
+        } else {
+            self.schema.clone()
+        };
+
+        let info_ext = utils::information_extension(&self.catalog_manager)?;
+        let req = DatanodeInspectRequest {
+            kind: DatanodeInspectKind::RegionInfo,
+            scan: request,
+        };
+
+        let future = async move {
+            info_ext
+                .inspect_datanode(req)
+                .await
+                .map_err(BoxedError::new)
+                .context(common_recordbatch::error::ExternalSnafu)
+        };
+        Ok(Box::pin(AsyncRecordBatchStreamAdapter::new(
+            schema,
+            Box::pin(future),
+        )))
+    }
+}
diff --git a/src/catalog/src/system_schema/information_schema/table_names.rs b/src/catalog/src/system_schema/information_schema/table_names.rs
index 2a3329fece..3a4c86487a 100644
--- a/src/catalog/src/system_schema/information_schema/table_names.rs
+++ b/src/catalog/src/system_schema/information_schema/table_names.rs
@@ -45,6 +45,7 @@ pub const CLUSTER_INFO: &str = "cluster_info";
 pub const VIEWS: &str = "views";
 pub const FLOWS: &str = "flows";
 pub const PROCEDURE_INFO: &str = "procedure_info";
+pub const REGION_INFO: &str = "region_info";
 pub const REGION_STATISTICS: &str = "region_statistics";
 pub const PROCESS_LIST: &str = "process_list";
 pub const SSTS_MANIFEST: &str = "ssts_manifest";
diff --git a/src/catalog/src/table_source/dummy_catalog.rs b/src/catalog/src/table_source/dummy_catalog.rs
index db49db0eed..20637c3a3a 100644
--- a/src/catalog/src/table_source/dummy_catalog.rs
+++ b/src/catalog/src/table_source/dummy_catalog.rs
@@ -22,6 +22,7 @@ use async_trait::async_trait;
 use common_catalog::format_full_table_name;
 use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider};
 use datafusion::datasource::TableProvider;
+use session::context::QueryContextRef;
 use snafu::OptionExt;
 use table::table::adapter::DfTableProviderAdapter;
 
@@ -32,12 +33,27 @@ use crate::error::TableNotExistSnafu;
 #[derive(Clone)]
 pub struct DummyCatalogList {
     catalog_manager: CatalogManagerRef,
+    query_ctx: Option<QueryContextRef>,
 }
 
 impl DummyCatalogList {
-    /// Creates a new catalog list with the given catalog manager.
+    /// Creates a new catalog list with the given catalog manager (no query context).
     pub fn new(catalog_manager: CatalogManagerRef) -> Self {
-        Self { catalog_manager }
+        Self {
+            catalog_manager,
+            query_ctx: None,
+        }
+    }
+
+    /// Creates a new catalog list with the given catalog manager and query context.
+    pub fn new_with_query_ctx(
+        catalog_manager: CatalogManagerRef,
+        query_ctx: QueryContextRef,
+    ) -> Self {
+        Self {
+            catalog_manager,
+            query_ctx: Some(query_ctx),
+        }
     }
 }
 
@@ -68,6 +84,7 @@ impl CatalogProviderList for DummyCatalogList {
         Some(Arc::new(DummyCatalogProvider {
             catalog_name: catalog_name.to_string(),
             catalog_manager: self.catalog_manager.clone(),
+            query_ctx: self.query_ctx.clone(),
         }))
     }
 }
@@ -77,6 +94,7 @@ impl CatalogProviderList for DummyCatalogList {
 struct DummyCatalogProvider {
     catalog_name: String,
     catalog_manager: CatalogManagerRef,
+    query_ctx: Option<QueryContextRef>,
 }
 
 impl CatalogProvider for DummyCatalogProvider {
@@ -93,6 +111,7 @@ impl CatalogProvider for DummyCatalogProvider {
             catalog_name: self.catalog_name.clone(),
             schema_name: schema_name.to_string(),
             catalog_manager: self.catalog_manager.clone(),
+            query_ctx: self.query_ctx.clone(),
         }))
     }
 }
@@ -111,6 +130,7 @@ struct DummySchemaProvider {
     catalog_name: String,
     schema_name: String,
     catalog_manager: CatalogManagerRef,
+    query_ctx: Option<QueryContextRef>,
 }
 
 #[async_trait]
@@ -126,7 +146,12 @@ impl SchemaProvider for DummySchemaProvider {
     async fn table(&self, name: &str) -> datafusion::error::Result<Option<Arc<dyn TableProvider>>> {
         let table = self
             .catalog_manager
-            .table(&self.catalog_name, &self.schema_name, name, None)
+            .table(
+                &self.catalog_name,
+                &self.schema_name,
+                name,
+                self.query_ctx.as_deref(),
+            )
             .await?
             .with_context(|| TableNotExistSnafu {
                 table: format_full_table_name(&self.catalog_name, &self.schema_name, name),
diff --git a/src/cli/src/data/export_v2/command.rs b/src/cli/src/data/export_v2/command.rs
index 3c069a72be..db0f576a4e 100644
--- a/src/cli/src/data/export_v2/command.rs
+++ b/src/cli/src/data/export_v2/command.rs
@@ -15,6 +15,7 @@
 //! Export V2 CLI commands.
 
 use std::collections::HashSet;
+use std::io::{self, Write};
 use std::time::Duration;
 
 use async_trait::async_trait;
@@ -28,7 +29,7 @@ use crate::Tool;
 use crate::common::ObjectStoreConfig;
 use crate::data::export_v2::coordinator::export_data;
 use crate::data::export_v2::error::{
-    ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu,
+    ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu, IoSnafu,
     ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu,
     SchemaOnlyModeMismatchSnafu, SnapshotVerifyFailedSnafu, UnexpectedValueTypeSnafu,
 };
@@ -38,7 +39,9 @@ use crate::data::export_v2::manifest::{
 };
 use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR, SCHEMAS_FILE};
 use crate::data::path::{data_dir_for_schema_chunk, ddl_path_for_schema};
-use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::data::snapshot_storage::{
+    OpenDalStorage, SnapshotStorage, validate_snapshot_uri, validate_uri,
+};
 use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
 use crate::database::{DatabaseClient, parse_proxy_opts};
 
@@ -51,6 +54,8 @@ pub enum ExportV2Command {
     List(ExportListCommand),
     /// Verify snapshot integrity.
     Verify(ExportVerifyCommand),
+    /// Delete a snapshot and all data under it.
+    Delete(ExportDeleteCommand),
 }
 
 impl ExportV2Command {
@@ -59,6 +64,7 @@ impl ExportV2Command {
             ExportV2Command::Create(cmd) => cmd.build().await,
             ExportV2Command::List(cmd) => cmd.build().await,
             ExportV2Command::Verify(cmd) => cmd.build().await,
+            ExportV2Command::Delete(cmd) => cmd.build().await,
         }
     }
 }
@@ -172,6 +178,75 @@ impl ExportVerify {
     }
 }
 
+/// Delete a snapshot and all data under it.
+#[derive(Debug, Parser)]
+pub struct ExportDeleteCommand {
+    /// Snapshot storage location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    snapshot: String,
+
+    /// Skip interactive confirmation.
+    #[clap(long = "no-confirm", alias = "yes")]
+    skip_confirmation: bool,
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ExportDeleteCommand {
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        validate_snapshot_uri(&self.snapshot).map_err(BoxedError::new)?;
+        let storage =
+            OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
+
+        Ok(Box::new(ExportDelete {
+            snapshot: self.snapshot.clone(),
+            skip_confirmation: self.skip_confirmation,
+            storage,
+        }))
+    }
+}
+
+/// Export delete tool implementation.
+pub struct ExportDelete {
+    snapshot: String,
+    skip_confirmation: bool,
+    storage: OpenDalStorage,
+}
+
+#[async_trait]
+impl Tool for ExportDelete {
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        self.run().await.map_err(BoxedError::new)
+    }
+}
+
+impl ExportDelete {
+    async fn run(&self) -> Result<()> {
+        self.run_with_confirmation(confirm_delete).await
+    }
+
+    async fn run_with_confirmation<F>(&self, confirm: F) -> Result<()>
+    where
+        F: FnOnce(&str) -> Result<bool>,
+    {
+        let manifest = self.storage.read_manifest().await?;
+        print_delete_summary(&self.snapshot, &manifest);
+
+        if !self.skip_confirmation && !confirm(&self.snapshot)? {
+            println!("Deletion cancelled.");
+            return Ok(());
+        }
+
+        println!("Deleting snapshot...");
+        self.storage.delete_snapshot().await?;
+        println!("Snapshot deleted successfully.");
+
+        Ok(())
+    }
+}
+
 /// Create a new snapshot.
 #[derive(Debug, Parser)]
 pub struct ExportCreateCommand {
@@ -1239,6 +1314,79 @@ fn print_verify_report(snapshot: &str, report: &VerifyReport) {
     );
 }
 
+fn print_delete_summary(snapshot: &str, manifest: &Manifest) {
+    println!("Snapshot: {}", manifest.snapshot_id);
+    println!("  Location: {}", snapshot);
+    println!(
+        "  Created:  {} UTC",
+        manifest.created_at.format("%Y-%m-%d %H:%M:%S")
+    );
+    println!("  Catalog:  {}", manifest.catalog);
+    println!("  Schemas:  {}", manifest.schemas.join(", "));
+    println!("  Chunks:   {}", format_delete_chunks(manifest));
+}
+
+fn format_delete_chunks(manifest: &Manifest) -> String {
+    if manifest.schema_only {
+        return "0 (schema-only)".to_string();
+    }
+
+    let summary = summarize_chunks(manifest);
+    if manifest.is_complete() {
+        format!("{} (all processed)", summary.total)
+    } else {
+        format!(
+            "{} ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
+            summary.total,
+            summary.completed,
+            summary.skipped,
+            summary.pending,
+            summary.in_progress,
+            summary.failed
+        )
+    }
+}
+
+fn confirm_delete(snapshot: &str) -> Result<bool> {
+    println!();
+    println!(
+        "Warning: this removes the entire snapshot directory/prefix, not only files listed in manifest."
+    );
+    println!("This will permanently delete all data under:");
+    println!("  {}", display_snapshot_prefix(snapshot));
+    print!("Type 'yes' to confirm deletion: ");
+    io::stdout().flush().map_err(|error| {
+        IoSnafu {
+            operation: "flushing delete confirmation prompt",
+            error,
+        }
+        .build()
+    })?;
+
+    let mut input = String::new();
+    io::stdin().read_line(&mut input).map_err(|error| {
+        IoSnafu {
+            operation: "reading delete confirmation",
+            error,
+        }
+        .build()
+    })?;
+
+    Ok(delete_confirmation_matches(&input))
+}
+
+fn delete_confirmation_matches(input: &str) -> bool {
+    input.trim() == "yes"
+}
+
+fn display_snapshot_prefix(snapshot: &str) -> String {
+    if snapshot.ends_with('/') {
+        snapshot.to_string()
+    } else {
+        format!("{}/", snapshot)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use chrono::TimeZone;
@@ -1563,6 +1711,7 @@ mod tests {
         );
         assert_eq!(snapshot_status(&complete), "complete");
         assert_eq!(format_list_chunks(&complete), "2/2");
+        assert_eq!(format_delete_chunks(&complete), "2 (all processed)");
 
         let incomplete = test_manifest(
             chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
@@ -1571,6 +1720,150 @@ mod tests {
         );
         assert_eq!(snapshot_status(&incomplete), "incomplete");
         assert_eq!(format_list_chunks(&incomplete), "1/2");
+        assert_eq!(
+            format_delete_chunks(&incomplete),
+            "2 (1 completed, 0 skipped, 1 pending, 0 in_progress, 0 failed)"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_delete_build_rejects_bucket_root_uri() {
+        // A bare bucket URI carries no snapshot path, so building must fail.
+        let args = ["export-v2-delete", "--snapshot", "s3://bucket", "--no-confirm"];
+        let command = ExportDeleteCommand::parse_from(args);
+
+        let Err(error) = command.build().await else {
+            panic!("building a delete command for a bucket root should fail");
+        };
+        let message = error.to_string();
+        assert!(message.contains("non-empty path"));
+    }
+
+    #[test]
+    fn test_delete_skip_confirmation_aliases() {
+        let no_confirm = ExportDeleteCommand::parse_from([
+            "export-v2-delete",
+            "--snapshot",
+            "s3://bucket/snapshot",
+            "--no-confirm",
+        ]);
+        assert!(no_confirm.skip_confirmation);
+        // `--yes` must set the same `skip_confirmation` field as `--no-confirm`.
+        let yes = ExportDeleteCommand::parse_from([
+            "export-v2-delete",
+            "--snapshot",
+            "s3://bucket/snapshot",
+            "--yes",
+        ]);
+        assert!(yes.skip_confirmation);
+    }
+
+    #[tokio::test]
+    async fn test_delete_snapshot_with_no_confirm_removes_snapshot_contents() {
+        let parent = tempdir().unwrap();
+        let snapshot = parent.path().join("snapshot");
+        let sibling = parent.path().join("sibling");
+        std::fs::create_dir_all(&snapshot).unwrap();
+        std::fs::create_dir_all(&sibling).unwrap();
+        std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
+        write_root_manifest(
+            &snapshot,
+            test_manifest(
+                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+                true,
+                true,
+            ),
+        );
+        write_snapshot_file(&snapshot, "schema/schemas.json", b"[]");
+        // Target only the snapshot directory; `sibling` lives next to it under `parent`.
+        let uri = Url::from_directory_path(&snapshot).unwrap().to_string();
+        let delete = ExportDelete {
+            snapshot: uri,
+            skip_confirmation: true,
+            storage: file_storage_for_dir(&snapshot),
+        };
+        // `skip_confirmation` is set, so the confirmation callback must never run.
+        delete
+            .run_with_confirmation(|_| unreachable!())
+            .await
+            .unwrap();
+        // Snapshot files are gone while the sibling directory's file is untouched.
+        assert!(!snapshot.join(MANIFEST_FILE).exists());
+        assert!(!snapshot.join("schema/schemas.json").exists());
+        assert!(sibling.join("keep.txt").exists());
+    }
+
+    #[tokio::test]
+    async fn test_delete_snapshot_requires_manifest() {
+        let dir = tempdir().unwrap();
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let delete = ExportDelete {
+            snapshot: uri,
+            skip_confirmation: true,
+            storage: file_storage_for_dir(dir.path()),
+        };
+        // No manifest was written to `dir`, so the delete is expected to fail.
+        let error = delete
+            .run_with_confirmation(|_| unreachable!())
+            .await
+            .err()
+            .unwrap()
+            .to_string();
+        // The error reports the missing snapshot and the directory is left in place.
+        assert!(error.contains("Snapshot not found"));
+        assert!(dir.path().exists());
+    }
+
+    #[tokio::test]
+    async fn test_delete_snapshot_cancels_without_exact_confirmation() {
+        let dir = tempdir().unwrap();
+        write_root_manifest(
+            dir.path(),
+            test_manifest(
+                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+                true,
+                true,
+            ),
+        );
+        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let delete = ExportDelete {
+            snapshot: uri.clone(),
+            skip_confirmation: false,
+            storage: file_storage_for_dir(dir.path()),
+        };
+        // The callback receives the snapshot URI and declines by returning `Ok(false)`.
+        delete
+            .run_with_confirmation(|snapshot| {
+                assert_eq!(snapshot, uri);
+                Ok(false)
+            })
+            .await
+            .unwrap();
+        // Declining is not an error, and the snapshot files are still present.
+        assert!(dir.path().join(MANIFEST_FILE).exists());
+        assert!(dir.path().join("schema/schemas.json").exists());
+    }
+
+    #[test]
+    fn test_delete_confirmation_requires_exact_yes() {
+        assert!(delete_confirmation_matches("yes"));
+        assert!(delete_confirmation_matches(" yes\n")); // surrounding whitespace is trimmed
+        assert!(!delete_confirmation_matches("YES")); // comparison is case-sensitive
+        assert!(!delete_confirmation_matches("y")); // abbreviations are not accepted
+        assert!(!delete_confirmation_matches("yes please")); // extra words are not accepted
+    }
+
+    #[test]
+    fn test_display_snapshot_prefix_adds_trailing_slash() {
+        let expected = "s3://bucket/snapshot/";
+        // A missing trailing slash is appended.
+        let appended = display_snapshot_prefix("s3://bucket/snapshot");
+        assert_eq!(appended, expected);
+
+        // An existing trailing slash is kept rather than doubled.
+        let unchanged = display_snapshot_prefix("s3://bucket/snapshot/");
+        assert_eq!(unchanged, expected);
     }
 
     #[tokio::test]
diff --git a/src/cli/src/data/export_v2/error.rs b/src/cli/src/data/export_v2/error.rs
index 8d9a53f186..e16e3a6176 100644
--- a/src/cli/src/data/export_v2/error.rs
+++ b/src/cli/src/data/export_v2/error.rs
@@ -71,6 +71,14 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("I/O error while {}: {}", operation, error))]
+    Io {
+        operation: &'static str,
+        error: std::io::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display(
         "Cannot resume snapshot with a different schema_only mode (existing: {}, requested: {}). Use --force to recreate.",
         existing_schema_only,
@@ -223,6 +231,8 @@ impl ErrorExt for Error {
             | Error::UnexpectedValueType { .. }
             | Error::UrlParse { .. } => StatusCode::Internal,
 
+            Error::Io { .. } => StatusCode::External,
+
             Error::Database { error, .. } => error.status_code(),
 
             Error::SnapshotNotFound { .. } => StatusCode::InvalidArguments,
diff --git a/src/cli/src/data/snapshot_storage.rs b/src/cli/src/data/snapshot_storage.rs
index da8fdf6ab1..93e211628a 100644
--- a/src/cli/src/data/snapshot_storage.rs
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -18,6 +18,7 @@
 //! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).
 
 use std::collections::BTreeSet;
+use std::path::Component;
 
 use async_trait::async_trait;
 use futures::TryStreamExt;
@@ -131,6 +132,92 @@ pub fn validate_uri(uri: &str) -> Result<StorageScheme> {
     StorageScheme::from_uri(uri)
 }
 
+/// Validates a URI for snapshot-scoped destructive operations.
+///
+/// Destructive commands must target a concrete snapshot directory, never a
+/// bucket/container root or filesystem root. Remote URIs only need a non-empty
+/// object prefix because the bucket/container already isolates the namespace;
+/// `file://` URIs need at least two non-root path segments. URIs carrying a
+/// query or fragment are rejected. Returns the detected scheme on success.
+pub fn validate_snapshot_uri(uri: &str) -> Result<StorageScheme> {
+    let scheme = validate_uri(uri)?;
+    reject_query_or_fragment(uri)?;
+    match scheme {
+        StorageScheme::File => validate_file_snapshot_uri(uri)?,
+        StorageScheme::S3 | StorageScheme::Oss | StorageScheme::Gcs | StorageScheme::Azblob => {
+            extract_remote_location_with_root_policy(uri, false)?;
+        }
+    }
+    Ok(scheme)
+}
+
+/// Fails with `InvalidUri` when `uri` carries a query string or a fragment.
+fn reject_query_or_fragment(uri: &str) -> Result<()> {
+    let parsed = Url::parse(uri).context(UrlParseSnafu)?;
+    match (parsed.query(), parsed.fragment()) {
+        (None, None) => Ok(()),
+        _ => InvalidUriSnafu {
+            uri,
+            reason: "snapshot URI must not include query or fragment",
+        }
+        .fail(),
+    }
+}
+
+/// Path-shape guard for `file://` snapshot URIs used by destructive operations.
+/// Rejects `.`/`..` segments and paths with fewer than two normal components;
+/// drive prefixes and root separators do not count toward depth. Symlinks are
+/// not resolved: delete still relies on the manifest check and confirmation.
+fn validate_file_snapshot_uri(uri: &str) -> Result<()> {
+    if has_explicit_dot_segment(uri) {
+        return InvalidUriSnafu {
+            uri,
+            reason: "file snapshot URI must not contain '.' or '..' path segments",
+        }
+        .fail();
+    }
+
+    let path = extract_file_path_from_uri(uri)?;
+    let mut normal_component_count = 0;
+
+    for component in std::path::Path::new(&path).components() {
+        match component {
+            Component::Normal(_) => normal_component_count += 1,
+            Component::CurDir | Component::ParentDir => {
+                return InvalidUriSnafu {
+                    uri,
+                    reason: "file snapshot URI must not contain '.' or '..' path segments",
+                }
+                .fail();
+            }
+            Component::Prefix(_) | Component::RootDir => {}
+        }
+    }
+
+    if normal_component_count < 2 {
+        return InvalidUriSnafu {
+            uri,
+            reason: "file snapshot URI must point to a directory at least two levels deep",
+        }
+        .fail();
+    }
+
+    Ok(())
+}
+
+/// Reports whether the raw URI text contains a literal `.` or `..` segment.
+///
+/// Defense in depth: this inspects the URI before `Url::to_file_path()` can
+/// normalize dot segments away. Anything from the first `?` or `#` onward is
+/// ignored. `validate_file_snapshot_uri` still checks `Path::components()`
+/// afterwards because URL decoding can reintroduce such segments.
+fn has_explicit_dot_segment(uri: &str) -> bool {
+    let path_end = uri.find(|c: char| matches!(c, '?' | '#'));
+    let path = &uri[..path_end.unwrap_or(uri.len())];
+
+    path.split('/').any(|segment| matches!(segment, "." | ".."))
+}
+
 fn schema_index_path() -> String {
     format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE)
 }
@@ -708,6 +795,43 @@ mod tests {
         assert!(OpenDalStorage::from_parent_uri("s3://bucket", &storage).is_ok());
     }
 
+    #[test]
+    fn test_validate_snapshot_uri_rejects_dangerous_roots() {
+        assert!(validate_snapshot_uri("s3://bucket").is_err()); // bucket/container roots
+        assert!(validate_snapshot_uri("s3://bucket/").is_err());
+        assert!(validate_snapshot_uri("oss://bucket").is_err());
+        assert!(validate_snapshot_uri("gs://bucket").is_err());
+        assert!(validate_snapshot_uri("azblob://container").is_err());
+        assert!(validate_snapshot_uri("s3://bucket/snapshot?version=1").is_err()); // query string
+        assert!(validate_snapshot_uri("file:///tmp/backup#fragment").is_err()); // fragment
+        assert!(validate_snapshot_uri("file:///").is_err()); // filesystem root
+        assert!(validate_snapshot_uri("file:///tmp").is_err()); // only one level deep
+        assert!(validate_snapshot_uri("file:///tmp/backup/.").is_err()); // dot segments
+        assert!(validate_snapshot_uri("file:///tmp/backup/..").is_err());
+    }
+
+    #[test]
+    fn test_validate_snapshot_uri_accepts_snapshot_paths() {
+        assert_eq!(
+            validate_snapshot_uri("s3://bucket/snapshots/prod").unwrap(),
+            StorageScheme::S3
+        );
+        // A `file://` URI for a directory created under a temp dir is accepted as well.
+        let dir = tempdir().unwrap();
+        let snapshot = dir.path().join("snapshot");
+        std::fs::create_dir_all(&snapshot).unwrap();
+        let uri = Url::from_directory_path(snapshot).unwrap().to_string();
+        assert_eq!(validate_snapshot_uri(&uri).unwrap(), StorageScheme::File);
+    }
+
+    #[cfg(windows)]
+    #[test]
+    fn test_validate_snapshot_uri_windows_drive_prefix_depth() {
+        assert!(validate_snapshot_uri("file:///C:/").is_err()); // drive root
+        assert!(validate_snapshot_uri("file:///C:/Users").is_err()); // one level below the drive
+        assert!(validate_snapshot_uri("file:///C:/Users/snapshot").is_ok()); // two levels deep
+    }
+
     #[cfg(not(windows))]
     #[test]
     fn test_extract_path_from_uri_unix_examples() {
diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs
index a298430c83..65f194d19f 100644
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -588,6 +588,8 @@ async fn build_cache_manager(
             .vector_cache_size(config.vector_cache_size.as_bytes())
             .page_cache_size(config.page_cache_size.as_bytes())
             .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+            .range_result_cache_size(config.range_result_cache_size.as_bytes())
+            .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
             .index_metadata_size(config.index.metadata_cache_size.as_bytes())
             .index_content_size(config.index.content_cache_size.as_bytes())
             .index_content_page_size(config.index.content_cache_page_size.as_bytes())
diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs
index b0601088cf..e0f2c673ff 100644
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -20,6 +20,7 @@ use std::{fs, path};
 
 use async_trait::async_trait;
 use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry};
+use catalog::CatalogManagerRef;
 use catalog::information_schema::InformationExtensionRef;
 use catalog::kvbackend::{CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder};
 use catalog::process_manager::ProcessManager;
@@ -28,7 +29,8 @@ use common_base::Plugins;
 use common_catalog::consts::{MIN_USER_FLOW_ID, MIN_USER_TABLE_ID};
 use common_config::{Configurable, metadata_store_dir};
 use common_error::ext::BoxedError;
-use common_meta::cache::LayeredCacheRegistryBuilder;
+use common_meta::DatanodeId;
+use common_meta::cache::{LayeredCacheRegistryBuilder, LayeredCacheRegistryRef};
 use common_meta::ddl::flow_meta::FlowMetadataAllocator;
 use common_meta::ddl::table_meta::TableMetadataAllocator;
 use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl};
@@ -53,8 +55,8 @@ use datanode::config::DatanodeOptions;
 use datanode::datanode::{Datanode, DatanodeBuilder};
 use datanode::region_server::RegionServer;
 use flow::{
-    FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker,
-    GrpcQueryHandlerWithBoxedError,
+    FlowDualEngineRef, FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient,
+    FrontendInvoker, GrpcQueryHandlerWithBoxedError,
 };
 use frontend::frontend::Frontend;
 use frontend::instance::StandaloneDatanodeManager;
@@ -124,8 +126,8 @@ pub struct Instance {
     frontend: Frontend,
     flownode: FlownodeInstance,
     procedure_manager: ProcedureManagerRef,
-    wal_provider: WalProviderRef,
     leader_services_controller: Box<dyn StandaloneLeaderServicesController>,
+    leader_services_context: LeaderServicesContext,
     // Keep the logging guard to prevent the worker from being dropped.
     _guard: Vec<WorkerGuard>,
 }
@@ -159,11 +161,7 @@ impl App for Instance {
         self.datanode.start_telemetry();
 
         self.leader_services_controller
-            .start(
-                self.procedure_manager.clone(),
-                self.wal_provider.clone(),
-                self.datanode.region_server(),
-            )
+            .start(self.leader_services_context.clone())
             .await?;
 
         plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
@@ -379,6 +377,8 @@ impl StartCommand {
         opts.grpc.detect_server_addr();
         let fe_opts = opts.frontend_options();
         let dn_opts = opts.datanode_options();
+        let node_id = dn_opts.node_id;
+        let init_regions_parallelism = dn_opts.init_regions_parallelism;
 
         plugins::setup_frontend_plugins(&mut plugins, &plugin_opts, &fe_opts)
             .await
@@ -491,21 +491,18 @@ impl StartCommand {
             .await
             .map_err(BoxedError::new)
             .context(error::OtherSnafu)?;
+        let flow_engine = flownode.flow_engine();
 
         // set the ref to query for the local flow state
         {
             information_extension
-                .set_flow_engine(flownode.flow_engine())
+                .set_flow_engine(flow_engine.clone())
                 .await;
         }
 
         let node_manager = creator
             .node_manager_creator
-            .create(
-                &kv_backend,
-                datanode.region_server(),
-                flownode.flow_engine(),
-            )
+            .create(&kv_backend, datanode.region_server(), flow_engine.clone())
             .await?;
 
         let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend);
@@ -596,7 +593,7 @@ impl StartCommand {
             .await;
 
         // set the frontend invoker for flownode
-        let flow_streaming_engine = flownode.flow_engine().streaming_engine();
+        let flow_streaming_engine = flow_engine.streaming_engine();
         // flow server need to be able to use frontend to write insert requests back
         let invoker = FrontendInvoker::build_from(
             flow_streaming_engine.clone(),
@@ -620,14 +617,27 @@ impl StartCommand {
             servers,
             heartbeat_task: None,
         };
+        let leader_services_context = LeaderServicesContext {
+            procedure_manager: procedure_manager.clone(),
+            wal_provider: wal_provider.clone(),
+            region_server: datanode.region_server(),
+            kv_backend: kv_backend.clone(),
+            cache_registry: layered_cache_registry,
+            catalog_manager,
+            flow_engine,
+            frontend_client,
+            node_id,
+            init_regions_parallelism,
+            plugin_options: plugin_opts,
+        };
 
         let instance = Instance {
             datanode,
             frontend,
             flownode,
             procedure_manager,
-            wal_provider,
             leader_services_controller: creator.leader_services_controller,
+            leader_services_context,
             _guard: vec![],
         };
         let result = InstanceCreatorResult {
@@ -743,16 +753,11 @@ impl ProcedureExecutorCreator for DefaultProcedureExecutorCreator {
 
 #[async_trait]
 pub trait StandaloneLeaderServicesController: Send + Sync {
-    /// Starts services that manage standalone metadata or WAL state.
+    /// Starts leader services that manage standalone metadata or WAL state.
     ///
     /// The default implementation starts the procedure manager and WAL provider
     /// during instance startup.
-    async fn start(
-        &self,
-        procedure_manager: ProcedureManagerRef,
-        wal_provider: WalProviderRef,
-        region_server: RegionServer,
-    ) -> Result<()>;
+    async fn start(&self, context: LeaderServicesContext) -> Result<()>;
 
     /// Stops services started by [`StandaloneLeaderServicesController::start`].
     async fn stop(
@@ -762,21 +767,42 @@ pub trait StandaloneLeaderServicesController: Send + Sync {
     ) -> Result<()>;
 }
 
+/// Additional runtime handles for custom leader-service controllers.
+///
+/// The default standalone startup only needs to start/stop the procedure
+/// manager and WAL provider. Some embedders need to do more work around
+/// leader-service startup, for example reconciling metadata-backed runtime
+/// state before publishing writable leadership. Grouping those handles here
+/// keeps `Instance` small and avoids expanding
+/// [`StandaloneLeaderServicesController::start`] every time a custom lifecycle
+/// needs one more standalone component.
+#[derive(Clone)]
+pub struct LeaderServicesContext {
+    pub procedure_manager: ProcedureManagerRef,
+    pub wal_provider: WalProviderRef,
+    pub region_server: RegionServer,
+    pub kv_backend: KvBackendRef,
+    pub cache_registry: LayeredCacheRegistryRef,
+    pub catalog_manager: CatalogManagerRef,
+    pub flow_engine: FlowDualEngineRef,
+    pub frontend_client: Arc<FrontendClient>,
+    pub node_id: Option<DatanodeId>, // copied from the datanode options at startup
+    pub init_regions_parallelism: usize, // copied from the datanode options at startup
+    pub plugin_options: Vec<PluginOptions>,
+}
+
 pub struct DefaultStandaloneLeaderServicesController;
 
 #[async_trait]
 impl StandaloneLeaderServicesController for DefaultStandaloneLeaderServicesController {
-    async fn start(
-        &self,
-        procedure_manager: ProcedureManagerRef,
-        wal_provider: WalProviderRef,
-        _region_server: RegionServer,
-    ) -> Result<()> {
-        procedure_manager
+    async fn start(&self, context: LeaderServicesContext) -> Result<()> {
+        context
+            .procedure_manager
             .start()
             .await
             .context(error::StartProcedureManagerSnafu)?;
-        wal_provider
+        context
+            .wal_provider
             .start()
             .await
             .context(error::StartWalProviderSnafu)
diff --git a/src/common/catalog/src/consts.rs b/src/common/catalog/src/consts.rs
index 1cd5db8a0c..dd09893177 100644
--- a/src/common/catalog/src/consts.rs
+++ b/src/common/catalog/src/consts.rs
@@ -112,6 +112,8 @@ pub const INFORMATION_SCHEMA_SSTS_STORAGE_TABLE_ID: u32 = 38;
 pub const INFORMATION_SCHEMA_SSTS_INDEX_META_TABLE_ID: u32 = 39;
 /// id for information_schema.alerts
 pub const INFORMATION_SCHEMA_ALERTS_TABLE_ID: u32 = 40;
+/// id for information_schema.region_info
+pub const INFORMATION_SCHEMA_REGION_INFO_TABLE_ID: u32 = 41;
 
 // ----- End of information_schema tables -----
 
diff --git a/src/common/function/src/scalars/json/json_get_rewriter.rs b/src/common/function/src/scalars/json/json_get_rewriter.rs
index 137b307412..0143ee05d5 100644
--- a/src/common/function/src/scalars/json/json_get_rewriter.rs
+++ b/src/common/function/src/scalars/json/json_get_rewriter.rs
@@ -59,7 +59,10 @@ impl FunctionRewrite for JsonGetRewriter {
 //   json_get(column, path, <data_type>)
 // )
 fn inject_type_from_cast_expr(cast: Cast) -> Result<Transformed<Expr>> {
-    let Cast { expr, data_type } = cast;
+    let Cast {
+        expr,
+        mut data_type,
+    } = cast;
 
     let mut json_get = match *expr {
         Expr::ScalarFunction(f)
@@ -75,6 +78,9 @@ fn inject_type_from_cast_expr(cast: Cast) -> Result<Transformed<Expr>> {
         }
     };
 
+    if data_type.is_string() {
+        data_type = DataType::Utf8View;
+    }
     let with_type = ScalarValue::try_new_null(&data_type).map(|x| Expr::Literal(x, None))?;
     json_get.args.push(with_type);
     Ok(Transformed::yes(Expr::ScalarFunction(json_get)))
diff --git a/src/common/meta/src/cache/container.rs b/src/common/meta/src/cache/container.rs
index e3a3e13a76..e3a1a50adc 100644
--- a/src/common/meta/src/cache/container.rs
+++ b/src/common/meta/src/cache/container.rs
@@ -196,8 +196,8 @@ where
 #[async_trait::async_trait]
 impl<K, V> CacheInvalidator for CacheContainer<K, V, CacheIdent>
 where
-    K: Send + Sync,
-    V: Send + Sync,
+    K: Hash + Eq + Send + Sync + 'static,
+    V: Clone + Send + Sync + 'static,
 {
     async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
         let idents = caches
@@ -211,6 +211,12 @@ where
 
         Ok(())
     }
+
+    fn invalidate_all(&self) -> Result<()> {
+        self.inc_version(); // bumped before clearing — presumably marks in-flight loads stale; confirm
+        self.cache.invalidate_all();
+        Ok(())
+    }
 }
 
 impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>
diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs
index ebe3664202..4d3513a21d 100644
--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -210,7 +210,7 @@ mod tests {
     use crate::cache::flow::table_flownode::{FlowIdent, new_table_flownode_set_cache};
     use crate::instruction::{CacheIdent, CreateFlow, DropFlow};
     use crate::key::flow::FlowMetadataManager;
-    use crate::key::flow::flow_info::FlowInfoValue;
+    use crate::key::flow::flow_info::{FlowInfoValue, FlowStatus};
     use crate::key::flow::flow_route::FlowRouteValue;
     use crate::kv_backend::memory::MemoryKvBackend;
     use crate::peer::Peer;
@@ -242,11 +242,14 @@ mod tests {
                     catalog_name: DEFAULT_CATALOG_NAME.to_string(),
                     query_context: None,
                     flow_name: "my_flow".to_string(),
+                    all_source_table_names: vec![],
+                    unresolved_source_table_names: vec![],
                     raw_sql: "sql".to_string(),
                     expire_after: Some(300),
                     eval_interval_secs: None,
                     comment: "comment".to_string(),
                     options: Default::default(),
+                    status: FlowStatus::Active,
                     created_time: chrono::Utc::now(),
                     updated_time: chrono::Utc::now(),
                 },
diff --git a/src/common/meta/src/cache/registry.rs b/src/common/meta/src/cache/registry.rs
index d541525f98..b7ee82b6e5 100644
--- a/src/common/meta/src/cache/registry.rs
+++ b/src/common/meta/src/cache/registry.rs
@@ -67,6 +67,13 @@ impl CacheInvalidator for LayeredCacheRegistry {
         }
         results.into_iter().collect::<Result<Vec<_>>>().map(|_| ())
     }
+
+    fn invalidate_all(&self) -> Result<()> {
+        // Clear every layer in order even if one fails, then surface the first
+        // error, mirroring the per-layer `results` handling in `invalidate`.
+        let results: Vec<Result<()>> = self.layers.iter().map(|l| l.invalidate_all()).collect();
+        results.into_iter().collect()
+    }
 }
 
 impl LayeredCacheRegistry {
@@ -124,6 +131,13 @@ impl CacheInvalidator for CacheRegistry {
             .collect::<Result<Vec<_>>>()?;
         Ok(())
     }
+
+    fn invalidate_all(&self) -> Result<()> {
+        // Clears each registered invalidator in iteration order and stops at
+        // the first failure, returning that error to the caller.
+        let mut invalidators = self.indexes.iter();
+        invalidators.try_for_each(|invalidator| invalidator.invalidate_all())
+    }
 }
 
 impl CacheRegistry {
@@ -149,6 +163,8 @@ mod tests {
 
     use crate::cache::registry::CacheRegistryBuilder;
     use crate::cache::*;
+    use crate::cache_invalidator::{CacheInvalidator, Context};
+    use crate::error::Result;
     use crate::instruction::CacheIdent;
 
     fn always_true_filter(_: &CacheIdent) -> bool {
@@ -259,4 +275,91 @@ mod tests {
             .unwrap();
         assert_eq!(cache.name(), "string_cache");
     }
+
+    #[tokio::test]
+    async fn test_registry_invalidate_all() {
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let i32_cache = Arc::new(test_i32_cache("i32_cache", invalidator));
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let string_cache = Arc::new(test_cache("string_cache", invalidator));
+        // Load one entry into each cache so there is something to invalidate.
+        i32_cache.get(1).await.unwrap();
+        string_cache.get_by_ref("foo").await.unwrap();
+        assert!(i32_cache.contains_key(&1));
+        assert!(string_cache.contains_key("foo"));
+
+        let registry = CacheRegistryBuilder::default()
+            .add_cache(i32_cache.clone())
+            .add_cache(string_cache.clone())
+            .build();
+        // A single registry-level call is expected to clear every registered cache.
+        registry.invalidate_all().unwrap();
+
+        assert!(!i32_cache.contains_key(&1));
+        assert!(!string_cache.contains_key("foo"));
+    }
+
+    struct LayerOrderInvalidator {
+        expected_order: i32, // counter value this invalidator must observe in `invalidate_all`
+        order: Arc<AtomicI32>, // call counter shared between the invalidators under test
+    }
+
+    #[async_trait::async_trait]
+    impl CacheInvalidator for LayerOrderInvalidator {
+        async fn invalidate(&self, _ctx: &Context, _caches: &[CacheIdent]) -> Result<()> {
+            Ok(())
+        }
+
+        fn invalidate_all(&self) -> Result<()> {
+            let previous = self.order.fetch_add(1, Ordering::Relaxed); // value before the increment
+            assert_eq!(self.expected_order, previous); // panics if called out of order
+            Ok(())
+        }
+    }
+
+    #[tokio::test]
+    async fn test_layered_registry_invalidate_all() {
+        let order = Arc::new(AtomicI32::new(0));
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let first_layer_cache = Arc::new(test_cache("first_layer_cache", invalidator));
+        let first_layer_order = Arc::new(LayerOrderInvalidator {
+            expected_order: 0,
+            order: order.clone(),
+        });
+        let first_layer = CacheRegistryBuilder::default()
+            .add_cache(first_layer_order)
+            .add_cache(first_layer_cache.clone())
+            .build();
+
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let second_layer_cache = Arc::new(test_i32_cache("second_layer_cache", invalidator));
+        let second_layer_order = Arc::new(LayerOrderInvalidator {
+            expected_order: 1,
+            order: order.clone(),
+        });
+        let second_layer = CacheRegistryBuilder::default()
+            .add_cache(second_layer_order)
+            .add_cache(second_layer_cache.clone())
+            .build();
+        // Load one entry per layer so invalidation has an observable effect.
+        first_layer_cache.get_by_ref("foo").await.unwrap();
+        second_layer_cache.get(1).await.unwrap();
+        assert!(first_layer_cache.contains_key("foo"));
+        assert!(second_layer_cache.contains_key(&1));
+        // Stack the registries with `first_layer` added before `second_layer`.
+        let registry = LayeredCacheRegistryBuilder::default()
+            .add_cache_registry(first_layer)
+            .add_cache_registry(second_layer)
+            .build();
+
+        registry.invalidate_all().unwrap();
+        // Both order probes ran (each saw its expected position) and both caches are empty.
+        assert_eq!(2, order.load(Ordering::Relaxed));
+        assert!(!first_layer_cache.contains_key("foo"));
+        assert!(!second_layer_cache.contains_key(&1));
+    }
 }
diff --git a/src/common/meta/src/cache_invalidator.rs b/src/common/meta/src/cache_invalidator.rs
index ffc3dd1c9a..4fe0699ba5 100644
--- a/src/common/meta/src/cache_invalidator.rs
+++ b/src/common/meta/src/cache_invalidator.rs
@@ -55,6 +55,13 @@ pub struct Context {
 pub trait CacheInvalidator: Send + Sync {
     async fn invalidate(&self, ctx: &Context, caches: &[CacheIdent]) -> Result<()>;
 
+    /// Invalidates every cache entry owned by this invalidator.
+    ///
+    /// This method is required so each implementer explicitly decides how
+    /// full-cache invalidation should behave. Implementations that intentionally
+    /// do nothing must document why a no-op is safe.
+    fn invalidate_all(&self) -> Result<()>;
+
     fn name(&self) -> &'static str {
         std::any::type_name::<Self>()
     }
@@ -69,6 +76,11 @@ impl CacheInvalidator for DummyCacheInvalidator {
     async fn invalidate(&self, _ctx: &Context, _caches: &[CacheIdent]) -> Result<()> {
         Ok(())
     }
+
+    fn invalidate_all(&self) -> Result<()> {
+        // Dummy invalidator owns no cache state, so there is nothing to clear.
+        Ok(())
+    }
 }
 
 #[async_trait::async_trait]
@@ -157,4 +169,11 @@ where
         }
         Ok(())
     }
+
+    fn invalidate_all(&self) -> Result<()> {
+        // KvCacheInvalidator only knows how to invalidate explicit metadata
+        // keys. There is no safe generic way to enumerate or clear the backend
+        // keyspace, so full invalidation is intentionally a no-op here.
+        Ok(())
+    }
 }
diff --git a/src/common/meta/src/ddl/create_flow.rs b/src/common/meta/src/ddl/create_flow.rs
index 7120e50425..ddfb0c0759 100644
--- a/src/common/meta/src/ddl/create_flow.rs
+++ b/src/common/meta/src/ddl/create_flow.rs
@@ -14,7 +14,7 @@
 
 mod metadata;
 
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
 use std::fmt;
 
 use api::v1::ExpireAfter;
@@ -34,13 +34,14 @@ use serde::{Deserialize, Serialize};
 use snafu::{ResultExt, ensure};
 use strum::AsRefStr;
 use table::metadata::TableId;
+use table::table_name::TableName;
 
 use crate::cache_invalidator::Context;
 use crate::ddl::DdlContext;
 use crate::ddl::utils::{add_peer_context_if_needed, map_to_procedure_error};
 use crate::error::{self, Result, UnexpectedSnafu};
 use crate::instruction::{CacheIdent, CreateFlow, DropFlow};
-use crate::key::flow::flow_info::FlowInfoValue;
+use crate::key::flow::flow_info::{FlowInfoValue, FlowStatus};
 use crate::key::flow::flow_route::FlowRouteValue;
 use crate::key::table_name::TableNameKey;
 use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId};
@@ -67,6 +68,7 @@ impl CreateFlowProcedure {
                 flow_id: None,
                 peers: vec![],
                 source_table_ids: vec![],
+                unresolved_source_table_names: vec![],
                 flow_context: query_context.into(), // Convert to FlowQueryContext
                 state: CreateFlowState::Prepare,
                 prev_flow_info_value: None,
@@ -89,6 +91,8 @@ impl CreateFlowProcedure {
         let create_if_not_exists = self.data.task.create_if_not_exists;
         let or_replace = self.data.task.or_replace;
 
+        validate_flow_options(&self.data.task)?;
+
         let flow_name_value = self
             .context
             .flow_metadata_manager
@@ -167,6 +171,21 @@ impl CreateFlowProcedure {
         }
 
         self.collect_source_tables().await?;
+        ensure!(
+            self.data.unresolved_source_table_names.is_empty()
+                || defer_on_missing_source(&self.data.task)?,
+            error::UnsupportedSnafu {
+                operation: format!(
+                    "Create flow with missing source tables requires WITH ('{DEFER_ON_MISSING_SOURCE_KEY}'='true'): {}",
+                    self.data
+                        .unresolved_source_table_names
+                        .iter()
+                        .map(ToString::to_string)
+                        .join(", ")
+                )
+            }
+        );
+        self.ensure_supported_replace_transition()?;
 
         // Validate that source and sink tables are not the same
         let sink_table_name = &self.data.task.sink_table_name;
@@ -189,13 +208,38 @@ impl CreateFlowProcedure {
         if self.data.flow_id.is_none() {
             self.allocate_flow_id().await?;
         }
-        self.data.state = CreateFlowState::CreateFlows;
-        // determine flow type
         self.data.flow_type = Some(get_flow_type_from_options(&self.data.task)?);
 
+        self.data.state = if self.data.is_pending() {
+            self.data.peers.clear();
+            CreateFlowState::CreateMetadata
+        } else {
+            CreateFlowState::CreateFlows
+        };
+
         Ok(Status::executing(true))
     }
 
+    /// Rejects an `OR REPLACE` request that would move a previously recorded flow
+    /// from pending to active or from active to pending.
+    fn ensure_supported_replace_transition(&self) -> Result<()> {
+        // Without `or_replace` or without a previous flow info there is nothing to compare.
+        let prev_flow_info = match self.data.prev_flow_info_value.as_ref() {
+            Some(prev_flow_info) if self.data.task.or_replace => prev_flow_info,
+            _ => return Ok(()),
+        };
+
+        let was_pending = prev_flow_info.get_inner_ref().is_pending();
+        ensure!(
+            was_pending == self.data.is_pending(),
+            error::UnsupportedSnafu {
+                operation: "Replacing between pending and active flow states is not supported yet"
+            }
+        );
+
+        Ok(())
+    }
+
     async fn on_flownode_create_flows(&mut self) -> Result<Status> {
         // Safety: must be allocated.
         let mut create_flow = Vec::with_capacity(self.data.peers.len());
@@ -365,6 +409,61 @@ pub fn get_flow_type_from_options(flow_task: &CreateFlowTask) -> Result<FlowType
     }
 }
 
+/// The flow option key for creating pending flow metadata when source tables do not exist.
+pub const DEFER_ON_MISSING_SOURCE_KEY: &str = "defer_on_missing_source";
+
+/// Reads the `DEFER_ON_MISSING_SOURCE_KEY` flow option as a boolean.
+///
+/// A missing option yields `false`. The value is trimmed and lowercased before
+/// it is compared, and anything other than `true`/`false` is reported as an
+/// `Unexpected` error.
+pub fn defer_on_missing_source(flow_task: &CreateFlowTask) -> Result<bool> {
+    // No option set: deferring is opt-in.
+    let Some(value) = flow_task.flow_options.get(DEFER_ON_MISSING_SOURCE_KEY) else {
+        return Ok(false);
+    };
+
+    // Same set of accepted spellings as `str::parse::<bool>`.
+    match value.trim().to_ascii_lowercase().as_str() {
+        "true" => Ok(true),
+        "false" => Ok(false),
+        _ => error::UnexpectedSnafu {
+            err_msg: format!("Invalid flow option '{DEFER_ON_MISSING_SOURCE_KEY}': {value}"),
+        }
+        .fail(),
+    }
+}
+
+/// Validates the flow options of a create-flow task.
+///
+/// Fails on the first key that is neither `DEFER_ON_MISSING_SOURCE_KEY` nor
+/// `FlowType::FLOW_TYPE_KEY`, then lets both option parsers surface their errors.
+pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> {
+    let known = |key: &str| matches!(key, DEFER_ON_MISSING_SOURCE_KEY | FlowType::FLOW_TYPE_KEY);
+    if let Some(unknown) = flow_task.flow_options.keys().find(|key| !known(key.as_str())) {
+        return UnexpectedSnafu {
+            err_msg: format!(
+                "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}"
+            ),
+        }
+        .fail();
+    }
+
+    defer_on_missing_source(flow_task)?;
+    get_flow_type_from_options(flow_task)?;
+    Ok(())
+}
+
+fn user_runtime_flow_options(options: &HashMap<String, String>) -> HashMap<String, String> {
+    let mut options = options.clone();
+    options.remove(DEFER_ON_MISSING_SOURCE_KEY);
+    options
+}
+
+fn metadata_flow_options(options: &HashMap<String, String>) -> HashMap<String, String> {
+    options.clone()
+}
+
 /// The state of [CreateFlowProcedure].
 #[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
 pub enum CreateFlowState {
@@ -411,6 +510,8 @@ pub struct CreateFlowData {
     pub(crate) flow_id: Option<FlowId>,
     pub(crate) peers: Vec<Peer>,
     pub(crate) source_table_ids: Vec<TableId>,
+    #[serde(default)]
+    pub(crate) unresolved_source_table_names: Vec<TableName>,
     /// Use alias for backward compatibility with QueryContext serialized data
     #[serde(alias = "query_context")]
     pub(crate) flow_context: FlowQueryContext,
@@ -424,6 +525,16 @@ pub struct CreateFlowData {
     pub(crate) flow_type: Option<FlowType>,
 }
 
+impl CreateFlowData {
+    pub(crate) fn is_pending(&self) -> bool {
+        !self.unresolved_source_table_names.is_empty()
+    }
+
+    pub(crate) fn is_active(&self) -> bool {
+        !self.is_pending()
+    }
+}
+
 impl From<&CreateFlowData> for CreateRequest {
     fn from(value: &CreateFlowData) -> Self {
         let flow_id = value.flow_id.unwrap();
@@ -446,7 +557,7 @@ impl From<&CreateFlowData> for CreateRequest {
                 .map(|seconds| api::v1::EvalInterval { seconds }),
             comment: value.task.comment.clone(),
             sql: value.task.sql.clone(),
-            flow_options: value.task.flow_options.clone(),
+            flow_options: user_runtime_flow_options(&value.task.flow_options),
         };
 
         let flow_type = value.flow_type.unwrap_or_default().to_string();
@@ -466,9 +577,9 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
             eval_interval_secs: eval_interval,
             comment,
             sql,
-            flow_options: mut options,
             ..
         } = value.task.clone();
+        let mut options = metadata_flow_options(&value.task.flow_options);
 
         let flownode_ids = value
             .peers
@@ -484,7 +595,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
             .collect::<Vec<_>>();
 
         let flow_type = value.flow_type.unwrap_or_default().to_string();
-        options.insert("flow_type".to_string(), flow_type);
+        options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);
 
         let mut create_time = chrono::Utc::now();
         if let Some(prev_flow_value) = value.prev_flow_info_value.as_ref()
@@ -495,6 +606,8 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
 
         let flow_info: FlowInfoValue = FlowInfoValue {
             source_table_ids: value.source_table_ids.clone(),
+            all_source_table_names: value.task.source_table_names.clone(),
+            unresolved_source_table_names: value.unresolved_source_table_names.clone(),
             sink_table_name,
             flownode_ids,
             catalog_name,
@@ -506,6 +619,11 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
             eval_interval_secs: eval_interval,
             comment,
             options,
+            status: if value.is_active() {
+                FlowStatus::Active
+            } else {
+                FlowStatus::PendingSources
+            },
             created_time: create_time,
             updated_time: chrono::Utc::now(),
         };
diff --git a/src/common/meta/src/ddl/create_flow/metadata.rs b/src/common/meta/src/ddl/create_flow/metadata.rs
index 27b85b7946..f97ecfdf4a 100644
--- a/src/common/meta/src/ddl/create_flow/metadata.rs
+++ b/src/common/meta/src/ddl/create_flow/metadata.rs
@@ -12,10 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use snafu::OptionExt;
-
 use crate::ddl::create_flow::CreateFlowProcedure;
-use crate::error::{self, Result};
+use crate::error::Result;
 use crate::key::table_name::TableNameKey;
 
 impl CreateFlowProcedure {
@@ -34,9 +32,8 @@ impl CreateFlowProcedure {
         Ok(())
     }
 
-    /// Ensures all source tables exist and collects source table ids
+    /// Collects source table ids and keeps track of missing tables.
     pub(crate) async fn collect_source_tables(&mut self) -> Result<()> {
-        // Ensures all source tables exist.
         let keys = self
             .data
             .task
@@ -52,22 +49,24 @@ impl CreateFlowProcedure {
             .batch_get(keys)
             .await?;
 
-        let source_table_ids = self
+        let mut resolved = Vec::with_capacity(self.data.task.source_table_names.len());
+        let mut unresolved = Vec::new();
+
+        for (name, table_id) in self
             .data
             .task
             .source_table_names
             .iter()
             .zip(source_table_ids)
-            .map(|(name, table_id)| {
-                Ok(table_id
-                    .with_context(|| error::TableNotFoundSnafu {
-                        table_name: name.to_string(),
-                    })?
-                    .table_id())
-            })
-            .collect::<Result<Vec<_>>>()?;
+        {
+            match table_id {
+                Some(table_id) => resolved.push(table_id.table_id()),
+                None => unresolved.push(name.clone()),
+            }
+        }
 
-        self.data.source_table_ids = source_table_ids;
+        self.data.source_table_ids = resolved;
+        self.data.unresolved_source_table_names = unresolved;
         Ok(())
     }
 }
diff --git a/src/common/meta/src/ddl/drop_flow/metadata.rs b/src/common/meta/src/ddl/drop_flow/metadata.rs
index 0437098be3..7afd00f9d5 100644
--- a/src/common/meta/src/ddl/drop_flow/metadata.rs
+++ b/src/common/meta/src/ddl/drop_flow/metadata.rs
@@ -43,7 +43,7 @@ impl DropFlowProcedure {
             .map(|(_, value)| value)
             .collect::<Vec<_>>();
         ensure!(
-            !flow_route_values.is_empty(),
+            flow_info_value.is_pending() || !flow_route_values.is_empty(),
             error::FlowRouteNotFoundSnafu {
                 flow_name: format_full_flow_name(catalog_name, flow_name),
             }
diff --git a/src/common/meta/src/ddl/tests/create_flow.rs b/src/common/meta/src/ddl/tests/create_flow.rs
index 344fc05024..a1a6c040f1 100644
--- a/src/common/meta/src/ddl/tests/create_flow.rs
+++ b/src/common/meta/src/ddl/tests/create_flow.rs
@@ -16,12 +16,17 @@ use std::assert_matches;
 use std::collections::HashMap;
 use std::sync::Arc;
 
+use api::v1::flow::CreateRequest;
 use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use common_procedure::Status;
 use common_procedure_test::execute_procedure_until_done;
 use table::table_name::TableName;
 
 use crate::ddl::DdlContext;
-use crate::ddl::create_flow::{CreateFlowData, CreateFlowProcedure, CreateFlowState, FlowType};
+use crate::ddl::create_flow::{
+    CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType,
+    defer_on_missing_source,
+};
 use crate::ddl::test_util::create_table::test_create_table_task;
 use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
 use crate::error;
@@ -63,6 +68,11 @@ pub(crate) fn test_create_flow_task(
     }
 }
 
+fn enable_defer_on_missing_source(task: &mut CreateFlowTask) {
+    task.flow_options
+        .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string());
+}
+
 #[tokio::test]
 async fn test_create_flow_source_table_not_found() {
     let source_table_names = vec![TableName::new(
@@ -78,7 +88,261 @@ async fn test_create_flow_source_table_not_found() {
     let query_ctx = test_query_context();
     let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
     let err = procedure.on_prepare().await.unwrap_err();
-    assert_matches!(err, error::Error::TableNotFound { .. });
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("requires WITH ('defer_on_missing_source'='true')")
+    );
+}
+
+#[tokio::test]
+async fn test_create_pending_flow_source_table_not_found_with_defer() {
+    let source_table_names = vec![TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "my_table",
+    )];
+    let sink_table_name =
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", source_table_names, sink_table_name, false);
+    enable_defer_on_missing_source(&mut task);
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let status = procedure.on_prepare().await.unwrap();
+    assert_matches!(status, Status::Executing { persist: true, .. });
+    assert_eq!(procedure.data.unresolved_source_table_names.len(), 1);
+    assert_eq!(procedure.data.source_table_ids, Vec::<u32>::new());
+
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(flow_info.source_table_ids(), Vec::<u32>::new());
+    assert_eq!(
+        flow_info
+            .options()
+            .get(DEFER_ON_MISSING_SOURCE_KEY)
+            .map(String::as_str),
+        Some("true")
+    );
+}
+
+#[tokio::test]
+async fn test_create_pending_flow_source_table_not_found_with_defer_false() {
+    let source_table_names = vec![TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "my_table",
+    )];
+    let sink_table_name =
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", source_table_names, sink_table_name, false);
+    task.flow_options
+        .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "false".to_string());
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("requires WITH ('defer_on_missing_source'='true')")
+    );
+}
+
+#[tokio::test]
+async fn test_create_pending_flow_records_partial_source_resolution() {
+    let existing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_existing_source_table",
+    );
+    let missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let existing_table_id = 3026;
+    let create_table_task =
+        test_create_table_task("partial_existing_source_table", existing_table_id);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let mut task = test_create_flow_task(
+        "partial_pending_flow",
+        vec![existing_source.clone(), missing_source.clone()],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut task);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let status = procedure.on_prepare().await.unwrap();
+    assert_matches!(status, Status::Executing { persist: true, .. });
+    assert_eq!(procedure.data.source_table_ids, vec![existing_table_id]);
+    assert_eq!(
+        procedure.data.unresolved_source_table_names,
+        vec![missing_source.clone()]
+    );
+
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(flow_info.is_pending());
+    assert_eq!(flow_info.source_table_ids(), &[existing_table_id]);
+    let expected_all_sources = vec![existing_source, missing_source.clone()];
+    assert_eq!(
+        flow_info.all_source_table_names(),
+        expected_all_sources.as_slice()
+    );
+    assert_eq!(flow_info.unresolved_source_table_names(), &[missing_source]);
+    assert!(flow_info.flownode_ids().is_empty());
+}
+
+#[test]
+fn test_defer_on_missing_source_defaults_false() {
+    let task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+
+    assert!(!defer_on_missing_source(&task).unwrap());
+}
+
+#[test]
+fn test_defer_on_missing_source_true() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options
+        .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string());
+
+    assert!(defer_on_missing_source(&task).unwrap());
+}
+
+#[test]
+fn test_defer_on_missing_source_invalid_value() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options.insert(
+        DEFER_ON_MISSING_SOURCE_KEY.to_string(),
+        "invalid".to_string(),
+    );
+
+    let err = defer_on_missing_source(&task).unwrap_err();
+    assert!(
+        err.to_string()
+            .contains("Invalid flow option 'defer_on_missing_source': invalid")
+    );
+}
+
+#[tokio::test]
+async fn test_create_flow_rejects_unknown_option_in_meta_task() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options
+        .insert("unknown_option".to_string(), "value".to_string());
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
+
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unexpected { .. });
+    assert!(
+        err.to_string()
+            .contains("Unknown flow option 'unknown_option'")
+    );
+}
+
+#[test]
+fn test_create_request_strips_defer_on_missing_source_runtime_option() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    enable_defer_on_missing_source(&mut task);
+
+    let data = CreateFlowData {
+        state: CreateFlowState::CreateFlows,
+        task,
+        flow_id: Some(1024),
+        peers: vec![],
+        source_table_ids: vec![],
+        unresolved_source_table_names: vec![],
+        flow_context: FlowQueryContext {
+            catalog: DEFAULT_CATALOG_NAME.to_string(),
+            schema: DEFAULT_SCHEMA_NAME.to_string(),
+            timezone: "UTC".to_string(),
+            extensions: HashMap::new(),
+            channel: 0,
+            snapshot_seqs: HashMap::new(),
+            sst_min_sequences: HashMap::new(),
+        },
+        prev_flow_info_value: None,
+        did_replace: false,
+        flow_type: Some(FlowType::Batching),
+    };
+
+    let request: CreateRequest = (&data).into();
+
+    assert!(
+        !request
+            .flow_options
+            .contains_key(DEFER_ON_MISSING_SOURCE_KEY)
+    );
+    assert_eq!(
+        request
+            .flow_options
+            .get(FlowType::FLOW_TYPE_KEY)
+            .map(String::as_str),
+        Some(FlowType::BATCHING)
+    );
 }
 
 pub(crate) async fn create_test_flow(
@@ -101,6 +365,27 @@ pub(crate) async fn create_test_flow(
     *flow_id
 }
 
+/// Creates a flow through `CreateFlowProcedure` with `DEFER_ON_MISSING_SOURCE_KEY`
+/// set to `"true"` and returns the `FlowId` produced by the procedure.
+/// Panics if the procedure yields no output or the output is not a `FlowId`.
+pub(crate) async fn create_test_pending_flow(
+    ddl_context: &DdlContext,
+    flow_name: &str,
+    source_table_names: Vec<TableName>,
+    sink_table_name: TableName,
+) -> FlowId {
+    // Both name arguments are owned and not used again, so they are moved into the task.
+    let mut task = test_create_flow_task(flow_name, source_table_names, sink_table_name, false);
+    enable_defer_on_missing_source(&mut task);
+
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+
+    // The procedure output is type-erased; recover the concrete flow id.
+    *output.downcast_ref::<FlowId>().unwrap()
+}
+
 #[tokio::test]
 async fn test_create_flow() {
     let table_id = 1024;
@@ -154,6 +439,201 @@ async fn test_create_flow() {
     assert_matches!(err, error::Error::FlowAlreadyExists { .. });
 }
 
+#[tokio::test]
+async fn test_replace_pending_flow_with_active_flow_is_unsupported() {
+    let source_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let pending_flow_id = create_test_pending_flow(
+        &ddl_context,
+        "replace_pending_flow",
+        vec![source_table_name.clone()],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let pending_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(pending_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(pending_flow.is_pending());
+    assert!(pending_flow.flownode_ids().is_empty());
+
+    let create_table_task = test_create_table_task("replace_pending_source_table", 1026);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let mut replace_task = test_create_flow_task(
+        "replace_pending_flow",
+        vec![source_table_name],
+        sink_table_name,
+        false,
+    );
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("Replacing between pending and active flow states")
+    );
+}
+
+#[tokio::test]
+async fn test_replace_active_flow_with_pending_flow_is_unsupported() {
+    let existing_source_table = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_active_source_table",
+    );
+    let missing_source_table = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_active_sink_table",
+    );
+
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let create_table_task = test_create_table_task("replace_active_source_table", 2026);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let _flow_id = create_test_flow(
+        &ddl_context,
+        "replace_active_flow_to_pending",
+        vec![existing_source_table],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let mut replace_task = test_create_flow_task(
+        "replace_active_flow_to_pending",
+        vec![missing_source_table],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut replace_task);
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("Replacing between pending and active flow states")
+    );
+}
+
+#[tokio::test]
+async fn test_replace_pending_flow_with_pending_flow_updates_metadata() {
+    let first_missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_first_missing_source",
+    );
+    let second_missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_second_missing_source",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_to_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let original_flow_id = create_test_pending_flow(
+        &ddl_context,
+        "replace_pending_to_pending_flow",
+        vec![first_missing_source.clone()],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let original_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(original_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(original_flow.is_pending());
+    assert_eq!(
+        original_flow.unresolved_source_table_names(),
+        &[first_missing_source]
+    );
+    assert!(original_flow.flownode_ids().is_empty());
+
+    let mut replace_task = test_create_flow_task(
+        "replace_pending_to_pending_flow",
+        vec![second_missing_source.clone()],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut replace_task);
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let replaced_flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    assert_eq!(replaced_flow_id, original_flow_id);
+
+    let replaced_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(replaced_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(replaced_flow.is_pending());
+    assert_eq!(replaced_flow.source_table_ids(), Vec::<u32>::new());
+    assert_eq!(
+        replaced_flow.unresolved_source_table_names(),
+        std::slice::from_ref(&second_missing_source)
+    );
+    assert_eq!(
+        replaced_flow.all_source_table_names(),
+        &[second_missing_source]
+    );
+    assert!(replaced_flow.flownode_ids().is_empty());
+}
+
 #[tokio::test]
 async fn test_create_flow_same_source_and_sink_table() {
     let table_id = 1024;
@@ -228,6 +708,7 @@ fn test_create_flow_data_serialization_backward_compatibility() {
         "flow_id": null,
         "peers": [],
         "source_table_ids": [],
+        "unresolved_source_table_names": [],
         "query_context": {
             "current_catalog": "old_catalog",
             "current_schema": "old_schema",
@@ -265,6 +746,7 @@ fn test_create_flow_data_new_format_serialization() {
         flow_id: None,
         peers: vec![],
         source_table_ids: vec![],
+        unresolved_source_table_names: vec![],
         flow_context,
         prev_flow_info_value: None,
         did_replace: false,
@@ -327,6 +809,7 @@ fn test_flow_info_conversion_with_flow_context() {
         flow_id: Some(123),
         peers: vec![],
         source_table_ids: vec![456, 789],
+        unresolved_source_table_names: vec![],
         flow_context,
         prev_flow_info_value: None,
         did_replace: false,
diff --git a/src/common/meta/src/ddl/tests/drop_flow.rs b/src/common/meta/src/ddl/tests/drop_flow.rs
index af34da4809..400fd2e118 100644
--- a/src/common/meta/src/ddl/tests/drop_flow.rs
+++ b/src/common/meta/src/ddl/tests/drop_flow.rs
@@ -23,7 +23,7 @@ use table::table_name::TableName;
 use crate::ddl::drop_flow::DropFlowProcedure;
 use crate::ddl::test_util::create_table::test_create_table_task;
 use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
-use crate::ddl::tests::create_flow::create_test_flow;
+use crate::ddl::tests::create_flow::{create_test_flow, create_test_pending_flow};
 use crate::error;
 use crate::key::table_route::TableRouteValue;
 use crate::rpc::ddl::DropFlowTask;
@@ -91,3 +91,45 @@ async fn test_drop_flow() {
     let err = procedure.on_prepare().await.unwrap_err();
     assert_matches!(err, error::Error::FlowNotFound { .. });
 }
+
+#[tokio::test]
+async fn test_drop_pending_flow_without_routes() {
+    let source_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "drop_pending_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "drop_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let flow_id = create_test_pending_flow(
+        &ddl_context,
+        "drop_pending_flow",
+        vec![source_table_name],
+        sink_table_name,
+    )
+    .await;
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(flow_info.is_pending());
+    assert!(flow_info.flownode_ids().is_empty());
+
+    let task = test_drop_flow_task("drop_pending_flow", flow_id, false);
+    let mut procedure = DropFlowProcedure::new(task, ddl_context.clone());
+    execute_procedure_until_done(&mut procedure).await;
+
+    let task = test_drop_flow_task("drop_pending_flow", flow_id, false);
+    let mut procedure = DropFlowProcedure::new(task, ddl_context);
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::FlowNotFound { .. });
+}
diff --git a/src/common/meta/src/ddl_manager.rs b/src/common/meta/src/ddl_manager.rs
index 52af4a36af..8dceeb2e5a 100644
--- a/src/common/meta/src/ddl_manager.rs
+++ b/src/common/meta/src/ddl_manager.rs
@@ -15,8 +15,9 @@
 use std::sync::Arc;
 use std::time::Duration;
 
-use api::v1::Repartition;
 use api::v1::alter_table_expr::Kind;
+use api::v1::repartition::Source as PbRepartitionSource;
+use api::v1::{PartitionExprs, Repartition};
 use common_error::ext::BoxedError;
 use common_procedure::{
     BoxedProcedure, BoxedProcedureLoader, Output, ProcedureId, ProcedureManagerRef,
@@ -151,13 +152,22 @@ macro_rules! procedure_loader {
 
 pub type RepartitionProcedureFactoryRef = Arc<dyn RepartitionProcedureFactory>;
 
+/// The source side of a repartition, passed to [`RepartitionProcedureFactory::create`].
+pub enum RepartitionSource {
+    /// The source is described by partition expressions.
+    Partitioned { exprs: Vec<String> },
+    /// The source is reported as unpartitioned.
+    /// NOTE(review): `partition_columns` presumably names the columns to partition on; confirm.
+    Unpartitioned { partition_columns: Vec<String> },
+}
+
 pub trait RepartitionProcedureFactory: Send + Sync {
     fn create(
         &self,
         ddl_ctx: &DdlContext,
         table_name: TableName,
         table_id: TableId,
-        from_exprs: Vec<String>,
+        source: RepartitionSource,
         to_exprs: Vec<String>,
         timeout: Option<Duration>,
     ) -> std::result::Result<BoxedProcedure, BoxedError>;
@@ -280,22 +286,38 @@ impl DdlManager {
         &self,
         table_id: TableId,
         table_name: TableName,
-        Repartition {
-            from_partition_exprs,
-            into_partition_exprs,
-        }: Repartition,
+        repartition: Repartition,
         wait: bool,
         timeout: Duration,
     ) -> Result<(ProcedureId, Option<Output>)> {
         let context = self.create_context();
 
+        let into_partition_exprs = repartition.into_partition_exprs;
+        let source = repartition.source;
+
+        let source = match source {
+            Some(PbRepartitionSource::PartitionExprs(PartitionExprs { exprs })) => {
+                RepartitionSource::Partitioned { exprs }
+            }
+            Some(PbRepartitionSource::Unpartitioned(source)) => RepartitionSource::Unpartitioned {
+                partition_columns: source.partition_columns,
+            },
+            None => {
+                // Reads the deprecated field for backward compatibility with old persisted DDL tasks.
+                #[allow(deprecated)]
+                RepartitionSource::Partitioned {
+                    exprs: repartition.from_partition_exprs,
+                }
+            }
+        };
+
         let procedure = self
             .repartition_procedure_factory
             .create(
                 &context,
                 table_name,
                 table_id,
-                from_partition_exprs,
+                source,
                 into_partition_exprs,
                 Some(timeout),
             )
@@ -1108,7 +1130,7 @@ mod tests {
     use crate::ddl::table_meta::TableMetadataAllocator;
     use crate::ddl::truncate_table::TruncateTableProcedure;
     use crate::ddl::{DdlContext, NoopRegionFailureDetectorControl};
-    use crate::ddl_manager::RepartitionProcedureFactory;
+    use crate::ddl_manager::{RepartitionProcedureFactory, RepartitionSource};
     use crate::key::TableMetadataManager;
     use crate::key::flow::FlowMetadataManager;
     use crate::kv_backend::memory::MemoryKvBackend;
@@ -1146,7 +1168,7 @@ mod tests {
             _ddl_ctx: &DdlContext,
             _table_name: TableName,
             _table_id: TableId,
-            _from_exprs: Vec<String>,
+            _source: RepartitionSource,
             _to_exprs: Vec<String>,
             _timeout: Option<Duration>,
         ) -> std::result::Result<BoxedProcedure, BoxedError> {
diff --git a/src/common/meta/src/key/flow.rs b/src/common/meta/src/key/flow.rs
index d581b92685..bc9aaaa6b3 100644
--- a/src/common/meta/src/key/flow.rs
+++ b/src/common/meta/src/key/flow.rs
@@ -459,6 +459,7 @@ mod tests {
 
     use super::*;
     use crate::FlownodeId;
+    use crate::key::flow::flow_info::FlowStatus;
     use crate::key::flow::table_flow::TableFlowKey;
     use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
     use crate::key::{FlowPartitionId, MetadataValue};
@@ -522,6 +523,8 @@ mod tests {
             query_context: None,
             flow_name: flow_name.to_string(),
             source_table_ids,
+            all_source_table_names: vec![],
+            unresolved_source_table_names: vec![],
             sink_table_name,
             flownode_ids,
             raw_sql: "raw".to_string(),
@@ -529,6 +532,7 @@ mod tests {
             eval_interval_secs: None,
             comment: "hi".to_string(),
             options: Default::default(),
+            status: FlowStatus::Active,
             created_time: chrono::Utc::now(),
             updated_time: chrono::Utc::now(),
         }
@@ -774,6 +778,8 @@ mod tests {
             query_context: None,
             flow_name: "flow".to_string(),
             source_table_ids: vec![1024, 1025, 1026],
+            all_source_table_names: vec![],
+            unresolved_source_table_names: vec![],
             sink_table_name: another_sink_table_name,
             flownode_ids: [(0, 1u64)].into(),
             raw_sql: "raw".to_string(),
@@ -781,6 +787,7 @@ mod tests {
             eval_interval_secs: None,
             comment: "hi".to_string(),
             options: Default::default(),
+            status: FlowStatus::Active,
             created_time: chrono::Utc::now(),
             updated_time: chrono::Utc::now(),
         };
@@ -1151,6 +1158,8 @@ mod tests {
             query_context: None,
             flow_name: "flow".to_string(),
             source_table_ids: vec![1024, 1025, 1026],
+            all_source_table_names: vec![],
+            unresolved_source_table_names: vec![],
             sink_table_name: another_sink_table_name,
             flownode_ids: [(0, 1u64)].into(),
             raw_sql: "raw".to_string(),
@@ -1158,6 +1167,7 @@ mod tests {
             eval_interval_secs: None,
             comment: "hi".to_string(),
             options: Default::default(),
+            status: FlowStatus::Active,
             created_time: chrono::Utc::now(),
             updated_time: chrono::Utc::now(),
         };
diff --git a/src/common/meta/src/key/flow/flow_info.rs b/src/common/meta/src/key/flow/flow_info.rs
index d501822c3c..b1056902da 100644
--- a/src/common/meta/src/key/flow/flow_info.rs
+++ b/src/common/meta/src/key/flow/flow_info.rs
@@ -16,6 +16,8 @@ use std::collections::{BTreeMap, HashMap};
 use std::sync::Arc;
 
 use chrono::{DateTime, Utc};
+use futures::TryStreamExt;
+use futures::stream::BoxStream;
 use lazy_static::lazy_static;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
@@ -27,12 +29,30 @@ use crate::FlownodeId;
 use crate::error::{self, Result};
 use crate::key::flow::FlowScoped;
 use crate::key::txn_helper::TxnOpGetResponseSet;
-use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
+use crate::key::{
+    BytesAdapter, DeserializedValueWithBytes, FlowId, FlowPartitionId, MetadataKey, MetadataValue,
+};
 use crate::kv_backend::KvBackendRef;
 use crate::kv_backend::txn::{Compare, CompareOp, Txn, TxnOp};
+use crate::range_stream::{DEFAULT_PAGE_SIZE, PaginationStream};
+use crate::rpc::KeyValue;
+use crate::rpc::store::RangeRequest;
 
 pub const FLOW_INFO_KEY_PREFIX: &str = "info";
 
+/// The lifecycle status of a flow stored in metadata.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub enum FlowStatus {
+    /// The flow metadata exists, but at least one source table did not exist at create time.
+    PendingSources,
+    /// The flow has resolved source tables and can be scheduled on flownodes.
+    ///
+    /// This is the `Default` variant. `FlowInfoValue::status` is marked `#[serde(default)]`,
+    /// so a stored value that lacks the `status` field deserializes as `Active`.
+    #[default]
+    Active,
+}
+
 lazy_static! {
     static ref FLOW_INFO_KEY_PATTERN: Regex =
         Regex::new(&format!("^{FLOW_INFO_KEY_PREFIX}/([0-9]+)$")).unwrap();
@@ -114,7 +131,12 @@ impl<'a> MetadataKey<'a, FlowInfoKeyInner> for FlowInfoKeyInner {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct FlowInfoValue {
     /// The source tables used by the flow.
+    #[serde(default)]
     pub source_table_ids: Vec<TableId>,
+    #[serde(default)]
+    pub all_source_table_names: Vec<TableName>,
+    #[serde(default)]
+    pub unresolved_source_table_names: Vec<TableName>,
     /// The sink table used by the flow.
     pub sink_table_name: TableName,
     /// Which flow nodes this flow is running on.
@@ -145,6 +167,8 @@ pub struct FlowInfoValue {
     pub comment: String,
     /// The options.
     pub options: HashMap<String, String>,
+    #[serde(default)]
+    pub status: FlowStatus,
     /// The created time
     #[serde(default)]
     pub created_time: DateTime<Utc>,
@@ -154,6 +178,16 @@ pub struct FlowInfoValue {
 }
 
 impl FlowInfoValue {
+    /// Returns `true` if the flow status is [`FlowStatus::PendingSources`].
+    pub fn is_pending(&self) -> bool {
+        matches!(self.status, FlowStatus::PendingSources)
+    }
+
+    /// Returns `true` if the flow status is [`FlowStatus::Active`].
+    pub fn is_active(&self) -> bool {
+        matches!(self.status, FlowStatus::Active)
+    }
+
     /// Returns the `flownode_id`.
     pub fn flownode_ids(&self) -> &BTreeMap<FlowPartitionId, FlownodeId> {
         &self.flownode_ids
@@ -173,6 +205,16 @@ impl FlowInfoValue {
         &self.source_table_ids
     }
 
+    /// Returns the `all_source_table_names`.
+    pub fn all_source_table_names(&self) -> &[TableName] {
+        &self.all_source_table_names
+    }
+
+    /// Returns the `unresolved_source_table_names`.
+    pub fn unresolved_source_table_names(&self) -> &[TableName] {
+        &self.unresolved_source_table_names
+    }
+
     pub fn catalog_name(&self) -> &String {
         &self.catalog_name
     }
@@ -209,6 +249,11 @@ impl FlowInfoValue {
         &self.options
     }
 
+    /// Returns the lifecycle `status` of the flow.
+    pub fn status(&self) -> &FlowStatus {
+        &self.status
+    }
+
     pub fn created_time(&self) -> &DateTime<Utc> {
         &self.created_time
     }
@@ -225,6 +269,15 @@ pub struct FlowInfoManager {
     kv_backend: KvBackendRef,
 }
 
+/// Decodes a raw [`KeyValue`] into a flow info key and its value.
+/// Fails if either the key bytes or the value bytes cannot be parsed.
+pub fn flow_info_decoder(kv: KeyValue) -> Result<(FlowInfoKey, FlowInfoValue)> {
+    Ok((
+        FlowInfoKey::from_bytes(&kv.key)?,
+        FlowInfoValue::try_from_raw_value(&kv.value)?,
+    ))
+}
+
 impl FlowInfoManager {
     /// Returns a new [FlowInfoManager].
     pub fn new(kv_backend: KvBackendRef) -> Self {
@@ -254,6 +304,25 @@ impl FlowInfoManager {
             .transpose()
     }
 
+    /// Streams every stored flow info as a `(flow id, value)` pair.
+    ///
+    /// The entries are read page by page from a prefix range over the flow info keys.
+    pub fn flow_infos(&self) -> BoxStream<'static, Result<(FlowId, FlowInfoValue)>> {
+        let prefix = format!("{FLOW_INFO_KEY_PREFIX}/").into_bytes();
+        let scoped_prefix = FlowScoped::new(BytesAdapter::from(prefix)).to_bytes();
+        let request = RangeRequest::new().with_prefix(scoped_prefix);
+
+        let pages = PaginationStream::new(
+            self.kv_backend.clone(),
+            request,
+            DEFAULT_PAGE_SIZE,
+            flow_info_decoder,
+        );
+        let infos = pages.into_stream().map_ok(|(key, value)| (key.flow_id(), value));
+
+        Box::pin(infos)
+    }
+
     /// Builds a create flow transaction.
     /// It is expected that the `__flow/info/{flow_id}` wasn't occupied.
     /// Otherwise, the transaction will retrieve existing value.
diff --git a/src/common/procedure/src/local.rs b/src/common/procedure/src/local.rs
index 9e8536308c..5e8717a53a 100644
--- a/src/common/procedure/src/local.rs
+++ b/src/common/procedure/src/local.rs
@@ -24,7 +24,7 @@ use async_trait::async_trait;
 use backon::ExponentialBuilder;
 use common_error::ext::BoxedError;
 use common_event_recorder::EventRecorderRef;
-use common_runtime::{RepeatedTask, TaskFunction};
+use common_runtime::{JoinHandle, RepeatedTask, TaskFunction};
 use common_telemetry::tracing_context::{FutureExt, TracingContext};
 use common_telemetry::{error, info, tracing};
 use snafu::{OptionExt, ResultExt, ensure};
@@ -254,6 +254,8 @@ pub(crate) struct ManagerContext {
     running_procedures: Mutex<HashSet<ProcedureId>>,
     /// Ids and finished time of finished procedures.
     finished_procedures: Mutex<VecDeque<(ProcedureId, Instant)>>,
+    /// Runner tasks of procedures.
+    runner_tasks: Mutex<HashMap<ProcedureId, JoinHandle<()>>>,
     /// Running flag.
     running: Arc<AtomicBool>,
     /// Poison manager.
@@ -310,6 +312,7 @@ impl ManagerContext {
             procedures: RwLock::new(HashMap::new()),
             running_procedures: Mutex::new(HashSet::new()),
             finished_procedures: Mutex::new(VecDeque::new()),
+            runner_tasks: Mutex::new(HashMap::new()),
             running: Arc::new(AtomicBool::new(false)),
             poison_manager,
         }
@@ -329,6 +332,85 @@ impl ManagerContext {
         self.running.store(false, Ordering::Relaxed);
     }
 
+    /// Discards all in-memory runtime state: tracked procedures, runner task handles
+    /// and key locks.
+    ///
+    /// Runner tasks that are still registered are aborted but not awaited.
+    fn reset_runtime_state(&self) {
+        self.procedures.write().unwrap().clear();
+        self.running_procedures.lock().unwrap().clear();
+        self.finished_procedures.lock().unwrap().clear();
+        // Drain the handles first so that the `runner_tasks` lock is not held while
+        // aborting: dropping a `Runner` locks `runner_tasks` again to unregister itself.
+        for handle in self.take_runner_tasks() {
+            handle.abort();
+        }
+        self.key_lock.clear();
+        self.dynamic_key_lock.clear();
+    }
+
+    /// Spawns the runner task for `procedure_id` through `spawn` and records its handle.
+    ///
+    /// Returns `false` without calling `spawn` when the manager is not running. The
+    /// `runner_tasks` lock is held from the running check until the handle is inserted.
+    fn spawn_runner_task<F>(&self, procedure_id: ProcedureId, spawn: F) -> bool
+    where
+        F: FnOnce() -> JoinHandle<()>,
+    {
+        let mut tasks = self.runner_tasks.lock().unwrap();
+        if !self.running() {
+            return false;
+        }
+
+        let handle = spawn();
+        let _ = tasks.insert(procedure_id, handle);
+        true
+    }
+
+    /// Removes `procedure_id` from both `procedures` and `running_procedures`.
+    fn remove_procedure(&self, procedure_id: ProcedureId) {
+        self.procedures.write().unwrap().remove(&procedure_id);
+        self.running_procedures
+            .lock()
+            .unwrap()
+            .remove(&procedure_id);
+    }
+
+    /// Forgets the runner task handle registered for `procedure_id`, if any.
+    pub(crate) fn remove_runner_task(&self, procedure_id: ProcedureId) {
+        let _ = self.runner_tasks.lock().unwrap().remove(&procedure_id);
+    }
+
+    /// Drains `runner_tasks` and returns the handles it contained.
+    fn take_runner_tasks(&self) -> Vec<JoinHandle<()>> {
+        self.runner_tasks
+            .lock()
+            .unwrap()
+            .drain()
+            .map(|(_, handle)| handle)
+            .collect()
+    }
+
+    /// Aborts every registered runner task and waits for each of them to terminate.
+    /// Join errors other than cancellation are logged.
+    async fn abort_runner_tasks(&self) {
+        let handles = self.take_runner_tasks();
+
+        for handle in &handles {
+            handle.abort();
+        }
+
+        for handle in handles {
+            if let Err(e) = handle.await
+                && !e.is_cancelled()
+            {
+                error!(
+                    e; "Procedure runner task exits unexpectedly during stop",
+                );
+            }
+        }
+    }
+
     /// Return `ProcedureManager` is running.
     pub(crate) fn running(&self) -> bool {
         self.running.load(Ordering::Relaxed)
@@ -675,17 +748,25 @@ impl LocalManager {
 
         let tracing_context = TracingContext::from_current_span();
 
-        let _handle = common_runtime::spawn_global(async move {
-            let span = tracing_context.attach(tracing::info_span!(
-            "LocalManager::submit_root_procedure",
-                procedure_name = %runner.meta.type_name,
-                procedure_id = %runner.meta.id,
-            ));
-            // Run the root procedure.
-            // The task was moved to another runtime for execution.
-            // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
-            runner.run().trace(span).await;
-        });
+        ensure!(
+            self.manager_ctx.spawn_runner_task(procedure_id, || {
+                common_runtime::spawn_global(async move {
+                    let span = tracing_context.attach(tracing::info_span!(
+                    "LocalManager::submit_root_procedure",
+                        procedure_name = %runner.meta.type_name,
+                        procedure_id = %runner.meta.id,
+                    ));
+                    // Run the root procedure.
+                    // The task was moved to another runtime for execution.
+                    // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
+                    runner.run().trace(span).await;
+                })
+            }),
+            {
+                self.manager_ctx.remove_procedure(procedure_id);
+                ManagerNotStartSnafu
+            }
+        );
 
         Ok(watcher)
     }
@@ -822,6 +903,7 @@ impl ProcedureManager for LocalManager {
 
         *task = Some(task_inner);
 
+        self.manager_ctx.reset_runtime_state();
         self.manager_ctx.start();
 
         info!("LocalManager is start.");
@@ -830,14 +912,18 @@ impl ProcedureManager for LocalManager {
     }
 
     async fn stop(&self) -> Result<()> {
-        let mut task = self.remove_outdated_meta_task.lock().await;
-
-        if let Some(task) = task.take() {
-            task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)?;
-        }
-
         self.manager_ctx.stop();
 
+        let mut task = self.remove_outdated_meta_task.lock().await;
+        if let Some(task) = task.take()
+            && let Err(e) = task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)
+        {
+            error!(e; "Failed to stop remove outdated meta task");
+        };
+
+        self.manager_ctx.abort_runner_tasks().await;
+        self.manager_ctx.reset_runtime_state();
+
         info!("LocalManager is stopped.");
 
         Ok(())
@@ -921,10 +1007,12 @@ pub(crate) mod test_util {
 #[cfg(test)]
 mod tests {
     use std::assert_matches;
+    use std::sync::atomic::{AtomicBool, Ordering as AtomicOrdering};
 
     use common_error::mock::MockError;
     use common_error::status_code::StatusCode;
     use common_test_util::temp_dir::create_temp_dir;
+    use tokio::sync::oneshot;
     use tokio::time::timeout;
 
     use super::*;
@@ -954,6 +1042,70 @@ mod tests {
         assert!(ctx.state(meta.id).unwrap().is_done());
     }
 
+    #[test]
+    fn test_reset_runtime_state() {
+        let ctx = new_test_manager_context();
+        ctx.set_running();
+        let mut meta = test_util::procedure_meta_for_test();
+        meta.lock_key = LockKey::single_exclusive("test.reset_runtime_state");
+        let meta = Arc::new(meta);
+        let procedure_id = meta.id;
+
+        // Populate each piece of state that `reset_runtime_state` is asserted to clear below.
+        assert!(ctx.try_insert_procedure(meta.clone()));
+        ctx.finished_procedures
+            .lock()
+            .unwrap()
+            .push_back((procedure_id, Instant::now()));
+        ctx.spawn_runner_task(procedure_id, || {
+            common_runtime::spawn_global(std::future::pending::<()>())
+        });
+
+        // The guards are dropped right away; each lock table still reports one entry below.
+        drop(
+            ctx.key_lock
+                .try_write("test.reset_runtime_state".to_string()),
+        );
+        drop(
+            ctx.dynamic_key_lock
+                .try_write("test.reset_runtime_state.dynamic".to_string()),
+        );
+        assert!(ctx.contains_procedure(procedure_id));
+        assert_eq!(1, ctx.running_procedures.lock().unwrap().len());
+        assert_eq!(1, ctx.finished_procedures.lock().unwrap().len());
+        assert_eq!(1, ctx.runner_tasks.lock().unwrap().len());
+        assert_eq!(1, ctx.key_lock.len());
+        assert_eq!(1, ctx.dynamic_key_lock.len());
+
+        ctx.reset_runtime_state();
+
+        assert!(!ctx.contains_procedure(procedure_id));
+        assert!(ctx.running_procedures.lock().unwrap().is_empty());
+        assert!(ctx.finished_procedures.lock().unwrap().is_empty());
+        assert!(ctx.runner_tasks.lock().unwrap().is_empty());
+        assert!(ctx.key_lock.is_empty());
+        assert!(ctx.dynamic_key_lock.is_empty());
+    }
+
+    #[test]
+    fn test_spawn_runner_task_not_started_after_stop() {
+        // `set_running` is never called on this context before the spawn attempt.
+        let ctx = new_test_manager_context();
+        let procedure_id = ProcedureId::random();
+
+        let spawned = Arc::new(AtomicBool::new(false));
+        let spawned_in_task = spawned.clone();
+        let started = ctx.spawn_runner_task(procedure_id, || {
+            common_runtime::spawn_global(async move {
+                spawned_in_task.store(true, AtomicOrdering::Relaxed);
+            })
+        });
+
+        assert!(!started);
+        assert!(!spawned.load(AtomicOrdering::Relaxed));
+        assert!(ctx.runner_tasks.lock().unwrap().is_empty());
+    }
+
     #[test]
     fn test_manager_context_insert_duplicate() {
         let ctx = new_test_manager_context();
@@ -1046,6 +1195,110 @@ mod tests {
         }
     }
 
+    /// A test procedure whose `execute` never completes.
+    /// It notifies `started_tx` once execution begins and sets `dropped` when it is dropped.
+    #[derive(Debug)]
+    struct BlockingProcedure {
+        started_tx: Option<oneshot::Sender<()>>,
+        dropped: Arc<AtomicBool>,
+        lock_key: LockKey,
+    }
+
+    impl Drop for BlockingProcedure {
+        fn drop(&mut self) {
+            self.dropped.store(true, AtomicOrdering::Relaxed);
+        }
+    }
+
+    #[async_trait]
+    impl Procedure for BlockingProcedure {
+        fn type_name(&self) -> &str {
+            "BlockingProcedure"
+        }
+
+        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
+            // Signal the test on the first call, then wait on a future that never resolves.
+            if let Some(tx) = self.started_tx.take() {
+                let _ = tx.send(());
+            }
+            std::future::pending::<Result<Status>>().await
+        }
+
+        fn dump(&self) -> Result<String> {
+            Ok(String::new())
+        }
+
+        fn lock_key(&self) -> LockKey {
+            self.lock_key.clone()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_stop_aborts_runner_and_resets_runtime_state() {
+        let dir = create_temp_dir("stop_aborts_runner_and_resets_runtime_state");
+        let config = ManagerConfig::default();
+        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
+        let poison_manager = Arc::new(InMemoryPoisonStore::new());
+        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
+        manager.start().await.unwrap();
+
+        let procedure_id = ProcedureId::random();
+        let (started_tx, started_rx) = oneshot::channel();
+        let dropped = Arc::new(AtomicBool::new(false));
+        let procedure = BlockingProcedure {
+            started_tx: Some(started_tx),
+            dropped: dropped.clone(),
+            lock_key: LockKey::single_exclusive("test.stop_aborts_runner"),
+        };
+
+        manager
+            .submit(ProcedureWithId {
+                id: procedure_id,
+                procedure: Box::new(procedure),
+            })
+            .await
+            .unwrap();
+        // Wait until the procedure is executing before inspecting the manager state.
+        timeout(Duration::from_secs(5), started_rx)
+            .await
+            .unwrap()
+            .unwrap();
+
+        assert!(manager.manager_ctx.contains_procedure(procedure_id));
+        assert_eq!(
+            1,
+            manager.manager_ctx.running_procedures.lock().unwrap().len()
+        );
+        assert_eq!(1, manager.manager_ctx.runner_tasks.lock().unwrap().len());
+        assert_eq!(1, manager.manager_ctx.key_lock.len());
+
+        // After `stop`, the blocked procedure must be dropped and all runtime state empty.
+        manager.stop().await.unwrap();
+
+        assert!(dropped.load(AtomicOrdering::Relaxed));
+        assert!(!manager.manager_ctx.running());
+        assert!(!manager.manager_ctx.contains_procedure(procedure_id));
+        assert!(
+            manager
+                .manager_ctx
+                .running_procedures
+                .lock()
+                .unwrap()
+                .is_empty()
+        );
+        assert!(
+            manager
+                .manager_ctx
+                .finished_procedures
+                .lock()
+                .unwrap()
+                .is_empty()
+        );
+        assert!(manager.manager_ctx.runner_tasks.lock().unwrap().is_empty());
+        assert!(manager.manager_ctx.key_lock.is_empty());
+        assert!(manager.manager_ctx.dynamic_key_lock.is_empty());
+    }
+
     #[test]
     fn test_register_loader() {
         let dir = create_temp_dir("register");
@@ -1439,7 +1687,7 @@ mod tests {
         let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
         let poison_manager = Arc::new(InMemoryPoisonStore::new());
         let manager = LocalManager::new(config, state_store, poison_manager, None, None);
-        manager.manager_ctx.set_running();
+        manager.start().await.unwrap();
 
         manager
             .manager_ctx
@@ -1447,7 +1695,6 @@ mod tests {
             .lock()
             .unwrap()
             .insert(ProcedureId::random());
-        manager.start().await.unwrap();
 
         // Submit a new procedure should fail.
         let mut procedure = ProcedureToLoad::new("submit");
diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs
index ca3e221f43..509b3a7756 100644
--- a/src/common/procedure/src/local/runner.rs
+++ b/src/common/procedure/src/local/runner.rs
@@ -20,6 +20,7 @@ use backon::{BackoffBuilder, ExponentialBuilder};
 use common_error::ext::PlainError;
 use common_error::status_code::StatusCode;
 use common_event_recorder::EventRecorderRef;
+use common_telemetry::tracing::warn;
 use common_telemetry::tracing_context::{FutureExt, TracingContext};
 use common_telemetry::{debug, error, info, tracing};
 use rand::Rng;
@@ -480,6 +481,15 @@ impl Runner {
         procedure_state: ProcedureState,
         procedure: BoxedProcedure,
     ) {
+        if !self.running() {
+            warn!(
+                "ProcedureManager is not running, skip submitting subprocedure {}-{}",
+                procedure.type_name(),
+                procedure_id
+            );
+            return;
+        }
+
         if self.manager_ctx.contains_procedure(procedure_id) {
             // If the parent has already submitted this procedure, don't submit it again.
             return;
@@ -520,23 +530,29 @@ impl Runner {
             procedure_id,
         );
 
-        // Add the id of the subprocedure to the metadata.
-        self.meta.push_child(procedure_id);
         let parent_id = self.meta.id;
 
         let tracing_context = TracingContext::from_current_span();
-        let _handle = common_runtime::spawn_global(async move {
-            let span = tracing_context.attach(tracing::info_span!(
-                "LocalManager::submit_subprocedure",
-                procedure_name = %runner.meta.type_name,
-                procedure_id = %runner.meta.id,
-                parent_id = %parent_id,
-            ));
-            // Run the root procedure.
-            // The task was moved to another runtime for execution.
-            // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
-            runner.run().trace(span).await
-        });
+        if !self.manager_ctx.spawn_runner_task(procedure_id, || {
+            common_runtime::spawn_global(async move {
+                let span = tracing_context.attach(tracing::info_span!(
+                    "LocalManager::submit_subprocedure",
+                    procedure_name = %runner.meta.type_name,
+                    procedure_id = %runner.meta.id,
+                    parent_id = %parent_id,
+                ));
+                // Run the root procedure.
+                // The task was moved to another runtime for execution.
+                // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
+                runner.run().trace(span).await
+            })
+        }) {
+            self.manager_ctx.remove_procedure(procedure_id);
+            return;
+        }
+
+        // Add the id of the subprocedure to the metadata.
+        self.meta.push_child(procedure_id);
     }
 
     /// Extend the retry time to wait for the next retry.
@@ -702,6 +718,14 @@ impl Runner {
     }
 }
 
+impl Drop for Runner {
+    /// Removes the runner task entry that the manager context keeps for this
+    /// runner's procedure id.
+    fn drop(&mut self) {
+        self.manager_ctx.remove_runner_task(self.meta.id);
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::assert_matches;
diff --git a/src/common/procedure/src/rwlock.rs b/src/common/procedure/src/rwlock.rs
index cbdfe30977..c4807cf2f7 100644
--- a/src/common/procedure/src/rwlock.rs
+++ b/src/common/procedure/src/rwlock.rs
@@ -106,6 +106,15 @@ where
             locks.remove(key);
         }
     }
+
+    /// Clears all key locks.
+    ///
+    /// Callers must ensure no tasks are holding or waiting for these locks.
+    ///
+    /// Panics if locking `inner` fails.
+    pub fn clear(&self) {
+        self.inner.lock().unwrap().clear();
+    }
 }
 
 #[cfg(test)]
diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs
index aa2e627ca2..d5711e1761 100644
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -314,6 +314,7 @@ impl RegionServer {
         let ctx = request.header.as_ref().map(|h| h.into());
         let query_ctx = Arc::new(ctx.unwrap_or_else(|| QueryContextBuilder::default().build()));
 
+        let region_id = request.region_id;
         let injector_builder = NameAwareDataSourceInjectorBuilder::from_plan(&request.plan)
             .context(DataFusionSnafu)?;
         let mut injector = injector_builder
@@ -326,7 +327,6 @@ impl RegionServer {
             .context(DataFusionSnafu)?
             .data;
 
-        let region_id = request.region_id;
         let stream = self
             .inner
             .handle_read(QueryRequest { plan, ..request }, query_ctx.clone())
@@ -837,14 +837,13 @@ fn wrap_flow_region_watermark_stream(
     region_id: RegionId,
     query_ctx: &QueryContextRef,
 ) -> SendableRecordBatchStream {
-    let Some(seq) = should_collect_region_watermark_from_extensions(&query_ctx.extensions())
-        .then(|| query_ctx.get_snapshot(region_id.as_u64()))
-        .flatten()
-    else {
-        return stream;
-    };
-
-    Box::pin(RegionWatermarkStream::new(stream, region_id, seq))
+    if should_collect_region_watermark_from_extensions(&query_ctx.extensions())
+        && let Some(seq) = query_ctx.get_snapshot(region_id.as_u64())
+    {
+        Box::pin(RegionWatermarkStream::new(stream, region_id, seq)) as SendableRecordBatchStream
+    } else {
+        stream
+    }
 }
 
 /// Wraps a region read stream so terminal metrics can carry the scan-open watermark.
diff --git a/src/datanode/src/region_server/catalog.rs b/src/datanode/src/region_server/catalog.rs
index 1c0f48951f..a4df422b75 100644
--- a/src/datanode/src/region_server/catalog.rs
+++ b/src/datanode/src/region_server/catalog.rs
@@ -27,6 +27,7 @@ use datafusion_expr::{LogicalPlan, TableSource};
 use futures::TryStreamExt;
 use session::context::QueryContextRef;
 use snafu::{OptionExt, ResultExt};
+use store_api::region_info::RegionInfoEntry;
 use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry};
 use store_api::storage::RegionId;
 
@@ -41,6 +42,7 @@ enum InternalTableKind {
     InspectSstManifest,
     InspectSstStorage,
     InspectSstIndexMeta,
+    InspectRegionInfo,
 }
 
 impl InternalTableKind {
@@ -55,6 +57,9 @@ impl InternalTableKind {
         if name.eq_ignore_ascii_case(PuffinIndexMetaEntry::reserved_table_name_for_inspection()) {
             return Some(Self::InspectSstIndexMeta);
         }
+        if name.eq_ignore_ascii_case(RegionInfoEntry::reserved_table_name_for_inspection()) {
+            return Some(Self::InspectRegionInfo);
+        }
         None
     }
 
@@ -64,6 +69,7 @@ impl InternalTableKind {
             Self::InspectSstManifest => server.inspect_sst_manifest_provider().await,
             Self::InspectSstStorage => server.inspect_sst_storage_provider().await,
             Self::InspectSstIndexMeta => server.inspect_sst_index_meta_provider().await,
+            Self::InspectRegionInfo => server.inspect_region_info_provider().await,
         }
     }
 }
@@ -128,6 +134,29 @@ impl RegionServer {
         let table = MemTable::try_new(schema, vec![vec![batch]]).context(DataFusionSnafu)?;
         Ok(Arc::new(table))
     }
+
+    /// Expose region info across the engine as an in-memory table.
+    ///
+    /// Fails if the mito engine is not available or the record batch or table cannot be built.
+    pub async fn inspect_region_info_provider(&self) -> Result<Arc<dyn TableProvider>> {
+        // The guard lives only inside this block, so it is released before the `.await` below.
+        let mito = {
+            let guard = self.inner.mito_engine.read().unwrap();
+            guard.as_ref().cloned().context(UnexpectedSnafu {
+                violated: "mito engine not available",
+            })?
+        };
+
+        let entries = mito.all_region_infos().await;
+        let schema = RegionInfoEntry::schema().arrow_schema().clone();
+        // Convert all entries into one record batch that backs the `MemTable`.
+        let batch = RegionInfoEntry::to_record_batch(&entries)
+            .map_err(DataFusionError::from)
+            .context(DataFusionSnafu)?;
+
+        let table = MemTable::try_new(schema, vec![vec![batch]]).context(DataFusionSnafu)?;
+        Ok(Arc::new(table))
+    }
 }
 
 /// A catalog list that resolves `TableProvider` by table name:
@@ -347,6 +372,7 @@ mod tests {
     use datatypes::arrow::array::Int32Array;
     use datatypes::arrow::datatypes::{DataType, Field, Schema};
     use datatypes::arrow::record_batch::RecordBatch;
+    use store_api::region_info::RegionInfoEntry;
 
     use super::*; // bring rewrite() into scope
 
@@ -409,6 +435,18 @@ mod tests {
             b3.reserved_table_needed,
             vec![InternalTableKind::InspectSstManifest]
         );
+
+        let region_info = RegionInfoEntry::reserved_table_name_for_inspection();
+        let plan4 = table_scan(Some(region_info), &schema, None)
+            .unwrap()
+            .build()
+            .unwrap();
+        let b4 = NameAwareDataSourceInjectorBuilder::from_plan(&plan4).unwrap();
+        assert!(!b4.need_region_provider);
+        assert_eq!(
+            b4.reserved_table_needed,
+            vec![InternalTableKind::InspectRegionInfo]
+        );
     }
 
     #[test]
@@ -445,6 +483,39 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_rewriter_replaces_with_region_info_reserved_source() {
+        // A bare scan over the reserved region-info inspection table name.
+        let reserved_name = RegionInfoEntry::reserved_table_name_for_inspection();
+        let scan_plan = table_scan(Some(reserved_name), &test_schema(), None)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // The injector knows exactly one reserved source, keyed by the region-info kind.
+        let expected_source = provider_as_source(empty_mem_table());
+        let reserved_sources = HashMap::from([(
+            InternalTableKind::InspectRegionInfo,
+            expected_source.clone(),
+        )]);
+
+        let mut injector = NameAwareDataSourceInjector {
+            reserved_sources,
+            region_source: None,
+        };
+
+        let transformed = scan_plan.rewrite(&mut injector).unwrap();
+
+        // The rewritten scan must reference the very same `Arc` that was registered above.
+        let LogicalPlan::TableScan(scan) = transformed.data else {
+            panic!("expected TableScan after rewrite");
+        };
+
+        let actual_ptr = Arc::as_ptr(&scan.source);
+        let expected_ptr = Arc::as_ptr(&expected_source);
+        assert!(std::ptr::eq(actual_ptr, expected_ptr));
+    }
+
     #[test]
     fn test_rewriter_replaces_with_region_source_for_normal() {
         let schema = test_schema();
diff --git a/src/datanode/src/utils.rs b/src/datanode/src/utils.rs
index 488ddacdf0..c5cd008c28 100644
--- a/src/datanode/src/utils.rs
+++ b/src/datanode/src/utils.rs
@@ -29,10 +29,28 @@ use tracing::info;
 use crate::error::{GetMetadataSnafu, Result};
 
 /// The requests to open regions.
-pub(crate) struct RegionOpenRequests {
-    pub leader_regions: Vec<(RegionId, RegionOpenRequest)>,
+pub struct RegionOpenRequests {
+    pub(crate) leader_regions: Vec<(RegionId, RegionOpenRequest)>,
     #[cfg(feature = "enterprise")]
-    pub follower_regions: Vec<(RegionId, RegionOpenRequest)>,
+    pub(crate) follower_regions: Vec<(RegionId, RegionOpenRequest)>,
+}
+
+impl RegionOpenRequests {
+    /// Consumes the requests and returns them as `(leader_regions, follower_regions)`.
+    #[allow(clippy::type_complexity)]
+    pub fn into_parts(
+        self,
+    ) -> (
+        Vec<(RegionId, RegionOpenRequest)>,
+        Vec<(RegionId, RegionOpenRequest)>,
+    ) {
+        // Without the `enterprise` feature the follower field does not exist: use an empty list.
+        #[cfg(not(feature = "enterprise"))]
+        let followers = Vec::new();
+        #[cfg(feature = "enterprise")]
+        let followers = self.follower_regions;
+        (self.leader_regions, followers)
+    }
 }
 
 fn group_region_by_topic(
@@ -58,7 +76,8 @@ fn get_replay_checkpoint(
     })
 }
 
-pub(crate) async fn build_region_open_requests(
+/// Builds region-open requests from persisted metadata.
+pub async fn build_region_open_requests(
     node_id: DatanodeId,
     kv_backend: KvBackendRef,
 ) -> Result<RegionOpenRequests> {
diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs
index db657abbcb..33104084ad 100644
--- a/src/datatypes/src/json.rs
+++ b/src/datatypes/src/json.rs
@@ -26,12 +26,12 @@ use std::sync::Arc;
 
 use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value as Json};
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ResultExt};
 
 use crate::error::{self, InvalidJsonSnafu, Result, SerializeSnafu};
 use crate::json::value::{JsonValue, JsonVariant};
 use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType};
-use crate::types::{StructField, StructType};
+use crate::types::{JsonType, StructField, StructType};
 use crate::value::{ListValue, StructValue, Value};
 
 /// The configuration of JSON encoding
@@ -305,33 +305,47 @@ fn encode_json_array_with_context<'a>(
 ) -> Result<JsonValue> {
     let json_array_len = json_array.len();
     let mut items = Vec::with_capacity(json_array_len);
-    let mut element_type = item_type.cloned();
 
     for (index, value) in json_array.into_iter().enumerate() {
         let array_context = context.with_key(&index.to_string());
-        let item_value =
-            encode_json_value_with_context(value, element_type.as_ref(), &array_context)?;
-        let item_type = item_value.json_type().native_type().clone();
-        items.push(item_value.into_variant());
-
-        // Determine the common type for the list
-        if let Some(current_type) = &element_type {
-            // It's valid for json array to have different types of items, for example,
-            // ["a string", 1]. However, the `JsonValue` will be converted to Arrow list array,
-            // which requires all items have exactly same type. So we forbid the different types
-            // case here. Besides, it's not common for items in a json array to differ. So I think
-            // we are good here.
-            ensure!(
-                item_type == *current_type,
-                error::InvalidJsonSnafu {
-                    value: "all items in json array must have the same type"
-                }
-            );
-        } else {
-            element_type = Some(item_type);
-        }
+        let item_value = encode_json_value_with_context(value, None, &array_context)?;
+        items.push(item_value);
     }
 
+    // Per the JSON specification, an array may hold items of different types, for example
+    // ["a string", 1]. However, the `JsonValue` will be converted to an Arrow list array, which
+    // requires all items to have exactly the same type. So we merge the possibly different item
+    // types into one unified type, and align every item value to it.
+
+    let provided_item_type = item_type.map(|x| JsonType::new_json2(x.clone()));
+    let merged_item_type = if let Some((first, rests)) = items.split_first() {
+        let mut merged = first.json_type().clone();
+        for rest in rests.iter().map(|x| x.json_type()) {
+            if matches!(merged.native_type(), JsonNativeType::Variant) {
+                break;
+            }
+            merged.merge(rest)?;
+        }
+        Some(merged)
+    } else {
+        None
+    };
+    let unified_item_type = match (provided_item_type, merged_item_type) {
+        (Some(mut x), Some(y)) => {
+            x.merge(&y)?;
+            Some(x)
+        }
+        (x, y) => x.or(y),
+    };
+    if let Some(unified_item_type) = unified_item_type {
+        for item in &mut items {
+            item.try_align(&unified_item_type)?;
+        }
+    }
+    let items = items
+        .into_iter()
+        .map(|x| x.into_variant())
+        .collect::<Vec<_>>();
     Ok(JsonValue::new(JsonVariant::Array(items)))
 }
 
@@ -1050,11 +1064,8 @@ mod tests {
     fn test_encode_json_array_mixed_types() {
         let json = json!([1, "hello", true, 3.15]);
         let settings = JsonStructureSettings::Structured(None);
-        let result = settings.encode_with_type(json, None);
-        assert_eq!(
-            result.unwrap_err().to_string(),
-            "Invalid JSON: all items in json array must have the same type"
-        );
+        let value = settings.encode_with_type(json, None).unwrap();
+        assert_eq!(value.data_type().to_string(), r#"Json2["<Variant>"]"#);
     }
 
     #[test]
@@ -1276,12 +1287,12 @@ mod tests {
     #[test]
     fn test_encode_json_array_with_item_type() {
         let json = json!([1, 2, 3]);
-        let item_type = Arc::new(ConcreteDataType::uint64_datatype());
+        let item_type = Arc::new(ConcreteDataType::int64_datatype());
         let settings = JsonStructureSettings::Structured(None);
         let result = settings
             .encode_with_type(
                 json,
-                Some(&JsonNativeType::Array(Box::new(JsonNativeType::u64()))),
+                Some(&JsonNativeType::Array(Box::new(JsonNativeType::i64()))),
             )
             .unwrap()
             .into_json_inner()
@@ -1289,9 +1300,9 @@ mod tests {
 
         if let Value::List(list_value) = result {
             assert_eq!(list_value.items().len(), 3);
-            assert_eq!(list_value.items()[0], Value::UInt64(1));
-            assert_eq!(list_value.items()[1], Value::UInt64(2));
-            assert_eq!(list_value.items()[2], Value::UInt64(3));
+            assert_eq!(list_value.items()[0], Value::Int64(1));
+            assert_eq!(list_value.items()[1], Value::Int64(2));
+            assert_eq!(list_value.items()[2], Value::Int64(3));
             assert_eq!(list_value.datatype(), item_type);
         } else {
             panic!("Expected List value");
@@ -2249,10 +2260,10 @@ mod tests {
             )])),
         );
 
-        let decoded_struct = settings.decode_struct(array_struct);
+        let decoded_struct = settings.decode_struct(array_struct).unwrap();
         assert_eq!(
-            decoded_struct.unwrap_err().to_string(),
-            "Invalid JSON: all items in json array must have the same type"
+            format!("{decoded_struct:?}"),
+            r#"StructValue { items: [List(ListValue { items: [Binary(Bytes(b"1")), Binary(Bytes(b"\"hello\"")), Binary(Bytes(b"true")), Binary(Bytes(b"3.15"))], datatype: Binary(BinaryType { repr_type: Binary }) })], fields: StructType { fields: [StructField { name: "value", data_type: List(ListType { item_type: Binary(BinaryType { repr_type: Binary }) }), nullable: true, metadata: {} }] } }"#
         );
     }
 
diff --git a/src/datatypes/src/json/value.rs b/src/datatypes/src/json/value.rs
index f3b652a549..4350630003 100644
--- a/src/datatypes/src/json/value.rs
+++ b/src/datatypes/src/json/value.rs
@@ -65,6 +65,14 @@ impl JsonNumber {
             JsonNumber::Float(n) => n.0,
         }
     }
+
+    fn native_type(&self) -> JsonNativeType {
+        match *self {
+            Self::PosInt(_) => JsonNativeType::u64(),
+            Self::NegInt(_) => JsonNativeType::i64(),
+            Self::Float(_) => JsonNativeType::f64(),
+        }
+    }
 }
 
 impl From<u64> for JsonNumber {
@@ -147,26 +155,14 @@ impl JsonVariant {
         match self {
             JsonVariant::Null => JsonNativeType::Null,
             JsonVariant::Bool(_) => JsonNativeType::Bool,
-            JsonVariant::Number(n) => match n {
-                JsonNumber::PosInt(_) => JsonNativeType::u64(),
-                JsonNumber::NegInt(_) => JsonNativeType::i64(),
-                JsonNumber::Float(_) => JsonNativeType::f64(),
-            },
+            JsonVariant::Number(n) => n.native_type(),
             JsonVariant::String(_) => JsonNativeType::String,
             JsonVariant::Array(array) => {
-                let item_type = if let Some(first) = array.first() {
-                    first.native_type()
-                } else {
-                    JsonNativeType::Null
-                };
-                JsonNativeType::Array(Box::new(item_type))
+                json_array_native_type(array.iter().map(JsonVariant::native_type))
+            }
+            JsonVariant::Object(object) => {
+                json_object_native_type(object.iter().map(|(k, v)| (k, v.native_type())))
             }
-            JsonVariant::Object(object) => JsonNativeType::Object(
-                object
-                    .iter()
-                    .map(|(k, v)| (k.clone(), v.native_type()))
-                    .collect(),
-            ),
             JsonVariant::Variant(_) => JsonNativeType::Variant,
         }
     }
@@ -469,6 +465,7 @@ impl JsonValue {
                         .collect::<Result<_>>()?,
                 ),
 
+                (JsonVariant::Object(kvs), _) if kvs.is_empty() => JsonVariant::Null,
                 (JsonVariant::Object(mut kvs), JsonNativeType::Object(expected)) => {
                     ensure!(
                         expected.keys().len() >= kvs.keys().len()
@@ -517,7 +514,7 @@ impl JsonValue {
 
         let x = std::mem::take(&mut self.json_variant);
         self.json_variant = helper(x, expected.native_type())?;
-        self.json_type = OnceLock::from(expected.clone());
+        self.json_type = OnceLock::new();
         Ok(())
     }
 }
@@ -623,35 +620,55 @@ pub enum JsonVariantRef<'a> {
 }
 
 impl JsonVariantRef<'_> {
-    fn json_type(&self) -> JsonType {
-        fn native_type(v: &JsonVariantRef<'_>) -> JsonNativeType {
-            match v {
-                JsonVariantRef::Null => JsonNativeType::Null,
-                JsonVariantRef::Bool(_) => JsonNativeType::Bool,
-                JsonVariantRef::Number(n) => match n {
-                    JsonNumber::PosInt(_) => JsonNativeType::u64(),
-                    JsonNumber::NegInt(_) => JsonNativeType::i64(),
-                    JsonNumber::Float(_) => JsonNativeType::f64(),
-                },
-                JsonVariantRef::String(_) => JsonNativeType::String,
-                JsonVariantRef::Array(array) => {
-                    let item_type = if let Some(first) = array.first() {
-                        native_type(first)
-                    } else {
-                        JsonNativeType::Null
-                    };
-                    JsonNativeType::Array(Box::new(item_type))
-                }
-                JsonVariantRef::Object(object) => JsonNativeType::Object(
-                    object
-                        .iter()
-                        .map(|(k, v)| (k.to_string(), native_type(v)))
-                        .collect(),
-                ),
-                JsonVariantRef::Variant(_) => JsonNativeType::Variant,
+    fn native_type(&self) -> JsonNativeType {
+        match self {
+            JsonVariantRef::Null => JsonNativeType::Null,
+            JsonVariantRef::Bool(_) => JsonNativeType::Bool,
+            JsonVariantRef::Number(n) => n.native_type(),
+            JsonVariantRef::String(_) => JsonNativeType::String,
+            JsonVariantRef::Array(array) => {
+                json_array_native_type(array.iter().map(JsonVariantRef::native_type))
             }
+            JsonVariantRef::Object(object) => {
+                json_object_native_type(object.iter().map(|(k, v)| (*k, v.native_type())))
+            }
+            JsonVariantRef::Variant(_) => JsonNativeType::Variant,
         }
-        JsonType::new_json2(native_type(self))
+    }
+
+    fn json_type(&self) -> JsonType {
+        JsonType::new_json2(self.native_type())
+    }
+}
+
+/// Derives the `Array` native type whose item type is the merge of all the given item types.
+fn json_array_native_type<I>(items: I) -> JsonNativeType
+where
+    I: IntoIterator<Item = JsonNativeType>,
+{
+    let mut remaining = items.into_iter();
+    // No items at all: the item type defaults to `Null`.
+    let mut unified = remaining.next().unwrap_or(JsonNativeType::Null);
+    for candidate in remaining {
+        // Once the merged type is `Variant`, the remaining items are not merged any more.
+        if let JsonNativeType::Variant = unified {
+            break;
+        }
+        unified.merge(&candidate);
+    }
+    JsonNativeType::Array(Box::new(unified))
+}
+
+fn json_object_native_type<I, K>(fields: I) -> JsonNativeType
+where
+    I: IntoIterator<Item = (K, JsonNativeType)>,
+    K: Into<String>,
+{
+    let mut entries = fields.into_iter().peekable();
+    if entries.peek().is_some() {
+        JsonNativeType::Object(entries.map(|(key, ty)| (key.into(), ty)).collect())
+    } else {
+        JsonNativeType::Null
     }
 }
 
@@ -941,7 +958,6 @@ mod tests {
                 ("name".to_string(), JsonVariant::Null),
             ])))
         );
-        assert_eq!(value.json_type(), &expected);
 
         // Object alignment should fail if the expected type misses any field from the value.
         let expected = JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([(
diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs
index 362357c5e6..652847da43 100644
--- a/src/datatypes/src/types/json_type.rs
+++ b/src/datatypes/src/types/json_type.rs
@@ -115,6 +115,14 @@ impl JsonNativeType {
             (JsonNativeType::Null, that) => that.clone(),
             (this, JsonNativeType::Null) => this,
             (this, that) if this == *that => this,
+
+            (JsonNativeType::Number(x), JsonNativeType::Number(y)) => {
+                JsonNativeType::Number(match (x, y) {
+                    (x, y) if x == *y => x,
+                    (JsonNumberType::F64, _) | (_, JsonNumberType::F64) => JsonNumberType::F64,
+                    _ => JsonNumberType::I64,
+                })
+            }
             _ => JsonNativeType::Variant,
         };
     }
@@ -128,7 +136,7 @@ impl JsonNativeType {
                 JsonNumberType::I64 => ArrowDataType::Int64,
                 JsonNumberType::F64 => ArrowDataType::Float64,
             },
-            JsonNativeType::String => ArrowDataType::Utf8,
+            JsonNativeType::String => ArrowDataType::Utf8View,
             JsonNativeType::Array(array) => {
                 ArrowDataType::List(Arc::new(Field::new("item", array.as_arrow_type(), true)))
             }
@@ -822,7 +830,7 @@ mod tests {
         test(
             "1.5",
             &mut JsonType::new_json2(JsonNativeType::i64()),
-            Ok(r#""<Variant>""#),
+            Ok(r#""<Number>""#),
         )?;
 
         // Object merge should preserve existing fields and append missing fields.
diff --git a/src/datatypes/src/vectors/json/array.rs b/src/datatypes/src/vectors/json/array.rs
index 75779821c5..b3bd24cd98 100644
--- a/src/datatypes/src/vectors/json/array.rs
+++ b/src/datatypes/src/vectors/json/array.rs
@@ -17,16 +17,24 @@ use std::sync::Arc;
 
 use arrow::compute;
 use arrow::util::display::{ArrayFormatter, FormatOptions};
+use arrow_array::builder::{
+    ArrayBuilder, BooleanBuilder, Float64Builder, Int64Builder, NullBuilder, StringViewBuilder,
+    make_builder,
+};
 use arrow_array::cast::AsArray;
 use arrow_array::types::{Float64Type, Int64Type, UInt64Type};
 use arrow_array::{Array, ArrayRef, GenericListArray, ListArray, StructArray, new_null_array};
 use arrow_schema::{DataType, FieldRef};
+use common_telemetry::debug;
 use serde_json::Value;
 use snafu::{OptionExt, ResultExt};
 
-use crate::arrow_array::{StringArray, binary_array_value, string_array_value};
+use crate::arrow_array::{
+    MutableBinaryArray, StringViewArray, binary_array_value, string_array_value,
+};
 use crate::error::{
-    AlignJsonArraySnafu, ArrowComputeSnafu, DeserializeSnafu, InvalidJsonSnafu, Result,
+    AlignJsonArraySnafu, ArrowComputeSnafu, CastTypeSnafu, DeserializeSnafu, InvalidJsonSnafu,
+    Result, SerializeSnafu,
 };
 
 pub struct JsonArray<'a> {
@@ -101,6 +109,12 @@ impl JsonArray<'_> {
             return Ok(self.inner.clone());
         }
 
+        debug!(
+            "Try aligning JSON array {} to data type {}",
+            self.inner.data_type(),
+            expect
+        );
+
         let struct_array = self.inner.as_struct_opt().context(AlignJsonArraySnafu {
             reason: "expect struct array",
         })?;
@@ -178,11 +192,23 @@ impl JsonArray<'_> {
     }
 
     fn try_cast(&self, to_type: &DataType) -> Result<ArrayRef> {
-        if compute::can_cast_types(self.inner.data_type(), to_type) {
+        let from_type = self.inner.data_type();
+        if from_type == to_type {
+            return Ok(self.inner.clone());
+        }
+
+        if from_type.is_binary() && !to_type.is_binary() {
+            return self.decode_variant(to_type);
+        }
+
+        if !from_type.is_binary() && to_type.is_binary() {
+            return self.encode_variant();
+        }
+
+        if compute::can_cast_types(from_type, to_type) {
             return compute::cast(&self.inner, to_type).context(ArrowComputeSnafu);
         }
 
-        // TODO(LFC): Cast according to `to_type` instead of formatting to String here.
         let formatter = ArrayFormatter::try_new(&self.inner, &FormatOptions::default())
             .context(ArrowComputeSnafu)?;
         let values = (0..self.inner.len())
@@ -192,7 +218,91 @@ impl JsonArray<'_> {
                     .then(|| formatter.value(i).to_string())
             })
             .collect::<Vec<_>>();
-        Ok(Arc::new(StringArray::from(values)))
+        Ok(Arc::new(StringViewArray::from(values)))
+    }
+
+    /// Encodes each element as JSON bytes in a binary array, keeping nulls as nulls.
+    fn encode_variant(&self) -> Result<ArrayRef> {
+        // Serialize every non-null value first, so the total byte size handed to the
+        // builder's `with_capacity` is known before any value is appended.
+        let row_count = self.inner.len();
+        let serialized = (0..row_count)
+            .map(|row| {
+                let json = self.try_get_value(row)?;
+                if json.is_null() {
+                    return Ok(None);
+                }
+                serde_json::to_vec(&json).map(Some).context(SerializeSnafu)
+            })
+            .collect::<Result<Vec<Option<Vec<u8>>>>>()?;
+        let data_capacity = serialized.iter().flatten().map(Vec::len).sum::<usize>();
+
+        let mut binary_builder = MutableBinaryArray::with_capacity(row_count, data_capacity);
+        for bytes in serialized {
+            binary_builder.append_option(bytes);
+        }
+        Ok(Arc::new(binary_builder.finish()))
+    }
+
+    fn decode_variant(&self, to_type: &DataType) -> Result<ArrayRef> {
+        // Narrows the type-erased builder to the concrete builder type `B`.
+        fn typed<'a, B: ArrayBuilder>(
+            erased: &'a mut dyn ArrayBuilder,
+            to_type: &DataType,
+        ) -> Result<&'a mut B> {
+            let concrete = erased.as_any_mut().downcast_mut::<B>();
+            concrete.with_context(|| CastTypeSnafu {
+                msg: format!("Expect ArrayBuilder is of type {to_type}"),
+            })
+        }
+
+        let rows = self.inner.len();
+        let mut builder = make_builder(to_type, rows);
+        match to_type {
+            DataType::Null => typed::<NullBuilder>(builder.as_mut(), to_type)?.append_nulls(rows),
+            DataType::Boolean => {
+                let bools = typed::<BooleanBuilder>(builder.as_mut(), to_type)?;
+                for row in 0..rows {
+                    let json = self.try_get_value(row)?;
+                    bools.append_option(json.as_bool());
+                }
+            }
+            DataType::Int64 => {
+                let ints = typed::<Int64Builder>(builder.as_mut(), to_type)?;
+                for row in 0..rows {
+                    let json = self.try_get_value(row)?;
+                    ints.append_option(json.as_i64());
+                }
+            }
+            DataType::Float64 => {
+                let floats = typed::<Float64Builder>(builder.as_mut(), to_type)?;
+                for row in 0..rows {
+                    let json = self.try_get_value(row)?;
+                    floats.append_option(json.as_f64());
+                }
+            }
+            DataType::Utf8View => {
+                let strings = typed::<StringViewBuilder>(builder.as_mut(), to_type)?;
+                for row in 0..rows {
+                    let json = self.try_get_value(row)?;
+                    if json.is_null() {
+                        strings.append_null();
+                        continue;
+                    }
+                    match json.as_str() {
+                        Some(text) => strings.append_value(text),
+                        None => strings.append_value(json.to_string()),
+                    }
+                }
+            }
+            _ => {
+                return CastTypeSnafu {
+                    msg: format!("Cannot cast JSON value to {to_type}"),
+                }
+                .fail();
+            }
+        }
+        Ok(builder.finish())
     }
 }
 
@@ -231,7 +341,9 @@ impl<'a> From<&'a ArrayRef> for JsonArray<'a> {
 #[cfg(test)]
 mod test {
     use arrow_array::types::Int64Type;
-    use arrow_array::{BinaryArray, BooleanArray, Float64Array, Int32Array, Int64Array, ListArray};
+    use arrow_array::{
+        BinaryArray, BooleanArray, Float64Array, Int32Array, Int64Array, ListArray, StringArray,
+    };
     use arrow_schema::{Field, Fields};
     use serde_json::json;
 
diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs
index be79a921c7..7ca1ff2f6a 100644
--- a/src/datatypes/src/vectors/json/builder.rs
+++ b/src/datatypes/src/vectors/json/builder.rs
@@ -89,7 +89,9 @@ impl MutableVector for JsonVectorBuilder {
             .fail();
         };
         let json_type = value.json_type();
-        self.merged_type.merge(json_type)?;
+        if !self.merged_type.is_include(json_type) {
+            self.merged_type.merge(json_type)?;
+        }
 
         let value = JsonValue::new(JsonVariant::from(value.variant().clone()));
         self.values.push(value);
diff --git a/src/flow/src/adapter/flownode_impl.rs b/src/flow/src/adapter/flownode_impl.rs
index 53a3265d7d..f4ca149f1a 100644
--- a/src/flow/src/adapter/flownode_impl.rs
+++ b/src/flow/src/adapter/flownode_impl.rs
@@ -465,6 +465,11 @@ impl FlowDualEngine {
         Ok(())
     }
 
+    /// Reconciles in-memory flow tasks with persisted metadata via `check_flow_consistent(true, true)`.
+    pub async fn reconcile_flows_from_metadata(&self) -> Result<(), Error> {
+        self.check_flow_consistent(true, true).await
+    }
+
     /// TODO(discord9): also add a `exists` api using flow metadata manager's `exists` method
     async fn flow_exist_in_metadata(&self, flow_id: FlowId) -> Result<bool, Error> {
         self.flow_metadata_manager
diff --git a/src/flow/src/batching_mode.rs b/src/flow/src/batching_mode.rs
index 4162daa20c..580762a142 100644
--- a/src/flow/src/batching_mode.rs
+++ b/src/flow/src/batching_mode.rs
@@ -20,12 +20,15 @@ use common_grpc::channel_manager::ClientTlsOption;
 use serde::{Deserialize, Serialize};
 use session::ReadPreference;
 
+mod checkpoint;
 pub(crate) mod engine;
 pub(crate) mod frontend_client;
+mod incremental_filter;
 mod state;
+mod table_creator;
 mod task;
 mod time_window;
-mod utils;
+pub(crate) mod utils;
 
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct BatchingModeOptions {
diff --git a/src/flow/src/batching_mode/checkpoint.rs b/src/flow/src/batching_mode/checkpoint.rs
new file mode 100644
index 0000000000..7341d3d9e7
--- /dev/null
+++ b/src/flow/src/batching_mode/checkpoint.rs
@@ -0,0 +1,127 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::batching_mode::state::CheckpointMode;
+
+pub(super) const CHECKPOINT_DECISION_ADVANCE: &str = "advance";
+pub(super) const CHECKPOINT_DECISION_FALLBACK: &str = "fallback";
+pub(super) const CHECKPOINT_REASON_NONE: &str = "none";
+
+/// Why the task fell back to full snapshot mode.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum FlowQueryFallbackReason {
+    /// The query result did not include a region-watermark map at all.
+    MissingRegionWatermark,
+    /// Some participating regions could not prove safe advancement against
+    /// both the returned watermarks and the checkpoint map.
+    IncompleteRegionWatermark,
+    /// The query only covered part of the dirty backlog, so global checkpoints
+    /// cannot advance yet. Incremental SQL drains all dirty windows before
+    /// checkpoint advancement; this primarily protects scoped full-snapshot
+    /// runs capped by the per-query dirty-window limit.
+    DirtyBacklogPending,
+    /// The datanode detected a stale incremental cursor and the Flow
+    /// must recompute from scratch.
+    StaleCursor,
+    /// A non-stale-cursor query failure; the Flow resets to full snapshot
+    /// to avoid cascading errors.
+    IncrementalQueryFailure,
+    /// Incremental mode has been permanently disabled for this Flow
+    /// (e.g. because the query shape is not incrementally safe).
+    IncrementalDisabled,
+}
+
+impl FlowQueryFallbackReason {
+    pub(super) fn as_label(self) -> &'static str {
+        match self {
+            Self::MissingRegionWatermark => "missing_region_watermark",
+            Self::IncompleteRegionWatermark => "incomplete_region_watermark",
+            Self::DirtyBacklogPending => "dirty_backlog_pending",
+            Self::StaleCursor => "stale_cursor",
+            Self::IncrementalQueryFailure => "incremental_query_failure",
+            Self::IncrementalDisabled => "incremental_disabled",
+        }
+    }
+}
+
+/// Decision produced by `BatchingTask::apply_query_result_to_state` after
+/// each Flow query execution. Describes whether the task advanced its
+/// checkpoint state or fell back to full snapshot, and why.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum FlowCheckpointDecision {
+    /// FullSnapshot → Incremental transition.
+    ///
+    /// The query exercised every participating region, all returned valid
+    /// watermarks, and the checkpoint map was populated from scratch.
+    /// Subsequent executions will use incremental after-seqs.
+    AdvancedFromFullSnapshot {
+        participating_regions: usize,
+        watermarks: usize,
+    },
+    /// Existing Incremental → Incremental (in-place advancement).
+    ///
+    /// A subset of participating regions advanced their watermarks. The
+    /// task stays in incremental mode with an updated checkpoint map.
+    AdvancedIncremental {
+        participating_regions: usize,
+        watermarks: usize,
+    },
+    /// Any mode → FullSnapshot.
+    ///
+    /// Watermark information was incomplete, a participating region was
+    /// absent from the existing checkpoint map, the task has permanently
+    /// disabled incremental mode, or the query itself failed. The task
+    /// resets to full snapshot semantics for the next execution.
+    FallbackToFullSnapshot {
+        previous_mode: CheckpointMode,
+        reason: FlowQueryFallbackReason,
+    },
+}
+
+impl FlowCheckpointDecision {
+    /// Label of the checkpoint mode this decision started from; for a fallback that is the
+    /// recorded `previous_mode`.
+    pub(super) fn mode_label(self) -> &'static str {
+        let mode = match self {
+            Self::AdvancedFromFullSnapshot { .. } => CheckpointMode::FullSnapshot,
+            Self::AdvancedIncremental { .. } => CheckpointMode::Incremental,
+            Self::FallbackToFullSnapshot { previous_mode, .. } => previous_mode,
+        };
+        checkpoint_mode_label(mode)
+    }
+
+    pub(super) fn decision_label(self) -> &'static str {
+        match self {
+            Self::FallbackToFullSnapshot { .. } => CHECKPOINT_DECISION_FALLBACK,
+            Self::AdvancedFromFullSnapshot { .. } | Self::AdvancedIncremental { .. } => {
+                CHECKPOINT_DECISION_ADVANCE
+            }
+        }
+    }
+
+    /// Label of the fallback reason, or `CHECKPOINT_REASON_NONE` when the task advanced.
+    pub(super) fn reason_label(self) -> &'static str {
+        let Self::FallbackToFullSnapshot { reason, .. } = self else {
+            return CHECKPOINT_REASON_NONE;
+        };
+        reason.as_label()
+    }
+}
+
+pub(super) fn checkpoint_mode_label(mode: CheckpointMode) -> &'static str {
+    match mode {
+        CheckpointMode::FullSnapshot => "full_snapshot",
+        CheckpointMode::Incremental => "incremental",
+    }
+}
diff --git a/src/flow/src/batching_mode/engine.rs b/src/flow/src/batching_mode/engine.rs
index 054f5db9d6..f37e54d80b 100644
--- a/src/flow/src/batching_mode/engine.rs
+++ b/src/flow/src/batching_mode/engine.rs
@@ -59,8 +59,7 @@ use crate::{CreateFlowArgs, Error, FlowId, TableName};
 ///
 /// TODO(discord9): determine how to configure refresh rate
 pub struct BatchingEngine {
-    tasks: RwLock<BTreeMap<FlowId, BatchingTask>>,
-    shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
+    runtime: RwLock<FlowRuntimeRegistry>,
     /// frontend client for insert request
     pub(crate) frontend_client: Arc<FrontendClient>,
     flow_metadata_manager: FlowMetadataManagerRef,
@@ -72,6 +71,51 @@ pub struct BatchingEngine {
     pub(crate) batch_opts: Arc<BatchingModeOptions>,
 }
 
+/// Live batching tasks and their shutdown senders, keyed by flow id and kept
+/// together so both maps can be updated under a single lock.
+#[derive(Default)]
+struct FlowRuntimeRegistry {
+    tasks: BTreeMap<FlowId, BatchingTask>,
+    shutdown_txs: BTreeMap<FlowId, oneshot::Sender<()>>,
+}
+
+impl FlowRuntimeRegistry {
+    /// Registers `task` and `shutdown_tx` under `flow_id`, returning any replaced pair.
+    fn insert(
+        &mut self,
+        flow_id: FlowId,
+        task: BatchingTask,
+        shutdown_tx: oneshot::Sender<()>,
+    ) -> (Option<BatchingTask>, Option<oneshot::Sender<()>>) {
+        let previous_task = self.tasks.insert(flow_id, task);
+        let previous_shutdown_tx = self.shutdown_txs.insert(flow_id, shutdown_tx);
+        (previous_task, previous_shutdown_tx)
+    }
+
+    /// Removes the task and its shutdown sender; `None` when no task is registered.
+    fn remove(&mut self, flow_id: FlowId) -> Option<(BatchingTask, Option<oneshot::Sender<()>>)> {
+        let task = self.tasks.remove(&flow_id)?;
+        Some((task, self.shutdown_txs.remove(&flow_id)))
+    }
+
+    /// Removes the entry for `flow_id` only if the registered task shares its
+    /// `state` allocation with `task`; otherwise returns `(None, None)`.
+    fn remove_if_current(
+        &mut self,
+        flow_id: FlowId,
+        task: &BatchingTask,
+    ) -> (Option<BatchingTask>, Option<oneshot::Sender<()>>) {
+        let registered = self.tasks.get(&flow_id);
+        if !registered.is_some_and(|current| Arc::ptr_eq(&current.state, &task.state)) {
+            return (None, None);
+        }
+        match self.remove(flow_id) {
+            Some((removed_task, removed_shutdown_tx)) => (Some(removed_task), removed_shutdown_tx),
+            None => (None, None),
+        }
+    }
+}
+
 impl BatchingEngine {
     pub fn new(
         frontend_client: Arc<FrontendClient>,
@@ -82,8 +126,7 @@ impl BatchingEngine {
         batch_opts: BatchingModeOptions,
     ) -> Self {
         Self {
-            tasks: Default::default(),
-            shutdown_txs: Default::default(),
+            runtime: Default::default(),
             frontend_client,
             flow_metadata_manager,
             table_meta,
@@ -95,8 +138,9 @@ impl BatchingEngine {
 
     /// Returns last execution timestamps (millisecond) for all batching flows.
     pub async fn get_last_exec_time_map(&self) -> BTreeMap<FlowId, i64> {
-        let tasks = self.tasks.read().await;
-        tasks
+        let runtime = self.runtime.read().await;
+        runtime
+            .tasks
             .iter()
             .filter_map(|(flow_id, task)| {
                 task.last_execution_time_millis()
@@ -151,10 +195,17 @@ impl BatchingEngine {
 
         let group_by_table_name = Arc::new(group_by_table_name);
 
+        let tasks = self
+            .runtime
+            .read()
+            .await
+            .tasks
+            .values()
+            .cloned()
+            .collect::<Vec<_>>();
         let mut handles = Vec::new();
-        let tasks = self.tasks.read().await;
 
-        for (_flow_id, task) in tasks.iter() {
+        for task in tasks {
             let src_table_names = &task.config.source_table_names;
 
             if src_table_names
@@ -204,7 +255,6 @@ impl BatchingEngine {
             });
             handles.push(handle);
         }
-        drop(tasks);
         for handle in handles {
             match handle.await {
                 Err(e) => {
@@ -274,9 +324,16 @@ impl BatchingEngine {
 
         let group_by_table_name = Arc::new(group_by_table_name);
 
+        let tasks = self
+            .runtime
+            .read()
+            .await
+            .tasks
+            .values()
+            .cloned()
+            .collect::<Vec<_>>();
         let mut handles = Vec::new();
-        let tasks = self.tasks.read().await;
-        for (_flow_id, task) in tasks.iter() {
+        for task in tasks {
             let src_table_names = &task.config.source_table_names;
 
             if src_table_names
@@ -327,8 +384,6 @@ impl BatchingEngine {
                 }
             }
         }
-        drop(tasks);
-
         Ok(())
     }
 }
@@ -390,7 +445,7 @@ impl BatchingEngine {
 
         // or replace logic
         {
-            let is_exist = self.tasks.read().await.contains_key(&flow_id);
+            let is_exist = self.runtime.read().await.tasks.contains_key(&flow_id);
             match (create_if_not_exists, or_replace, is_exist) {
                 // if replace, ignore that old flow exists
                 (_, true, true) => {
@@ -521,17 +576,60 @@ impl BatchingEngine {
         // check execute once first to detect any error early
         task.check_or_create_sink_table(&engine, &frontend).await?;
 
+        let (start_tx, start_rx) = oneshot::channel();
+
         // TODO(discord9): use time wheel or what for better
         let handle = common_runtime::spawn_global(async move {
-            task_inner.start_executing_loop(engine, frontend).await;
+            if start_rx.await.is_ok() {
+                task_inner.start_executing_loop(engine, frontend).await;
+            }
         });
         task.state.write().unwrap().task_handle = Some(handle);
+        let task_for_rollback = task.clone();
 
-        // only replace here not earlier because we want the old one intact if something went wrong before this line
-        let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
-        drop(replaced_old_task_opt);
+        // Only replace here, not earlier, because we want the old one intact if
+        // something went wrong before this line. Keep the task and shutdown
+        // sender in one registry lock so create/remove can't observe one
+        // without the other.
+        let (replaced_old_task_opt, replaced_old_shutdown_tx) = {
+            let mut runtime = self.runtime.write().await;
 
-        self.shutdown_txs.write().await.insert(flow_id, tx);
+            let is_exist = runtime.tasks.contains_key(&flow_id);
+            match (create_if_not_exists, or_replace, is_exist) {
+                (_, true, true) => {
+                    info!(
+                        "Replacing flow with id={} after final registry check",
+                        flow_id
+                    );
+                }
+                (false, false, true) => {
+                    abort_flow_task(flow_id, Some(task), "unregistered");
+                    return FlowAlreadyExistSnafu { id: flow_id }.fail();
+                }
+                (true, false, true) => {
+                    info!(
+                        "Flow with id={} already exists at final registry check, do nothing",
+                        flow_id
+                    );
+                    abort_flow_task(flow_id, Some(task), "unregistered");
+                    return Ok(None);
+                }
+                (_, _, false) => (),
+            }
+
+            runtime.insert(flow_id, task, tx)
+        };
+
+        notify_flow_shutdown(flow_id, replaced_old_shutdown_tx, "replaced");
+        abort_flow_task(flow_id, replaced_old_task_opt, "replaced");
+        if start_tx.send(()).is_err() {
+            self.rollback_flow_runtime_if_current(flow_id, &task_for_rollback)
+                .await;
+            UnexpectedSnafu {
+                reason: format!("Failed to start flow {flow_id} due to task already dropped"),
+            }
+            .fail()?;
+        }
 
         Ok(Some(flow_id))
     }
@@ -662,21 +760,25 @@ impl BatchingEngine {
     }
 
     pub async fn remove_flow_inner(&self, flow_id: FlowId) -> Result<(), Error> {
-        if self.tasks.write().await.remove(&flow_id).is_none() {
-            warn!("Flow {flow_id} not found in tasks");
-            FlowNotFoundSnafu { id: flow_id }.fail()?;
-        }
-        let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
+        let (task, shutdown_tx) = {
+            let mut runtime = self.runtime.write().await;
+            let Some((task, shutdown_tx)) = runtime.remove(flow_id) else {
+                warn!("Flow {flow_id} not found in tasks");
+                FlowNotFoundSnafu { id: flow_id }.fail()?
+            };
+            (task, shutdown_tx)
+        };
+
+        let had_shutdown_tx = notify_flow_shutdown(flow_id, shutdown_tx, "removed");
+        abort_flow_task(flow_id, Some(task), "removed");
+
+        if !had_shutdown_tx {
             UnexpectedSnafu {
                 reason: format!("Can't found shutdown tx for flow {flow_id}"),
             }
             .fail()?
-        };
-        if tx.send(()).is_err() {
-            warn!(
-                "Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?"
-            )
         }
+
         Ok(())
     }
 
@@ -688,7 +790,7 @@ impl BatchingEngine {
         // this is only useful for the case when we are flushing the flow right after inserting data into it
         // TODO(discord9): find a better way to ensure the data is ready, maybe inform flownode from frontend?
         tokio::time::sleep(std::time::Duration::from_millis(100)).await;
-        let task = self.tasks.read().await.get(&flow_id).cloned();
+        let task = self.runtime.read().await.tasks.get(&flow_id).cloned();
         let task = task.with_context(|| FlowNotFoundSnafu { id: flow_id })?;
 
         let time_window_size = task
@@ -713,7 +815,7 @@ impl BatchingEngine {
             )
             .await?;
 
-        let affected_rows = res.map(|(r, _)| r).unwrap_or_default() as usize;
+        let affected_rows = res.map(|(r, _)| r).unwrap_or_default();
         debug!(
             "Successfully flush flow {flow_id}, affected rows={}",
             affected_rows
@@ -723,8 +825,46 @@ impl BatchingEngine {
 
     /// Determine if the batching mode flow task exists with given flow id
     pub async fn flow_exist_inner(&self, flow_id: FlowId) -> bool {
-        self.tasks.read().await.contains_key(&flow_id)
+        self.runtime.read().await.tasks.contains_key(&flow_id)
     }
+
+    /// Removes `task` from the registry if it is still the registered task for
+    /// `flow_id`, then signals shutdown to and aborts whatever was removed.
+    async fn rollback_flow_runtime_if_current(&self, flow_id: FlowId, task: &BatchingTask) {
+        let (removed_task, removed_shutdown_tx) =
+            self.runtime.write().await.remove_if_current(flow_id, task);
+
+        notify_flow_shutdown(flow_id, removed_shutdown_tx, "rolled back");
+        abort_flow_task(flow_id, removed_task, "rolled back");
+    }
+}
+
+/// Sends the shutdown signal through `tx` when a sender is present. A failed
+/// send (receiver already dropped) is only logged. Returns whether a sender
+/// was supplied, regardless of whether the send succeeded.
+fn notify_flow_shutdown(flow_id: FlowId, tx: Option<oneshot::Sender<()>>, action: &str) -> bool {
+    let sent = tx.map(|tx| tx.send(()));
+    if matches!(sent, Some(Err(_))) {
+        warn!(
+            "Fail to shutdown {action} flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?"
+        );
+    }
+
+    sent.is_some()
+}
+
+/// Aborts the spawned handle stored in the task state, if any. Returns `true`
+/// only when a task was given and it still held a handle to abort.
+fn abort_flow_task(flow_id: FlowId, task: Option<BatchingTask>, action: &str) -> bool {
+    let Some(task) = task else {
+        return false;
+    };
+    let Some(handle) = task.state.write().unwrap().task_handle.take() else {
+        return false;
+    };
+    handle.abort();
+    debug!("Aborted {action} flow task {flow_id}");
+    true
 }
 
 impl FlowEngine for BatchingEngine {
@@ -741,7 +881,14 @@ impl FlowEngine for BatchingEngine {
         Ok(self.flow_exist_inner(flow_id).await)
     }
     async fn list_flows(&self) -> Result<impl IntoIterator<Item = FlowId>, Error> {
-        Ok(self.tasks.read().await.keys().cloned().collect::<Vec<_>>())
+        Ok(self
+            .runtime
+            .read()
+            .await
+            .tasks
+            .keys()
+            .cloned()
+            .collect::<Vec<_>>())
     }
     async fn handle_flow_inserts(
         &self,
@@ -756,3 +903,241 @@ impl FlowEngine for BatchingEngine {
         self.handle_mark_dirty_time_window(req).await
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use catalog::memory::new_memory_catalog_manager;
+    use common_meta::key::TableMetadataManager;
+    use common_meta::key::flow::FlowMetadataManager;
+    use common_meta::kv_backend::memory::MemoryKvBackend;
+    use query::options::QueryOptions;
+    use session::context::QueryContext;
+
+    use super::*;
+    use crate::test_utils::create_test_query_engine;
+
+    /// Test guard that fires its sender when dropped.
+    struct DropNotify(Option<oneshot::Sender<()>>);
+
+    impl Drop for DropNotify {
+        fn drop(&mut self) {
+            // The observing receiver may already be gone; ignore that.
+            let _ = self.0.take().map(|tx| tx.send(()));
+        }
+    }
+
+    async fn new_test_engine() -> BatchingEngine {
+        let kv_backend = Arc::new(MemoryKvBackend::new());
+        let table_meta = Arc::new(TableMetadataManager::new(kv_backend.clone()));
+        table_meta.init().await.unwrap();
+        let flow_meta = Arc::new(FlowMetadataManager::new(kv_backend));
+        let catalog_manager = new_memory_catalog_manager().unwrap();
+        let query_engine = create_test_query_engine();
+        let (frontend_client, _handler) =
+            FrontendClient::from_empty_grpc_handler(QueryOptions::default());
+
+        BatchingEngine::new(
+            Arc::new(frontend_client),
+            query_engine,
+            flow_meta,
+            table_meta,
+            catalog_manager,
+            BatchingModeOptions::default(),
+        )
+    }
+
+    async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) {
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+        let plan = sql_to_df_plan(
+            ctx.clone(),
+            query_engine.clone(),
+            "SELECT number, ts FROM numbers_with_ts",
+            true,
+        )
+        .await
+        .unwrap();
+        let (tx, rx) = oneshot::channel();
+
+        let task = BatchingTask::try_new(TaskArgs {
+            flow_id,
+            query: "SELECT number, ts FROM numbers_with_ts",
+            plan,
+            time_window_expr: None,
+            expire_after: None,
+            sink_table_name: [
+                "greptime".to_string(),
+                "public".to_string(),
+                "sink".to_string(),
+            ],
+            source_table_names: vec![[
+                "greptime".to_string(),
+                "public".to_string(),
+                "numbers_with_ts".to_string(),
+            ]],
+            query_ctx: ctx,
+            catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+            shutdown_rx: rx,
+            batch_opts: Arc::new(BatchingModeOptions::default()),
+            flow_eval_interval: None,
+        })
+        .unwrap();
+
+        (task, tx)
+    }
+
+    async fn install_abort_observed_handle(task: &BatchingTask) -> oneshot::Receiver<()> {
+        let (drop_tx, drop_rx) = oneshot::channel();
+        let (entered_tx, entered_rx) = oneshot::channel();
+        let handle = tokio::spawn(async move {
+            let _guard = DropNotify(Some(drop_tx));
+            let _ = entered_tx.send(());
+            std::future::pending::<()>().await;
+        });
+        task.state.write().unwrap().task_handle = Some(handle);
+        tokio::time::timeout(Duration::from_secs(1), entered_rx)
+            .await
+            .expect("test task handle should start")
+            .expect("test task handle should report start");
+        drop_rx
+    }
+
+    #[tokio::test]
+    async fn test_notify_flow_shutdown_sends_signal() {
+        let (tx, rx) = oneshot::channel();
+
+        assert!(notify_flow_shutdown(42, Some(tx), "test"));
+
+        rx.await.expect("replaced flow should receive shutdown");
+    }
+
+    #[test]
+    fn test_notify_flow_shutdown_accepts_missing_sender() {
+        assert!(!notify_flow_shutdown(42, None, "test"));
+    }
+
+    #[tokio::test]
+    async fn test_abort_flow_task_aborts_handle() {
+        let (task, _shutdown_tx) = new_test_task(42).await;
+        let drop_rx = install_abort_observed_handle(&task).await;
+
+        assert!(abort_flow_task(42, Some(task), "test"));
+
+        tokio::time::timeout(Duration::from_secs(1), drop_rx)
+            .await
+            .expect("aborted task should be dropped")
+            .expect("drop notifier should fire");
+    }
+
+    #[tokio::test]
+    async fn test_remove_flow_inner_aborts_registered_task() {
+        let engine = new_test_engine().await;
+        let (task, shutdown_tx) = new_test_task(42).await;
+        let drop_rx = install_abort_observed_handle(&task).await;
+
+        engine.runtime.write().await.insert(42, task, shutdown_tx);
+
+        engine.remove_flow_inner(42).await.unwrap();
+
+        tokio::time::timeout(Duration::from_secs(1), drop_rx)
+            .await
+            .expect("removed task should be dropped")
+            .expect("drop notifier should fire");
+        assert!(!engine.flow_exist_inner(42).await);
+        assert!(!engine.runtime.read().await.shutdown_txs.contains_key(&42));
+    }
+
+    #[tokio::test]
+    async fn test_or_replace_flow_runtime_replaces_old_handles_and_keeps_new_task() {
+        let engine = new_test_engine().await;
+        let (old_task, old_shutdown_tx) = new_test_task(42).await;
+        let old_task_identity = old_task.clone();
+        let old_drop_rx = install_abort_observed_handle(&old_task).await;
+        let (new_task, new_shutdown_tx) = new_test_task(42).await;
+        let new_task_identity = new_task.clone();
+
+        engine
+            .runtime
+            .write()
+            .await
+            .insert(42, old_task, old_shutdown_tx);
+        let (replaced_old_task, replaced_old_shutdown_tx) =
+            engine
+                .runtime
+                .write()
+                .await
+                .insert(42, new_task, new_shutdown_tx);
+
+        let replaced_old_task = replaced_old_task.expect("old task should be returned");
+        assert!(Arc::ptr_eq(
+            &replaced_old_task.state,
+            &old_task_identity.state
+        ));
+        assert!(notify_flow_shutdown(
+            42,
+            replaced_old_shutdown_tx,
+            "replaced"
+        ));
+        old_task_identity
+            .state
+            .write()
+            .unwrap()
+            .shutdown_rx
+            .try_recv()
+            .expect("old shutdown receiver should receive signal");
+        assert!(abort_flow_task(42, Some(replaced_old_task), "replaced"));
+
+        tokio::time::timeout(Duration::from_secs(1), old_drop_rx)
+            .await
+            .expect("replaced task should be dropped")
+            .expect("drop notifier should fire");
+
+        let runtime = engine.runtime.read().await;
+        assert_eq!(1, runtime.tasks.len());
+        assert_eq!(1, runtime.shutdown_txs.len());
+        let registered_task = runtime.tasks.get(&42).expect("new task should remain");
+        assert!(Arc::ptr_eq(
+            &registered_task.state,
+            &new_task_identity.state
+        ));
+        assert!(runtime.shutdown_txs.contains_key(&42));
+        assert!(matches!(
+            new_task_identity
+                .state
+                .write()
+                .unwrap()
+                .shutdown_rx
+                .try_recv(),
+            Err(oneshot::error::TryRecvError::Empty)
+        ));
+    }
+
+    #[tokio::test]
+    async fn test_rollback_flow_runtime_if_current_removes_matching_task_only() {
+        let engine = new_test_engine().await;
+        let (old_task, _old_shutdown_tx) = new_test_task(42).await;
+        let (current_task, current_shutdown_tx) = new_test_task(42).await;
+        let current_task_identity = current_task.clone();
+
+        engine
+            .runtime
+            .write()
+            .await
+            .insert(42, current_task, current_shutdown_tx);
+
+        engine.rollback_flow_runtime_if_current(42, &old_task).await;
+
+        let registered_task = engine.runtime.read().await.tasks.get(&42).cloned().unwrap();
+        assert!(Arc::ptr_eq(
+            &registered_task.state,
+            &current_task_identity.state
+        ));
+        assert!(engine.runtime.read().await.shutdown_txs.contains_key(&42));
+
+        engine
+            .rollback_flow_runtime_if_current(42, &current_task_identity)
+            .await;
+        assert!(!engine.flow_exist_inner(42).await);
+        assert!(!engine.runtime.read().await.shutdown_txs.contains_key(&42));
+    }
+}
diff --git a/src/flow/src/batching_mode/frontend_client.rs b/src/flow/src/batching_mode/frontend_client.rs
index 7382f214e5..c6194d96b3 100644
--- a/src/flow/src/batching_mode/frontend_client.rs
+++ b/src/flow/src/batching_mode/frontend_client.rs
@@ -20,15 +20,17 @@ use std::sync::{Arc, Mutex, Weak};
 use api::v1::greptime_request::Request;
 use api::v1::query_request::Query;
 use api::v1::{CreateTableExpr, QueryRequest};
-use client::{Client, Database};
+use client::{Client, Database, OutputWithMetrics};
 use common_error::ext::BoxedError;
 use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_client_tls_config};
 use common_meta::peer::{Peer, PeerDiscovery};
-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
 use common_telemetry::warn;
 use meta_client::client::MetaClient;
 use query::datafusion::QUERY_PARALLELISM_HINT;
-use query::options::QueryOptions;
+use query::metrics::terminal_recordbatch_metrics_from_plan;
+use query::options::{FlowQueryExtensions, QueryOptions};
 use rand::rng;
 use rand::seq::SliceRandom;
 use servers::query_handler::grpc::GrpcQueryHandler;
@@ -196,9 +198,6 @@ impl DatabaseWithPeer {
 }
 
 impl FrontendClient {
-    // TODO: support more fine-grained load balancing strategies for frontend
-    // selection, such as AZ (availability zone) awareness, to prefer frontends
-    // in the same zone as the flownode and reduce cross-AZ latency.
     /// scan for available frontend from metadata
     pub(crate) async fn scan_for_frontend(&self) -> Result<Vec<Peer>, Error> {
         let Self::Distributed { meta_client, .. } = self else {
@@ -341,6 +340,83 @@ impl FrontendClient {
         }
     }
 
+    /// Sends `request` to a frontend and returns its output with terminal metrics,
+    /// recording the chosen target (remote peer or standalone) in `peer_desc`.
+    pub(crate) async fn query_with_terminal_metrics(
+        &self,
+        catalog: &str,
+        schema: &str,
+        request: QueryRequest,
+        extensions: &[(&str, &str)],
+        peer_desc: &mut Option<PeerDesc>,
+    ) -> Result<OutputWithMetrics, Error> {
+        let flow_extensions = build_flow_extensions(extensions)?;
+        match self {
+            FrontendClient::Distributed {
+                query, batch_opts, ..
+            } => {
+                let parallelism = query.parallelism.to_string();
+                let hints = vec![
+                    (QUERY_PARALLELISM_HINT, parallelism.as_str()),
+                    (READ_PREFERENCE_HINT, batch_opts.read_preference.as_ref()),
+                ];
+                let db = self.get_random_active_frontend(catalog, schema).await?;
+                *peer_desc = Some(PeerDesc::Dist {
+                    peer: db.peer.clone(),
+                });
+                db.database
+                    .query_with_terminal_metrics_and_flow_extensions(request, &hints, extensions)
+                    .await
+                    .map_err(BoxedError::new)
+                    .context(ExternalSnafu)
+            }
+            FrontendClient::Standalone {
+                database_client,
+                query,
+            } => {
+                *peer_desc = Some(PeerDesc::Standalone);
+                let mut ctx_extensions = HashMap::from([(
+                    QUERY_PARALLELISM_HINT.to_string(),
+                    query.parallelism.to_string(),
+                )]);
+                for (key, value) in extensions {
+                    ctx_extensions.insert((*key).to_string(), (*value).to_string());
+                }
+                let ctx = QueryContextBuilder::default()
+                    .current_catalog(catalog.to_string())
+                    .current_schema(schema.to_string())
+                    .extensions(ctx_extensions)
+                    .build();
+                let ctx = Arc::new(ctx);
+                let handler = database_client
+                    .handler
+                    .lock()
+                    .map_err(|e| {
+                        UnexpectedSnafu {
+                            reason: format!("Failed to lock database client: {e}"),
+                        }
+                        .build()
+                    })?
+                    .as_ref()
+                    .context(UnexpectedSnafu {
+                        reason: "Standalone's frontend instance is not set",
+                    })?
+                    .upgrade()
+                    .context(UnexpectedSnafu {
+                        reason: "Failed to upgrade database client",
+                    })?;
+                handler
+                    .do_query(Request::Query(request), ctx.clone())
+                    .await
+                    .map_err(BoxedError::new)
+                    .context(ExternalSnafu)
+                    .map(|output| {
+                        wrap_standalone_output_with_terminal_metrics(output, &flow_extensions, &ctx)
+                    })
+            }
+        }
+    }
+
     /// Handle a request to frontend
     pub(crate) async fn handle(
         &self,
@@ -426,22 +502,83 @@ impl FrontendClient {
     }
 }
 
+/// Parses the flow-specific query extensions out of raw key/value pairs,
+/// falling back to the default value when parsing yields none.
+fn build_flow_extensions(extensions: &[(&str, &str)]) -> Result<FlowQueryExtensions, Error> {
+    let pairs = extensions
+        .iter()
+        .map(|(key, value)| ((*key).to_string(), (*value).to_string()));
+    let parsed = FlowQueryExtensions::parse_flow_extensions(&HashMap::from_iter(pairs))
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)?;
+    Ok(parsed.unwrap_or_default())
+}
+
+/// Wraps `output`; non-stream outputs of watermark-collecting queries get metrics attached.
+fn wrap_standalone_output_with_terminal_metrics(
+    output: Output,
+    flow_extensions: &FlowQueryExtensions,
+    query_ctx: &QueryContextRef,
+) -> OutputWithMetrics {
+    let wants_watermark = flow_extensions.should_collect_region_watermark();
+    let terminal_metrics = match &output.data {
+        OutputData::Stream(_) => None,
+        _ if !wants_watermark => None,
+        _ => output
+            .meta
+            .plan
+            .clone()
+            .and_then(terminal_recordbatch_metrics_from_plan)
+            .or_else(|| terminal_recordbatch_metrics_from_snapshots(query_ctx)),
+    };
+    let result = OutputWithMetrics::from_output(output);
+    if let Some(metrics) = terminal_metrics {
+        result.metrics.update(Some(metrics));
+    }
+    result
+}
+
+/// Builds metrics holding one watermark entry per snapshot recorded on
+/// `query_ctx`, sorted by region id; `None` when no snapshot is recorded.
+fn terminal_recordbatch_metrics_from_snapshots(
+    query_ctx: &QueryContextRef,
+) -> Option<RecordBatchMetrics> {
+    let mut region_watermarks = Vec::new();
+    for (region_id, watermark) in query_ctx.snapshots() {
+        region_watermarks.push(RegionWatermarkEntry {
+            region_id,
+            watermark: Some(watermark),
+        });
+    }
+    if region_watermarks.is_empty() {
+        return None;
+    }
+    region_watermarks.sort_by_key(|entry| entry.region_id);
+    Some(RecordBatchMetrics {
+        region_watermarks,
+        ..Default::default()
+    })
+}
+
 /// Describe a peer of frontend
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) enum PeerDesc {
+    /// The query failed before a frontend peer was selected.
+    #[default]
+    Unknown,
     /// Distributed mode's frontend peer address
     Dist {
         /// frontend peer address
         peer: Peer,
     },
     /// Standalone mode
-    #[default]
     Standalone,
 }
 
 impl std::fmt::Display for PeerDesc {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
+            PeerDesc::Unknown => write!(f, "unknown"),
             PeerDesc::Dist { peer } => write!(f, "{}", peer.addr),
             PeerDesc::Standalone => write!(f, "standalone"),
         }
@@ -450,9 +587,17 @@ impl std::fmt::Display for PeerDesc {
 
 #[cfg(test)]
 mod tests {
+    use std::pin::Pin;
+    use std::task::{Context, Poll};
     use std::time::Duration;
 
-    use common_query::Output;
+    use common_query::{Output, OutputData};
+    use common_recordbatch::adapter::RecordBatchMetrics;
+    use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream};
+    use datatypes::prelude::{ConcreteDataType, VectorRef};
+    use datatypes::schema::{ColumnSchema, Schema};
+    use datatypes::vectors::Int32Vector;
+    use futures::StreamExt;
     use tokio::time::timeout;
 
     use super::*;
@@ -460,6 +605,58 @@ mod tests {
     #[derive(Debug)]
     struct NoopHandler;
 
+    struct MockMetricsStream {
+        schema: datatypes::schema::SchemaRef,
+        batch: Option<RecordBatch>,
+        metrics: RecordBatchMetrics,
+        terminal_metrics_only: bool,
+    }
+
+    impl futures::Stream for MockMetricsStream {
+        type Item = common_recordbatch::error::Result<RecordBatch>;
+
+        fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+            Poll::Ready(self.batch.take().map(Ok))
+        }
+
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            (
+                usize::from(self.batch.is_some()),
+                Some(usize::from(self.batch.is_some())),
+            )
+        }
+    }
+
+    impl RecordBatchStream for MockMetricsStream {
+        fn name(&self) -> &str {
+            "MockMetricsStream"
+        }
+
+        fn schema(&self) -> datatypes::schema::SchemaRef {
+            self.schema.clone()
+        }
+
+        fn output_ordering(&self) -> Option<&[OrderOption]> {
+            None
+        }
+
+        fn metrics(&self) -> Option<RecordBatchMetrics> {
+            if self.terminal_metrics_only && self.batch.is_some() {
+                return None;
+            }
+            Some(self.metrics.clone())
+        }
+    }
+
+    #[derive(Debug)]
+    struct MetricsHandler;
+
+    #[derive(Debug)]
+    struct ExtensionAwareHandler;
+
+    #[derive(Debug)]
+    struct SnapshotBindingHandler;
+
     #[async_trait::async_trait]
     impl GrpcQueryHandlerWithBoxedError for NoopHandler {
         async fn do_query(
@@ -471,6 +668,63 @@ mod tests {
         }
     }
 
+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for MetricsHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            _ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
+                "v",
+                ConcreteDataType::int32_datatype(),
+                false,
+            )]));
+            let batch = RecordBatch::new(
+                schema.clone(),
+                vec![Arc::new(Int32Vector::from_slice([1, 2])) as VectorRef],
+            )
+            .unwrap();
+            Ok(Output::new_with_stream(Box::pin(MockMetricsStream {
+                schema,
+                batch: Some(batch),
+                metrics: RecordBatchMetrics {
+                    region_watermarks: vec![common_recordbatch::adapter::RegionWatermarkEntry {
+                        region_id: 42,
+                        watermark: Some(99),
+                    }],
+                    ..Default::default()
+                },
+                terminal_metrics_only: true,
+            })))
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for ExtensionAwareHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            assert_eq!(ctx.extension("flow.return_region_seq"), Some("true"));
+            Ok(Output::new_with_affected_rows(1))
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for SnapshotBindingHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            assert_eq!(ctx.extension("flow.return_region_seq"), Some("true"));
+            ctx.set_snapshot(42, 99);
+            Ok(Output::new_with_affected_rows(1))
+        }
+    }
+
     #[tokio::test]
     async fn wait_initialized() {
         let (client, handler_mut) =
@@ -516,4 +770,117 @@ mod tests {
                 .is_ok()
         );
     }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_tracks_watermark_in_standalone_mode() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(MetricsHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let result = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("select 1".to_string())),
+                },
+                &[],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap();
+        assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+        let terminal_metrics = result.metrics.clone();
+        assert!(!result.metrics.is_ready());
+        assert!(terminal_metrics.get().is_none());
+
+        let OutputData::Stream(mut stream) = result.output.data else {
+            panic!("expected stream output");
+        };
+        while stream.next().await.is_some() {}
+
+        assert!(terminal_metrics.is_ready());
+        assert_eq!(
+            terminal_metrics.region_watermark_map(),
+            Some(HashMap::from([(42_u64, 99_u64)]))
+        );
+    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_forwards_flow_extensions_in_standalone_mode() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(ExtensionAwareHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let result = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("insert into t select 1".to_string())),
+                },
+                &[("flow.return_region_seq", "true")],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap();
+        assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+        assert!(result.metrics.is_ready());
+        assert!(result.region_watermark_map().is_none());
+    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_uses_standalone_snapshot_bounds() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(SnapshotBindingHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let result = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("insert into t select * from src".to_string())),
+                },
+                &[("flow.return_region_seq", "true")],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap();
+        assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+        assert!(result.metrics.is_ready());
+        assert_eq!(
+            result.region_watermark_map(),
+            Some(HashMap::from([(42, 99)]))
+        );
+    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_rejects_invalid_flow_extensions() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(NoopHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let err = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("select 1".to_string())),
+                },
+                &[("flow.return_region_seq", "not-a-bool")],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap_err();
+
+        assert!(format!("{err:?}").contains("Invalid value for flow.return_region_seq"));
+    }
 }
diff --git a/src/flow/src/batching_mode/incremental_filter.rs b/src/flow/src/batching_mode/incremental_filter.rs
new file mode 100644
index 0000000000..ddc58d0378
--- /dev/null
+++ b/src/flow/src/batching_mode/incremental_filter.rs
@@ -0,0 +1,222 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use common_telemetry::tracing::debug;
+use datafusion_expr::Expr;
+use datatypes::schema::Schema;
+
+use crate::batching_mode::state::FilterExprInfo;
+use crate::batching_mode::utils::IncrementalAggregateAnalysis;
+use crate::{Error, FlowId};
+
+/// Builds the dirty-time-window filter expression for a column of the sink schema.
+///
+/// Yields `Ok(None)` when no dirty filter is given or no sink column can be inferred.
+pub(super) fn build_sink_dirty_time_window_filter_expr(
+    flow_id: FlowId,
+    analysis: &IncrementalAggregateAnalysis,
+    sink_schema: &Schema,
+    dirty_filter: Option<&FilterExprInfo>,
+) -> Result<Option<Expr>, Error> {
+    let Some(filter) = dirty_filter else {
+        return Ok(None);
+    };
+
+    match infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, filter) {
+        Some(sink_col) => filter.predicate_for_col(&sink_col),
+        None => Ok(None),
+    }
+}
+
+/// Picks the column the dirty-window filter should use: the dirty filter's own column when it
+/// is a timestamp group key in the sink schema, else the single timestamp group key, if unique.
+fn infer_sink_time_window_filter_col(
+    flow_id: FlowId,
+    analysis: &IncrementalAggregateAnalysis,
+    sink_schema: &Schema,
+    dirty_filter: &FilterExprInfo,
+) -> Option<String> {
+    let group_keys = &analysis.group_key_names;
+    if group_keys.is_empty() {
+        return None;
+    }
+
+    let is_timestamp_group_key = |name: &str| {
+        group_keys.iter().any(|key| key == name)
+            && sink_schema
+                .column_schema_by_name(name)
+                .is_some_and(|col| col.data_type.is_timestamp())
+    };
+
+    if is_timestamp_group_key(&dirty_filter.col_name) {
+        return Some(dirty_filter.col_name.clone());
+    }
+
+    let candidates: Vec<&String> = group_keys
+        .iter()
+        .filter(|name| is_timestamp_group_key(name))
+        .collect();
+    match candidates.as_slice() {
+        [only] => Some((*only).clone()),
+        [] => {
+            debug!(
+                "Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}",
+                flow_id, group_keys
+            );
+            None
+        }
+        _ => {
+            debug!(
+                "Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}",
+                flow_id, candidates
+            );
+            None
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use datatypes::prelude::ConcreteDataType;
+    use datatypes::schema::ColumnSchema;
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+    use crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL;
+    use crate::batching_mode::state::FilterExprInfo;
+    use crate::batching_mode::utils::IncrementalAggregateAnalysis;
+
+    /// Builds an analysis that carries only the given group key names.
+    ///
+    /// Every other field is left as an empty vector.
+    fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis {
+        IncrementalAggregateAnalysis {
+            group_key_names: group_key_names.into_iter().map(str::to_string).collect(),
+            merge_columns: vec![],
+            literal_columns: vec![],
+            output_field_names: vec![],
+            unsupported_exprs: vec![],
+        }
+    }
+
+    fn test_dirty_filter(col_name: &str) -> FilterExprInfo {
+        FilterExprInfo {
+            expr: datafusion_expr::col(col_name),
+            col_name: col_name.to_string(),
+            time_ranges: vec![],
+            window_size: chrono::Duration::seconds(1),
+        }
+    }
+
+    fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema {
+        Schema::new(
+            columns
+                .into_iter()
+                .map(|(name, data_type)| ColumnSchema::new(name, data_type, true))
+                .collect(),
+        )
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() {
+        let analysis = test_analysis_with_group_keys(vec!["ts", "host"]);
+        let sink_schema = test_sink_schema(vec![
+            ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
+            ("host", ConcreteDataType::string_datatype()),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            Some("ts".to_string()),
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() {
+        let analysis = test_analysis_with_group_keys(vec!["host", "time_window"]);
+        let sink_schema = test_sink_schema(vec![
+            ("host", ConcreteDataType::string_datatype()),
+            (
+                "time_window",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+            (
+                AUTO_CREATED_UPDATE_AT_TS_COL,
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            Some("time_window".to_string()),
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_skips_global_aggregate() {
+        let analysis = test_analysis_with_group_keys(vec![]);
+        let sink_schema = test_sink_schema(vec![
+            ("number", ConcreteDataType::uint32_datatype()),
+            (
+                "time_window",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            None,
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() {
+        let analysis = test_analysis_with_group_keys(vec!["host", "device"]);
+        let sink_schema = test_sink_schema(vec![
+            ("host", ConcreteDataType::string_datatype()),
+            ("device", ConcreteDataType::string_datatype()),
+            (
+                AUTO_CREATED_UPDATE_AT_TS_COL,
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            None,
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() {
+        let analysis = test_analysis_with_group_keys(vec!["ts", "time_window"]);
+        let sink_schema = test_sink_schema(vec![
+            ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
+            (
+                "time_window",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("source_ts");
+
+        assert_eq!(
+            None,
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+}
diff --git a/src/flow/src/batching_mode/state.rs b/src/flow/src/batching_mode/state.rs
index d90023ae46..42b71a4ec7 100644
--- a/src/flow/src/batching_mode/state.rs
+++ b/src/flow/src/batching_mode/state.rs
@@ -13,8 +13,9 @@
 // limitations under the License.
 
 //! Batching mode task state, which changes frequently
+//! (including the per-region checkpoints used by incremental scans).
 
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::time::Duration;
 
 use common_telemetry::debug;
@@ -49,6 +50,14 @@ pub struct TaskState {
     /// Dirty Time windows need to be updated
     /// mapping of `start -> end` and non-overlapping
     pub(crate) dirty_time_windows: DirtyTimeWindows,
+    checkpoint_mode: CheckpointMode,
+    /// Region id -> last consumed watermark sequence. Incremental scans use
+    /// this as the next lower sequence bound for each source region.
+    checkpoints: BTreeMap<u64, u64>,
+    /// Once set, the task will never attempt incremental mode again.
+    /// Set when the flow's query shape is deterministically incompatible
+    /// with incremental execution (e.g. unsupported aggregate expressions).
+    incremental_disabled: bool,
     exec_state: ExecState,
     /// Shutdown receiver
     pub(crate) shutdown_rx: oneshot::Receiver<()>,
@@ -63,6 +72,9 @@ impl TaskState {
             last_query_duration: Duration::from_secs(0),
             last_exec_time_millis: None,
             dirty_time_windows: Default::default(),
+            checkpoint_mode: CheckpointMode::FullSnapshot,
+            checkpoints: Default::default(),
+            incremental_disabled: false,
             exec_state: ExecState::Idle,
             shutdown_rx,
             task_handle: None,
@@ -84,6 +96,84 @@ impl TaskState {
         self.last_exec_time_millis
     }
 
+    pub fn checkpoint_mode(&self) -> CheckpointMode {
+        self.checkpoint_mode
+    }
+
+    pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
+        &self.checkpoints
+    }
+
+    pub fn is_incremental_disabled(&self) -> bool {
+        self.incremental_disabled
+    }
+
+    /// Permanently disable incremental mode for this task and
+    /// immediately fall back to full snapshot for the current cycle.
+    pub fn disable_incremental(&mut self) {
+        self.incremental_disabled = true;
+        self.mark_full_snapshot();
+    }
+
+    pub fn mark_full_snapshot(&mut self) {
+        self.checkpoint_mode = CheckpointMode::FullSnapshot;
+    }
+
+    pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
+        self.checkpoints = watermark_map.into_iter().collect();
+        if !self.incremental_disabled {
+            self.checkpoint_mode = CheckpointMode::Incremental;
+        }
+    }
+
+    /// Merges the watermarks reported for `participating_regions` into the checkpoints.
+    pub fn advance_incremental_checkpoints_with_participation(
+        &mut self,
+        participating_regions: &BTreeSet<u64>,
+        watermark_map: HashMap<u64, u64>,
+    ) {
+        let advanced = participating_regions
+            .iter()
+            .filter_map(|region_id| Some((*region_id, *watermark_map.get(region_id)?)));
+        self.checkpoints.extend(advanced);
+        if !self.incremental_disabled {
+            self.checkpoint_mode = CheckpointMode::Incremental;
+        }
+    }
+
+    /// Returns `true` iff the region set is non-empty and equals the watermark key set.
+    pub fn can_advance_full_snapshot_checkpoints(
+        &self,
+        participating_regions: &BTreeSet<u64>,
+        watermark_map: &HashMap<u64, u64>,
+    ) -> bool {
+        let has_watermark = |region_id: &u64| watermark_map.contains_key(region_id);
+        !participating_regions.is_empty()
+            && participating_regions.len() == watermark_map.len()
+            && participating_regions.iter().all(has_watermark)
+    }
+
+    /// Returns `true` iff incremental mode is not disabled, the participating region
+    /// set is non-empty and matches the watermark keys in size, and every
+    /// participating region has a stored checkpoint that its reported watermark
+    /// is greater than or equal to.
+    pub fn can_advance_incremental_checkpoints_with_participation(
+        &self,
+        participating_regions: &BTreeSet<u64>,
+        watermark_map: &HashMap<u64, u64>,
+    ) -> bool {
+        let not_behind = |region_id: &u64| {
+            let reported = watermark_map.get(region_id);
+            let consumed = self.checkpoints.get(region_id);
+            matches!((reported, consumed), (Some(seq), Some(checkpoint)) if seq >= checkpoint)
+        };
+        !self.incremental_disabled
+            && !self.checkpoints.is_empty()
+            && !participating_regions.is_empty()
+            && participating_regions.len() == watermark_map.len()
+            && participating_regions.iter().all(not_behind)
+    }
+
     /// Compute the next query delay based on the time window size or the last query duration.
     /// Aiming to avoid too frequent queries. But also not too long delay.
     ///
@@ -94,6 +184,10 @@ impl TaskState {
     /// if current the dirty time range is longer than one query can handle,
     /// execute immediately to faster clean up dirty time windows.
     ///
+    /// If `prefer_short_incremental_cadence` is true, run incremental queries
+    /// more often when there is no large dirty backlog. This only reduces the
+    /// chance of hitting a stale cursor after flush; it is not required for
+    /// correctness.
     pub fn get_next_start_query_time(
         &self,
         flow_id: FlowId,
@@ -101,6 +195,7 @@ impl TaskState {
         min_refresh_duration: Duration,
         max_timeout: Option<Duration>,
         max_filter_num_per_query: usize,
+        prefer_short_incremental_cadence: bool,
     ) -> Instant {
         // = last query duration, capped by [max(min_run_interval, time_window_size), max_timeout], note at most `max_timeout`
         let lower = time_window_size.unwrap_or(min_refresh_duration);
@@ -119,7 +214,20 @@ impl TaskState {
         // if dirty time range is more than one query can handle, execute immediately
         // to faster clean up dirty time windows
         if cur_dirty_window_size < max_query_update_range {
-            self.last_update_time + next_duration
+            if prefer_short_incremental_cadence {
+                // Run incremental queries sooner than the normal time-window
+                // cadence, while still backing off by at least the previous
+                // query duration and respecting the max-timeout cap.
+                let next_duration = self.last_query_duration.max(min_refresh_duration);
+                let next_duration = if let Some(max_timeout) = max_timeout {
+                    next_duration.min(max_timeout)
+                } else {
+                    next_duration
+                };
+                self.last_update_time + next_duration
+            } else {
+                self.last_update_time + next_duration
+            }
         } else {
             // if dirty time windows can't be clean up in one query, execute immediately to faster
             // clean up dirty time windows
@@ -199,12 +307,42 @@ impl DirtyTimeWindows {
     }
 
     pub fn add_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
-        self.windows.insert(start, end);
+        self.add_or_merge_window(start, end);
     }
 
     pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
         for (start, end) in time_ranges {
-            self.windows.insert(start, Some(end));
+            self.add_or_merge_window(start, Some(end));
+        }
+    }
+
+    /// Add all dirty markers from another dirty-window set.
+    pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
+        for (start, end) in &dirty_windows.windows {
+            self.add_or_merge_window(*start, *end);
+        }
+    }
+
+    /// Inserts the window, merging its end into an existing window with the same start.
+    fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
+        let merged = match self.windows.get(&start) {
+            Some(current_end) => Self::union_window_end(*current_end, end),
+            None => end,
+        };
+        self.windows.insert(start, merged);
+    }
+
+    /// Unions the ends of two dirty windows that share a start; a known end wins over `None`.
+    fn union_window_end(
+        current_end: Option<Timestamp>,
+        incoming_end: Option<Timestamp>,
+    ) -> Option<Timestamp> {
+        if let (Some(current), Some(incoming)) = (current_end, incoming_end) {
+            Some(current.max(incoming))
+        } else {
+            // `None` marks a dirty window without a known upper bound, so whichever side has
+            // a concrete end is kept and an already-known range never shrinks.
+            current_end.or(incoming_end)
+        }
     }
 
@@ -216,7 +354,7 @@ impl DirtyTimeWindows {
     /// Set windows to be dirty, only useful for full aggr without time window
     /// to mark some new data is inserted
     pub fn set_dirty(&mut self) {
-        self.windows.insert(Timestamp::new_second(0), None);
+        self.add_or_merge_window(Timestamp::new_second(0), None);
     }
 
     /// Number of dirty windows.
@@ -283,7 +421,7 @@ impl DirtyTimeWindows {
         );
         self.merge_dirty_time_windows(window_size, expire_lower_bound)?;
 
-        if self.windows.len() > self.max_filter_num_per_query {
+        if self.windows.len() > window_cnt {
             let first_time_window = self.windows.first_key_value();
             let last_time_window = self.windows.last_key_value();
 
@@ -292,7 +430,7 @@ impl DirtyTimeWindows {
                     "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
                     task_ctx.config.flow_id,
                     self.windows.len(),
-                    self.max_filter_num_per_query,
+                    window_cnt,
                     task_ctx.config.time_window_expr,
                     task_ctx.config.expire_after,
                     first_time_window,
@@ -304,7 +442,7 @@ impl DirtyTimeWindows {
                     "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
                     flow_id,
                     self.windows.len(),
-                    self.max_filter_num_per_query,
+                    window_cnt,
                     first_time_window,
                     last_time_window
                 )
@@ -559,6 +697,12 @@ enum ExecState {
     Executing,
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CheckpointMode {
+    FullSnapshot,
+    Incremental,
+}
+
 /// Filter Expression's information
 #[derive(Debug, Clone)]
 pub struct FilterExprInfo {
@@ -576,6 +720,28 @@ impl FilterExprInfo {
                 acc + end.sub(start).unwrap_or(chrono::Duration::zero())
             })
     }
+
+    /// Builds `(col >= start AND col < end)` for each time range, OR-ed together, against
+    /// `col_name`; returns `Ok(None)` when there are no time ranges.
+    pub fn predicate_for_col(
+        &self,
+        col_name: &str,
+    ) -> Result<Option<datafusion_expr::Expr>, Error> {
+        use datafusion_common::Column;
+        use datafusion_expr::{Expr, lit};
+
+        let target = || Expr::Column(Column::new_unqualified(col_name));
+        let mut predicate: Option<Expr> = None;
+        for (start, end) in &self.time_ranges {
+            let at_or_after = target().gt_eq(lit(to_df_literal(*start)?));
+            let range = at_or_after.and(target().lt(lit(to_df_literal(*end)?)));
+            predicate = Some(match predicate {
+                Some(acc) => acc.or(range),
+                None => range,
+            });
+        }
+        Ok(predicate)
+    }
 }
 
 #[cfg(test)]
@@ -820,4 +986,370 @@ mod test {
             }
         }
     }
+
+    #[test]
+    fn test_task_state_checkpoint_mode_and_advancement() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert!(state.checkpoints().is_empty());
+
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+        );
+
+        state.mark_full_snapshot();
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+        );
+    }
+
+    #[test]
+    fn test_disable_incremental_persists_full_snapshot_mode() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+
+        assert!(!state.is_incremental_disabled());
+
+        // After disable, mode becomes FullSnapshot and flag is set.
+        state.disable_incremental();
+        assert!(state.is_incremental_disabled());
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+        // `advance_checkpoints` will NOT transition to Incremental when disabled.
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+        );
+
+        // `mark_full_snapshot` does not re-enable incremental.
+        state.mark_full_snapshot();
+        assert!(state.is_incremental_disabled());
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    }
+
+    #[test]
+    fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let state = TaskState::new(query_ctx, rx);
+
+        assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
+        assert!(!state.can_advance_full_snapshot_checkpoints(
+            &BTreeSet::from([1_u64, 2_u64]),
+            &HashMap::from([(1_u64, 10_u64)]),
+        ));
+        assert!(state.can_advance_full_snapshot_checkpoints(
+            &BTreeSet::from([1_u64, 2_u64]),
+            &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
+        ));
+    }
+
+    #[test]
+    fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+
+        assert!(
+            state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64]),
+                &HashMap::from([(1_u64, 11_u64)]),
+            )
+        );
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64, 2_u64]),
+                &HashMap::from([(1_u64, 11_u64)]),
+            )
+        );
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([3_u64]),
+                &HashMap::from([(3_u64, 11_u64)]),
+            )
+        );
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64]),
+                &HashMap::from([(1_u64, 9_u64)]),
+            )
+        );
+        assert!(
+            state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64, 2_u64]),
+                &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
+            )
+        );
+
+        state.disable_incremental();
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64, 2_u64]),
+                &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
+            )
+        );
+    }
+
+    #[test]
+    fn test_incremental_checkpoint_advancement_merges_participating_subset() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+        state.advance_checkpoints(HashMap::from([
+            (1_u64, 10_u64),
+            (2_u64, 20_u64),
+            (3_u64, 30_u64),
+        ]));
+
+        state.advance_incremental_checkpoints_with_participation(
+            &BTreeSet::from([1_u64, 3_u64]),
+            HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
+        );
+
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
+        );
+    }
+
+    #[test]
+    fn test_filter_expr_info_predicate_for_col_empty_ranges() {
+        let filter = FilterExprInfo {
+            expr: datafusion_expr::col("ts"),
+            col_name: "ts".to_string(),
+            time_ranges: vec![],
+            window_size: chrono::Duration::seconds(1),
+        };
+
+        assert!(filter.predicate_for_col("time_window").unwrap().is_none());
+    }
+
+    #[test]
+    fn test_filter_expr_info_predicate_for_col_single_range() {
+        let filter = FilterExprInfo {
+            expr: datafusion_expr::col("ts"),
+            col_name: "ts".to_string(),
+            time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
+            window_size: chrono::Duration::seconds(1),
+        };
+
+        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
+        let unparser = datafusion::sql::unparser::Unparser::default();
+        assert_eq!(
+            "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
+            unparser.expr_to_sql(&predicate).unwrap().to_string()
+        );
+    }
+
+    #[test]
+    fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
+        let filter = FilterExprInfo {
+            expr: datafusion_expr::col("ts"),
+            col_name: "ts".to_string(),
+            time_ranges: vec![
+                (Timestamp::new_second(0), Timestamp::new_second(1)),
+                (Timestamp::new_second(10), Timestamp::new_second(11)),
+            ],
+            window_size: chrono::Duration::seconds(1),
+        };
+
+        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
+        let unparser = datafusion::sql::unparser::Unparser::default();
+        assert_eq!(
+            "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
+            unparser.expr_to_sql(&predicate).unwrap().to_string()
+        );
+    }
+
+    /// Helper: create a `TaskState` whose `last_update_time` is a known duration in the past.
+    fn state_with_past_update(age: Duration) -> TaskState {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+        state.last_update_time = Instant::now() - age;
+        state
+    }
+
+    /// With `prefer_short_incremental_cadence` set and a manageable dirty backlog,
+    /// the next start time is expected to be `last_update_time + min_refresh`,
+    /// ignoring the much longer time window size.
+    #[test]
+    fn test_short_incremental_cadence_uses_min_refresh() {
+        let min_refresh = Duration::from_secs(5);
+        // Deliberately much larger than `min_refresh`.
+        let window = Some(Duration::from_secs(60));
+        let task_state = state_with_past_update(Duration::from_secs(10));
+
+        let next_start = task_state.get_next_start_query_time(
+            1, // flow_id
+            &window,
+            min_refresh,
+            None,
+            20,
+            true, // prefer_short_incremental_cadence
+        );
+
+        // Short cadence: schedule relative to the last update using `min_refresh`.
+        assert_eq!(next_start, task_state.last_update_time + min_refresh);
+    }
+
+    /// When the previous query took longer than `min_refresh`, the short cadence
+    /// is expected to wait `last_query_duration` after the last update instead.
+    #[test]
+    fn test_short_incremental_cadence_respects_last_query_duration() {
+        let min_refresh = Duration::from_secs(5);
+        let window = Some(Duration::from_secs(60));
+        let mut task_state = state_with_past_update(Duration::from_secs(10));
+        // Longer than `min_refresh`, shorter than the time window.
+        task_state.last_query_duration = Duration::from_secs(20);
+
+        let next_start = task_state.get_next_start_query_time(
+            1, // flow_id
+            &window,
+            min_refresh,
+            None,
+            20,
+            true, // prefer_short_incremental_cadence
+        );
+
+        let expected = task_state.last_update_time + task_state.last_query_duration;
+        assert_eq!(next_start, expected);
+    }
+
+    /// A `max_timeout` smaller than both `min_refresh` and `last_query_duration`
+    /// is expected to cap the short-cadence delay at `last_update_time + max_timeout`.
+    #[test]
+    fn test_short_incremental_cadence_respects_max_timeout() {
+        let max_timeout = Duration::from_secs(5);
+        let min_refresh = Duration::from_secs(30);
+        let window = Some(Duration::from_secs(60));
+        let mut task_state = state_with_past_update(Duration::from_secs(10));
+        task_state.last_query_duration = Duration::from_secs(20);
+
+        let next_start = task_state.get_next_start_query_time(
+            1, // flow_id
+            &window,
+            min_refresh,
+            Some(max_timeout),
+            20,
+            true, // prefer_short_incremental_cadence
+        );
+
+        assert_eq!(next_start, task_state.last_update_time + max_timeout);
+    }
+
+    /// With `prefer_short_incremental_cadence` unset (full snapshot mode), the
+    /// regular long cadence driven by the time window size is expected.
+    #[test]
+    fn test_full_snapshot_ignores_short_cadence() {
+        // Large window compared to both `min_refresh` and the last query duration.
+        let window_len = Duration::from_secs(60);
+        let window = Some(window_len);
+        let min_refresh = Duration::from_secs(5);
+
+        let mut task_state = state_with_past_update(Duration::from_secs(10));
+        // Keep the last query short so the time window size is the dominating bound.
+        task_state.last_query_duration = Duration::from_secs(1);
+
+        let next_start = task_state.get_next_start_query_time(
+            1, // flow_id
+            &window,
+            min_refresh,
+            None,
+            20,
+            false, // prefer_short_incremental_cadence
+        );
+
+        // Normal cadence: one full time window after the last update
+        // (since last_query_duration < time_window_size).
+        assert_eq!(next_start, task_state.last_update_time + window_len);
+    }
+
+    /// Dirty-window overflow must schedule the next query immediately, whether or
+    /// not the short incremental cadence is requested.
+    #[test]
+    fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
+        let mut task_state = state_with_past_update(Duration::from_secs(10));
+        // One hour of dirty data is a very large backlog for the settings below.
+        task_state
+            .dirty_time_windows
+            .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(3600)));
+
+        let tiny_window = Some(Duration::from_secs(1)); // tiny window => overflow
+        let min_refresh = Duration::from_secs(5);
+
+        // Same expectation with the short-cadence flag on, then off.
+        for prefer_short_incremental_cadence in [true, false] {
+            let next_start = task_state.get_next_start_query_time(
+                1, // flow_id
+                &tiny_window,
+                min_refresh,
+                None,
+                1, // max 1 filter => tiny capacity
+                prefer_short_incremental_cadence,
+            );
+            assert!(
+                next_start <= Instant::now(),
+                "dirty overflow should schedule immediately"
+            );
+        }
+    }
+
+    /// Simulates a caller that computed `prefer_short_incremental_cadence = false`
+    /// (e.g. incremental disabled or FullSnapshot mode): the long cadence based on
+    /// the time window size is expected.
+    ///
+    /// The caller-side guard (checkpoint_mode + !is_incremental_disabled) that
+    /// decides whether short cadence is requested at all is not exercised here;
+    /// only its outcome (the flag being `false`) is fed in.
+    #[test]
+    fn test_incremental_disabled_ignores_short_cadence() {
+        let window_len = Duration::from_secs(60);
+        let window = Some(window_len);
+        let min_refresh = Duration::from_secs(5);
+
+        let mut task_state = state_with_past_update(Duration::from_secs(10));
+        task_state.last_query_duration = Duration::from_secs(1);
+
+        let next_start = task_state.get_next_start_query_time(
+            1, // flow_id
+            &window,
+            min_refresh,
+            None,
+            20,
+            false, // prefer_short_incremental_cadence
+        );
+
+        // Normal cadence: last_update_time + time_window_size.
+        assert_eq!(next_start, task_state.last_update_time + window_len);
+    }
 }
diff --git a/src/flow/src/batching_mode/table_creator.rs b/src/flow/src/batching_mode/table_creator.rs
new file mode 100644
index 0000000000..05da055a40
--- /dev/null
+++ b/src/flow/src/batching_mode/table_creator.rs
@@ -0,0 +1,381 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use api::v1::CreateTableExpr;
+use datafusion_common::tree_node::TreeNode;
+use datafusion_expr::LogicalPlan;
+use datatypes::prelude::ConcreteDataType;
+use datatypes::schema::ColumnSchema;
+use operator::expr_helper::column_schemas_to_defs;
+use snafu::ResultExt;
+
+use crate::Error;
+use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+use crate::batching_mode::utils::FindGroupByFinalName;
+use crate::error::{ConvertColumnSchemaSnafu, DatafusionSnafu};
+
+/// The language a flow's query is written in.
+///
+/// `create_table_with_expr` uses it to pick the rules for deriving the
+/// auto-created sink table's columns.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum QueryType {
+    /// The query is a TQL query.
+    Tql,
+    /// The query is a SQL query.
+    Sql,
+}
+
+/// Builds the `CreateTableExpr` used to auto-create a flow's sink table from the
+/// output schema of `plan`.
+///
+/// The time index and primary keys come from the plan's group-by keys when
+/// `build_pk_from_aggr` finds an aggregation, otherwise from the output schema
+/// (`build_by_sql_schema` or `build_by_tql_schema`, depending on `query_type`).
+///
+/// Auto-added columns:
+/// * SQL queries get a nullable `AUTO_CREATED_UPDATE_AT_TS_COL` column.
+/// * When no time index column could be determined, a non-null
+///   `AUTO_CREATED_PLACEHOLDER_TS_COL` column is appended and used as the
+///   time index placeholder.
+///
+/// `sink_table_name` is `[catalog, schema, table]`.
+///
+/// # Errors
+///
+/// Propagates errors from inspecting the plan and from converting the column
+/// schemas into column definitions.
+///
+/// TODO(discord9): for now no default value is set for auto added column for compatibility reason with streaming mode, but this might change in favor of simpler code?
+pub(super) fn create_table_with_expr(
+    plan: &LogicalPlan,
+    sink_table_name: &[String; 3],
+    query_type: &QueryType,
+) -> Result<CreateTableExpr, Error> {
+    // Group-by keys win for both query types. A query (TQL in particular) might
+    // not have an aggregate node, so fall back to the schema-based rules then.
+    let table_def = match build_pk_from_aggr(plan)? {
+        Some(def) => def,
+        None => match query_type {
+            QueryType::Sql => build_by_sql_schema(plan)?,
+            QueryType::Tql => build_by_tql_schema(plan)?,
+        },
+    };
+    let TableDef {
+        ts_col: first_time_stamp,
+        pks: primary_keys,
+    } = table_def;
+
+    let mut column_schemas = Vec::new();
+    for field in plan.schema().fields() {
+        let name = field.name();
+        let ty = ConcreteDataType::from_arrow_type(field.data_type());
+        // Compare as `&str` so no `String` is allocated per column.
+        let is_time_index = first_time_stamp.as_deref() == Some(name.as_str());
+
+        // Arm order matters: for TQL, the tag check comes before the time index check.
+        let col_schema = match query_type {
+            // TQL tag column: cast type as STRING NULL.
+            QueryType::Tql if primary_keys.contains(name) => {
+                ColumnSchema::new(name, ConcreteDataType::string_datatype(), true)
+            }
+            // TQL value column: stored as DOUBLE NULL.
+            QueryType::Tql if !is_time_index => {
+                ColumnSchema::new(name, ConcreteDataType::float64_datatype(), true)
+            }
+            // Time index column (SQL or TQL): keeps its type and is non-null.
+            _ if is_time_index => ColumnSchema::new(name, ty, false).with_time_index(true),
+            // Any other column: keeps its type and is nullable.
+            _ => ColumnSchema::new(name, ty, true),
+        };
+        column_schemas.push(col_schema);
+    }
+
+    if matches!(query_type, QueryType::Sql) {
+        column_schemas.push(ColumnSchema::new(
+            AUTO_CREATED_UPDATE_AT_TS_COL,
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            true,
+        ));
+    }
+
+    let time_index = match first_time_stamp {
+        Some(ts_col) => ts_col,
+        None => {
+            // No usable timestamp column: append a placeholder time index.
+            column_schemas.push(
+                ColumnSchema::new(
+                    AUTO_CREATED_PLACEHOLDER_TS_COL,
+                    ConcreteDataType::timestamp_millisecond_datatype(),
+                    false,
+                )
+                .with_time_index(true),
+            );
+            AUTO_CREATED_PLACEHOLDER_TS_COL.to_string()
+        }
+    };
+
+    let column_defs =
+        column_schemas_to_defs(column_schemas, &primary_keys).context(ConvertColumnSchemaSnafu)?;
+    let [catalog_name, schema_name, table_name] = sink_table_name;
+    Ok(CreateTableExpr {
+        catalog_name: catalog_name.clone(),
+        schema_name: schema_name.clone(),
+        table_name: table_name.clone(),
+        desc: "Auto created table by flow engine".to_string(),
+        column_defs,
+        time_index,
+        primary_keys,
+        create_if_not_exists: true,
+        table_options: Default::default(),
+        table_id: None,
+        engine: "mito".to_string(),
+    })
+}
+
+/// Schema-only derivation for SQL queries: the first timestamp column of the
+/// plan's output schema becomes the time index, and there are no primary keys.
+fn build_by_sql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
+    let ts_col = plan
+        .schema()
+        .fields()
+        .iter()
+        .find(|field| ConcreteDataType::from_arrow_type(field.data_type()).is_timestamp())
+        .map(|field| field.name().clone());
+
+    Ok(TableDef {
+        ts_col,
+        pks: Vec::new(),
+    })
+}
+
+/// Schema-only derivation for TQL queries: the first timestamp column of the
+/// plan's output schema becomes the time index, and every string column
+/// becomes a primary key (in schema order).
+fn build_by_tql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
+    let schema_fields = plan.schema().fields();
+
+    let ts_col = schema_fields
+        .iter()
+        .find(|field| ConcreteDataType::from_arrow_type(field.data_type()).is_timestamp())
+        .map(|field| field.name().clone());
+
+    let pks: Vec<String> = schema_fields
+        .iter()
+        .filter(|field| ConcreteDataType::from_arrow_type(field.data_type()).is_string())
+        .map(|field| field.name().clone())
+        .collect();
+
+    Ok(TableDef { ts_col, pks })
+}
+
+/// Time index and primary key choice for an auto-created sink table, as
+/// derived from a query plan by the `build_*` helpers in this module.
+struct TableDef {
+    /// Name of the column to use as time index; `None` makes
+    /// `create_table_with_expr` add a placeholder time index column.
+    ts_col: Option<String>,
+    /// Names of the primary key columns; may be empty.
+    pks: Vec<String>,
+}
+
+/// Derives the time index and primary keys from the group-by keys of `plan`.
+///
+/// # Returns
+///
+/// * `Ok(None)` - no group-by information was found in the plan (no aggregation);
+///   the caller should fall back to a schema-based derivation.
+/// * `Ok(Some(TableDef))` with `ts_col` set to the first timestamp column that is
+///   a group-by key, and `pks` set to the remaining group-by columns, both in
+///   output schema order.
+///
+/// If the group-by key list is empty, `ts_col` is the first timestamp column of
+/// the output schema and `pks` is empty.
+///
+/// # Errors
+///
+/// Returns a `Datafusion` error if visiting the plan fails.
+fn build_pk_from_aggr(plan: &LogicalPlan) -> Result<Option<TableDef>, Error> {
+    let fields = plan.schema().fields();
+    let mut pk_names = FindGroupByFinalName::default();
+
+    plan.visit(&mut pk_names)
+        .with_context(|_| DatafusionSnafu {
+            context: format!("Can't find aggr expr in plan {plan:?}"),
+        })?;
+
+    // No group-by information at all: let the caller decide how to fall back.
+    let Some(pk_final_names) = pk_names.get_group_expr_names() else {
+        return Ok(None);
+    };
+    // Empty group-by key list: no primary keys, and the time index is the first
+    // timestamp column found in the output schema.
+    if pk_final_names.is_empty() {
+        let first_ts_col = fields
+            .iter()
+            .find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
+            .map(|f| f.name().clone());
+        return Ok(Some(TableDef {
+            ts_col: first_ts_col,
+            pks: vec![],
+        }));
+    }
+
+    // Group-by columns that survive into the output schema, in schema order.
+    let all_pk_cols: Vec<_> = fields
+        .iter()
+        .filter(|f| pk_final_names.contains(f.name()))
+        .map(|f| f.name().clone())
+        .collect();
+    // Auto-created tables use the first timestamp column in the group-by keys
+    // as the time index. It is possible that timestamp columns appear only as
+    // aggregate outputs (for example `max(ts)`) and are not group-by keys; in
+    // that case `first_time_stamp` stays `None` and the caller falls back to a
+    // placeholder time index column.
+    let first_time_stamp = fields
+        .iter()
+        .find(|f| {
+            all_pk_cols.contains(f.name())
+                && ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp()
+        })
+        .map(|f| f.name().clone());
+
+    // The time index column must not also be listed as a primary key.
+    let all_pk_cols: Vec<_> = all_pk_cols
+        .into_iter()
+        .filter(|col| first_time_stamp.as_ref() != Some(col))
+        .collect();
+
+    Ok(Some(TableDef {
+        ts_col: first_time_stamp,
+        pks: all_pk_cols,
+    }))
+}
+
+/// Tests for deriving the auto-created sink table definition from SQL plans.
+#[cfg(test)]
+mod test {
+    use api::v1::column_def::try_as_column_schema;
+    use datatypes::prelude::ConcreteDataType;
+    use datatypes::schema::ColumnSchema;
+    use pretty_assertions::assert_eq;
+    use session::context::QueryContext;
+
+    use super::*;
+    use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+    use crate::batching_mode::utils::sql_to_df_plan;
+    use crate::test_utils::create_test_query_engine;
+
+    /// Plans each SQL statement, runs `create_table_with_expr` in `QueryType::Sql`
+    /// mode and compares the resulting columns, primary keys and time index with
+    /// the expectations of the test case.
+    #[tokio::test]
+    async fn test_gen_create_table_sql() {
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+        // One SQL statement together with the table definition expected for it.
+        struct TestCase {
+            sql: String,
+            sink_table_name: String,
+            column_schemas: Vec<ColumnSchema>,
+            primary_keys: Vec<String>,
+            time_index: String,
+        }
+
+        // Auto-added column expected on every SQL sink table.
+        let update_at_schema = ColumnSchema::new(
+            AUTO_CREATED_UPDATE_AT_TS_COL,
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            true,
+        );
+
+        // Auto-added time index expected when no usable timestamp column exists.
+        let ts_placeholder_schema = ColumnSchema::new(
+            AUTO_CREATED_PLACEHOLDER_TS_COL,
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true);
+
+        let testcases = vec![
+            // No aggregation: `ts` is the time index and there are no primary keys.
+            TestCase {
+                sql: "SELECT number, ts FROM numbers_with_ts".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+                    ColumnSchema::new(
+                        "ts",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        false,
+                    )
+                    .with_time_index(true),
+                    update_at_schema.clone(),
+                ],
+                primary_keys: vec![],
+                time_index: "ts".to_string(),
+            },
+            // The timestamp only appears as an aggregate output (`max(ts)`), so the
+            // placeholder time index is expected and `number` is the primary key.
+            TestCase {
+                sql: "SELECT number, max(ts) FROM numbers_with_ts GROUP BY number".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+                    ColumnSchema::new(
+                        "max(numbers_with_ts.ts)",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        true,
+                    ),
+                    update_at_schema.clone(),
+                    ts_placeholder_schema.clone(),
+                ],
+                primary_keys: vec!["number".to_string()],
+                time_index: AUTO_CREATED_PLACEHOLDER_TS_COL.to_string(),
+            },
+            // Grouping by `ts` only: `ts` is the time index, no primary keys remain.
+            TestCase {
+                sql: "SELECT max(number), ts FROM numbers_with_ts GROUP BY ts".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new(
+                        "max(numbers_with_ts.number)",
+                        ConcreteDataType::uint32_datatype(),
+                        true,
+                    ),
+                    ColumnSchema::new(
+                        "ts",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        false,
+                    )
+                    .with_time_index(true),
+                    update_at_schema.clone(),
+                ],
+                primary_keys: vec![],
+                time_index: "ts".to_string(),
+            },
+            // Grouping by `ts` and `number`: `ts` is the time index and `number`
+            // is the primary key.
+            TestCase {
+                sql: "SELECT number, ts FROM numbers_with_ts GROUP BY ts, number".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+                    ColumnSchema::new(
+                        "ts",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        false,
+                    )
+                    .with_time_index(true),
+                    update_at_schema.clone(),
+                ],
+                primary_keys: vec!["number".to_string()],
+                time_index: "ts".to_string(),
+            },
+        ];
+
+        for tc in testcases {
+            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &tc.sql, true)
+                .await
+                .unwrap();
+            let expr = create_table_with_expr(
+                &plan,
+                &[
+                    "greptime".to_string(),
+                    "public".to_string(),
+                    tc.sink_table_name.clone(),
+                ],
+                &QueryType::Sql,
+            )
+            .unwrap();
+            // TODO(discord9): assert expr
+            // Convert the generated column defs back to `ColumnSchema` for comparison.
+            let column_schemas = expr
+                .column_defs
+                .iter()
+                .map(|c| try_as_column_schema(c).unwrap())
+                .collect::<Vec<_>>();
+            assert_eq!(tc.column_schemas, column_schemas, "{:?}", tc.sql);
+            assert_eq!(tc.primary_keys, expr.primary_keys, "{:?}", tc.sql);
+            assert_eq!(tc.time_index, expr.time_index, "{:?}", tc.sql);
+        }
+    }
+}
diff --git a/src/flow/src/batching_mode/task.rs b/src/flow/src/batching_mode/task.rs
index 84c96cc7cd..3cdf7899a6 100644
--- a/src/flow/src/batching_mode/task.rs
+++ b/src/flow/src/batching_mode/task.rs
@@ -28,13 +28,12 @@ use datafusion::sql::unparser::expr_to_sql;
 use datafusion_common::DFSchemaRef;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp};
-use datatypes::prelude::ConcreteDataType;
-use datatypes::schema::{ColumnSchema, Schema};
-use operator::expr_helper::column_schemas_to_defs;
+use datatypes::schema::Schema;
 use query::QueryEngineRef;
+use query::options::FLOW_INCREMENTAL_MODE;
 use query::query_engine::DefaultSerializer;
 use session::context::QueryContextRef;
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ResultExt};
 use sql::parsers::utils::is_tql;
 use store_api::mito_engine_options::MERGE_MODE_KEY;
 use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
@@ -43,19 +42,20 @@ use tokio::sync::oneshot;
 use tokio::sync::oneshot::error::TryRecvError;
 use tokio::time::Instant;
 
-use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
 use crate::batching_mode::BatchingModeOptions;
-use crate::batching_mode::frontend_client::FrontendClient;
-use crate::batching_mode::state::{FilterExprInfo, TaskState};
+use crate::batching_mode::checkpoint::checkpoint_mode_label;
+use crate::batching_mode::frontend_client::{FrontendClient, PeerDesc};
+use crate::batching_mode::state::{CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState};
+use crate::batching_mode::table_creator::{QueryType, create_table_with_expr};
 use crate::batching_mode::time_window::TimeWindowExpr;
 use crate::batching_mode::utils::{
-    AddFilterRewriter, ColumnMatcherRewriter, FindGroupByFinalName, gen_plan_with_matching_schema,
+    AddFilterRewriter, ColumnMatcherRewriter, gen_plan_with_matching_schema,
     get_table_info_df_schema, sql_to_df_plan,
 };
 use crate::df_optimizer::apply_df_optimizer;
 use crate::error::{
-    ConvertColumnSchemaSnafu, DatafusionSnafu, ExternalSnafu, InvalidQuerySnafu,
-    SubstraitEncodeLogicalPlanSnafu, UnexpectedSnafu,
+    DatafusionSnafu, ExternalSnafu, InvalidQuerySnafu, SubstraitEncodeLogicalPlanSnafu,
+    UnexpectedSnafu,
 };
 use crate::metrics::{
     METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME,
@@ -64,6 +64,15 @@ use crate::metrics::{
 };
 use crate::{Error, FlowId};
 
+mod ckpt;
+mod inc;
+
+/// Maximum number of dirty time-window predicates attached to one incremental
+/// SQL query. This keeps generated OR filters bounded so Substrait encoding and
+/// downstream planning remain predictable; if the backlog is larger, the flow
+/// drains one capped batch and postpones checkpoint advancement to a later run.
+const MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS: usize = 4096;
+
 /// The task's config, immutable once created
 #[derive(Clone)]
 pub struct TaskConfig {
@@ -100,14 +109,6 @@ fn is_merge_mode_last_non_null(options: &HashMap<String, String>) -> bool {
         .unwrap_or(false)
 }
 
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum QueryType {
-    /// query is a tql query
-    Tql,
-    /// query is a sql query
-    Sql,
-}
-
 #[derive(Clone)]
 pub struct BatchingTask {
     pub config: Arc<TaskConfig>,
@@ -132,7 +133,21 @@ pub struct TaskArgs<'a> {
 
+/// A generated query plan together with the bookkeeping needed to run it.
 pub struct PlanInfo {
+    /// The logical plan to execute.
     pub plan: LogicalPlan,
-    pub filter: Option<FilterExprInfo>,
+    /// Which dirty-window state to restore if running `plan` fails.
+    pub dirty_restore: DirtyRestore,
+    /// Forwarded to `execute_logical_plan`, where it gates the attempt to
+    /// rewrite the plan for incremental execution.
+    pub can_advance_checkpoints: bool,
+}
+
+/// Describes which dirty-window state was consumed while generating a query,
+/// so that it can be restored if the run fails.
+pub enum DirtyRestore {
+    /// The query was scoped to dirty time ranges; restore those ranges if the
+    /// run fails.
+    Scoped(FilterExprInfo),
+    /// The query could not be scoped to dirty time ranges, so the dirty-window
+    /// state is only a dirty signal. Restore the consumed signal if the full
+    /// run fails.
+    ///
+    /// TODO(discord9): Full-query runs only need a dirty bool flag. Refactor
+    /// the unscoped path to stop reusing `DirtyTimeWindows` for this signal.
+    Unscoped(DirtyTimeWindows),
 }
 
 impl BatchingTask {
@@ -210,7 +225,7 @@ impl BatchingTask {
         &self,
         engine: &QueryEngineRef,
         frontend_client: &Arc<FrontendClient>,
-    ) -> Result<Option<(u32, Duration)>, Error> {
+    ) -> Result<Option<(usize, Duration)>, Error> {
         if !self.is_table_exist(&self.config.sink_table_name).await? {
             let create_table = self.gen_create_table_expr(engine.clone()).await?;
             info!(
@@ -241,11 +256,28 @@ impl BatchingTask {
         engine: &QueryEngineRef,
         frontend_client: &Arc<FrontendClient>,
         max_window_cnt: Option<usize>,
-    ) -> Result<Option<(u32, Duration)>, Error> {
+    ) -> Result<Option<(usize, Duration)>, Error> {
         if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? {
             debug!("Generate new query: {}", new_query.plan);
-            self.execute_logical_plan(frontend_client, &new_query.plan)
+            let dirty_filter = match &new_query.dirty_restore {
+                DirtyRestore::Scoped(f) => Some(f),
+                _ => None,
+            };
+            match self
+                .execute_logical_plan(
+                    frontend_client,
+                    &new_query.plan,
+                    dirty_filter,
+                    new_query.can_advance_checkpoints,
+                )
                 .await
+            {
+                Ok(result) => Ok(result),
+                Err(err) => {
+                    self.handle_executed_query_failure(Some(&new_query));
+                    Err(err)
+                }
+            }
         } else {
             debug!("Generate no query");
             Ok(None)
@@ -278,57 +310,68 @@ impl BatchingTask {
             )
             .await?;
 
-        let insert_into_info = if let Some(new_query) = new_query {
-            // first check if all columns in input query exists in sink table
-            // since insert into ref to names in record batch generate by given query
-            let table_columns = df_schema
-                .columns()
-                .into_iter()
-                .map(|c| c.name)
-                .collect::<BTreeSet<_>>();
-            for column in new_query.plan.schema().columns() {
-                ensure!(
-                    table_columns.contains(column.name()),
-                    InvalidQuerySnafu {
-                        reason: format!(
-                            "Column {} not found in sink table with columns {:?}",
-                            column, table_columns
-                        ),
-                    }
-                );
-            }
-
-            let table_provider = Arc::new(DfTableProviderAdapter::new(table));
-            let table_source = Arc::new(DefaultTableSource::new(table_provider));
-
-            // update_at& time index placeholder (if exists) should have default value
-            let plan = LogicalPlan::Dml(DmlStatement::new(
-                datafusion_common::TableReference::Full {
-                    catalog: self.config.sink_table_name[0].clone().into(),
-                    schema: self.config.sink_table_name[1].clone().into(),
-                    table: self.config.sink_table_name[2].clone().into(),
-                },
-                table_source,
-                WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
-                Arc::new(new_query.plan),
-            ));
-            PlanInfo {
-                plan,
-                filter: new_query.filter,
-            }
-        } else {
+        let Some(new_query) = new_query else {
             return Ok(None);
         };
-        let insert_into = insert_into_info
-            .plan
-            .recompute_schema()
-            .context(DatafusionSnafu {
-                context: "Failed to recompute schema",
-            })?;
+
+        // first check if all columns in input query exists in sink table
+        // since insert into ref to names in record batch generate by given query
+        let table_columns = df_schema
+            .columns()
+            .into_iter()
+            .map(|c| c.name)
+            .collect::<BTreeSet<_>>();
+        for column in new_query.plan.schema().columns() {
+            if !table_columns.contains(column.name()) {
+                self.restore_dirty_windows_after_failure(&new_query);
+                return InvalidQuerySnafu {
+                    reason: format!(
+                        "Column {} not found in sink table with columns {:?}",
+                        column, table_columns
+                    ),
+                }
+                .fail();
+            }
+        }
+
+        let table_provider = Arc::new(DfTableProviderAdapter::new(table));
+        let table_source = Arc::new(DefaultTableSource::new(table_provider));
+
+        // update_at& time index placeholder (if exists) should have default value
+        let plan = LogicalPlan::Dml(DmlStatement::new(
+            datafusion_common::TableReference::Full {
+                catalog: self.config.sink_table_name[0].clone().into(),
+                schema: self.config.sink_table_name[1].clone().into(),
+                table: self.config.sink_table_name[2].clone().into(),
+            },
+            table_source,
+            WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+            Arc::new(new_query.plan.clone()),
+        ));
+        let insert_into_info = PlanInfo {
+            plan,
+            dirty_restore: new_query.dirty_restore,
+            can_advance_checkpoints: new_query.can_advance_checkpoints,
+        };
+        let insert_into =
+            match insert_into_info
+                .plan
+                .clone()
+                .recompute_schema()
+                .context(DatafusionSnafu {
+                    context: "Failed to recompute schema",
+                }) {
+                Ok(insert_into) => insert_into,
+                Err(err) => {
+                    self.restore_dirty_windows_after_failure(&insert_into_info);
+                    return Err(err);
+                }
+            };
 
         Ok(Some(PlanInfo {
             plan: insert_into,
-            filter: insert_into_info.filter,
+            dirty_restore: insert_into_info.dirty_restore,
+            can_advance_checkpoints: insert_into_info.can_advance_checkpoints,
         }))
     }
 
@@ -349,7 +392,9 @@ impl BatchingTask {
         &self,
         frontend_client: &Arc<FrontendClient>,
         plan: &LogicalPlan,
-    ) -> Result<Option<(u32, Duration)>, Error> {
+        dirty_filter: Option<&FilterExprInfo>,
+        can_advance_checkpoints: bool,
+    ) -> Result<Option<(usize, Duration)>, Error> {
         let instant = Instant::now();
         let flow_id = self.config.flow_id;
 
@@ -378,81 +423,167 @@ impl BatchingTask {
             })?
             .data;
 
-        let mut peer_desc = None;
+        // For incremental-mode SQL queries, attempt to rewrite the delta aggregate
+        // plan into a safe delta-LEFT-JOIN-sink form before deciding on extensions.
+        let incremental_plan = if can_advance_checkpoints {
+            self.prepare_plan_for_incremental(&plan, dirty_filter)
+                .await?
+        } else {
+            None
+        };
+        let incremental_safe = incremental_plan.is_some();
+        let plan = incremental_plan.unwrap_or_else(|| plan.clone());
 
+        let extensions = self
+            .build_flow_query_extensions(incremental_safe, can_advance_checkpoints)
+            .await?;
+        let extension_refs = extensions
+            .iter()
+            .map(|(key, value)| (*key, value.as_str()))
+            .collect::<Vec<_>>();
+        let query_mode = if extensions
+            .iter()
+            .any(|(key, _)| *key == FLOW_INCREMENTAL_MODE)
+        {
+            CheckpointMode::Incremental
+        } else {
+            CheckpointMode::FullSnapshot
+        };
+        Self::record_query_mode(flow_id, query_mode);
+        debug!(
+            "Flow {flow_id} executing batching query with checkpoint_mode={}, extension_count={}",
+            checkpoint_mode_label(query_mode),
+            extensions.len()
+        );
+
+        let mut peer_desc = None;
         let res = {
             let _timer = METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME
                 .with_label_values(&[flow_id.to_string().as_str()])
                 .start_timer();
 
-            // hack and special handling the insert logical plan
             let req = if let Some((insert_to, insert_plan)) =
                 breakup_insert_plan(&plan, catalog, schema)
             {
                 let message = DFLogicalSubstraitConvertor {}
                     .encode(&insert_plan, DefaultSerializer)
                     .context(SubstraitEncodeLogicalPlanSnafu)?;
-                api::v1::greptime_request::Request::Query(api::v1::QueryRequest {
+                api::v1::QueryRequest {
                     query: Some(api::v1::query_request::Query::InsertIntoPlan(
                         api::v1::InsertIntoPlan {
                             table_name: Some(insert_to),
                             logical_plan: message.to_vec(),
                         },
                     )),
-                })
+                }
             } else {
                 let message = DFLogicalSubstraitConvertor {}
                     .encode(&plan, DefaultSerializer)
                     .context(SubstraitEncodeLogicalPlanSnafu)?;
 
-                api::v1::greptime_request::Request::Query(api::v1::QueryRequest {
+                api::v1::QueryRequest {
                     query: Some(api::v1::query_request::Query::LogicalPlan(message.to_vec())),
-                })
+                }
             };
 
             frontend_client
-                .handle(req, catalog, schema, &mut peer_desc)
+                .query_with_terminal_metrics(catalog, schema, req, &extension_refs, &mut peer_desc)
                 .await
         };
 
         let elapsed = instant.elapsed();
-        if let Ok(affected_rows) = &res {
-            debug!(
-                "Flow {flow_id} executed, affected_rows: {affected_rows:?}, elapsed: {:?}",
-                elapsed
-            );
-            METRIC_FLOW_ROWS
-                .with_label_values(&[format!("{}-out-batching", flow_id).as_str()])
-                .inc_by(*affected_rows as _);
-        } else if let Err(err) = &res {
+        let peer_label = peer_desc
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or_else(|| PeerDesc::default().to_string());
+        if let Err(err) = &res {
             warn!(
-                "Failed to execute Flow {flow_id} on frontend {:?}, result: {err:?}, elapsed: {:?} with query: {}",
-                peer_desc, elapsed, &plan
+                "Failed to execute Flow {flow_id} on frontend {peer_label}, result: {err:?}, elapsed: {:?} with query: {}",
+                elapsed, &plan
             );
+            let decision = {
+                let mut state = self.state.write().unwrap();
+                let reason = Self::query_failure_reason(err);
+                Self::apply_query_failure_to_state(&mut state, elapsed, reason)
+            };
+            if let Some(decision) = decision {
+                Self::record_checkpoint_decision(flow_id, decision);
+            }
         }
 
         // record slow query
         if elapsed >= self.config.batch_opts.slow_query_threshold {
             warn!(
-                "Flow {flow_id} on frontend {:?} executed for {:?} before complete, query: {}",
-                peer_desc, elapsed, &plan
+                "Flow {flow_id} on frontend {peer_label} executed for {:?} before complete, query: {}",
+                elapsed, &plan
             );
+            let flow_id = flow_id.to_string();
             METRIC_FLOW_BATCHING_ENGINE_SLOW_QUERY
-                .with_label_values(&[
-                    flow_id.to_string().as_str(),
-                    &peer_desc.unwrap_or_default().to_string(),
-                ])
+                .with_label_values(&[flow_id.as_str(), peer_label.as_str()])
                 .observe(elapsed.as_secs_f64());
         }
 
+        let res = res?;
+        let (affected_rows, _) = res.output.extract_rows_and_cost();
+        debug!(
+            "Flow {flow_id} executed, affected_rows: {affected_rows:?}, elapsed: {:?}, watermark: {:?}",
+            elapsed,
+            res.region_watermark_map()
+        );
+        METRIC_FLOW_ROWS
+            .with_label_values(&[format!("{}-out-batching", flow_id).as_str()])
+            .inc_by(affected_rows as _);
+        {
+            let mut state = self.state.write().unwrap();
+            let decision = Self::apply_query_result_to_state(
+                &mut state,
+                &res,
+                elapsed,
+                can_advance_checkpoints,
+            );
+            Self::record_checkpoint_decision(flow_id, decision);
+        }
+
+        Ok(Some((affected_rows, elapsed)))
+    }
+
+    /// Restore dirty windows consumed by a failed query so they are retried on
+    /// the next execution.
+    /// Scoped plans re-add the filter's time ranges; unscoped plans re-add the saved windows.
+    fn restore_dirty_windows_after_failure(&self, query: &PlanInfo) {
+        match &query.dirty_restore {
+            DirtyRestore::Scoped(filter) => self.restore_scoped_dirty_windows(filter),
+            DirtyRestore::Unscoped(dirty_windows) => self
+                .state
+                .write()
+                .unwrap()
+                .dirty_time_windows
+                .add_dirty_windows(dirty_windows),
+        }
+    }
+
+    fn restore_scoped_dirty_windows(&self, filter: &FilterExprInfo) {
         self.state
             .write()
             .unwrap()
-            .after_query_exec(elapsed, res.is_ok());
+            .dirty_time_windows
+            .add_windows(filter.time_ranges.clone()); // adds the filter's time ranges to the dirty-window set
+    }
 
-        let res = res?;
+    fn restore_scoped_dirty_windows_on_err<T>(
+        &self,
+        filter: &FilterExprInfo,
+        result: Result<T, Error>,
+    ) -> Result<T, Error> {
+        result.inspect_err(|_| { // runs only on Err; `result` itself is returned unchanged
+            self.restore_scoped_dirty_windows(filter);
+        })
+    }
 
-        Ok(Some((res, elapsed)))
+    fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) {
+        if let Some(query) = query { // nothing to restore when `query` is None
+            self.restore_dirty_windows_after_failure(query);
+        }
     }
 
     /// start executing query in a loop, break when receive shutdown signal
@@ -506,8 +637,17 @@ impl BatchingTask {
             };
 
             let res = if let Some(new_query) = &new_query {
-                self.execute_logical_plan(&frontend_client, &new_query.plan)
-                    .await
+                let dirty_filter = match &new_query.dirty_restore {
+                    DirtyRestore::Scoped(f) => Some(f),
+                    _ => None,
+                };
+                self.execute_logical_plan(
+                    &frontend_client,
+                    &new_query.plan,
+                    dirty_filter,
+                    new_query.can_advance_checkpoints,
+                )
+                .await
             } else {
                 Ok(None)
             };
@@ -535,12 +675,17 @@ impl BatchingTask {
                                 .as_ref()
                                 .and_then(|t| *t.time_window_size());
 
+                            let prefer_short_incremental_cadence = state.checkpoint_mode()
+                                == CheckpointMode::Incremental
+                                && !state.is_incremental_disabled();
+
                             state.get_next_start_query_time(
                                 self.config.flow_id,
                                 &time_window_size,
                                 min_refresh,
                                 Some(self.config.batch_opts.query_timeout),
                                 self.config.batch_opts.experimental_max_filter_num_per_query,
+                                prefer_short_incremental_cadence,
                             )
                         };
 
@@ -558,16 +703,13 @@ impl BatchingTask {
                 }
                 // TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed
                 Err(err) => {
+                    self.handle_executed_query_failure(new_query.as_ref());
                     METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT
                         .with_label_values(&[&flow_id_str])
                         .inc();
                     match new_query {
                         Some(query) => {
                             common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan);
-                            // Re-add dirty windows back since query failed
-                            self.state.write().unwrap().dirty_time_windows.add_windows(
-                                query.filter.map(|f| f.time_ranges).unwrap_or_default(),
-                            );
                             // TODO(discord9): add some backoff here? half the query time window or what
                             // backoff meaning use smaller `max_window_cnt` for next query
 
@@ -641,8 +783,13 @@ impl BatchingTask {
                         self.config.flow_id
                     );
                     // clean dirty time window too, this could be from create flow's check_execute
-                    let is_dirty = !self.state.read().unwrap().dirty_time_windows.is_empty();
-                    self.state.write().unwrap().dirty_time_windows.clean();
+                    let (is_dirty, dirty_windows_to_restore) = {
+                        let mut state = self.state.write().unwrap();
+                        let dirty_windows_to_restore = state.dirty_time_windows.clone();
+                        let is_dirty = !dirty_windows_to_restore.is_empty();
+                        state.dirty_time_windows.clean();
+                        (is_dirty, dirty_windows_to_restore)
+                    };
 
                     if !is_dirty {
                         // no dirty data, hence no need to update
@@ -650,7 +797,7 @@ impl BatchingTask {
                         return Ok(None);
                     }
 
-                    let plan = gen_plan_with_matching_schema(
+                    let plan = match gen_plan_with_matching_schema(
                         &self.config.query,
                         query_ctx,
                         engine,
@@ -658,15 +805,36 @@ impl BatchingTask {
                         primary_key_indices,
                         allow_partial,
                     )
-                    .await?;
+                    .await
+                    {
+                        Ok(plan) => plan,
+                        Err(err) => {
+                            self.state
+                                .write()
+                                .unwrap()
+                                .dirty_time_windows
+                                .add_dirty_windows(&dirty_windows_to_restore);
+                            return Err(err);
+                        }
+                    };
 
-                    return Ok(Some(PlanInfo { plan, filter: None }));
+                    return Ok(Some(PlanInfo {
+                        plan,
+                        dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
+                        can_advance_checkpoints: true,
+                    }));
                 }
                 _ => {
-                    // clean for tql have no use for time window
-                    self.state.write().unwrap().dirty_time_windows.clean();
+                    // Clean dirty windows for full-query/non-scoped paths,
+                    // such as TQL, that cannot use a time-window filter.
+                    let dirty_windows_to_restore = {
+                        let mut state = self.state.write().unwrap();
+                        let dirty_windows_to_restore = state.dirty_time_windows.clone();
+                        state.dirty_time_windows.clean();
+                        dirty_windows_to_restore
+                    };
 
-                    let plan = gen_plan_with_matching_schema(
+                    let plan = match gen_plan_with_matching_schema(
                         &self.config.query,
                         query_ctx,
                         engine,
@@ -674,9 +842,24 @@ impl BatchingTask {
                         primary_key_indices,
                         allow_partial,
                     )
-                    .await?;
+                    .await
+                    {
+                        Ok(plan) => plan,
+                        Err(err) => {
+                            self.state
+                                .write()
+                                .unwrap()
+                                .dirty_time_windows
+                                .add_dirty_windows(&dirty_windows_to_restore);
+                            return Err(err);
+                        }
+                    };
 
-                    return Ok(Some(PlanInfo { plan, filter: None }));
+                    return Ok(Some(PlanInfo {
+                        plan,
+                        dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
+                        can_advance_checkpoints: true,
+                    }));
                 }
             };
 
@@ -706,33 +889,33 @@ impl BatchingTask {
                 ),
             })?;
 
-        let expr = self
-            .state
-            .write()
-            .unwrap()
-            .dirty_time_windows
-            .gen_filter_exprs(
+        let (expr, can_advance_checkpoints) = {
+            let mut state = self.state.write().unwrap();
+            let window_cnt = if state.checkpoint_mode() == CheckpointMode::Incremental
+                && !state.is_incremental_disabled()
+                && matches!(self.config.query_type, QueryType::Sql)
+            {
+                // Incremental scans are bounded by region sequence checkpoints,
+                // so the dirty-window filter only narrows sink-side/time-window
+                // work. Drain more windows than normal, but keep a hard cap to
+                // avoid building a huge OR filter after a long downtime. If
+                // windows remain, checkpoints won't advance this round.
+                MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS
+            } else {
+                max_window_cnt
+                    .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query)
+            };
+            let expr = state.dirty_time_windows.gen_filter_exprs(
                 &col_name,
                 Some(expire_lower_bound),
                 window_size,
-                max_window_cnt
-                    .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query),
+                window_cnt,
                 self.config.flow_id,
                 Some(self),
             )?;
-
-        debug!(
-            "Flow id={:?}, Generated filter expr: {:?}",
-            self.config.flow_id,
-            expr.as_ref()
-                .map(
-                    |expr| expr_to_sql(&expr.expr).with_context(|_| DatafusionSnafu {
-                        context: format!("Failed to generate filter expr from {expr:?}"),
-                    })
-                )
-                .transpose()?
-                .map(|s| s.to_string())
-        );
+            let can_advance_checkpoints = state.dirty_time_windows.is_empty();
+            (expr, can_advance_checkpoints)
+        };
 
         let Some(expr) = expr else {
             // no new data, hence no need to update
@@ -740,6 +923,15 @@ impl BatchingTask {
             return Ok(None);
         };
 
+        let filter_sql = expr_to_sql(&expr.expr)
+            .map(|sql| sql.to_string())
+            .unwrap_or_else(|err| format!("<failed to format filter expr: {err}>"));
+
+        debug!(
+            "Flow id={:?}, Generated filter expr: {:?}",
+            self.config.flow_id, filter_sql
+        );
+
         let mut add_filter = AddFilterRewriter::new(expr.expr.clone());
         let mut add_auto_column = ColumnMatcherRewriter::new(
             sink_table_schema.clone(),
@@ -747,363 +939,35 @@ impl BatchingTask {
             allow_partial,
         );
 
-        let plan =
-            sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false).await?;
-        let rewrite = plan
-            .clone()
-            .rewrite(&mut add_filter)
-            .and_then(|p| p.data.rewrite(&mut add_auto_column))
-            .with_context(|_| DatafusionSnafu {
-                context: format!("Failed to rewrite plan:\n {}\n", plan),
-            })?
-            .data;
+        let plan = self.restore_scoped_dirty_windows_on_err(
+            &expr,
+            sql_to_df_plan(query_ctx.clone(), engine.clone(), &self.config.query, false).await,
+        )?;
+        let rewrite = self.restore_scoped_dirty_windows_on_err(
+            &expr,
+            plan.clone()
+                .rewrite(&mut add_filter)
+                .and_then(|p| p.data.rewrite(&mut add_auto_column))
+                .with_context(|_| DatafusionSnafu {
+                    context: format!("Failed to rewrite plan:\n {}\n", plan),
+                })
+                .map(|rewrite| rewrite.data),
+        )?;
         // only apply optimize after complex rewrite is done
-        let new_plan = apply_df_optimizer(rewrite, &query_ctx).await?;
+        let new_plan = self.restore_scoped_dirty_windows_on_err(
+            &expr,
+            apply_df_optimizer(rewrite, &query_ctx).await,
+        )?;
 
         let info = PlanInfo {
             plan: new_plan.clone(),
-            filter: Some(expr),
+            dirty_restore: DirtyRestore::Scoped(expr),
+            can_advance_checkpoints,
         };
 
         Ok(Some(info))
     }
 }
 
-// auto created table have a auto added column `update_at`, and optional have a `AUTO_CREATED_PLACEHOLDER_TS_COL` column for time index placeholder if no timestamp column is specified
-// TODO(discord9): for now no default value is set for auto added column for compatibility reason with streaming mode, but this might change in favor of simpler code?
-fn create_table_with_expr(
-    plan: &LogicalPlan,
-    sink_table_name: &[String; 3],
-    query_type: &QueryType,
-) -> Result<CreateTableExpr, Error> {
-    let table_def = match query_type {
-        &QueryType::Sql => {
-            if let Some(def) = build_pk_from_aggr(plan)? {
-                def
-            } else {
-                build_by_sql_schema(plan)?
-            }
-        }
-        QueryType::Tql => {
-            // first try build from aggr, then from tql schema because tql query might not have aggr node
-            if let Some(table_def) = build_pk_from_aggr(plan)? {
-                table_def
-            } else {
-                build_by_tql_schema(plan)?
-            }
-        }
-    };
-    let first_time_stamp = table_def.ts_col;
-    let primary_keys = table_def.pks;
-
-    let mut column_schemas = Vec::new();
-    for field in plan.schema().fields() {
-        let name = field.name();
-        let ty = ConcreteDataType::from_arrow_type(field.data_type());
-        let col_schema = if first_time_stamp == Some(name.clone()) {
-            ColumnSchema::new(name, ty, false).with_time_index(true)
-        } else {
-            ColumnSchema::new(name, ty, true)
-        };
-
-        match query_type {
-            QueryType::Sql => {
-                column_schemas.push(col_schema);
-            }
-            QueryType::Tql => {
-                // if is val column, need to rename as val DOUBLE NULL
-                // if is tag column, need to cast type as STRING NULL
-                let is_tag_column = primary_keys.contains(name);
-                let is_val_column = !is_tag_column && first_time_stamp.as_ref() != Some(name);
-                if is_val_column {
-                    let col_schema =
-                        ColumnSchema::new(name, ConcreteDataType::float64_datatype(), true);
-                    column_schemas.push(col_schema);
-                } else if is_tag_column {
-                    let col_schema =
-                        ColumnSchema::new(name, ConcreteDataType::string_datatype(), true);
-                    column_schemas.push(col_schema);
-                } else {
-                    // time index column
-                    column_schemas.push(col_schema);
-                }
-            }
-        }
-    }
-
-    if query_type == &QueryType::Sql {
-        let update_at_schema = ColumnSchema::new(
-            AUTO_CREATED_UPDATE_AT_TS_COL,
-            ConcreteDataType::timestamp_millisecond_datatype(),
-            true,
-        );
-        column_schemas.push(update_at_schema);
-    }
-
-    let time_index = if let Some(time_index) = first_time_stamp {
-        time_index
-    } else {
-        column_schemas.push(
-            ColumnSchema::new(
-                AUTO_CREATED_PLACEHOLDER_TS_COL,
-                ConcreteDataType::timestamp_millisecond_datatype(),
-                false,
-            )
-            .with_time_index(true),
-        );
-        AUTO_CREATED_PLACEHOLDER_TS_COL.to_string()
-    };
-
-    let column_defs =
-        column_schemas_to_defs(column_schemas, &primary_keys).context(ConvertColumnSchemaSnafu)?;
-    Ok(CreateTableExpr {
-        catalog_name: sink_table_name[0].clone(),
-        schema_name: sink_table_name[1].clone(),
-        table_name: sink_table_name[2].clone(),
-        desc: "Auto created table by flow engine".to_string(),
-        column_defs,
-        time_index,
-        primary_keys,
-        create_if_not_exists: true,
-        table_options: Default::default(),
-        table_id: None,
-        engine: "mito".to_string(),
-    })
-}
-
-/// simply build by schema, return first timestamp column and no primary key
-fn build_by_sql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
-    let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
-        if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
-            Some(f.name().clone())
-        } else {
-            None
-        }
-    });
-    Ok(TableDef {
-        ts_col: first_time_stamp,
-        pks: vec![],
-    })
-}
-
-/// Return first timestamp column found in output schema and all string columns
-fn build_by_tql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
-    let first_time_stamp = plan.schema().fields().iter().find_map(|f| {
-        if ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp() {
-            Some(f.name().clone())
-        } else {
-            None
-        }
-    });
-    let string_columns = plan
-        .schema()
-        .fields()
-        .iter()
-        .filter_map(|f| {
-            if ConcreteDataType::from_arrow_type(f.data_type()).is_string() {
-                Some(f.name().clone())
-            } else {
-                None
-            }
-        })
-        .collect::<Vec<_>>();
-
-    Ok(TableDef {
-        ts_col: first_time_stamp,
-        pks: string_columns,
-    })
-}
-
-struct TableDef {
-    ts_col: Option<String>,
-    pks: Vec<String>,
-}
-
-/// Return first timestamp column which is in group by clause and other columns which are also in group by clause
-///
-/// # Returns
-///
-/// * `Option<String>` - first timestamp column which is in group by clause
-/// * `Vec<String>` - other columns which are also in group by clause
-///
-/// if no aggregation found, return None
-fn build_pk_from_aggr(plan: &LogicalPlan) -> Result<Option<TableDef>, Error> {
-    let fields = plan.schema().fields();
-    let mut pk_names = FindGroupByFinalName::default();
-
-    plan.visit(&mut pk_names)
-        .with_context(|_| DatafusionSnafu {
-            context: format!("Can't find aggr expr in plan {plan:?}"),
-        })?;
-
-    // if no group by clause, return empty with first timestamp column found in output schema
-    let Some(pk_final_names) = pk_names.get_group_expr_names() else {
-        return Ok(None);
-    };
-    if pk_final_names.is_empty() {
-        let first_ts_col = fields
-            .iter()
-            .find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
-            .map(|f| f.name().clone());
-        return Ok(Some(TableDef {
-            ts_col: first_ts_col,
-            pks: vec![],
-        }));
-    }
-
-    let all_pk_cols: Vec<_> = fields
-        .iter()
-        .filter(|f| pk_final_names.contains(f.name()))
-        .map(|f| f.name().clone())
-        .collect();
-    // auto create table use first timestamp column in group by clause as time index
-    let first_time_stamp = fields
-        .iter()
-        .find(|f| {
-            all_pk_cols.contains(&f.name().clone())
-                && ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp()
-        })
-        .map(|f| f.name().clone());
-
-    let all_pk_cols: Vec<_> = all_pk_cols
-        .into_iter()
-        .filter(|col| first_time_stamp.as_ref() != Some(col))
-        .collect();
-
-    Ok(Some(TableDef {
-        ts_col: first_time_stamp,
-        pks: all_pk_cols,
-    }))
-}
-
 #[cfg(test)]
-mod test {
-    use api::v1::column_def::try_as_column_schema;
-    use pretty_assertions::assert_eq;
-    use session::context::QueryContext;
-
-    use super::*;
-    use crate::test_utils::create_test_query_engine;
-
-    #[tokio::test]
-    async fn test_gen_create_table_sql() {
-        let query_engine = create_test_query_engine();
-        let ctx = QueryContext::arc();
-        struct TestCase {
-            sql: String,
-            sink_table_name: String,
-            column_schemas: Vec<ColumnSchema>,
-            primary_keys: Vec<String>,
-            time_index: String,
-        }
-
-        let update_at_schema = ColumnSchema::new(
-            AUTO_CREATED_UPDATE_AT_TS_COL,
-            ConcreteDataType::timestamp_millisecond_datatype(),
-            true,
-        );
-
-        let ts_placeholder_schema = ColumnSchema::new(
-            AUTO_CREATED_PLACEHOLDER_TS_COL,
-            ConcreteDataType::timestamp_millisecond_datatype(),
-            false,
-        )
-        .with_time_index(true);
-
-        let testcases = vec![
-            TestCase {
-                sql: "SELECT number, ts FROM numbers_with_ts".to_string(),
-                sink_table_name: "new_table".to_string(),
-                column_schemas: vec![
-                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
-                    ColumnSchema::new(
-                        "ts",
-                        ConcreteDataType::timestamp_millisecond_datatype(),
-                        false,
-                    )
-                    .with_time_index(true),
-                    update_at_schema.clone(),
-                ],
-                primary_keys: vec![],
-                time_index: "ts".to_string(),
-            },
-            TestCase {
-                sql: "SELECT number, max(ts) FROM numbers_with_ts GROUP BY number".to_string(),
-                sink_table_name: "new_table".to_string(),
-                column_schemas: vec![
-                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
-                    ColumnSchema::new(
-                        "max(numbers_with_ts.ts)",
-                        ConcreteDataType::timestamp_millisecond_datatype(),
-                        true,
-                    ),
-                    update_at_schema.clone(),
-                    ts_placeholder_schema.clone(),
-                ],
-                primary_keys: vec!["number".to_string()],
-                time_index: AUTO_CREATED_PLACEHOLDER_TS_COL.to_string(),
-            },
-            TestCase {
-                sql: "SELECT max(number), ts FROM numbers_with_ts GROUP BY ts".to_string(),
-                sink_table_name: "new_table".to_string(),
-                column_schemas: vec![
-                    ColumnSchema::new(
-                        "max(numbers_with_ts.number)",
-                        ConcreteDataType::uint32_datatype(),
-                        true,
-                    ),
-                    ColumnSchema::new(
-                        "ts",
-                        ConcreteDataType::timestamp_millisecond_datatype(),
-                        false,
-                    )
-                    .with_time_index(true),
-                    update_at_schema.clone(),
-                ],
-                primary_keys: vec![],
-                time_index: "ts".to_string(),
-            },
-            TestCase {
-                sql: "SELECT number, ts FROM numbers_with_ts GROUP BY ts, number".to_string(),
-                sink_table_name: "new_table".to_string(),
-                column_schemas: vec![
-                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
-                    ColumnSchema::new(
-                        "ts",
-                        ConcreteDataType::timestamp_millisecond_datatype(),
-                        false,
-                    )
-                    .with_time_index(true),
-                    update_at_schema.clone(),
-                ],
-                primary_keys: vec!["number".to_string()],
-                time_index: "ts".to_string(),
-            },
-        ];
-
-        for tc in testcases {
-            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &tc.sql, true)
-                .await
-                .unwrap();
-            let expr = create_table_with_expr(
-                &plan,
-                &[
-                    "greptime".to_string(),
-                    "public".to_string(),
-                    tc.sink_table_name.clone(),
-                ],
-                &QueryType::Sql,
-            )
-            .unwrap();
-            // TODO(discord9): assert expr
-            let column_schemas = expr
-                .column_defs
-                .iter()
-                .map(|c| try_as_column_schema(c).unwrap())
-                .collect::<Vec<_>>();
-            assert_eq!(tc.column_schemas, column_schemas, "{:?}", tc.sql);
-            assert_eq!(tc.primary_keys, expr.primary_keys, "{:?}", tc.sql);
-            assert_eq!(tc.time_index, expr.time_index, "{:?}", tc.sql);
-        }
-    }
-}
+mod test;
diff --git a/src/flow/src/batching_mode/task/ckpt.rs b/src/flow/src/batching_mode/task/ckpt.rs
new file mode 100644
index 0000000000..035d30a079
--- /dev/null
+++ b/src/flow/src/batching_mode/task/ckpt.rs
@@ -0,0 +1,181 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use client::OutputWithMetrics;
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_telemetry::tracing::warn;
+use common_telemetry::{debug, info};
+
+use crate::batching_mode::checkpoint::{
+    FlowCheckpointDecision, FlowQueryFallbackReason, checkpoint_mode_label,
+};
+use crate::batching_mode::state::{CheckpointMode, TaskState};
+use crate::batching_mode::task::BatchingTask;
+use crate::metrics::{
+    METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT,
+};
+use crate::{Error, FlowId};
+
+impl BatchingTask {
+    pub(super) fn query_failure_reason(err: &Error) -> FlowQueryFallbackReason {
+        if err.status_code() == StatusCode::RequestOutdated {
+            FlowQueryFallbackReason::StaleCursor
+        } else {
+            FlowQueryFallbackReason::IncrementalQueryFailure
+        }
+    }
+
+    /// Records a failed query run; an incremental task is reset to full snapshot mode.
+    pub(super) fn apply_query_failure_to_state(
+        state: &mut TaskState,
+        elapsed: Duration,
+        reason: FlowQueryFallbackReason,
+    ) -> Option<FlowCheckpointDecision> {
+        state.after_query_exec(elapsed, false);
+        let previous_mode = state.checkpoint_mode();
+        if previous_mode != CheckpointMode::Incremental {
+            return None;
+        }
+        state.mark_full_snapshot();
+        Some(FlowCheckpointDecision::FallbackToFullSnapshot {
+            previous_mode,
+            reason,
+        })
+    }
+
+    /// Records a successful query run and decides how the checkpoints move.
+    ///
+    /// Falls back to full snapshot when `can_advance_checkpoints` is false, when
+    /// `res` lacks participating regions or a watermark map, or when `state`
+    /// rejects them. Otherwise the checkpoints are advanced for the current mode.
+    pub(super) fn apply_query_result_to_state(
+        state: &mut TaskState,
+        res: &OutputWithMetrics,
+        elapsed: Duration,
+        can_advance_checkpoints: bool,
+    ) -> FlowCheckpointDecision {
+        state.after_query_exec(elapsed, true);
+        let mode = state.checkpoint_mode();
+        if !can_advance_checkpoints {
+            state.mark_full_snapshot();
+            return FlowCheckpointDecision::FallbackToFullSnapshot {
+                previous_mode: mode,
+                reason: FlowQueryFallbackReason::DirtyBacklogPending,
+            };
+        }
+
+        let (Some(regions), Some(watermarks)) =
+            (res.participating_regions(), res.region_watermark_map())
+        else {
+            state.mark_full_snapshot();
+            return FlowCheckpointDecision::FallbackToFullSnapshot {
+                previous_mode: mode,
+                reason: FlowQueryFallbackReason::MissingRegionWatermark,
+            };
+        };
+
+        let accepted = match mode {
+            CheckpointMode::FullSnapshot => {
+                state.can_advance_full_snapshot_checkpoints(&regions, &watermarks)
+            }
+            CheckpointMode::Incremental => state
+                .can_advance_incremental_checkpoints_with_participation(&regions, &watermarks),
+        };
+        if !accepted {
+            state.mark_full_snapshot();
+            return FlowCheckpointDecision::FallbackToFullSnapshot {
+                previous_mode: mode,
+                reason: FlowQueryFallbackReason::IncompleteRegionWatermark,
+            };
+        }
+
+        // Take the counts now: the watermark map is moved into `state` below.
+        let region_count = regions.len();
+        let watermark_count = watermarks.len();
+        match mode {
+            CheckpointMode::FullSnapshot => {
+                state.advance_checkpoints(watermarks);
+                if state.is_incremental_disabled() {
+                    // Checkpoints were advanced, but incremental is disabled: report a fallback.
+                    return FlowCheckpointDecision::FallbackToFullSnapshot {
+                        previous_mode: CheckpointMode::FullSnapshot,
+                        reason: FlowQueryFallbackReason::IncrementalDisabled,
+                    };
+                }
+                FlowCheckpointDecision::AdvancedFromFullSnapshot {
+                    participating_regions: region_count,
+                    watermarks: watermark_count,
+                }
+            }
+            CheckpointMode::Incremental => {
+                state.advance_incremental_checkpoints_with_participation(&regions, watermarks);
+                FlowCheckpointDecision::AdvancedIncremental {
+                    participating_regions: region_count,
+                    watermarks: watermark_count,
+                }
+            }
+        }
+    }
+
+    /// Bumps the checkpoint-decision counter for this flow and logs the decision.
+    pub(super) fn record_checkpoint_decision(flow_id: FlowId, decision: FlowCheckpointDecision) {
+        METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT
+            .with_label_values(&[
+                flow_id.to_string().as_str(),
+                decision.mode_label(),
+                decision.decision_label(),
+                decision.reason_label(),
+            ])
+            .inc();
+
+        match decision {
+            FlowCheckpointDecision::FallbackToFullSnapshot {
+                previous_mode,
+                reason,
+            } => {
+                warn!(
+                    "Flow {flow_id} switched to full snapshot mode, previous_mode={}, reason={}",
+                    checkpoint_mode_label(previous_mode),
+                    reason.as_label()
+                );
+            }
+            FlowCheckpointDecision::AdvancedIncremental {
+                participating_regions,
+                watermarks,
+            } => {
+                debug!(
+                    "Flow {flow_id} advanced incremental checkpoints, participating_regions={participating_regions}, watermarks={watermarks}"
+                );
+            }
+            FlowCheckpointDecision::AdvancedFromFullSnapshot {
+                participating_regions,
+                watermarks,
+            } => {
+                info!(
+                    "Flow {flow_id} switched to incremental mode after full snapshot, participating_regions={participating_regions}, watermarks={watermarks}"
+                );
+            }
+        }
+    }
+
+    /// Bumps the query-mode counter labelled with this flow id and the given mode.
+    pub(super) fn record_query_mode(flow_id: FlowId, mode: CheckpointMode) {
+        METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT
+            .with_label_values(&[flow_id.to_string().as_str(), checkpoint_mode_label(mode)])
+            .inc();
+    }
+}
diff --git a/src/flow/src/batching_mode/task/inc.rs b/src/flow/src/batching_mode/task/inc.rs
new file mode 100644
index 0000000000..4fb64a676e
--- /dev/null
+++ b/src/flow/src/batching_mode/task/inc.rs
@@ -0,0 +1,252 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_error::ext::BoxedError;
+use common_telemetry::debug;
+use common_telemetry::tracing::warn;
+use datafusion_expr::{DmlStatement, LogicalPlan};
+use query::options::{
+    FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY,
+    FLOW_SINK_TABLE_ID,
+};
+use snafu::ResultExt;
+use table::metadata::TableId;
+
+use crate::Error;
+use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr;
+use crate::batching_mode::state::{CheckpointMode, FilterExprInfo};
+use crate::batching_mode::table_creator::QueryType;
+use crate::batching_mode::task::BatchingTask;
+use crate::batching_mode::utils::{
+    analyze_incremental_aggregate_plan, get_table_info_df_schema,
+    rewrite_incremental_aggregate_with_sink_merge,
+};
+use crate::error::{ExternalSnafu, UnexpectedSnafu};
+
+impl BatchingTask {
+    /// Looks up the sink table in the catalog and returns its table id.
+    ///
+    /// Catalog lookup errors are wrapped through `ExternalSnafu`; a sink table
+    /// that does not exist is reported through `UnexpectedSnafu`.
+    async fn sink_table_id(&self) -> Result<TableId, Error> {
+        let config = &self.config;
+        let sink = &config.sink_table_name;
+        let found = config
+            .catalog_manager
+            .table(&sink[0], &sink[1], &sink[2], None)
+            .await
+            .map_err(BoxedError::new)
+            .context(ExternalSnafu)?;
+        let Some(table) = found else {
+            return Err(UnexpectedSnafu {
+                reason: format!(
+                    "Flow {} cannot build incremental extensions because sink table {:?} was not found",
+                    config.flow_id, sink
+                ),
+            }
+            .build());
+        };
+        Ok(table.table_info().table_id())
+    }
+
+    /// Prepares the plan to execute for an incremental-mode SQL query.
+    ///
+    /// Returns `Ok(Some(plan))` when the query may run with incremental scan
+    /// extensions, and `Ok(None)` when the caller should execute the original
+    /// plan without them. The returned plan is either the original plan (plain
+    /// GROUP BY queries with no aggregate merge columns need no rewrite) or a
+    /// DML plan whose input was rewritten into a delta-LEFT-JOIN-sink merge.
+    ///
+    /// Plans that are not aggregates, or that contain unsupported expressions,
+    /// disable incremental mode for this flow. Failures while fetching the sink
+    /// table, building the dirty filter, or rewriting the plan only mark the
+    /// state as full snapshot, so incremental mode can be retried later.
+    pub(super) async fn prepare_plan_for_incremental(
+        &self,
+        plan: &LogicalPlan,
+        dirty_filter: Option<&FilterExprInfo>,
+    ) -> Result<Option<LogicalPlan>, Error> {
+        // Proceed only for a SQL flow that is currently in incremental mode and
+        // has not had incremental disabled. The read guard ends with this block.
+        {
+            let state = self.state.read().unwrap();
+            let incremental_sql = !state.is_incremental_disabled()
+                && state.checkpoint_mode() == CheckpointMode::Incremental
+                && matches!(self.config.query_type, QueryType::Sql);
+            if !incremental_sql {
+                return Ok(None);
+            }
+        }
+
+        // Unwrap the DML node to reach the inner query. A non-DML plan bypasses
+        // the rewrite and leaves the checkpoint mode untouched.
+        let LogicalPlan::Dml(dml) = plan else {
+            return Ok(None);
+        };
+        let inner_plan = dml.input.as_ref().clone();
+
+        // Incremental reads currently require an aggregate / group-by plan that
+        // can be rewritten into a delta-left-join-sink merge. Any other SQL
+        // shape keeps running full snapshots, and incremental mode is
+        // permanently disabled for this flow.
+        let Some(analysis) = analyze_incremental_aggregate_plan(&inner_plan)? else {
+            warn!(
+                "Flow {} incremental mode but plan is not an aggregate query; \
+                 permanently disabling incremental for this flow",
+                self.config.flow_id
+            );
+            self.state.write().unwrap().disable_incremental();
+            return Ok(None);
+        };
+
+        let unsupported = &analysis.unsupported_exprs;
+        if !unsupported.is_empty() {
+            warn!(
+                "Flow {} incremental aggregate contains unsupported expressions {:?}; \
+                 permanently disabling incremental for this flow",
+                self.config.flow_id, unsupported
+            );
+            self.state.write().unwrap().disable_incremental();
+            return Ok(None);
+        }
+
+        // A plain GROUP BY without aggregate expressions has no values to merge
+        // between delta and sink. The incremental delta scan emits the changed
+        // groups and sink primary-key write semantics keep the write idempotent,
+        // so the original plan is returned without a left-join rewrite.
+        if analysis.merge_columns.is_empty() {
+            return Ok(Some(plan.clone()));
+        }
+
+        // From here on, failures (catalog, schema, filter or rewrite) are treated
+        // as transient: they must not permanently disable incremental mode.
+        // Each one falls back to a full-snapshot plan for this round and keeps
+        // incremental mode retryable.
+        let sink_table = match get_table_info_df_schema(
+            self.config.catalog_manager.clone(),
+            self.config.sink_table_name.clone(),
+        )
+        .await
+        {
+            Ok((table, _)) => table,
+            Err(error) => {
+                warn!(
+                    "Flow {} failed to fetch sink table for incremental rewrite; \
+                     falling back to full snapshot for this round: {:?}",
+                    self.config.flow_id, error
+                );
+                self.state.write().unwrap().mark_full_snapshot();
+                return Ok(None);
+            }
+        };
+        let sink_schema = sink_table.table_info().meta.schema.clone();
+        // Build the sink-side dirty time window filter from `dirty_filter`; it is
+        // handed to the merge rewrite below.
+        let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr(
+            self.config.flow_id,
+            &analysis,
+            &sink_schema,
+            dirty_filter,
+        ) {
+            Ok(filter_expr) => filter_expr,
+            Err(error) => {
+                warn!(
+                    "Flow {} failed to build sink dirty time window filter; \
+                     falling back to full snapshot for this round: {:?}",
+                    self.config.flow_id, error
+                );
+                self.state.write().unwrap().mark_full_snapshot();
+                return Ok(None);
+            }
+        };
+
+        // Rewrite the inner query into a merge with the sink table.
+        let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge(
+            &inner_plan,
+            &analysis,
+            sink_table,
+            &self.config.sink_table_name,
+            sink_dirty_filter,
+        )
+        .await
+        {
+            Ok(merged_plan) => merged_plan,
+            Err(error) => {
+                warn!(
+                    "Flow {} failed to rewrite incremental aggregate with sink merge; \
+                     falling back to full snapshot for this round: {:?}",
+                    self.config.flow_id, error
+                );
+                self.state.write().unwrap().mark_full_snapshot();
+                return Ok(None);
+            }
+        };
+
+        // Wrap the rewritten query in a DML node with the original target and op.
+        let rewritten = LogicalPlan::Dml(DmlStatement::new(
+            dml.table_name.clone(),
+            dml.target.clone(),
+            dml.op.clone(),
+            Arc::new(rewritten_inner),
+        ));
+
+        debug!(
+            "Flow {} rewrote incremental SQL aggregate query with sink merge",
+            self.config.flow_id
+        );
+
+        Ok(Some(rewritten))
+    }
+
+    /// Builds the extension key/value pairs for a flow query.
+    ///
+    /// `flow.return_region_seq` is always set. The sink table id, incremental
+    /// mode and serialized checkpoints are added only when both flags are set
+    /// and the state is incremental, not disabled, with a non-empty checkpoint map.
+    pub(super) async fn build_flow_query_extensions(
+        &self,
+        incremental_safe: bool,
+        can_advance_checkpoints: bool,
+    ) -> Result<Vec<(&'static str, String)>, Error> {
+        let mut extensions = vec![("flow.return_region_seq", "true".to_string())];
+        // The read guard is confined to this block, so it is released before the `.await` below.
+        let checkpoints_json = {
+            let state = self.state.read().unwrap();
+            let use_incremental = incremental_safe
+                && can_advance_checkpoints
+                && !state.is_incremental_disabled()
+                && state.checkpoint_mode() == CheckpointMode::Incremental
+                && !state.checkpoints().is_empty();
+            if !use_incremental {
+                return Ok(extensions);
+            }
+            serde_json::to_string(state.checkpoints()).map_err(|err| {
+                UnexpectedSnafu {
+                    reason: format!("Failed to serialize checkpoint map: {err}"),
+                }
+                .build()
+            })?
+        };
+
+        let sink_table_id = self.sink_table_id().await?;
+        extensions.extend([
+            (FLOW_SINK_TABLE_ID, sink_table_id.to_string()),
+            (FLOW_INCREMENTAL_MODE, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string()),
+            (FLOW_INCREMENTAL_AFTER_SEQS, checkpoints_json),
+        ]);
+        Ok(extensions)
+    }
+}
diff --git a/src/flow/src/batching_mode/task/test.rs b/src/flow/src/batching_mode/task/test.rs
new file mode 100644
index 0000000000..959aeb00c9
--- /dev/null
+++ b/src/flow/src/batching_mode/task/test.rs
@@ -0,0 +1,1094 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::BTreeMap;
+
+use catalog::RegisterTableRequest;
+use catalog::memory::MemoryCatalogManager;
+use client::OutputWithMetrics;
+use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use common_query::Output;
+use common_recordbatch::RecordBatch;
+use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
+use datatypes::data_type::ConcreteDataType as CDT;
+use datatypes::schema::ColumnSchema;
+use datatypes::vectors::{TimestampMillisecondVector, UInt32Vector, VectorRef};
+use pretty_assertions::assert_eq;
+use query::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY};
+use session::context::QueryContext;
+use table::test_util::MemTable;
+
+use super::*;
+use crate::batching_mode::checkpoint::{
+    CHECKPOINT_DECISION_ADVANCE, CHECKPOINT_DECISION_FALLBACK, CHECKPOINT_REASON_NONE,
+    FlowCheckpointDecision, FlowQueryFallbackReason,
+};
+use crate::batching_mode::state::CheckpointMode;
+use crate::batching_mode::time_window::find_time_window_expr;
+use crate::test_utils::create_test_query_engine;
+
+/// Builds a task and its plan over `numbers_with_ts`, targeting a sink table
+/// named `missing_sink`; this helper itself does not register such a table.
+/// Only the task and the plan are returned; the query engine is dropped.
+async fn new_test_task_and_plan_with_missing_sink() -> (BatchingTask, LogicalPlan) {
+    let source_query = "SELECT number, ts FROM numbers_with_ts";
+    let parts = new_test_task_engine_and_plan_with_query(source_query, "missing_sink").await;
+    parts.into_task_and_plan()
+}
+
+/// Task, query engine and plan handed back together by the test task builders.
+struct TestTaskParts {
+    task: BatchingTask,
+    query_engine: QueryEngineRef,
+    plan: LogicalPlan,
+}
+impl TestTaskParts {
+    fn into_task_and_plan(self) -> (BatchingTask, LogicalPlan) {
+        (self.task, self.plan)
+    }
+}
+
+/// Builds a flow-1 task writing to `greptime.public.<sink_table>`, with its engine and plan.
+/// NOTE(review): the plan is always built from the fixed SELECT below, not from `query`;
+/// only the task itself receives `query` — confirm this is intended.
+async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) -> TestTaskParts {
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let fixed_sql = "SELECT number, ts FROM numbers_with_ts";
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), fixed_sql, true)
+        .await
+        .unwrap();
+    let catalog_manager = query_engine.engine_state().catalog_manager().clone();
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+
+    let args = TaskArgs {
+        flow_id: 1,
+        query,
+        plan: plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        sink_table_name: [
+            "greptime".to_string(),
+            "public".to_string(),
+            sink_table.to_string(),
+        ],
+        source_table_names: vec![[
+            "greptime".to_string(),
+            "public".to_string(),
+            "numbers_with_ts".to_string(),
+        ]],
+        query_ctx: ctx,
+        catalog_manager,
+        shutdown_rx: rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    };
+    let task = BatchingTask::try_new(args).unwrap();
+
+    TestTaskParts {
+        task,
+        query_engine,
+        plan,
+    }
+}
+
+/// Builds a flow-1 task whose plan and time window expression come from a
+/// fixed `date_bin(INTERVAL '5 second', ts)` GROUP BY query over
+/// `numbers_with_ts`; the sink table name is `missing_sink`.
+///
+/// NOTE(review): `query` is only passed to the task; the plan is not derived
+/// from it — confirm this is intended.
+async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let catalog_manager = query_engine.engine_state().catalog_manager().clone();
+    let plan_query = "SELECT number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, number";
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), plan_query, true)
+        .await
+        .unwrap();
+    let (column_name, time_window_expr, _, df_schema) =
+        find_time_window_expr(&plan, catalog_manager.clone(), ctx.clone())
+            .await
+            .unwrap();
+    let time_window_expr = time_window_expr.map(|expr| {
+        let session_state = query_engine.engine_state().session_state();
+        TimeWindowExpr::from_expr(&expr, &column_name, &df_schema, &session_state)
+            .unwrap()
+    });
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+
+    let args = TaskArgs {
+        flow_id: 1,
+        query,
+        plan: plan.clone(),
+        time_window_expr,
+        expire_after: None,
+        sink_table_name: [
+            "greptime".to_string(),
+            "public".to_string(),
+            "missing_sink".to_string(),
+        ],
+        source_table_names: vec![[
+            "greptime".to_string(),
+            "public".to_string(),
+            "numbers_with_ts".to_string(),
+        ]],
+        query_ctx: ctx,
+        catalog_manager,
+        shutdown_rx: rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    };
+    let task = BatchingTask::try_new(args).unwrap();
+
+    TestTaskParts {
+        task,
+        query_engine,
+        plan,
+    }
+}
+
+/// Registers `table_name` as a `MemTable` (table id 9001) in the default
+/// catalog and schema. The table has a single `number` uint32 column and
+/// holds one row. Panics when the engine's catalog manager is not a
+/// `MemoryCatalogManager`.
+fn register_number_only_sink(query_engine: &QueryEngineRef, table_name: &str) {
+    let number_column = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let schema = Arc::new(Schema::new(vec![number_column]));
+    let numbers: VectorRef = Arc::new(UInt32Vector::from_slice([1_u32]));
+    let batch = RecordBatch::new(schema, vec![numbers]).unwrap();
+    let request = RegisterTableRequest {
+        catalog: DEFAULT_CATALOG_NAME.to_string(),
+        schema: DEFAULT_SCHEMA_NAME.to_string(),
+        table_name: table_name.to_string(),
+        table_id: 9001,
+        table: MemTable::table(table_name, batch),
+    };
+    // Downcast to the concrete `MemoryCatalogManager` to call `register_table_sync`.
+    let catalog_manager = query_engine.engine_state().catalog_manager();
+    let any_catalog = catalog_manager.as_any();
+    let memory_catalog = any_catalog.downcast_ref::<MemoryCatalogManager>().unwrap();
+    memory_catalog.register_table_sync(request).unwrap();
+}
+
+/// Registers `table_name` as a `MemTable` (table id 9002) in the default catalog and
+/// schema, with `number`, `ts` (time index) and `update_at` columns and a single row.
+fn register_auto_created_aggregate_sink(query_engine: &QueryEngineRef, table_name: &str) {
+    let schema = Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), true),
+        ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true),
+        ColumnSchema::new("update_at", CDT::timestamp_millisecond_datatype(), true),
+    ]);
+    let row: Vec<VectorRef> = vec![
+        Arc::new(UInt32Vector::from_slice([1_u32])),
+        Arc::new(TimestampMillisecondVector::from_slice([0_i64])),
+        Arc::new(TimestampMillisecondVector::from_slice([0_i64])),
+    ];
+    let batch = RecordBatch::new(Arc::new(schema), row).unwrap();
+    let request = RegisterTableRequest {
+        catalog: DEFAULT_CATALOG_NAME.to_string(),
+        schema: DEFAULT_SCHEMA_NAME.to_string(),
+        table_name: table_name.to_string(),
+        table_id: 9002,
+        table: MemTable::table(table_name, batch),
+    };
+    // Downcast to the concrete `MemoryCatalogManager` to call `register_table_sync`.
+    let catalog_manager = query_engine.engine_state().catalog_manager();
+    let any_catalog = catalog_manager.as_any();
+    let memory_catalog = any_catalog.downcast_ref::<MemoryCatalogManager>().unwrap();
+    memory_catalog.register_table_sync(request).unwrap();
+}
+
+fn dirty_marker() -> DirtyTimeWindows {
+    let mut windows = DirtyTimeWindows::default();
+    windows.set_dirty();
+    windows
+}
+
+/// Returns dirty time windows holding one window that runs from `start` to
+/// `end`, both given in seconds.
+fn dirty_range(start: i64, end: i64) -> DirtyTimeWindows {
+    let (lower, upper) = (Timestamp::new_second(start), Timestamp::new_second(end));
+    let mut windows = DirtyTimeWindows::default();
+    windows.add_window(lower, Some(upper));
+    windows
+}
+
+/// Seeds the task state with `current_dirty_windows`, reports a failed query whose
+/// restore info is `Unscoped(consumed_dirty_windows)`, then checks the resulting windows.
+async fn assert_unscoped_failure_restore(
+    consumed_dirty_windows: DirtyTimeWindows,
+    current_dirty_windows: DirtyTimeWindows,
+    expected_len: usize,
+    expected_window_size_secs: u64,
+) {
+    let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
+    {
+        // Scoped so the write guard is released before the failure handler runs.
+        let mut guard = task.state.write().unwrap();
+        let dirty = &mut guard.dirty_time_windows;
+        dirty.clean();
+        dirty.add_dirty_windows(&current_dirty_windows);
+    }
+    let failed_query = PlanInfo {
+        plan,
+        dirty_restore: DirtyRestore::Unscoped(consumed_dirty_windows),
+        can_advance_checkpoints: true,
+    };
+
+    task.handle_executed_query_failure(Some(&failed_query));
+
+    let expected_window_size = std::time::Duration::from_secs(expected_window_size_secs);
+    let state = task.state.read().unwrap();
+    assert_eq!(state.dirty_time_windows.len(), expected_len);
+    assert_eq!(state.dirty_time_windows.window_size(), expected_window_size);
+}
+
+/// Builds an output reporting 0 affected rows whose metrics carry the given watermarks.
+fn output_with_region_watermarks(
+    watermarks: impl IntoIterator<Item = (u64, Option<u64>)>,
+) -> OutputWithMetrics {
+    let to_entry = |(region_id, watermark): (u64, Option<u64>)| RegionWatermarkEntry {
+        region_id,
+        watermark,
+    };
+    let metrics = RecordBatchMetrics {
+        region_watermarks: watermarks.into_iter().map(to_entry).collect(),
+        ..Default::default()
+    };
+    let output = OutputWithMetrics::from_output(Output::new_with_affected_rows(0));
+    output.metrics.update(Some(metrics));
+    output.metrics.mark_ready();
+    output
+}
+
+#[test]
+fn test_apply_query_result_to_state_advances_full_snapshot_to_incremental() {
+    // Fresh state, both regions report a watermark, and advancing is allowed:
+    // the run should switch the state to incremental mode and record both
+    // watermarks as checkpoints.
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let result = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, Some(20_u64))]);
+
+    let decision = BatchingTask::apply_query_result_to_state(
+        &mut state,
+        &result,
+        std::time::Duration::from_millis(1),
+        true,
+    );
+
+    let expected = FlowCheckpointDecision::AdvancedFromFullSnapshot {
+        participating_regions: 2,
+        watermarks: 2,
+    };
+    assert_eq!(decision, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+    assert_eq!(
+        state.checkpoints(),
+        &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+    );
+}
+
+#[test]
+fn test_apply_query_result_to_state_stays_full_snapshot_when_incremental_disabled() {
+    // Incremental is disabled before the run. A successful full-snapshot run
+    // with complete watermarks must then not be reported as an advance into
+    // incremental mode.
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    state.disable_incremental();
+    assert!(state.is_incremental_disabled());
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+    let result = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, Some(20_u64))]);
+    let decision = BatchingTask::apply_query_result_to_state(
+        &mut state,
+        &result,
+        std::time::Duration::from_millis(1),
+        true,
+    );
+
+    // The decision is a fallback carrying the `IncrementalDisabled` reason.
+    let expected = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::IncrementalDisabled,
+    };
+    assert_eq!(decision, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.is_incremental_disabled());
+    // The checkpoints are still stored even though the mode does not change.
+    assert_eq!(
+        state.checkpoints(),
+        &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+    );
+}
+
+#[test]
+fn test_apply_query_result_to_state_rejects_unproved_watermark() {
+    // Region 2 reports no watermark (`None`), so the result cannot advance
+    // the checkpoints: the state stays in full snapshot mode and stores
+    // nothing.
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let result = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, None)]);
+
+    let decision = BatchingTask::apply_query_result_to_state(
+        &mut state,
+        &result,
+        std::time::Duration::from_millis(1),
+        true,
+    );
+
+    let expected = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::IncompleteRegionWatermark,
+    };
+    assert_eq!(decision, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+#[test]
+fn test_apply_query_result_to_state_reports_missing_watermark() {
+    // The output is built without any metrics update, so no region
+    // watermarks are available: the fallback reason must be
+    // `MissingRegionWatermark` and no checkpoint is stored.
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    let result = OutputWithMetrics::from_output(Output::new_with_affected_rows(0));
+
+    let decision = BatchingTask::apply_query_result_to_state(
+        &mut state,
+        &result,
+        std::time::Duration::from_millis(1),
+        true,
+    );
+
+    let expected = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::MissingRegionWatermark,
+    };
+    assert_eq!(decision, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+#[test]
+fn test_apply_query_result_to_state_advances_incremental_subset() {
+    // Checkpoints are seeded for regions 1, 2 and 3, then a result covering
+    // only regions 1 and 3 is applied: those two advance while region 2
+    // keeps its previous checkpoint.
+    let (_tx, rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), rx);
+    state.advance_checkpoints(HashMap::from([
+        (1_u64, 10_u64),
+        (2_u64, 20_u64),
+        (3_u64, 30_u64),
+    ]));
+    let result = output_with_region_watermarks([(1_u64, Some(12_u64)), (3_u64, Some(35_u64))]);
+
+    let decision = BatchingTask::apply_query_result_to_state(
+        &mut state,
+        &result,
+        std::time::Duration::from_millis(1),
+        true,
+    );
+
+    let expected = FlowCheckpointDecision::AdvancedIncremental {
+        participating_regions: 2,
+        watermarks: 2,
+    };
+    assert_eq!(decision, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+    assert_eq!(
+        state.checkpoints(),
+        &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
+    );
+}
+
+/// Applies an output reporting watermarks for regions 1 and 2 to a fresh `TaskState` while
+/// passing `false` as the last argument of `apply_query_result_to_state`.
+///
+/// Expects a `FallbackToFullSnapshot` decision with reason `DirtyBacklogPending`; the state
+/// stays in `FullSnapshot` mode and none of the reported watermarks are stored.
+#[test]
+fn test_apply_query_result_to_state_blocks_full_snapshot_when_dirty_backlog_pending() {
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), shutdown_rx);
+    let output = output_with_region_watermarks([(1_u64, Some(10_u64)), (2_u64, Some(20_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+    let expected = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::FullSnapshot,
+        reason: FlowQueryFallbackReason::DirtyBacklogPending,
+    };
+
+    let actual = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, false);
+
+    assert_eq!(actual, expected);
+
+    // Neither the mode nor the checkpoint map may change on this path.
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+/// Seeds checkpoints for regions 1 and 2, then applies newer watermarks for both regions
+/// while passing `false` as the last argument of `apply_query_result_to_state`.
+///
+/// Expects a `FallbackToFullSnapshot` decision that records `Incremental` as the previous
+/// mode with reason `DirtyBacklogPending`; the mode drops to `FullSnapshot` and the seeded
+/// checkpoints are kept instead of the newly reported values.
+#[test]
+fn test_apply_query_result_to_state_blocks_incremental_when_dirty_backlog_pending() {
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), shutdown_rx);
+    let seeded = HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    state.advance_checkpoints(seeded);
+
+    let output = output_with_region_watermarks([(1_u64, Some(12_u64)), (2_u64, Some(25_u64))]);
+    let elapsed = std::time::Duration::from_millis(1);
+    let expected = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::Incremental,
+        reason: FlowQueryFallbackReason::DirtyBacklogPending,
+    };
+
+    let actual = BatchingTask::apply_query_result_to_state(&mut state, &output, elapsed, false);
+
+    assert_eq!(actual, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    // The checkpoint map still equals the seeded values, not the 12/25 watermarks.
+    let want_checkpoints = BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    assert_eq!(state.checkpoints(), &want_checkpoints);
+}
+
+/// Seeds checkpoints so the state is in `Incremental` mode, then reports a query failure
+/// with reason `IncrementalQueryFailure`.
+///
+/// Expects `Some(FallbackToFullSnapshot)` naming `Incremental` as the previous mode, the
+/// mode switched to `FullSnapshot`, and the seeded checkpoints left untouched.
+#[test]
+fn test_apply_query_failure_to_state_falls_back_from_incremental() {
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), shutdown_rx);
+    let seeded = HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    state.advance_checkpoints(seeded);
+    // Precondition: seeding checkpoints is what puts the state into incremental mode.
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+    let elapsed = std::time::Duration::from_millis(1);
+    let failure = FlowQueryFallbackReason::IncrementalQueryFailure;
+
+    let actual = BatchingTask::apply_query_failure_to_state(&mut state, elapsed, failure);
+
+    let expected = Some(FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::Incremental,
+        reason: FlowQueryFallbackReason::IncrementalQueryFailure,
+    });
+    assert_eq!(actual, expected);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    let want_checkpoints = BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]);
+    assert_eq!(state.checkpoints(), &want_checkpoints);
+}
+
+/// Reports a `StaleCursor` query failure against a fresh `TaskState` and expects no
+/// decision (`None`), with the state still in `FullSnapshot` mode and without checkpoints.
+#[test]
+fn test_apply_query_failure_to_state_keeps_full_snapshot_without_decision() {
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let mut state = TaskState::new(QueryContext::arc(), shutdown_rx);
+    let elapsed = std::time::Duration::from_millis(1);
+    let failure = FlowQueryFallbackReason::StaleCursor;
+
+    let actual = BatchingTask::apply_query_failure_to_state(&mut state, elapsed, failure);
+
+    // Nothing is reported and nothing in the state changes.
+    assert_eq!(actual, None);
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    assert!(state.checkpoints().is_empty());
+}
+
+/// Pins the label strings produced for an advance decision, for a fallback decision whose
+/// previous mode was `Incremental`, and for the `DirtyBacklogPending` fallback reason.
+#[test]
+fn test_checkpoint_decision_labels_are_stable() {
+    let advanced = FlowCheckpointDecision::AdvancedIncremental {
+        participating_regions: 1,
+        watermarks: 1,
+    };
+    assert_eq!(advanced.mode_label(), "incremental");
+    assert_eq!(advanced.decision_label(), CHECKPOINT_DECISION_ADVANCE);
+    assert_eq!(advanced.reason_label(), CHECKPOINT_REASON_NONE);
+
+    let fell_back = FlowCheckpointDecision::FallbackToFullSnapshot {
+        previous_mode: CheckpointMode::Incremental,
+        reason: FlowQueryFallbackReason::StaleCursor,
+    };
+    assert_eq!(fell_back.mode_label(), "incremental");
+    assert_eq!(fell_back.decision_label(), CHECKPOINT_DECISION_FALLBACK);
+    assert_eq!(fell_back.reason_label(), "stale_cursor");
+    let dirty_backlog = FlowQueryFallbackReason::DirtyBacklogPending;
+    assert_eq!(dirty_backlog.as_label(), "dirty_backlog_pending");
+}
+
+/// Walks `build_flow_query_extensions` through several argument/state combinations and
+/// inspects the returned extension list after each call.
+///
+/// `flow.return_region_seq` must be present in every call. `FLOW_INCREMENTAL_MODE` and
+/// `FLOW_INCREMENTAL_AFTER_SEQS` are checked per step:
+///
+/// 1. no checkpoints, `(false, true)`: the result is exactly the region-seq entry;
+/// 2. checkpoints seeded, `(false, true)`: both incremental keys absent;
+/// 3. checkpoints seeded, `(true, true)`: memtable-only mode and the seeded sequences present;
+/// 4. checkpoints seeded, `(true, false)`: both incremental keys absent;
+/// 5. after `disable_incremental()`, `(true, true)`: both incremental keys absent.
+#[tokio::test]
+async fn test_build_flow_query_extensions_switches_with_checkpoint_mode() {
+    /// Returns whether `extensions` holds an entry whose key equals `key`.
+    fn has_key(extensions: &[(&str, String)], key: &str) -> bool {
+        extensions.iter().any(|(name, _)| *name == key)
+    }
+
+    // Entry expected in the output of every call below.
+    let return_region_seq = ("flow.return_region_seq", "true".to_string());
+
+    let (task, _) = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        "numbers_with_ts",
+    )
+    .await
+    .into_task_and_plan();
+
+    // Step 1: no checkpoints yet.
+    let extensions = task.build_flow_query_extensions(false, true).await.unwrap();
+    assert_eq!(extensions, vec![return_region_seq.clone()]);
+
+    // Seed checkpoints for regions 1 and 2.
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+
+    // Step 2: checkpoints exist, first argument is `false`.
+    let extensions = task.build_flow_query_extensions(false, true).await.unwrap();
+    assert!(extensions.contains(&return_region_seq));
+    assert!(!has_key(&extensions, FLOW_INCREMENTAL_MODE));
+    assert!(!has_key(&extensions, FLOW_INCREMENTAL_AFTER_SEQS));
+
+    // Step 3: both arguments are `true`.
+    let memtable_only_mode = (
+        FLOW_INCREMENTAL_MODE,
+        FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(),
+    );
+    let seeded_after_seqs = (
+        FLOW_INCREMENTAL_AFTER_SEQS,
+        serde_json::json!({"1": 10, "2": 20}).to_string(),
+    );
+    let extensions = task.build_flow_query_extensions(true, true).await.unwrap();
+
+    assert!(extensions.contains(&return_region_seq));
+    assert!(extensions.contains(&memtable_only_mode));
+    assert!(extensions.contains(&seeded_after_seqs));
+
+    // Step 4: second argument is `false`.
+    let extensions = task.build_flow_query_extensions(true, false).await.unwrap();
+    assert!(extensions.contains(&return_region_seq));
+    assert!(!has_key(&extensions, FLOW_INCREMENTAL_MODE));
+    assert!(!has_key(&extensions, FLOW_INCREMENTAL_AFTER_SEQS));
+
+    // Step 5: incremental mode explicitly disabled on the shared state.
+    task.state.write().unwrap().disable_incremental();
+    let extensions = task.build_flow_query_extensions(true, true).await.unwrap();
+    assert!(extensions.contains(&return_region_seq));
+    assert!(!has_key(&extensions, FLOW_INCREMENTAL_MODE));
+    assert!(!has_key(&extensions, FLOW_INCREMENTAL_AFTER_SEQS));
+}
+
+/// With two dirty windows queued and `Some(1)` as the last argument, the first generated plan
+/// leaves one window behind and cannot advance checkpoints; the second drains it and can.
+#[tokio::test]
+async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after_backlog_drained() {
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, number",
+    )
+    .await;
+    {
+        let mut state = task.state.write().unwrap();
+        for (from_secs, to_secs) in [(0, 5), (30, 35)] {
+            let start = Timestamp::new_second(from_secs);
+            let end = Timestamp::new_second(to_secs);
+            state.dirty_time_windows.add_window(start, Some(end));
+        }
+    }
+    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let window_col = ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+        .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_col, window_col]));
+
+    let first_plan = task
+        .gen_query_with_time_window(query_engine.clone(), &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(!first_plan.can_advance_checkpoints);
+    assert_eq!(task.state.read().unwrap().dirty_time_windows.len(), 1);
+
+    let second_plan = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(second_plan.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+}
+
+/// Once a checkpoint is seeded, a single plan built with `Some(1)` as the last argument
+/// consumes both queued dirty windows and is allowed to advance checkpoints.
+#[tokio::test]
+async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_safety() {
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, number",
+    )
+    .await;
+    {
+        let mut state = task.state.write().unwrap();
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+        for (from_secs, to_secs) in [(0, 5), (30, 35)] {
+            let start = Timestamp::new_second(from_secs);
+            let end = Timestamp::new_second(to_secs);
+            state.dirty_time_windows.add_window(start, Some(end));
+        }
+    }
+    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let window_col = ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+        .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_col, window_col]));
+
+    let plan = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(plan.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+}
+
+/// Calls `clean()` on the dirty windows, then reports an executed-query failure for a plan
+/// whose `DirtyRestore::Scoped` info covers 10s..20s. Expects one dirty window afterwards and
+/// a `window_size()` of 10 seconds.
+#[tokio::test]
+async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
+    let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
+    task.state.write().unwrap().dirty_time_windows.clean();
+
+    let restore_info = FilterExprInfo {
+        expr: datafusion_expr::lit(true),
+        col_name: "ts".to_string(),
+        time_ranges: vec![(Timestamp::new_second(10), Timestamp::new_second(20))],
+        window_size: chrono::Duration::seconds(10),
+    };
+    let failed_query = PlanInfo {
+        plan,
+        dirty_restore: DirtyRestore::Scoped(restore_info),
+        can_advance_checkpoints: true,
+    };
+
+    task.handle_executed_query_failure(Some(&failed_query));
+
+    let state = task.state.read().unwrap();
+    let restored = &state.dirty_time_windows;
+    assert_eq!(restored.len(), 1);
+    assert_eq!(restored.window_size(), std::time::Duration::from_secs(10));
+}
+
+/// A plain projection (no aggregate) must not be prepared for incremental execution.
+///
+/// Steps:
+/// 1. plan `SELECT number, ts FROM numbers_with_ts` and wrap it in an insert DML plan;
+/// 2. build a `BatchingTask` whose source and sink are both `numbers_with_ts`;
+/// 3. seed one checkpoint, which puts the state into `Incremental` mode;
+/// 4. expect `prepare_plan_for_incremental` to return `None`, to flag incremental mode as
+///    disabled, and to leave the state in `FullSnapshot` mode.
+#[tokio::test]
+async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
+    let sql = "SELECT number, ts FROM numbers_with_ts";
+    // The same table serves as the source table and as the sink table.
+    let table_name = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "numbers_with_ts".to_string(),
+    ];
+
+    let query_engine = create_test_query_engine();
+    let catalog_manager = query_engine.engine_state().catalog_manager().clone();
+    let ctx = QueryContext::arc();
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    // Build a DML wrapper using a real sink table from the test engine.
+    let (sink_table, _) = get_table_info_df_schema(catalog_manager.clone(), table_name.clone())
+        .await
+        .unwrap();
+    let table_provider = Arc::new(DfTableProviderAdapter::new(sink_table));
+    let table_source = Arc::new(DefaultTableSource::new(table_provider));
+    let dml_plan = LogicalPlan::Dml(DmlStatement::new(
+        datafusion_common::TableReference::bare("test"),
+        table_source,
+        WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+        Arc::new(plan),
+    ));
+
+    // Keep the sender bound so the shutdown channel is not closed while the task exists.
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let task = BatchingTask::try_new(TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan: dml_plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        sink_table_name: table_name.clone(),
+        source_table_names: vec![table_name],
+        query_ctx: ctx,
+        catalog_manager,
+        shutdown_rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    })
+    .unwrap();
+
+    // Put the state into Incremental mode with checkpoints.
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+    assert_eq!(
+        task.state.read().unwrap().checkpoint_mode(),
+        CheckpointMode::Incremental
+    );
+
+    let incremental_plan = task
+        .prepare_plan_for_incremental(&dml_plan, None)
+        .await
+        .unwrap();
+    assert!(incremental_plan.is_none());
+
+    // The non-aggregate query flags incremental mode as disabled on the task state.
+    let state = task.state.read().unwrap();
+    assert!(state.is_incremental_disabled());
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+}
+
+/// A rewrite failure must fall back to a full snapshot without disabling incremental mode.
+///
+/// The aggregate query produces a `total` column, while the sink (`numbers_with_ts`) has no
+/// such column, so the incremental rewrite fails. After seeding one checkpoint the test
+/// expects `prepare_plan_for_incremental` to return `None` and the state to be in
+/// `FullSnapshot` mode with `is_incremental_disabled()` still `false`.
+#[tokio::test]
+async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite_error() {
+    let sql = "SELECT sum(number) AS total, ts FROM numbers_with_ts GROUP BY ts";
+    // The same table serves as the source table and as the sink table.
+    let table_name = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "numbers_with_ts".to_string(),
+    ];
+
+    let query_engine = create_test_query_engine();
+    let catalog_manager = query_engine.engine_state().catalog_manager().clone();
+    let ctx = QueryContext::arc();
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    // Wrap the query in an insert DML plan that targets the looked-up sink table.
+    let (sink_table, _) = get_table_info_df_schema(catalog_manager.clone(), table_name.clone())
+        .await
+        .unwrap();
+    let table_provider = Arc::new(DfTableProviderAdapter::new(sink_table));
+    let table_source = Arc::new(DefaultTableSource::new(table_provider));
+    let dml_plan = LogicalPlan::Dml(DmlStatement::new(
+        datafusion_common::TableReference::bare("test"),
+        table_source,
+        WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+        Arc::new(plan),
+    ));
+
+    // Keep the sender bound so the shutdown channel is not closed while the task exists.
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let task = BatchingTask::try_new(TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan: dml_plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        // The sink table exists, but does not have the rewritten aggregate
+        // output column `total`, so the rewrite fails deterministically.
+        sink_table_name: table_name.clone(),
+        source_table_names: vec![table_name],
+        query_ctx: ctx,
+        catalog_manager,
+        shutdown_rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    })
+    .unwrap();
+
+    // Seed one checkpoint and confirm the state is in incremental mode beforehand.
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+    assert_eq!(
+        task.state.read().unwrap().checkpoint_mode(),
+        CheckpointMode::Incremental
+    );
+
+    let incremental_plan = task
+        .prepare_plan_for_incremental(&dml_plan, None)
+        .await
+        .unwrap();
+    assert!(incremental_plan.is_none());
+
+    // The mode fell back, but incremental execution itself was not flagged as disabled.
+    let state = task.state.read().unwrap();
+    assert!(!state.is_incremental_disabled());
+    assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+}
+
+/// A `GROUP BY` query without aggregate functions must come back from incremental
+/// preparation unchanged.
+///
+/// The test wraps `SELECT ts FROM numbers_with_ts GROUP BY ts` in an insert DML plan, builds a
+/// `BatchingTask` around it, and seeds one checkpoint. `prepare_plan_for_incremental` must
+/// then return a plan whose `Display` output equals that of the original DML plan, and
+/// incremental mode must not be flagged as disabled.
+#[tokio::test]
+async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_original_plan() {
+    let sql = "SELECT ts FROM numbers_with_ts GROUP BY ts";
+    // The same table serves as the source table and as the sink table.
+    let table_name = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "numbers_with_ts".to_string(),
+    ];
+
+    let query_engine = create_test_query_engine();
+    let catalog_manager = query_engine.engine_state().catalog_manager().clone();
+    let ctx = QueryContext::arc();
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    // Wrap the query in an insert DML plan that targets the looked-up sink table.
+    let (sink_table, _) = get_table_info_df_schema(catalog_manager.clone(), table_name.clone())
+        .await
+        .unwrap();
+    let table_provider = Arc::new(DfTableProviderAdapter::new(sink_table));
+    let table_source = Arc::new(DefaultTableSource::new(table_provider));
+    let dml_plan = LogicalPlan::Dml(DmlStatement::new(
+        datafusion_common::TableReference::bare("test"),
+        table_source,
+        WriteOp::Insert(datafusion_expr::dml::InsertOp::Append),
+        Arc::new(plan),
+    ));
+
+    // Keep the sender bound so the shutdown channel is not closed while the task exists.
+    let (_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let task = BatchingTask::try_new(TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan: dml_plan.clone(),
+        time_window_expr: None,
+        expire_after: None,
+        sink_table_name: table_name.clone(),
+        source_table_names: vec![table_name],
+        query_ctx: ctx,
+        catalog_manager,
+        shutdown_rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    })
+    .unwrap();
+
+    // Seed one checkpoint before asking for the incremental plan.
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+
+    let incremental_plan = task
+        .prepare_plan_for_incremental(&dml_plan, None)
+        .await
+        .unwrap()
+        .expect("plain GROUP BY is incremental-safe without a rewrite");
+
+    // Plans are compared through their `Display` rendering.
+    assert_eq!(format!("{incremental_plan}"), format!("{dml_plan}"));
+    assert!(!task.state.read().unwrap().is_incremental_disabled());
+}
+
+/// For a sink registered via `register_auto_created_aggregate_sink`, an aggregate insert plan
+/// must remain incremental-capable: plan prepared, mode not disabled, incremental keys emitted.
+#[tokio::test]
+async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
+    let sink_table = "auto_created_aggregate_sink";
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT max(number) AS number, ts FROM numbers_with_ts GROUP BY ts",
+        sink_table,
+    )
+    .await;
+    register_auto_created_aggregate_sink(&query_engine, sink_table);
+    task.state.write().unwrap().dirty_time_windows.set_dirty();
+
+    let plan_info = task
+        .gen_insert_plan(&query_engine, None)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(plan_info.can_advance_checkpoints);
+
+    task.state
+        .write()
+        .unwrap()
+        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
+    let incremental_safe = task
+        .prepare_plan_for_incremental(&plan_info.plan, None)
+        .await
+        .unwrap()
+        .is_some();
+    assert!(incremental_safe);
+    assert!(!task.state.read().unwrap().is_incremental_disabled());
+
+    let extensions = task
+        .build_flow_query_extensions(incremental_safe, plan_info.can_advance_checkpoints)
+        .await
+        .unwrap();
+    assert!(extensions.contains(&(
+        FLOW_INCREMENTAL_MODE,
+        FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string()
+    )));
+    let has_after_seqs = extensions
+        .iter()
+        .any(|(key, _)| *key == FLOW_INCREMENTAL_AFTER_SEQS);
+    assert!(has_after_seqs);
+}
+
+#[tokio::test] // runs three scenarios through `assert_unscoped_failure_restore` (defined elsewhere)
+async fn test_unscoped_failure_restores_consumed_dirty_signal() {
+    assert_unscoped_failure_restore(dirty_marker(), DirtyTimeWindows::default(), 1, 0).await;
+    assert_unscoped_failure_restore(dirty_range(30, 40), dirty_range(10, 20), 2, 20).await;
+    assert_unscoped_failure_restore(dirty_range(30, 40), dirty_range(30, 50), 1, 20).await;
+}
+
+/// Marks the task dirty, then requests a plan for a query that selects `missing_column`.
+/// Generation must fail, and one dirty entry with a zero `window_size()` must be present after.
+#[tokio::test]
+async fn test_unscoped_plan_generation_failure_restores_consumed_dirty_signal() {
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT missing_column FROM numbers_with_ts",
+        "missing_sink",
+    )
+    .await;
+    task.state.write().unwrap().dirty_time_windows.set_dirty();
+    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let ts_col = ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false)
+        .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_col, ts_col]));
+
+    let outcome = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
+        .await;
+
+    assert!(outcome.is_err());
+    let state = task.state.read().unwrap();
+    let restored = &state.dirty_time_windows;
+    assert_eq!(restored.len(), 1);
+    assert_eq!(restored.window_size(), std::time::Duration::from_secs(0));
+}
+
+/// Queues one dirty window (10s..15s), then requests a plan for a time-window query that
+/// selects `missing_column`. Generation must fail, and one dirty window with a
+/// `window_size()` of 5 seconds must be present afterwards.
+#[tokio::test]
+async fn test_scoped_plan_generation_failure_restores_consumed_dirty_windows() {
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT missing_column, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window, missing_column",
+    )
+    .await;
+    {
+        let (start, end) = (Timestamp::new_second(10), Timestamp::new_second(15));
+        let mut state = task.state.write().unwrap();
+        state.dirty_time_windows.add_window(start, Some(end));
+    }
+    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
+    let window_col = ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+        .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![number_col, window_col]));
+
+    let outcome = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
+        .await;
+
+    assert!(outcome.is_err());
+    let state = task.state.read().unwrap();
+    let restored = &state.dirty_time_windows;
+    assert_eq!(restored.len(), 1);
+    assert_eq!(restored.window_size(), std::time::Duration::from_secs(5));
+}
+
+/// Marks the task dirty and builds an insert plan for `SELECT number, ts` against a sink
+/// registered through `register_number_only_sink`. Plan generation must fail, and afterwards
+/// one dirty entry with a zero `window_size()` must be present in the task state.
+#[tokio::test]
+async fn test_insert_plan_matching_failure_restores_consumed_dirty_marker() {
+    let sink_table = "partial_sink";
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        sink_table,
+    )
+    .await;
+    register_number_only_sink(&query_engine, sink_table);
+    task.state.write().unwrap().dirty_time_windows.set_dirty();
+
+    let result = task.gen_insert_plan(&query_engine, None).await;
+
+    assert!(
+        result.is_err(),
+        "gen_insert_plan should fail with a sink column mismatch"
+    );
+    let state = task.state.read().unwrap();
+    let restored = &state.dirty_time_windows;
+    assert_eq!(restored.len(), 1);
+    assert_eq!(restored.window_size(), std::time::Duration::from_secs(0));
+}
diff --git a/src/flow/src/batching_mode/utils.rs b/src/flow/src/batching_mode/utils.rs
index 7b066388ec..e86b1ee3be 100644
--- a/src/flow/src/batching_mode/utils.rs
+++ b/src/flow/src/batching_mode/utils.rs
@@ -278,7 +278,7 @@ fn collect_output_projection_info(plan: &LogicalPlan) -> OutputProjectionInfo {
                     let mut col_names = Vec::new();
                     find_column_names(&alias.expr, &mut col_names);
                     match col_names.len() {
-                        0 if matches!(alias.expr.as_ref(), Expr::Literal(_, _)) => {
+                        0 if is_passthrough_output_column(&alias_name, alias.expr.as_ref()) => {
                             projection_info.literal_columns.insert(alias_name);
                         }
                         1 => {
@@ -315,10 +315,38 @@ fn collect_output_projection_info(plan: &LogicalPlan) -> OutputProjectionInfo {
         }
     }
 
+    if projection_info
+        .output_field_names
+        .iter()
+        .any(|name| name == AUTO_CREATED_PLACEHOLDER_TS_COL)
+    {
+        projection_info
+            .literal_columns
+            .insert(AUTO_CREATED_PLACEHOLDER_TS_COL.to_string());
+    }
+
     projection_info.output_aliases = output_aliases;
     projection_info
 }
 
+/// Literals always pass through; the auto-created columns also accept `now()` / cast literals.
+fn is_passthrough_output_column(alias_name: &str, expr: &Expr) -> bool {
+    match alias_name {
+        AUTO_CREATED_UPDATE_AT_TS_COL if *expr == datafusion::prelude::now() => true,
+        AUTO_CREATED_PLACEHOLDER_TS_COL => is_literal_or_cast_literal(expr),
+        _ => matches!(expr, Expr::Literal(_, _)),
+    }
+}
+
+/// Returns true for a literal, optionally nested inside any number of `Cast`/`TryCast` nodes.
+fn is_literal_or_cast_literal(expr: &Expr) -> bool {
+    match expr {
+        Expr::Cast(cast) => is_literal_or_cast_literal(&cast.expr),
+        Expr::TryCast(try_cast) => is_literal_or_cast_literal(&try_cast.expr),
+        other => matches!(other, Expr::Literal(_, _)),
+    }
+}
+
 fn merge_op_for_aggregate_expr(aggr_expr: &Expr) -> Result<IncrementalAggregateMergeOp, String> {
     let Some(aggr_func) = get_aggr_func(aggr_expr) else {
         return Err(aggr_expr.to_string());
@@ -385,6 +413,11 @@ fn find_uncovered_output_fields(
             !group_key_names.contains(*name)
                 && !merge_column_names.contains(*name)
                 && !projection_info.literal_columns.contains(*name)
+                // Auto-created sink columns injected by ColumnMatcherRewriter
+                // are not part of the original aggregate semantics and must
+                // not prevent incremental aggregate rewrites.
+                && name.as_str() != AUTO_CREATED_UPDATE_AT_TS_COL
+                && name.as_str() != AUTO_CREATED_PLACEHOLDER_TS_COL
         })
         .cloned()
         .collect()
@@ -536,7 +569,8 @@ pub fn analyze_incremental_aggregate_plan(
 ///
 /// ```text
 /// delta = SELECT ts, number FROM <delta_plan> AS __flow_delta
-/// sink  = SELECT ts, number FROM <sink_table> AS __flow_sink
+/// sink_scan = SELECT * FROM <sink_table> [WHERE <sink_dirty_filter>]
+/// sink  = SELECT ts, number FROM sink_scan AS __flow_sink
 /// SELECT
 ///   CASE
 ///     WHEN __flow_sink.number IS NULL THEN __flow_delta.number
@@ -548,11 +582,17 @@ pub fn analyze_incremental_aggregate_plan(
 /// LEFT JOIN sink
 ///   ON __flow_delta.ts IS NOT DISTINCT FROM __flow_sink.ts
 /// ```
+///
+/// If `sink_dirty_filter` is provided, it is applied to the sink table scan
+/// before projection, aliasing, and the left join. The predicate must reference
+/// raw sink table columns structurally (unqualified), before the `__flow_sink`
+/// alias exists.
 pub async fn rewrite_incremental_aggregate_with_sink_merge(
     delta_plan: &LogicalPlan,
     analysis: &IncrementalAggregateAnalysis,
     sink_table: TableRef,
     sink_table_name: &TableName,
+    sink_dirty_filter: Option<Expr>,
 ) -> Result<LogicalPlan, Error> {
     ensure!(
         analysis.unsupported_exprs.is_empty(),
@@ -637,7 +677,22 @@ pub async fn rewrite_incremental_aggregate_with_sink_merge(
         .cloned()
         .map(unqualified_col)
         .collect::<Vec<_>>();
-    let sink_selected = LogicalPlanBuilder::from(sink_scan)
+    let sink_input = if let Some(predicate) = sink_dirty_filter {
+        LogicalPlanBuilder::from(sink_scan)
+            .filter(predicate)
+            .with_context(|_| DatafusionSnafu {
+                context: "Failed to filter sink table scan for incremental sink merge".to_string(),
+            })?
+            .build()
+            .with_context(|_| DatafusionSnafu {
+                context: "Failed to build filtered sink plan for incremental sink merge"
+                    .to_string(),
+            })?
+    } else {
+        sink_scan
+    };
+
+    let sink_selected = LogicalPlanBuilder::from(sink_input)
         .project(sink_selected_exprs)
         .with_context(|_| DatafusionSnafu {
             context: "Failed to project sink table scan for incremental sink merge".to_string(),
diff --git a/src/flow/src/batching_mode/utils/test.rs b/src/flow/src/batching_mode/utils/test.rs
index 863580b4ae..5b9cf7f507 100644
--- a/src/flow/src/batching_mode/utils/test.rs
+++ b/src/flow/src/batching_mode/utils/test.rs
@@ -15,10 +15,13 @@
 use std::sync::Arc;
 
 use common_recordbatch::RecordBatch;
+use common_time::Timestamp;
 use datafusion_common::tree_node::TreeNode as _;
 use datafusion_expr::GroupingSet;
-use datatypes::prelude::{ConcreteDataType, Scalar, VectorRef};
+use datatypes::prelude::{ConcreteDataType, MutableVector, Scalar, ScalarVectorBuilder, VectorRef};
 use datatypes::schema::{ColumnSchema, Schema};
+use datatypes::timestamp::TimestampMillisecond;
+use datatypes::vectors::TimestampMillisecondVectorBuilder;
 use pretty_assertions::assert_eq;
 use query::query_engine::DefaultSerializer;
 use session::context::QueryContext;
@@ -26,6 +29,7 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
 use table::test_util::MemTable;
 
 use super::*;
+use crate::batching_mode::state::FilterExprInfo;
 use crate::test_utils::create_test_query_engine;
 
 fn u32_table(table_name: &str, columns: Vec<&str>, rows: usize) -> TableRef {
@@ -50,6 +54,30 @@ fn empty_u32_table(table_name: &str, columns: Vec<&str>) -> TableRef {
     u32_table(table_name, columns, 0)
 }
 
+/// Builds an in-memory test table named `table_name` that holds a single row.
+///
+/// Columns: `time_window` (millisecond timestamp, non-nullable, time index) set to `0`, and
+/// `number` (`u32`, nullable) set to `1`.
+fn time_window_u32_table(table_name: &str) -> TableRef {
+    let time_window_column = ColumnSchema::new(
+        "time_window",
+        ConcreteDataType::timestamp_millisecond_datatype(),
+        false,
+    )
+    .with_time_index(true);
+    let number_column = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true);
+    let schema = Arc::new(Schema::new(vec![time_window_column, number_column]));
+
+    // One row: `time_window` = 0 ms, `number` = 1.
+    let mut timestamps = TimestampMillisecondVectorBuilder::with_capacity(1);
+    timestamps.push(Some(TimestampMillisecond::new(0)));
+    let numbers = Arc::new(<u32 as Scalar>::VectorType::from_vec(vec![1])) as VectorRef;
+
+    let columns = vec![timestamps.to_vector_cloned(), numbers];
+    let batch = RecordBatch::new(schema, columns).unwrap();
+    MemTable::table(table_name, batch)
+}
+
 fn assert_same_logical_plan(actual: &LogicalPlan, expected: &LogicalPlan) {
     assert_eq!(
         format!("{}", expected.display_indent()),
@@ -84,6 +112,29 @@ fn expected_left_join_rewrite(
     sink_selected_exprs: Vec<Expr>,
     join_keys: (Vec<Column>, Vec<Column>),
     projection_exprs: Vec<Expr>,
+) -> LogicalPlan {
+    expected_left_join_rewrite_with_sink_filter(
+        delta_plan,
+        sink_table,
+        sink_table_name,
+        delta_selected_exprs,
+        sink_selected_exprs,
+        None,
+        join_keys,
+        projection_exprs,
+    )
+}
+
+#[allow(clippy::too_many_arguments)]
+fn expected_left_join_rewrite_with_sink_filter(
+    delta_plan: &LogicalPlan,
+    sink_table: TableRef,
+    sink_table_name: &TableName,
+    delta_selected_exprs: Vec<Expr>,
+    sink_selected_exprs: Vec<Expr>,
+    sink_filter: Option<Expr>,
+    join_keys: (Vec<Column>, Vec<Column>),
+    projection_exprs: Vec<Expr>,
 ) -> LogicalPlan {
     let delta_alias = "__flow_delta";
     let sink_alias = "__flow_sink";
@@ -94,7 +145,17 @@ fn expected_left_join_rewrite(
         .unwrap()
         .build()
         .unwrap();
-    let sink_selected = LogicalPlanBuilder::from(test_sink_scan(sink_table, sink_table_name))
+    let sink_scan = test_sink_scan(sink_table, sink_table_name);
+    let sink_input = if let Some(predicate) = sink_filter {
+        LogicalPlanBuilder::from(sink_scan)
+            .filter(predicate)
+            .unwrap()
+            .build()
+            .unwrap()
+    } else {
+        sink_scan
+    };
+    let sink_selected = LogicalPlanBuilder::from(sink_input)
         .project(sink_selected_exprs)
         .unwrap()
         .alias(sink_alias)
@@ -576,6 +637,44 @@ async fn test_analyze_incremental_aggregate_plan_keeps_aliases_for_multiple_aggr
     }));
 }
 
+#[tokio::test]
+async fn test_analyze_incremental_aggregate_plan_allows_auto_created_sink_columns() {
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let sql = format!(
+        "SELECT max(number) AS total, ts, now() AS {}, CAST('1970-01-01 00:00:00' AS TIMESTAMP) AS {} FROM numbers_with_ts GROUP BY ts",
+        AUTO_CREATED_UPDATE_AT_TS_COL, AUTO_CREATED_PLACEHOLDER_TS_COL
+    );
+    let plan = sql_to_df_plan(ctx, query_engine, &sql, false)
+        .await
+        .unwrap();
+
+    let analysis = analyze_incremental_aggregate_plan(&plan).unwrap().unwrap();
+    assert!(
+        analysis.unsupported_exprs.is_empty(),
+        "auto-created sink columns should not disable incremental analysis: {:?}",
+        analysis.unsupported_exprs
+    );
+    assert!(
+        analysis
+            .literal_columns
+            .iter()
+            .any(|name| name == AUTO_CREATED_UPDATE_AT_TS_COL)
+    );
+    assert!(
+        analysis
+            .literal_columns
+            .iter()
+            .any(|name| name == AUTO_CREATED_PLACEHOLDER_TS_COL)
+    );
+    assert_eq!(analysis.merge_columns.len(), 1);
+    assert_eq!(analysis.merge_columns[0].output_field_name, "total");
+    assert_eq!(
+        analysis.merge_columns[0].merge_op,
+        IncrementalAggregateMergeOp::Max
+    );
+}
+
 #[tokio::test]
 async fn test_analyze_incremental_aggregate_plan_allows_where_before_aggregate() {
     let query_engine = create_test_query_engine();
@@ -641,6 +740,7 @@ async fn test_rewrite_incremental_aggregate_allows_alias_wrapped_scan() {
             "public".to_string(),
             "alias_wrapped_sink".to_string(),
         ],
+        None,
     )
     .await
     .unwrap();
@@ -887,6 +987,7 @@ async fn test_analyze_incremental_aggregate_plan_allows_literal_outputs() {
         &analysis,
         sink_table.clone(),
         &sink_table_name,
+        None,
     )
     .await
     .unwrap();
@@ -975,6 +1076,7 @@ async fn test_rewrite_incremental_aggregate_preserves_non_identifier_aliases() {
             "public".to_string(),
             "non_identifier_alias_sink".to_string(),
         ],
+        None,
     )
     .await
     .unwrap();
@@ -1161,6 +1263,7 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
         &analysis,
         sink_table.clone(),
         &sink_table_name,
+        None,
     )
     .await
     .unwrap();
@@ -1183,6 +1286,67 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
     assert_same_logical_plan(&rewritten, &expected);
 }
 
+#[tokio::test]
+async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() {
+    // This verifies the rewrite placement when callers supply an already
+    // inferred sink dirty-window predicate. The task-level inference rules are
+    // covered by `infer_sink_time_window_filter_col` tests in task.rs.
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
+        .await
+        .unwrap();
+    let analysis = analyze_incremental_aggregate_plan(&plan).unwrap().unwrap();
+    let sink_table = time_window_u32_table("time_window_sink");
+    let sink_table_name = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "time_window_sink".to_string(),
+    ];
+    let dirty_filter = FilterExprInfo {
+        expr: unqualified_col("ts"),
+        col_name: "ts".to_string(),
+        time_ranges: vec![(
+            Timestamp::new_millisecond(0),
+            Timestamp::new_millisecond(1000),
+        )],
+        window_size: chrono::Duration::seconds(1),
+    };
+    let sink_filter = dirty_filter
+        .predicate_for_col("time_window")
+        .unwrap()
+        .unwrap();
+
+    let rewritten = rewrite_incremental_aggregate_with_sink_merge(
+        &plan,
+        &analysis,
+        sink_table.clone(),
+        &sink_table_name,
+        Some(sink_filter.clone()),
+    )
+    .await
+    .unwrap();
+
+    let expected = expected_left_join_rewrite_with_sink_filter(
+        &plan,
+        sink_table,
+        &sink_table_name,
+        vec![unqualified_col("time_window"), unqualified_col("number")],
+        vec![unqualified_col("time_window"), unqualified_col("number")],
+        Some(sink_filter),
+        (
+            vec![qualified_column("__flow_delta", "time_window")],
+            vec![qualified_column("__flow_sink", "time_window")],
+        ),
+        vec![
+            max_merge_expr("number"),
+            qualified_col("__flow_delta", "time_window").alias("time_window"),
+        ],
+    );
+    assert_same_logical_plan(&rewritten, &expected);
+}
+
 #[tokio::test]
 async fn test_analyze_incremental_aggregate_plan_rejects_global_aggregate() {
     let query_engine = create_test_query_engine();
@@ -1230,6 +1394,7 @@ async fn test_rewrite_incremental_aggregate_rejects_empty_group_keys() {
         &analysis,
         sink_table,
         &sink_table_name,
+        None,
     )
     .await
     .unwrap_err();
@@ -1261,6 +1426,7 @@ async fn test_rewrite_incremental_aggregate_preserves_raw_aggregate_field_name()
         &analysis,
         sink_table.clone(),
         &sink_table_name,
+        None,
     )
     .await
     .unwrap();
diff --git a/src/flow/src/metrics.rs b/src/flow/src/metrics.rs
index 58c01793cc..00f93d47ab 100644
--- a/src/flow/src/metrics.rs
+++ b/src/flow/src/metrics.rs
@@ -87,6 +87,20 @@ lazy_static! {
             &["flow_id"],
         )
         .unwrap();
+    pub static ref METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT: IntCounterVec =
+        register_int_counter_vec!(
+            "greptime_flow_batching_checkpoint_decision_count",
+            "flow batching checkpoint state-machine decisions",
+            &["flow_id", "mode", "decision", "reason"],
+        )
+        .unwrap();
+    pub static ref METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT: IntCounterVec =
+        register_int_counter_vec!(
+            "greptime_flow_batching_query_mode_count",
+            "flow batching query attempts by checkpoint mode",
+            &["flow_id", "mode"],
+        )
+        .unwrap();
     pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
         register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
     pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(
diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs
index 24075601f6..e85bc28f9a 100644
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -303,7 +303,7 @@ impl Instance {
                     .await
             }
             _ => {
-                query_interceptor.pre_execute(&stmt, None, query_ctx.clone())?;
+                query_interceptor.pre_execute(Some(&stmt), None, query_ctx.clone())?;
                 self.statement_executor
                     .execute_sql(stmt, query_ctx)
                     .await
@@ -326,7 +326,7 @@ impl Instance {
         let QueryStatement::Sql(stmt) = stmt else {
             unreachable!()
         };
-        query_interceptor.pre_execute(&stmt, Some(&plan), query_ctx.clone())?;
+        query_interceptor.pre_execute(Some(&stmt), Some(&plan), query_ctx.clone())?;
 
         self.statement_executor
             .exec_plan(plan, query_ctx.clone())
@@ -344,7 +344,11 @@ impl Instance {
             .statement_executor
             .plan_tql(tql.clone(), query_ctx)
             .await?;
-        query_interceptor.pre_execute(&Statement::Tql(tql), Some(&plan), query_ctx.clone())?;
+        query_interceptor.pre_execute(
+            Some(&Statement::Tql(tql)),
+            Some(&plan),
+            query_ctx.clone(),
+        )?;
         self.statement_executor
             .exec_plan(plan, query_ctx.clone())
             .await
@@ -649,9 +653,7 @@ impl Instance {
         let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
         let query_interceptor = query_interceptor_opt.as_ref();
 
-        if let Some(ref s) = stmt {
-            query_interceptor.pre_execute(s, Some(&plan), query_ctx.clone())?;
-        }
+        query_interceptor.pre_execute(stmt.as_ref(), Some(&plan), query_ctx.clone())?;
 
         let query = stmt
             .as_ref()
@@ -880,7 +882,11 @@ impl PrometheusHandler for Instance {
             .map_err(BoxedError::new)
             .context(ExecuteQuerySnafu)?;
 
-        interceptor.pre_execute(query, Some(&plan), query_ctx.clone())?;
+        let QueryStatement::Promql(eval_stmt, _) = &stmt else {
+            unreachable!("query is parsed from promql");
+        };
+
+        interceptor.pre_execute(query, &eval_stmt.expr, Some(&plan), query_ctx.clone())?;
 
         // Take the EvalStmt from the original QueryStatement and use it to create the CatalogQueryStatement.
         let query_statement = if let QueryStatement::Promql(eval_stmt, alias) = stmt {
@@ -892,7 +898,7 @@ impl PrometheusHandler for Instance {
             }
             .fail();
         };
-        let query = query_statement.to_string();
+        let raw_query = query_statement.to_string();
 
         let slow_query_timer = self
             .slow_query_options
@@ -912,7 +918,7 @@ impl PrometheusHandler for Instance {
         let ticket = self.process_manager.register_query(
             query_ctx.current_catalog().to_string(),
             vec![query_ctx.current_schema()],
-            query,
+            raw_query,
             query_ctx.conn_info().to_string(),
             Some(query_ctx.process_id()),
             slow_query_timer,
@@ -1204,14 +1210,19 @@ fn should_track_plan_process(stmt: Option<&Statement>, plan: &LogicalPlan) -> bo
 #[cfg(test)]
 mod tests {
     use std::collections::HashMap;
+    use std::future::Future;
+    use std::pin::Pin;
     use std::sync::atomic::{AtomicBool, Ordering};
     use std::sync::{Arc, Barrier};
+    use std::task::{Context, Poll};
     use std::thread;
     use std::time::{Duration, Instant};
 
     use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse};
     use catalog::process_manager::ProcessManager;
     use common_base::Plugins;
+    use common_error::ext::{BoxedError, PlainError};
+    use common_error::status_code::StatusCode;
     use common_meta::cache::LayeredCacheRegistryBuilder;
     use common_meta::kv_backend::memory::MemoryKvBackend;
     use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutor};
@@ -1220,23 +1231,142 @@ mod tests {
         MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
     };
     use common_query::Output;
+    use common_recordbatch::{
+        OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream,
+    };
     use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use datafusion_expr::dml::InsertOp;
     use datafusion_expr::{LogicalPlanBuilder, LogicalTableSource};
     use datatypes::prelude::ConcreteDataType;
-    use datatypes::schema::{ColumnSchema, Schema as GtSchema};
+    use datatypes::schema::{ColumnSchema, Schema as GtSchema, SchemaRef as GtSchemaRef};
     use query::query_engine::options::QueryOptions;
     use session::context::{Channel, ConnInfo, QueryContext, QueryContextBuilder};
+    use snafu::{Location, Snafu};
     use sql::dialect::GreptimeDbDialect;
+    use store_api::data_source::DataSource;
+    use store_api::storage::ScanRequest;
     use strfmt::Format;
-    use table::metadata::{TableInfoBuilder, TableMetaBuilder};
+    use table::metadata::{FilterPushDownType, TableInfo, TableInfoBuilder, TableMetaBuilder};
     use table::test_util::EmptyTable;
+    use table::{Table, TableRef};
     use tokio::sync::{mpsc, oneshot};
 
     use super::*;
     use crate::frontend::FrontendOptions;
     use crate::instance::builder::FrontendBuilder;
 
+    #[derive(Debug, Snafu)]
+    enum TestError {
+        #[snafu(display("Failed to build test cache registry"))]
+        BuildCacheRegistry {
+            source: cache::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to build test table meta for table: {table_name}"))]
+        BuildTableMeta {
+            table_name: String,
+            source: table::metadata::TableMetaBuilderError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to build test table info for table: {table_name}"))]
+        BuildTableInfo {
+            table_name: String,
+            source: table::metadata::TableInfoBuilderError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to register test table: {table_name}"))]
+        RegisterTable {
+            table_name: String,
+            source: catalog::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to build test frontend instance"))]
+        BuildFrontend {
+            source: crate::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Expected exactly one output for SQL `{sql}`, got {actual}"))]
+        UnexpectedOutputCount {
+            sql: String,
+            actual: usize,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to execute SQL `{sql}`"))]
+        ExecuteSql {
+            sql: String,
+            source: crate::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Timed out waiting for insert-select start notification"))]
+        InsertStartTimeout {
+            source: tokio::time::error::Elapsed,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Insert-select start notification channel closed"))]
+        InsertStartChannelClosed {
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to release blocking insert-select interceptor"))]
+        ReleaseBlockedInsert {
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Timed out waiting for insert-select source to be polled"))]
+        SourcePollTimeout {
+            source: tokio::time::error::Elapsed,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Insert-select source poll notification channel closed"))]
+        SourcePollChannelClosed {
+            source: oneshot::error::RecvError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Timed out waiting for insert task to finish"))]
+        InsertTaskTimeout {
+            source: tokio::time::error::Elapsed,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Insert task panicked"))]
+        InsertTaskPanic {
+            source: tokio::task::JoinError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Expected insert-select to be cancelled"))]
+        InsertSelectNotCancelled {
+            #[snafu(implicit)]
+            location: Location,
+        },
+    }
+
+    type TestResult<T> = std::result::Result<T, TestError>;
+
     fn parse_one_sql(sql: &str) -> Statement {
         parse_stmt(sql, &GreptimeDbDialect {}).unwrap().remove(0)
     }
@@ -1270,11 +1400,11 @@ mod tests {
 
         fn pre_execute(
             &self,
-            statement: &Statement,
+            statement: Option<&Statement>,
             _plan: Option<&LogicalPlan>,
             _query_ctx: QueryContextRef,
         ) -> Result<()> {
-            let Statement::Insert(insert) = statement else {
+            let Some(Statement::Insert(insert)) = statement else {
                 return Ok(());
             };
             if !insert.has_non_values_query_source() {
@@ -1292,6 +1422,70 @@ mod tests {
         }
     }
 
+    struct PendingRecordBatchStream {
+        schema: GtSchemaRef, // handed back by `RecordBatchStream::schema`
+        polled_tx: Option<oneshot::Sender<()>>, // taken and fired on the first `poll_next`
+        _finish_tx: oneshot::Sender<()>, // sender for `finish_rx`; stored but never sent on here
+        finish_rx: Pin<Box<oneshot::Receiver<()>>>, // polled by `poll_next`; ready => stream ends
+    }
+
+    impl RecordBatchStream for PendingRecordBatchStream {
+        fn schema(&self) -> GtSchemaRef {
+            self.schema.clone()
+        }
+
+        fn output_ordering(&self) -> Option<&[OrderOption]> {
+            None
+        }
+
+        fn metrics(&self) -> Option<common_recordbatch::adapter::RecordBatchMetrics> {
+            None
+        }
+    }
+
+    /// Yields no batches: reports the first poll, then ends only once `finish_rx` resolves.
+    impl Stream for PendingRecordBatchStream {
+        type Item = common_recordbatch::error::Result<RecordBatch>;
+
+        fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+            // One-shot notification; a failed send is deliberately ignored.
+            if let Some(notifier) = self.polled_tx.take() {
+                let _ = notifier.send(());
+            }
+
+            // Whatever the receiver resolves to, readiness maps to end-of-stream.
+            self.finish_rx.as_mut().poll(cx).map(|_| None)
+        }
+    }
+
+    impl Unpin for PendingRecordBatchStream {}
+
+    struct PendingDataSource {
+        schema: GtSchemaRef,
+        polled_tx: std::sync::Mutex<Option<oneshot::Sender<()>>>,
+    }
+
+    impl DataSource for PendingDataSource {
+        /// Returns a stream that produces no batches; only the first call gets the poll notifier.
+        fn get_stream(
+            &self,
+            _request: ScanRequest,
+        ) -> std::result::Result<SendableRecordBatchStream, BoxedError> {
+            let Ok(mut notifier_slot) = self.polled_tx.lock() else {
+                let message = "pending data source lock poisoned".to_string();
+                return Err(BoxedError::new(PlainError::new(message, StatusCode::Unexpected)));
+            };
+            let (finish_tx, finish_rx) = oneshot::channel();
+            let stream = PendingRecordBatchStream {
+                schema: self.schema.clone(),
+                polled_tx: notifier_slot.take(),
+                _finish_tx: finish_tx,
+                finish_rx: Box::pin(finish_rx),
+            };
+            Ok(Box::pin(stream))
+        }
+    }
+
     struct NoopProcedureExecutor;
 
     #[async_trait::async_trait]
@@ -1353,18 +1547,18 @@ mod tests {
 
     fn test_cache_registry(
         kv_backend: common_meta::kv_backend::KvBackendRef,
-    ) -> common_meta::cache::LayeredCacheRegistryRef {
-        Arc::new(
+    ) -> TestResult<common_meta::cache::LayeredCacheRegistryRef> {
+        Ok(Arc::new(
             cache::with_default_composite_cache_registry(
                 LayeredCacheRegistryBuilder::default()
                     .add_cache_registry(cache::build_fundamental_cache_registry(kv_backend)),
             )
-            .unwrap()
+            .context(BuildCacheRegistrySnafu)?
             .build(),
-        )
+        ))
     }
 
-    fn test_table(table_id: u32, table_name: &str) -> table::TableRef {
+    fn test_table_info(table_id: u32, table_name: &str) -> TestResult<TableInfo> {
         let schema = Arc::new(GtSchema::new(vec![
             ColumnSchema::new("id", ConcreteDataType::int32_datatype(), false),
             ColumnSchema::new(
@@ -1380,36 +1574,85 @@ mod tests {
             .value_indices(vec![1])
             .next_column_id(1024)
             .build()
-            .unwrap();
-        let table_info = TableInfoBuilder::new(table_name, table_meta)
+            .with_context(|_| BuildTableMetaSnafu {
+                table_name: table_name.to_string(),
+            })?;
+
+        TableInfoBuilder::new(table_name, table_meta)
             .table_id(table_id)
             .build()
-            .unwrap();
+            .with_context(|_| BuildTableInfoSnafu {
+                table_name: table_name.to_string(),
+            })
+    }
 
-        EmptyTable::from_table_info(&table_info)
+    fn test_table(table_id: u32, table_name: &str) -> TestResult<table::TableRef> {
+        let table_info = test_table_info(table_id, table_name)?;
+        Ok(EmptyTable::from_table_info(&table_info))
+    }
+
+    /// Builds a table that uses the `test_table_info` metadata and is backed by a
+    /// `PendingDataSource`; `polled_tx` is handed to that data source.
+    fn pending_table(
+        table_id: u32,
+        table_name: &str,
+        polled_tx: oneshot::Sender<()>,
+    ) -> TestResult<table::TableRef> {
+        let info = Arc::new(test_table_info(table_id, table_name)?);
+        // The schema is cloned out of the table info so the data source can pass it on.
+        let source = PendingDataSource {
+            schema: info.meta.schema.clone(),
+            polled_tx: std::sync::Mutex::new(Some(polled_tx)),
+        };
+
+        let table = Table::new(info, FilterPushDownType::Unsupported, Arc::new(source));
+        Ok(Arc::new(table))
+    }
+
+    async fn test_instance_with_tables(
+        source_table: TableRef,
+        target_table: TableRef,
+    ) -> TestResult<Instance> {
+        test_instance_with_plugins(source_table, target_table, Plugins::new()).await
     }
 
     async fn test_instance_with_insert_select_interceptor(
         interceptor: SqlQueryInterceptorRef<Error>,
-    ) -> Instance {
+    ) -> TestResult<Instance> {
+        let plugins = Plugins::new();
+        plugins.insert::<SqlQueryInterceptorRef<Error>>(interceptor);
+
+        test_instance_with_plugins(
+            test_table(1024, "source")?,
+            test_table(1025, "target")?,
+            plugins,
+        )
+        .await
+    }
+
+    async fn test_instance_with_plugins(
+        source_table: TableRef,
+        target_table: TableRef,
+        plugins: Plugins,
+    ) -> TestResult<Instance> {
         let kv_backend = Arc::new(MemoryKvBackend::new());
         let process_manager = Arc::new(ProcessManager::new("test-frontend".to_string(), None));
-        let catalog_manager =
-            catalog::memory::MemoryCatalogManager::new_with_table(test_table(1024, "source"));
+        let catalog_manager = catalog::memory::MemoryCatalogManager::new_with_table(source_table);
+        let target_table_name = "target";
         catalog_manager
             .register_table_sync(catalog::RegisterTableRequest {
                 catalog: "greptime".to_string(),
                 schema: "public".to_string(),
-                table_name: "target".to_string(),
+                table_name: target_table_name.to_string(),
                 table_id: 1025,
-                table: test_table(1025, "target"),
+                table: target_table,
             })
-            .unwrap();
+            .with_context(|_| RegisterTableSnafu {
+                table_name: target_table_name.to_string(),
+            })?;
         catalog_manager.register_process_list_table(process_manager.clone());
 
-        let cache_registry = test_cache_registry(kv_backend.clone());
-        let plugins = Plugins::new();
-        plugins.insert::<SqlQueryInterceptorRef<Error>>(interceptor);
+        let cache_registry = test_cache_registry(kv_backend.clone())?;
 
         FrontendBuilder::new(
             FrontendOptions::default(),
@@ -1423,17 +1666,25 @@ mod tests {
         .with_plugin(plugins)
         .try_build()
         .await
-        .unwrap()
+        .context(BuildFrontendSnafu)
     }
 
     async fn execute_one_sql(
         instance: &Instance,
         sql: &str,
         query_ctx: QueryContextRef,
-    ) -> Result<Output> {
+    ) -> TestResult<Output> {
         let mut results = instance.do_query_inner(sql, query_ctx).await;
-        assert_eq!(1, results.len());
-        results.remove(0)
+        ensure!(
+            results.len() == 1,
+            UnexpectedOutputCountSnafu {
+                sql: sql.to_string(),
+                actual: results.len(),
+            }
+        );
+        results.remove(0).with_context(|_| ExecuteSqlSnafu {
+            sql: sql.to_string(),
+        })
     }
 
     #[test]
@@ -1588,12 +1839,12 @@ mod tests {
     }
 
     #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn test_insert_select_is_visible_in_show_processlist() {
+    async fn test_insert_select_is_visible_in_show_processlist() -> TestResult<()> {
         let insert_sql = "INSERT INTO target SELECT * FROM source";
         let (started_tx, mut started_rx) = mpsc::unbounded_channel();
         let (finish_tx, finish_rx) = oneshot::channel();
         let interceptor = Arc::new(BlockingInsertSelectInterceptor::new(started_tx, finish_rx));
-        let instance = Arc::new(test_instance_with_insert_select_interceptor(interceptor).await);
+        let instance = Arc::new(test_instance_with_insert_select_interceptor(interceptor).await?);
 
         let insert_task = tokio::spawn({
             let instance = instance.clone();
@@ -1602,20 +1853,77 @@ mod tests {
 
         tokio::time::timeout(Duration::from_secs(5), started_rx.recv())
             .await
-            .unwrap()
-            .unwrap();
+            .context(InsertStartTimeoutSnafu)?
+            .context(InsertStartChannelClosedSnafu)?;
 
-        let output = execute_one_sql(&instance, "SHOW PROCESSLIST", test_query_ctx(43))
-            .await
-            .unwrap();
+        let output = execute_one_sql(&instance, "SHOW PROCESSLIST", test_query_ctx(43)).await?;
         let process_list = output.data.pretty_print().await;
         assert!(
             process_list.contains(insert_sql),
             "process list did not contain running insert:\n{process_list}"
         );
 
-        finish_tx.send(()).unwrap();
-        insert_task.await.unwrap().unwrap();
+        finish_tx
+            .send(())
+            .map_err(|_| ReleaseBlockedInsertSnafu.build())?;
+        insert_task.await.context(InsertTaskPanicSnafu)??;
+
+        Ok(())
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn test_kill_query_cancels_insert_select() -> TestResult<()> {
+        assert_kill_cancels_insert_select("KILL QUERY 4242").await
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn test_kill_process_id_cancels_insert_select() -> TestResult<()> {
+        assert_kill_cancels_insert_select("KILL 'test-frontend/4242'").await
+    }
+
+    async fn assert_kill_cancels_insert_select(kill_sql: &str) -> TestResult<()> {
+        let insert_sql = "INSERT INTO target SELECT * FROM source";
+        let (source_polled_tx, source_polled_rx) = oneshot::channel();
+        let instance = Arc::new(
+            test_instance_with_tables(
+                pending_table(1024, "source", source_polled_tx)?,
+                test_table(1025, "target")?,
+            )
+            .await?,
+        );
+
+        let insert_task = tokio::spawn({
+            let instance = instance.clone();
+            async move { execute_one_sql(&instance, insert_sql, test_query_ctx(4242)).await }
+        });
+
+        tokio::time::timeout(Duration::from_secs(5), source_polled_rx)
+            .await
+            .context(SourcePollTimeoutSnafu)?
+            .context(SourcePollChannelClosedSnafu)?;
+
+        let output = execute_one_sql(&instance, kill_sql, test_query_ctx(43)).await?;
+        assert!(matches!(output.data, OutputData::AffectedRows(1)));
+
+        let insert_result = tokio::time::timeout(Duration::from_secs(5), insert_task)
+            .await
+            .context(InsertTaskTimeoutSnafu)?
+            .context(InsertTaskPanicSnafu)?;
+        let err = match insert_result {
+            Ok(_) => return InsertSelectNotCancelledSnafu.fail(),
+            Err(TestError::ExecuteSql { source, .. }) => source,
+            Err(err) => return Err(err),
+        };
+        assert_eq!(StatusCode::Cancelled, err.status_code());
+
+        let output = execute_one_sql(&instance, "SHOW PROCESSLIST", test_query_ctx(43)).await?;
+        let process_list = output.data.pretty_print().await;
+        assert!(
+            !process_list.contains(insert_sql),
+            "process list still contains killed insert:\n{process_list}"
+        );
+
+        Ok(())
     }
 
     fn insert_dml_plan() -> LogicalPlan {
diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs
index 8d18293cb8..0ca1a2cf20 100644
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -121,8 +121,9 @@ impl GrpcQueryHandler for Instance {
                                 .context(PlanStatementSnafu)?;
 
                             let dummy_catalog_list =
-                                Arc::new(catalog::table_source::dummy_catalog::DummyCatalogList::new(
+                                Arc::new(catalog::table_source::dummy_catalog::DummyCatalogList::new_with_query_ctx(
                                     self.catalog_manager().clone(),
+                                    ctx.clone(),
                                 ));
 
                             let logical_plan = plan_decoder
@@ -416,10 +417,12 @@ impl Instance {
             .new_plan_decoder()
             .context(PlanStatementSnafu)?;
 
-        let dummy_catalog_list =
-            Arc::new(catalog::table_source::dummy_catalog::DummyCatalogList::new(
+        let dummy_catalog_list = Arc::new(
+            catalog::table_source::dummy_catalog::DummyCatalogList::new_with_query_ctx(
                 self.catalog_manager().clone(),
-            ));
+                ctx.clone(),
+            ),
+        );
 
         // no optimize yet since we still need to add stuff
         let logical_plan = plan_decoder
diff --git a/src/index/Cargo.toml b/src/index/Cargo.toml
index 3b78f7d22f..167c1c0df1 100644
--- a/src/index/Cargo.toml
+++ b/src/index/Cargo.toml
@@ -26,7 +26,7 @@ fst.workspace = true
 futures.workspace = true
 greptime-proto.workspace = true
 itertools.workspace = true
-jieba-rs = "0.8"
+jieba-rs = "0.10"
 lazy_static.workspace = true
 mockall.workspace = true
 nalgebra.workspace = true
@@ -40,8 +40,8 @@ serde.workspace = true
 serde_json.workspace = true
 snafu.workspace = true
 store-api.workspace = true
-tantivy = { version = "0.24", features = ["zstd-compression"] }
-tantivy-jieba = "0.16"
+tantivy = { version = "0.26", features = ["zstd-compression"] }
+tantivy-jieba = "0.20"
 tokio.workspace = true
 tokio-util.workspace = true
 usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
diff --git a/src/index/benches/tokenizer_bench.rs b/src/index/benches/tokenizer_bench.rs
index e365c884b2..f376fe57d7 100644
--- a/src/index/benches/tokenizer_bench.rs
+++ b/src/index/benches/tokenizer_bench.rs
@@ -12,8 +12,86 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
+use std::collections::HashMap;
+use std::hint::black_box;
+use std::path::PathBuf;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use futures::AsyncRead;
+use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
+use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
+use index::fulltext_index::{Analyzer, Config};
+use puffin::puffin_manager::{PuffinWriter, PutOptions};
+
+/// Named Chinese text samples, as `(name, text)` pairs, used as inputs for the
+/// Chinese tokenizer benchmarks.
+const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
+    ("short", "登录手机号。中国农业银行。"),
+    (
+        "mixed_log",
+        "2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
+    ),
+    (
+        "product_search",
+        "哈基米哦南北绿豆，噢马自立曼波。装电视台，中国中央广播电视台。压不缩，笑不活。",
+    ),
+    (
+        "long_news",
+        "中国农业银行发布公告称，手机银行登录服务完成升级。多个地区用户反馈查询速度提升，后台监控显示核心链路延迟下降，异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
+    ),
+];
+
+/// Documents pushed, in round-robin order, into the index built by the Tantivy
+/// Chinese full-text index benchmark.
+const CHINESE_INDEX_DOCS: &[&str] = &[
+    "登录手机号，中国农业银行手机银行接口返回成功。",
+    "用户登录失败，trace_id=abc_123，dynamic_key=mobile_login。",
+    "中国中央广播电视台发布新的节目预告。",
+    "装电视台的时候遇到压不缩的问题。",
+    "哈基米哦南北绿豆，噢马自立曼波。",
+    "后台监控显示核心链路延迟下降。",
+    "系统保留 request_id 用于排查问题。",
+    "中文全文索引需要兼顾召回率和 token 数量。",
+];
+
+/// A [`PuffinWriter`] stub that stores nothing: `put_dir` and `finish` return
+/// `Ok(0)`, and `put_blob` panics because the benchmark is only expected to
+/// write directory blobs.
+struct NoopPuffinWriter;
+
+#[async_trait]
+impl PuffinWriter for NoopPuffinWriter {
+    async fn put_blob<R>(
+        &mut self,
+        _key: &str,
+        _raw_data: R,
+        _options: PutOptions,
+        _properties: HashMap<String, String>,
+    ) -> puffin::error::Result<u64>
+    where
+        R: AsyncRead + Send,
+    {
+        unreachable!("tantivy fulltext benchmark only writes directory blobs")
+    }
+
+    async fn put_dir(
+        &mut self,
+        _key: &str,
+        _dir: PathBuf,
+        _options: PutOptions,
+        _properties: HashMap<String, String>,
+    ) -> puffin::error::Result<u64> {
+        Ok(0)
+    }
+
+    fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
+
+    async fn finish(self) -> puffin::error::Result<u64> {
+        Ok(0)
+    }
+}
 
 fn bench_english_tokenizer(c: &mut Criterion) {
     let tokenizer = EnglishTokenizer;
@@ -86,5 +157,116 @@ fn bench_english_tokenizer(c: &mut Criterion) {
     repeat_group.finish();
 }
 
-criterion_group!(benches, bench_english_tokenizer);
+/// Benchmarks `ChineseTokenizer::tokenize` on each sample in
+/// `CHINESE_TOKENIZER_TEXTS`, then on the `mixed_log` sample repeated 10, 100,
+/// and 1000 times per iteration.
+fn bench_chinese_tokenizer(c: &mut Criterion) {
+    let tokenizer = ChineseTokenizer;
+    let mut group = c.benchmark_group("chinese_tokenizer");
+
+    for (name, text) in CHINESE_TOKENIZER_TEXTS {
+        // Throughput is reported in bytes of UTF-8 input, not in characters.
+        group.throughput(Throughput::Bytes(text.len() as u64));
+        group.bench_with_input(BenchmarkId::new("tokenize", name), text, |b, text| {
+            b.iter(|| black_box(tokenizer.tokenize(black_box(text))))
+        });
+    }
+
+    group.finish();
+
+    let mut repeat_group = c.benchmark_group("chinese_tokenizer_repeated");
+    let sample_text = CHINESE_TOKENIZER_TEXTS
+        .iter()
+        .find(|(name, _)| *name == "mixed_log")
+        .map(|(_, text)| *text)
+        .expect("mixed_log sample must exist");
+
+    for repeat_count in [10, 100, 1000] {
+        repeat_group.bench_with_input(
+            BenchmarkId::new("repeated_tokenize", repeat_count),
+            &repeat_count,
+            |b, &repeat_count| {
+                b.iter(|| {
+                    for _ in 0..repeat_count {
+                        black_box(tokenizer.tokenize(black_box(sample_text)));
+                    }
+                })
+            },
+        );
+    }
+
+    repeat_group.finish();
+}
+
+/// Benchmarks building and committing a Tantivy full-text index with the
+/// Chinese analyzer for 32 and 256 documents.
+///
+/// Each iteration gets a fresh temp directory from the untimed setup closure;
+/// the timed routine creates the index, pushes the documents, and finishes it
+/// into a `NoopPuffinWriter`.
+fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("failed to create Tokio runtime");
+    let config = Config {
+        analyzer: Analyzer::Chinese,
+        case_sensitive: false,
+    };
+    let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
+    group.sample_size(10);
+    group.measurement_time(Duration::from_secs(10));
+
+    for doc_count in [32usize, 256usize] {
+        group.throughput(Throughput::Elements(doc_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("build_commit", doc_count),
+            &doc_count,
+            |b, &doc_count| {
+                b.iter_batched(
+                    tempfile::tempdir,
+                    |dir| {
+                        let dir = dir.expect("failed to create temp dir");
+                        runtime.block_on(async {
+                            // `64 << 20` is 64 MiB. NOTE(review): presumably the index
+                            // writer memory limit — confirm against the creator's signature.
+                            let mut creator =
+                                TantivyFulltextIndexCreator::new(dir.path(), config, 64 << 20)
+                                    .await
+                                    .expect("failed to create tantivy fulltext index");
+                            for idx in 0..doc_count {
+                                let text = CHINESE_INDEX_DOCS[idx % CHINESE_INDEX_DOCS.len()];
+                                creator
+                                    .push_text(black_box(text))
+                                    .await
+                                    .expect("failed to push text");
+                            }
+                            let mut puffin_writer = NoopPuffinWriter;
+                            creator
+                                .finish(
+                                    &mut puffin_writer,
+                                    "tantivy_chinese_fulltext_index",
+                                    PutOptions::default(),
+                                )
+                                .await
+                                .expect("failed to commit tantivy fulltext index");
+                        });
+                        // Return the temp dir so Criterion drops it after timing the routine.
+                        dir
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_english_tokenizer,
+    bench_chinese_tokenizer,
+    bench_tantivy_chinese_fulltext_index
+);
 criterion_main!(benches);
diff --git a/src/index/src/fulltext_index.rs b/src/index/src/fulltext_index.rs
index 8de28c0490..06a36f65a8 100644
--- a/src/index/src/fulltext_index.rs
+++ b/src/index/src/fulltext_index.rs
@@ -52,7 +52,7 @@ impl Config {
     fn build_tantivy_tokenizer(&self) -> TokenizerManager {
         let mut builder = match self.analyzer {
             Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
-            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
+            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
         };
 
         if !self.case_sensitive {
diff --git a/src/index/src/fulltext_index/tokenizer.rs b/src/index/src/fulltext_index/tokenizer.rs
index 919c497317..3afc826e6f 100644
--- a/src/index/src/fulltext_index/tokenizer.rs
+++ b/src/index/src/fulltext_index/tokenizer.rs
@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
             let mut tokens = JIEBA
                 .cut_for_search(text, true)
                 .into_iter()
-                .filter(|s| is_indexable_token(s))
+                .map(|token| token.word)
+                .filter(|token| is_indexable_token(token))
                 .collect::<Vec<_>>();
 
             let english = EnglishTokenizer {};
@@ -336,10 +337,26 @@ mod tests {
         let text = "哈基米哦南北绿豆，噢马自立曼波。登录手机号。中国农业银行。装电视台，中国中央广播电视台。压不缩，笑不活。";
 
         let default_tokens = tokenizer.tokenize(text);
-        let cut_hmm_false = JIEBA.cut(text, false);
-        let cut_hmm_true = JIEBA.cut(text, true);
-        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
-        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
+        let cut_hmm_false = JIEBA
+            .cut(text, false)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_hmm_true = JIEBA
+            .cut(text, true)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_for_search_hmm_false = JIEBA
+            .cut_for_search(text, false)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_for_search_hmm_true = JIEBA
+            .cut_for_search(text, true)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
 
         assert_eq!(
             default_tokens,
diff --git a/src/meta-srv/src/cache_invalidator.rs b/src/meta-srv/src/cache_invalidator.rs
index b594d65f48..f6ec0b4fc9 100644
--- a/src/meta-srv/src/cache_invalidator.rs
+++ b/src/meta-srv/src/cache_invalidator.rs
@@ -84,4 +84,11 @@ impl CacheInvalidator for MetasrvCacheInvalidator {
         let instruction = Instruction::InvalidateCaches(caches.to_vec());
         self.broadcast(ctx, instruction).await
     }
+
+    /// No-op: this invalidator has nothing to broadcast for a global invalidation.
+    fn invalidate_all(&self) -> MetaResult<()> {
+        // Only concrete cache identifiers are broadcast to remote nodes, and the heartbeat
+        // instruction protocol has no global invalidate-all message that could be sent here.
+        Ok(())
+    }
 }
diff --git a/src/meta-srv/src/handler.rs b/src/meta-srv/src/handler.rs
index 4b05db4e4c..9cfd4e6079 100644
--- a/src/meta-srv/src/handler.rs
+++ b/src/meta-srv/src/handler.rs
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashSet};
 use std::fmt::{Debug, Display};
-use std::ops::Range;
+use std::ops::Bound;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 
@@ -136,6 +137,29 @@ pub struct PusherId {
     pub id: u64,
 }
 
+/// Two ids are equal when both the role (compared as `i32`) and the id match.
+impl PartialEq for PusherId {
+    fn eq(&self, other: &Self) -> bool {
+        self.role as i32 == other.role as i32 && self.id == other.id
+    }
+}
+
+impl Eq for PusherId {}
+
+impl PartialOrd for PusherId {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Orders by role (as `i32`) first and by id second, so all ids of one role are
+/// adjacent in an ordered collection such as a `BTreeMap`.
+impl Ord for PusherId {
+    fn cmp(&self, other: &Self) -> Ordering {
+        (self.role as i32, self.id).cmp(&(other.role as i32, other.id))
+    }
+}
+
 impl Debug for PusherId {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{:?}-{}", self.role, self.id)
@@ -153,8 +174,13 @@ impl PusherId {
         Self { role, id }
     }
 
-    pub fn string_key(&self) -> String {
-        format!("{}-{}", self.role as i32, self.id)
+    /// Returns the inclusive bounds that cover every [`PusherId`] of `role`, from
+    /// id `u64::MIN` to id `u64::MAX`, for use in ordered range queries.
+    fn role_range(role: Role) -> (Bound<Self>, Bound<Self>) {
+        (
+            Bound::Included(Self::new(role, u64::MIN)),
+            Bound::Included(Self::new(role, u64::MAX)),
+        )
     }
 }
 
@@ -214,7 +238,7 @@ impl Pusher {
 
 /// The group of heartbeat pushers.
 #[derive(Clone, Default)]
-pub struct Pushers(Arc<RwLock<BTreeMap<String, Pusher>>>);
+pub struct Pushers(Arc<RwLock<BTreeMap<PusherId, Pusher>>>);
 
 impl Pushers {
     async fn push(
@@ -222,11 +246,12 @@ impl Pushers {
         pusher_id: PusherId,
         mailbox_message: MailboxMessage,
     ) -> Result<DeregisterSignalReceiver> {
-        let pusher_id = pusher_id.string_key();
         let pushers = self.0.read().await;
         let pusher = pushers
             .get(&pusher_id)
-            .context(error::PusherNotFoundSnafu { pusher_id })?;
+            .with_context(|| error::PusherNotFoundSnafu {
+                pusher_id: pusher_id.to_string(),
+            })?;
 
         pusher
             .push(HeartbeatResponse {
@@ -239,14 +264,10 @@ impl Pushers {
         Ok(pusher.deregister_signal_receiver.clone())
     }
 
-    async fn broadcast(
-        &self,
-        range: Range<String>,
-        mailbox_message: &MailboxMessage,
-    ) -> Result<()> {
+    async fn broadcast(&self, role: Role, mailbox_message: &MailboxMessage) -> Result<()> {
         let pushers = self.0.read().await;
         let pushers = pushers
-            .range(range)
+            .range(PusherId::role_range(role))
             .map(|(_, value)| value)
             .collect::<Vec<_>>();
         let mut results = Vec::with_capacity(pushers.len());
@@ -271,21 +292,12 @@ impl Pushers {
         Ok(())
     }
 
-    pub(crate) async fn insert(&self, pusher_id: String, pusher: Pusher) -> Option<Pusher> {
+    pub(crate) async fn insert(&self, pusher_id: PusherId, pusher: Pusher) -> Option<Pusher> {
         self.0.write().await.insert(pusher_id, pusher)
     }
 
-    async fn remove(&self, pusher_id: &str) -> Option<Pusher> {
-        self.0.write().await.remove(pusher_id)
-    }
-
-    pub(crate) async fn clear(&self) -> Vec<String> {
-        let mut pushers = self.0.write().await;
-        let keys = pushers.keys().cloned().collect::<Vec<_>>();
-        if !keys.is_empty() {
-            pushers.clear();
-        }
-        keys
+    async fn remove(&self, pusher_id: PusherId) -> Option<Pusher> {
+        self.0.write().await.remove(&pusher_id)
     }
 }
 
@@ -317,17 +329,24 @@ impl HeartbeatHandlerGroup {
     pub async fn register_pusher(&self, pusher_id: PusherId, pusher: Pusher) {
         METRIC_META_HEARTBEAT_CONNECTION_NUM.inc();
         info!("Pusher register: {}", pusher_id);
-        let _ = self.pushers.insert(pusher_id.string_key(), pusher).await;
+        let _ = self.pushers.insert(pusher_id, pusher).await;
     }
 
     /// Deregisters the heartbeat response [`Pusher`] with the given key from the group.
     pub async fn deregister_push(&self, pusher_id: PusherId) {
-        info!("Pusher unregister: {}", pusher_id);
-        if self.pushers.remove(&pusher_id.string_key()).await.is_some() {
+        if self.pushers.remove(pusher_id).await.is_some() {
+            info!("Pusher unregister: {}", pusher_id);
             METRIC_META_HEARTBEAT_CONNECTION_NUM.dec();
         }
     }
 
+    /// Test-only: returns whether the group holds a heartbeat response [`Pusher`] for `pusher_id`.
+    #[cfg(test)]
+    pub async fn contains_pusher(&self, pusher_id: &PusherId) -> bool {
+        let pushers = self.pushers.0.read().await;
+        pushers.contains_key(pusher_id)
+    }
+
     /// Returns the [`Pushers`] of the group.
     pub fn pushers(&self) -> Pushers {
         self.pushers.clone()
@@ -533,7 +552,7 @@ impl Mailbox for HeartbeatMailbox {
     }
 
     async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()> {
-        self.pushers.broadcast(ch.pusher_range(), msg).await
+        self.pushers.broadcast(ch.role(), msg).await
     }
 
     async fn on_recv(&self, id: MessageId, maybe_msg: Result<MailboxMessage>) -> Result<()> {
@@ -550,14 +569,6 @@ impl Mailbox for HeartbeatMailbox {
 
         Ok(())
     }
-
-    async fn reset(&self) {
-        let keys = self.pushers.clear().await;
-        if !keys.is_empty() {
-            info!("Reset mailbox, deregister pushers: {:?}", keys);
-            METRIC_META_HEARTBEAT_CONNECTION_NUM.sub(keys.len() as i64);
-        }
-    }
 }
 
 /// The builder to build the group of heartbeat handlers.
@@ -871,6 +882,7 @@ impl HeartbeatHandlerGroupBuilderCustomizer for DefaultHeartbeatHandlerGroupBuil
 mod tests {
 
     use std::assert_matches;
+    use std::collections::BTreeMap;
     use std::sync::Arc;
     use std::time::Duration;
 
@@ -946,6 +958,66 @@ mod tests {
         (mailbox, receiver)
     }
 
+    #[test]
+    fn test_pusher_id_role_range() {
+        let mut pushers = BTreeMap::new();
+        // Datanode/Flownode entries at the extreme ids are boundary cases that must be excluded.
+        pushers.insert(PusherId::new(Role::Datanode, u64::MAX), "datanode");
+        pushers.insert(PusherId::new(Role::Frontend, u64::MIN), "frontend-min");
+        pushers.insert(PusherId::new(Role::Frontend, u64::MAX), "frontend-max");
+        pushers.insert(PusherId::new(Role::Flownode, u64::MIN), "flownode");
+
+        let frontend_pushers = pushers
+            .range(PusherId::role_range(Role::Frontend))
+            .map(|(_, value)| *value)
+            .collect::<Vec<_>>();
+
+        assert_eq!(frontend_pushers, vec!["frontend-min", "frontend-max"]);
+    }
+
+    #[tokio::test]
+    async fn test_pushers_broadcast_by_role() {
+        let pushers = Pushers::default();
+        let (datanode_tx, mut datanode_rx) = mpsc::channel(1);
+        let (frontend_tx, mut frontend_rx) = mpsc::channel(1);
+        let (flownode_tx, mut flownode_rx) = mpsc::channel(1);
+
+        pushers
+            .insert(
+                PusherId::new(Role::Datanode, u64::MAX),
+                Pusher::new(datanode_tx),
+            )
+            .await;
+        pushers
+            .insert(PusherId::new(Role::Frontend, 1), Pusher::new(frontend_tx))
+            .await;
+        pushers
+            .insert(
+                PusherId::new(Role::Flownode, u64::MIN),
+                Pusher::new(flownode_tx),
+            )
+            .await;
+
+        let msg = MailboxMessage {
+            id: 42,
+            subject: "broadcast-test".to_string(),
+            timestamp_millis: 123,
+            ..Default::default()
+        };
+
+        pushers.broadcast(Role::Frontend, &msg).await.unwrap();
+
+        let received = frontend_rx.recv().await.unwrap().unwrap();
+        let mailbox_message = received.mailbox_message.unwrap();
+        // NOTE(review): the message was built with id 42 but is expected to arrive with
+        // id 0; presumably the broadcast path resets the id — confirm in the push code.
+        assert_eq!(mailbox_message.id, 0);
+        assert_eq!(mailbox_message.subject, "broadcast-test");
+        // Pushers registered under other roles must not receive the broadcast.
+        assert!(datanode_rx.try_recv().is_err());
+        assert!(flownode_rx.try_recv().is_err());
+    }
+
     #[test]
     fn test_handler_group_builder() {
         let group = HeartbeatHandlerGroupBuilder::new(Pushers::default())
diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs
index df2a3a35b8..f7f5bbf77d 100644
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -512,7 +512,6 @@ pub struct MetaStateHandler {
     greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
     leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
     leadership_change_notifier: LeadershipChangeNotifier,
-    mailbox: MailboxRef,
     state: StateRef,
 }
 
@@ -536,9 +535,6 @@ impl MetaStateHandler {
     pub async fn on_leader_stop(&self) {
         self.state.write().unwrap().next_state(become_follower());
 
-        // Enforces the mailbox to clear all pushers.
-        // The remaining heartbeat connections will be closed by the remote peer or keep-alive detection.
-        self.mailbox.reset().await;
         self.leadership_change_notifier
             .notify_on_leader_stop()
             .await;
@@ -667,7 +663,6 @@ impl Metasrv {
                 state: self.state.clone(),
                 leader_cached_kv_backend: leader_cached_kv_backend.clone(),
                 leadership_change_notifier,
-                mailbox: self.mailbox.clone(),
             };
             let _handle = common_runtime::spawn_global(async move {
                 loop {
diff --git a/src/meta-srv/src/procedure/repartition.rs b/src/meta-srv/src/procedure/repartition.rs
index f314a40080..c1819cb364 100644
--- a/src/meta-srv/src/procedure/repartition.rs
+++ b/src/meta-srv/src/procedure/repartition.rs
@@ -20,6 +20,7 @@ pub mod group;
 pub mod plan;
 pub mod repartition_end;
 pub mod repartition_start;
+pub mod update_partition_metadata;
 pub mod utils;
 
 use std::any::Any;
@@ -32,7 +33,7 @@ use common_meta::cache_invalidator::CacheInvalidatorRef;
 use common_meta::ddl::DdlContext;
 use common_meta::ddl::allocator::region_routes::RegionRoutesAllocatorRef;
 use common_meta::ddl::allocator::wal_options::WalOptionsAllocatorRef;
-use common_meta::ddl_manager::RepartitionProcedureFactory;
+use common_meta::ddl_manager::{RepartitionProcedureFactory, RepartitionSource};
 use common_meta::instruction::CacheIdent;
 use common_meta::key::datanode_table::RegionInfo;
 use common_meta::key::table_info::TableInfoValue;
@@ -62,7 +63,8 @@ use crate::procedure::repartition::group::{
     Context as RepartitionGroupContext, RepartitionGroupProcedure, region_routes,
 };
 use crate::procedure::repartition::plan::RepartitionPlanEntry;
-use crate::procedure::repartition::repartition_start::RepartitionStart;
+use crate::procedure::repartition::repartition_start::{RepartitionFrom, RepartitionStart};
+use crate::procedure::repartition::update_partition_metadata::PartitionMetadataUpdate;
 use crate::procedure::repartition::utils::{
     get_datanode_table_value, rollback_group_metadata_routes,
 };
@@ -93,6 +95,9 @@ pub struct PersistentContext {
     /// The timeout for repartition operations.
     #[serde(with = "humantime_serde", default = "default_timeout")]
     pub timeout: Duration,
+    /// Table-level partition metadata added by this repartition (`None` when absent).
+    #[serde(default)]
+    pub partition_metadata_update: Option<PartitionMetadataUpdate>,
 }
 
 fn default_timeout() -> Duration {
@@ -121,6 +126,7 @@ impl PersistentContext {
             failed_procedures: vec![],
             unknown_procedures: vec![],
             timeout: timeout.unwrap_or_else(default_timeout),
+            partition_metadata_update: None,
         }
     }
 
@@ -317,7 +323,9 @@ impl Context {
     ///
     /// Abort:
     /// - Table info not found.
-    pub async fn get_table_info_value(&self) -> Result<TableInfoValue> {
+    pub async fn get_raw_table_info_value(
+        &self,
+    ) -> Result<DeserializedValueWithBytes<TableInfoValue>> {
         let table_id = self.persistent_ctx.table_id;
         let table_info_value = self
             .table_metadata_manager
@@ -328,11 +336,45 @@ impl Context {
             .with_context(|_| error::RetryLaterWithSourceSnafu {
                 reason: format!("Failed to get table info for table: {}", table_id),
             })?
-            .context(error::TableInfoNotFoundSnafu { table_id })?
-            .into_inner();
+            .context(error::TableInfoNotFoundSnafu { table_id })?;
+
         Ok(table_info_value)
     }
 
+    /// Returns the table info value without the `DeserializedValueWithBytes` wrapper.
+    ///
+    /// Fails in the same cases as [`Self::get_raw_table_info_value`].
+    pub async fn get_table_info_value(&self) -> Result<TableInfoValue> {
+        let table_info_value = self.get_raw_table_info_value().await?.into_inner();
+        Ok(table_info_value)
+    }
+
+    /// Updates the table info.
+    ///
+    /// Only `new_table_info_value.table_info` is written; it is forwarded to the
+    /// table metadata manager together with `current_table_info_value`.
+    ///
+    /// Retry:
+    /// - Failed to update table info.
+    pub async fn update_table_info(
+        &self,
+        current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
+        new_table_info_value: TableInfoValue,
+    ) -> Result<()> {
+        let table_id = self.persistent_ctx.table_id;
+        self.table_metadata_manager
+            .update_table_info(
+                current_table_info_value,
+                None,
+                new_table_info_value.table_info,
+            )
+            .await
+            .map_err(BoxedError::new)
+            .with_context(|_| error::RetryLaterWithSourceSnafu {
+                reason: format!("Failed to update table info for table: {}", table_id),
+            })
+    }
+
     /// Updates the table route.
     ///
     /// Retry:
@@ -469,12 +502,8 @@ struct RepartitionDataOwned {
 impl RepartitionProcedure {
     const TYPE_NAME: &'static str = "metasrv-procedure::Repartition";
 
-    pub fn new(
-        from_exprs: Vec<PartitionExpr>,
-        to_exprs: Vec<PartitionExpr>,
-        context: Context,
-    ) -> Self {
-        let state = Box::new(RepartitionStart::new(from_exprs, to_exprs));
+    pub fn new(from: RepartitionFrom, to_exprs: Vec<PartitionExpr>, context: Context) -> Self {
+        let state = Box::new(RepartitionStart::new(from, to_exprs));
 
         Self { state, context }
     }
@@ -492,24 +521,24 @@ impl RepartitionProcedure {
         Ok(Self { state, context })
     }
 
-    /// Returns whether parent rollback should remove this repartition's allocated regions.
+    /// Returns whether parent rollback should run.
     ///
-    /// This uses an "after AllocateRegion" semantic: once execution reaches
-    /// `AllocateRegion` or any later state, rollback must try to remove this round's
-    /// `allocated_region_ids` from table-route metadata when they exist.
-    ///
-    /// State flow:
-    /// `RepartitionStart -> AllocateRegion -> Dispatch -> Collect -> DeallocateRegion -> RepartitionEnd`
-    ///                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    ///                     rollback allocated regions in metadata
+    /// This uses an "after repartition metadata update" semantic: once execution
+    /// reaches `UpdatePartitionMetadata` or any later rollback-active state,
+    /// rollback must try to clean metadata written by the repartition procedure.
     ///
     /// Notes:
-    /// - `RepartitionStart`: no-op, because allocation has not happened yet.
-    /// - `AllocateRegion` / `Dispatch` / `Collect`  rollback-active.
+    /// - `RepartitionStart`: no-op, because no metadata has been updated yet.
+    /// - `UpdatePartitionMetadata`: rollback table partition metadata.
+    /// - `AllocateRegion` / `Dispatch` / `Collect`: rollback table partition metadata
+    ///   and allocated region metadata.
     /// - `DeallocateRegion`: is not rollback-active.
     /// - `RepartitionEnd`: no-op.
-    fn should_rollback_allocated_regions(&self) -> bool {
-        self.state.as_any().is::<allocate_region::AllocateRegion>()
+    fn should_rollback(&self) -> bool {
+        self.state
+            .as_any()
+            .is::<update_partition_metadata::UpdatePartitionMetadata>()
+            || self.state.as_any().is::<allocate_region::AllocateRegion>()
             || self.state.as_any().is::<dispatch::Dispatch>()
             || self.state.as_any().is::<collect::Collect>()
     }
@@ -526,7 +555,7 @@ impl RepartitionProcedure {
 
     /// Returns allocated region ids that parent rollback should remove.
     ///
-    /// Rollback uses an "after AllocateRegion" semantic:
+    /// Rollback uses an "after region allocation" semantic:
     /// - in `AllocateRegion` and `Dispatch`, all allocated regions belong to the
     ///   current repartition attempt and must be cleaned up.
     /// - in `Collect`, only the plans referenced by failed or unknown
@@ -586,8 +615,52 @@ impl RepartitionProcedure {
         Ok(())
     }
 
+    /// Removes the partition key indices recorded in
+    /// `persistent_ctx.partition_metadata_update` from the table info.
+    ///
+    /// Does nothing when no update was recorded, when the recorded index list is
+    /// empty, or when none of the recorded indices is present in the table info.
+    async fn rollback_partition_metadata(&mut self) -> Result<()> {
+        let Some(update) = self
+            .context
+            .persistent_ctx
+            .partition_metadata_update
+            .as_ref()
+        else {
+            return Ok(());
+        };
+        if update.partition_key_indices.is_empty() {
+            return Ok(());
+        }
+
+        let table_info_value = self.context.get_raw_table_info_value().await?;
+        let mut new_partition_key_indices = table_info_value
+            .table_info
+            .meta
+            .partition_key_indices
+            .clone();
+        new_partition_key_indices.retain(|idx| !update.partition_key_indices.contains(idx));
+        if new_partition_key_indices == table_info_value.table_info.meta.partition_key_indices {
+            return Ok(());
+        }
+
+        let mut new_table_info = table_info_value.table_info.clone();
+        new_table_info.meta.partition_key_indices = new_partition_key_indices;
+        self.context
+            .update_table_info(&table_info_value, table_info_value.update(new_table_info))
+            .await?;
+
+        // Do not invalidate the table cache here. The table routes may still
+        // contain partition expressions until `rollback_inner` rolls them back.
+        // Exposing cleared partition columns with partitioned routes can build
+        // an inconsistent partition rule. The cache is invalidated once after
+        // both partition metadata and routes are rolled back.
+
+        Ok(())
+    }
+
     async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> {
-        if !self.should_rollback_allocated_regions() {
+        if !self.should_rollback() {
             return Ok(());
         }
 
@@ -596,6 +664,8 @@ impl RepartitionProcedure {
 
         let table_lock = TableLock::Write(table_id).into();
         let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
+
+        self.rollback_partition_metadata().await?;
         let table_route_value = self.context.get_table_route_value().await?;
         let original_region_routes = region_routes(table_id, table_route_value.get_inner_ref())?;
         let mut current_region_routes = original_region_routes.clone();
@@ -738,20 +808,28 @@ impl RepartitionProcedureFactory for DefaultRepartitionProcedureFactory {
         ddl_ctx: &DdlContext,
         table_name: TableName,
         table_id: TableId,
-        from_exprs: Vec<String>,
+        source: RepartitionSource,
         to_exprs: Vec<String>,
         timeout: Option<Duration>,
     ) -> std::result::Result<BoxedProcedure, BoxedError> {
         let persistent_ctx = PersistentContext::new(table_name, table_id, timeout);
-        let from_exprs = from_exprs
-            .iter()
-            .map(|e| {
-                PartitionExpr::from_json_str(e)
-                    .context(error::DeserializePartitionExprSnafu)?
-                    .context(error::EmptyPartitionExprSnafu)
-            })
-            .collect::<Result<Vec<_>>>()
-            .map_err(BoxedError::new)?;
+        let from = match source {
+            RepartitionSource::Partitioned { exprs } => {
+                let exprs = exprs
+                    .iter()
+                    .map(|e| {
+                        PartitionExpr::from_json_str(e)
+                            .context(error::DeserializePartitionExprSnafu)?
+                            .context(error::EmptyPartitionExprSnafu)
+                    })
+                    .collect::<Result<Vec<_>>>()
+                    .map_err(BoxedError::new)?;
+                RepartitionFrom::Partitioned { exprs }
+            }
+            RepartitionSource::Unpartitioned { partition_columns } => {
+                RepartitionFrom::Unpartitioned { partition_columns }
+            }
+        };
         let to_exprs = to_exprs
             .iter()
             .map(|e| {
@@ -763,7 +841,7 @@ impl RepartitionProcedureFactory for DefaultRepartitionProcedureFactory {
             .map_err(BoxedError::new)?;
 
         let procedure = RepartitionProcedure::new(
-            from_exprs,
+            from,
             to_exprs,
             Context::new(
                 ddl_ctx,
@@ -853,27 +931,30 @@ mod tests {
     use crate::procedure::repartition::deallocate_region::DeallocateRegion;
     use crate::procedure::repartition::dispatch::Dispatch;
     use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
     use crate::procedure::repartition::repartition_end::RepartitionEnd;
     use crate::procedure::repartition::test_util::{
         TestingEnv, assert_parent_state, current_parent_region_routes, extract_subprocedure_ids,
         new_parent_context, procedure_context_with_receivers, procedure_state_receiver, range_expr,
         test_region_route, test_region_wal_options,
     };
+    use crate::procedure::repartition::update_partition_metadata::{
+        PartitionMetadataUpdate, UpdatePartitionMetadata,
+    };
 
     fn test_plan(table_id: TableId) -> RepartitionPlanEntry {
         RepartitionPlanEntry {
             group_id: uuid::Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 1),
-                partition_expr: range_expr("x", 0, 100),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 1),
+                range_expr("x", 0, 100),
+            )],
             target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 1),
                     partition_expr: range_expr("x", 0, 50),
                 },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 3),
                     partition_expr: range_expr("x", 50, 100),
                 },
@@ -927,6 +1008,15 @@ mod tests {
             .unwrap()
     }
 
+    /// Loads the table info through `ctx` and returns its partition key indices.
+    ///
+    /// Test helper: panics (via `unwrap`) if the table info value cannot be fetched.
+    async fn table_partition_key_indices(ctx: &Context) -> Vec<usize> {
+        let table_info_value = ctx.get_table_info_value().await.unwrap();
+        let table_meta = table_info_value.table_info.meta;
+        table_meta.partition_key_indices
+    }
+
     fn test_procedure(state: Box<dyn State>, context: Context) -> RepartitionProcedure {
         RepartitionProcedure { state, context }
     }
@@ -965,34 +1055,43 @@ mod tests {
     }
 
     #[test]
-    fn test_should_rollback_allocated_regions() {
+    fn test_should_rollback_after_metadata_update() {
         let env = TestingEnv::new();
         let table_id = 1024;
 
         let procedure = test_procedure(
-            Box::new(RepartitionStart::new(vec![], vec![])),
+            Box::new(RepartitionStart::new(
+                RepartitionFrom::Partitioned { exprs: vec![] },
+                vec![],
+            )),
             test_context(&env, table_id),
         );
-        assert!(!procedure.should_rollback_allocated_regions());
+        assert!(!procedure.should_rollback());
+
+        let procedure = test_procedure(
+            Box::new(UpdatePartitionMetadata::new(vec![])),
+            test_context(&env, table_id),
+        );
+        assert!(procedure.should_rollback());
 
         let procedure = test_procedure(
             Box::new(AllocateRegion::new(vec![])),
             test_context(&env, table_id),
         );
-        assert!(procedure.should_rollback_allocated_regions());
+        assert!(procedure.should_rollback());
 
         let procedure = test_procedure(Box::new(Dispatch), test_context(&env, table_id));
-        assert!(procedure.should_rollback_allocated_regions());
+        assert!(procedure.should_rollback());
 
         let procedure =
             test_procedure(Box::new(Collect::new(vec![])), test_context(&env, table_id));
-        assert!(procedure.should_rollback_allocated_regions());
+        assert!(procedure.should_rollback());
 
         let procedure = test_procedure(Box::new(DeallocateRegion), test_context(&env, table_id));
-        assert!(!procedure.should_rollback_allocated_regions());
+        assert!(!procedure.should_rollback());
 
         let procedure = test_procedure(Box::new(RepartitionEnd), test_context(&env, table_id));
-        assert!(!procedure.should_rollback_allocated_regions());
+        assert!(!procedure.should_rollback());
     }
 
     #[test]
@@ -1048,6 +1147,68 @@ mod tests {
         );
     }
 
+    /// A serialized `PersistentContext` that omits `partition_metadata_update` must
+    /// still deserialize, with the missing field coming back as `None`.
+    #[test]
+    fn test_persistent_context_partition_metadata_update_serde_default() {
+        let json_without_update = r#"{
+            "catalog_name":"test_catalog",
+            "schema_name":"test_schema",
+            "table_name":"test_table",
+            "table_id":1024,
+            "plans":[],
+            "timeout":"120s"
+        }"#;
+        let decoded: PersistentContext = serde_json::from_str(json_without_update).unwrap();
+        assert!(decoded.partition_metadata_update.is_none());
+    }
+
+    /// Rolling back from `UpdatePartitionMetadata` must strip only the partition key
+    /// indices listed in `partition_metadata_update`, keeping the others intact.
+    #[tokio::test]
+    async fn test_repartition_rollback_removes_partition_metadata_indices() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let source_route = test_region_route(RegionId::new(table_id, 1), "");
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![source_route],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+
+        // Seed the stored table info with two partition key indices.
+        let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler));
+        let mut context = new_parent_context(&env, node_manager, table_id);
+        let stored = context.get_raw_table_info_value().await.unwrap();
+        let mut seeded_info = stored.table_info.clone();
+        seeded_info.meta.partition_key_indices = vec![0, 1];
+        let seeded_value = stored.update(seeded_info);
+        context
+            .update_table_info(&stored, seeded_value)
+            .await
+            .unwrap();
+
+        // Persist an update listing index 0; rollback is expected to drop exactly that index.
+        let recorded_update = PartitionMetadataUpdate {
+            partition_key_indices: vec![0],
+        };
+        context.persistent_ctx.partition_metadata_update = Some(recorded_update);
+        let rollback_state = Box::new(UpdatePartitionMetadata::new(vec![]));
+        let mut procedure = test_procedure(rollback_state, context);
+
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        // Index 1 is not listed in the recorded update and must survive the rollback.
+        assert_eq!(
+            table_partition_key_indices(&procedure.context).await,
+            vec![1]
+        );
+    }
+
     #[tokio::test]
     async fn test_repartition_rollback_removes_allocated_routes_from_dispatch() {
         let env = TestingEnv::new();
@@ -1209,16 +1370,16 @@ mod tests {
         );
         let succeeded_plan = RepartitionPlanEntry {
             group_id: Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 2),
-                partition_expr: range_expr("x", 100, 200),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 2),
+                range_expr("x", 100, 200),
+            )],
             target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 2),
                     partition_expr: range_expr("x", 100, 150),
                 },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 4),
                     partition_expr: range_expr("x", 150, 200),
                 },
@@ -1292,16 +1453,16 @@ mod tests {
         );
         let succeeded_plan = RepartitionPlanEntry {
             group_id: Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 2),
-                partition_expr: range_expr("x", 100, 200),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 2),
+                range_expr("x", 100, 200),
+            )],
             target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 2),
                     partition_expr: range_expr("x", 100, 150),
                 },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 4),
                     partition_expr: range_expr("x", 150, 200),
                 },
@@ -1567,16 +1728,16 @@ mod tests {
         let failed_merge_plan = RepartitionPlanEntry {
             group_id: Uuid::new_v4(),
             source_regions: vec![
-                RegionDescriptor {
-                    region_id: RegionId::new(table_id, 1),
-                    partition_expr: range_expr("x", 0, 100),
-                },
-                RegionDescriptor {
-                    region_id: RegionId::new(table_id, 2),
-                    partition_expr: range_expr("x", 100, 200),
-                },
+                SourceRegionDescriptor::partitioned(
+                    RegionId::new(table_id, 1),
+                    range_expr("x", 0, 100),
+                ),
+                SourceRegionDescriptor::partitioned(
+                    RegionId::new(table_id, 2),
+                    range_expr("x", 100, 200),
+                ),
             ],
-            target_regions: vec![RegionDescriptor {
+            target_regions: vec![TargetRegionDescriptor {
                 region_id: RegionId::new(table_id, 1),
                 partition_expr: range_expr("x", 0, 200),
             }],
@@ -1587,16 +1748,16 @@ mod tests {
         };
         let succeeded_split_plan = RepartitionPlanEntry {
             group_id: Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 3),
-                partition_expr: range_expr("x", 200, 300),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 3),
+                range_expr("x", 200, 300),
+            )],
             target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 3),
                     partition_expr: range_expr("x", 200, 250),
                 },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                     region_id: RegionId::new(table_id, 4),
                     partition_expr: range_expr("x", 250, 300),
                 },
@@ -1708,7 +1869,9 @@ mod tests {
 
         let context = new_parent_context(&env, node_manager, table_id);
         let mut procedure = RepartitionProcedure::new(
-            vec![range_expr("x", 0, 100)],
+            RepartitionFrom::Partitioned {
+                exprs: vec![range_expr("x", 0, 100)],
+            },
             vec![range_expr("x", 0, 50), range_expr("x", 50, 100)],
             context,
         );
@@ -1810,6 +1973,226 @@ mod tests {
         );
     }
 
+    #[tokio::test]
+    async fn test_repartition_procedure_flow_unpartitioned_failed_and_full_rollback() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        // Unpartitioned table: a single region route with an empty partition expression.
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        // Split into two `col1` ranges, declaring `col1` as the partition column.
+        let context = new_parent_context(&env, node_manager, table_id);
+        let to_exprs = vec![range_expr("col1", 0, 50), range_expr("col1", 50, 100)];
+        let mut procedure = RepartitionProcedure::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            to_exprs.clone(),
+            context,
+        );
+        // 1st execute: enters `UpdatePartitionMetadata`; the persistent context holds index 0.
+        let start_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(start_status.need_persist());
+        assert_parent_state::<UpdatePartitionMetadata>(&procedure);
+        assert_eq!(
+            procedure
+                .context
+                .persistent_ctx
+                .partition_metadata_update
+                .as_ref()
+                .unwrap()
+                .partition_key_indices,
+            vec![0]
+        );
+        // 2nd execute: enters `AllocateRegion`; the table info now carries index 0.
+        let update_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(update_status.need_persist());
+        assert_parent_state::<AllocateRegion>(&procedure);
+        assert_eq!(
+            table_partition_key_indices(&procedure.context).await,
+            vec![0]
+        );
+        // 3rd execute: stays in `AllocateRegion` with one plan that splits region 1 in two.
+        let build_allocate_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(build_allocate_status.need_persist());
+        assert_parent_state::<AllocateRegion>(&procedure);
+        assert_eq!(procedure.context.persistent_ctx.plans.len(), 1);
+        let plan = &procedure.context.persistent_ctx.plans[0];
+        assert_eq!(
+            plan.source_regions,
+            vec![SourceRegionDescriptor::Default {
+                region_id: RegionId::new(table_id, 1)
+            }]
+        );
+        assert_eq!(plan.target_regions.len(), 2);
+        assert_eq!(plan.target_regions[0].region_id, RegionId::new(table_id, 1));
+        assert_eq!(plan.target_regions[0].partition_expr, to_exprs[0]);
+        assert_eq!(
+            plan.allocated_region_ids,
+            vec![plan.target_regions[1].region_id]
+        );
+        assert!(plan.pending_deallocate_region_ids.is_empty());
+        assert_eq!(plan.transition_map, vec![vec![0, 1]]);
+        let target_regions = plan.target_regions.clone();
+        // 4th execute: enters `Dispatch`; region 1 keeps its empty expression for now.
+        let execute_allocate_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(execute_allocate_status.need_persist());
+        assert_parent_state::<Dispatch>(&procedure);
+        let region_routes = current_parent_region_routes(&procedure.context).await;
+        assert_eq!(region_routes.len(), 2);
+        assert_eq!(
+            region_route_by_id(&region_routes, target_regions[0].region_id)
+                .region
+                .partition_expr(),
+            ""
+        );
+        assert_eq!(
+            region_route_by_id(&region_routes, target_regions[1].region_id)
+                .region
+                .partition_expr(),
+            to_exprs[1].as_json_str().unwrap()
+        );
+        // 5th execute: spawns one group subprocedure and moves on to `Collect`.
+        let dispatch_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let subprocedure_ids = extract_subprocedure_ids(dispatch_status);
+        assert_eq!(subprocedure_ids.len(), 1);
+        assert_parent_state::<Collect>(&procedure);
+        // A failed subprocedure makes `Collect` return a non-retryable error and stay put.
+        let failed_state = ProcedureState::failed(Arc::new(ProcedureError::external(
+            MockError::new(StatusCode::Internal),
+        )));
+        let collect_ctx = procedure_context_with_receivers(HashMap::from([(
+            subprocedure_ids[0],
+            procedure_state_receiver(failed_state),
+        )]));
+        let err = procedure.execute(&collect_ctx).await.unwrap_err();
+        assert!(!err.is_retry_later());
+        assert_parent_state::<Collect>(&procedure);
+        // Roll back the parent procedure after the failure.
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        // Rollback must clear the partition key indices and restore the single original route.
+        assert!(
+            table_partition_key_indices(&procedure.context)
+                .await
+                .is_empty()
+        );
+        assert_eq!(
+            current_parent_region_routes(&procedure.context).await,
+            vec![test_region_route(RegionId::new(table_id, 1), "")]
+        );
+    }
+
+    #[tokio::test]
+    async fn test_repartition_procedure_flow_unpartitioned_rollback_is_idempotent() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        // Unpartitioned table: a single region route with an empty partition expression.
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        // Split into two `col1` ranges, declaring `col1` as the partition column.
+        let context = new_parent_context(&env, node_manager, table_id);
+        let mut procedure = RepartitionProcedure::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![range_expr("col1", 0, 50), range_expr("col1", 50, 100)],
+            context,
+        );
+        // After four executes the indices are applied and the second region route exists.
+        procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert_eq!(
+            table_partition_key_indices(&procedure.context).await,
+            vec![0]
+        );
+        assert_eq!(
+            current_parent_region_routes(&procedure.context).await.len(),
+            2
+        );
+        // The fifth execute spawns one group subprocedure and moves on to `Collect`.
+        let dispatch_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let subprocedure_ids = extract_subprocedure_ids(dispatch_status);
+        assert_eq!(subprocedure_ids.len(), 1);
+        assert_parent_state::<Collect>(&procedure);
+        // Fail the subprocedure so that `Collect` returns a non-retryable error.
+        let failed_state = ProcedureState::failed(Arc::new(ProcedureError::external(
+            MockError::new(StatusCode::Internal),
+        )));
+        let collect_ctx = procedure_context_with_receivers(HashMap::from([(
+            subprocedure_ids[0],
+            procedure_state_receiver(failed_state),
+        )]));
+        let err = procedure.execute(&collect_ctx).await.unwrap_err();
+        assert!(!err.is_retry_later());
+        // First rollback: capture the resulting indices and routes.
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let once_indices = table_partition_key_indices(&procedure.context).await;
+        let once_routes = current_parent_region_routes(&procedure.context).await;
+        // Second rollback: must also succeed; its outcome is compared with the first below.
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let twice_indices = table_partition_key_indices(&procedure.context).await;
+        let twice_routes = current_parent_region_routes(&procedure.context).await;
+        // Both rollbacks agree: no partition key indices and only the original route.
+        assert_eq!(once_indices, twice_indices);
+        assert_eq!(once_routes, twice_routes);
+        assert!(twice_indices.is_empty());
+        assert_eq!(
+            twice_routes,
+            vec![test_region_route(RegionId::new(table_id, 1), "")]
+        );
+    }
+
     #[tokio::test]
     async fn test_repartition_procedure_flow_split_allocate_retryable_then_resume() {
         common_telemetry::init_default_ut_logging();
@@ -1852,7 +2235,9 @@ mod tests {
 
         let context = new_parent_context(&env, node_manager, table_id);
         let mut procedure = RepartitionProcedure::new(
-            vec![range_expr("x", 0, 100)],
+            RepartitionFrom::Partitioned {
+                exprs: vec![range_expr("x", 0, 100)],
+            },
             vec![range_expr("x", 0, 50), range_expr("x", 50, 100)],
             context,
         );
diff --git a/src/meta-srv/src/procedure/repartition/allocate_region.rs b/src/meta-srv/src/procedure/repartition/allocate_region.rs
index a3293e8c3e..c49866ac8a 100644
--- a/src/meta-srv/src/procedure/repartition/allocate_region.rs
+++ b/src/meta-srv/src/procedure/repartition/allocate_region.rs
@@ -35,7 +35,7 @@ use tokio::time::Instant;
 use crate::error::{self, Result};
 use crate::procedure::repartition::dispatch::Dispatch;
 use crate::procedure::repartition::plan::{
-    AllocationPlanEntry, RegionDescriptor, RepartitionPlanEntry,
+    AllocationPlanEntry, RepartitionPlanEntry, TargetRegionDescriptor,
     convert_allocation_plan_to_repartition_plan,
 };
 use crate::procedure::repartition::{Context, State};
@@ -324,7 +324,7 @@ impl AllocateRegion {
     /// Collects all regions that need to be allocated from the repartition plan entries.
     fn collect_allocate_regions(
         repartition_plan_entries: &[RepartitionPlanEntry],
-    ) -> Vec<&RegionDescriptor> {
+    ) -> Vec<&TargetRegionDescriptor> {
         repartition_plan_entries
             .iter()
             .flat_map(|p| p.allocate_regions())
@@ -333,7 +333,7 @@ impl AllocateRegion {
 
     /// Prepares region allocation data: region numbers and their partition expressions.
     fn prepare_region_allocation_data(
-        allocate_regions: &[&RegionDescriptor],
+        allocate_regions: &[&TargetRegionDescriptor],
     ) -> Result<Vec<(RegionNumber, String)>> {
         allocate_regions
             .iter()
@@ -417,6 +417,7 @@ mod tests {
 
     use super::*;
     use crate::procedure::repartition::State;
+    use crate::procedure::repartition::plan::SourceRegionDescriptor;
     use crate::procedure::repartition::test_util::{
         TestingEnv, current_parent_region_routes, new_parent_context, range_expr,
         test_region_wal_options,
@@ -428,8 +429,21 @@ mod tests {
         col: &str,
         start: i64,
         end: i64,
-    ) -> RegionDescriptor {
-        RegionDescriptor {
+    ) -> SourceRegionDescriptor {
+        SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, region_number),
+            range_expr(col, start, end),
+        )
+    }
+
+    fn create_target_region_descriptor(
+        table_id: TableId,
+        region_number: u32,
+        col: &str,
+        start: i64,
+        end: i64,
+    ) -> TargetRegionDescriptor {
+        TargetRegionDescriptor {
             region_id: RegionId::new(table_id, region_number),
             partition_expr: range_expr(col, start, end),
         }
@@ -700,10 +714,10 @@ mod tests {
     fn test_prepare_region_allocation_data() {
         let table_id = 1024;
         let regions = [
-            create_region_descriptor(table_id, 10, "x", 0, 50),
-            create_region_descriptor(table_id, 11, "x", 50, 100),
+            create_target_region_descriptor(table_id, 10, "x", 0, 50),
+            create_target_region_descriptor(table_id, 11, "x", 50, 100),
         ];
-        let region_refs: Vec<&RegionDescriptor> = regions.iter().collect();
+        let region_refs: Vec<&TargetRegionDescriptor> = regions.iter().collect();
 
         let result = AllocateRegion::prepare_region_allocation_data(&region_refs).unwrap();
 
@@ -732,7 +746,7 @@ mod tests {
         ctx.persistent_ctx.plans = vec![RepartitionPlanEntry {
             group_id: Uuid::new_v4(),
             source_regions: vec![],
-            target_regions: vec![create_region_descriptor(table_id, 3, "x", 0, 100)],
+            target_regions: vec![create_target_region_descriptor(table_id, 3, "x", 0, 100)],
             allocated_region_ids: vec![RegionId::new(table_id, 3)],
             pending_deallocate_region_ids: vec![],
             transition_map: vec![],
diff --git a/src/meta-srv/src/procedure/repartition/dispatch.rs b/src/meta-srv/src/procedure/repartition/dispatch.rs
index 3a9f9376f1..4377123887 100644
--- a/src/meta-srv/src/procedure/repartition/dispatch.rs
+++ b/src/meta-srv/src/procedure/repartition/dispatch.rs
@@ -25,22 +25,22 @@ use store_api::storage::RegionId;
 use crate::error::Result;
 use crate::procedure::repartition::collect::{Collect, ProcedureMeta};
 use crate::procedure::repartition::group::RepartitionGroupProcedure;
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::procedure::repartition::{self, Context, State};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Dispatch;
 
 pub(crate) fn build_region_mapping(
-    source_regions: &[RegionDescriptor],
-    target_regions: &[RegionDescriptor],
+    source_regions: &[SourceRegionDescriptor],
+    target_regions: &[TargetRegionDescriptor],
     transition_map: &[Vec<usize>],
 ) -> HashMap<RegionId, Vec<RegionId>> {
     transition_map
         .iter()
         .enumerate()
         .map(|(source_idx, indices)| {
-            let source_region = source_regions[source_idx].region_id;
+            let source_region = source_regions[source_idx].region_id();
             let target_regions = indices
                 .iter()
                 .map(|&target_idx| target_regions[target_idx].region_id)
diff --git a/src/meta-srv/src/procedure/repartition/group.rs b/src/meta-srv/src/procedure/repartition/group.rs
index 12374e8ada..2dc1117467 100644
--- a/src/meta-srv/src/procedure/repartition/group.rs
+++ b/src/meta-srv/src/procedure/repartition/group.rs
@@ -49,7 +49,7 @@ use uuid::Uuid;
 
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::repartition_start::RepartitionStart;
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::procedure::repartition::utils::get_datanode_table_value;
 use crate::procedure::repartition::{self};
 use crate::service::mailbox::MailboxRef;
@@ -330,9 +330,9 @@ pub struct PersistentContext {
     /// The schema name of the repartition group.
     pub schema_name: String,
     /// The source regions of the repartition group.
-    pub sources: Vec<RegionDescriptor>,
+    pub sources: Vec<SourceRegionDescriptor>,
     /// The target regions of the repartition group.
-    pub targets: Vec<RegionDescriptor>,
+    pub targets: Vec<TargetRegionDescriptor>,
     /// For each `source region`, the corresponding
     /// `target regions` that overlap with it.
     pub region_mapping: HashMap<RegionId, Vec<RegionId>>,
@@ -360,8 +360,8 @@ impl PersistentContext {
         table_id: TableId,
         catalog_name: String,
         schema_name: String,
-        sources: Vec<RegionDescriptor>,
-        targets: Vec<RegionDescriptor>,
+        sources: Vec<SourceRegionDescriptor>,
+        targets: Vec<TargetRegionDescriptor>,
         region_mapping: HashMap<RegionId, Vec<RegionId>>,
         sync_region: bool,
         allocated_region_ids: Vec<RegionId>,
@@ -392,7 +392,7 @@ impl PersistentContext {
             SchemaLock::read(&self.catalog_name, &self.schema_name).into(),
         ]);
         for source in &self.sources {
-            lock_keys.push(RegionLock::Write(source.region_id).into());
+            lock_keys.push(RegionLock::Write(source.region_id()).into());
         }
         lock_keys
     }
diff --git a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs
index 43e5ee31d9..6148901ffa 100644
--- a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs
+++ b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs
@@ -37,7 +37,7 @@ use crate::procedure::repartition::group::utils::{
     HandleMultipleResult, group_region_routes_by_peer, handle_multiple_results,
 };
 use crate::procedure::repartition::group::{Context, State};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::TargetRegionDescriptor;
 use crate::service::mailbox::{Channel, MailboxRef};
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -75,7 +75,7 @@ impl ApplyStagingManifest {
     fn build_apply_staging_manifest_instructions(
         staging_manifest_paths: &HashMap<RegionId, String>,
         target_routes: &[RegionRoute],
-        targets: &[RegionDescriptor],
+        targets: &[TargetRegionDescriptor],
         central_region_id: RegionId,
     ) -> Result<ApplyStagingManifestInstructions> {
         let target_partition_expr_by_region = targets
diff --git a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs
index c1957031d5..d1be2ca9d0 100644
--- a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs
+++ b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs
@@ -38,7 +38,7 @@ use crate::procedure::repartition::group::utils::{
     HandleMultipleResult, group_region_routes_by_peer, handle_multiple_results,
 };
 use crate::procedure::repartition::group::{Context, GroupId, GroupPrepareResult, State};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::TargetRegionDescriptor;
 use crate::procedure::utils::{self, ErrorStrategy};
 use crate::service::mailbox::{Channel, MailboxRef};
 
@@ -77,7 +77,7 @@ impl EnterStagingRegion {
     fn build_enter_staging_instructions(
         group_id: GroupId,
         prepare_result: &GroupPrepareResult,
-        targets: &[RegionDescriptor],
+        targets: &[TargetRegionDescriptor],
         pending_deallocate_region_ids: &[RegionId],
     ) -> Result<HashMap<Peer, Vec<common_meta::instruction::EnterStagingRegion>>> {
         let target_partition_expr_by_region = targets
@@ -454,7 +454,7 @@ mod tests {
     use crate::error::{self, Error};
     use crate::procedure::repartition::group::GroupPrepareResult;
     use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::TargetRegionDescriptor;
     use crate::procedure::repartition::test_util::{
         TestingEnv, new_persistent_context, range_expr,
     };
@@ -720,13 +720,13 @@ mod tests {
         }
     }
 
-    fn test_targets() -> Vec<RegionDescriptor> {
+    fn test_targets() -> Vec<TargetRegionDescriptor> {
         vec![
-            RegionDescriptor {
+            TargetRegionDescriptor {
                 region_id: RegionId::new(1024, 1),
                 partition_expr: range_expr("x", 0, 10),
             },
-            RegionDescriptor {
+            TargetRegionDescriptor {
                 region_id: RegionId::new(1024, 2),
                 partition_expr: range_expr("x", 10, 20),
             },
diff --git a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs
index 1d6a75100e..d8259a354f 100644
--- a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs
+++ b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs
@@ -30,7 +30,7 @@ use crate::error::{self, Result};
 use crate::handler::HeartbeatMailbox;
 use crate::procedure::repartition::group::apply_staging_manifest::ApplyStagingManifest;
 use crate::procedure::repartition::group::{Context, State};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::service::mailbox::{Channel, MailboxRef};
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -98,8 +98,8 @@ impl State for RemapManifest {
 
 impl RemapManifest {
     fn build_remap_manifest_instructions(
-        source_regions: &[RegionDescriptor],
-        target_regions: &[RegionDescriptor],
+        source_regions: &[SourceRegionDescriptor],
+        target_regions: &[TargetRegionDescriptor],
         region_mapping: &HashMap<RegionId, Vec<RegionId>>,
         central_region_id: RegionId,
     ) -> Result<common_meta::instruction::RemapManifest> {
@@ -117,7 +117,7 @@ impl RemapManifest {
 
         Ok(common_meta::instruction::RemapManifest {
             region_id: central_region_id,
-            input_regions: source_regions.iter().map(|r| r.region_id).collect(),
+            input_regions: source_regions.iter().map(|r| r.region_id()).collect(),
             region_mapping: region_mapping.clone(),
             new_partition_exprs,
         })
diff --git a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs
index 8b8b5208b4..8392890911 100644
--- a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs
+++ b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs
@@ -19,7 +19,7 @@ use common_meta::rpc::router::RegionRoute;
 use common_procedure::{Context as ProcedureContext, Status};
 use common_telemetry::debug;
 use serde::{Deserialize, Serialize};
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ensure};
 
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::sync_region::SyncRegion;
@@ -27,21 +27,18 @@ use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
 use crate::procedure::repartition::group::{
     Context, GroupId, GroupPrepareResult, State, region_routes,
 };
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 
 #[derive(Debug, Serialize, Deserialize)]
 pub struct RepartitionStart;
 
-/// Ensures that the partition expression of the region route matches the partition expression of the region descriptor.
-fn ensure_region_route_expr_match(
+/// Ensures that the partition expression of the source region route matches the source descriptor.
+fn ensure_source_region_route_expr_match(
     region_route: &RegionRoute,
-    region_descriptor: &RegionDescriptor,
+    source: &SourceRegionDescriptor,
 ) -> Result<RegionRoute> {
     let actual = region_route.region.partition_expr();
-    let expected = region_descriptor
-        .partition_expr
-        .as_json_str()
-        .context(error::SerializePartitionExprSnafu)?;
+    let expected = source.route_expr_for_rollback()?;
     ensure!(
         actual == expected,
         error::PartitionExprMismatchSnafu {
@@ -60,8 +57,8 @@ impl RepartitionStart {
     fn ensure_route_present(
         group_id: GroupId,
         region_routes: &[RegionRoute],
-        sources: &[RegionDescriptor],
-        targets: &[RegionDescriptor],
+        sources: &[SourceRegionDescriptor],
+        targets: &[TargetRegionDescriptor],
     ) -> Result<GroupPrepareResult> {
         ensure!(
             !sources.is_empty(),
@@ -78,12 +75,12 @@ impl RepartitionStart {
             .iter()
             .map(|s| {
                 region_routes_map
-                    .get(&s.region_id)
+                    .get(&s.region_id())
                     .context(error::RepartitionSourceRegionMissingSnafu {
                         group_id,
-                        region_id: s.region_id,
+                        region_id: s.region_id(),
                     })
-                    .and_then(|r| ensure_region_route_expr_match(r, s))
+                    .and_then(|r| ensure_source_region_route_expr_match(r, s))
             })
             .collect::<Result<Vec<_>>>()?;
         let target_region_routes = targets
@@ -109,7 +106,7 @@ impl RepartitionStart {
                 }
             );
         }
-        let central_region = sources[0].region_id;
+        let central_region = sources[0].region_id();
         let central_region_datanode = source_region_routes[0]
             .leader_peer
             .as_ref()
@@ -216,16 +213,14 @@ mod tests {
 
     use crate::error::Error;
     use crate::procedure::repartition::group::repartition_start::RepartitionStart;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
     use crate::procedure::repartition::test_util::range_expr;
 
     #[test]
     fn test_ensure_route_present_missing_source_region() {
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(1024, 2),
             partition_expr: range_expr("x", 0, 10),
         };
@@ -249,11 +244,9 @@ mod tests {
 
     #[test]
     fn test_ensure_route_present_partition_expr_mismatch() {
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(1024, 2),
             partition_expr: range_expr("x", 0, 10),
         };
@@ -277,12 +270,69 @@ mod tests {
     }
 
     #[test]
-    fn test_ensure_route_present_missing_target_region() {
-        let source_region = RegionDescriptor {
+    fn test_ensure_route_present_default_source_matches_empty_partition_expr() {
+        let source_region = SourceRegionDescriptor::Default {
             region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
         };
-        let target_region = RegionDescriptor {
+        let target_region = TargetRegionDescriptor {
+            region_id: RegionId::new(1024, 1),
+            partition_expr: range_expr("x", 0, 10),
+        };
+        let region_routes = vec![RegionRoute {
+            region: Region {
+                id: RegionId::new(1024, 1),
+                partition_expr: String::new(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }];
+
+        let result = RepartitionStart::ensure_route_present(
+            Uuid::new_v4(),
+            &region_routes,
+            &[source_region],
+            &[target_region],
+        );
+
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_ensure_route_present_default_source_rejects_non_empty_partition_expr() {
+        let source_region = SourceRegionDescriptor::Default { // source without a partition expr
+            region_id: RegionId::new(1024, 1),
+        };
+        let target_region = TargetRegionDescriptor { // target reuses the source region id
+            region_id: RegionId::new(1024, 1),
+            partition_expr: range_expr("x", 0, 10),
+        };
+        let region_routes = vec![RegionRoute {
+            region: Region {
+                id: RegionId::new(1024, 1),
+                partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), // non-empty route expr
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }];
+
+        let err = RepartitionStart::ensure_route_present(
+            Uuid::new_v4(),
+            &region_routes,
+            &[source_region],
+            &[target_region],
+        )
+        .unwrap_err();
+
+        assert_matches!(err, Error::PartitionExprMismatch { .. }); // Default expects an empty expr
+    }
+
+    #[test]
+    fn test_ensure_route_present_missing_target_region() {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(1024, 2),
             partition_expr: range_expr("x", 0, 10),
         };
@@ -307,11 +357,9 @@ mod tests {
 
     #[test]
     fn test_ensure_route_present_legacy_partition_expr_source() {
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(1024, 2),
             partition_expr: range_expr("x", 0, 10),
         };
diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs
index ff01161ff5..13fc486467 100644
--- a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs
+++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs
@@ -22,7 +22,7 @@ use snafu::{OptionExt, ResultExt};
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
 use crate::procedure::repartition::group::{Context, GroupId, region_routes};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 
 impl UpdateMetadata {
     /// Applies the new partition expressions for staging regions.
@@ -32,8 +32,8 @@ impl UpdateMetadata {
     /// - Source region not found.
     pub(crate) fn apply_staging_region_routes(
         group_id: GroupId,
-        sources: &[RegionDescriptor],
-        targets: &[RegionDescriptor],
+        sources: &[SourceRegionDescriptor],
+        targets: &[TargetRegionDescriptor],
         pending_deallocate_region_ids: &[store_api::storage::RegionId],
         current_region_routes: &[RegionRoute],
     ) -> Result<Vec<RegionRoute>> {
@@ -61,15 +61,16 @@ impl UpdateMetadata {
         }
 
         for source in sources {
-            let region_route = region_routes_map.get_mut(&source.region_id).context(
+            let region_id = source.region_id();
+            let region_route = region_routes_map.get_mut(&region_id).context(
                 error::RepartitionSourceRegionMissingSnafu {
                     group_id,
-                    region_id: source.region_id,
+                    region_id,
                 },
             )?;
             // Set leader staging state for the source region route.
             region_route.set_leader_staging();
-            if pending_deallocate_region_ids.contains(&source.region_id) {
+            if pending_deallocate_region_ids.contains(&region_id) {
                 // When a region is pending deallocation, it should ignore all writes.
                 region_route.set_ignore_all_writes();
             }
@@ -130,7 +131,7 @@ mod tests {
     use uuid::Uuid;
 
     use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
     use crate::procedure::repartition::test_util::range_expr;
 
     #[test]
@@ -166,11 +167,11 @@ mod tests {
                 ..Default::default()
             },
         ];
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(table_id, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region = SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, 1),
+            range_expr("x", 0, 100),
+        );
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(table_id, 2),
             partition_expr: range_expr("x", 0, 10),
         };
@@ -196,6 +197,68 @@ mod tests {
         assert!(!new_region_routes[2].is_leader_staging());
     }
 
+    #[test]
+    fn test_generate_region_routes_with_reused_default_source_region() {
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let default_region_id = RegionId::new(table_id, 1);
+        let region_routes = vec![
+            RegionRoute {
+                region: Region {
+                    id: default_region_id,
+                    partition_expr: String::new(), // default region: empty route expr
+                    ..Default::default()
+                },
+                leader_peer: Some(Peer::empty(1)),
+                ..Default::default()
+            },
+            RegionRoute {
+                region: Region {
+                    id: RegionId::new(table_id, 2),
+                    partition_expr: String::new(),
+                    ..Default::default()
+                },
+                leader_peer: Some(Peer::empty(1)),
+                ..Default::default()
+            },
+        ];
+        let source_region = SourceRegionDescriptor::Default {
+            region_id: default_region_id,
+        };
+        let reused_target_expr = range_expr("x", 0, 10);
+        let target_regions = vec![
+            TargetRegionDescriptor {
+                region_id: default_region_id, // the source region is reused as a target
+                partition_expr: reused_target_expr.clone(),
+            },
+            TargetRegionDescriptor {
+                region_id: RegionId::new(table_id, 2),
+                partition_expr: range_expr("x", 10, 20),
+            },
+        ];
+
+        let new_region_routes = UpdateMetadata::apply_staging_region_routes(
+            group_id,
+            &[source_region],
+            &target_regions,
+            &[], // no regions pending deallocation
+            &region_routes,
+        )
+        .unwrap();
+
+        assert_eq!( // the reused region's route is expected to take the target expr
+            new_region_routes[0].region.partition_expr,
+            reused_target_expr.as_json_str().unwrap()
+        );
+        assert!(new_region_routes[0].is_leader_staging());
+        assert!(!new_region_routes[0].is_ignore_all_writes()); // not pending deallocation
+        assert_eq!(
+            new_region_routes[1].region.partition_expr,
+            range_expr("x", 10, 20).as_json_str().unwrap()
+        );
+        assert!(new_region_routes[1].is_leader_staging());
+    }
+
     #[test]
     fn test_generate_region_routes_mark_pending_deallocate_reject_all_writes() {
         let group_id = Uuid::new_v4();
@@ -221,11 +284,11 @@ mod tests {
                 ..Default::default()
             },
         ];
-        let source_region = RegionDescriptor {
-            region_id: pending_deallocate_region_id,
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region = SourceRegionDescriptor::partitioned(
+            pending_deallocate_region_id,
+            range_expr("x", 0, 100),
+        );
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(table_id, 2),
             partition_expr: range_expr("x", 0, 10),
         };
diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/exit_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/exit_staging_region.rs
index 50864daa93..325f859a98 100644
--- a/src/meta-srv/src/procedure/repartition/group/update_metadata/exit_staging_region.rs
+++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/exit_staging_region.rs
@@ -22,13 +22,13 @@ use snafu::{OptionExt, ResultExt};
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
 use crate::procedure::repartition::group::{Context, GroupId, region_routes};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 
 impl UpdateMetadata {
     pub(crate) fn exit_staging_region_routes(
         group_id: GroupId,
-        sources: &[RegionDescriptor],
-        targets: &[RegionDescriptor],
+        sources: &[SourceRegionDescriptor],
+        targets: &[TargetRegionDescriptor],
         current_region_routes: &[RegionRoute],
     ) -> Result<Vec<RegionRoute>> {
         let mut region_routes = current_region_routes.to_vec();
@@ -48,10 +48,11 @@ impl UpdateMetadata {
         }
 
         for source in sources {
-            let region_route = region_routes_map.get_mut(&source.region_id).context(
+            let region_id = source.region_id();
+            let region_route = region_routes_map.get_mut(&region_id).context(
                 error::RepartitionSourceRegionMissingSnafu {
                     group_id,
-                    region_id: source.region_id,
+                    region_id,
                 },
             )?;
             region_route.clear_leader_staging();
@@ -113,24 +114,25 @@ mod tests {
     use uuid::Uuid;
 
     use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
     use crate::procedure::repartition::test_util::range_expr;
 
     #[test]
     fn test_exit_staging_region_routes_keep_reject_all_writes() {
         let group_id = Uuid::new_v4();
         let table_id = 1024;
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(table_id, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region = SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, 1),
+            range_expr("x", 0, 100),
+        );
+        let source_region_id = source_region.region_id();
+        let target_region = TargetRegionDescriptor {
             region_id: RegionId::new(table_id, 2),
             partition_expr: range_expr("x", 0, 50),
         };
         let mut source_route = RegionRoute {
             region: Region {
-                id: source_region.region_id,
+                id: source_region_id,
                 partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
                 ..Default::default()
             },
@@ -165,4 +167,40 @@ mod tests {
         assert!(!new_region_routes[1].is_leader_staging());
         assert!(new_region_routes[1].is_ignore_all_writes());
     }
+
+    #[test]
+    fn test_exit_staging_region_routes_with_reused_default_source_region() {
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let default_region_id = RegionId::new(table_id, 1);
+        let source_region = SourceRegionDescriptor::Default {
+            region_id: default_region_id,
+        };
+        let target_region = TargetRegionDescriptor {
+            region_id: default_region_id, // same region is both source and target
+            partition_expr: range_expr("x", 0, 50),
+        };
+        let target_expr = target_region.partition_expr.as_json_str().unwrap();
+        let region_route = RegionRoute {
+            region: Region {
+                id: default_region_id,
+                partition_expr: target_expr.clone(), // route already holds the target expr
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            leader_state: Some(LeaderState::Staging), // route starts in the staging state
+            ..Default::default()
+        };
+
+        let new_region_routes = UpdateMetadata::exit_staging_region_routes(
+            group_id,
+            &[source_region],
+            &[target_region],
+            &[region_route],
+        )
+        .unwrap();
+
+        assert!(!new_region_routes[0].is_leader_staging()); // staging state is cleared
+        assert_eq!(new_region_routes[0].region.partition_expr, target_expr); // expr is kept
+    }
 }
diff --git a/src/meta-srv/src/procedure/repartition/plan.rs b/src/meta-srv/src/procedure/repartition/plan.rs
index 063a64341b..1d11d7aa56 100644
--- a/src/meta-srv/src/procedure/repartition/plan.rs
+++ b/src/meta-srv/src/procedure/repartition/plan.rs
@@ -16,17 +16,137 @@ use std::cmp::Ordering;
 
 use common_meta::rpc::router::RegionRoute;
 use partition::expr::PartitionExpr;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize};
+use snafu::ResultExt;
 use store_api::storage::{RegionId, RegionNumber, TableId};
 
+use crate::error::{self, Result};
 use crate::procedure::repartition::group::GroupId;
 
-/// Metadata describing a region involved in the plan.
+/// Metadata describing a source region involved in the plan.
+///
+/// Source regions may represent either an existing partitioned region or the
+/// default region of an unpartitioned table (which has no partition expression).
+#[derive(Debug, Clone, Serialize, PartialEq, Eq)] // `Deserialize` is implemented manually.
+pub enum SourceRegionDescriptor {
+    /// A regular partitioned source region.
+    Partitioned {
+        /// The region id of the source region.
+        region_id: RegionId,
+        /// The partition expression of the source region.
+        partition_expr: PartitionExpr,
+    },
+    /// The default source region of an unpartitioned table; carries only a region id.
+    Default {
+        /// The region id of the default source region.
+        region_id: RegionId,
+    },
+}
+
+impl<'de> Deserialize<'de> for SourceRegionDescriptor {
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    { // Accepts the tagged enum encoding and, as a fallback, the legacy flat object.
+        #[derive(Deserialize)]
+        #[serde(deny_unknown_fields)] // a flat object with extra keys is not legacy data
+        struct PartitionedSourceRegionDescriptor {
+            region_id: RegionId,
+            partition_expr: PartitionExpr,
+        }
+
+        #[derive(Deserialize)]
+        #[serde(untagged)] // serde tries the variants in declaration order
+        enum SourceRegionDescriptorRepr {
+            Tagged(SourceRegionDescriptorTaggedRepr), // {"Partitioned": {..}} or {"Default": {..}}
+            Legacy(PartitionedSourceRegionDescriptor), // flat {"region_id", "partition_expr"}
+        }
+
+        #[derive(Deserialize)]
+        enum SourceRegionDescriptorTaggedRepr { // mirrors the variants of the public enum
+            Partitioned {
+                region_id: RegionId,
+                partition_expr: PartitionExpr,
+            },
+            Default {
+                region_id: RegionId,
+            },
+        }
+
+        match SourceRegionDescriptorRepr::deserialize(deserializer)? { // map repr -> public enum
+            SourceRegionDescriptorRepr::Tagged(SourceRegionDescriptorTaggedRepr::Partitioned {
+                region_id,
+                partition_expr,
+            }) => Ok(Self::Partitioned {
+                region_id,
+                partition_expr,
+            }),
+            SourceRegionDescriptorRepr::Tagged(SourceRegionDescriptorTaggedRepr::Default {
+                region_id,
+            }) => Ok(Self::Default { region_id }),
+            SourceRegionDescriptorRepr::Legacy(descriptor) => Ok(Self::Partitioned { // legacy => partitioned
+                region_id: descriptor.region_id,
+                partition_expr: descriptor.partition_expr,
+            }),
+        }
+    }
+}
+
+impl SourceRegionDescriptor {
+    /// Creates a `Partitioned` source descriptor from a region id and its partition expression.
+    pub fn partitioned(region_id: RegionId, partition_expr: PartitionExpr) -> Self {
+        Self::Partitioned {
+            region_id,
+            partition_expr,
+        }
+    }
+
+    /// Returns the region id carried by this descriptor, whichever variant it is.
+    pub fn region_id(&self) -> RegionId {
+        match self {
+            Self::Partitioned { region_id, .. } => *region_id,
+            Self::Default { region_id } => *region_id,
+        }
+    }
+
+    /// Returns the partition expression for `Partitioned`, or `None` for `Default`.
+    pub fn partition_expr(&self) -> Option<&PartitionExpr> {
+        match self {
+            Self::Partitioned { partition_expr, .. } => Some(partition_expr),
+            Self::Default { .. } => None,
+        }
+    }
+
+    /// Returns whether `route_expr` equals the route expression expected for this source.
+    pub fn matches_route_expr(&self, route_expr: &str) -> Result<bool> {
+        match self {
+            Self::Partitioned { partition_expr, .. } => {
+                let expected = partition_expr
+                    .as_json_str()
+                    .context(error::SerializePartitionExprSnafu)?; // serialization failure is an error
+                Ok(route_expr == expected)
+            }
+            Self::Default { .. } => Ok(route_expr.is_empty()), // default source: empty route expr
+        }
+    }
+
+    /// Returns the route expression to restore on rollback: JSON if partitioned, else empty.
+    pub fn route_expr_for_rollback(&self) -> Result<String> {
+        match self {
+            Self::Partitioned { partition_expr, .. } => partition_expr
+                .as_json_str()
+                .context(error::SerializePartitionExprSnafu),
+            Self::Default { .. } => Ok(String::new()), // same empty value `matches_route_expr` checks
+        }
+    }
+}
+
+/// Metadata describing a target region involved in the plan.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub struct RegionDescriptor {
-    /// The region id of the region involved in the plan.
+pub struct TargetRegionDescriptor {
+    /// The region id of the target region.
     pub region_id: RegionId,
-    /// The partition expression of the region.
+    /// The partition expression of the target region.
     pub partition_expr: PartitionExpr,
 }
 
@@ -37,7 +157,7 @@ pub struct AllocationPlanEntry {
     /// The group id for this plan entry.
     pub group_id: GroupId,
     /// Source region descriptors involved in the plan.
-    pub source_regions: Vec<RegionDescriptor>,
+    pub source_regions: Vec<SourceRegionDescriptor>,
     /// The target partition expressions for the new or changed regions.
     pub target_partition_exprs: Vec<PartitionExpr>,
     /// For each `source_regions[k]`, the corresponding vector contains global
@@ -52,9 +172,9 @@ pub struct RepartitionPlanEntry {
     /// The group id for this plan entry.
     pub group_id: GroupId,
     /// The source region descriptors involved in the plan.
-    pub source_regions: Vec<RegionDescriptor>,
+    pub source_regions: Vec<SourceRegionDescriptor>,
     /// The target region descriptors involved in the plan.
-    pub target_regions: Vec<RegionDescriptor>,
+    pub target_regions: Vec<TargetRegionDescriptor>,
     /// The region ids of the allocated regions.
     pub allocated_region_ids: Vec<RegionId>,
     /// The region ids of the regions that are pending deallocation.
@@ -69,7 +189,7 @@ pub struct RepartitionPlanEntry {
 
 impl RepartitionPlanEntry {
     /// Returns the target regions that are newly allocated.
-    pub(crate) fn allocate_regions(&self) -> Vec<&RegionDescriptor> {
+    pub(crate) fn allocate_regions(&self) -> Vec<&TargetRegionDescriptor> {
         self.target_regions
             .iter()
             .filter(|r| self.allocated_region_ids.contains(&r.region_id))
@@ -111,7 +231,7 @@ pub fn convert_allocation_plan_to_repartition_plan(
                 .iter()
                 .skip(source_regions.len())
                 .map(|target_partition_expr| {
-                    let desc = RegionDescriptor {
+                    let desc = TargetRegionDescriptor {
                         region_id: RegionId::new(table_id, *next_region_number),
                         partition_expr: target_partition_expr.clone(),
                     };
@@ -128,10 +248,12 @@ pub fn convert_allocation_plan_to_repartition_plan(
             let target_regions = source_regions
                 .iter()
                 .zip(target_partition_exprs.iter())
-                .map(|(source_region, target_partition_expr)| RegionDescriptor {
-                    region_id: source_region.region_id,
-                    partition_expr: target_partition_expr.clone(),
-                })
+                .map(
+                    |(source_region, target_partition_expr)| TargetRegionDescriptor {
+                        region_id: source_region.region_id(),
+                        partition_expr: target_partition_expr.clone(),
+                    },
+                )
                 .chain(pending_allocate_target_partition_exprs)
                 .collect::<Vec<_>>();
 
@@ -149,10 +271,12 @@ pub fn convert_allocation_plan_to_repartition_plan(
             let target_regions = source_regions
                 .iter()
                 .zip(target_partition_exprs.iter())
-                .map(|(source_region, target_partition_expr)| RegionDescriptor {
-                    region_id: source_region.region_id,
-                    partition_expr: target_partition_expr.clone(),
-                })
+                .map(
+                    |(source_region, target_partition_expr)| TargetRegionDescriptor {
+                        region_id: source_region.region_id(),
+                        partition_expr: target_partition_expr.clone(),
+                    },
+                )
                 .collect::<Vec<_>>();
 
             RepartitionPlanEntry {
@@ -171,16 +295,18 @@ pub fn convert_allocation_plan_to_repartition_plan(
                 .iter()
                 .take(target_partition_exprs.len())
                 .zip(target_partition_exprs.iter())
-                .map(|(source_region, target_partition_expr)| RegionDescriptor {
-                    region_id: source_region.region_id,
-                    partition_expr: target_partition_expr.clone(),
-                })
+                .map(
+                    |(source_region, target_partition_expr)| TargetRegionDescriptor {
+                        region_id: source_region.region_id(),
+                        partition_expr: target_partition_expr.clone(),
+                    },
+                )
                 .collect::<Vec<_>>();
 
             let pending_deallocate_region_ids = source_regions
                 .iter()
                 .skip(target_partition_exprs.len())
-                .map(|source_region| source_region.region_id)
+                .map(|source_region| source_region.region_id())
                 .collect::<Vec<_>>();
 
             RepartitionPlanEntry {
@@ -210,11 +336,140 @@ mod tests {
         col: &str,
         start: i64,
         end: i64,
-    ) -> RegionDescriptor {
-        RegionDescriptor {
-            region_id: RegionId::new(table_id, region_number),
-            partition_expr: range_expr(col, start, end),
-        }
+    ) -> SourceRegionDescriptor {
+        SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, region_number),
+            range_expr(col, start, end),
+        )
+    }
+
+    #[test]
+    fn test_source_region_descriptor_deserializes_legacy_partitioned_shape() {
+        let table_id = 1024;
+        let region_id = RegionId::new(table_id, 1);
+        let partition_expr = range_expr("x", 0, 100);
+        let legacy_json = serde_json::json!({ // flat object with no variant tag
+            "region_id": region_id,
+            "partition_expr": partition_expr,
+        });
+
+        let descriptor: SourceRegionDescriptor = serde_json::from_value(legacy_json).unwrap();
+
+        assert_eq!( // the flat shape must decode as the `Partitioned` variant
+            descriptor,
+            SourceRegionDescriptor::partitioned(region_id, partition_expr)
+        );
+    }
+
+    #[test]
+    fn test_source_region_descriptor_rejects_legacy_default_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let default_json = serde_json::json!({ // flat object lacking `partition_expr`
+            "region_id": region_id,
+        });
+
+        let err = serde_json::from_value::<SourceRegionDescriptor>(default_json).unwrap_err();
+
+        assert!(err.to_string().contains("data did not match any variant")); // no repr matched
+    }
+
+    #[test]
+    fn test_source_region_descriptor_roundtrip_tagged_partitioned_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let partition_expr = range_expr("x", 0, 100);
+        let descriptor = SourceRegionDescriptor::partitioned(region_id, partition_expr.clone());
+
+        let value = serde_json::to_value(&descriptor).unwrap();
+        let decoded = serde_json::from_value::<SourceRegionDescriptor>(value.clone()).unwrap();
+
+        assert_eq!( // the serialized form is keyed by the variant name
+            value,
+            serde_json::json!({
+                "Partitioned": {
+                    "region_id": region_id,
+                    "partition_expr": partition_expr,
+                }
+            })
+        );
+        assert_eq!(decoded, descriptor); // and decodes back to the same value
+    }
+
+    #[test]
+    fn test_source_region_descriptor_roundtrip_tagged_default_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let descriptor = SourceRegionDescriptor::Default { region_id };
+
+        let value = serde_json::to_value(&descriptor).unwrap();
+        let decoded = serde_json::from_value::<SourceRegionDescriptor>(value.clone()).unwrap();
+
+        assert_eq!( // the serialized form is keyed by the variant name
+            value,
+            serde_json::json!({
+                "Default": {
+                    "region_id": region_id,
+                }
+            })
+        );
+        assert_eq!(decoded, descriptor); // and decodes back to the same value
+    }
+
+    #[test]
+    fn test_source_region_descriptor_rejects_invalid_partition_expr_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let invalid_json = serde_json::json!({
+            "region_id": region_id,
+            "partition_expr": 42, // a bare number, which this test expects to be rejected
+        });
+
+        let err = serde_json::from_value::<SourceRegionDescriptor>(invalid_json).unwrap_err();
+
+        assert!(err.to_string().contains("data did not match any variant")); // no repr matched
+    }
+
+    #[test]
+    fn test_repartition_plan_entry_deserializes_legacy_source_regions() {
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let source_region_id = RegionId::new(table_id, 1);
+        let target_region_id = RegionId::new(table_id, 2);
+        let source_partition_expr = range_expr("x", 0, 100);
+        let target_partition_expr = range_expr("x", 0, 50);
+        let legacy_json = serde_json::json!({
+            "group_id": group_id,
+            "source_regions": [{ // flat, untagged source descriptor
+                "region_id": source_region_id,
+                "partition_expr": source_partition_expr,
+            }],
+            "target_regions": [{
+                "region_id": target_region_id,
+                "partition_expr": target_partition_expr,
+            }],
+            "allocated_region_ids": [target_region_id],
+            "pending_deallocate_region_ids": [],
+            "transition_map": [[0]],
+        });
+
+        let plan: RepartitionPlanEntry = serde_json::from_value(legacy_json).unwrap();
+
+        assert_eq!(plan.group_id, group_id);
+        assert_eq!( // the flat source entry must decode as `Partitioned`
+            plan.source_regions,
+            vec![SourceRegionDescriptor::partitioned(
+                source_region_id,
+                source_partition_expr
+            )]
+        );
+        assert_eq!(
+            plan.target_regions,
+            vec![TargetRegionDescriptor {
+                region_id: target_region_id,
+                partition_expr: target_partition_expr,
+            }]
+        );
+        assert_eq!(plan.allocated_region_ids, vec![target_region_id]);
+        assert!(plan.pending_deallocate_region_ids.is_empty());
+        assert_eq!(plan.transition_map, vec![vec![0]]);
+        assert!(plan.original_target_routes.is_empty()); // key is omitted from the JSON above
     }
 
     #[test]
@@ -468,6 +723,55 @@ mod tests {
         assert_eq!(next_region_number, 6);
     }
 
+    /// Converts an allocation plan whose only source is a `Default` region into a
+    /// repartition plan with two targets.
+    ///
+    /// Expected outcome: the first target keeps the source region id (region number 1), the
+    /// second target gets the next free region number (5), and the counter advances to 6.
+    #[test]
+    fn test_convert_plan_allocate_default_source_region() {
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let mut next_region_number = 5;
+        let source_regions = vec![SourceRegionDescriptor::Default {
+            region_id: RegionId::new(table_id, 1),
+        }];
+        let target_partition_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+        let allocation_plan = AllocationPlanEntry {
+            group_id,
+            source_regions: source_regions.clone(),
+            target_partition_exprs: target_partition_exprs.clone(),
+            transition_map: vec![vec![0, 1]],
+        };
+
+        let result = convert_allocation_plan_to_repartition_plan(
+            table_id,
+            &mut next_region_number,
+            &allocation_plan,
+        );
+
+        assert_eq!(result.source_regions, source_regions);
+        assert_eq!(result.target_regions.len(), 2);
+        // First target reuses the existing source region.
+        assert_eq!(
+            result.target_regions[0].region_id,
+            RegionId::new(table_id, 1)
+        );
+        assert_eq!(
+            result.target_regions[0].partition_expr,
+            target_partition_exprs[0]
+        );
+        // Second target is newly allocated from `next_region_number`.
+        assert_eq!(
+            result.target_regions[1].region_id,
+            RegionId::new(table_id, 5)
+        );
+        assert_eq!(
+            result.target_regions[1].partition_expr,
+            target_partition_exprs[1]
+        );
+        assert_eq!(
+            result.allocated_region_ids,
+            vec![RegionId::new(table_id, 5)]
+        );
+        assert!(result.pending_deallocate_region_ids.is_empty());
+        assert_eq!(result.transition_map, vec![vec![0, 1]]);
+        assert_eq!(next_region_number, 6);
+    }
+
     #[test]
     fn test_convert_plan_deallocate_to_single_region() {
         let group_id = Uuid::new_v4();
diff --git a/src/meta-srv/src/procedure/repartition/repartition_start.rs b/src/meta-srv/src/procedure/repartition/repartition_start.rs
index 5c6bcfdb06..b6f0ec9c0a 100644
--- a/src/meta-srv/src/procedure/repartition/repartition_start.rs
+++ b/src/meta-srv/src/procedure/repartition/repartition_start.rs
@@ -17,31 +17,69 @@ use std::any::Any;
 use common_meta::key::table_route::PhysicalTableRouteValue;
 use common_procedure::{Context as ProcedureContext, Status};
 use common_telemetry::debug;
+use partition::collider::Collider;
 use partition::expr::PartitionExpr;
 use partition::subtask::{self, RepartitionSubtask};
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize};
 use snafu::{OptionExt, ResultExt, ensure};
 use tokio::time::Instant;
 use uuid::Uuid;
 
 use crate::error::{self, Result};
 use crate::procedure::repartition::allocate_region::AllocateRegion;
-use crate::procedure::repartition::plan::{AllocationPlanEntry, RegionDescriptor};
+use crate::procedure::repartition::plan::{AllocationPlanEntry, SourceRegionDescriptor};
 use crate::procedure::repartition::repartition_end::RepartitionEnd;
+use crate::procedure::repartition::update_partition_metadata::{
+    PartitionMetadataUpdate, UpdatePartitionMetadata,
+};
 use crate::procedure::repartition::{Context, State};
 
+/// Describes the source side of a repartition.
+///
+/// `Serialize` is derived; `Deserialize` is implemented by hand for this type.
+#[derive(Debug, Clone, Serialize)]
+pub enum RepartitionFrom {
+    // Source given as a list of partition expressions.
+    Partitioned { exprs: Vec<PartitionExpr> },
+    // Source has no partition expressions; carries the names of the partition columns.
+    Unpartitioned { partition_columns: Vec<String> },
+}
+
+/// Accepts both the current enum representation and the legacy bare list of partition
+/// expressions; a bare list is read as `RepartitionFrom::Partitioned`.
+impl<'de> Deserialize<'de> for RepartitionFrom {
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        // Mirror of `RepartitionFrom` that can use the derived deserializer.
+        #[derive(Deserialize)]
+        enum CurrentRepartitionFrom {
+            Partitioned { exprs: Vec<PartitionExpr> },
+            Unpartitioned { partition_columns: Vec<String> },
+        }
+
+        // Untagged wrapper: the current form is tried first, then the legacy list.
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum RepartitionFromRepr {
+            Current(CurrentRepartitionFrom),
+            Legacy(Vec<PartitionExpr>),
+        }
+
+        let repr = RepartitionFromRepr::deserialize(deserializer)?;
+        let from = match repr {
+            RepartitionFromRepr::Legacy(exprs)
+            | RepartitionFromRepr::Current(CurrentRepartitionFrom::Partitioned { exprs }) => {
+                Self::Partitioned { exprs }
+            }
+            RepartitionFromRepr::Current(CurrentRepartitionFrom::Unpartitioned {
+                partition_columns,
+            }) => Self::Unpartitioned { partition_columns },
+        };
+        Ok(from)
+    }
+}
+
+/// Repartition state holding the source description (`from`) and the target partition
+/// expressions (`to_exprs`).
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct RepartitionStart {
-    from_exprs: Vec<PartitionExpr>,
+    // The alias lets payloads that still use the old `from_exprs` key deserialize into `from`.
+    #[serde(alias = "from_exprs")]
+    from: RepartitionFrom,
     to_exprs: Vec<PartitionExpr>,
 }
 
 impl RepartitionStart {
-    pub fn new(from_exprs: Vec<PartitionExpr>, to_exprs: Vec<PartitionExpr>) -> Self {
-        Self {
-            from_exprs,
-            to_exprs,
-        }
+    /// Creates the state from a source description and the target partition expressions.
+    pub fn new(from: RepartitionFrom, to_exprs: Vec<PartitionExpr>) -> Self {
+        Self { from, to_exprs }
     }
 }
 
@@ -53,6 +91,13 @@ impl State for RepartitionStart {
         ctx: &mut Context,
         _: &ProcedureContext,
     ) -> Result<(Box<dyn State>, Status)> {
+        ensure!(
+            !self.to_exprs.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "Repartition expects non-empty target partition expressions".to_string(),
+            }
+        );
+
         let timer = Instant::now();
         let (physical_table_id, table_route) = ctx
             .table_metadata_manager
@@ -71,7 +116,8 @@ impl State for RepartitionStart {
             }
         );
 
-        let plans = Self::build_plan(&table_route, &self.from_exprs, &self.to_exprs)?;
+        let from_exprs = self.prepare_from(ctx).await?;
+        let plans = Self::build_plan(&table_route, from_exprs, &self.to_exprs)?;
         let plan_count = plans.len();
         let total_source_regions: usize = plans.iter().map(|p| p.source_regions.len()).sum();
         let total_target_regions: usize =
@@ -90,10 +136,17 @@ impl State for RepartitionStart {
             return Ok((Box::new(RepartitionEnd), Status::done()));
         }
 
-        Ok((
-            Box::new(AllocateRegion::new(plans)),
-            Status::executing(false),
-        ))
+        if ctx.persistent_ctx.partition_metadata_update.is_some() {
+            Ok((
+                Box::new(UpdatePartitionMetadata::new(plans)),
+                Status::executing(true),
+            ))
+        } else {
+            Ok((
+                Box::new(AllocateRegion::new(plans)),
+                Status::executing(false),
+            ))
+        }
     }
 
     fn as_any(&self) -> &dyn Any {
@@ -102,13 +155,76 @@ impl State for RepartitionStart {
 }
 
 impl RepartitionStart {
+    /// Resolves the source side of the repartition into the slice handed to `build_plan`.
+    ///
+    /// * `Partitioned` returns the stored expressions, which must be non-empty.
+    /// * `Unpartitioned` runs `prepare_unpartitioned` and returns an empty slice.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArguments` when a `Partitioned` source carries no expressions, and
+    /// propagates any error from `prepare_unpartitioned`.
+    async fn prepare_from<'a>(&'a self, ctx: &mut Context) -> Result<&'a [PartitionExpr]> {
+        match &self.from {
+            RepartitionFrom::Partitioned { exprs } => {
+                // `build_plan` treats an empty source slice as the default-source case.
+                // Letting an empty partitioned source through would take that path without
+                // `prepare_unpartitioned` having recorded a partition metadata update.
+                ensure!(
+                    !exprs.is_empty(),
+                    error::InvalidArgumentsSnafu {
+                        err_msg:
+                            "Partitioned repartition expects non-empty source partition expressions"
+                                .to_string(),
+                    }
+                );
+                Ok(exprs.as_slice())
+            }
+            RepartitionFrom::Unpartitioned { partition_columns } => {
+                Self::prepare_unpartitioned(ctx, partition_columns).await?;
+                // Empty slice selects the default-source path in `build_plan`.
+                Ok(&[])
+            }
+        }
+    }
+
+    /// Validates an unpartitioned source and records the partition metadata update to apply.
+    ///
+    /// Returns immediately when `partition_metadata_update` is already set in the persistent
+    /// context. Otherwise it requires non-empty `partition_columns`, requires the table's
+    /// `partition_key_indices` to be empty, resolves every column name to its index in the
+    /// table schema and stores the resulting indices in the persistent context.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArguments` when `partition_columns` is empty, when the table already
+    /// has partition key indices, or when a column name is not found in the schema.
+    async fn prepare_unpartitioned(ctx: &mut Context, partition_columns: &[String]) -> Result<()> {
+        let already_prepared = ctx.persistent_ctx.partition_metadata_update.is_some();
+        if already_prepared {
+            return Ok(());
+        }
+
+        ensure!(
+            !partition_columns.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "Unpartitioned repartition expects non-empty partition columns"
+                    .to_string(),
+            }
+        );
+
+        let table_id = ctx.persistent_ctx.table_id;
+        let table_info_value = ctx.get_table_info_value().await?;
+        let table_meta = &table_info_value.table_info.meta;
+        ensure!(
+            table_meta.partition_key_indices.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: format!(
+                    "Unpartitioned repartition expects an unpartitioned table, but table {} has partition key indices: {:?}",
+                    table_id, table_meta.partition_key_indices
+                ),
+            }
+        );
+
+        // Resolve names in the caller's order; the first unknown column aborts the step.
+        let mut partition_key_indices = Vec::with_capacity(partition_columns.len());
+        for column_name in partition_columns {
+            let column_index = table_meta
+                .schema
+                .column_index_by_name(column_name)
+                .with_context(|| error::InvalidArgumentsSnafu {
+                    err_msg: format!(
+                        "Partition column {} not found in table {}",
+                        column_name, table_id
+                    ),
+                })?;
+            partition_key_indices.push(column_index);
+        }
+        ctx.persistent_ctx.partition_metadata_update =
+            Some(PartitionMetadataUpdate::new(partition_key_indices));
+
+        Ok(())
+    }
+
     pub(crate) fn build_plan(
         physical_route: &PhysicalTableRouteValue,
         from_exprs: &[PartitionExpr],
         to_exprs: &[PartitionExpr],
     ) -> Result<Vec<AllocationPlanEntry>> {
-        let subtasks = subtask::create_subtasks(from_exprs, to_exprs)
-            .context(error::RepartitionCreateSubtasksSnafu)?;
+        let subtasks = if from_exprs.is_empty() {
+            Self::default_source_subtasks(to_exprs)?
+        } else {
+            subtask::create_subtasks(from_exprs, to_exprs)
+                .context(error::RepartitionCreateSubtasksSnafu)?
+        };
         if subtasks.is_empty() {
             return Ok(vec![]);
         }
@@ -123,7 +239,7 @@ impl RepartitionStart {
 
     fn build_plan_entries(
         subtasks: Vec<RepartitionSubtask>,
-        source_index: &[RegionDescriptor],
+        source_index: &[SourceRegionDescriptor],
         target_exprs: &[PartitionExpr],
     ) -> Vec<AllocationPlanEntry> {
         subtasks
@@ -151,10 +267,32 @@ impl RepartitionStart {
             .collect::<Vec<_>>()
     }
 
+    /// Builds the single subtask used when there are no source partition expressions.
+    ///
+    /// The subtask has one source index (`0`) and lists every target index, both in
+    /// `to_expr_indices` and as the only row of `transition_map`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Unexpected` when `to_exprs` is empty, and `RepartitionCreateSubtasks` when
+    /// `Collider::new` rejects the target expressions.
+    fn default_source_subtasks(to_exprs: &[PartitionExpr]) -> Result<Vec<RepartitionSubtask>> {
+        ensure!(
+            !to_exprs.is_empty(),
+            error::UnexpectedSnafu {
+                violated: "Default source repartition expects non-empty target partition exprs",
+            }
+        );
+
+        // The collider is built only for its validation of `to_exprs`; the value is dropped.
+        let _ = Collider::new(to_exprs).context(error::RepartitionCreateSubtasksSnafu)?;
+
+        let all_target_indices: Vec<usize> = (0..to_exprs.len()).collect();
+        let subtask = RepartitionSubtask {
+            from_expr_indices: vec![0],
+            transition_map: vec![all_target_indices.clone()],
+            to_expr_indices: all_target_indices,
+        };
+        Ok(vec![subtask])
+    }
+
     fn source_region_descriptors(
         from_exprs: &[PartitionExpr],
         physical_route: &PhysicalTableRouteValue,
-    ) -> Result<Vec<RegionDescriptor>> {
+    ) -> Result<Vec<SourceRegionDescriptor>> {
+        if from_exprs.is_empty() {
+            return Self::default_source_region_descriptors(physical_route);
+        }
+
         let existing_regions = physical_route
             .region_routes
             .iter()
@@ -178,13 +316,394 @@ impl RepartitionStart {
                         debug!("Failed to find matching region for partition expression: {}, existing regions: {:?}", expr_json, existing_regions);
                     })?;
 
-                Ok(RegionDescriptor {
-                    region_id: matched_region_id,
-                    partition_expr: expr.clone(),
-                })
+                Ok(SourceRegionDescriptor::partitioned(
+                    matched_region_id,
+                    expr.clone(),
+                ))
             })
             .collect::<Result<Vec<_>>>()?;
 
         Ok(descriptors)
     }
+
+    /// Builds the single `SourceRegionDescriptor::Default` entry for the default-source path.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Unexpected` unless `physical_route` has exactly one region route and that
+    /// region's partition expression is empty.
+    fn default_source_region_descriptors(
+        physical_route: &PhysicalTableRouteValue,
+    ) -> Result<Vec<SourceRegionDescriptor>> {
+        let region_routes = &physical_route.region_routes;
+        ensure!(
+            region_routes.len() == 1,
+            error::UnexpectedSnafu {
+                violated: format!(
+                    "Default source repartition expects exactly one source region, but got {}",
+                    region_routes.len()
+                ),
+            }
+        );
+
+        // Indexing is safe: the check above guarantees exactly one route.
+        let region = &region_routes[0].region;
+        ensure!(
+            region.partition_expr().is_empty(),
+            error::UnexpectedSnafu {
+                violated: format!(
+                    "Default source repartition expects an empty partition expr, but got {}",
+                    region.partition_expr()
+                ),
+            }
+        );
+
+        let default_source = SourceRegionDescriptor::Default {
+            region_id: region.id,
+        };
+        Ok(vec![default_source])
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use common_meta::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
+    use common_meta::key::table_route::PhysicalTableRouteValue;
+    use common_meta::peer::Peer;
+    use common_meta::rpc::router::{Region, RegionRoute};
+    use common_meta::test_util::MockDatanodeManager;
+    use datatypes::prelude::Value;
+    use partition::expr::{Operand, RestrictedOp};
+    use store_api::storage::RegionId;
+
+    use super::*;
+    use crate::procedure::repartition::test_util::{
+        TestingEnv, new_parent_context, range_expr, test_region_route, test_region_wal_options,
+    };
+
+    /// Wraps the given region routes in a `PhysicalTableRouteValue`.
+    fn physical_route(region_routes: Vec<RegionRoute>) -> PhysicalTableRouteValue {
+        PhysicalTableRouteValue::new(region_routes)
+    }
+
+    /// Creates physical table metadata with a single route for region number 1, built with
+    /// an empty partition expression string, and returns a parent context backed by a mock
+    /// datanode manager.
+    async fn new_test_context(env: &TestingEnv, table_id: u32) -> Context {
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        new_parent_context(env, node_manager, table_id)
+    }
+
+    /// With no source expressions and a single route with an empty partition expression,
+    /// `build_plan` yields one plan whose source is the `Default` descriptor of that region
+    /// and whose transition map sends it to every target.
+    #[test]
+    fn test_build_plan_with_default_source_region() {
+        let table_id = 1024;
+        let physical_route =
+            physical_route(vec![test_region_route(RegionId::new(table_id, 1), "")]);
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+
+        let plans = RepartitionStart::build_plan(&physical_route, &[], &to_exprs).unwrap();
+
+        assert_eq!(plans.len(), 1);
+        let plan = &plans[0];
+        assert_eq!(
+            plan.source_regions,
+            vec![SourceRegionDescriptor::Default {
+                region_id: RegionId::new(table_id, 1)
+            }]
+        );
+        assert_eq!(plan.target_partition_exprs, to_exprs);
+        assert_eq!(plan.transition_map, vec![vec![0, 1]]);
+    }
+
+    /// The default-source path must fail when the only region already has a partition
+    /// expression.
+    #[test]
+    fn test_build_plan_with_default_source_rejects_non_empty_partition_expr() {
+        let table_id = 1024;
+        let physical_route = physical_route(vec![test_region_route(
+            RegionId::new(table_id, 1),
+            &range_expr("x", 0, 100).as_json_str().unwrap(),
+        )]);
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+
+        let err = RepartitionStart::build_plan(&physical_route, &[], &to_exprs).unwrap_err();
+
+        assert!(err.to_string().contains("empty partition expr"));
+    }
+
+    /// The default-source path must fail when the table route has more than one region.
+    #[test]
+    fn test_build_plan_with_default_source_rejects_multiple_regions() {
+        let table_id = 1024;
+        let physical_route = physical_route(vec![
+            test_region_route(RegionId::new(table_id, 1), ""),
+            test_region_route(RegionId::new(table_id, 2), ""),
+        ]);
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+
+        let err = RepartitionStart::build_plan(&physical_route, &[], &to_exprs).unwrap_err();
+
+        assert!(err.to_string().contains("exactly one source region"));
+    }
+
+    /// The default-source path must fail when no target expressions are given.
+    #[test]
+    fn test_build_plan_with_default_source_rejects_empty_targets() {
+        let table_id = 1024;
+        let physical_route =
+            physical_route(vec![test_region_route(RegionId::new(table_id, 1), "")]);
+
+        let err = RepartitionStart::build_plan(&physical_route, &[], &[]).unwrap_err();
+
+        assert!(err.to_string().contains("non-empty target partition exprs"));
+    }
+
+    /// The default-source path must surface a subtask-creation error for a target
+    /// expression that compares two literal values.
+    #[test]
+    fn test_build_plan_with_default_source_rejects_invalid_targets() {
+        let table_id = 1024;
+        let physical_route =
+            physical_route(vec![test_region_route(RegionId::new(table_id, 1), "")]);
+        // Both operands are values, so the expression references no column.
+        let invalid_to_expr = PartitionExpr::new(
+            Operand::Value(Value::Int64(1)),
+            RestrictedOp::Eq,
+            Operand::Value(Value::Int64(2)),
+        );
+
+        let err =
+            RepartitionStart::build_plan(&physical_route, &[], &[invalid_to_expr]).unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("Failed to create repartition subtasks")
+        );
+    }
+
+    /// With a non-empty source expression list, `build_plan` matches the expression to the
+    /// existing region and reports it as a partitioned source descriptor.
+    #[test]
+    fn test_build_plan_keeps_partitioned_source_matching() {
+        let table_id = 1024;
+        let from_exprs = vec![range_expr("x", 0, 100)];
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+        // The route's region stores the same expression as `from_exprs[0]`, as JSON.
+        let physical_route = physical_route(vec![RegionRoute {
+            region: Region {
+                id: RegionId::new(table_id, 1),
+                partition_expr: from_exprs[0].as_json_str().unwrap(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }]);
+
+        let plans = RepartitionStart::build_plan(&physical_route, &from_exprs, &to_exprs).unwrap();
+
+        assert_eq!(plans.len(), 1);
+        assert_eq!(
+            plans[0].source_regions,
+            vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 1),
+                from_exprs[0].clone()
+            )]
+        );
+    }
+
+    /// A payload using the old `from_exprs` key with a bare expression list must
+    /// deserialize into `RepartitionFrom::Partitioned`.
+    #[test]
+    fn test_repartition_start_deserializes_legacy_from_exprs() {
+        let from_exprs = vec![range_expr("x", 0, 100)];
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+        let json = serde_json::json!({
+            "from_exprs": from_exprs,
+            "to_exprs": to_exprs,
+        })
+        .to_string();
+
+        let state: RepartitionStart = serde_json::from_str(&json).unwrap();
+
+        let RepartitionFrom::Partitioned { exprs } = state.from else {
+            panic!("expected partition source");
+        };
+        assert_eq!(exprs, vec![range_expr("x", 0, 100)]);
+    }
+
+    /// A state with an `Unpartitioned` source must survive a serialize/deserialize round
+    /// trip through JSON.
+    #[test]
+    fn test_repartition_start_deserializes_current_from() {
+        let state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+        let json = serde_json::to_string(&state).unwrap();
+
+        let state: RepartitionStart = serde_json::from_str(&json).unwrap();
+
+        let RepartitionFrom::Unpartitioned { partition_columns } = state.from else {
+            panic!("expected unpartitioned source");
+        };
+        assert_eq!(partition_columns, vec!["col1"]);
+    }
+
+    /// A partitioned source must move on to `AllocateRegion` without requesting persistence
+    /// and without setting `partition_metadata_update`.
+    #[tokio::test]
+    async fn test_partitioned_source_does_not_initialize_partition_metadata_update() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(
+                RegionId::new(table_id, 1),
+                &range_expr("x", 0, 100).as_json_str().unwrap(),
+            )],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        let mut ctx = new_parent_context(&env, node_manager, table_id);
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Partitioned {
+                exprs: vec![range_expr("x", 0, 100)],
+            },
+            vec![range_expr("x", 0, 50), range_expr("x", 50, 100)],
+        );
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(!status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    /// An unpartitioned source must move on to `UpdatePartitionMetadata`, request
+    /// persistence, and record the resolved partition key indices.
+    #[tokio::test]
+    async fn test_unpartitioned_source_initializes_partition_metadata_update() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col2".to_string(), "col1".to_string()],
+            },
+            vec![range_expr("col2", 0, 50), range_expr("col2", 50, 100)],
+        );
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(status.need_persist());
+        assert!(next.as_any().is::<UpdatePartitionMetadata>());
+        // Expected indices are listed in the same order as the requested columns.
+        assert_eq!(
+            ctx.persistent_ctx
+                .partition_metadata_update
+                .as_ref()
+                .unwrap()
+                .partition_key_indices,
+            vec![2, 0]
+        );
+    }
+
+    /// An unpartitioned source must be rejected when the table info already carries
+    /// partition key indices, and no metadata update may be recorded.
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_existing_partition_metadata() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        // Pre-populate the stored table info with a partition key index.
+        let current = ctx.get_raw_table_info_value().await.unwrap();
+        let mut table_info = current.table_info.clone();
+        table_info.meta.partition_key_indices = vec![0];
+        ctx.update_table_info(&current, current.update(table_info))
+            .await
+            .unwrap();
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(err.to_string().contains("expects an unpartitioned table"));
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    /// `next` must fail when the target expression list is empty.
+    #[tokio::test]
+    async fn test_repartition_start_rejects_empty_target_partition_exprs() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state =
+            RepartitionStart::new(RepartitionFrom::Partitioned { exprs: vec![] }, vec![]);
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("non-empty target partition expressions")
+        );
+    }
+
+    /// With an unpartitioned source, an empty target list must fail and leave
+    /// `partition_metadata_update` unset.
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_empty_target_partition_exprs() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("non-empty target partition expressions")
+        );
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    /// An unpartitioned source with no partition columns must fail and leave
+    /// `partition_metadata_update` unset.
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_empty_partition_columns() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec![],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(err.to_string().contains("non-empty partition columns"));
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    /// An unpartitioned source naming a column that is not in the table schema must fail
+    /// and leave `partition_metadata_update` unset.
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_missing_partition_column() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["missing_col".to_string()],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("Partition column missing_col not found")
+        );
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
 }
diff --git a/src/meta-srv/src/procedure/repartition/test_util.rs b/src/meta-srv/src/procedure/repartition/test_util.rs
index 83856a49e6..122f8e3953 100644
--- a/src/meta-srv/src/procedure/repartition/test_util.rs
+++ b/src/meta-srv/src/procedure/repartition/test_util.rs
@@ -42,7 +42,7 @@ use uuid::Uuid;
 use crate::cache_invalidator::MetasrvCacheInvalidator;
 use crate::metasrv::MetasrvInfo;
 use crate::procedure::repartition::group::{Context, PersistentContext, VolatileContext};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::procedure::repartition::{
     Context as ParentContext, PersistentContext as ParentPersistentContext, RepartitionProcedure,
 };
@@ -177,8 +177,8 @@ pub fn test_region_wal_options(region_numbers: &[RegionNumber]) -> HashMap<Regio
 
 pub fn new_persistent_context(
     table_id: TableId,
-    sources: Vec<RegionDescriptor>,
-    targets: Vec<RegionDescriptor>,
+    sources: Vec<SourceRegionDescriptor>,
+    targets: Vec<TargetRegionDescriptor>,
 ) -> PersistentContext {
     PersistentContext {
         group_id: Uuid::new_v4(),
diff --git a/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs b/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs
new file mode 100644
index 0000000000..cc9ca1c9bb
--- /dev/null
+++ b/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs
@@ -0,0 +1,251 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_meta::lock_key::TableLock;
+use common_procedure::{Context as ProcedureContext, Status};
+use serde::{Deserialize, Serialize};
+use snafu::ensure;
+
+use crate::error::{self, Result};
+use crate::procedure::repartition::allocate_region::AllocateRegion;
+use crate::procedure::repartition::plan::AllocationPlanEntry;
+use crate::procedure::repartition::{Context, State};
+
+/// Pending change to a table's partition metadata: the partition key indices to store in
+/// the table info.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct PartitionMetadataUpdate {
+    // Indices to write into `table_info.meta.partition_key_indices`.
+    pub partition_key_indices: Vec<usize>,
+}
+
+impl PartitionMetadataUpdate {
+    /// Creates an update carrying the given partition key indices.
+    pub fn new(partition_key_indices: Vec<usize>) -> Self {
+        Self {
+            partition_key_indices,
+        }
+    }
+}
+
+/// Repartition state that applies the pending partition metadata update and then hands
+/// its plan entries to `AllocateRegion`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct UpdatePartitionMetadata {
+    // Plan entries passed unchanged to the `AllocateRegion` state.
+    plan_entries: Vec<AllocationPlanEntry>,
+}
+
+impl UpdatePartitionMetadata {
+    /// Creates the state with the plan entries to forward.
+    pub fn new(plan_entries: Vec<AllocationPlanEntry>) -> Self {
+        Self { plan_entries }
+    }
+}
+
+/// Writes the pending `partition_key_indices` into the table info, then transitions to
+/// `AllocateRegion`.
+///
+/// * No pending update in the persistent context: go straight to `AllocateRegion` with
+///   `Status::executing(false)`.
+/// * Table info already holds the requested indices: skip the write.
+/// * Table info holds different, non-empty indices: fail with `InvalidArguments`.
+#[async_trait::async_trait]
+#[typetag::serde]
+impl State for UpdatePartitionMetadata {
+    async fn next(
+        &mut self,
+        ctx: &mut Context,
+        procedure_ctx: &ProcedureContext,
+    ) -> Result<(Box<dyn State>, Status)> {
+        let Some(update) = ctx.persistent_ctx.partition_metadata_update.as_ref() else {
+            return Ok((
+                Box::new(AllocateRegion::new(self.plan_entries.clone())),
+                Status::executing(false),
+            ));
+        };
+        // Clone so the borrow of `ctx` ends before `ctx` is used again below.
+        let partition_key_indices = update.partition_key_indices.clone();
+        ensure!(
+            !partition_key_indices.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg:
+                    "Repartition partition metadata update expects non-empty partition key indices"
+                        .to_string(),
+            }
+        );
+
+        let table_id = ctx.persistent_ctx.table_id;
+        let table_lock = TableLock::Write(table_id).into();
+        // `_guard` stays bound until this function returns, covering the read and the write.
+        let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
+        let table_info_value = ctx.get_raw_table_info_value().await?;
+        let current_partition_key_indices = &table_info_value.table_info.meta.partition_key_indices;
+        // The stored indices already equal the requested ones, so there is nothing to write.
+        if current_partition_key_indices == &partition_key_indices {
+            return Ok((
+                Box::new(AllocateRegion::new(self.plan_entries.clone())),
+                Status::executing(true),
+            ));
+        }
+        ensure!(
+            current_partition_key_indices.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: format!(
+                    "Repartition partition metadata update expects an unpartitioned table, but table {} has partition key indices: {:?}",
+                    table_id, current_partition_key_indices
+                ),
+            }
+        );
+
+        let mut new_table_info = table_info_value.table_info.clone();
+        new_table_info.meta.partition_key_indices = partition_key_indices;
+        ctx.update_table_info(&table_info_value, table_info_value.update(new_table_info))
+            .await?;
+        // We don't invalidate cache here because the subsequent AllocateRegion step
+        // will update the table route and invalidate the cache accordingly.
+
+        Ok((
+            Box::new(AllocateRegion::new(self.plan_entries.clone())),
+            Status::executing(true),
+        ))
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use common_meta::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
+    use common_meta::test_util::MockDatanodeManager;
+    use store_api::storage::{RegionId, TableId};
+
+    use super::*;
+    use crate::procedure::repartition::test_util::{
+        TestingEnv, new_parent_context, range_expr, test_region_route, test_region_wal_options,
+    };
+
+    /// Creates physical table metadata with a single route for region number 1 and returns
+    /// a parent context whose pending partition metadata update is `[0]`.
+    async fn new_test_context(env: &TestingEnv, table_id: TableId) -> Context {
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        let mut ctx = new_parent_context(env, node_manager, table_id);
+        ctx.persistent_ctx.partition_metadata_update = Some(PartitionMetadataUpdate::new(vec![0]));
+        ctx
+    }
+
+    async fn set_partition_key_indices(ctx: &Context, partition_key_indices: Vec<usize>) {
+        // Test helper: overwrites the stored partition key indices of the context's table.
+        let stored = ctx.get_raw_table_info_value().await.unwrap();
+        let mut patched_info = stored.table_info.clone();
+        patched_info.meta.partition_key_indices = partition_key_indices;
+        let replacement = stored.update(patched_info);
+        ctx.update_table_info(&stored, replacement).await.unwrap();
+    }
+
+    async fn partition_key_indices(ctx: &Context) -> Vec<usize> {
+        // Test helper: returns the partition key indices currently stored for the
+        // context's table. Panics (via `unwrap`) if the table info cannot be loaded,
+        // which simply fails the calling test.
+        let table_info_value = ctx.get_table_info_value().await.unwrap();
+        let meta = table_info_value.table_info.meta;
+        meta.partition_key_indices
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_applies_to_unpartitioned_table() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+        // The fixture requests partition key indices [0]; run one state transition.
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        // Expect: persistence requested, next state is AllocateRegion, indices stored.
+        assert!(status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert_eq!(partition_key_indices(&ctx).await, vec![0]);
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_replay_is_noop() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        set_partition_key_indices(&ctx, vec![0]).await; // table already carries the requested indices, as after a replayed step
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+        // Running the step again must succeed without changing the stored indices.
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        // Same outcome as a first run: persist, move to AllocateRegion, indices still [0].
+        assert!(status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert_eq!(partition_key_indices(&ctx).await, vec![0]);
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_rejects_empty_partition_key_indices() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        ctx.persistent_ctx.partition_metadata_update = Some(PartitionMetadataUpdate::new(vec![]));
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+        // The fixture's request was replaced above by one with no partition key indices.
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+        // The step must fail and leave the table's partition key indices empty.
+        assert!(err.to_string().contains("non-empty partition key indices"));
+        assert!(partition_key_indices(&ctx).await.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_rejects_other_partition_keys() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        set_partition_key_indices(&ctx, vec![1]).await; // stored indices differ from the requested [0]
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+        // A table that already has different partition key indices must be rejected.
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+        // The stored indices must be left untouched by the failed step.
+        assert!(err.to_string().contains("expects an unpartitioned table"));
+        assert_eq!(partition_key_indices(&ctx).await, vec![1]);
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_preserves_plan_entries() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let plan_entries = vec![crate::procedure::repartition::plan::AllocationPlanEntry { // one plan entry whose source is the default region 1
+            group_id: uuid::Uuid::new_v4(),
+            source_regions: vec![
+                crate::procedure::repartition::plan::SourceRegionDescriptor::Default {
+                    region_id: RegionId::new(table_id, 1),
+                },
+            ],
+            target_partition_exprs: vec![range_expr("x", 0, 10)],
+            transition_map: vec![vec![0]],
+        }];
+        let mut state = UpdatePartitionMetadata::new(plan_entries);
+        // Run the step with a non-empty plan.
+        let (next, _) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        // NOTE(review): only the next state's type is asserted; the forwarded plan entries are not inspected.
+        assert!(next.as_any().is::<AllocateRegion>());
+    }
+}
diff --git a/src/meta-srv/src/procedure/repartition/utils.rs b/src/meta-srv/src/procedure/repartition/utils.rs
index 6f274e9596..f255ca618f 100644
--- a/src/meta-srv/src/procedure/repartition/utils.rs
+++ b/src/meta-srv/src/procedure/repartition/utils.rs
@@ -23,7 +23,7 @@ use store_api::storage::{RegionId, RegionNumber, TableId};
 
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::GroupId;
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::SourceRegionDescriptor;
 
 /// Returns the `datanode_table_value`
 ///
@@ -138,21 +138,23 @@ pub fn merge_and_validate_region_wal_options(
 /// restored here.
 pub fn rollback_group_metadata_routes(
     group_id: GroupId,
-    source_regions: &[RegionDescriptor],
+    source_regions: &[SourceRegionDescriptor],
     original_target_routes: &[RegionRoute],
     allocated_region_ids: &[RegionId],
     pending_deallocate_region_ids: &[RegionId],
     region_routes_map: &mut HashMap<RegionId, &mut RegionRoute>,
 ) -> Result<()> {
     for source in source_regions {
-        let region_route = region_routes_map.get_mut(&source.region_id).context(
+        let region_id = source.region_id();
+        let region_route = region_routes_map.get_mut(&region_id).context(
             error::RepartitionSourceRegionMissingSnafu {
                 group_id,
-                region_id: source.region_id,
+                region_id,
             },
         )?;
         region_route.clear_leader_staging();
-        if pending_deallocate_region_ids.contains(&source.region_id) {
+        region_route.region.partition_expr = source.route_expr_for_rollback()?;
+        if pending_deallocate_region_ids.contains(&region_id) {
             region_route.clear_ignore_all_writes();
         }
     }
@@ -191,7 +193,7 @@ mod tests {
 
     use super::*;
     use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
     use crate::procedure::repartition::test_util::range_expr;
 
     /// Helper function to create a Kafka WAL option string from a topic name.
@@ -242,7 +244,7 @@ mod tests {
 
     fn original_target_routes(
         region_routes: &[RegionRoute],
-        targets: &[RegionDescriptor],
+        targets: &[TargetRegionDescriptor],
     ) -> Vec<RegionRoute> {
         let target_ids = targets
             .iter()
@@ -380,16 +382,16 @@ mod tests {
             ),
             new_staged_region_route(RegionId::new(table_id, 3), "", None, false),
         ];
-        let sources = vec![RegionDescriptor {
-            region_id: RegionId::new(table_id, 1),
-            partition_expr: range_expr("x", 0, 100),
-        }];
+        let sources = vec![SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, 1),
+            range_expr("x", 0, 100),
+        )];
         let targets = vec![
-            RegionDescriptor {
+            TargetRegionDescriptor {
                 region_id: RegionId::new(table_id, 1),
                 partition_expr: range_expr("x", 0, 50),
             },
-            RegionDescriptor {
+            TargetRegionDescriptor {
                 region_id: RegionId::new(table_id, 3),
                 partition_expr: range_expr("x", 50, 100),
             },
@@ -420,6 +422,60 @@ mod tests {
         assert_eq!(applied_region_routes, original_region_routes);
     }
 
+    #[test]
+    fn test_rollback_group_metadata_routes_default_source_restores_empty_expr() { // rollback of a `Default` source must reset its route expression to the empty string
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let default_region_id = RegionId::new(table_id, 1);
+        let allocated_region_id = RegionId::new(table_id, 2);
+        let source_regions = vec![SourceRegionDescriptor::Default {
+            region_id: default_region_id,
+        }];
+        let target_regions = vec![
+            TargetRegionDescriptor {
+                region_id: default_region_id,
+                partition_expr: range_expr("x", 0, 50),
+            },
+            TargetRegionDescriptor {
+                region_id: allocated_region_id,
+                partition_expr: range_expr("x", 50, 100),
+            },
+        ];
+        let current_region_routes = vec![
+            new_staged_region_route(default_region_id, "", None, false),
+            new_staged_region_route(allocated_region_id, "", None, false),
+        ];
+        let original_target_routes = vec![current_region_routes[0].clone()]; // snapshot of the default region's route before staging
+        let mut applied_region_routes = UpdateMetadata::apply_staging_region_routes(
+            group_id,
+            &source_regions,
+            &target_regions,
+            &[],
+            &current_region_routes,
+        )
+        .unwrap();
+        assert_eq!(
+            applied_region_routes[0].region.partition_expr,
+            range_expr("x", 0, 50).as_json_str().unwrap()
+        );
+        // Staging replaced the default region's expression; now roll the group back.
+        rollback_group_metadata_routes(
+            group_id,
+            &source_regions,
+            &original_target_routes,
+            &[allocated_region_id],
+            &[],
+            &mut applied_region_routes
+                .iter_mut()
+                .map(|route| (route.region.id, route))
+                .collect(),
+        )
+        .unwrap();
+        // After rollback the default region has no partition expression and is no longer leader-staging.
+        assert_eq!(applied_region_routes[0].region.partition_expr, "");
+        assert!(!applied_region_routes[0].is_leader_staging());
+    }
+
     #[test]
     fn test_rollback_group_metadata_routes_merge_case_is_idempotent() {
         let group_id = Uuid::new_v4();
@@ -445,16 +501,16 @@ mod tests {
             ),
         ];
         let sources = vec![
-            RegionDescriptor {
-                region_id: RegionId::new(table_id, 1),
-                partition_expr: range_expr("x", 0, 100),
-            },
-            RegionDescriptor {
-                region_id: RegionId::new(table_id, 2),
-                partition_expr: range_expr("x", 100, 200),
-            },
+            SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 1),
+                range_expr("x", 0, 100),
+            ),
+            SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 2),
+                range_expr("x", 100, 200),
+            ),
         ];
-        let targets = vec![RegionDescriptor {
+        let targets = vec![TargetRegionDescriptor {
             region_id: RegionId::new(table_id, 1),
             partition_expr: range_expr("x", 0, 200),
         }];
diff --git a/src/meta-srv/src/procedure/test_util.rs b/src/meta-srv/src/procedure/test_util.rs
index 5bf60fe32e..318a276676 100644
--- a/src/meta-srv/src/procedure/test_util.rs
+++ b/src/meta-srv/src/procedure/test_util.rs
@@ -66,7 +66,7 @@ impl MailboxContext {
     ) {
         let pusher_id = channel.pusher_id();
         let pusher = Pusher::new(tx);
-        let _ = self.pushers.insert(pusher_id.string_key(), pusher).await;
+        let _ = self.pushers.insert(pusher_id, pusher).await;
     }
 
     pub fn mailbox(&self) -> &MailboxRef {
diff --git a/src/meta-srv/src/service/heartbeat.rs b/src/meta-srv/src/service/heartbeat.rs
index 238ed99df2..066d156047 100644
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -20,10 +20,12 @@ use api::v1::meta::{
     AskLeaderRequest, AskLeaderResponse, HeartbeatRequest, HeartbeatResponse, Peer, RequestHeader,
     ResponseHeader, Role, heartbeat_server,
 };
+use common_meta::election::LeaderChangeMessage;
 use common_telemetry::{debug, error, info, warn};
 use futures::StreamExt;
 use once_cell::sync::OnceCell;
 use snafu::{OptionExt, ResultExt};
+use tokio::sync::broadcast::error::RecvError;
 use tokio::sync::mpsc;
 use tokio::sync::mpsc::Sender;
 use tokio_stream::wrappers::ReceiverStream;
@@ -31,10 +33,282 @@ use tonic::{Request, Response, Status, Streaming};
 
 use crate::error::{self, Result};
 use crate::handler::{HeartbeatHandlerGroup, Pusher, PusherId};
-use crate::metasrv::{Context, Metasrv};
+use crate::metasrv::{Context, ElectionRef, Metasrv};
 use crate::metrics::METRIC_META_HEARTBEAT_RECV;
 use crate::service::{GrpcResult, GrpcStream};
 
+type HeartbeatResponseResult = std::result::Result<HeartbeatResponse, Status>; // item type of the heartbeat response channel: a response or a gRPC status
+
+#[async_trait::async_trait]
+trait HeartbeatRequestStream { // Source of incoming heartbeat requests; a trait so that a session can be driven by a non-tonic stream.
+    async fn next(&mut self) -> Option<std::result::Result<HeartbeatRequest, Status>>; // `None` ends the session loop
+}
+
+struct TonicHeartbeatRequestStream { // Adapter exposing a tonic `Streaming<HeartbeatRequest>` as a `HeartbeatRequestStream`.
+    inner: Streaming<HeartbeatRequest>,
+}
+
+impl TonicHeartbeatRequestStream {
+    fn new(inner: Streaming<HeartbeatRequest>) -> Self { // wraps the inbound gRPC stream
+        Self { inner }
+    }
+}
+
+#[async_trait::async_trait]
+impl HeartbeatRequestStream for TonicHeartbeatRequestStream {
+    async fn next(&mut self) -> Option<std::result::Result<HeartbeatRequest, Status>> {
+        self.inner.next().await // delegates directly to the wrapped stream
+    }
+}
+
+enum LeaderStepDownEvent { // Outcome reported by a `LeaderStepDown` watcher.
+    StepDown, // a leader step-down was observed
+    Closed, // the watcher's event source was closed
+}
+
+#[async_trait::async_trait]
+trait LeaderStepDown { // Watcher that a heartbeat session waits on to learn about leadership loss.
+    async fn wait(&mut self) -> LeaderStepDownEvent; // resolves with the next step-down or closed event
+}
+
+struct ElectionLeaderStepDown { // `LeaderStepDown` watcher backed by the election's leader-change broadcast channel.
+    rx: tokio::sync::broadcast::Receiver<LeaderChangeMessage>,
+}
+
+impl ElectionLeaderStepDown {
+    fn new(election: ElectionRef) -> Self { // subscribes to leader-change messages at construction time
+        Self {
+            rx: election.subscribe_leader_change(),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl LeaderStepDown for ElectionLeaderStepDown {
+    async fn wait(&mut self) -> LeaderStepDownEvent { // loops until a step-down message arrives or the channel closes
+        loop {
+            match self.rx.recv().await {
+                Ok(LeaderChangeMessage::StepDown(_)) => return LeaderStepDownEvent::StepDown,
+                Ok(LeaderChangeMessage::Elected(_)) => {} // not a step-down; keep waiting
+                Err(RecvError::Lagged(skipped)) => { // NOTE(review): the skipped messages may have included a StepDown; this only logs and keeps waiting
+                    warn!(
+                        "Leader step-down watcher lagged, skipped {} leader change events",
+                        skipped
+                    );
+                }
+                Err(RecvError::Closed) => return LeaderStepDownEvent::Closed,
+            }
+        }
+    }
+}
+
+struct HeartbeatSession<R, L> { // State of one heartbeat stream after its first request has been accepted.
+    requests: R, // incoming request stream
+    tx: Sender<HeartbeatResponseResult>, // channel carrying responses and errors back to the peer
+    leader_step_down: Option<L>, // optional step-down watcher; `None` disables step-down watching in `run`
+    handler_group: Arc<HeartbeatHandlerGroup>, // handles each request and owns the pusher registry
+    ctx: Context, // cloned for every handled request
+    sender_id: PusherId, // pusher id registered from the first request's header
+}
+
+impl<R, L> HeartbeatSession<R, L>
+where
+    R: HeartbeatRequestStream,
+    L: LeaderStepDown,
+{
+    /// Initializes the heartbeat session from the first request on the stream and registers its pusher.
+    /// Returns `None` if the stream is closed, yields an error, lacks a header, or must stop right away.
+    async fn init(
+        mut requests: R,
+        tx: Sender<HeartbeatResponseResult>,
+        leader_step_down: Option<L>,
+        handler_group: Arc<HeartbeatHandlerGroup>,
+        ctx: Context,
+    ) -> Option<Self> {
+        let msg = requests.next().await?;
+        // No pusher is registered yet at this point, so early exits need no cleanup.
+        let req = match msg {
+            Ok(req) => req,
+            Err(err) => {
+                error!("Failed to receive the first heartbeat request, error: {err}");
+                let _ = handle_request_stream_error(None, &tx, err).await;
+                return None;
+            }
+        };
+        // The header identifies the sender; without it the pusher cannot be registered.
+        let Some(header) = req.header.as_ref() else {
+            error!("Exit on malformed request: MissingRequestHeader");
+            let _ = tx
+                .send(Err(error::MissingRequestHeaderSnafu {}.build().into()))
+                .await;
+            return None;
+        };
+
+        let sender_id = register_pusher(&handler_group, header, tx.clone()).await;
+        let mut session = Self {
+            requests,
+            tx,
+            leader_step_down,
+            handler_group,
+            ctx,
+            sender_id,
+        };
+
+        if session.handle_request(req, true).await { // `true`: the first request is handled as the handshake
+            Some(session)
+        } else {
+            session.cleanup().await; // undo the pusher registration made above
+            None
+        }
+    }
+
+    /// Runs the session until the request stream ends, handling asks to stop, or the leader watcher fires.
+    async fn run(mut self) {
+        let mut leader_step_down = self.leader_step_down.take();
+        // Each iteration races the next request against the (optional) leader step-down watcher.
+        loop {
+            tokio::select! {
+                msg = self.requests.next() => {
+                    let Some(msg) = msg else {
+                        break;
+                    };
+
+                    if !self.handle_message(msg).await {
+                        break;
+                    }
+                }
+                event = wait_leader_step_down(leader_step_down.as_mut()), if leader_step_down.is_some() => {
+                    match event {
+                        LeaderStepDownEvent::StepDown => {
+                            self.send_not_leader_error().await;
+                            break;
+                        }
+                        LeaderStepDownEvent::Closed => {
+                            warn!("Leader step-down watcher closed");
+                            self.send_election_unavailable_error().await;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        // Always deregister the pusher, whichever way the loop ended.
+        self.cleanup().await;
+    }
+
+    /// Dispatches one item from the request stream; the returned flag says whether the session goes on.
+    async fn handle_message(&mut self, msg: std::result::Result<HeartbeatRequest, Status>) -> bool {
+        match msg {
+            Err(status) => handle_request_stream_error(Some(self.sender_id), &self.tx, status).await,
+            Ok(request) => self.handle_request(request, false).await,
+        }
+    }
+
+    /// Handles one heartbeat request and sends its result; returns whether to continue the session.
+    async fn handle_request(&mut self, req: HeartbeatRequest, is_handshake: bool) -> bool {
+        debug!("Receiving heartbeat request: {:?}", req);
+        // Count the received heartbeat, labelled by the sender id.
+        let sender_id = self.sender_id.to_string();
+        METRIC_META_HEARTBEAT_RECV
+            .with_label_values(&[sender_id.as_str()])
+            .inc();
+
+        let res = self
+            .handler_group
+            .handle(req, self.ctx.clone().with_handshake(is_handshake))
+            .await
+            .inspect_err(
+                |e| warn!(e; "Failed to handle heartbeat request, sender: {}", self.sender_id),
+            )
+            .map_err(|e| e.into());
+        // Remember the not-leader flag before `res` is moved into the channel.
+        let is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader());
+
+        debug!("Sending heartbeat response: {:?}", res);
+        // A handler error is sent as an `Err` item and does not end the session by itself.
+        if self.tx.send(res).await.is_err() {
+            info!(
+                "ReceiverStream was dropped; shutting down, sender: {}",
+                self.sender_id
+            );
+            return false;
+        }
+        // The not-leader response was sent above; follow it with an aborted status and stop.
+        if is_not_leader {
+            warn!(
+                "Quit because it is no longer the leader, sender: {}",
+                self.sender_id
+            );
+            self.send_not_leader_error().await;
+            return false;
+        }
+
+        true
+    }
+
+    async fn send_not_leader_error(&mut self) {
+        // Best-effort `aborted` status; a failed send is ignored.
+        let message = format!(
+            "The requested metasrv node is not leader, node addr: {}",
+            self.ctx.server_addr
+        );
+        let status = Status::aborted(message);
+        let _ = self.tx.send(Err(status)).await;
+    }
+
+    async fn send_election_unavailable_error(&mut self) {
+        // Best-effort `unavailable` status; a failed send is ignored.
+        let message = format!(
+            "The requested metasrv node is shutting down, node addr: {}",
+            self.ctx.server_addr
+        );
+        let status = Status::unavailable(message);
+        let _ = self.tx.send(Err(status)).await;
+    }
+
+    async fn cleanup(&self) { // Logs the stream closure and deregisters this session's pusher.
+        info!("Heartbeat stream closed, sender: {}", self.sender_id);
+        let _ = self.handler_group.deregister_push(self.sender_id).await; // result deliberately ignored
+    }
+}
+
+async fn wait_leader_step_down<L>(leader_step_down: Option<&mut L>) -> LeaderStepDownEvent
+where
+    L: LeaderStepDown,
+{ // Waits on the watcher when one is present.
+    if let Some(watcher) = leader_step_down {
+        return watcher.wait().await;
+    }
+    std::future::pending().await // without a watcher this future never resolves
+}
+
+/// Logs a request stream error and forwards it on `tx`, except for a broken pipe, which is only logged.
+///
+/// Returns `false` on a broken pipe or when the response channel is closed; otherwise returns `true`.
+async fn handle_request_stream_error(
+    sender_id: Option<PusherId>,
+    tx: &Sender<HeartbeatResponseResult>,
+    err: Status,
+) -> bool {
+    if let Some(io_err) = error::match_for_io_error(&err)
+        && io_err.kind() == ErrorKind::BrokenPipe
+    {
+        error!("Client disconnected: broken pipe, sender: {:?}", sender_id);
+        return false;
+    }
+    error!(err; "Error while receiving heartbeat request, sender: {:?}", sender_id);
+    // Forward the original status unchanged.
+    if tx.send(Err(err)).await.is_err() {
+        info!(
+            "Failed to forward heartbeat request stream error; response stream was dropped, sender: {:?}",
+            sender_id
+        );
+        return false;
+    }
+
+    true
+}
+
 #[async_trait::async_trait]
 impl heartbeat_server::Heartbeat for Metasrv {
     type HeartbeatStream = GrpcStream<HeartbeatResponse>;
@@ -43,88 +317,26 @@ impl heartbeat_server::Heartbeat for Metasrv {
         &self,
         req: Request<Streaming<HeartbeatRequest>>,
     ) -> GrpcResult<Self::HeartbeatStream> {
-        let mut in_stream = req.into_inner();
         let (tx, rx) = mpsc::channel(128);
         let handler_group = self.handler_group().context(error::UnexpectedSnafu {
             violated: "expected heartbeat handlers",
         })?;
 
         let ctx = self.new_ctx();
+        let requests = TonicHeartbeatRequestStream::new(req.into_inner());
         let _handle = common_runtime::spawn_global(async move {
-            let mut pusher_id = None;
-            while let Some(msg) = in_stream.next().await {
-                let mut is_not_leader = false;
-                match msg {
-                    Ok(req) => {
-                        debug!("Receiving heartbeat request: {:?}", req);
-
-                        let Some(header) = req.header.as_ref() else {
-                            error!("Exit on malformed request: MissingRequestHeader");
-                            let _ = tx
-                                .send(Err(error::MissingRequestHeaderSnafu {}.build().into()))
-                                .await;
-                            break;
-                        };
-
-                        let is_handshake = pusher_id.is_none();
-                        if is_handshake {
-                            pusher_id =
-                                Some(register_pusher(&handler_group, header, tx.clone()).await);
-                        }
-                        if let Some(k) = &pusher_id {
-                            METRIC_META_HEARTBEAT_RECV.with_label_values(&[&k.to_string()]);
-                        } else {
-                            METRIC_META_HEARTBEAT_RECV.with_label_values(&["none"]);
-                        }
-
-                        let res = handler_group
-                            .handle(req, ctx.clone().with_handshake(is_handshake))
-                            .await
-                            .inspect_err(|e| warn!(e; "Failed to handle heartbeat request, pusher: {pusher_id:?}", ))
-                            .map_err(|e| e.into());
-
-                        is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader());
-
-                        debug!("Sending heartbeat response: {:?}", res);
-
-                        if tx.send(res).await.is_err() {
-                            info!("ReceiverStream was dropped; shutting down");
-                            break;
-                        }
-                    }
-                    Err(err) => {
-                        if let Some(io_err) = error::match_for_io_error(&err)
-                            && io_err.kind() == ErrorKind::BrokenPipe
-                        {
-                            // client disconnected in unexpected way
-                            error!("Client disconnected: broken pipe");
-                            break;
-                        }
-                        error!(err; "Sending heartbeat response error");
-
-                        if tx.send(Err(err)).await.is_err() {
-                            info!("ReceiverStream was dropped; shutting down");
-                            break;
-                        }
-                    }
-                }
-
-                if is_not_leader {
-                    warn!("Quit because it is no longer the leader");
-                    let _ = tx
-                        .send(Err(Status::aborted(format!(
-                            "The requested metasrv node is not leader, node addr: {}",
-                            ctx.server_addr
-                        ))))
-                        .await;
-                    break;
-                }
-            }
-
-            info!("Heartbeat stream closed: {pusher_id:?}");
-
-            if let Some(pusher_id) = pusher_id {
-                let _ = handler_group.deregister_push(pusher_id).await;
+            if let Some(session) = HeartbeatSession::init(
+                requests,
+                tx,
+                ctx.election
+                    .as_ref()
+                    .map(|r| ElectionLeaderStepDown::new(r.clone())),
+                handler_group,
+                ctx,
+            )
+            .await
+            {
+                session.run().await;
             }
         });
 
@@ -192,6 +404,7 @@ async fn register_pusher(
 
 #[cfg(test)]
 mod tests {
+    use std::collections::VecDeque;
     use std::sync::Arc;
 
     use api::v1::meta::heartbeat_server::Heartbeat;
@@ -199,12 +412,300 @@ mod tests {
     use common_meta::kv_backend::memory::MemoryKvBackend;
     use common_telemetry::tracing_context::W3cTrace;
     use servers::grpc::GrpcOptions;
-    use tonic::IntoRequest;
+    use tokio::sync::mpsc;
+    use tonic::{Code, IntoRequest};
 
-    use super::get_node_id;
+    use super::*;
+    use crate::handler::test_utils::TestEnv;
     use crate::metasrv::MetasrvOptions;
     use crate::metasrv::builder::MetasrvBuilder;
 
+    struct MockHeartbeatRequestStream { // Scripted request stream for session tests.
+        messages: VecDeque<std::result::Result<HeartbeatRequest, Status>>, // items yielded in order
+        pending_when_empty: bool, // true: never resolve once drained; false: end the stream
+    }
+
+    impl MockHeartbeatRequestStream {
+        fn new(messages: Vec<std::result::Result<HeartbeatRequest, Status>>) -> Self {
+            // Stream that ends (yields `None`) once every scripted message is consumed.
+            let queue = VecDeque::from(messages);
+            let pending_when_empty = false;
+            Self { messages: queue, pending_when_empty }
+        }
+
+        fn pending_after(messages: Vec<std::result::Result<HeartbeatRequest, Status>>) -> Self {
+            // Same scripted queue as `new`, but the stream stays pending forever
+            // after the queue has been drained instead of ending.
+            let mut stream = Self::new(messages);
+            stream.pending_when_empty = true;
+            stream
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl HeartbeatRequestStream for MockHeartbeatRequestStream {
+        async fn next(&mut self) -> Option<std::result::Result<HeartbeatRequest, Status>> {
+            // Scripted messages are yielded first, in order.
+            match self.messages.pop_front() {
+                Some(queued) => Some(queued),
+                // Drained and configured to hang: this future never resolves,
+                // so the stream stays open without producing further items.
+                None if self.pending_when_empty => std::future::pending().await,
+                // Drained otherwise: report end of stream.
+                None => None,
+            }
+        }
+    }
+
+    struct MockLeaderStepDown { // Watcher that reports one preset event.
+        event: Option<LeaderStepDownEvent>, // taken on the first `wait`
+    }
+
+    impl MockLeaderStepDown {
+        fn new(event: LeaderStepDownEvent) -> Self { // stores the event to report
+            Self { event: Some(event) }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl LeaderStepDown for MockLeaderStepDown {
+        async fn wait(&mut self) -> LeaderStepDownEvent {
+            self.event.take().unwrap() // resolves immediately; panics if the event was already taken
+        }
+    }
+
+    fn heartbeat_request(role: Role, member_id: u64) -> HeartbeatRequest { // Builds a request whose header carries only the role and member id.
+        HeartbeatRequest {
+            header: Some(RequestHeader {
+                role: role.into(),
+                member_id,
+                ..Default::default()
+            }),
+            ..Default::default() // every other field keeps its default value
+        }
+    }
+
+    fn sender_id(role: Role, member_id: u64) -> PusherId { // Builds the pusher id the tests look up for the given role and member.
+        PusherId::new(role, member_id)
+    }
+
+    fn test_context() -> Context { // Fresh context taken from a new `TestEnv`.
+        TestEnv::new().ctx()
+    }
+
+    fn test_handler_group() -> Arc<HeartbeatHandlerGroup> { // Default handler group, shared so tests can inspect pusher registration.
+        Arc::new(HeartbeatHandlerGroup::default())
+    }
+
+    async fn init_session<L>( // Calls `HeartbeatSession::init` with a mock request stream and a fresh test context.
+        requests: MockHeartbeatRequestStream,
+        tx: Sender<HeartbeatResponseResult>,
+        leader_step_down: Option<L>,
+        handler_group: Arc<HeartbeatHandlerGroup>,
+    ) -> Option<HeartbeatSession<MockHeartbeatRequestStream, L>>
+    where
+        L: LeaderStepDown,
+    {
+        HeartbeatSession::init(
+            requests,
+            tx,
+            leader_step_down,
+            handler_group,
+            test_context(), // each call gets its own context
+        )
+        .await
+    }
+
+    async fn recv_response( // Receives the next item from the response channel.
+        rx: &mut mpsc::Receiver<HeartbeatResponseResult>,
+    ) -> HeartbeatResponseResult {
+        rx.recv().await.unwrap() // panics if the channel is closed and empty
+    }
+
+    #[tokio::test]
+    async fn test_heartbeat_session_init_returns_none_on_empty_stream() {
+        let (tx, _rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let requests = MockHeartbeatRequestStream::new(vec![]);
+        // The stream ends before any request arrives.
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await;
+        // No session is created and no pusher is registered.
+        assert!(session.is_none());
+        assert!(
+            !handler_group
+                .contains_pusher(&sender_id(Role::Datanode, 42))
+                .await
+        );
+    }
+
+    #[tokio::test]
+    async fn test_heartbeat_session_init_forwards_first_stream_error() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let requests = MockHeartbeatRequestStream::new(vec![Err(Status::internal("boom"))]);
+        // The very first stream item is an error.
+        let session = init_session(requests, tx, None::<MockLeaderStepDown>, handler_group).await;
+        // No session is created, and the original status reaches the response channel unchanged.
+        assert!(session.is_none());
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Internal, status.code());
+        assert_eq!("boom", status.message());
+    }
+
+    #[tokio::test]
+    async fn test_heartbeat_session_init_sends_error_on_missing_header() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let requests = MockHeartbeatRequestStream::new(vec![Ok(HeartbeatRequest::default())]);
+        // A default request is used here as a request without a header.
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await;
+        // No session is created and no pusher is registered.
+        assert!(session.is_none());
+        assert!(
+            !handler_group
+                .contains_pusher(&sender_id(Role::Datanode, 42))
+                .await
+        );
+        // The missing-header error is reported as `InvalidArgument`.
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::InvalidArgument, status.code());
+    }
+
+    #[tokio::test]
+    async fn test_heartbeat_session_init_registers_sender() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests =
+            MockHeartbeatRequestStream::new(vec![Ok(heartbeat_request(Role::Datanode, 42))]);
+        // A well-formed first request from datanode 42.
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await;
+        // The session exists and its pusher is registered under the sender id.
+        assert!(session.is_some());
+        assert!(handler_group.contains_pusher(&sender_id).await);
+        // The response to the first request carries a heartbeat config.
+        let response = recv_response(&mut rx).await.unwrap();
+        assert!(response.heartbeat_config.is_some());
+    }
+
+    /// The mock stream holds only the initial heartbeat; once `session.run()` returns,
+    /// the pusher registered during init must no longer be present in the handler group.
+    #[tokio::test]
+    async fn test_heartbeat_session_run_deregisters_sender_on_stream_close() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests =
+            MockHeartbeatRequestStream::new(vec![Ok(heartbeat_request(Role::Datanode, 42))]);
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        // Drain the response produced by init so later reads only see `run()` output.
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+
+    /// After a successful init, a stream error (`Unavailable`, "temporary") seen by
+    /// `session.run()` must be forwarded on the response channel and the pusher deregistered.
+    #[tokio::test]
+    async fn test_heartbeat_session_run_forwards_stream_error_after_init() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests = MockHeartbeatRequestStream::new(vec![
+            Ok(heartbeat_request(Role::Datanode, 42)),
+            Err(Status::unavailable("temporary")),
+        ]);
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        // Drain the response produced by init so the next read is the forwarded error.
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Unavailable, status.code());
+        assert_eq!("temporary", status.message());
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+
+    /// With a leader watcher that reports `LeaderStepDownEvent::StepDown`, `session.run()`
+    /// must send an `Aborted` status to the client and deregister the pusher.
+    #[tokio::test]
+    async fn test_heartbeat_session_leader_step_down_sends_aborted_and_deregisters() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        // NOTE(review): `pending_after` presumably keeps the stream pending after the listed
+        // items so that only the step-down event can end `run()` — confirm against the mock.
+        let requests = MockHeartbeatRequestStream::pending_after(vec![Ok(heartbeat_request(
+            Role::Datanode,
+            42,
+        ))]);
+        let session = init_session(
+            requests,
+            tx,
+            Some(MockLeaderStepDown::new(LeaderStepDownEvent::StepDown)),
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        // Drain the response produced by init so the next read is the step-down status.
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Aborted, status.code());
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+
+    /// With a leader watcher that reports `LeaderStepDownEvent::Closed`, `session.run()`
+    /// must send an `Unavailable` status to the client and deregister the pusher.
+    #[tokio::test]
+    async fn test_heartbeat_session_leader_watcher_closed_sends_unavailable_and_deregisters() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        // NOTE(review): `pending_after` presumably keeps the stream pending after the listed
+        // items so that only the watcher event can end `run()` — confirm against the mock.
+        let requests = MockHeartbeatRequestStream::pending_after(vec![Ok(heartbeat_request(
+            Role::Datanode,
+            42,
+        ))]);
+        let session = init_session(
+            requests,
+            tx,
+            Some(MockLeaderStepDown::new(LeaderStepDownEvent::Closed)),
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        // Drain the response produced by init so the next read is the watcher-closed status.
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Unavailable, status.code());
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+
     #[tokio::test]
     async fn test_ask_leader() {
         let kv_backend = Arc::new(MemoryKvBackend::new());
diff --git a/src/meta-srv/src/service/mailbox.rs b/src/meta-srv/src/service/mailbox.rs
index 8b37eeaad5..f3fbdcbffc 100644
--- a/src/meta-srv/src/service/mailbox.rs
+++ b/src/meta-srv/src/service/mailbox.rs
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 use std::fmt::{Display, Formatter};
-use std::ops::Range;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -69,20 +68,11 @@ pub enum BroadcastChannel {
 }
 
 impl BroadcastChannel {
-    pub(crate) fn pusher_range(&self) -> Range<String> {
+    pub(crate) fn role(&self) -> Role {
         match self {
-            BroadcastChannel::Datanode => Range {
-                start: format!("{}-", Role::Datanode as i32),
-                end: format!("{}-", Role::Frontend as i32),
-            },
-            BroadcastChannel::Frontend => Range {
-                start: format!("{}-", Role::Frontend as i32),
-                end: format!("{}-", Role::Flownode as i32),
-            },
-            BroadcastChannel::Flownode => Range {
-                start: format!("{}-", Role::Flownode as i32),
-                end: format!("{}-", Role::Flownode as i32 + 1),
-            },
+            BroadcastChannel::Datanode => Role::Datanode,
+            BroadcastChannel::Frontend => Role::Frontend,
+            BroadcastChannel::Flownode => Role::Flownode,
         }
     }
 }
@@ -207,9 +197,6 @@ pub trait Mailbox: Send + Sync {
     async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()>;
 
     async fn on_recv(&self, id: MessageId, maybe_msg: Result<MailboxMessage>) -> Result<()>;
-
-    /// Reset all pushers of the mailbox.
-    async fn reset(&self);
 }
 
 #[cfg(test)]
@@ -222,19 +209,10 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_channel_pusher_range() {
-        assert_eq!(
-            BroadcastChannel::Datanode.pusher_range(),
-            ("0-".to_string().."1-".to_string())
-        );
-        assert_eq!(
-            BroadcastChannel::Frontend.pusher_range(),
-            ("1-".to_string().."2-".to_string())
-        );
-        assert_eq!(
-            BroadcastChannel::Flownode.pusher_range(),
-            ("2-".to_string().."3-".to_string())
-        );
+    fn test_broadcast_channel_role() {
+        assert_eq!(BroadcastChannel::Datanode.role(), Role::Datanode);
+        assert_eq!(BroadcastChannel::Frontend.role(), Role::Frontend);
+        assert_eq!(BroadcastChannel::Flownode.role(), Role::Flownode);
     }
 
     #[tokio::test]
diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs
index 438a8fcad3..103b15e596 100644
--- a/src/metric-engine/src/engine/put.rs
+++ b/src/metric-engine/src/engine/put.rs
@@ -31,7 +31,7 @@ use store_api::storage::{RegionId, TableId};
 
 use crate::engine::MetricEngineInner;
 use crate::error::{
-    ColumnNotFoundSnafu, CreateDefaultSnafu, ForbiddenPhysicalAlterSnafu, InvalidRequestSnafu,
+    ColumnNotFoundSnafu, CreateDefaultSnafu, ForbiddenPhysicalWriteSnafu, InvalidRequestSnafu,
     LogicalRegionNotFoundSnafu, PhysicalRegionNotFoundSnafu, Result, UnexpectedRequestSnafu,
     UnsupportedRegionRequestSnafu,
 };
@@ -55,7 +55,7 @@ impl MetricEngineInner {
             );
             FORBIDDEN_OPERATION_COUNT.inc();
 
-            ForbiddenPhysicalAlterSnafu.fail()
+            ForbiddenPhysicalWriteSnafu.fail()
         } else {
             self.put_logical_region(region_id, request).await
         }
@@ -86,18 +86,31 @@ impl MetricEngineInner {
 
         // Fast path: single request, no batching overhead
         if len == 1 {
-            let (logical_id, req) = requests.into_iter().next().unwrap();
-            return self.put_logical_region(logical_id, req).await;
+            let (region_id, req) = requests.into_iter().next().unwrap();
+            let is_putting_physical_region =
+                self.state.read().unwrap().exist_physical_region(region_id);
+            if is_putting_physical_region {
+                FORBIDDEN_OPERATION_COUNT.inc();
+                return ForbiddenPhysicalWriteSnafu.fail();
+            }
+
+            return self.put_logical_region(region_id, req).await;
         }
 
         let mut requests_per_physical: HashMap<RegionId, Vec<(RegionId, RegionPutRequest)>> =
             HashMap::new();
-        for (logical_region_id, request) in requests {
-            let physical_region_id = self.find_physical_region_id(logical_region_id)?;
+        for (region_id, request) in requests {
+            let is_putting_physical_region =
+                self.state.read().unwrap().exist_physical_region(region_id);
+            if is_putting_physical_region {
+                FORBIDDEN_OPERATION_COUNT.inc();
+                return ForbiddenPhysicalWriteSnafu.fail();
+            }
+            let physical_region_id = self.find_physical_region_id(region_id)?;
             requests_per_physical
                 .entry(physical_region_id)
                 .or_default()
-                .push((logical_region_id, request));
+                .push((region_id, request));
         }
 
         let mut total_affected_rows: AffectedRows = 0;
@@ -1226,6 +1239,84 @@ mod tests {
         assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 0);
     }
 
+    /// A batch holding a single request that targets the physical region must be rejected
+    /// with `ForbiddenPhysicalWrite`.
+    #[tokio::test]
+    async fn test_batch_write_single_physical_region_forbidden() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let engine = env.metric();
+
+        let put_request = RegionPutRequest {
+            rows: Rows {
+                schema: test_util::row_schema_with_tags(&["job"]),
+                rows: test_util::build_rows(1, 1),
+            },
+            hint: None,
+            partition_expr_version: None,
+        };
+        let batch = vec![(env.default_physical_region_id(), put_request)];
+
+        let result = engine.inner.put_regions_batch(batch.into_iter()).await;
+
+        assert!(matches!(
+            result.unwrap_err(),
+            crate::error::Error::ForbiddenPhysicalWrite { .. }
+        ));
+    }
+
+    /// A multi-request batch must be rejected with `ForbiddenPhysicalWrite` when one of its
+    /// entries targets the physical region, even though another entry targets a logical one.
+    #[tokio::test]
+    async fn test_batch_write_physical_region_forbidden() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let engine = env.metric();
+
+        let schema = test_util::row_schema_with_tags(&["job"]);
+        // Both requests are built the same way; only the target region differs.
+        let put_request = |columns| RegionPutRequest {
+            rows: Rows {
+                schema: columns,
+                rows: test_util::build_rows(1, 1),
+            },
+            hint: None,
+            partition_expr_version: None,
+        };
+        let batch = vec![
+            (env.default_logical_region_id(), put_request(schema.clone())),
+            (env.default_physical_region_id(), put_request(schema)),
+        ];
+
+        let result = engine.inner.put_regions_batch(batch.into_iter()).await;
+
+        assert!(matches!(
+            result.unwrap_err(),
+            crate::error::Error::ForbiddenPhysicalWrite { .. }
+        ));
+    }
+
     #[tokio::test]
     async fn test_batch_write_single_request_fast_path() {
         let env = TestEnv::new().await;
diff --git a/src/metric-engine/src/error.rs b/src/metric-engine/src/error.rs
index 284b1b0298..f01737e764 100644
--- a/src/metric-engine/src/error.rs
+++ b/src/metric-engine/src/error.rs
@@ -254,6 +254,12 @@ pub enum Error {
         location: Location,
     },
 
+    #[snafu(display("Write request to physical region is forbidden"))]
+    ForbiddenPhysicalWrite {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
     #[snafu(display("Invalid region metadata"))]
     InvalidMetadata {
         source: store_api::metadata::MetadataError,
@@ -411,6 +417,7 @@ impl ErrorExt for Error {
             | CreateDefault { .. } => StatusCode::InvalidArguments,
 
             ForbiddenPhysicalAlter { .. }
+            | ForbiddenPhysicalWrite { .. }
             | UnsupportedRegionRequest { .. }
             | MissingFiles { .. } => StatusCode::Unsupported,
 
diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml
index dde1e44ea1..3e3a18a24d 100644
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -53,6 +53,7 @@ dashmap.workspace = true
 dotenv.workspace = true
 either.workspace = true
 futures.workspace = true
+humantime.workspace = true
 humantime-serde.workspace = true
 index.workspace = true
 itertools.workspace = true
diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs
index c05db5b989..eee1cfae0a 100644
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -30,6 +30,7 @@ use std::sync::Arc;
 use bytes::Bytes;
 use common_base::readable_size::ReadableSize;
 use common_telemetry::warn;
+use datatypes::arrow::buffer::BooleanBuffer;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::value::Value;
 use datatypes::vectors::VectorRef;
@@ -38,8 +39,10 @@ use index::result_cache::IndexResultCache;
 use moka::notification::RemovalCause;
 use moka::sync::Cache;
 use object_store::ObjectStore;
+use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
 use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData};
 use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
+use smallvec::SmallVec;
 use snafu::{OptionExt, ResultExt};
 use store_api::metadata::RegionMetadataRef;
 use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
@@ -74,6 +77,8 @@ const INDEX_TYPE: &str = "index";
 const SELECTOR_RESULT_TYPE: &str = "selector_result";
 /// Metrics type key for range scan result cache.
 const RANGE_RESULT_TYPE: &str = "range_result";
+/// Metrics type key for prefilter result cache.
+const PREFILTER_RESULT_TYPE: &str = "prefilter_result";
 const RANGE_RESULT_CONCAT_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(512);
 const RANGE_RESULT_CONCAT_MEMORY_PERMIT: ReadableSize = ReadableSize::kb(1);
 
@@ -274,6 +279,117 @@ fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> Parq
         .build()
 }
 
+/// Maps a moka [`RemovalCause`] to the static label value used for the cause
+/// dimension of the `CACHE_EVICTION` metric.
+fn removal_cause_str(cause: RemovalCause) -> &'static str {
+    use RemovalCause as Cause;
+
+    // Kept exhaustive on purpose so a new upstream variant fails to compile here.
+    match cause {
+        Cause::Expired => "expired",
+        Cause::Explicit => "explicit",
+        Cause::Replaced => "replaced",
+        Cause::Size => "size",
+    }
+}
+
+/// Hashable copy of a parquet `RowSelector`, stored inside [`PrefilterKey`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct PrefilterRowSelector {
+    /// Copied from `RowSelector::row_count`.
+    row_count: usize,
+    /// Copied from `RowSelector::skip`.
+    skip: bool,
+}
+
+// Prefilter cache keys hash a snapshot of the upstream row selection, but
+// `parquet::arrow::arrow_reader::RowSelector` has no `Hash` impl. This conversion
+// copies the two fields that define a selector into a local hashable mirror.
+// TODO(yingwen): Remove this mirror if upstream `RowSelector` implements `Hash`.
+impl From<&RowSelector> for PrefilterRowSelector {
+    fn from(selector: &RowSelector) -> Self {
+        let (row_count, skip) = (selector.row_count, selector.skip);
+        Self { row_count, skip }
+    }
+}
+
+/// Key for a cached prefilter result.
+///
+/// NOTE(review): the derived `PartialEq`/`Eq`/`Hash` cover every field, including
+/// `mem_usage`.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct PrefilterKey {
+    /// File id component of the key.
+    file_id: FileId,
+    /// Row group index component of the key.
+    row_group_idx: u32,
+    /// Snapshot of the row selection as built by `PrefilterKey::row_selection_snapshot`,
+    /// or `None` when no selection was supplied.
+    row_selection: Option<Arc<Vec<PrefilterRowSelector>>>,
+    /// Schema version component of the key.
+    schema_version: u64,
+    /// Filter expressions in string form.
+    filter_exprs: SmallVec<[String; 1]>,
+    /// Estimated size of this key in bytes, computed once in `PrefilterKey::new`.
+    mem_usage: usize,
+}
+
+impl PrefilterKey {
+    /// Converts an optional parquet row selection into the hashable snapshot stored in
+    /// the key. Returns `None` when `row_selection` is `None`.
+    pub(crate) fn row_selection_snapshot(
+        row_selection: Option<&RowSelection>,
+    ) -> Option<Arc<Vec<PrefilterRowSelector>>> {
+        let selection = row_selection?;
+        let selectors: Vec<_> = selection.iter().map(PrefilterRowSelector::from).collect();
+        Some(Arc::new(selectors))
+    }
+
+    /// Creates a key and precomputes its estimated memory usage in bytes.
+    ///
+    /// The estimate is derived from element counts and string lengths, never from
+    /// allocation capacities. `mem_usage` is part of the derived `PartialEq`/`Hash`, so
+    /// it must be a pure function of the other fields; otherwise two keys with the same
+    /// contents but different allocation slack would not match in the cache.
+    pub(crate) fn new(
+        file_id: FileId,
+        row_group_idx: u32,
+        row_selection: Option<Arc<Vec<PrefilterRowSelector>>>,
+        schema_version: u64,
+        filter_exprs: SmallVec<[String; 1]>,
+    ) -> Self {
+        let row_selection_bytes = row_selection.as_ref().map_or(0, |selection| {
+            selection.len() * mem::size_of::<PrefilterRowSelector>()
+        });
+        // Only a spilled `SmallVec` keeps its `String` headers in a separate buffer;
+        // inline storage is already covered by `size_of::<Self>()`.
+        let spilled_expr_bytes = if filter_exprs.spilled() {
+            filter_exprs.len() * mem::size_of::<String>()
+        } else {
+            0
+        };
+        let expr_bytes = filter_exprs.iter().map(String::len).sum::<usize>();
+        let mem_usage =
+            mem::size_of::<Self>() + row_selection_bytes + spilled_expr_bytes + expr_bytes;
+
+        Self {
+            file_id,
+            row_group_idx,
+            row_selection,
+            schema_version,
+            filter_exprs,
+            mem_usage,
+        }
+    }
+
+    /// Returns the memory estimate computed in [`PrefilterKey::new`].
+    fn mem_usage(&self) -> usize {
+        self.mem_usage
+    }
+}
+
+/// In-memory cache mapping a [`PrefilterKey`] to its `Arc<BooleanBuffer>` result.
+type PrefilterResultCache = Cache<PrefilterKey, Arc<BooleanBuffer>>;
+
+/// Builds a prefilter result cache with the given maximum capacity, weighed by
+/// `prefilter_result_cache_weight`. On every removal it subtracts the entry weight from
+/// `CACHE_BYTES` and increments `CACHE_EVICTION` for the removal cause.
+fn new_prefilter_result_cache(capacity: u64) -> PrefilterResultCache {
+    let on_removal = |key: Arc<PrefilterKey>, value: Arc<BooleanBuffer>, cause: RemovalCause| {
+        let weight = prefilter_result_cache_weight(&key, &value);
+        CACHE_BYTES
+            .with_label_values(&[PREFILTER_RESULT_TYPE])
+            .sub(weight.into());
+        CACHE_EVICTION
+            .with_label_values(&[PREFILTER_RESULT_TYPE, removal_cause_str(cause)])
+            .inc();
+    };
+
+    Cache::builder()
+        .max_capacity(capacity)
+        .weigher(prefilter_result_cache_weight)
+        .eviction_listener(on_removal)
+        .build()
+}
+
+/// Returns the weight of one prefilter cache entry in bytes: the key's memory estimate
+/// plus the size of the `BooleanBuffer` struct plus the length of `v.values()`.
+///
+/// The sum saturates at `u32::MAX` instead of wrapping, so an oversized entry can never
+/// be reported to the cache (or to `CACHE_BYTES`) as a small one.
+fn prefilter_result_cache_weight(k: &PrefilterKey, v: &Arc<BooleanBuffer>) -> u32 {
+    let bytes = k.mem_usage() + mem::size_of::<BooleanBuffer>() + v.values().len();
+    u32::try_from(bytes).unwrap_or(u32::MAX)
+}
+
 /// Cache strategies that may only enable a subset of caches.
 #[derive(Clone)]
 pub enum CacheStrategy {
@@ -358,6 +474,23 @@ impl CacheStrategy {
         }
     }
 
+    /// Looks up a prefilter result via [CacheManager::get_prefilter_result()].
+    /// Only [CacheStrategy::EnableAll] consults the cache; [CacheStrategy::Compaction]
+    /// and [CacheStrategy::Disabled] always return `None`.
+    pub(crate) fn get_prefilter_result(&self, key: &PrefilterKey) -> Option<Arc<BooleanBuffer>> {
+        let CacheStrategy::EnableAll(cache_manager) = self else {
+            return None;
+        };
+        cache_manager.get_prefilter_result(key)
+    }
+
+    /// Stores a prefilter result via [CacheManager::put_prefilter_result()].
+    /// Only [CacheStrategy::EnableAll] writes to the cache; the other strategies ignore
+    /// the call.
+    pub(crate) fn put_prefilter_result(&self, key: PrefilterKey, result: Arc<BooleanBuffer>) {
+        match self {
+            CacheStrategy::EnableAll(cache_manager) => {
+                cache_manager.put_prefilter_result(key, result);
+            }
+            CacheStrategy::Compaction(_) | CacheStrategy::Disabled => {}
+        }
+    }
+
     /// Calls [CacheManager::remove_parquet_meta_data()].
     pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) {
         match self {
@@ -610,6 +743,8 @@ pub struct CacheManager {
     range_result_memory_limiter: Arc<RangeResultMemoryLimiter>,
     /// Cache for index result.
     index_result_cache: Option<IndexResultCache>,
+    /// Cache for prefilter result.
+    prefilter_result_cache: Option<PrefilterResultCache>,
 }
 
 pub type CacheManagerRef = Arc<CacheManager>;
@@ -908,6 +1043,21 @@ impl CacheManager {
     pub(crate) fn index_result_cache(&self) -> Option<&IndexResultCache> {
         self.index_result_cache.as_ref()
     }
+
+    /// Gets the cached prefilter result for `key` and records the lookup through
+    /// `update_hit_miss`. Returns `None` when the prefilter result cache is not configured.
+    pub(crate) fn get_prefilter_result(&self, key: &PrefilterKey) -> Option<Arc<BooleanBuffer>> {
+        let cache = self.prefilter_result_cache.as_ref()?;
+        update_hit_miss(cache.get(key), PREFILTER_RESULT_TYPE)
+    }
+
+    /// Inserts a prefilter result and adds its weight to the `CACHE_BYTES` metric.
+    /// Does nothing when the prefilter result cache is not configured.
+    pub(crate) fn put_prefilter_result(&self, key: PrefilterKey, result: Arc<BooleanBuffer>) {
+        let Some(cache) = self.prefilter_result_cache.as_ref() else {
+            return;
+        };
+
+        let weight = prefilter_result_cache_weight(&key, &result);
+        CACHE_BYTES
+            .with_label_values(&[PREFILTER_RESULT_TYPE])
+            .add(weight.into());
+        cache.insert(key, result);
+    }
 }
 
 /// Increases selector cache miss metrics.
@@ -930,6 +1080,7 @@ pub struct CacheManagerBuilder {
     index_content_size: u64,
     index_content_page_size: u64,
     index_result_cache_size: u64,
+    prefilter_result_cache_size: u64,
     puffin_metadata_size: u64,
     write_cache: Option<WriteCacheRef>,
     selector_result_cache_size: u64,
@@ -985,6 +1136,12 @@ impl CacheManagerBuilder {
         self
     }
 
+    /// Sets cache size for prefilter result, in bytes.
+    /// `build()` only creates the prefilter result cache when this value is non-zero.
+    pub fn prefilter_result_cache_size(mut self, bytes: u64) -> Self {
+        self.prefilter_result_cache_size = bytes;
+        self
+    }
+
     /// Sets cache size for puffin metadata.
     pub fn puffin_metadata_size(mut self, bytes: u64) -> Self {
         self.puffin_metadata_size = bytes;
@@ -1005,15 +1162,6 @@ impl CacheManagerBuilder {
 
     /// Builds the [CacheManager].
     pub fn build(self) -> CacheManager {
-        fn to_str(cause: RemovalCause) -> &'static str {
-            match cause {
-                RemovalCause::Expired => "expired",
-                RemovalCause::Explicit => "explicit",
-                RemovalCause::Replaced => "replaced",
-                RemovalCause::Size => "size",
-            }
-        }
-
         let sst_meta_cache = (self.sst_meta_cache_size != 0).then(|| {
             Cache::builder()
                 .max_capacity(self.sst_meta_cache_size)
@@ -1024,7 +1172,7 @@ impl CacheManagerBuilder {
                         .with_label_values(&[SST_META_TYPE])
                         .sub(size.into());
                     CACHE_EVICTION
-                        .with_label_values(&[SST_META_TYPE, to_str(cause)])
+                        .with_label_values(&[SST_META_TYPE, removal_cause_str(cause)])
                         .inc();
                 })
                 .build()
@@ -1039,7 +1187,7 @@ impl CacheManagerBuilder {
                         .with_label_values(&[VECTOR_TYPE])
                         .sub(size.into());
                     CACHE_EVICTION
-                        .with_label_values(&[VECTOR_TYPE, to_str(cause)])
+                        .with_label_values(&[VECTOR_TYPE, removal_cause_str(cause)])
                         .inc();
                 })
                 .build()
@@ -1052,7 +1200,7 @@ impl CacheManagerBuilder {
                     let size = page_cache_weight(&k, &v);
                     CACHE_BYTES.with_label_values(&[PAGE_TYPE]).sub(size.into());
                     CACHE_EVICTION
-                        .with_label_values(&[PAGE_TYPE, to_str(cause)])
+                        .with_label_values(&[PAGE_TYPE, removal_cause_str(cause)])
                         .inc();
                 })
                 .build()
@@ -1073,6 +1221,8 @@ impl CacheManagerBuilder {
             .then(|| Arc::new(VectorIndexCache::new(self.index_content_size)));
         let index_result_cache = (self.index_result_cache_size != 0)
             .then(|| IndexResultCache::new(self.index_result_cache_size));
+        let prefilter_result_cache = (self.prefilter_result_cache_size != 0)
+            .then(|| new_prefilter_result_cache(self.prefilter_result_cache_size));
         let puffin_metadata_cache =
             PuffinMetadataCache::new(self.puffin_metadata_size, &CACHE_BYTES);
         let selector_result_cache = (self.selector_result_cache_size != 0).then(|| {
@@ -1085,7 +1235,7 @@ impl CacheManagerBuilder {
                         .with_label_values(&[SELECTOR_RESULT_TYPE])
                         .sub(size.into());
                     CACHE_EVICTION
-                        .with_label_values(&[SELECTOR_RESULT_TYPE, to_str(cause)])
+                        .with_label_values(&[SELECTOR_RESULT_TYPE, removal_cause_str(cause)])
                         .inc();
                 })
                 .build()
@@ -1100,7 +1250,7 @@ impl CacheManagerBuilder {
                         .with_label_values(&[RANGE_RESULT_TYPE])
                         .sub(size.into());
                     CACHE_EVICTION
-                        .with_label_values(&[RANGE_RESULT_TYPE, to_str(cause)])
+                        .with_label_values(&[RANGE_RESULT_TYPE, removal_cause_str(cause)])
                         .inc();
                 })
                 .build()
@@ -1123,6 +1273,7 @@ impl CacheManagerBuilder {
                 RANGE_RESULT_CONCAT_MEMORY_PERMIT.as_bytes() as usize,
             )),
             index_result_cache,
+            prefilter_result_cache,
         }
     }
 }
@@ -1551,6 +1702,127 @@ mod tests {
         assert!(cache.get_selector_result(&key).is_some());
     }
 
+    /// Covers the prefilter result cache through `CacheManager` and every `CacheStrategy`.
+    ///
+    /// The "put is ignored" checks use `other_key`, which no earlier step inserts, so
+    /// they would fail if `Compaction` or `Disabled` ever wrote to the cache.
+    #[test]
+    fn test_prefilter_result_cache() {
+        let file_id = FileId::random();
+        let new_key = |row_group_idx: u32| {
+            PrefilterKey::new(
+                file_id,
+                row_group_idx,
+                None,
+                1,
+                SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()]),
+            )
+        };
+        let key = new_key(0);
+        let other_key = new_key(1);
+        let selection = Arc::new(BooleanBuffer::new_set(3));
+
+        // A manager built without a prefilter cache size silently drops results.
+        let disabled = CacheManager::builder().build();
+        disabled.put_prefilter_result(key.clone(), selection.clone());
+        assert!(disabled.get_prefilter_result(&key).is_none());
+
+        let cache = Arc::new(
+            CacheManager::builder()
+                .prefilter_result_cache_size(1000)
+                .build(),
+        );
+        assert!(cache.get_prefilter_result(&key).is_none());
+        cache.put_prefilter_result(key.clone(), selection.clone());
+        assert_eq!(
+            cache.get_prefilter_result(&key).unwrap().as_ref(),
+            selection.as_ref()
+        );
+
+        let enable_all = CacheStrategy::EnableAll(cache.clone());
+        assert!(enable_all.get_prefilter_result(&key).is_some());
+
+        // Compaction neither reads an existing entry nor writes a new one.
+        let compaction = CacheStrategy::Compaction(cache.clone());
+        assert!(compaction.get_prefilter_result(&key).is_none());
+        compaction.put_prefilter_result(other_key.clone(), selection.clone());
+        assert!(cache.get_prefilter_result(&other_key).is_none());
+
+        // Disabled neither reads nor writes.
+        let disabled_strategy = CacheStrategy::Disabled;
+        assert!(disabled_strategy.get_prefilter_result(&key).is_none());
+        disabled_strategy.put_prefilter_result(other_key.clone(), selection.clone());
+        assert!(cache.get_prefilter_result(&other_key).is_none());
+
+        // The original entry is untouched by the ignored puts.
+        assert!(cache.get_prefilter_result(&key).is_some());
+
+        // EnableAll does write through to the underlying manager.
+        enable_all.put_prefilter_result(other_key.clone(), selection);
+        assert!(cache.get_prefilter_result(&other_key).is_some());
+    }
+
+    /// Changing any single dimension of a `PrefilterKey` (file id, row group index, row
+    /// selection, schema version, or filter expressions) must yield a key that differs
+    /// from the base key.
+    #[test]
+    fn test_prefilter_key_distinguishes_dimensions() {
+        /// Builds a key from borrowed parts so each assertion only spells out what varies.
+        fn make_key(
+            file_id: FileId,
+            row_group_idx: u32,
+            row_selection: Option<&Arc<Vec<PrefilterRowSelector>>>,
+            schema_version: u64,
+            exprs: &[&str],
+        ) -> PrefilterKey {
+            let filter_exprs =
+                SmallVec::from_vec(exprs.iter().map(|expr| expr.to_string()).collect());
+            PrefilterKey::new(
+                file_id,
+                row_group_idx,
+                row_selection.cloned(),
+                schema_version,
+                filter_exprs,
+            )
+        }
+
+        const EXPR_A: &str = "tag_0 IN ([a])";
+
+        let file_id = FileId::random();
+        let selection = PrefilterKey::row_selection_snapshot(Some(&RowSelection::from(vec![
+            RowSelector::skip(1),
+            RowSelector::select(3),
+        ])));
+        let other_selection =
+            PrefilterKey::row_selection_snapshot(Some(&RowSelection::from(vec![
+                RowSelector::skip(2),
+                RowSelector::select(2),
+            ])));
+        let base = make_key(file_id, 0, selection.as_ref(), 1, &[EXPR_A]);
+
+        // Different file id.
+        assert_ne!(
+            base,
+            make_key(FileId::random(), 0, selection.as_ref(), 1, &[EXPR_A])
+        );
+        // Different row group index.
+        assert_ne!(base, make_key(file_id, 1, selection.as_ref(), 1, &[EXPR_A]));
+        // Different row selection.
+        assert_ne!(
+            base,
+            make_key(file_id, 0, other_selection.as_ref(), 1, &[EXPR_A])
+        );
+        // Different filter expression.
+        assert_ne!(
+            base,
+            make_key(file_id, 0, selection.as_ref(), 1, &["tag_0 IN ([b])"])
+        );
+        // Different schema version.
+        assert_ne!(base, make_key(file_id, 0, selection.as_ref(), 2, &[EXPR_A]));
+        // Additional filter expression.
+        let pk_group = make_key(
+            file_id,
+            0,
+            selection.as_ref(),
+            1,
+            &[EXPR_A, "tag_1 IN ([x])"],
+        );
+        assert_ne!(base, pk_group);
+    }
+
     #[test]
     fn test_range_result_cache() {
         let cache = Arc::new(
diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs
index 98e97fca85..3a85ff1c65 100644
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -117,6 +117,8 @@ pub struct MitoConfig {
     pub selector_result_cache_size: ReadableSize,
     /// Cache size for flat range scan results. Setting it to 0 to disable the cache.
     pub range_result_cache_size: ReadableSize,
+    /// Cache size for prefilter results. Setting it to 0 to disable the cache.
+    pub prefilter_result_cache_size: ReadableSize,
     /// Whether to enable the write cache.
     pub enable_write_cache: bool,
     /// File system path for write cache dir's root, defaults to `{data_home}`.
@@ -202,6 +204,7 @@ impl Default for MitoConfig {
             page_cache_size: ReadableSize::mb(512),
             selector_result_cache_size: ReadableSize::mb(512),
             range_result_cache_size: ReadableSize::mb(512),
+            prefilter_result_cache_size: ReadableSize::mb(128),
             enable_write_cache: false,
             write_cache_path: String::new(),
             write_cache_size: ReadableSize::gb(5),
@@ -330,6 +333,8 @@ impl MitoConfig {
         self.page_cache_size = page_cache_size;
         self.selector_result_cache_size = mem_cache_size;
         self.range_result_cache_size = mem_cache_size;
+        // Use a smaller cache size because prefilter results are usually small.
+        self.prefilter_result_cache_size = sst_meta_cache_size;
 
         self.index.adjust_buffer_and_cache_size(sys_memory);
     }
diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs
index 8aa154b085..41215e1ab6 100644
--- a/src/mito2/src/engine.rs
+++ b/src/mito2/src/engine.rs
@@ -118,6 +118,7 @@ use store_api::region_engine::{
     RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
     SyncRegionFromRequest, SyncRegionFromResponse,
 };
+use store_api::region_info::RegionInfoEntry;
 use store_api::region_request::{
     AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
 };
@@ -613,8 +614,10 @@ impl MitoEngine {
                             return Vec::new();
                         }
                     };
+                    // The index file path is derived from the physical file owner. After
+                    // repartition, `entry.region_id` is only the referring region.
                     let region_index_id = RegionIndexId::new(
-                        RegionFileId::new(entry.region_id, file_id),
+                        RegionFileId::new(entry.origin_region_id, file_id),
                         index_version,
                     );
                     let context = IndexEntryContext {
@@ -654,6 +657,16 @@ impl MitoEngine {
         results
     }
 
+    /// Lists region info entries of all regions in the engine.
+    pub async fn all_region_infos(&self) -> Vec<RegionInfoEntry> {
+        let node_id = self.inner.workers.file_ref_manager().node_id();
+        self.inner
+            .workers
+            .all_regions()
+            .map(|region| region.region_info_entry(node_id))
+            .collect()
+    }
+
     /// Lists all SSTs from the storage layer of all regions in the engine.
     pub fn all_ssts_from_storage(&self) -> impl Stream<Item = Result<StorageSstEntry>> {
         let node_id = self.inner.workers.file_ref_manager().node_id();
diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs
index f256f88694..e1e462f692 100644
--- a/src/mito2/src/engine/basic_test.rs
+++ b/src/mito2/src/engine/basic_test.rs
@@ -978,6 +978,58 @@ async fn test_list_ssts_with_format(
     assert_eq!(debug_format, expected_storage_ssts, "{}", debug_format);
 }
 
+#[tokio::test]
+async fn test_all_region_infos() {
+    let mut env = TestEnv::with_prefix("all-region-infos").await;
+    let engine = env
+        .create_engine(MitoConfig {
+            default_flat_format: true,
+            ..Default::default()
+        })
+        .await;
+
+    let region_id = RegionId::new(1024, 7);
+    let request = CreateRequestBuilder::new().build();
+    let column_schemas = rows_schema(&request);
+    engine
+        .handle_request(region_id, RegionRequest::Create(request))
+        .await
+        .unwrap();
+
+    let rows = Rows {
+        schema: column_schemas,
+        rows: build_rows_for_key("region-info", 0, 3, 0),
+    };
+    put_rows(&engine, region_id, rows).await;
+    engine
+        .handle_request(
+            region_id,
+            RegionRequest::Flush(RegionFlushRequest::default()),
+        )
+        .await
+        .unwrap();
+
+    let entries = engine.all_region_infos().await;
+    let entry = entries
+        .iter()
+        .find(|entry| entry.region_id == region_id)
+        .expect("region info entry should exist");
+
+    assert_eq!(region_id.as_u64(), entry.region_id.as_u64());
+    assert_eq!(region_id.table_id(), entry.table_id);
+    assert_eq!(region_id.region_number(), entry.region_number);
+    assert_eq!(region_id.region_group(), entry.region_group);
+    assert_eq!(region_id.region_sequence(), entry.region_sequence);
+    assert!(!entry.state.is_empty());
+    assert_eq!("Leader", entry.role);
+    assert!(entry.writable);
+    assert_eq!(3, entry.committed_sequence);
+    assert_eq!(Some(3), entry.flushed_sequence);
+    assert!(entry.manifest_version > 0);
+    assert!(serde_json::from_str::<serde_json::Value>(&entry.region_options).is_ok());
+    assert_eq!("flat", entry.sst_format);
+}
+
 #[tokio::test]
 async fn test_all_index_metas_list_all_types() {
     test_all_index_metas_list_all_types_with_format(false, r#"
diff --git a/src/mito2/src/engine/scan_test.rs b/src/mito2/src/engine/scan_test.rs
index 75fbc848ea..e4010940fa 100644
--- a/src/mito2/src/engine/scan_test.rs
+++ b/src/mito2/src/engine/scan_test.rs
@@ -100,7 +100,7 @@ async fn test_incremental_query_stale_error() {
             region_id,
             ScanRequest {
                 memtable_min_sequence: Some(min_readable_seq),
-                sst_min_sequence: Some(u64::MAX),
+                skip_sst_files: true,
                 ..Default::default()
             },
         )
diff --git a/src/mito2/src/read/range_cache.rs b/src/mito2/src/read/range_cache.rs
index 7d1010205d..af212ab23e 100644
--- a/src/mito2/src/read/range_cache.rs
+++ b/src/mito2/src/read/range_cache.rs
@@ -19,13 +19,20 @@ use std::sync::Arc;
 
 use async_stream::try_stream;
 use common_telemetry::warn;
+use common_time::Timestamp;
+use common_time::range::TimestampRange;
+use common_time::timestamp::TimeUnit;
+use datafusion_expr::expr::Expr;
+use datafusion_expr::{Between, BinaryExpr, Operator};
 use datatypes::arrow::compute::concat_batches;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::prelude::ConcreteDataType;
+use datatypes::value::scalar_value_to_timestamp;
 use futures::TryStreamExt;
 use snafu::ResultExt;
 use store_api::region_engine::PartitionRange;
 use store_api::storage::{FileId, RegionId, TimeSeriesRowSelector};
+use table::predicate::is_string_timestamp_literal;
 use tokio::sync::{mpsc, oneshot};
 
 use crate::cache::CacheStrategy;
@@ -139,7 +146,6 @@ impl ScanRequestFingerprint {
             .unwrap_or(&[])
     }
 
-    #[allow(dead_code)]
     pub(crate) fn without_time_filters(&self) -> Self {
         Self {
             inner: Arc::clone(&self.inner),
@@ -266,6 +272,177 @@ pub(crate) fn collect_partition_range_row_groups(
     }
 }
 
+/// Returns the timestamp range where all time-only predicates are guaranteed true.
+///
+/// Per-expression ranges are intersected; empty input yields `Some(min_to_max)`
+/// (vacuously true everywhere). Returns `None` on any unsupported shape: `OR`,
+/// `NOT`, `IN`, other operators, operands other than the `ts_col_name` column
+/// and a non-string timestamp literal (in either order), an `=` literal not
+/// exactly representable in the column unit, or overflow adjusting a bound.
+///
+/// This is intentionally stricter than `extract_time_range_from_expr` in
+/// `table::predicate`: lower bounds round up and upper bounds round down. If a
+/// partition's file-time range is contained by the returned range, every row in
+/// that partition satisfies the original time predicates.
+///
+/// `IsNull`/`IsNotNull` on the time index are not routed into `time_filters`
+/// today. If that changes, handle them here before stripping time filters from
+/// the cache key.
+pub(crate) fn implied_time_range_from_exprs(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    exprs: &[&Expr],
+) -> Option<TimestampRange> {
+    let mut acc = TimestampRange::min_to_max();
+    for expr in exprs {
+        let r = implied_time_range_from_expr(ts_col_name, ts_col_unit, expr)?;
+        acc = acc.and(&r);
+    }
+    Some(acc)
+}
+
+fn implied_time_range_from_expr(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    expr: &Expr,
+) -> Option<TimestampRange> {
+    match expr {
+        Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op {
+            Operator::And => {
+                let l = implied_time_range_from_expr(ts_col_name, ts_col_unit, left)?;
+                let r = implied_time_range_from_expr(ts_col_name, ts_col_unit, right)?;
+                Some(l.and(&r))
+            }
+            Operator::Eq | Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => {
+                implied_from_cmp(ts_col_name, ts_col_unit, left, *op, right)
+            }
+            // `OR` describes a union of ranges, which one `TimestampRange` cannot
+            // represent exactly (`TimestampRange::or` only yields the loose covering
+            // span), so we refuse it. Any other operator is unsupported.
+            _ => None,
+        },
+        Expr::Between(Between {
+            expr,
+            negated,
+            low,
+            high,
+        }) => {
+            if *negated {
+                return None;
+            }
+            implied_from_between(ts_col_name, ts_col_unit, expr, low, high)
+        }
+        // Includes `IsNull`, `IsNotNull`, `Not`, `InList`, function calls, etc.
+        _ => None,
+    }
+}
+
+fn match_ts_column_literal<'a>(
+    ts_col_name: &str,
+    left: &'a Expr,
+    right: &'a Expr,
+) -> Option<(Timestamp, bool)> {
+    let (col, scalar, reverse) = match (left, right) {
+        (Expr::Column(c), Expr::Literal(s, _)) => (c, s, false),
+        (Expr::Literal(s, _), Expr::Column(c)) => (c, s, true),
+        _ => return None,
+    };
+    if col.name != ts_col_name {
+        return None;
+    }
+    // Reject string literals: their conversion needs a timezone we do not have,
+    // and the existing extractor in `table::predicate` rejects them too.
+    if is_string_timestamp_literal(scalar) {
+        return None;
+    }
+    scalar_value_to_timestamp(scalar, None).map(|t| (t, reverse))
+}
+
+fn implied_from_cmp(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    left: &Expr,
+    op: Operator,
+    right: &Expr,
+) -> Option<TimestampRange> {
+    let (ts, reverse) = match_ts_column_literal(ts_col_name, left, right)?;
+    // Normalize to "column OP literal".
+    let op = if reverse {
+        match op {
+            Operator::Lt => Operator::Gt,
+            Operator::LtEq => Operator::GtEq,
+            Operator::Gt => Operator::Lt,
+            Operator::GtEq => Operator::LtEq,
+            Operator::Eq => Operator::Eq,
+            _ => return None,
+        }
+    } else {
+        op
+    };
+
+    match op {
+        Operator::GtEq => {
+            // ts >= L. Round the lower bound up in the column unit.
+            let b = ts.convert_to_ceil(ts_col_unit)?;
+            Some(TimestampRange::from_start(b))
+        }
+        Operator::Gt => {
+            // ts > L. floor(L) + 1 is the tight lower bound in the column unit.
+            let v = ts.convert_to(ts_col_unit)?.value().checked_add(1)?;
+            Some(TimestampRange::from_start(Timestamp::new(v, ts_col_unit)))
+        }
+        Operator::LtEq => {
+            // ts <= L. Round the upper bound down in the column unit.
+            let b = ts.convert_to(ts_col_unit)?;
+            Some(TimestampRange::until_end(b, true))
+        }
+        Operator::Lt => {
+            // ts < L. `ts < ceil(L)` is the tight bound: equal to `ts < L` when
+            // L is exactly representable, and `ts <= floor(L)` otherwise.
+            let b = ts.convert_to_ceil(ts_col_unit)?;
+            Some(TimestampRange::until_end(b, false))
+        }
+        Operator::Eq => {
+            // ts = L. Only provable when L is exactly representable.
+            let f = ts.convert_to(ts_col_unit)?;
+            let c = ts.convert_to_ceil(ts_col_unit)?;
+            if f.value() != c.value() {
+                return None;
+            }
+            Some(TimestampRange::single(f))
+        }
+        _ => None,
+    }
+}
+
+fn implied_from_between(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    expr: &Expr,
+    low: &Expr,
+    high: &Expr,
+) -> Option<TimestampRange> {
+    let Expr::Column(c) = expr else {
+        return None;
+    };
+    if c.name != ts_col_name {
+        return None;
+    }
+    let (low_s, high_s) = match (low, high) {
+        (Expr::Literal(l, _), Expr::Literal(h, _)) => (l, h),
+        _ => return None,
+    };
+    if is_string_timestamp_literal(low_s) || is_string_timestamp_literal(high_s) {
+        return None;
+    }
+    let low_ts = scalar_value_to_timestamp(low_s, None)?;
+    let high_ts = scalar_value_to_timestamp(high_s, None)?;
+    // BETWEEN low AND high is equivalent to ts >= low AND ts <= high.
+    let lo = low_ts.convert_to_ceil(ts_col_unit)?;
+    let hi = high_ts.convert_to(ts_col_unit)?;
+    Some(TimestampRange::new_inclusive(Some(lo), Some(hi)))
+}
+
 /// Builds a cache key for the given partition range if it is eligible for caching.
 pub(crate) fn build_range_cache_key(
     stream_ctx: &StreamContext,
@@ -292,17 +469,36 @@ pub(crate) fn build_range_cache_key(
         return None;
     }
 
-    // TODO(yingwen): We used to call `fingerprint.without_time_filters()` when the query's
-    // `TimestampRange` fully covered the partition's `FileTimeRange`, so different queries that
-    // all enclosed the same partition could share a cache entry. The cover check turned out to
-    // be too coarse: it returned true in cases where the dropped time predicates would still
-    // have excluded rows, so the cache served results that should have been filtered. Reviving
-    // the optimization needs a per-predicate implication check that walks each time-only `Expr`
-    // (recursing through AND/OR/NOT) and proves the predicate is satisfied for every timestamp
-    // inside the partition's `FileTimeRange` — not the looser "does `extract_time_range_from_expr`
-    // return a range that covers the partition" used previously. Until then, always carry the
-    // full fingerprint so cache reuse stays correct.
-    let scan = fingerprint.clone();
+    // If the implied range covers this partition's `FileTimeRange`, drop
+    // time-only predicates from the cache key so that queries with different
+    // but equally-covering time bounds share an entry. `None` means some
+    // time-only predicate had an unsupported shape (e.g. `OR`), so we keep
+    // them in the key.
+    let range_meta = &stream_ctx.ranges[part_range.identifier];
+    let (file_min, file_max) = range_meta.time_range;
+    let covers = match &stream_ctx.scan_implied_time_range {
+        // An empty implied range can never cover a non-empty file range, so
+        // short-circuit. We also skip the unit asserts because
+        // `TimestampRange::empty()` uses `Timestamp::default()` (millisecond),
+        // which would falsely trip the asserts for non-ms time index columns.
+        Some(implied) if !implied.is_empty() => {
+            // The `contains` check is sound only when `file_min`/`file_max`
+            // share the implied range's unit (the time index column's unit).
+            // Mito stores time index values in that unit; assert to catch any
+            // future drift.
+            if let Some(ts) = implied.start().as_ref().or(implied.end().as_ref()) {
+                assert_eq!(file_min.unit(), ts.unit());
+                assert_eq!(file_max.unit(), ts.unit());
+            }
+            implied.contains(&file_min) && implied.contains(&file_max)
+        }
+        _ => false,
+    };
+    let scan = if covers {
+        fingerprint.without_time_filters()
+    } else {
+        fingerprint.clone()
+    };
 
     Some(RangeScanCacheKey {
         region_id: stream_ctx.input.region_metadata().region_id,
@@ -722,11 +918,16 @@ mod tests {
             num_rows: 10,
         };
         let partition_range = range_meta.new_partition_range(0);
-        let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input);
+        let (scan_fingerprint, scan_implied_time_range) =
+            match crate::read::scan_region::build_scan_fingerprint(&input) {
+                Some(b) => (Some(b.fingerprint), b.implied_time_range),
+                None => (None, None),
+            };
         let stream_ctx = StreamContext {
             input,
             ranges: vec![range_meta],
             scan_fingerprint,
+            scan_implied_time_range,
             query_start: Instant::now(),
         };
 
@@ -770,57 +971,312 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn preserves_time_filters_when_query_covers_partition_range() {
-        assert_range_cache_filters(
-            vec![
-                col("ts").gt_eq(ts_lit(1000)),
-                col("ts").lt(ts_lit(2001)),
-                col("ts").is_not_null(),
-                col("k0").eq(lit("foo")),
-            ],
-            TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
-            (
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2000),
-            ),
-            vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
-            vec![col("ts").gt_eq(ts_lit(1000)), col("ts").lt(ts_lit(2001))],
-        )
-        .await;
+    async fn range_cache_time_filter_key_cases() {
+        let partition = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        struct Case {
+            filters: Vec<Expr>,
+            query_time_range: Option<TimestampRange>,
+            expected_filters: Vec<Expr>,
+            expected_time_filters: Vec<Expr>,
+        }
+
+        // Time filters are stripped only when their implied range fully covers
+        // the partition's file-time range. `is_not_null(ts)` stays in regular
+        // filters because it is not routed into `time_filters`.
+        for case in [
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(1000)),
+                    col("ts").lt(ts_lit(2001)),
+                    col("ts").is_not_null(),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(500)),
+                    col("ts").lt(ts_lit(3000)),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(500, 3000, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(1000)),
+                    col("ts").lt_eq(ts_lit(2000)),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(1000, 2001, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![
+                    col("ts").between(ts_lit(1000), ts_lit(2000)),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(1000, 2001, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![col("ts").gt_eq(ts_lit(1200)), col("k0").eq(lit("foo"))],
+                query_time_range: TimestampRange::with_unit(1200, 2001, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![col("ts").gt_eq(ts_lit(1200))],
+            },
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(1500)),
+                    col("ts").is_not_null(),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: None,
+                expected_filters: vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
+                expected_time_filters: vec![col("ts").gt_eq(ts_lit(1500))],
+            },
+        ] {
+            assert_range_cache_filters(
+                case.filters,
+                case.query_time_range,
+                partition,
+                case.expected_filters,
+                case.expected_time_filters,
+            )
+            .await;
+        }
     }
 
     #[tokio::test]
-    async fn preserves_time_filters_when_query_does_not_cover_partition_range() {
-        assert_range_cache_filters(
-            vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))],
-            TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond),
-            (
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2000),
-            ),
-            vec![col("k0").eq(lit("foo"))],
-            vec![col("ts").gt_eq(ts_lit(1000))],
+    async fn two_distinct_queries_share_cache_key_when_both_cover() {
+        let partition_range = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        let (ctx_a, part_a) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(500)),
+                col("ts").lt(ts_lit(3000)),
+                col("k0").eq(lit("foo")),
+            ],
+            TimestampRange::with_unit(500, 3000, TimeUnit::Millisecond),
+            partition_range,
         )
         .await;
+        let (ctx_b, part_b) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(100)),
+                col("ts").lt(ts_lit(5000)),
+                col("k0").eq(lit("foo")),
+            ],
+            TimestampRange::with_unit(100, 5000, TimeUnit::Millisecond),
+            partition_range,
+        )
+        .await;
+
+        let key_a = build_range_cache_key(&ctx_a, &part_a).unwrap();
+        let key_b = build_range_cache_key(&ctx_b, &part_b).unwrap();
+        assert_eq!(key_a.scan, key_b.scan);
+        assert!(key_a.scan.time_filters().is_empty());
     }
 
     #[tokio::test]
-    async fn preserves_time_filters_when_query_has_no_time_range_limit() {
-        assert_range_cache_filters(
-            vec![
-                col("ts").gt_eq(ts_lit(1000)),
-                col("ts").is_not_null(),
-                col("k0").eq(lit("foo")),
-            ],
+    async fn disables_optimization_on_or_clause() {
+        let partition_range = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        let or_a = col("ts").gt_eq(ts_lit(1000)).or(col("ts").lt(ts_lit(500)));
+        let or_b = col("ts").gt_eq(ts_lit(900)).or(col("ts").lt(ts_lit(400)));
+
+        let (ctx_a, part_a) = new_stream_context(
+            vec![or_a.clone(), col("k0").eq(lit("foo"))],
             None,
-            (
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2000),
-            ),
-            vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
-            vec![col("ts").gt_eq(ts_lit(1000))],
+            partition_range,
         )
         .await;
+        let (ctx_b, part_b) = new_stream_context(
+            vec![or_b.clone(), col("k0").eq(lit("foo"))],
+            None,
+            partition_range,
+        )
+        .await;
+
+        assert!(ctx_a.scan_implied_time_range.is_none());
+        let key_a = build_range_cache_key(&ctx_a, &part_a).unwrap();
+        let key_b = build_range_cache_key(&ctx_b, &part_b).unwrap();
+        assert_ne!(key_a.scan, key_b.scan);
+        assert_eq!(
+            key_a.scan.time_filters(),
+            normalized_exprs([or_a]).as_slice()
+        );
+    }
+
+    #[tokio::test]
+    async fn empty_implied_range_does_not_panic_on_non_ms_file_range() {
+        // Simulates contradictory time predicates by forcing an empty implied
+        // range. Its sentinel timestamps use `Timestamp::default()` (ms), so
+        // without the `is_empty()` short-circuit the unit asserts would panic
+        // against the nanosecond `range_meta.time_range` set below.
+        let partition = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        let (mut ctx, part_range) = new_stream_context(
+            vec![col("ts").gt_eq(ts_lit(1500)), col("k0").eq(lit("foo"))],
+            TimestampRange::with_unit(1500, 3000, TimeUnit::Millisecond),
+            partition,
+        )
+        .await;
+
+        ctx.scan_implied_time_range = Some(TimestampRange::empty());
+        ctx.ranges[0].time_range = (
+            Timestamp::new(1_000_000_000, TimeUnit::Nanosecond),
+            Timestamp::new(2_000_000_000, TimeUnit::Nanosecond),
+        );
+
+        let key = build_range_cache_key(&ctx, &part_range).unwrap();
+        // Empty implied range cannot cover, so time filters stay in the key.
+        assert!(!key.scan.time_filters().is_empty());
+    }
+
+    fn ms_ts(v: i64) -> Timestamp {
+        Timestamp::new_millisecond(v)
+    }
+
+    fn implied_ms(expr: Expr) -> Option<TimestampRange> {
+        implied_time_range_from_exprs("ts", TimeUnit::Millisecond, &[&expr])
+    }
+
+    #[test]
+    fn implied_time_range_supported_exprs() {
+        for (expr, expected) in [
+            (
+                col("ts").gt_eq(ts_lit(1000)),
+                Some(TimestampRange::from_start(ms_ts(1000))),
+            ),
+            (
+                col("ts").gt(ts_lit(1000)),
+                Some(TimestampRange::from_start(ms_ts(1001))),
+            ),
+            (
+                col("ts").lt_eq(ts_lit(2000)),
+                Some(TimestampRange::until_end(ms_ts(2000), true)),
+            ),
+            (
+                col("ts").lt(ts_lit(2000)),
+                Some(TimestampRange::until_end(ms_ts(2000), false)),
+            ),
+            (
+                col("ts").eq(ts_lit(1500)),
+                Some(TimestampRange::single(ms_ts(1500))),
+            ),
+            (
+                ts_lit(1000).lt_eq(col("ts")),
+                Some(TimestampRange::from_start(ms_ts(1000))),
+            ),
+            (
+                col("ts").between(ts_lit(1000), ts_lit(2000)),
+                Some(TimestampRange::new_inclusive(
+                    Some(ms_ts(1000)),
+                    Some(ms_ts(2000)),
+                )),
+            ),
+            (
+                col("ts")
+                    .gt_eq(ts_lit(1000))
+                    .and(col("ts").lt(ts_lit(2000))),
+                TimestampRange::with_unit(1000, 2000, TimeUnit::Millisecond),
+            ),
+            (
+                col("ts")
+                    .gt_eq(ts_lit(1000))
+                    .and(col("ts").lt(ts_lit(5000)))
+                    .and(col("ts").lt_eq(ts_lit(3000))),
+                TimestampRange::with_unit(1000, 3001, TimeUnit::Millisecond),
+            ),
+        ] {
+            assert_eq!(implied_ms(expr), expected);
+        }
+
+        assert_eq!(
+            implied_time_range_from_exprs("ts", TimeUnit::Millisecond, &[]),
+            Some(TimestampRange::min_to_max())
+        );
+    }
+
+    #[test]
+    fn implied_time_range_unsupported_exprs() {
+        let not_between = Expr::Between(Between {
+            expr: Box::new(col("ts")),
+            negated: true,
+            low: Box::new(ts_lit(1000)),
+            high: Box::new(ts_lit(2000)),
+        });
+
+        for expr in [
+            not_between,
+            col("ts").gt_eq(ts_lit(1000)).or(col("ts").lt(ts_lit(500))),
+            Expr::Not(Box::new(col("ts").gt_eq(ts_lit(1000)))),
+            col("ts").in_list(vec![ts_lit(1000), ts_lit(2000)], false),
+            col("ts").gt_eq(col("other")),
+            col("other_ts").gt_eq(ts_lit(1000)),
+        ] {
+            assert!(implied_ms(expr).is_none());
+        }
+    }
+
+    #[test]
+    fn implied_time_range_unit_conversion() {
+        let second_1 = lit(ScalarValue::TimestampSecond(Some(1), None));
+        let ns_1500 = lit(ScalarValue::TimestampNanosecond(Some(1_500_000_000), None));
+        let ns_1500_5 = lit(ScalarValue::TimestampNanosecond(Some(1_500_500_000), None));
+
+        for (expr, expected) in [
+            (
+                col("ts").gt_eq(second_1.clone()),
+                Some(TimestampRange::from_start(ms_ts(1000))),
+            ),
+            (
+                col("ts").lt_eq(second_1),
+                Some(TimestampRange::until_end(ms_ts(1000), true)),
+            ),
+            (
+                col("ts").eq(ns_1500),
+                Some(TimestampRange::single(ms_ts(1500))),
+            ),
+            (col("ts").eq(ns_1500_5.clone()), None),
+            (
+                col("ts").gt_eq(ns_1500_5.clone()),
+                Some(TimestampRange::from_start(ms_ts(1501))),
+            ),
+            (
+                col("ts").lt_eq(ns_1500_5.clone()),
+                Some(TimestampRange::until_end(ms_ts(1500), true)),
+            ),
+            (
+                col("ts").gt(ns_1500_5.clone()),
+                Some(TimestampRange::from_start(ms_ts(1501))),
+            ),
+            (
+                col("ts").lt(ns_1500_5),
+                Some(TimestampRange::until_end(ms_ts(1501), false)),
+            ),
+        ] {
+            assert_eq!(implied_ms(expr), expected);
+        }
     }
 
     #[test]
diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs
index f002c08bd7..baf6964c27 100644
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -33,6 +33,7 @@ use datafusion_expr::Expr;
 use datafusion_expr::utils::expr_to_columns;
 use datatypes::schema::ext::ArrowSchemaExt;
 use futures::StreamExt;
+use itertools::Itertools;
 use partition::expr::PartitionExpr;
 use smallvec::SmallVec;
 use snafu::ResultExt;
@@ -57,7 +58,7 @@ use crate::metrics::READ_SST_COUNT;
 use crate::read::compat::{self, FlatCompatBatch};
 use crate::read::flat_projection::FlatProjectionMapper;
 use crate::read::range::{FileRangeBuilder, MemRangeBuilder, RangeMeta, RowGroupIndex};
-use crate::read::range_cache::ScanRequestFingerprint;
+use crate::read::range_cache::{ScanRequestFingerprint, implied_time_range_from_exprs};
 use crate::read::read_columns::{
     ReadColumns, merge, read_columns_from_predicate, read_columns_from_projection,
 };
@@ -436,7 +437,16 @@ impl ScanRegion {
             .schema
             .arrow_schema()
             .has_json_extension_field()
-            .then_some(&self.request.json_type_hint);
+            .then_some(&self.request.json_type_hint)
+            .inspect(|json_type_hint| {
+                debug!(
+                    "Concretized JSON type: {{{}}}",
+                    json_type_hint
+                        .iter()
+                        .map(|(k, v)| format!("{}: {}", k, v))
+                        .join(", ")
+                );
+            });
         let mapper = FlatProjectionMapper::new_with_read_columns(
             &self.version.metadata,
             projection,
@@ -446,26 +456,28 @@ impl ScanRegion {
 
         let ssts = &self.version.ssts;
         let mut files = Vec::new();
-        for level in ssts.levels() {
-            for file in level.files.values() {
-                let exceed_min_sequence = match (sst_min_sequence, file.meta_ref().sequence) {
-                    (Some(min_sequence), Some(file_sequence)) => file_sequence > min_sequence,
-                    // If the file's sequence is None (or actually is zero), it could mean the file
-                    // is generated and added to the region "directly". In this case, its data should
-                    // be considered as fresh as the memtable. So its sequence is treated greater than
-                    // the min_sequence, whatever the value of min_sequence is. Hence the default
-                    // "true" in this arm.
-                    (Some(_), None) => true,
-                    (None, _) => true,
-                };
+        if !self.request.skip_sst_files {
+            for level in ssts.levels() {
+                for file in level.files.values() {
+                    let exceed_min_sequence = match (sst_min_sequence, file.meta_ref().sequence) {
+                        (Some(min_sequence), Some(file_sequence)) => file_sequence > min_sequence,
+                        // If the file's sequence is None (or actually is zero), it could mean the file
+                        // is generated and added to the region "directly". In this case, its data should
+                        // be considered as fresh as the memtable. So its sequence is treated greater than
+                        // the min_sequence, whatever the value of min_sequence is. Hence the default
+                        // "true" in this arm.
+                        (Some(_), None) => true,
+                        (None, _) => true,
+                    };
 
-                // Finds SST files in range.
-                if exceed_min_sequence && file_in_range(file, &time_range) {
-                    files.push(file.clone());
+                    // Finds SST files in range.
+                    if exceed_min_sequence && file_in_range(file, &time_range) {
+                        files.push(file.clone());
+                    }
+                    // There is no need to check and prune for file's sequence here as the sequence number is usually very new,
+                    // unless the timing is too good, or the sequence number wouldn't be in the file,
+                    // and the batch will be filtered out by the tree reader anyway.
                 }
-                // There is no need to check and prune for file's sequence here as the sequence number is usually very new,
-                // unless the timing is too good, or the sequence number wouldn't be in file.
-                // and the batch will be filtered out by tree reader anyway.
             }
         }
 
@@ -569,7 +581,9 @@ impl ScanRegion {
             .with_vector_index_k(vector_index_k);
 
         #[cfg(feature = "enterprise")]
-        let input = if let Some(provider) = self.extension_range_provider {
+        let input = if !self.request.skip_sst_files
+            && let Some(provider) = self.extension_range_provider
+        {
             let ranges = provider
                 .find_extension_ranges(self.version.flushed_sequence, time_range, &self.request)
                 .await?;
@@ -1299,9 +1313,21 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
     }
 }
 
-/// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible
+/// Output of [build_scan_fingerprint]: the cache fingerprint plus the derived
+/// implied time range used to decide whether the cache key can drop the time
+/// predicates for a given partition (see `build_range_cache_key`).
+pub(crate) struct ScanFingerprintBundle {
+    pub(crate) fingerprint: ScanRequestFingerprint,
+    /// `Some(r)` = all time-only predicates are guaranteed true on `r` (in the
+    /// column's `TimeUnit`).
+    /// `None`    = at least one time-only predicate could not be proven (e.g.
+    /// `OR`), so the cache-key optimization is disabled for this scan.
+    pub(crate) implied_time_range: Option<TimestampRange>,
+}
+
+/// Builds a [ScanFingerprintBundle] from a [ScanInput] if the scan is eligible
 /// for partition range caching.
-pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
+pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanFingerprintBundle> {
     let eligible = !input.compaction
         && !input.files.is_empty()
         && matches!(input.cache_strategy, CacheStrategy::EnableAll(_));
@@ -1334,7 +1360,7 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
         .unwrap_or_default();
 
     let mut filters = Vec::new();
-    let mut time_filters = Vec::new();
+    let mut time_only_exprs: Vec<&Expr> = Vec::new();
     let mut has_tag_filter = false;
     let mut columns = HashSet::new();
 
@@ -1350,20 +1376,17 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
             _ => false,
         };
 
-        // TODO(yingwen): The split between `time_filters` and `filters` is currently inert
-        // because `build_range_cache_key()` always keeps both in the cache key. We used to
-        // strip `time_filters` when the query's `TimestampRange` covered the partition's
-        // `FileTimeRange`, but `extract_time_range_from_expr` is not precise enough to prove
-        // a time predicate is implied by that range (it can return a wider range than the
-        // predicate, and it does not analyze AND/OR shapes), which let the cache reuse rows
-        // that should have been filtered. Reviving the optimization needs a per-predicate
-        // implication check that walks each time-only `Expr` (recursing through AND/OR/NOT)
-        // and proves the predicate holds for every timestamp inside the partition's
-        // `FileTimeRange`; until then both buckets land in the fingerprint.
+        // Route time-only exprs that the legacy extractor recognizes into
+        // `time_only_exprs` so the implication walker
+        // (`implied_time_range_from_exprs`, called below) can attempt to drop
+        // them from the cache key when the partition's `FileTimeRange` is fully
+        // covered, then stringify them into the fingerprint's `time_filters`
+        // bucket. Time-only exprs that the extractor doesn't recognize stay in
+        // `filters` and never get stripped — conservatively correct.
         if is_time_only
             && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some()
         {
-            time_filters.push(expr.to_string());
+            time_only_exprs.push(expr);
         } else {
             filters.push(expr.to_string());
         }
@@ -1374,31 +1397,38 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
         return None;
     }
 
+    let implied_time_range =
+        implied_time_range_from_exprs(&time_index_name, ts_col_unit, &time_only_exprs);
+    let mut time_filters: Vec<String> = time_only_exprs.iter().map(|e| e.to_string()).collect();
+
     // Ensure the filters are sorted for consistent fingerprinting.
     filters.sort_unstable();
     time_filters.sort_unstable();
     let read_columns = input.read_cols.clone();
-    Some(
-        crate::read::range_cache::ScanRequestFingerprintBuilder {
-            read_column_types: read_columns
-                .column_ids_iter()
-                .map(|id| {
-                    metadata
-                        .column_by_id(id)
-                        .map(|col| col.column_schema.data_type.clone())
-                })
-                .collect(),
-            read_columns,
-            filters,
-            time_filters,
-            series_row_selector: input.series_row_selector,
-            append_mode: input.append_mode,
-            filter_deleted: input.filter_deleted,
-            merge_mode: input.merge_mode,
-            partition_expr_version: metadata.partition_expr_version,
-        }
-        .build(),
-    )
+    let fingerprint = crate::read::range_cache::ScanRequestFingerprintBuilder {
+        read_column_types: read_columns
+            .column_ids_iter()
+            .map(|id| {
+                metadata
+                    .column_by_id(id)
+                    .map(|col| col.column_schema.data_type.clone())
+            })
+            .collect(),
+        read_columns,
+        filters,
+        time_filters,
+        series_row_selector: input.series_row_selector,
+        append_mode: input.append_mode,
+        filter_deleted: input.filter_deleted,
+        merge_mode: input.merge_mode,
+        partition_expr_version: metadata.partition_expr_version,
+    }
+    .build();
+
+    Some(ScanFingerprintBundle {
+        fingerprint,
+        implied_time_range,
+    })
 }
 
 /// Context shared by different streams from a scanner.
@@ -1412,6 +1442,13 @@ pub struct StreamContext {
     /// `None` when the scan is not eligible for caching.
     #[allow(dead_code)]
     pub(crate) scan_fingerprint: Option<ScanRequestFingerprint>,
+    /// Implied range of every time-only predicate, in the time index column's
+    /// `TimeUnit`. Used by `build_range_cache_key` to decide whether the
+    /// partition's `FileTimeRange` is fully covered (allowing `time_filters`
+    /// to be stripped from the cache key). `None` when caching is ineligible
+    /// or when the implication walker bailed on an unsupported shape (e.g.
+    /// `OR`).
+    pub(crate) scan_implied_time_range: Option<TimestampRange>,
 
     // Metrics:
     /// The start time of the query.
@@ -1424,12 +1461,16 @@ impl StreamContext {
         let query_start = input.query_start.unwrap_or_else(Instant::now);
         let ranges = RangeMeta::seq_scan_ranges(&input);
         READ_SST_COUNT.observe(input.num_files() as f64);
-        let scan_fingerprint = build_scan_fingerprint(&input);
+        let (scan_fingerprint, scan_implied_time_range) = match build_scan_fingerprint(&input) {
+            Some(b) => (Some(b.fingerprint), b.implied_time_range),
+            None => (None, None),
+        };
 
         Self {
             input,
             ranges,
             scan_fingerprint,
+            scan_implied_time_range,
             query_start,
         }
     }
@@ -1439,12 +1480,16 @@ impl StreamContext {
         let query_start = input.query_start.unwrap_or_else(Instant::now);
         let ranges = RangeMeta::unordered_scan_ranges(&input);
         READ_SST_COUNT.observe(input.num_files() as f64);
-        let scan_fingerprint = build_scan_fingerprint(&input);
+        let (scan_fingerprint, scan_implied_time_range) = match build_scan_fingerprint(&input) {
+            Some(b) => (Some(b.fingerprint), b.implied_time_range),
+            None => (None, None),
+        };
 
         Self {
             input,
             ranges,
             scan_fingerprint,
+            scan_implied_time_range,
             query_start,
         }
     }
@@ -1841,7 +1886,7 @@ mod tests {
             partition_expr_version: 0,
         }
         .build();
-        assert_eq!(expected, fingerprint);
+        assert_eq!(expected, fingerprint.fingerprint);
     }
 
     #[tokio::test]
@@ -1914,7 +1959,7 @@ mod tests {
             partition_expr_version: metadata.partition_expr_version,
         }
         .build();
-        assert_eq!(expected, fingerprint);
+        assert_eq!(expected, fingerprint.fingerprint);
         assert_ne!(0, metadata.partition_expr_version);
     }
 
diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs
index 5b7e46b0c1..4cf2179430 100644
--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -1375,6 +1375,7 @@ mod split_tests {
             input,
             ranges: vec![],
             scan_fingerprint: None,
+            scan_implied_time_range: None,
             query_start: std::time::Instant::now(),
         }
     }
@@ -1661,7 +1662,7 @@ where
     }
 }
 
-/// Splits the batch by timestamps.
+/// Splits the batch so each sub-batch has strictly increasing timestamps.
 ///
 /// # Panics
 /// Panics if the timestamp array is invalid.
@@ -1682,7 +1683,7 @@ pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeq
     offsets.push(0);
     let values = ts_values.values();
     for (i, &value) in values.iter().take(batch_rows - 1).enumerate() {
-        if value > values[i + 1] {
+        if value >= values[i + 1] {
             offsets.push(i + 1);
         }
     }
@@ -1755,6 +1756,7 @@ mod tests {
             input,
             ranges: Vec::new(),
             scan_fingerprint: None,
+            scan_implied_time_range: None,
             query_start: Instant::now(),
         })
     }
@@ -1949,4 +1951,76 @@ mod tests {
             compute_average_batch_size(std::iter::empty())
         );
     }
+
+    /// Builds a flat-format record batch whose time index column holds `timestamps`.
+    fn flat_ts_batch(timestamps: &[i64]) -> RecordBatch {
+        use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
+        use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+
+        let num_rows = timestamps.len();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "ts",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new("pk", DataType::UInt64, false),
+            Field::new("seq", DataType::UInt64, false),
+            Field::new("op", DataType::UInt8, false),
+        ]));
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())),
+                Arc::new(UInt64Array::from(vec![0u64; num_rows])),
+                Arc::new(UInt64Array::from(vec![0u64; num_rows])),
+                Arc::new(UInt8Array::from(vec![0u8; num_rows])),
+            ],
+        )
+        .unwrap()
+    }
+
+    /// Splits `timestamps` and returns the time index values of each sub-batch.
+    fn split_ts(timestamps: &[i64]) -> Vec<Vec<i64>> {
+        let mut batches = VecDeque::new();
+        split_record_batch(flat_ts_batch(timestamps), &mut batches);
+        batches
+            .iter()
+            .map(|batch| {
+                let pos = time_index_column_index(batch.num_columns());
+                let (values, _) = timestamp_array_to_primitive(batch.column(pos)).unwrap();
+                values.values().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn test_split_record_batch_on_equal_timestamps() {
+        // Splits on both decreasing and equal timestamps.
+        assert_eq!(
+            split_ts(&[1, 2, 2, 3, 1]),
+            vec![vec![1, 2], vec![2, 3], vec![1]]
+        );
+        // A run of equal timestamps yields single-row sub-batches.
+        assert_eq!(split_ts(&[5, 5, 5]), vec![vec![5], vec![5], vec![5]]);
+        // Equal-ts run at the leading edge of the batch.
+        assert_eq!(split_ts(&[5, 5, 1, 2]), vec![vec![5], vec![5], vec![1, 2]]);
+        // Equal-ts run at the trailing edge of the batch.
+        assert_eq!(split_ts(&[1, 2, 5, 5]), vec![vec![1, 2, 5], vec![5]]);
+    }
+
+    #[test]
+    fn test_split_record_batch_on_decreasing_timestamps() {
+        assert_eq!(split_ts(&[1, 2, 3]), vec![vec![1, 2, 3]]);
+        assert_eq!(split_ts(&[1, 3, 2, 4]), vec![vec![1, 3], vec![2, 4]]);
+    }
+
+    #[test]
+    fn test_split_record_batch_empty_and_single_row() {
+        let mut batches = VecDeque::new();
+        split_record_batch(flat_ts_batch(&[]), &mut batches);
+        assert!(batches.is_empty());
+
+        assert_eq!(split_ts(&[42]), vec![vec![42]]);
+    }
 }
diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs
index c85599bf58..9d214caed3 100644
--- a/src/mito2/src/region.rs
+++ b/src/mito2/src/region.rs
@@ -37,6 +37,7 @@ use store_api::metadata::RegionMetadataRef;
 use store_api::region_engine::{
     RegionManifestInfo, RegionRole, RegionStatistic, SettableRegionRoleState,
 };
+use store_api::region_info::RegionInfoEntry;
 use store_api::region_request::{PathType, StagingPartitionDirective};
 use store_api::sst_entry::ManifestSstEntry;
 use store_api::storage::{FileId, RegionId, SequenceNumber};
@@ -111,6 +112,22 @@ impl RegionRoleState {
             RegionRoleState::Follower => None,
         }
     }
+
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            RegionRoleState::Follower => "Follower",
+            RegionRoleState::Leader(RegionLeaderState::Writable) => "Leader(Writable)",
+            RegionRoleState::Leader(RegionLeaderState::Staging) => "Leader(Staging)",
+            RegionRoleState::Leader(RegionLeaderState::EnteringStaging) => {
+                "Leader(EnteringStaging)"
+            }
+            RegionRoleState::Leader(RegionLeaderState::Altering) => "Leader(Altering)",
+            RegionRoleState::Leader(RegionLeaderState::Dropping) => "Leader(Dropping)",
+            RegionRoleState::Leader(RegionLeaderState::Truncating) => "Leader(Truncating)",
+            RegionRoleState::Leader(RegionLeaderState::Editing) => "Leader(Editing)",
+            RegionRoleState::Leader(RegionLeaderState::Downgrading) => "Leader(Downgrading)",
+        }
+    }
 }
 
 /// Metadata and runtime status of a region.
@@ -584,14 +601,14 @@ impl MitoRegion {
         let memtables = &version.memtables;
         let memtable_usage = (memtables.mutable_usage() + memtables.immutables_usage()) as u64;
 
-        let sst_usage = version.ssts.sst_usage();
-        let index_usage = version.ssts.index_usage();
+        let sst_usage = version.ssts.owned_sst_usage(self.region_id);
+        let index_usage = version.ssts.owned_index_usage(self.region_id);
         let flushed_entry_id = version.flushed_entry_id;
 
         let wal_usage = self.estimated_wal_usage(memtable_usage);
         let manifest_usage = self.stats.total_manifest_size();
-        let num_rows = version.ssts.num_rows() + version.memtables.num_rows();
-        let num_files = version.ssts.num_files();
+        let num_rows = version.ssts.owned_num_rows(self.region_id) + version.memtables.num_rows();
+        let num_files = version.ssts.owned_num_files(self.region_id);
         let manifest_version = self.stats.manifest_version();
         let file_removed_cnt = self.stats.file_removed_cnt();
 
@@ -648,6 +665,41 @@ impl MitoRegion {
         self.access_layer.clone()
     }
 
+    /// Returns the region info entry of the region.
+    pub(crate) fn region_info_entry(&self, node_id: Option<u64>) -> RegionInfoEntry {
+        let region_id = self.region_id;
+        let version = self.version();
+        let state = self.state();
+        let role = self.region_role();
+        let region_options = serde_json::to_string(&version.options)
+            .unwrap_or_else(|err| serde_json::json!({ "error": err.to_string() }).to_string());
+        let sst_format = match version.options.sst_format.unwrap_or_default() {
+            crate::sst::FormatType::PrimaryKey => "primary_key",
+            crate::sst::FormatType::Flat => "flat",
+        }
+        .to_string();
+
+        RegionInfoEntry {
+            region_id,
+            table_id: region_id.table_id(),
+            region_number: region_id.region_number(),
+            region_group: region_id.region_group(),
+            region_sequence: region_id.region_sequence(),
+            state: state.as_str().to_string(),
+            role: role.to_string(),
+            writable: self.is_writable(),
+            committed_sequence: self.find_committed_sequence(),
+            flushed_sequence: Some(self.flushed_sequence()).filter(|sequence| *sequence > 0),
+            manifest_version: self.stats.manifest_version(),
+            compaction_time_window: version
+                .compaction_time_window
+                .map(|duration| humantime::format_duration(duration).to_string()),
+            region_options,
+            sst_format,
+            node_id,
+        }
+    }
+
     /// Returns the SST entries of the region.
     pub async fn manifest_sst_entries(&self) -> Vec<ManifestSstEntry> {
         let table_dir = self.table_dir();
@@ -1623,6 +1675,23 @@ mod tests {
         assert!(AtomicCell::<RegionRoleState>::is_lock_free());
     }
 
+    #[test]
+    fn test_region_role_state_as_str() {
+        assert_eq!("Follower", RegionRoleState::Follower.as_str());
+        assert_eq!(
+            "Leader(Writable)",
+            RegionRoleState::Leader(RegionLeaderState::Writable).as_str()
+        );
+        assert_eq!(
+            "Leader(Staging)",
+            RegionRoleState::Leader(RegionLeaderState::Staging).as_str()
+        );
+        assert_eq!(
+            "Leader(Downgrading)",
+            RegionRoleState::Leader(RegionLeaderState::Downgrading).as_str()
+        );
+    }
+
     async fn build_test_region(env: &SchedulerEnv) -> MitoRegion {
         let builder = VersionControlBuilder::new();
         let version_control = Arc::new(builder.build());
diff --git a/src/mito2/src/sst/index/bloom_filter/applier.rs b/src/mito2/src/sst/index/bloom_filter/applier.rs
index e36874e97d..b1f6032630 100644
--- a/src/mito2/src/sst/index/bloom_filter/applier.rs
+++ b/src/mito2/src/sst/index/bloom_filter/applier.rs
@@ -21,6 +21,7 @@ use std::time::Instant;
 
 use common_base::range_read::RangeReader;
 use common_telemetry::{tracing, warn};
+use datatypes::data_type::ConcreteDataType;
 use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
 use index::bloom_filter::reader::{
     BloomFilterReadMetrics, BloomFilterReader, BloomFilterReaderImpl,
@@ -30,6 +31,7 @@ use object_store::ObjectStore;
 use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
 use puffin::puffin_manager::{PuffinManager, PuffinReader};
 use snafu::ResultExt;
+use store_api::metadata::RegionMetadataRef;
 use store_api::region_request::PathType;
 use store_api::storage::ColumnId;
 
@@ -38,7 +40,6 @@ use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
 use crate::cache::index::bloom_filter_index::{
     BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader, Tag,
 };
-use crate::cache::index::result_cache::PredicateKey;
 use crate::error::{
     ApplyBloomFilterIndexSnafu, Error, MetadataSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu,
     Result,
@@ -133,10 +134,10 @@ pub struct BloomFilterIndexApplier {
 
     /// Bloom filter predicates.
     /// For each column, the value will be retained only if it contains __all__ predicates.
-    predicates: Arc<BTreeMap<ColumnId, Vec<InListPredicate>>>,
+    default_predicates: Arc<BTreeMap<ColumnId, Vec<InListPredicate>>>,
 
-    /// Predicate key. Used to identify the predicate and fetch result from cache.
-    predicate_key: PredicateKey,
+    /// Expected predicate column types from the latest region metadata.
+    expected_predicate_col_types: BTreeMap<ColumnId, ConcreteDataType>,
 }
 
 impl BloomFilterIndexApplier {
@@ -149,8 +150,9 @@ impl BloomFilterIndexApplier {
         object_store: ObjectStore,
         puffin_manager_factory: PuffinManagerFactory,
         predicates: BTreeMap<ColumnId, Vec<InListPredicate>>,
+        expected_predicate_col_types: BTreeMap<ColumnId, ConcreteDataType>,
     ) -> Self {
-        let predicates = Arc::new(predicates);
+        let default_predicates = Arc::new(predicates);
         Self {
             table_dir,
             path_type,
@@ -159,8 +161,8 @@ impl BloomFilterIndexApplier {
             puffin_manager_factory,
             puffin_metadata_cache: None,
             bloom_filter_index_cache: None,
-            predicate_key: PredicateKey::new_bloom(predicates.clone()),
-            predicates,
+            default_predicates,
+            expected_predicate_col_types,
         }
     }
 
@@ -207,6 +209,7 @@ impl BloomFilterIndexApplier {
         &self,
         file_id: RegionIndexId,
         file_size_hint: Option<u64>,
+        predicates: &BTreeMap<ColumnId, Vec<InListPredicate>>,
         row_groups: impl Iterator<Item = (usize, bool)>,
         mut metrics: Option<&mut BloomFilterIndexApplyMetrics>,
     ) -> Result<Vec<(usize, Vec<Range<usize>>)>> {
@@ -230,7 +233,7 @@ impl BloomFilterIndexApplier {
             .map(|(i, range)| (*i, vec![range.clone()]))
             .collect::<Vec<_>>();
 
-        for (column_id, predicates) in self.predicates.iter() {
+        for (column_id, predicates) in predicates {
             let blob = match self
                 .blob_reader(file_id, *column_id, file_size_hint, metrics.as_deref_mut())
                 .await?
@@ -438,9 +441,46 @@ impl BloomFilterIndexApplier {
         Ok(())
     }
 
-    /// Returns the predicate key.
-    pub fn predicate_key(&self) -> &PredicateKey {
-        &self.predicate_key
+    /// Returns the bloom filter predicates that are compatible with the given SST metadata.
+    ///
+    /// Returns `None` when no compatible predicate remains for this SST.
+    pub fn compatible_predicate_for_sst(
+        &self,
+        sst_metadata: &RegionMetadataRef,
+    ) -> Option<Arc<BTreeMap<ColumnId, Vec<InListPredicate>>>> {
+        let mut has_type_mismatch = false;
+        let mut compatible_col_ids = Vec::new();
+
+        for (col_id, expected) in &self.expected_predicate_col_types {
+            let Some(sst_col) = sst_metadata.column_by_id(*col_id) else {
+                has_type_mismatch = true;
+                continue;
+            };
+
+            if sst_col.column_schema.data_type != *expected {
+                has_type_mismatch = true;
+                continue;
+            }
+
+            compatible_col_ids.push(*col_id);
+        }
+
+        if compatible_col_ids.is_empty() {
+            return None;
+        }
+
+        if !has_type_mismatch {
+            return Some(self.default_predicates.clone());
+        }
+
+        let mut compatible_predicates = BTreeMap::new();
+        for col_id in compatible_col_ids {
+            if let Some(predicates) = self.default_predicates.get(&col_id) {
+                compatible_predicates.insert(col_id, predicates.clone());
+            }
+        }
+
+        Some(Arc::new(compatible_predicates))
     }
 }
 
@@ -456,9 +496,12 @@ fn is_blob_not_found(err: &Error) -> bool {
 
 #[cfg(test)]
 mod tests {
+    use std::collections::BTreeSet;
 
     use datafusion_expr::{Expr, col, lit};
     use futures::future::BoxFuture;
+    use index::Bytes;
+    use object_store::services::Memory;
     use puffin::puffin_manager::PuffinWriter;
     use store_api::metadata::RegionMetadata;
     use store_api::storage::FileId;
@@ -470,6 +513,113 @@ mod tests {
         mock_object_store, mock_region_metadata, new_batch, new_intm_mgr,
     };
 
+    #[tokio::test]
+    async fn test_compatible_predicate_for_sst() {
+        let (_d, puffin_manager_factory) =
+            PuffinManagerFactory::new_for_test_async("test_plan_for_sst_basic_").await;
+        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
+        let table_dir = "table_dir".to_string();
+
+        let predicates = BTreeMap::from_iter([(
+            1,
+            vec![InListPredicate {
+                list: BTreeSet::from_iter([Bytes::from("foo")]),
+            }],
+        )]);
+        let expected_predicate_col_types =
+            BTreeMap::from_iter([(1, ConcreteDataType::string_datatype())]);
+
+        let applier = BloomFilterIndexApplier::new(
+            table_dir,
+            PathType::Bare,
+            object_store,
+            puffin_manager_factory,
+            predicates,
+            expected_predicate_col_types,
+        );
+        let predicates = applier.compatible_predicate_for_sst(&mock_region_metadata());
+        assert!(predicates.is_some());
+    }
+
+    #[tokio::test]
+    async fn test_compatible_predicate_for_sst_type_mismatch() {
+        let (_d, puffin_manager_factory) =
+            PuffinManagerFactory::new_for_test_async("test_plan_for_sst_type_mismatch_").await;
+        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
+        let table_dir = "table_dir".to_string();
+
+        let predicates = BTreeMap::from_iter([(
+            1,
+            vec![InListPredicate {
+                list: BTreeSet::from_iter([Bytes::from("foo")]),
+            }],
+        )]);
+        let expected_predicate_col_types =
+            BTreeMap::from_iter([(1, ConcreteDataType::int64_datatype())]);
+
+        let applier = BloomFilterIndexApplier::new(
+            table_dir,
+            PathType::Bare,
+            object_store,
+            puffin_manager_factory,
+            predicates,
+            expected_predicate_col_types,
+        );
+        let predicates = applier.compatible_predicate_for_sst(&mock_region_metadata());
+        assert!(predicates.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_compatible_predicate_for_sst_partial_type_mismatch() {
+        let (_d, puffin_manager_factory) =
+            PuffinManagerFactory::new_for_test_async("test_plan_for_sst_partial_mismatch_").await;
+        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
+        let table_dir = "table_dir".to_string();
+
+        // Column 1 (tag_str): expected string — matches SST (compatible).
+        // Column 3 (field_u64): expected int64 — SST has uint64 (mismatched).
+        let predicates = BTreeMap::from_iter([
+            (
+                1,
+                vec![InListPredicate {
+                    list: BTreeSet::from_iter([Bytes::from("foo")]),
+                }],
+            ),
+            (
+                3,
+                vec![InListPredicate {
+                    list: BTreeSet::from_iter([Bytes::from("bar")]),
+                }],
+            ),
+        ]);
+        let expected_predicate_col_types = BTreeMap::from_iter([
+            (1, ConcreteDataType::string_datatype()),
+            (3, ConcreteDataType::int64_datatype()), // intentional mismatch
+        ]);
+
+        let applier = BloomFilterIndexApplier::new(
+            table_dir,
+            PathType::Bare,
+            object_store,
+            puffin_manager_factory,
+            predicates,
+            expected_predicate_col_types,
+        );
+        let result = applier.compatible_predicate_for_sst(&mock_region_metadata());
+
+        // The subset containing only the compatible column must be returned.
+        let result = result.expect("expected Some with compatible subset");
+        assert!(
+            result.contains_key(&1),
+            "compatible column 1 must be present"
+        );
+        assert!(
+            !result.contains_key(&3),
+            "mismatched column 3 must be absent"
+        );
+        assert_eq!(result.len(), 1, "only the compatible predicate must remain");
+    }
+
     #[allow(clippy::type_complexity)]
     fn tester(
         table_dir: String,
@@ -496,8 +646,11 @@ mod tests {
                 );
 
                 let applier = builder.build(&exprs).unwrap().unwrap();
+                let predicates = applier
+                    .compatible_predicate_for_sst(&Arc::new(metadata.clone()))
+                    .unwrap();
                 applier
-                    .apply(file_id, None, row_groups.into_iter(), None)
+                    .apply(file_id, None, &predicates, row_groups.into_iter(), None)
                     .await
                     .unwrap()
                     .into_iter()
diff --git a/src/mito2/src/sst/index/bloom_filter/applier/builder.rs b/src/mito2/src/sst/index/bloom_filter/applier/builder.rs
index a2930d4075..beb26d1bf7 100644
--- a/src/mito2/src/sst/index/bloom_filter/applier/builder.rs
+++ b/src/mito2/src/sst/index/bloom_filter/applier/builder.rs
@@ -101,12 +101,14 @@ impl<'a> BloomFilterIndexApplierBuilder<'a> {
             return Ok(None);
         }
 
+        let expected_predicate_column_types = self.expected_predicate_column_types();
         let applier = BloomFilterIndexApplier::new(
             self.table_dir,
             self.path_type,
             self.object_store,
             self.puffin_manager_factory,
             self.predicates,
+            expected_predicate_column_types,
         )
         .with_file_cache(self.file_cache)
         .with_puffin_metadata_cache(self.puffin_metadata_cache)
@@ -137,6 +139,17 @@ impl<'a> BloomFilterIndexApplierBuilder<'a> {
         }
     }
 
+    /// Returns a map from each predicate column id to its data type in the builder's region metadata.
+    fn expected_predicate_column_types(&self) -> BTreeMap<ColumnId, ConcreteDataType> {
+        self.predicates
+            .keys()
+            .filter_map(|col_id| {
+                let col = self.metadata.column_by_id(*col_id)?;
+                Some((*col_id, col.column_schema.data_type.clone()))
+            })
+            .collect()
+    }
+
     /// Helper function to get the column id and type
     fn column_id_and_type(
         &self,
@@ -404,7 +417,7 @@ mod tests {
         let result = builder.build(&exprs).unwrap();
         assert!(result.is_some());
 
-        let predicates = result.unwrap().predicates;
+        let predicates = result.unwrap().default_predicates;
         assert_eq!(predicates.len(), 1);
 
         let column_predicates = predicates.get(&1).unwrap();
@@ -443,7 +456,7 @@ mod tests {
         let result = builder.build(&exprs).unwrap();
         assert!(result.is_some());
 
-        let predicates = result.unwrap().predicates;
+        let predicates = result.unwrap().default_predicates;
         let column_predicates = predicates.get(&2).unwrap();
         assert_eq!(column_predicates.len(), 1);
         assert_eq!(column_predicates[0].list.len(), 3);
@@ -473,7 +486,7 @@ mod tests {
         let result = builder().build(&[expr]).unwrap();
         assert!(result.is_some());
 
-        let predicates = result.unwrap().predicates;
+        let predicates = result.unwrap().default_predicates;
         let column_predicates = predicates.get(&1).unwrap();
         assert_eq!(column_predicates.len(), 1);
         assert_eq!(column_predicates[0].list.len(), 4);
@@ -537,7 +550,7 @@ mod tests {
         let result = builder.build(&exprs).unwrap();
         assert!(result.is_some());
 
-        let predicates = result.unwrap().predicates;
+        let predicates = result.unwrap().default_predicates;
         assert_eq!(predicates.len(), 2);
         assert!(predicates.contains_key(&1));
         assert!(predicates.contains_key(&2));
@@ -575,7 +588,7 @@ mod tests {
         let result = builder.build(&exprs).unwrap();
         assert!(result.is_some());
 
-        let predicates = result.unwrap().predicates;
+        let predicates = result.unwrap().default_predicates;
         assert!(!predicates.contains_key(&1)); // Null equality should be ignored
         let column2_predicates = predicates.get(&2).unwrap();
         assert_eq!(column2_predicates[0].list.len(), 2);
@@ -644,7 +657,7 @@ mod tests {
         let result = builder.build(&exprs).unwrap();
         assert!(result.is_some());
 
-        let predicates = result.unwrap().predicates;
+        let predicates = result.unwrap().default_predicates;
         let column_predicates = predicates.get(&1).unwrap();
         assert_eq!(column_predicates.len(), 2);
     }
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index a3fe32cbe3..48474d9fe4 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -138,6 +138,7 @@ mod tests {
 
     use super::*;
     use crate::access_layer::{FilePathProvider, Metrics, RegionFilePathFactory, WriteType};
+    use crate::cache::index::result_cache::PredicateKey;
     use crate::cache::test_util::assert_parquet_metadata_equal;
     use crate::cache::{CacheManager, CacheStrategy, PageKey};
     use crate::config::IndexConfig;
@@ -985,11 +986,14 @@ mod tests {
         assert_eq!(metrics.filter_metrics.rg_minmax_filtered, 2);
         assert_eq!(metrics.filter_metrics.rg_bloom_filtered, 2);
         assert_eq!(metrics.filter_metrics.rows_bloom_filtered, 100);
+        let bloom_predicates = bloom_filter_applier
+            .as_ref()
+            .unwrap()
+            .compatible_predicate_for_sst(&metadata)
+            .unwrap();
+        let bloom_predicate_key = PredicateKey::new_bloom(bloom_predicates);
         let cached = index_result_cache
-            .get(
-                bloom_filter_applier.unwrap().predicate_key(),
-                handle.file_id().file_id(),
-            )
+            .get(&bloom_predicate_key, handle.file_id().file_id())
             .unwrap();
         assert!(cached.contains_row_group(2));
         assert!(cached.contains_row_group(3));
@@ -1055,11 +1059,14 @@ mod tests {
         assert_eq!(metrics.filter_metrics.rg_minmax_filtered, 0);
         assert_eq!(metrics.filter_metrics.rg_bloom_filtered, 2);
         assert_eq!(metrics.filter_metrics.rows_bloom_filtered, 140);
+        let bloom_predicates = bloom_filter_applier
+            .as_ref()
+            .unwrap()
+            .compatible_predicate_for_sst(&metadata)
+            .unwrap();
+        let bloom_predicate_key = PredicateKey::new_bloom(bloom_predicates);
         let cached = index_result_cache
-            .get(
-                bloom_filter_applier.unwrap().predicate_key(),
-                handle.file_id().file_id(),
-            )
+            .get(&bloom_predicate_key, handle.file_id().file_id())
             .unwrap();
         assert!(cached.contains_row_group(0));
         assert!(cached.contains_row_group(1));
diff --git a/src/mito2/src/sst/parquet/prefilter.rs b/src/mito2/src/sst/parquet/prefilter.rs
index c98da1abac..7fb549e17d 100644
--- a/src/mito2/src/sst/parquet/prefilter.rs
+++ b/src/mito2/src/sst/parquet/prefilter.rs
@@ -26,17 +26,20 @@ use api::v1::SemanticType;
 use common_recordbatch::filter::SimpleFilterEvaluator;
 use datatypes::arrow::array::{Array, BinaryArray, BooleanArray, BooleanBufferBuilder};
 use datatypes::arrow::buffer::BooleanBuffer;
+use datatypes::arrow::datatypes::SchemaRef;
 use datatypes::arrow::record_batch::RecordBatch;
 use futures::StreamExt;
 use mito_codec::row_converter::{PrimaryKeyCodec, PrimaryKeyFilter};
 use parquet::arrow::ProjectionMask;
 use parquet::arrow::arrow_reader::RowSelection;
 use parquet::schema::types::SchemaDescriptor;
+use smallvec::{SmallVec, smallvec};
 use snafu::{OptionExt, ResultExt};
 use store_api::metadata::{RegionMetadata, RegionMetadataRef};
 use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME;
 use table::predicate::Predicate;
 
+use crate::cache::PrefilterKey;
 use crate::error::{
     ComputeArrowSnafu, DecodeSnafu, EvalPartitionFilterSnafu, NewRecordBatchSnafu,
     RecordBatchSnafu, Result, UnexpectedSnafu,
@@ -285,7 +288,6 @@ pub(crate) fn build_reader_filter_plan(
     expected_metadata: Option<&RegionMetadata>,
     pre_filter_mode: PreFilterMode,
     read_format: &FlatReadFormat,
-    parquet_schema: &SchemaDescriptor,
     codec: &Arc<dyn PrimaryKeyCodec>,
 ) -> ReaderFilterPlan {
     let Some(predicate) = predicate else {
@@ -372,15 +374,27 @@ pub(crate) fn build_reader_filter_plan(
         }
     }
 
+    let pk_filter_expr_strs = (!pk_filter_contexts.is_empty()).then(|| {
+        let mut expr_strs = pk_filter_contexts
+            .iter()
+            .map(|filter_ctx| filter_ctx.expr_str().to_string())
+            .collect::<Vec<_>>();
+        expr_strs.sort();
+        SmallVec::from_vec(expr_strs)
+    });
     let pk_filter_exprs =
         (!primary_key_filters.is_empty()).then_some(Arc::new(primary_key_filters));
+    let schema_version = expected_metadata
+        .map(|metadata| metadata.schema_version)
+        .unwrap_or_else(|| read_format.metadata().schema_version);
     let prefilter_builder = PrefilterContextBuilder::new(
         read_format,
         codec,
         pk_filter_exprs,
+        pk_filter_expr_strs,
         prefilter_simple_filters.clone(),
         prefilter_physical_filters,
-        parquet_schema,
+        schema_version,
     );
 
     if prefilter_builder.is_some() {
@@ -402,8 +416,6 @@ pub(crate) fn build_reader_filter_plan(
 
 /// Context for prefiltering a row group.
 pub(crate) struct PrefilterContext {
-    /// Projection mask for reading prefilter columns.
-    projection: ProjectionMask,
     /// Optional PK filter for legacy primary-key-format parquet.
     pk_filter: Option<Box<dyn PrimaryKeyFilter>>,
     /// Simple filters that can be evaluated directly from the prefilter batch.
@@ -411,6 +423,12 @@ pub(crate) struct PrefilterContext {
     /// Physical filters that can be evaluated directly from the prefilter batch.
     /// Physical expressions are only applied in the prefilter phase.
     physical_filters: Vec<PhysicalFilterContext>,
+    /// Region schema version used in per-filter cache keys.
+    schema_version: u64,
+    /// Sorted expression strings for the encoded-PK filter group.
+    pk_filter_expr_strs: Option<SmallVec<[String; 1]>>,
+    /// Arrow schema used to build narrowed prefilter projections.
+    arrow_schema: SchemaRef,
 }
 
 /// Pre-built state for constructing [PrefilterContext] per row group.
@@ -419,12 +437,14 @@ pub(crate) struct PrefilterContext {
 /// are computed once. A fresh [PrefilterContext] with its own mutable PK filter
 /// is created via [PrefilterContextBuilder::build()] for each row group.
 pub(crate) struct PrefilterContextBuilder {
-    projection: ProjectionMask,
     pk_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
+    pk_filter_expr_strs: Option<SmallVec<[String; 1]>>,
     filters: Vec<SimpleFilterContext>,
     physical_filters: Vec<PhysicalFilterContext>,
     codec: Arc<dyn PrimaryKeyCodec>,
     metadata: RegionMetadataRef,
+    schema_version: u64,
+    arrow_schema: SchemaRef,
 }
 
 impl PrefilterContextBuilder {
@@ -438,9 +458,10 @@ impl PrefilterContextBuilder {
         read_format: &FlatReadFormat,
         codec: &Arc<dyn PrimaryKeyCodec>,
         primary_key_filters: Option<Arc<Vec<SimpleFilterEvaluator>>>,
+        primary_key_filter_expr_strs: Option<SmallVec<[String; 1]>>,
         filters: Vec<SimpleFilterContext>,
         physical_filters: Vec<PhysicalFilterContext>,
-        parquet_schema: &SchemaDescriptor,
+        schema_version: u64,
     ) -> Option<Self> {
         let metadata = read_format.metadata();
         let use_raw_tag_columns = read_format.batch_has_raw_pk_columns();
@@ -448,6 +469,10 @@ impl PrefilterContextBuilder {
             .then_some(primary_key_filters)
             .flatten()
             .filter(|filters| !filters.is_empty());
+        let pk_filter_expr_strs = pk_filters
+            .is_some()
+            .then_some(primary_key_filter_expr_strs)
+            .flatten();
 
         let mut prefilter_column_names = HashSet::new();
         for filter_ctx in &filters {
@@ -464,11 +489,8 @@ impl PrefilterContextBuilder {
             prefilter_column_names.insert(filter_ctx.column_name().to_string());
         }
 
-        let (projection, prefilter_count) = compute_projection_mask(
-            &prefilter_column_names,
-            read_format.arrow_schema(),
-            parquet_schema,
-        );
+        let prefilter_count =
+            compute_projection_count(&prefilter_column_names, read_format.arrow_schema());
 
         if prefilter_count == 0 {
             return None;
@@ -487,12 +509,14 @@ impl PrefilterContextBuilder {
         }
 
         Some(Self {
-            projection,
             pk_filters,
+            pk_filter_expr_strs,
             filters,
             physical_filters,
             codec: Arc::clone(codec),
             metadata: metadata.clone(),
+            schema_version,
+            arrow_schema: read_format.arrow_schema().clone(),
         })
     }
 
@@ -505,10 +529,12 @@ impl PrefilterContextBuilder {
             Box::new(CachedPrimaryKeyFilter::new(pk_filter)) as Box<dyn PrimaryKeyFilter>
         });
         PrefilterContext {
-            projection: self.projection.clone(),
             pk_filter,
             filters: self.filters.clone(),
             physical_filters: self.physical_filters.clone(),
+            schema_version: self.schema_version,
+            pk_filter_expr_strs: self.pk_filter_expr_strs.clone(),
+            arrow_schema: self.arrow_schema.clone(),
         }
     }
 }
@@ -532,18 +558,31 @@ fn compute_projection_mask(
     column_names: &HashSet<String>,
     arrow_schema: &datatypes::arrow::datatypes::SchemaRef,
     parquet_schema: &SchemaDescriptor,
-) -> (ProjectionMask, usize) {
+) -> ProjectionMask {
+    ProjectionMask::roots(
+        parquet_schema,
+        projection_indices(column_names, arrow_schema),
+    )
+}
+
+/// Counts the distinct fields of `arrow_schema` that are named in `column_names`.
+///
+/// Names that do not resolve to a field in the schema contribute nothing.
+fn compute_projection_count(column_names: &HashSet<String>, arrow_schema: &SchemaRef) -> usize {
+    projection_indices(column_names, arrow_schema).len()
+}
+
+fn projection_indices(
+    column_names: &HashSet<String>,
+    arrow_schema: &datatypes::arrow::datatypes::SchemaRef,
+) -> Vec<usize> {
     let mut projection_indices: Vec<usize> = column_names
         .iter()
         .filter_map(|name| arrow_schema.column_with_name(name).map(|(index, _)| index))
         .collect();
     projection_indices.sort_unstable();
     projection_indices.dedup();
-    let count = projection_indices.len();
-    (
-        ProjectionMask::roots(parquet_schema, projection_indices.iter().copied()),
-        count,
-    )
+    projection_indices
 }
 
 fn should_use_prefilter(
@@ -568,18 +607,121 @@ pub(crate) async fn execute_prefilter(
     reader_builder: &RowGroupReaderBuilder,
     build_ctx: &RowGroupBuildContext<'_>,
 ) -> Result<PrefilterResult> {
+    let entries = build_prefilter_cache_entries(prefilter_ctx, reader_builder, build_ctx);
+
+    if entries.is_empty() {
+        return execute_prefilter_by_reading_columns(prefilter_ctx, reader_builder, build_ctx)
+            .await;
+    }
+
+    execute_prefilter_with_result_cache(prefilter_ctx, reader_builder, build_ctx, entries).await
+}
+
+/// Runs the prefilter for one row group, reusing cached per-filter masks where possible.
+///
+/// Masks found in the cache are ANDed together. Entries that miss the cache, and physical
+/// filters that are not immutable, are evaluated by reading their columns.
+async fn execute_prefilter_with_result_cache(
+    prefilter_ctx: &mut PrefilterContext,
+    reader_builder: &RowGroupReaderBuilder,
+    build_ctx: &RowGroupBuildContext<'_>,
+    entries: Vec<PrefilterEntry>,
+) -> Result<PrefilterResult> {
+    let non_cacheable_physical = non_cacheable_physical_filters(prefilter_ctx);
+    let mut hit_mask: Option<BooleanBuffer> = None;
+    let mut uncached_entries = Vec::new();
+    for entry in entries {
+        let cached = entry
+            .key
+            .as_ref()
+            .and_then(|key| reader_builder.cache_strategy().get_prefilter_result(key));
+        let Some(mask) = cached else {
+            uncached_entries.push(entry);
+            continue;
+        };
+        hit_mask = Some(match hit_mask {
+            Some(acc) => acc.bitand(mask.as_ref()),
+            None => mask.as_ref().clone(),
+        });
+    }
+
+    // Everything was served from the cache, so no column has to be read.
+    if uncached_entries.is_empty() && non_cacheable_physical.is_empty() {
+        let combined_mask = hit_mask.unwrap_or_else(|| BooleanBuffer::new_set(0));
+        let refined_selection =
+            refined_selection_from_mask(&combined_mask, &build_ctx.row_selection);
+        let rows_before_filter = rows_before_filter(reader_builder, build_ctx);
+        let filtered_rows = rows_before_filter.saturating_sub(refined_selection.row_count());
+        return Ok(PrefilterResult {
+            refined_selection,
+            filtered_rows,
+        });
+    }
+
+    // Physical filters that are not immutable are re-evaluated here and never cached.
+    for idx in non_cacheable_physical {
+        uncached_entries.push(PrefilterEntry::without_cache(PrefilterEntryKind::Physical(idx)));
+    }
+    let (uncached_mask, read_rows) =
+        build_prefilter_masks(prefilter_ctx, reader_builder, build_ctx, &uncached_entries).await?;
+
+    let final_mask = match (hit_mask, uncached_mask) {
+        (Some(hit_mask), Some(uncached_mask)) => hit_mask.bitand(&uncached_mask),
+        (Some(hit_mask), None) => hit_mask,
+        (None, Some(uncached_mask)) => uncached_mask,
+        (None, None) => BooleanBuffer::new_set(read_rows),
+    };
+    debug_assert_eq!(final_mask.len(), read_rows);
+    let filtered_rows = read_rows.saturating_sub(final_mask.count_set_bits());
+    let refined_selection = refined_selection_from_mask(&final_mask, &build_ctx.row_selection);
+
+    Ok(PrefilterResult {
+        refined_selection,
+        filtered_rows,
+    })
+}
+
+/// Collects the indices of the physical filters that are not immutable.
+fn non_cacheable_physical_filters(prefilter_ctx: &PrefilterContext) -> Vec<usize> {
+    let indexed_filters = prefilter_ctx.physical_filters.iter().enumerate();
+    indexed_filters
+        .filter(|(_, filter)| !filter.is_immutable())
+        .map(|(idx, _)| idx)
+        .collect()
+}
+
+async fn build_prefilter_masks(
+    prefilter_ctx: &mut PrefilterContext,
+    reader_builder: &RowGroupReaderBuilder,
+    build_ctx: &RowGroupBuildContext<'_>,
+    entries: &[PrefilterEntry],
+) -> Result<(Option<BooleanBuffer>, usize)> {
+    let prefilter_column_names = prefilter_column_names_for_entries(prefilter_ctx, entries);
+    let parquet_schema = reader_builder
+        .parquet_metadata()
+        .file_metadata()
+        .schema_descr();
+    let projection = compute_projection_mask(
+        &prefilter_column_names,
+        &prefilter_ctx.arrow_schema,
+        parquet_schema,
+    );
+
     let mut stream = reader_builder
         .build_with_projection(
             build_ctx.row_group_idx,
             build_ctx.row_selection.clone(),
-            prefilter_ctx.projection.clone(),
+            projection,
             build_ctx.fetch_metrics,
         )
         .await?;
 
-    let mut filter_arrays = Vec::new();
+    let mut cache_builders = entries
+        .iter()
+        .map(|entry| entry.key.is_some().then(|| BooleanBufferBuilder::new(0)))
+        .collect::<Vec<_>>();
+    let mut combined_builder = (!entries.is_empty()).then(|| BooleanBufferBuilder::new(0));
     let mut rows_before_filter = 0usize;
-    let mut rows_selected = 0usize;
 
     while let Some(batch_result) = stream.next().await {
         let batch = batch_result?;
@@ -589,30 +731,78 @@ pub(crate) async fn execute_prefilter(
         }
         rows_before_filter += num_rows;
 
-        let batch_mask = match apply_filters_to_batch(
-            &batch,
-            &mut prefilter_ctx.pk_filter,
-            &prefilter_ctx.filters,
-            &prefilter_ctx.physical_filters,
-            reader_builder.file_path(),
-        )? {
-            Some(mask) => mask,
-            None => BooleanBuffer::new_unset(num_rows),
-        };
-        rows_selected += batch_mask.count_set_bits();
-        filter_arrays.push(BooleanArray::from(batch_mask));
+        let mut batch_mask = BooleanBuffer::new_set(num_rows);
+        for (idx, entry) in entries.iter().enumerate() {
+            let mask = eval_entry_mask(
+                &batch,
+                prefilter_ctx,
+                entry.kind,
+                reader_builder.file_path(),
+            )?;
+            batch_mask = batch_mask.bitand(&mask);
+            if let Some(Some(builder)) = cache_builders.get_mut(idx) {
+                builder.append_buffer(&mask);
+            }
+        }
+        if let Some(builder) = &mut combined_builder {
+            builder.append_buffer(&batch_mask);
+        }
     }
 
-    let filtered_rows = rows_before_filter.saturating_sub(rows_selected);
-    let refined_selection = if filter_arrays.is_empty() || rows_selected == 0 {
-        RowSelection::from(vec![])
-    } else {
-        let prefilter_selection = RowSelection::from_filters(&filter_arrays);
-        match &build_ctx.row_selection {
-            Some(original) => original.and_then(&prefilter_selection),
-            None => prefilter_selection,
+    for (entry, builder) in entries.iter().zip(cache_builders) {
+        if let (Some(key), Some(mut builder)) = (&entry.key, builder) {
+            reader_builder
+                .cache_strategy()
+                .put_prefilter_result(key.clone(), Arc::new(builder.finish()));
         }
-    };
+    }
+
+    Ok((
+        combined_builder.map(|mut builder| builder.finish()),
+        rows_before_filter,
+    ))
+}
+
+/// Gathers the names of the columns that must be read to evaluate `entries`.
+///
+/// A simple filter contributes its column only when its `filter()` is `MaybeFilter::Filter`;
+/// every physical filter contributes its column.
+fn prefilter_column_names_for_entries(
+    prefilter_ctx: &PrefilterContext,
+    entries: &[PrefilterEntry],
+) -> HashSet<String> {
+    let mut names = HashSet::new();
+    for entry in entries {
+        let name = match entry.kind {
+            PrefilterEntryKind::Simple(idx) => match prefilter_ctx.filters[idx].filter() {
+                MaybeFilter::Filter(filter) => filter.column_name().to_string(),
+                _ => continue,
+            },
+            PrefilterEntryKind::Physical(idx) => {
+                let filter_ctx = &prefilter_ctx.physical_filters[idx];
+                filter_ctx.column_name().to_string()
+            }
+            // The PK group reads the column named `PRIMARY_KEY_COLUMN_NAME`.
+            PrefilterEntryKind::PkGroup => PRIMARY_KEY_COLUMN_NAME.to_string(),
+        };
+        names.insert(name);
+    }
+    names
+}
+
+async fn execute_prefilter_by_reading_columns(
+    prefilter_ctx: &mut PrefilterContext,
+    reader_builder: &RowGroupReaderBuilder,
+    build_ctx: &RowGroupBuildContext<'_>,
+) -> Result<PrefilterResult> {
+    let entries = all_prefilter_entries(prefilter_ctx);
+    let (mask, rows_before_filter) =
+        build_prefilter_masks(prefilter_ctx, reader_builder, build_ctx, &entries).await?;
+
+    let final_mask = mask.unwrap_or_else(|| BooleanBuffer::new_set(rows_before_filter));
+    let rows_selected = final_mask.count_set_bits();
+    let filtered_rows = rows_before_filter.saturating_sub(rows_selected);
+    let refined_selection = refined_selection_from_mask(&final_mask, &build_ctx.row_selection);
 
     Ok(PrefilterResult {
         refined_selection,
@@ -620,100 +810,243 @@ pub(crate) async fn execute_prefilter(
     })
 }
 
-fn apply_filters_to_batch(
+/// Lists every filter in `prefilter_ctx` as a [PrefilterEntry] without a cache key.
+///
+/// The PK group comes first when a PK filter is present, followed by the simple filters
+/// and then the physical filters, each in index order.
+fn all_prefilter_entries(prefilter_ctx: &PrefilterContext) -> Vec<PrefilterEntry> {
+    let mut entries = Vec::new();
+    if prefilter_ctx.pk_filter.is_some() {
+        entries.push(PrefilterEntry::without_cache(PrefilterEntryKind::PkGroup));
+    }
+
+    // Entries refer to filters by their position in the context's filter lists.
+    for idx in 0..prefilter_ctx.filters.len() {
+        let kind = PrefilterEntryKind::Simple(idx);
+        entries.push(PrefilterEntry::without_cache(kind));
+    }
+    for idx in 0..prefilter_ctx.physical_filters.len() {
+        let kind = PrefilterEntryKind::Physical(idx);
+        entries.push(PrefilterEntry::without_cache(kind));
+    }
+    entries
+}
+
+#[derive(Clone, Copy)]
+enum PrefilterEntryKind {
+    Simple(usize),   // index into `PrefilterContext::filters`
+    Physical(usize), // index into `PrefilterContext::physical_filters`
+    PkGroup,         // the filter in `PrefilterContext::pk_filter`
+}
+
+struct PrefilterEntry {
+    kind: PrefilterEntryKind,  // which filter of the `PrefilterContext` to evaluate
+    key: Option<PrefilterKey>, // cache key of the filter's mask; `None` means not cached
+}
+
+impl PrefilterEntry {
+    fn without_cache(kind: PrefilterEntryKind) -> Self {
+        Self { kind, key: None } // no key: the mask is neither looked up nor stored
+    }
+}
+
+/// Builds the cacheable prefilter entries of a row group, each with its cache key.
+///
+/// A key combines the file id, the row group index, a snapshot of the current row
+/// selection, the schema version and the expression string(s) of the filter:
+/// - each simple filter is keyed by its own expression string;
+/// - each immutable physical filter is keyed by its own expression string, while
+///   physical filters that are not immutable get no entry;
+/// - the PK filter, when it and its expression strings are present, is keyed by all
+///   of those strings.
+fn build_prefilter_cache_entries(
+    prefilter_ctx: &PrefilterContext,
+    reader_builder: &RowGroupReaderBuilder,
+    build_ctx: &RowGroupBuildContext<'_>,
+) -> Vec<PrefilterEntry> {
+    // The key includes the row selection; a mask only covers the rows read under it.
+    let row_selection = PrefilterKey::row_selection_snapshot(build_ctx.row_selection.as_ref());
+    let file_id = reader_builder.file_handle().file_id().file_id();
+    let row_group_idx = build_ctx.row_group_idx as u32;
+    let schema_version = prefilter_ctx.schema_version;
+
+    // All entries of this row group share every key component except the expression strings.
+    let cached_entry = |kind: PrefilterEntryKind, exprs: SmallVec<[String; 1]>| PrefilterEntry {
+        kind,
+        key: Some(PrefilterKey::new(
+            file_id,
+            row_group_idx,
+            row_selection.clone(),
+            schema_version,
+            exprs,
+        )),
+    };
+
+    let mut entries = Vec::new();
+    // Every simple filter is cacheable.
+    for (idx, filter_ctx) in prefilter_ctx.filters.iter().enumerate() {
+        let exprs = smallvec![filter_ctx.expr_str().to_string()];
+        entries.push(cached_entry(PrefilterEntryKind::Simple(idx), exprs));
+    }
+
+    // Physical filters are cached only when they are immutable.
+    for (idx, filter_ctx) in prefilter_ctx.physical_filters.iter().enumerate() {
+        if filter_ctx.is_immutable() {
+            let exprs = smallvec![filter_ctx.expr_str().to_string()];
+            entries.push(cached_entry(PrefilterEntryKind::Physical(idx), exprs));
+        }
+    }
+
+    // The PK filter is cached as one group keyed by all of its expression strings.
+    if prefilter_ctx.pk_filter.is_some()
+        && let Some(exprs) = &prefilter_ctx.pk_filter_expr_strs
+    {
+        entries.push(cached_entry(PrefilterEntryKind::PkGroup, exprs.clone()));
+    }
+
+    entries
+}
+
+/// Returns the row count of the row selection if there is one, otherwise the number of
+/// rows recorded for the row group in the parquet metadata.
+fn rows_before_filter(
+    reader_builder: &RowGroupReaderBuilder,
+    build_ctx: &RowGroupBuildContext<'_>,
+) -> usize {
+    if let Some(selection) = &build_ctx.row_selection {
+        return selection.row_count();
+    }
+
+    let parquet_metadata = reader_builder.parquet_metadata();
+    let row_group = parquet_metadata.row_group(build_ctx.row_group_idx);
+    row_group.num_rows() as usize
+}
+
+/// Turns `mask` into a [RowSelection], applied on top of `original_selection` when present.
+fn refined_selection_from_mask(
+    mask: &BooleanBuffer,
+    original_selection: &Option<RowSelection>,
+) -> RowSelection {
+    if mask.is_empty() || mask.count_set_bits() == 0 {
+        return RowSelection::from(vec![]); // no row selected
+    }
+    let mask_selection = RowSelection::from_filters(&[BooleanArray::from(mask.clone())]);
+    if let Some(original) = original_selection {
+        return original.and_then(&mask_selection);
+    }
+    mask_selection
+}
+
+fn eval_entry_mask(
     batch: &RecordBatch,
-    pk_filter: &mut Option<Box<dyn PrimaryKeyFilter>>,
-    filters: &[SimpleFilterContext],
-    physical_filters: &[PhysicalFilterContext],
+    prefilter_ctx: &mut PrefilterContext,
+    kind: PrefilterEntryKind,
     file_path: &str,
-) -> Result<Option<BooleanBuffer>> {
-    let mut mask = BooleanBuffer::new_set(batch.num_rows());
-
-    if let Some(pk_filter) = pk_filter.as_mut() {
-        // Prefilter reads a reduced projection. For PK prefilter, the encoded
-        // primary key column is always appended as the last projected column,
-        // while `__sequence` and `__op_type` are not read.
-        let pk_column_index = batch.num_columns() - 1;
-        let matched_row_ranges =
-            matching_row_ranges_by_primary_key(batch, pk_column_index, pk_filter.as_mut())?;
-        let mut builder = BooleanBufferBuilder::new(batch.num_rows());
-        builder.append_n(batch.num_rows(), false);
-        for range in matched_row_ranges {
-            for row in range {
-                builder.set_bit(row, true);
-            }
+) -> Result<BooleanBuffer> {
+    match kind {
+        PrefilterEntryKind::Simple(idx) => {
+            eval_simple_filter_mask(batch, &prefilter_ctx.filters[idx], file_path)
         }
-        mask = mask.bitand(&builder.finish());
-    }
-
-    for filter_ctx in filters {
-        let filter = match filter_ctx.filter() {
-            MaybeFilter::Filter(filter) => filter,
-            MaybeFilter::Matched => continue,
-            MaybeFilter::Pruned => return Ok(None),
-        };
-
-        let (idx, _) = batch
-            .schema()
-            .column_with_name(filter.column_name())
-            .with_context(|| UnexpectedSnafu {
-                reason: format!(
-                    "Prefilter column '{}' (id {}) not found in batch for file {}",
-                    filter.column_name(),
-                    filter_ctx.column_id(),
-                    file_path
-                ),
-            })?;
-        let column = batch.column(idx).clone();
-        let result = filter.evaluate_array(&column).context(RecordBatchSnafu)?;
-        mask = mask.bitand(&result);
-    }
-
-    for filter_ctx in physical_filters {
-        let filter = filter_ctx.filter();
-
-        let (idx, _) = batch
-            .schema()
-            .column_with_name(filter_ctx.column_name())
-            .with_context(|| UnexpectedSnafu {
-                reason: format!(
-                    "Prefilter physical column '{}' (id {}) not found in batch for file {}",
-                    filter_ctx.column_name(),
-                    filter_ctx.column_id(),
-                    file_path
-                ),
-            })?;
-        let column = batch.column(idx).clone();
-
-        let record_batch = RecordBatch::try_new(filter_ctx.schema().clone(), vec![column])
-            .context(NewRecordBatchSnafu)?;
-        let evaluated = filter
-            .evaluate(&record_batch)
-            .context(EvalPartitionFilterSnafu)?;
-        let array = evaluated
-            .into_array(record_batch.num_rows())
-            .context(EvalPartitionFilterSnafu)?;
-        let boolean_array =
-            array
-                .as_any()
-                .downcast_ref::<BooleanArray>()
-                .context(UnexpectedSnafu {
-                    reason: "Failed to downcast physical filter result to BooleanArray",
-                })?;
-        // Treat null results as false (filtered out); value bits are not guaranteed
-        // to be false for invalid entries.
-        let mut result = boolean_array.values().clone();
-        if let Some(nulls) = boolean_array.nulls() {
-            result = result.bitand(nulls.inner());
+        PrefilterEntryKind::Physical(idx) => {
+            eval_physical_filter_mask(batch, &prefilter_ctx.physical_filters[idx], file_path)
+        }
+        PrefilterEntryKind::PkGroup => {
+            let pk_filter = prefilter_ctx.pk_filter.as_mut().context(UnexpectedSnafu {
+                reason: "Missing primary key filter for prefilter cache entry",
+            })?;
+            eval_pk_group_mask(batch, pk_filter.as_mut())
         }
-        mask = mask.bitand(&result);
     }
+}
 
-    if mask.count_set_bits() == 0 {
-        Ok(None)
-    } else {
-        Ok(Some(mask))
+/// Evaluates the primary key filter against `batch`.
+/// The returned mask has a bit set for every row inside a matched row range.
+fn eval_pk_group_mask(
+    batch: &RecordBatch,
+    pk_filter: &mut dyn PrimaryKeyFilter,
+) -> Result<BooleanBuffer> {
+    let (pk_idx, _) = batch
+        .schema()
+        .column_with_name(PRIMARY_KEY_COLUMN_NAME)
+        .context(UnexpectedSnafu {
+            reason: "Primary key column not found in prefilter batch",
+        })?;
+    let row_ranges = matching_row_ranges_by_primary_key(batch, pk_idx, pk_filter)?;
+    let mut mask_builder = BooleanBufferBuilder::new(batch.num_rows());
+    mask_builder.append_n(batch.num_rows(), false);
+    for row in row_ranges.into_iter().flatten() {
+        mask_builder.set_bit(row, true);
     }
+    Ok(mask_builder.finish())
+}
+
+/// Evaluates one simple filter on `batch`: all rows set if `Matched`, none set if `Pruned`.
+fn eval_simple_filter_mask(
+    batch: &RecordBatch,
+    filter_ctx: &SimpleFilterContext,
+    file_path: &str,
+) -> Result<BooleanBuffer> {
+    let filter = match filter_ctx.filter() {
+        MaybeFilter::Filter(filter) => filter,
+        MaybeFilter::Matched => return Ok(BooleanBuffer::new_set(batch.num_rows())),
+        MaybeFilter::Pruned => return Ok(BooleanBuffer::new_unset(batch.num_rows())),
+    };
+    let (idx, _) = batch
+        .schema()
+        .column_with_name(filter.column_name())
+        .with_context(|| UnexpectedSnafu {
+            reason: format!(
+                "Prefilter column '{}' (id {}) not found in batch for file {}",
+                filter.column_name(),
+                filter_ctx.column_id(),
+                file_path
+            ),
+        })?;
+    let column = batch.column(idx).clone();
+    filter.evaluate_array(&column).context(RecordBatchSnafu)
+}
+
+/// Evaluates one physical filter on `batch`; rows where the result is null are unset.
+fn eval_physical_filter_mask(
+    batch: &RecordBatch,
+    filter_ctx: &PhysicalFilterContext,
+    file_path: &str,
+) -> Result<BooleanBuffer> {
+    let filter = filter_ctx.filter();
+    let (idx, _) = batch
+        .schema()
+        .column_with_name(filter_ctx.column_name())
+        .with_context(|| UnexpectedSnafu {
+            reason: format!(
+                "Prefilter physical column '{}' (id {}) not found in batch for file {}",
+                filter_ctx.column_name(),
+                filter_ctx.column_id(),
+                file_path
+            ),
+        })?;
+    let column = batch.column(idx).clone();
+    // Evaluate on a single-column batch built with the filter's own schema.
+    let record_batch = RecordBatch::try_new(filter_ctx.schema().clone(), vec![column])
+        .context(NewRecordBatchSnafu)?;
+    let evaluated = filter
+        .evaluate(&record_batch)
+        .context(EvalPartitionFilterSnafu)?;
+    let array = evaluated
+        .into_array(record_batch.num_rows())
+        .context(EvalPartitionFilterSnafu)?;
+    let boolean_array = array
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .context(UnexpectedSnafu {
+            reason: "Failed to downcast physical filter result to BooleanArray",
+        })?;
+    // Treat null results as false (filtered out); value bits are not guaranteed
+    // to be false for invalid entries.
+    let mut result = boolean_array.values().clone();
+    if let Some(nulls) = boolean_array.nulls() {
+        result = result.bitand(nulls.inner());
+    }
+    Ok(result)
 }
 
 #[cfg(test)]
@@ -728,7 +1061,6 @@ mod tests {
     };
     use datatypes::arrow::datatypes::{Schema, UInt32Type};
     use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec};
-    use parquet::arrow::ArrowSchemaConverter;
     use store_api::codec::PrimaryKeyEncoding;
 
     use super::*;
@@ -800,12 +1132,6 @@ mod tests {
             .collect()
     }
 
-    fn parquet_schema(read_format: &FlatReadFormat) -> SchemaDescriptor {
-        ArrowSchemaConverter::new()
-            .convert(read_format.arrow_schema())
-            .unwrap()
-    }
-
     fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch {
         assert_eq!(primary_keys.len(), field_values.len());
 
@@ -989,15 +1315,15 @@ mod tests {
         )
         .unwrap();
         let codec = build_primary_key_codec(metadata.as_ref());
-        let parquet_schema = parquet_schema(&read_format);
 
         let builder = PrefilterContextBuilder::new(
             &read_format,
             &codec,
             None,
+            None,
             Vec::new(),
             Vec::new(),
-            &parquet_schema,
+            metadata.schema_version,
         );
         assert!(builder.is_none());
     }
@@ -1091,7 +1417,6 @@ mod tests {
             true,
         )
         .unwrap();
-        let full_parquet_schema = parquet_schema(&full_read_format);
         let codec = build_primary_key_codec(metadata.as_ref());
 
         let skip_fields_plan = build_reader_filter_plan(
@@ -1102,7 +1427,6 @@ mod tests {
             None,
             PreFilterMode::SkipFields,
             &full_read_format,
-            &full_parquet_schema,
             &codec,
         );
         assert!(skip_fields_plan.prefilter_builder.is_some());
@@ -1121,13 +1445,11 @@ mod tests {
             true,
         )
         .unwrap();
-        let projected_parquet_schema = parquet_schema(&projected_read_format);
         let pk_prefilter_plan = build_reader_filter_plan(
             Some(&Predicate::new(vec![col("tag_0").eq(lit("a"))])),
             None,
             PreFilterMode::All,
             &projected_read_format,
-            &projected_parquet_schema,
             &codec,
         );
         assert!(pk_prefilter_plan.prefilter_builder.is_some());
@@ -1135,35 +1457,94 @@ mod tests {
     }
 
     #[test]
-    fn test_apply_filters_to_batch_uses_flat_tag_columns_directly() {
+    fn test_pk_filter_expr_strings_are_stable_under_expr_order() {
+        let metadata: RegionMetadataRef = Arc::new(sst_region_metadata_with_encoding(
+            PrimaryKeyEncoding::Sparse,
+        ));
+        let read_format = FlatReadFormat::new(
+            metadata.clone(),
+            ReadColumns::from_deduped_column_ids(
+                metadata.column_metadatas.iter().map(|c| c.column_id),
+            ),
+            None,
+            "test",
+            false,
+        )
+        .unwrap();
+        let codec = build_primary_key_codec(metadata.as_ref());
+        // The same two tag predicates are supplied to the planner in both orders.
+        let expr_a = col("tag_0").eq(lit("a"));
+        let expr_b = col("tag_1").eq(lit("x"));
+        let plan_ab = build_reader_filter_plan(
+            Some(&Predicate::new(vec![expr_a.clone(), expr_b.clone()])),
+            None,
+            PreFilterMode::All,
+            &read_format,
+            &codec,
+        );
+        let plan_b_a = build_reader_filter_plan(
+            Some(&Predicate::new(vec![expr_b, expr_a])),
+            None,
+            PreFilterMode::All,
+            &read_format,
+            &codec,
+        );
+        // Both plans must expose PK filter expression strings, and they must be equal.
+        let exprs_ab = plan_ab.prefilter_builder.unwrap().pk_filter_expr_strs;
+        let exprs_b_a = plan_b_a.prefilter_builder.unwrap().pk_filter_expr_strs;
+        assert!(exprs_ab.is_some());
+        assert_eq!(exprs_ab, exprs_b_a);
+    }
+
+    #[test]
+    fn test_simple_and_physical_contexts_preserve_expr_strings() {
+        let metadata: RegionMetadataRef = Arc::new(sst_region_metadata());
+        let read_format = FlatReadFormat::new(
+            metadata.clone(),
+            ReadColumns::from_deduped_column_ids(
+                metadata.column_metadatas.iter().map(|c| c.column_id),
+            ),
+            None,
+            "test",
+            true,
+        )
+        .unwrap();
+        // A simple filter context must report the `Debug` rendering of its expression.
+        let simple_expr = col("tag_0").eq(lit("a"));
+        let simple = SimpleFilterContext::new_opt(&metadata, None, &simple_expr).unwrap();
+        assert_eq!(simple.expr_str(), format!("{simple_expr:?}"));
+        // A physical filter context must do the same, here for an IN-list expression.
+        let physical_expr = col("field_0").in_list(vec![lit(1_u64), lit(2_u64)], false);
+        let physical =
+            PhysicalFilterContext::new_opt(&metadata, None, &read_format, &physical_expr).unwrap();
+        assert_eq!(physical.expr_str(), format!("{physical_expr:?}"));
+    }
+
+    #[test]
+    fn test_eval_simple_filter_mask_uses_flat_tag_columns_directly() {
         let metadata: RegionMetadataRef = Arc::new(sst_region_metadata());
         let filters = new_simple_filter_contexts(&metadata, &[col("tag_0").eq(lit("a"))]);
         let batch = new_record_batch_with_custom_sequence(&["a", "x"], 0, 4, 1);
 
-        let mut no_pk_filter = None;
-        let mask = apply_filters_to_batch(&batch, &mut no_pk_filter, &filters, &[], "test")
-            .unwrap()
-            .unwrap();
+        let mask = eval_simple_filter_mask(&batch, &filters[0], "test").unwrap();
         assert_eq!(mask.count_set_bits(), 4);
     }
 
     #[test]
-    fn test_apply_filters_to_batch_errors_on_missing_selected_column() {
+    fn test_eval_simple_filter_mask_errors_on_missing_selected_column() {
         let metadata: RegionMetadataRef = Arc::new(sst_region_metadata());
         let filters = new_simple_filter_contexts(&metadata, &[col("tag_0").eq(lit("a"))]);
         let pk = new_primary_key(&["a", "x"]);
         let batch = new_raw_batch(&[pk.as_slice()], &[10]);
 
-        let mut no_pk_filter = None;
-        let err =
-            apply_filters_to_batch(&batch, &mut no_pk_filter, &filters, &[], "test").unwrap_err();
+        let err = eval_simple_filter_mask(&batch, &filters[0], "test").unwrap_err();
         let err = err.to_string();
         assert!(err.contains("Prefilter column"));
         assert!(err.contains("tag_0"));
     }
 
     #[test]
-    fn test_apply_filters_to_batch_evaluates_physical_filters() {
+    fn test_eval_physical_filter_mask_evaluates_physical_filters() {
         let metadata: RegionMetadataRef =
             Arc::new(sst_region_metadata_with_encoding(PrimaryKeyEncoding::Dense));
         let read_format = FlatReadFormat::new(
@@ -1181,16 +1562,12 @@ mod tests {
         let pk = new_primary_key(&["a", "x"]);
         let batch = new_raw_batch(&[pk.as_slice(), pk.as_slice(), pk.as_slice()], &[9, 10, 11]);
 
-        let mut no_pk_filter = None;
-        let mask =
-            apply_filters_to_batch(&batch, &mut no_pk_filter, &[], &physical_filters, "test")
-                .unwrap()
-                .unwrap();
+        let mask = eval_physical_filter_mask(&batch, &physical_filters[0], "test").unwrap();
         assert_eq!(mask.count_set_bits(), 1);
     }
 
     #[test]
-    fn test_apply_filters_to_batch_uses_last_projected_column_for_pk_prefilter() {
+    fn test_eval_pk_group_mask_finds_pk_column_by_name() {
         let metadata = Arc::new(sst_region_metadata());
         let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("a"))]));
         let mut pk_filter = Some(Box::new(CachedPrimaryKeyFilter::new(
@@ -1208,9 +1585,7 @@ mod tests {
             &[10, 11, 12, 13],
         );
 
-        let mask = apply_filters_to_batch(&batch, &mut pk_filter, &[], &[], "test")
-            .unwrap()
-            .unwrap();
+        let mask = eval_pk_group_mask(&batch, pk_filter.as_mut().unwrap().as_mut()).unwrap();
 
         assert_eq!(mask.count_set_bits(), 2);
     }
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 0e1ce8d28b..02db70fa88 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -26,8 +26,9 @@ use api::v1::SemanticType;
 use common_recordbatch::filter::SimpleFilterEvaluator;
 use common_telemetry::{error, tracing, warn};
 use datafusion::physical_plan::PhysicalExpr;
-use datafusion_expr::Expr;
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
 use datafusion_expr::utils::expr_to_columns;
+use datafusion_expr::{Expr, Volatility};
 use datatypes::arrow::array::ArrayRef;
 use datatypes::arrow::datatypes::{Field, Schema as ArrowSchema, SchemaRef};
 use datatypes::arrow::record_batch::RecordBatch;
@@ -458,7 +459,6 @@ impl ParquetReaderBuilder {
             self.expected_metadata.as_deref(),
             self.pre_filter_mode,
             &read_format,
-            parquet_meta.file_metadata().schema_descr(),
             &codec,
         );
 
@@ -741,6 +741,7 @@ impl ParquetReaderBuilder {
         }
 
         self.prune_row_groups_by_bloom_filter(
+            read_format.metadata(),
             row_group_size,
             parquet_meta,
             &mut output,
@@ -935,6 +936,7 @@ impl ParquetReaderBuilder {
 
     async fn prune_row_groups_by_bloom_filter(
         &self,
+        sst_metadata: &RegionMetadataRef,
         row_group_size: usize,
         parquet_meta: &ParquetMetaData,
         output: &mut RowGroupSelection,
@@ -953,12 +955,17 @@ impl ParquetReaderBuilder {
             &self.bloom_filter_index_appliers[..]
         };
         for index_applier in appliers.iter().flatten() {
-            let predicate_key = index_applier.predicate_key();
+            let Some(compatible_predicates) =
+                index_applier.compatible_predicate_for_sst(sst_metadata)
+            else {
+                continue;
+            };
+            let predicate_key = PredicateKey::new_bloom(compatible_predicates.clone());
             // Fast path: return early if the result is in the cache.
-            let cached = self
-                .cache_strategy
-                .index_result_cache()
-                .and_then(|cache| cache.get(predicate_key, self.file_handle.file_id().file_id()));
+            let cached = self.cache_strategy.index_result_cache().and_then(|cache| {
+                let file_id = self.file_handle.file_id().file_id();
+                cache.get(&predicate_key, file_id)
+            });
             if let Some(result) = cached.as_ref()
                 && all_required_row_groups_searched(output, result)
             {
@@ -986,6 +993,7 @@ impl ParquetReaderBuilder {
                 .apply(
                     self.file_handle.index_id(),
                     Some(file_size_hint),
+                    &compatible_predicates,
                     rgs,
                     metrics.bloom_filter_apply_metrics.as_mut(),
                 )
@@ -1006,7 +1014,7 @@ impl ParquetReaderBuilder {
             }
 
             self.apply_index_result_and_update_cache(
-                predicate_key,
+                &predicate_key,
                 self.file_handle.file_id().file_id(),
                 selection,
                 output,
@@ -1862,6 +1870,8 @@ impl MaybeFilter {
 pub(crate) struct SimpleFilterContext {
     /// Filter to evaluate.
     filter: MaybeFilter,
+    /// Debug string of the original logical expression.
+    expr_str: String,
     /// Id of the column to evaluate.
     column_id: ColumnId,
     /// Semantic type of the column.
@@ -1879,6 +1889,7 @@ impl SimpleFilterContext {
         expr: &Expr,
     ) -> Option<Self> {
         let filter = SimpleFilterEvaluator::try_new(expr)?;
+        let expr_str = format!("{expr:?}");
         let (column_metadata, maybe_filter) = match expected_meta {
             Some(meta) => {
                 // Gets the column metadata from the expected metadata.
@@ -1924,6 +1935,7 @@ impl SimpleFilterContext {
 
         Some(Self {
             filter: maybe_filter,
+            expr_str,
             column_id: column_metadata.column_id,
             semantic_type: column_metadata.semantic_type,
         })
@@ -1934,6 +1946,11 @@ impl SimpleFilterContext {
         &self.filter
     }
 
+    /// Returns the `Debug`-formatted string of the original logical expression.
+    pub(crate) fn expr_str(&self) -> &str {
+        &self.expr_str
+    }
+
     /// Returns the column id.
     pub(crate) fn column_id(&self) -> ColumnId {
         self.column_id
@@ -1950,6 +1967,8 @@ impl SimpleFilterContext {
 pub(crate) struct PhysicalFilterContext {
     /// Filter to evaluate.
     filter: Arc<dyn PhysicalExpr>,
+    /// Debug string of the original logical expression.
+    expr_str: String,
     /// Id of the column to evaluate.
     column_id: ColumnId,
     /// Name of the column to evaluate.
@@ -1958,6 +1977,8 @@ pub(crate) struct PhysicalFilterContext {
     semantic_type: SemanticType,
     /// Schema containing only the referenced column.
     schema: SchemaRef,
+    /// Whether the original logical expression is immutable across queries.
+    immutable: bool,
 }
 
 impl PhysicalFilterContext {
@@ -1974,6 +1995,7 @@ impl PhysicalFilterContext {
         if !Self::is_prefilter_candidate(expr) {
             return None;
         }
+        let expr_str = format!("{expr:?}");
         let column_name = Self::single_column_name(expr)?;
         let column_metadata = match expected_meta {
             Some(meta) => {
@@ -1998,13 +2020,16 @@ impl PhysicalFilterContext {
                 error!(e; "Unable to build physical filter for {expr}, schema: {schema:?}");
             })
             .ok()?;
+        let immutable = expr_is_immutable(expr);
 
         Some(Self {
             filter: physical_expr,
+            expr_str,
             column_id: column_metadata.column_id,
             column_name,
             semantic_type: column_metadata.semantic_type,
             schema,
+            immutable,
         })
     }
 
@@ -2035,6 +2060,11 @@ impl PhysicalFilterContext {
         &self.filter
     }
 
+    /// Returns the `Debug`-formatted string of the original logical expression.
+    pub(crate) fn expr_str(&self) -> &str {
+        &self.expr_str
+    }
+
     /// Returns the column id.
     pub(crate) fn column_id(&self) -> ColumnId {
         self.column_id
@@ -2054,6 +2084,29 @@ impl PhysicalFilterContext {
     pub(crate) fn schema(&self) -> &SchemaRef {
         &self.schema
     }
+
+    /// Returns true if the expression has no non-immutable scalar function or variable.
+    pub(crate) fn is_immutable(&self) -> bool {
+        self.immutable
+    }
+}
+
+/// Returns false if `expr` contains a scalar function whose signature is not
+/// `Volatility::Immutable`, or any scalar variable; returns true otherwise.
+fn expr_is_immutable(expr: &Expr) -> bool {
+    let mut found_mutable = false;
+    let _ = expr.apply(|node| {
+        found_mutable |= match node {
+            Expr::ScalarFunction(f) => f.func.signature().volatility != Volatility::Immutable,
+            Expr::ScalarVariable(_, _) => true,
+            _ => false,
+        };
+        if found_mutable {
+            return Ok(TreeNodeRecursion::Stop);
+        }
+        Ok(TreeNodeRecursion::Continue)
+    });
+    !found_mutable
 }
 
 /// Prune a column by its default value.
@@ -2335,6 +2388,74 @@ mod tests {
         assert!(!selection.is_empty());
     }
 
+    #[test]
+    fn test_expr_is_immutable_checks_scalar_function_volatility() {
+        #[derive(Debug, PartialEq, Eq, Hash)]
+        struct TestVolatilityUdf {
+            name: String,
+            signature: Signature,
+        }
+
+        impl TestVolatilityUdf {
+            fn new(name: &str, volatility: Volatility) -> Self {
+                Self {
+                    name: name.to_string(),
+                    signature: Signature::variadic_any(volatility),
+                }
+            }
+        }
+        // Minimal UDF implementation; the check under test reads `signature().volatility`.
+        impl ScalarUDFImpl for TestVolatilityUdf {
+            fn as_any(&self) -> &dyn Any {
+                self
+            }
+
+            fn name(&self) -> &str {
+                &self.name
+            }
+
+            fn signature(&self) -> &Signature {
+                &self.signature
+            }
+
+            fn return_type(&self, _arg_types: &[DataType]) -> datafusion_common::Result<DataType> {
+                Ok(DataType::Int64)
+            }
+
+            fn invoke_with_args(
+                &self,
+                _args: ScalarFunctionArgs,
+            ) -> datafusion_common::Result<ColumnarValue> {
+                Ok(ColumnarValue::Scalar(ScalarValue::Int64(Some(1))))
+            }
+        }
+        // Wraps a zero-argument call to a test UDF that has the given volatility.
+        let expr = |name: &str, volatility| {
+            Expr::ScalarFunction(ScalarFunction::new_udf(
+                Arc::new(ScalarUDF::new_from_impl(TestVolatilityUdf::new(
+                    name, volatility,
+                ))),
+                vec![],
+            ))
+        };
+
+        assert!(expr_is_immutable(&expr(
+            "immutable_udf",
+            Volatility::Immutable
+        )));
+        assert!(!expr_is_immutable(&expr("stable_udf", Volatility::Stable)));
+        assert!(!expr_is_immutable(&expr(
+            "volatile_udf",
+            Volatility::Volatile
+        )));
+        // A scalar variable must also be reported as not immutable.
+        let scalar_variable = Expr::ScalarVariable(
+            Arc::new(Field::new("@@version", DataType::Utf8, false)),
+            vec!["@@version".to_string()],
+        );
+        assert!(!expr_is_immutable(&scalar_variable));
+    }
+
     #[tokio::test(flavor = "current_thread")]
     async fn test_has_row_level_selection() {
         let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
diff --git a/src/mito2/src/sst/version.rs b/src/mito2/src/sst/version.rs
index 5958cf7513..67d41a3b82 100644
--- a/src/mito2/src/sst/version.rs
+++ b/src/mito2/src/sst/version.rs
@@ -18,7 +18,7 @@ use std::fmt;
 use std::sync::Arc;
 
 use common_time::{TimeToLive, Timestamp};
-use store_api::storage::FileId;
+use store_api::storage::{FileId, RegionId};
 
 use crate::sst::file::{FileHandle, FileMeta, Level, MAX_LEVEL};
 use crate::sst::file_purger::FilePurgerRef;
@@ -106,15 +106,19 @@ impl SstVersion {
         }
     }
 
-    /// Returns the number of rows in SST files.
+    /// Returns the number of rows in SST files owned by `region_id`.
+    ///
+    /// Rows from SST files referenced from other regions, for example after
+    /// repartition, are not counted.
     /// For historical reasons, the result is not precise for old SST files.
-    pub(crate) fn num_rows(&self) -> u64 {
+    pub(crate) fn owned_num_rows(&self, region_id: RegionId) -> u64 {
         self.levels
             .iter()
             .map(|level_meta| {
                 level_meta
                     .files
                     .values()
+                    .filter(|file_handle| file_handle.region_id() == region_id)
                     .map(|file_handle| {
                         let meta = file_handle.meta_ref();
                         meta.num_rows
@@ -124,22 +128,29 @@ impl SstVersion {
             .sum()
     }
 
-    /// Returns the number of SST files.
-    pub(crate) fn num_files(&self) -> u64 {
-        self.levels
-            .iter()
-            .map(|level_meta| level_meta.files.len() as u64)
-            .sum()
-    }
-
-    /// Returns SST data files'space occupied in current version.
-    pub(crate) fn sst_usage(&self) -> u64 {
+    /// Returns the number of SST files owned by `region_id`.
+    pub(crate) fn owned_num_files(&self, region_id: RegionId) -> u64 {
         self.levels
             .iter()
             .map(|level_meta| {
                 level_meta
                     .files
                     .values()
+                    .filter(|file_handle| file_handle.region_id() == region_id)
+                    .count() as u64
+            })
+            .sum()
+    }
+
+    /// Returns the space occupied by SST data files owned by `region_id`.
+    pub(crate) fn owned_sst_usage(&self, region_id: RegionId) -> u64 {
+        self.levels
+            .iter()
+            .map(|level_meta| {
+                level_meta
+                    .files
+                    .values()
+                    .filter(|file_handle| file_handle.region_id() == region_id)
                     .map(|file_handle| {
                         let meta = file_handle.meta_ref();
                         meta.file_size
@@ -149,14 +160,15 @@ impl SstVersion {
             .sum()
     }
 
-    /// Returns SST index files'space occupied in current version.
-    pub(crate) fn index_usage(&self) -> u64 {
+    /// Returns the space occupied by SST index files owned by `region_id`.
+    pub(crate) fn owned_index_usage(&self, region_id: RegionId) -> u64 {
         self.levels
             .iter()
             .map(|level_meta| {
                 level_meta
                     .files
                     .values()
+                    .filter(|file_handle| file_handle.region_id() == region_id)
                     .map(|file_handle| {
                         let meta = file_handle.meta_ref();
                         meta.index_file_size
@@ -257,4 +269,50 @@ mod tests {
             assert!(added_files.contains_key(&f.file_id));
         });
     }
+
+    #[test]
+    fn test_usage_only_counts_owned_files() {
+        let purger = new_noop_file_purger();
+        let region_id = RegionId::new(1, 1);
+        let other_region_id = RegionId::new(1, 2);
+        // Per-region totals differ on purpose, so a swapped or inverted filter fails.
+        let files = [
+            FileMeta {
+                region_id,
+                file_id: FileId::random(),
+                file_size: 100,
+                index_file_size: 10,
+                num_rows: 1,
+                ..Default::default()
+            },
+            FileMeta {
+                region_id,
+                file_id: FileId::random(),
+                file_size: 200,
+                index_file_size: 20,
+                num_rows: 2,
+                ..Default::default()
+            },
+            FileMeta {
+                region_id: other_region_id,
+                file_id: FileId::random(),
+                file_size: 400,
+                index_file_size: 40,
+                num_rows: 4,
+                ..Default::default()
+            },
+        ];
+
+        let mut version = SstVersion::new();
+        version.add_files(purger, files.iter().cloned());
+        // `region_id` owns the first two files; `other_region_id` owns the third.
+        assert_eq!(3, version.owned_num_rows(region_id));
+        assert_eq!(2, version.owned_num_files(region_id));
+        assert_eq!(300, version.owned_sst_usage(region_id));
+        assert_eq!(30, version.owned_index_usage(region_id));
+        assert_eq!(4, version.owned_num_rows(other_region_id));
+        assert_eq!(1, version.owned_num_files(other_region_id));
+        assert_eq!(400, version.owned_sst_usage(other_region_id));
+        assert_eq!(40, version.owned_index_usage(other_region_id));
+    }
 }
diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs
index 88be908423..3e1d42caf2 100644
--- a/src/mito2/src/worker.rs
+++ b/src/mito2/src/worker.rs
@@ -208,6 +208,7 @@ impl WorkerGroup {
                 .page_cache_size(config.page_cache_size.as_bytes())
                 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
                 .range_result_cache_size(config.range_result_cache_size.as_bytes())
+                .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
                 .index_metadata_size(config.index.metadata_cache_size.as_bytes())
                 .index_content_size(config.index.content_cache_size.as_bytes())
                 .index_content_page_size(config.index.content_cache_page_size.as_bytes())
@@ -423,6 +424,7 @@ impl WorkerGroup {
                 .page_cache_size(config.page_cache_size.as_bytes())
                 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
                 .range_result_cache_size(config.range_result_cache_size.as_bytes())
+                .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
                 .write_cache(write_cache)
                 .build(),
         );
diff --git a/src/operator/src/expr_helper.rs b/src/operator/src/expr_helper.rs
index 378122030c..af6e7d1032 100644
--- a/src/operator/src/expr_helper.rs
+++ b/src/operator/src/expr_helper.rs
@@ -689,11 +689,17 @@ pub struct RepartitionRequest {
     pub catalog_name: String,
     pub schema_name: String,
     pub table_name: String,
-    pub from_exprs: Vec<Expr>,
+    pub source: RepartitionSource,
     pub into_exprs: Vec<Expr>,
     pub options: OptionMap,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum RepartitionSource {
+    Partitions { from_exprs: Vec<Expr> },
+    Unpartitioned { partition_columns: Vec<String> },
+}
+
 pub(crate) fn to_repartition_request(
     alter_table: AlterTable,
     query_ctx: &QueryContextRef,
@@ -708,19 +714,37 @@ pub(crate) fn to_repartition_request(
         .map_err(BoxedError::new)
         .context(ExternalSnafu)?;
 
-    let AlterTableOperation::Repartition { operation } = alter_operation else {
-        return InvalidSqlSnafu {
-            err_msg: "expected REPARTITION operation",
+    let (source, into_exprs) = match alter_operation {
+        AlterTableOperation::Repartition { operation } => (
+            RepartitionSource::Partitions {
+                from_exprs: operation.from_exprs,
+            },
+            operation.into_exprs,
+        ),
+        AlterTableOperation::Partition { partitions } => (
+            RepartitionSource::Unpartitioned {
+                partition_columns: partitions
+                    .column_list
+                    .into_iter()
+                    .map(|ident| ident.value)
+                    .collect(),
+            },
+            partitions.exprs,
+        ),
+        _ => {
+            return InvalidSqlSnafu {
+                err_msg: "expected REPARTITION or PARTITION operation",
+            }
+            .fail();
         }
-        .fail();
     };
 
     Ok(RepartitionRequest {
         catalog_name,
         schema_name,
         table_name,
-        from_exprs: operation.from_exprs,
-        into_exprs: operation.into_exprs,
+        source,
+        into_exprs,
         options,
     })
 }
@@ -814,6 +838,12 @@ pub(crate) fn to_alter_table_expr(
             }
             .fail();
         }
+        AlterTableOperation::Partition { .. } => {
+            return NotSupportedSnafu {
+                feat: "ALTER TABLE ... PARTITION ON COLUMNS",
+            }
+            .fail();
+        }
         AlterTableOperation::SetIndex { options } => {
             let option = match options {
                 sql::statements::alter::SetIndexOperation::Fulltext {
@@ -1687,9 +1717,11 @@ ALTER TABLE metrics REPARTITION (
         assert_eq!("greptime", request.catalog_name);
         assert_eq!("public", request.schema_name);
         assert_eq!("metrics", request.table_name);
+        let RepartitionSource::Partitions { from_exprs } = request.source else {
+            unreachable!()
+        };
         assert_eq!(
-            request
-                .from_exprs
+            from_exprs
                 .into_iter()
                 .map(|x| x.to_string())
                 .collect::<Vec<_>>(),
@@ -1708,6 +1740,44 @@ ALTER TABLE metrics REPARTITION (
         );
     }
 
+    #[test]
+    fn test_to_repartition_request_with_unpartitioned_source() {
+        let sql = r#"
+ALTER TABLE metrics PARTITION ON COLUMNS (device_id, area) (
+  device_id < 100 AND area < 'South',
+  device_id < 100 AND area >= 'South'
+);"#;
+        let stmt =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
+                .unwrap()
+                .pop()
+                .unwrap();
+        // The SQL above must parse into an ALTER TABLE statement.
+        let Statement::AlterTable(alter_table) = stmt else {
+            unreachable!()
+        };
+        // PARTITION ON COLUMNS must map to an unpartitioned source plus target exprs.
+        let request = to_repartition_request(alter_table, &QueryContext::arc()).unwrap();
+        assert_eq!("greptime", request.catalog_name);
+        assert_eq!("public", request.schema_name);
+        assert_eq!("metrics", request.table_name);
+        let RepartitionSource::Unpartitioned { partition_columns } = request.source else {
+            unreachable!()
+        };
+        assert_eq!(partition_columns, vec!["device_id", "area"]);
+        assert_eq!(
+            request
+                .into_exprs
+                .into_iter()
+                .map(|x| x.to_string())
+                .collect::<Vec<_>>(),
+            vec![
+                "device_id < 100 AND area < 'South'".to_string(),
+                "device_id < 100 AND area >= 'South'".to_string()
+            ]
+        );
+    }
+
     fn new_test_table_names() -> Vec<TableName> {
         vec![
             TableName {
diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs
index 08d3206548..aef164c6bb 100644
--- a/src/operator/src/statement/ddl.rs
+++ b/src/operator/src/statement/ddl.rs
@@ -19,9 +19,10 @@ use std::time::Duration;
 use api::helper::ColumnDataTypeWrapper;
 use api::v1::alter_table_expr::Kind;
 use api::v1::meta::CreateFlowTask as PbCreateFlowTask;
+use api::v1::repartition::Source;
 use api::v1::{
     AlterDatabaseExpr, AlterTableExpr, CreateFlowExpr, CreateTableExpr, CreateViewExpr,
-    Repartition, column_def,
+    PartitionExprs, Repartition, UnpartitionedSource, column_def,
 };
 #[cfg(feature = "enterprise")]
 use api::v1::{
@@ -34,7 +35,7 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_reado
 use common_catalog::{format_full_flow_name, format_full_table_name};
 use common_error::ext::BoxedError;
 use common_meta::cache_invalidator::Context;
-use common_meta::ddl::create_flow::FlowType;
+use common_meta::ddl::create_flow::{DEFER_ON_MISSING_SOURCE_KEY, FlowType};
 use common_meta::instruction::CacheIdent;
 use common_meta::key::schema_name::{SchemaName, SchemaNameKey};
 use common_meta::procedure_executor::ExecutorContext;
@@ -102,7 +103,7 @@ use crate::error::{
     TableMetadataManagerSnafu, TableNotFoundSnafu, UnrecognizedTableOptionSnafu,
     ViewAlreadyExistsSnafu,
 };
-use crate::expr_helper::{self, RepartitionRequest};
+use crate::expr_helper::{self, RepartitionRequest, RepartitionSource};
 use crate::statement::StatementExecutor;
 use crate::statement::show::create_partitions_stmt;
 use crate::utils::{to_meta_query_context, to_meta_query_context_with_origin_frontend};
@@ -113,7 +114,6 @@ struct DdlSubmitOptions {
     timeout: Duration,
 }
 
-const DEFER_ON_MISSING_SOURCE_KEY: &str = "defer_on_missing_source";
 const ALLOWED_FLOW_OPTIONS: [&str; 1] = [DEFER_ON_MISSING_SOURCE_KEY];
 
 fn build_procedure_id_output(procedure_id: Vec<u8>) -> Result<Output> {
@@ -204,6 +204,39 @@ fn validate_and_normalize_flow_options(
         .collect()
 }
 
+/// Decides the flow type from the state of the source tables alone; `Ok(None)`
+/// means that state does not decide it. Missing sources are handled before
+/// instant-TTL ones and are rejected unless the defer option is "true".
+fn determine_flow_type_for_source_state(
+    flow_name: &str,
+    flow_options: &HashMap<String, String>,
+    has_missing_source_table: bool,
+    has_instant_ttl_source_table: bool,
+) -> Result<Option<FlowType>> {
+    if !has_missing_source_table {
+        return Ok(has_instant_ttl_source_table.then_some(FlowType::Streaming));
+    }
+
+    let deferred = matches!(
+        flow_options.get(DEFER_ON_MISSING_SOURCE_KEY).map(String::as_str),
+        Some("true")
+    );
+    ensure!(
+        deferred,
+        InvalidSqlSnafu {
+            err_msg: format!(
+                "missing source tables for flow '{}'; use WITH ({DEFER_ON_MISSING_SOURCE_KEY} = true) to create a pending flow",
+                flow_name
+            )
+        }
+    );
+    info!(
+        "Flow `{}` is created as a pending batching flow because source tables are missing and defer_on_missing_source=true",
+        flow_name
+    );
+    Ok(Some(FlowType::Batching))
+}
+
 impl StatementExecutor {
     pub fn catalog_manager(&self) -> CatalogManagerRef {
         self.catalog_manager.clone()
@@ -714,7 +747,9 @@ impl StatementExecutor {
         expr: &CreateFlowExpr,
         query_ctx: QueryContextRef,
     ) -> Result<FlowType> {
-        // first check if source table's ttl is instant, if it is, force streaming mode
+        let mut has_missing_source_table = false;
+        let mut has_instant_ttl_source_table = false;
+
         for src_table_name in &expr.source_table_names {
             let table = self
                 .catalog_manager()
@@ -726,16 +761,13 @@ impl StatementExecutor {
                 )
                 .await
                 .map_err(BoxedError::new)
-                .context(ExternalSnafu)?
-                .with_context(|| TableNotFoundSnafu {
-                    table_name: format_full_table_name(
-                        &src_table_name.catalog_name,
-                        &src_table_name.schema_name,
-                        &src_table_name.table_name,
-                    ),
-                })?;
+                .context(ExternalSnafu)?;
+
+            let Some(table) = table else {
+                has_missing_source_table = true;
+                continue;
+            };
 
-            // instant source table can only be handled by streaming mode
             if table.table_info().meta.options.ttl == Some(common_time::TimeToLive::Instant) {
                 warn!(
                     "Source table `{}` for flow `{}`'s ttl=instant, fallback to streaming mode",
@@ -746,10 +778,19 @@ impl StatementExecutor {
                     ),
                     expr.flow_name
                 );
-                return Ok(FlowType::Streaming);
+                has_instant_ttl_source_table = true;
             }
         }
 
+        if let Some(flow_type) = determine_flow_type_for_source_state(
+            &expr.flow_name,
+            &expr.flow_options,
+            has_missing_source_table,
+            has_instant_ttl_source_table,
+        )? {
+            return Ok(flow_type);
+        }
+
         let engine = &self.query_engine;
         let stmts = ParserContext::create_with_dialect(
             &expr.sql,
@@ -1408,7 +1449,7 @@ impl StatementExecutor {
     ) -> Result<Output> {
         if matches!(
             alter_table.alter_operation(),
-            AlterTableOperation::Repartition { .. }
+            AlterTableOperation::Repartition { .. } | AlterTableOperation::Partition { .. }
         ) {
             let request = expr_helper::to_repartition_request(alter_table, &query_context)?;
             return self.repartition_table(request, &query_context).await;
@@ -1468,32 +1509,59 @@ impl StatementExecutor {
         );
 
         let table_info = table.table_info();
-        // Get partition column names from the table metadata.
         let existing_partition_columns = table_info.meta.partition_columns().collect::<Vec<_>>();
-        // Repartition requires the table to have partition columns.
-        ensure!(
-            !existing_partition_columns.is_empty(),
-            InvalidPartitionRuleSnafu {
-                reason: format!(
-                    "table {} does not have partition columns, cannot repartition",
-                    table_ref
-                )
+        let partition_columns = match &request.source {
+            RepartitionSource::Partitions { .. } => {
+                ensure!(
+                    !existing_partition_columns.is_empty(),
+                    InvalidPartitionRuleSnafu {
+                        reason: format!(
+                            "table {} does not have partition columns, cannot repartition",
+                            table_ref
+                        )
+                    }
+                );
+                existing_partition_columns
             }
-        );
+            RepartitionSource::Unpartitioned { partition_columns } => {
+                ensure!(
+                    !partition_columns.is_empty(),
+                    InvalidPartitionRuleSnafu {
+                        reason: "PARTITION ON COLUMNS requires at least one partition column"
+                    }
+                );
+                ensure!(
+                    existing_partition_columns.is_empty(),
+                    InvalidPartitionRuleSnafu {
+                        reason: format!("table {} already has partition columns", table_ref)
+                    }
+                );
+                let column_schemas = table_info.meta.schema.column_schemas();
+                partition_columns
+                    .iter()
+                    .map(|column_name| {
+                        column_schemas
+                            .iter()
+                            .find(|column| &column.name == column_name)
+                            .with_context(|| ColumnNotFoundSnafu { msg: column_name })
+                    })
+                    .collect::<Result<Vec<_>>>()?
+            }
+        };
 
-        // Repartition operations involving columns outside the existing partition columns are not supported.
-        // This restriction ensures repartition only applies to current partition columns.
-        let column_name_and_type = existing_partition_columns
+        let column_name_and_type = partition_columns
             .iter()
             .map(|column| (&column.name, column.data_type.clone()))
             .collect();
         let timezone = query_context.timezone();
         // Convert SQL Exprs to PartitionExprs.
-        let from_partition_exprs = request
-            .from_exprs
-            .iter()
-            .map(|expr| convert_one_expr(expr, &column_name_and_type, &timezone))
-            .collect::<Result<Vec<_>>>()?;
+        let from_partition_exprs = match &request.source {
+            RepartitionSource::Partitions { from_exprs } => from_exprs
+                .iter()
+                .map(|expr| convert_one_expr(expr, &column_name_and_type, &timezone))
+                .collect::<Result<Vec<_>>>()?,
+            RepartitionSource::Unpartitioned { .. } => vec![],
+        };
 
         let mut into_partition_exprs = request
             .into_exprs
@@ -1503,7 +1571,8 @@ impl StatementExecutor {
 
         // `MERGE PARTITION` (and some `REPARTITION`) generates a single `OR` expression from
         // multiple source partitions; try to simplify it for better readability and stability.
-        if from_partition_exprs.len() > 1
+        if matches!(&request.source, RepartitionSource::Partitions { .. })
+            && from_partition_exprs.len() > 1
             && into_partition_exprs.len() == 1
             && let Some(expr) = into_partition_exprs.pop()
         {
@@ -1530,34 +1599,36 @@ impl StatementExecutor {
 
         // Validate that from_partition_exprs are a subset of existing partition exprs.
         // We compare PartitionExpr directly since it implements Eq.
-        for from_expr in &from_partition_exprs {
-            ensure!(
-                existing_partition_exprs.contains(from_expr),
-                InvalidPartitionRuleSnafu {
-                    reason: format!(
-                        "partition expression '{}' does not exist in table {}",
-                        from_expr, table_ref
-                    )
-                }
-            );
+        if matches!(&request.source, RepartitionSource::Partitions { .. }) {
+            for from_expr in &from_partition_exprs {
+                ensure!(
+                    existing_partition_exprs.contains(from_expr),
+                    InvalidPartitionRuleSnafu {
+                        reason: format!(
+                            "partition expression '{}' does not exist in table {}",
+                            from_expr, table_ref
+                        )
+                    }
+                );
+            }
         }
 
         // Build the new partition expressions:
         // new_exprs = existing_exprs - from_exprs + into_exprs
-        let new_partition_exprs: Vec<PartitionExpr> = existing_partition_exprs
-            .into_iter()
-            .filter(|expr| !from_partition_exprs.contains(expr))
-            .chain(into_partition_exprs.clone().into_iter())
-            .collect();
+        let new_partition_exprs: Vec<PartitionExpr> = match &request.source {
+            RepartitionSource::Partitions { .. } => existing_partition_exprs
+                .into_iter()
+                .filter(|expr| !from_partition_exprs.contains(expr))
+                .chain(into_partition_exprs.clone().into_iter())
+                .collect(),
+            RepartitionSource::Unpartitioned { .. } => into_partition_exprs.clone(),
+        };
         let new_partition_exprs_len = new_partition_exprs.len();
         let from_partition_exprs_len = from_partition_exprs.len();
 
         // Validate the new partition expressions using MultiDimPartitionRule and PartitionChecker.
         let _ = MultiDimPartitionRule::try_new(
-            existing_partition_columns
-                .iter()
-                .map(|c| c.name.clone())
-                .collect(),
+            partition_columns.iter().map(|c| c.name.clone()).collect(),
             vec![],
             new_partition_exprs,
             true,
@@ -1574,16 +1645,28 @@ impl StatementExecutor {
         };
         let from_partition_exprs_json = serialize_exprs(from_partition_exprs)?;
         let into_partition_exprs_json = serialize_exprs(into_partition_exprs)?;
+        let source = match &request.source {
+            RepartitionSource::Partitions { .. } => Source::PartitionExprs(PartitionExprs {
+                exprs: from_partition_exprs_json,
+            }),
+            RepartitionSource::Unpartitioned { partition_columns } => {
+                Source::Unpartitioned(UnpartitionedSource {
+                    partition_columns: partition_columns.clone(),
+                })
+            }
+        };
+        let repartition = Repartition {
+            into_partition_exprs: into_partition_exprs_json,
+            source: Some(source),
+            ..Default::default()
+        };
         let mut req = SubmitDdlTaskRequest::new(
             to_meta_query_context(query_context.clone()),
             DdlTask::new_alter_table(AlterTableExpr {
                 catalog_name: request.catalog_name.clone(),
                 schema_name: request.schema_name.clone(),
                 table_name: request.table_name.clone(),
-                kind: Some(Kind::Repartition(Repartition {
-                    from_partition_exprs: from_partition_exprs_json,
-                    into_partition_exprs: into_partition_exprs_json,
-                })),
+                kind: Some(Kind::Repartition(repartition)),
             }),
         );
         req.wait = ddl_options.wait;
@@ -2472,6 +2555,35 @@ SELECT max(c1), min(c2) FROM schema_2.table_2;";
         ));
     }
 
+    #[test]
+    fn test_determine_flow_type_for_source_state_missing_sources_require_opt_in() {
+        let err = determine_flow_type_for_source_state("my_flow", &HashMap::new(), true, false)
+            .unwrap_err();
+        // With no flow options, a missing source is rejected with a hint about the opt-in.
+        assert!(err.to_string().contains(
+            "missing source tables for flow 'my_flow'; use WITH (defer_on_missing_source = true) to create a pending flow"
+        ));
+    }
+
+    #[test]
+    fn test_determine_flow_type_for_source_state_missing_sources_prefer_batching() {
+        let flow_options =
+            HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string())]);
+        // Both flags are set: with the opt-in present, missing sources win over instant TTL.
+        assert_eq!(
+            determine_flow_type_for_source_state("my_flow", &flow_options, true, true).unwrap(),
+            Some(FlowType::Batching)
+        );
+    }
+
+    #[test]
+    fn test_determine_flow_type_for_source_state_instant_ttl_without_missing_sources() {
+        // No source is missing, so an instant-TTL source alone selects streaming mode.
+        let flow_type =
+            determine_flow_type_for_source_state("my_flow", &HashMap::new(), false, true).unwrap();
+        assert_eq!(flow_type, Some(FlowType::Streaming));
+    }
+
     #[test]
     fn test_name_is_match() {
         assert!(!NAME_PATTERN_REG.is_match("/adaf"));
diff --git a/src/promql/src/extension_plan/histogram_fold.rs b/src/promql/src/extension_plan/histogram_fold.rs
index 15dd5f7c8c..b663433859 100644
--- a/src/promql/src/extension_plan/histogram_fold.rs
+++ b/src/promql/src/extension_plan/histogram_fold.rs
@@ -322,16 +322,18 @@ impl HistogramFold {
     /// Transform the schema
     ///
     /// - `le` will be removed
+    ///
+    /// Column qualifiers are preserved so downstream plan nodes can keep
+    /// referencing the columns by their original qualified names.
     fn convert_schema(
         input_schema: &DFSchemaRef,
         le_column: &str,
     ) -> DataFusionResult<DFSchemaRef> {
-        let fields = input_schema.fields();
         // safety: those fields are checked in `check_schema()`
-        let mut new_fields = Vec::with_capacity(fields.len() - 1);
-        for f in fields {
-            if f.name() != le_column {
-                new_fields.push((None, f.clone()));
+        let mut new_fields = Vec::with_capacity(input_schema.fields().len() - 1);
+        for (qualifier, field) in input_schema.iter() {
+            if field.name() != le_column {
+                new_fields.push((qualifier.cloned(), field.clone()));
             }
         }
         Ok(Arc::new(DFSchema::new_with_metadata(
diff --git a/src/query/src/datafusion/json_expr_planner.rs b/src/query/src/datafusion/json_expr_planner.rs
index 4546c0a84d..786561db2b 100644
--- a/src/query/src/datafusion/json_expr_planner.rs
+++ b/src/query/src/datafusion/json_expr_planner.rs
@@ -115,11 +115,14 @@ fn extract_untyped_json_get(expr: &mut Expr) -> Option<&mut ScalarFunction> {
     }
 }
 
-fn push_json_get_type_arg(mut expr: Expr, data_type: DataType) -> Result<Either<Expr, Expr>> {
+fn push_json_get_type_arg(mut expr: Expr, mut data_type: DataType) -> Result<Either<Expr, Expr>> {
     let Some(json_get) = extract_untyped_json_get(&mut expr) else {
         return Ok(Either::Left(expr));
     };
 
+    if data_type.is_string() {
+        data_type = DataType::Utf8View;
+    }
     let with_type = ScalarValue::try_new_null(&data_type).map(|x| Expr::Literal(x, None))?;
     json_get.args.push(with_type);
 
diff --git a/src/query/src/dist_plan/merge_scan.rs b/src/query/src/dist_plan/merge_scan.rs
index 470b4d325f..e1dd635de7 100644
--- a/src/query/src/dist_plan/merge_scan.rs
+++ b/src/query/src/dist_plan/merge_scan.rs
@@ -51,6 +51,7 @@ use tracing::{Instrument, Span};
 use crate::dist_plan::analyzer::AliasMapping;
 use crate::dist_plan::analyzer::utils::patch_batch_timezone;
 use crate::metrics::{MERGE_SCAN_ERRORS_TOTAL, MERGE_SCAN_POLL_ELAPSED, MERGE_SCAN_REGIONS};
+use crate::options::FlowQueryExtensions;
 use crate::region_query::RegionQueryHandlerRef;
 
 #[derive(Debug, Hash, PartialOrd, PartialEq, Eq, Clone)]
@@ -481,6 +482,23 @@ impl MergeScanExec {
         &self.regions
     }
 
+    /// Tells whether this scan only touches regions of the flow's sink table.
+    ///
+    /// Yields `false` when the flow extensions cannot be parsed or carry no sink
+    /// table id, and when the scan has no regions.
+    pub fn is_flow_sink_scan(&self) -> bool {
+        let extensions = self.query_ctx.extensions();
+        let sink_table_id = FlowQueryExtensions::parse_flow_extensions(&extensions)
+            .ok()
+            .flatten()
+            .and_then(|flow_extensions| flow_extensions.sink_table_id);
+
+        sink_table_id.is_some_and(|table_id| {
+            !self.regions.is_empty()
+                && self.regions.iter().all(|region| region.table_id() == table_id)
+        })
+    }
+
     pub fn partition_count(&self) -> usize {
         self.target_partition
     }
diff --git a/src/query/src/dummy_catalog.rs b/src/query/src/dummy_catalog.rs
index 0ff985ea00..c79c8d88ea 100644
--- a/src/query/src/dummy_catalog.rs
+++ b/src/query/src/dummy_catalog.rs
@@ -45,7 +45,7 @@ use table::metadata::{TableId, TableInfoRef};
 use table::table::scan::RegionScanExec;
 
 use crate::error::{GetRegionMetadataSnafu, Result};
-use crate::options::FlowQueryExtensions;
+use crate::options::{FlowIncrementalMode, FlowQueryExtensions};
 
 /// Resolve to the given region (specified by [RegionId]) unconditionally.
 #[derive(Clone, Debug)]
@@ -357,6 +357,8 @@ struct FlowScanDecision {
     /// When present, this becomes the effective memtable upper bound and suppresses
     /// binding a new snapshot on scan open.
     memtable_max_sequence: Option<u64>,
+    /// Whether to skip SST files for memtable-only incremental source scans.
+    skip_sst_files: bool,
 }
 
 impl FlowScanDecision {
@@ -366,6 +368,7 @@ impl FlowScanDecision {
             snapshot_on_scan: false,
             memtable_min_sequence: None,
             memtable_max_sequence: None,
+            skip_sst_files: false,
         }
     }
 }
@@ -379,6 +382,7 @@ fn decide_flow_scan(query_ctx: &QueryContext, region_id: RegionId) -> Result<Flo
             snapshot_on_scan: false,
             memtable_min_sequence: None,
             memtable_max_sequence: query_ctx.get_snapshot(region_id.as_u64()),
+            skip_sst_files: false,
         });
     };
 
@@ -403,12 +407,16 @@ fn decide_flow_scan(query_ctx: &QueryContext, region_id: RegionId) -> Result<Flo
 
     let memtable_max_sequence = query_ctx.get_snapshot(region_id.as_u64());
 
+    let skip_sst_files = apply_incremental
+        && flow_extensions.incremental_mode == Some(FlowIncrementalMode::MemtableOnly);
+
     Ok(FlowScanDecision {
         is_sink_scan: false,
         snapshot_on_scan: memtable_max_sequence.is_none()
             && flow_extensions.should_collect_region_watermark(),
         memtable_min_sequence,
         memtable_max_sequence,
+        skip_sst_files,
     })
 }
 
@@ -424,6 +432,7 @@ fn build_scan_request(
         sst_min_sequence: (!decision.is_sink_scan)
             .then(|| query_ctx.sst_min_sequence(region_id.as_u64()))
             .flatten(),
+        skip_sst_files: decision.skip_sst_files,
         snapshot_on_scan: decision.snapshot_on_scan,
         memtable_min_sequence: decision.memtable_min_sequence,
         memtable_max_sequence: decision.memtable_max_sequence,
@@ -620,7 +629,10 @@ mod tests {
 
     use super::*;
     use crate::error::Error;
-    use crate::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE, FLOW_SINK_TABLE_ID};
+    use crate::options::{
+        FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE, FLOW_RETURN_REGION_SEQ,
+        FLOW_SINK_TABLE_ID,
+    };
 
     fn test_region_id() -> RegionId {
         RegionId::new(1024, 1)
@@ -651,6 +663,49 @@ mod tests {
         assert_eq!(request.sst_min_sequence, Some(7));
     }
 
+    #[test]
+    fn test_terminal_watermark_context_source_and_sink_scan_semantics() {
+        let region_id = test_region_id();
+        // Source scan: only the region-sequence flag is set; expect `snapshot_on_scan`.
+        let query_ctx = QueryContextBuilder::default()
+            .extensions(HashMap::from([(
+                FLOW_RETURN_REGION_SEQ.to_string(),
+                "true".to_string(),
+            )]))
+            .build();
+        let request = scan_request_from_query_context(region_id, &query_ctx).unwrap();
+        assert!(request.snapshot_on_scan);
+        assert_eq!(request.memtable_min_sequence, None);
+        assert_eq!(request.memtable_max_sequence, None);
+        assert_eq!(request.sst_min_sequence, None);
+
+        // Sink scan: the region's table is the sink, so the snapshot/SST bounds set here are dropped.
+        let query_ctx = QueryContextBuilder::default()
+            .extensions(HashMap::from([
+                (FLOW_RETURN_REGION_SEQ.to_string(), "true".to_string()),
+                (
+                    FLOW_SINK_TABLE_ID.to_string(),
+                    region_id.table_id().to_string(),
+                ),
+            ]))
+            .snapshot_seqs(Arc::new(RwLock::new(HashMap::from([(
+                region_id.as_u64(),
+                88_u64,
+            )]))))
+            .sst_min_sequences(Arc::new(RwLock::new(HashMap::from([(
+                region_id.as_u64(),
+                77_u64,
+            )]))))
+            .build();
+
+        let request = scan_request_from_query_context(region_id, &query_ctx).unwrap();
+
+        assert!(!request.snapshot_on_scan);
+        assert_eq!(request.memtable_min_sequence, None);
+        assert_eq!(request.memtable_max_sequence, None);
+        assert_eq!(request.sst_min_sequence, None);
+    }
+
     #[test]
     fn test_scan_request_from_incremental_context_uses_snapshot_bound_intent() {
         let region_id = test_region_id();
@@ -795,6 +850,8 @@ mod tests {
 
         let request = scan_request_from_query_context(region_id, &query_ctx).unwrap();
         assert_eq!(request.memtable_min_sequence, Some(55));
+        assert_eq!(request.sst_min_sequence, None);
+        assert!(request.skip_sst_files);
     }
 
     #[test]
@@ -829,6 +886,7 @@ mod tests {
         assert_eq!(request.memtable_min_sequence, None);
         assert_eq!(request.memtable_max_sequence, None);
         assert_eq!(request.sst_min_sequence, None);
+        assert!(!request.skip_sst_files);
         assert!(!request.snapshot_on_scan);
     }
 
diff --git a/src/query/src/metrics.rs b/src/query/src/metrics.rs
index 7541b191fa..7696291480 100644
--- a/src/query/src/metrics.rs
+++ b/src/query/src/metrics.rs
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet};
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -35,9 +35,6 @@ use crate::options::FlowQueryExtensions;
 /// Intermediate merge state for one participating region while collecting
 /// terminal correctness watermarks across merge-scan sub-stages.
 enum MergeState {
-    /// The region participated, but no explicit watermark result has been seen
-    /// yet for this merge.
-    Participated,
     /// At least one branch reported that this region cannot prove a safe
     /// checkpoint watermark for the current query round.
     Unproved,
@@ -256,7 +253,9 @@ fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermar
     let mut stack = vec![plan];
 
     while let Some(plan) = stack.pop() {
-        if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>() {
+        if let Some(merge_scan) = plan.as_any().downcast_ref::<MergeScanExec>()
+            && !merge_scan.is_flow_sink_scan()
+        {
             merge_merge_scan_region_watermarks(
                 &mut merged,
                 merge_scan
@@ -281,8 +280,6 @@ fn collect_region_watermarks(plan: Arc<dyn ExecutionPlan>) -> Vec<RegionWatermar
 ///
 /// | Current state  | New entry       | Result            | Rationale |
 /// |---------------|-----------------|-------------------|-----------|
-/// | Participated  | Proved(seq)     | Proved(seq)       | First proof for the region |
-/// | Participated  | Unproved         | Unproved          | One branch cannot prove → region is unsafe |
 /// | Proved(old)   | Proved(same)    | Proved(old)       | Convergent proof, keep |
 /// | Proved(old)   | Proved(diff)    | Conflict([old,diff]) | Ambiguous → degrade to unproved |
 /// | Unproved      | _anything_      | Unproved          | Already unsafe, stays unsafe |
@@ -300,15 +297,12 @@ fn merge_region_watermark_entries(
             .entry(entry.region_id)
             .and_modify(|existing| match entry.watermark {
                 None => match existing {
-                    MergeState::Participated | MergeState::Proved(_) => {
+                    MergeState::Proved(_) => {
                         *existing = MergeState::Unproved;
                     }
                     MergeState::Unproved | MergeState::Conflict { .. } => {}
                 },
                 Some(seq) => match existing {
-                    MergeState::Participated => {
-                        *existing = MergeState::Proved(seq);
-                    }
                     MergeState::Unproved => {}
                     MergeState::Proved(existing_seq) if *existing_seq == seq => {}
                     MergeState::Proved(existing_seq) => {
@@ -336,16 +330,32 @@ fn merge_merge_scan_region_watermarks(
     regions: impl IntoIterator<Item = u64>,
     sub_stage_metrics: impl IntoIterator<Item = RecordBatchMetrics>,
 ) {
-    // Regions listed by MergeScanExec participated even when no sub-stage can
-    // prove a watermark. Keep them as explicit `None` entries so callers can
-    // distinguish unproved participation from non-participation.
-    for region_id in regions {
-        merged.entry(region_id).or_insert(MergeState::Participated);
-    }
-
+    let regions = regions.into_iter().collect::<Vec<_>>();
+    let mut proved_or_unproved_regions = BTreeSet::new();
     for metrics in sub_stage_metrics {
+        proved_or_unproved_regions.extend(
+            metrics
+                .region_watermarks
+                .iter()
+                .map(|entry| entry.region_id),
+        );
         merge_region_watermark_entries(merged, metrics.region_watermarks);
     }
+
+    // Regions listed by a MergeScanExec participated even when no sub-stage can
+    // prove a watermark. Merge missing per-scan region entries as explicit
+    // `None` entries so an unproved participating branch vetoes any proof from
+    // another branch for the same region.
+    merge_region_watermark_entries(
+        merged,
+        regions
+            .into_iter()
+            .filter(|region_id| !proved_or_unproved_regions.contains(region_id))
+            .map(|region_id| RegionWatermarkEntry {
+                region_id,
+                watermark: None,
+            }),
+    );
 }
 
 fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWatermarkEntry> {
@@ -354,7 +364,6 @@ fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWa
         .map(|(region_id, state)| RegionWatermarkEntry {
             region_id,
             watermark: match state {
-                MergeState::Participated => None,
                 MergeState::Unproved => None,
                 MergeState::Proved(seq) => Some(seq),
                 MergeState::Conflict { watermarks } => {
@@ -371,10 +380,35 @@ fn finalize_region_watermarks(merged: BTreeMap<u64, MergeState>) -> Vec<RegionWa
 
 #[cfg(test)]
 mod tests {
+    use std::collections::{BTreeMap, BTreeSet};
+    use std::sync::Arc;
+
+    use async_trait::async_trait;
     use datafusion::arrow::datatypes::Schema as ArrowSchema;
+    use datafusion::execution::SessionStateBuilder;
     use datafusion::physical_plan::empty::EmptyExec;
+    use datafusion_expr::LogicalPlanBuilder;
+    use session::ReadPreference;
+    use session::context::QueryContextBuilder;
+    use store_api::storage::RegionId;
+    use table::table_name::TableName;
 
     use super::*;
+    use crate::options::{FLOW_RETURN_REGION_SEQ, FLOW_SINK_TABLE_ID};
+    use crate::region_query::RegionQueryHandler;
+
+    /// Region query handler stub; `do_get` panics because these tests never run remote queries.
+    struct NoopRegionQueryHandler;
+    #[async_trait]
+    impl RegionQueryHandler for NoopRegionQueryHandler {
+        async fn do_get(
+            &self,
+            _read_preference: ReadPreference,
+            _request: common_query::request::QueryRequest,
+        ) -> Result<SendableRecordBatchStream> {
+            unreachable!("metrics tests should not execute remote queries")
+        }
+    }
 
     fn metrics_with_region_watermarks(entries: &[(u64, Option<u64>)]) -> RecordBatchMetrics {
         RecordBatchMetrics {
@@ -389,12 +423,66 @@ mod tests {
         }
     }
 
+    /// Builds a `MergeScanExec` over region 0 of `table_id` with an empty plan and schema.
+    fn test_merge_scan_exec(table_id: u32, query_ctx: QueryContextRef) -> Arc<dyn ExecutionPlan> {
+        let session_state = SessionStateBuilder::new().with_default_features().build();
+        let plan = LogicalPlanBuilder::empty(false).build().unwrap();
+        let schema = ArrowSchema::empty();
+        Arc::new(
+            MergeScanExec::new(
+                &session_state,
+                TableName::new("greptime", "public", "test"),
+                vec![RegionId::new(table_id, 0)],
+                plan,
+                &schema,
+                Arc::new(NoopRegionQueryHandler),
+                query_ctx,
+                1,
+                BTreeMap::<String, BTreeSet<datafusion_common::Column>>::new(),
+            )
+            .unwrap(),
+        )
+    }
+
+    /// Query context with `FLOW_RETURN_REGION_SEQ` on and `sink_table_id` as the sink table.
+    fn flow_query_ctx_with_sink_table_id(sink_table_id: u32) -> QueryContextRef {
+        let ctx = QueryContextBuilder::default()
+            .set_extension(FLOW_RETURN_REGION_SEQ.to_string(), "true".to_string())
+            .set_extension(FLOW_SINK_TABLE_ID.to_string(), sink_table_id.to_string())
+            .build();
+        Arc::new(ctx)
+    }
+
     #[test]
     fn terminal_metrics_returns_none_without_merge_scan() {
         let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(Arc::new(ArrowSchema::empty())));
         assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
     }
 
+    #[test]
+    fn terminal_metrics_skip_flow_sink_merge_scan_regions() {
+        // The scan's only region belongs to table 42, which is also the sink table.
+        let plan = test_merge_scan_exec(42, flow_query_ctx_with_sink_table_id(42));
+
+        assert!(terminal_recordbatch_metrics_from_plan(plan).is_none());
+    }
+
+    #[test]
+    fn terminal_metrics_keep_source_merge_scan_regions_with_sink_extension() {
+        // The scanned table (43) differs from the sink table (42), so its region
+        // must still be reported, here without a proved watermark.
+        let plan = test_merge_scan_exec(43, flow_query_ctx_with_sink_table_id(42));
+        let watermarks = terminal_recordbatch_metrics_from_plan(plan)
+            .unwrap()
+            .region_watermarks;
+
+        let expected = vec![RegionWatermarkEntry {
+            region_id: RegionId::new(43, 0).as_u64(),
+            watermark: None,
+        }];
+        assert_eq!(watermarks, expected);
+    }
+
     #[test]
     fn merge_merge_scan_region_watermarks_marks_missing_watermarks_unproved() {
         let mut merged = BTreeMap::new();
@@ -503,4 +591,44 @@ mod tests {
             }]
         );
     }
+
+    #[test]
+    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value() {
+        let mut merged = BTreeMap::new();
+        // Scan 1 proves watermark 21 for region 9; scan 2 lists region 9 but reports nothing.
+        merge_merge_scan_region_watermarks(
+            &mut merged,
+            [9],
+            [metrics_with_region_watermarks(&[(9, Some(21))])],
+        );
+        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());
+
+        assert_eq!(
+            finalize_region_watermarks(merged),
+            vec![RegionWatermarkEntry {
+                region_id: 9,
+                watermark: None,
+            }]
+        );
+    }
+
+    #[test]
+    fn merge_merge_scan_region_watermarks_missing_branch_vetoes_proved_value_regardless_of_order() {
+        let mut merged = BTreeMap::new();
+        // The scan without metrics is merged first; the later proof of 21 must not win.
+        merge_merge_scan_region_watermarks(&mut merged, [9], std::iter::empty());
+        merge_merge_scan_region_watermarks(
+            &mut merged,
+            [9],
+            [metrics_with_region_watermarks(&[(9, Some(21))])],
+        );
+
+        assert_eq!(
+            finalize_region_watermarks(merged),
+            vec![RegionWatermarkEntry {
+                region_id: 9,
+                watermark: None,
+            }]
+        );
+    }
 }
diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs
index b85320a495..ffbfff5ee2 100644
--- a/src/query/src/optimizer.rs
+++ b/src/query/src/optimizer.rs
@@ -19,6 +19,7 @@ pub mod count_wildcard;
 pub(crate) mod json_type_concretize;
 pub mod parallelize_scan;
 pub mod pass_distribution;
+pub mod promql_tsid_narrow_join;
 pub mod remove_duplicate;
 pub mod scan_hint;
 pub mod string_normalization;
diff --git a/src/query/src/optimizer/promql_tsid_narrow_join.rs b/src/query/src/optimizer/promql_tsid_narrow_join.rs
new file mode 100644
index 0000000000..419415662e
--- /dev/null
+++ b/src/query/src/optimizer/promql_tsid_narrow_join.rs
@@ -0,0 +1,271 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use arrow_schema::{DataType, SchemaRef};
+use datafusion::config::ConfigOptions;
+use datafusion::physical_optimizer::PhysicalOptimizerRule;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode};
+use datafusion_common::Result as DfResult;
+use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
+use datafusion_expr::JoinType;
+use datafusion_physical_expr::expressions::Column;
+use store_api::metric_engine_consts::DATA_SCHEMA_TSID_COLUMN_NAME;
+
+/// Chooses a broadcast-style (`CollectLeft`) hash join for the PromQL vector-vector shape
+/// where the build (left) side only carries value, `__tsid`, and timestamp columns.
+///
+/// PromQL arithmetic joins often keep one side narrow (without raw labels) and the other side
+/// wide with all output labels, and partitioning both sides shuffles the wide stream. The rule
+/// only rewrites inner `Partitioned` joins without a join filter that are keyed on `__tsid` and
+/// a timestamp; `CollectLeft` then gathers the narrow side and the wide one stays partitioned.
+#[derive(Debug)]
+pub struct PromqlTsidNarrowJoin;
+
+impl PhysicalOptimizerRule for PromqlTsidNarrowJoin {
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> DfResult<Arc<dyn ExecutionPlan>> {
+        plan.transform_up(Self::rewrite_join).data() // offers every plan node to `rewrite_join`
+    }
+
+    fn name(&self) -> &str {
+        "PromqlTsidNarrowJoin"
+    }
+
+    fn schema_check(&self) -> bool {
+        true // the rewrite only swaps the partition mode, so the plan schema must not change
+    }
+}
+
+impl PromqlTsidNarrowJoin {
+    /// Rebuilds an eligible `HashJoinExec` in `CollectLeft` mode and reports it as transformed;
+    /// every other plan node is handed back unchanged.
+    fn rewrite_join(plan: Arc<dyn ExecutionPlan>) -> DfResult<Transformed<Arc<dyn ExecutionPlan>>> {
+        let candidate = plan
+            .as_any()
+            .downcast_ref::<HashJoinExec>()
+            .filter(|join| Self::should_collect_left(join));
+        let Some(hash_join) = candidate else {
+            return Ok(Transformed::no(plan));
+        };
+
+        let collect_left_join = hash_join
+            .builder()
+            .with_partition_mode(PartitionMode::CollectLeft)
+            .reset_state()
+            .build_exec()?;
+        Ok(Transformed::yes(collect_left_join))
+    }
+
+    /// Accepts only an inner, filter-free `Partitioned` join whose left input is a narrow
+    /// value/`__tsid`/timestamp schema with fewer fields than the right input, and whose join
+    /// keys cover both `__tsid` and a timestamp pair.
+    fn should_collect_left(hash_join: &HashJoinExec) -> bool {
+        matches!(hash_join.partition_mode(), PartitionMode::Partitioned)
+            && matches!(hash_join.join_type(), JoinType::Inner)
+            && hash_join.filter().is_none()
+            && hash_join.left().schema().fields().len() < hash_join.right().schema().fields().len()
+            && Self::is_promql_value_tsid_time_schema(&hash_join.left().schema())
+            && Self::joins_on_tsid_and_time(hash_join)
+    }
+
+    /// Returns `true` when `schema` has a `greptime_value` field, a `__tsid` field and a
+    /// further timestamp-typed field, and contains no field outside those three kinds.
+    fn is_promql_value_tsid_time_schema(schema: &SchemaRef) -> bool {
+        let (mut value_seen, mut tsid_seen, mut time_seen) = (false, false, false);
+        for field in schema.fields() {
+            // Names are checked first, so only the remaining columns are tested by type.
+            let name = field.name().as_str();
+            if name == "greptime_value" {
+                value_seen = true;
+            } else if name == DATA_SCHEMA_TSID_COLUMN_NAME {
+                tsid_seen = true;
+            } else if matches!(field.data_type(), DataType::Timestamp(_, _)) {
+                time_seen = true;
+            } else {
+                // Any other column (e.g. a label) means this side is not narrow.
+                return false;
+            }
+        }
+
+        value_seen && tsid_seen && time_seen
+    }
+
+    /// Returns `true` when every join key is a plain column and the keys include both a
+    /// `__tsid`/`__tsid` pair and a pair whose two sides are timestamp-typed.
+    fn joins_on_tsid_and_time(hash_join: &HashJoinExec) -> bool {
+        let (left_schema, right_schema) = (hash_join.left().schema(), hash_join.right().schema());
+        let is_time = |schema: &SchemaRef, column: &Column| {
+            matches!(schema.field(column.index()).data_type(), DataType::Timestamp(_, _))
+        };
+
+        let (mut tsid_matched, mut time_matched) = (false, false);
+        for (left_key, right_key) in hash_join.on() {
+            let left_col = left_key.as_any().downcast_ref::<Column>();
+            let right_col = right_key.as_any().downcast_ref::<Column>();
+            let (Some(left_col), Some(right_col)) = (left_col, right_col) else {
+                // A key that is not a plain column falls outside the recognized shape.
+                return false;
+            };
+
+            // Key pairs matching neither shape are ignored rather than rejected.
+            let tsid = DATA_SCHEMA_TSID_COLUMN_NAME;
+            if left_col.name() == tsid && right_col.name() == tsid {
+                tsid_matched = true;
+            } else if is_time(&left_schema, left_col) && is_time(&right_schema, right_col) {
+                time_matched = true;
+            }
+        }
+
+        tsid_matched && time_matched
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow_schema::{DataType, Field, Schema, TimeUnit};
+    use datafusion::common::NullEquality;
+    use datafusion::physical_optimizer::PhysicalOptimizerRule;
+    use datafusion::physical_plan::displayable;
+    use datafusion::physical_plan::empty::EmptyExec;
+    use datafusion::physical_plan::joins::HashJoinExec;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_physical_expr::PhysicalExpr;
+
+    use super::*;
+
+    // Both tests join the same two empty inputs and differ only in which one is the build
+    // (left) side, so the schema and join construction lives in the helpers below.
+
+    /// Name of the value column shared by every test input.
+    const VALUE: &str = "greptime_value";
+    /// Name of the label column that only the wide input carries.
+    const LABEL: &str = "host";
+    /// Name of the timestamp column shared by every test input.
+    const TIME_INDEX: &str = "greptime_timestamp";
+
+    /// Builds an empty input plan over the test schema.
+    ///
+    /// The narrow layout is `greptime_value`, `__tsid`, `greptime_timestamp`. With `with_label`
+    /// set, a `host` column is inserted right after the value column, which shifts `__tsid` and
+    /// the timestamp one position to the right.
+    fn promql_input(with_label: bool) -> Arc<dyn ExecutionPlan> {
+        let mut fields = vec![Field::new(VALUE, DataType::Float64, true)];
+        if with_label {
+            fields.push(Field::new(LABEL, DataType::Utf8, true));
+        }
+        fields.push(Field::new(DATA_SCHEMA_TSID_COLUMN_NAME, DataType::UInt64, false));
+        fields.push(Field::new(
+            TIME_INDEX,
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            false,
+        ));
+
+        Arc::new(EmptyExec::new(Arc::new(Schema::new(fields))))
+    }
+
+    /// Returns a join-key column for `name`, indexed by its position in the schema of `side`.
+    ///
+    /// Panics if `side` has no column called `name`.
+    fn key(side: &Arc<dyn ExecutionPlan>, name: &str) -> Arc<dyn PhysicalExpr> {
+        let index = side.schema().index_of(name).unwrap();
+        Arc::new(Column::new(name, index))
+    }
+
+    /// Builds an inner, filter-free `Partitioned` hash join of `left` and `right`.
+    ///
+    /// The join is keyed on the `__tsid` pair followed by the timestamp pair, and `projection`
+    /// is passed through as the join's output projection.
+    fn partitioned_join(
+        left: Arc<dyn ExecutionPlan>,
+        right: Arc<dyn ExecutionPlan>,
+        projection: Option<Vec<usize>>,
+    ) -> Arc<dyn ExecutionPlan> {
+        // Key indices are resolved per side because the label column shifts them.
+        let on = vec![
+            (key(&left, DATA_SCHEMA_TSID_COLUMN_NAME), key(&right, DATA_SCHEMA_TSID_COLUMN_NAME)),
+            (key(&left, TIME_INDEX), key(&right, TIME_INDEX)),
+        ];
+        let join = HashJoinExec::try_new(
+            left,
+            right,
+            on,
+            // No join filter.
+            None,
+            &JoinType::Inner,
+            projection,
+            PartitionMode::Partitioned,
+            NullEquality::NullEqualsNull,
+            false,
+        )
+        .unwrap();
+
+        Arc::new(join)
+    }
+
+    /// Runs `PromqlTsidNarrowJoin` on `plan` with the default configuration.
+    fn run_rule(plan: Arc<dyn ExecutionPlan>) -> Arc<dyn ExecutionPlan> {
+        PromqlTsidNarrowJoin
+            .optimize(plan, &ConfigOptions::default())
+            .unwrap()
+    }
+
+    /// Views `plan` as the `HashJoinExec` both tests expect at the plan root.
+    ///
+    /// Panics if `plan` is any other kind of node.
+    fn as_hash_join(plan: &Arc<dyn ExecutionPlan>) -> &HashJoinExec {
+        plan.as_any().downcast_ref::<HashJoinExec>().unwrap()
+    }
+
+    #[test]
+    fn chooses_collect_left_for_narrow_promql_build_side() {
+        // Narrow build side (3 columns) joined with a probe side that also carries `host`.
+        // In the join output the left columns are 0..=2 and the right columns are 3..=6, so the
+        // projection keeps the left value column followed by all four right columns.
+        let join = partitioned_join(
+            promql_input(false),
+            promql_input(true),
+            Some(vec![0, 3, 4, 5, 6]),
+        );
+        let original_schema = join.schema();
+
+        let optimized = run_rule(join);
+        // One-line plan rendering, used to check the join projection.
+        let rendered = displayable(optimized.as_ref()).one_line().to_string();
+
+        assert_eq!(as_hash_join(&optimized).partition_mode(), &PartitionMode::CollectLeft);
+        // Rebuilding the join must preserve both the output schema and the projection.
+        assert_eq!(optimized.schema(), original_schema);
+        assert!(rendered.contains(
+            "projection=[greptime_value@0, greptime_value@3, host@4, __tsid@5, greptime_timestamp@6]"
+        ));
+    }
+
+    #[test]
+    fn keeps_partitioned_join_when_left_side_carries_labels() {
+        // Same inputs with the sides swapped: the build (left) side now carries the `host`
+        // label, so its schema is not the narrow value/`__tsid`/timestamp shape.
+        let join = partitioned_join(promql_input(true), promql_input(false), None);
+
+        let optimized = run_rule(join);
+
+        // The join is expected to stay `Partitioned`.
+        assert_eq!(as_hash_join(&optimized).partition_mode(), &PartitionMode::Partitioned);
+    }
+}
diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs
index b95803a52b..c4f4af3c6a 100644
--- a/src/query/src/planner.rs
+++ b/src/query/src/planner.rs
@@ -494,6 +494,36 @@ impl DfLogicalPlanner {
         Ok(())
     }
 
+    /// Assigns `Int64` to every placeholder found in a LIMIT `skip`/`fetch` expression that has
+    /// no type yet in `placeholder_types`; already-typed placeholders are left unchanged.
+    fn infer_limit_placeholder_types(
+        plan: &LogicalPlan,
+        placeholder_types: &mut HashMap<String, Option<DataType>>,
+    ) -> Result<()> {
+        plan.apply(|node| {
+            let LogicalPlan::Limit(limit) = node else {
+                return Ok(TreeNodeRecursion::Continue);
+            };
+
+            for bound in [&limit.skip, &limit.fetch].into_iter().flatten() {
+                bound.apply(|expr| {
+                    if let DfExpr::Placeholder(placeholder) = expr {
+                        let slot = placeholder_types.entry(placeholder.id.clone()).or_default();
+                        if slot.is_none() {
+                            *slot = Some(DataType::Int64);
+                        }
+                    }
+
+                    Ok(TreeNodeRecursion::Continue)
+                })?;
+            }
+
+            Ok(TreeNodeRecursion::Continue)
+        })?;
+
+        Ok(())
+    }
+
     /// Gets inferred parameter types from a logical plan.
     /// Returns a map where each parameter ID is mapped to:
     /// - Some(DataType) if the parameter type could be inferred
@@ -501,7 +531,8 @@ impl DfLogicalPlanner {
     ///
     /// This function first uses DataFusion's `get_parameter_types()` to infer types.
     /// If any parameters have `None` values (i.e., DataFusion couldn't infer their types),
-    /// it falls back to using `extract_placeholder_cast_types()` to detect explicit casts.
+    /// it falls back to using `extract_placeholder_cast_types()` to detect explicit casts
+    /// and applies context-specific inference such as LIMIT/OFFSET placeholders.
     ///
     /// This is because datafusion can only infer types for a limited cases.
     ///
@@ -510,19 +541,15 @@ impl DfLogicalPlanner {
     pub fn get_inferred_parameter_types(
         plan: &LogicalPlan,
     ) -> Result<HashMap<String, Option<DataType>>> {
-        let param_types = plan.get_parameter_types().context(PlanSqlSnafu)?;
+        let mut param_types = plan.get_parameter_types().context(PlanSqlSnafu)?;
 
         let has_none = param_types.values().any(|v| v.is_none());
 
-        if !has_none {
-            Ok(param_types)
-        } else {
+        if has_none {
             let cast_types = Self::extract_placeholder_cast_types(plan)?;
 
-            let mut merged = param_types;
-
             for (id, opt_type) in cast_types {
-                merged
+                param_types
                     .entry(id)
                     .and_modify(|existing| {
                         if existing.is_none() {
@@ -532,8 +559,10 @@ impl DfLogicalPlanner {
                     .or_insert(opt_type);
             }
 
-            Ok(merged)
+            Self::infer_limit_placeholder_types(plan, &mut param_types)?;
         }
+
+        Ok(param_types)
     }
 }
 
@@ -793,6 +822,15 @@ mod tests {
         assert_eq!(type_3, &Some(DataType::Int32));
     }
 
+    #[tokio::test]
+    async fn test_get_inferred_parameter_types_limit_offset() {
+        let plan = parse_sql_to_plan("SELECT id FROM test LIMIT $1 OFFSET $2").await;
+        let types = DfLogicalPlanner::get_inferred_parameter_types(&plan).unwrap();
+        // Both the LIMIT (`$1`) and the OFFSET (`$2`) placeholder must be inferred as Int64.
+        assert_eq!(types.get("$1"), Some(&Some(DataType::Int64)));
+        assert_eq!(types.get("$2"), Some(&Some(DataType::Int64)));
+    }
+
     #[tokio::test]
     async fn test_plan_pql_applies_extension_rules() {
         for inner_agg in ["count", "sum", "avg", "min", "max", "stddev", "stdvar"] {
diff --git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs
index 6b17b7837a..0dacc136e8 100644
--- a/src/query/src/promql/planner.rs
+++ b/src/query/src/promql/planner.rs
@@ -312,17 +312,71 @@ impl PromPlanner {
         let range_ms = range.as_millis() as _;
         self.ctx.range = Some(range_ms);
 
+        let time_index_column =
+            self.ctx
+                .time_index_column
+                .clone()
+                .with_context(|| TimeIndexNotFoundSnafu {
+                    table: self.ctx.table_name.clone().unwrap_or_default(),
+                })?;
+
+        // `RangeManipulate` assumes each input batch holds exactly one series
+        // (it takes tag column values from row 0 and applies them to every
+        // output row). The inner expression may emit batches that mix series,
+        // so sort by series key + time index and split into per-series batches
+        // with a `SeriesDivide` first.
+        let input_schema = input.schema();
+        let input_has_tsid = input_schema.fields().iter().any(|field| {
+            field.name() == DATA_SCHEMA_TSID_COLUMN_NAME
+                && field.data_type() == &ArrowDataType::UInt64
+        });
+        let (series_key_columns, mut sort_exprs) = if input_has_tsid {
+            (
+                vec![DATA_SCHEMA_TSID_COLUMN_NAME.to_string()],
+                vec![
+                    DfExpr::Column(Column::from_name(DATA_SCHEMA_TSID_COLUMN_NAME))
+                        .sort(true, true),
+                ],
+            )
+        } else {
+            // Only use tag columns that survive in the inner plan's schema —
+            // `ctx.tag_columns` can drift from the actual output.
+            let key_columns: Vec<String> = self
+                .ctx
+                .tag_columns
+                .iter()
+                .filter(|name| input_schema.has_column_with_unqualified_name(name))
+                .cloned()
+                .collect();
+            let sort = key_columns
+                .iter()
+                .map(|name| DfExpr::Column(Column::from_name(name)).sort(true, true))
+                .collect::<Vec<_>>();
+            (key_columns, sort)
+        };
+        sort_exprs.push(DfExpr::Column(Column::from_name(&time_index_column)).sort(true, true));
+
+        let sort_plan = LogicalPlanBuilder::from(input)
+            .sort(sort_exprs)
+            .context(DataFusionPlanningSnafu)?
+            .build()
+            .context(DataFusionPlanningSnafu)?;
+        let divide_plan = LogicalPlan::Extension(Extension {
+            node: Arc::new(SeriesDivide::new(
+                series_key_columns,
+                time_index_column.clone(),
+                sort_plan,
+            )),
+        });
+
         let manipulate = RangeManipulate::new(
             self.ctx.start,
             self.ctx.end,
             self.ctx.interval,
             range_ms,
-            self.ctx
-                .time_index_column
-                .clone()
-                .expect("time index should be set in `setup_context`"),
+            time_index_column,
             self.ctx.field_columns.clone(),
-            input,
+            divide_plan,
         )
         .context(DataFusionPlanningSnafu)?;
 
@@ -4023,12 +4077,15 @@ impl PromPlanner {
             return Ok(plan);
         }
 
+        // Preserve column qualifiers so downstream plan nodes can keep referencing
+        // the columns by their original qualified names.
         let project_exprs = schema
-            .fields()
             .iter()
-            .filter(|field| field.name() != DATA_SCHEMA_TSID_COLUMN_NAME)
-            .map(|field| Ok(DfExpr::Column(Column::from_name(field.name().clone()))))
-            .collect::<Result<Vec<_>>>()?;
+            .filter(|(_, field)| field.name() != DATA_SCHEMA_TSID_COLUMN_NAME)
+            .map(|(qualifier, field)| {
+                DfExpr::Column(Column::new(qualifier.cloned(), field.name().clone()))
+            })
+            .collect::<Vec<_>>();
 
         LogicalPlanBuilder::from(plan)
             .project(project_exprs)
@@ -5923,6 +5980,26 @@ mod test {
         indie_query_plan_compare(query, expected).await;
     }
 
+    /// A subquery's outer `PromRangeManipulate` assumes one series per input batch, so the plan
+    /// must place `Sort` + `PromSeriesDivide` between it and the inner `PromInstantManipulate`.
+    #[tokio::test]
+    async fn count_over_time_subquery() {
+        let query = "count_over_time(some_metric[10m:1m])";
+        let expected = String::from(
+            "Filter: prom_count_over_time(timestamp_range,field_0) IS NOT NULL [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
+            \n  Projection: some_metric.timestamp, prom_count_over_time(timestamp_range, field_0) AS prom_count_over_time(timestamp_range,field_0), some_metric.tag_0 [timestamp:Timestamp(ms), prom_count_over_time(timestamp_range,field_0):Float64;N, tag_0:Utf8]\
+            \n    PromRangeManipulate: req range=[0..100000000], interval=[5000], eval range=[600000], time index=[timestamp], values=[\"field_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Dictionary(Int64, Float64);N, timestamp_range:Dictionary(Int64, Timestamp(ms))]\
+            \n      PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
+            \n        Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
+            \n          PromInstantManipulate: range=[-540000..100000000], lookback=[1000], interval=[60000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
+            \n            PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
+            \n              Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
+            \n                Filter: some_metric.timestamp >= TimestampMillisecond(-540999, None) AND some_metric.timestamp <= TimestampMillisecond(100000000, None) [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
+            \n                  TableScan: some_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]",
+        );
+        indie_query_plan_compare(query, expected).await;
+    }
+
     #[tokio::test]
     async fn test_hash_join() {
         let mut eval_stmt = EvalStmt {
@@ -6005,6 +6082,39 @@ mod test {
             .unwrap();
     }
 
+    #[tokio::test]
+    async fn test_histogram_quantile_binary_op() {
+        // Arithmetic applied to a histogram_quantile() result. Regression for #8144:
+        // HistogramFold used to drop the input column qualifiers, so the binary-op
+        // projection failed to resolve the qualified tag column.
+        let query = r#"histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) + 0"#;
+        let prom_expr = parser::parse(query).unwrap();
+
+        // Evaluate from the Unix epoch over 100_000 seconds, with a 5s interval and 1s lookback.
+        let eval_stmt = EvalStmt {
+            expr: prom_expr,
+            start: UNIX_EPOCH,
+            end: UNIX_EPOCH
+                .checked_add(Duration::from_secs(100_000))
+                .unwrap(),
+            interval: Duration::from_secs(5),
+            lookback_delta: Duration::from_secs(1),
+        };
+
+        // Register the bucket table the query reads; `pod` and `le` are the column names
+        // handed to the test table builder.
+        let metric = (
+            DEFAULT_SCHEMA_NAME.to_string(),
+            "http_request_duration_seconds_bucket".to_string(),
+        );
+        let provider = build_test_table_provider_with_fields(&[metric], &["pod", "le"]).await;
+
+        // Planning must succeed rather than fail with a "No field named ..." error.
+        let _ = PromPlanner::stmt_to_plan(provider, &eval_stmt, &build_query_engine_state())
+            .await
+            .unwrap();
+    }
+
     #[tokio::test]
     async fn test_parse_and_operator() {
         let mut eval_stmt = EvalStmt {
diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs
index 45a5700781..4262428091 100644
--- a/src/query/src/query_engine/state.rs
+++ b/src/query/src/query_engine/state.rs
@@ -66,6 +66,7 @@ use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
 use crate::optimizer::json_type_concretize::JsonTypeConcretizeRule;
 use crate::optimizer::parallelize_scan::ParallelizeScan;
 use crate::optimizer::pass_distribution::PassDistribution;
+use crate::optimizer::promql_tsid_narrow_join::PromqlTsidNarrowJoin;
 use crate::optimizer::remove_duplicate::RemoveDuplicate;
 use crate::optimizer::scan_hint::ScanHintRule;
 use crate::optimizer::string_normalization::StringNormalizationRule;
@@ -189,9 +190,13 @@ impl QueryEngineState {
         physical_optimizer
             .rules
             .insert(6, Arc::new(PassDistribution));
+        // Prefer collecting narrow PromQL build sides over repartitioning wide label streams.
+        physical_optimizer
+            .rules
+            .insert(7, Arc::new(PromqlTsidNarrowJoin));
         // Enforce sorting AFTER custom rules that modify the plan structure
         physical_optimizer.rules.insert(
-            7,
+            8,
             Arc::new(datafusion::physical_optimizer::enforce_sorting::EnforceSorting {}),
         );
         // Add rule for windowed sort
diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml
index b35b30968a..f870d3c3ca 100644
--- a/src/servers/Cargo.toml
+++ b/src/servers/Cargo.toml
@@ -82,6 +82,7 @@ log-query.workspace = true
 loki-proto.workspace = true
 metric-engine.workspace = true
 mime_guess = "2.0"
+mysql_common = "0.34"
 notify.workspace = true
 object-pool = "0.5"
 once_cell.workspace = true
diff --git a/src/servers/src/interceptor.rs b/src/servers/src/interceptor.rs
index 7425c228f5..30bc5a2aa0 100644
--- a/src/servers/src/interceptor.rs
+++ b/src/servers/src/interceptor.rs
@@ -23,6 +23,7 @@ use common_error::ext::ErrorExt;
 use common_query::Output;
 use datafusion_expr::LogicalPlan;
 use log_query::LogQuery;
+use promql_parser::parser::Expr;
 use query::parser::PromQuery;
 use session::context::QueryContextRef;
 use sql::statements::statement::Statement;
@@ -58,7 +59,7 @@ pub trait SqlQueryInterceptor {
     /// Called before sql is actually executed. This hook is not called at the moment.
     fn pre_execute(
         &self,
-        _statement: &Statement,
+        _statement: Option<&Statement>,
         _plan: Option<&LogicalPlan>,
         _query_ctx: QueryContextRef,
     ) -> Result<(), Self::Error> {
@@ -111,7 +112,7 @@ where
 
     fn pre_execute(
         &self,
-        statement: &Statement,
+        statement: Option<&Statement>,
         plan: Option<&LogicalPlan>,
         query_ctx: QueryContextRef,
     ) -> Result<(), Self::Error> {
@@ -224,6 +225,7 @@ pub trait PromQueryInterceptor {
     fn pre_execute(
         &self,
         _query: &PromQuery,
+        _expr: &Expr,
         _plan: Option<&LogicalPlan>,
         _query_ctx: QueryContextRef,
     ) -> Result<(), Self::Error> {
@@ -253,11 +255,45 @@ where
     fn pre_execute(
         &self,
         query: &PromQuery,
+        expr: &Expr,
         plan: Option<&LogicalPlan>,
         query_ctx: QueryContextRef,
     ) -> Result<(), Self::Error> {
         if let Some(this) = self {
-            this.pre_execute(query, plan, query_ctx)
+            this.pre_execute(query, expr, plan, query_ctx)
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Delegates to the wrapped interceptor, or returns `output` unchanged when there is none.
+    fn post_execute(
+        &self,
+        output: Output,
+        query_ctx: QueryContextRef,
+    ) -> Result<Output, Self::Error> {
+        match self {
+            Some(interceptor) => interceptor.post_execute(output, query_ctx),
+            None => Ok(output),
+        }
+    }
+}
+
+impl<E> PromQueryInterceptor for Option<&PromQueryInterceptorRef<E>>
+where
+    E: ErrorExt,
+{
+    type Error = E;
+
+    fn pre_execute(
+        &self,
+        query: &PromQuery,
+        expr: &Expr,
+        plan: Option<&LogicalPlan>,
+        query_ctx: QueryContextRef,
+    ) -> Result<(), Self::Error> {
+        if let Some(this) = self {
+            this.pre_execute(query, expr, plan, query_ctx)
         } else {
             Ok(())
         }
diff --git a/src/servers/src/mysql/handler.rs b/src/servers/src/mysql/handler.rs
index 3a80593c63..88a539ea21 100644
--- a/src/servers/src/mysql/handler.rs
+++ b/src/servers/src/mysql/handler.rs
@@ -30,6 +30,7 @@ use datafusion_expr::LogicalPlan;
 use datatypes::prelude::ConcreteDataType;
 use datatypes::schema::Schema;
 use itertools::Itertools;
+use mysql_common::Value as MysqlValue;
 use opensrv_mysql::{
     AsyncMysqlShim, Column, ErrorKind, InitWriter, ParamParser, ParamValue, QueryResultWriter,
     StatementMetaWriter, ValueInner,
@@ -51,9 +52,7 @@ use crate::error::{
     self, DataFrameSnafu, InferParameterTypesSnafu, InvalidPrepareStatementSnafu, Result,
 };
 use crate::metrics::METRIC_AUTH_FAILURE;
-use crate::mysql::helper::{
-    self, format_placeholder, replace_placeholders, transform_placeholders,
-};
+use crate::mysql::helper::{self, format_placeholder, transform_placeholders_with_count};
 use crate::mysql::writer;
 use crate::mysql::writer::{create_mysql_column, handle_err};
 use crate::query_handler::sql::ServerSqlQueryHandlerRef;
@@ -192,28 +191,31 @@ impl MysqlInstanceShim {
             return Ok((vec![], vec![]));
         }
 
-        let (query, param_num) = replace_placeholders(raw_query);
-
         let statement = validate_query(raw_query).await?;
 
         // We have to transform the placeholder, because DataFusion only parses placeholders
         // in the form of "$i", it can't process "?" right now.
-        let statement = transform_placeholders(statement);
+        let (statement, placeholder_count) = transform_placeholders_with_count(statement);
+        let param_num = placeholder_count + 1;
 
         let describe_result = self
             .do_describe(statement.clone(), query_ctx.clone())
             .await?;
         let plan = describe_result.map(|DescribeResult { logical_plan }| logical_plan);
 
-        let params = if let Some(plan) = &plan {
+        let (params, can_cache_as_plan) = if let Some(plan) = &plan {
             let param_types = DfLogicalPlanner::get_inferred_parameter_types(plan)
                 .context(InferParameterTypesSnafu)?
                 .into_iter()
                 .map(|(k, v)| (k, v.map(|v| ConcreteDataType::from_arrow_type(&v))))
                 .collect();
-            prepared_params(&param_types)?
+
+            (
+                prepared_params(&param_types, param_num)?,
+                all_params_have_types(&param_types, param_num),
+            )
         } else {
-            dummy_params(param_num)?
+            (dummy_params(param_num)?, false)
         };
 
         let columns =
@@ -239,17 +241,20 @@ impl MysqlInstanceShim {
                 .unwrap_or_default();
 
         match plan {
-            Some(plan) if params.len() == param_num - 1 => {
+            Some(plan) if can_cache_as_plan => {
                 self.save_plan(SqlPlan::Plan(plan, statement), stmt_key)
                     .inspect_err(|e| {
                         error!(e; "Failed to save prepared statement");
                     })?;
             }
             _ => {
-                self.save_plan(SqlPlan::Statement(statement, query), stmt_key)
-                    .inspect_err(|e| {
-                        error!(e; "Failed to save prepared statement");
-                    })?;
+                self.save_plan(
+                    SqlPlan::Statement(statement, raw_query.to_string()),
+                    stmt_key,
+                )
+                .inspect_err(|e| {
+                    error!(e; "Failed to save prepared statement");
+                })?;
             }
         }
 
@@ -312,7 +317,7 @@ impl MysqlInstanceShim {
                     self.do_query(&query, query_ctx.clone()).await
                 }
             }
-            SqlPlan::Statement(_stmt, query) => {
+            SqlPlan::Statement(stmt, query) => {
                 let param_strs = match params {
                     Params::ProtocolParams(params) => {
                         params.iter().map(convert_param_value_to_string).collect()
@@ -323,7 +328,7 @@ impl MysqlInstanceShim {
                     "do_execute Replacing with Params: {:?}, Original Query: {}",
                     param_strs, query
                 );
-                let query = replace_params(param_strs, query);
+                let query = replace_params(param_strs, stmt, query)?;
                 debug!("Mysql execute replaced query: {}", query);
                 self.do_query(&query, query_ctx.clone()).await
             }
@@ -662,19 +667,142 @@ fn convert_param_value_to_string(param: &ParamValue) -> String {
         ValueInner::UInt(u) => u.to_string(),
         ValueInner::Double(u) => u.to_string(),
         ValueInner::NULL => "NULL".to_string(),
-        ValueInner::Bytes(b) => format!("'{}'", &String::from_utf8_lossy(b)),
+        // MySQL prepared fallback emits SQL text. Delegate bytes/string literal
+        // escaping to mysql_common. `false` means normal MySQL backslash escapes;
+        // if NO_BACKSLASH_ESCAPES is supported in this path later, wire the
+        // session SQL mode here.
+        ValueInner::Bytes(b) => MysqlValue::Bytes(b.to_vec()).as_sql(false),
         ValueInner::Date(_) => format!("'{}'", NaiveDate::from(param.value)),
         ValueInner::Datetime(_) => format!("'{}'", NaiveDateTime::from(param.value)),
         ValueInner::Time(_) => format_duration(Duration::from(param.value)),
     }
 }
 
-fn replace_params(params: Vec<String>, query: String) -> String {
-    let mut query = query;
-    for (index, param) in (1..).zip(params) {
-        query = query.replace(&format_placeholder(index), &param);
-    }
-    query
+/// Replaces every placeholder of `stmt` inside `query` with its rendered parameter.
+///
+/// Placeholder locations are taken from `helper::placeholder_spans(stmt)` and
+/// mapped onto byte ranges of `query`; the placeholder with index `i` receives
+/// `params[i - 1]`. An internal error is returned when the placeholder count
+/// differs from `params.len()`, when a span cannot be mapped onto `query`, when
+/// the mapped text is not `?`, or when two spans overlap.
+fn replace_params(params: Vec<String>, stmt: Statement, mut query: String) -> Result<String> {
+    let spans = helper::placeholder_spans(stmt);
+    ensure!(
+        spans.len() == params.len(),
+        error::InternalSnafu {
+            err_msg: format!(
+                "Prepared statement expected {} parameters but got {}",
+                spans.len(),
+                params.len()
+            )
+        }
+    );
+
+    // Each edit is `(start byte, end byte, replacement text)`.
+    let mut edits: Vec<(usize, usize, String)> = Vec::with_capacity(spans.len());
+    for span in spans {
+        let Some(start) = location_to_byte_offset(&query, span.start_line, span.start_column)
+        else {
+            return error::InternalSnafu {
+                err_msg: format!(
+                    "Invalid placeholder start span: line {}, column {}",
+                    span.start_line, span.start_column
+                ),
+            }
+            .fail();
+        };
+        let Some(end) = location_to_byte_offset(&query, span.end_line, span.end_column) else {
+            return error::InternalSnafu {
+                err_msg: format!(
+                    "Invalid placeholder end span: line {}, column {}",
+                    span.end_line, span.end_column
+                ),
+            }
+            .fail();
+        };
+        let Some(param) = span.index.checked_sub(1).and_then(|idx| params.get(idx)) else {
+            return error::InternalSnafu {
+                err_msg: format!("Missing prepared statement parameter {}", span.index),
+            }
+            .fail();
+        };
+
+        ensure!(
+            start < end && end <= query.len(),
+            error::InternalSnafu {
+                err_msg: format!(
+                    "Invalid placeholder byte span: {}..{} for query length {}",
+                    start,
+                    end,
+                    query.len()
+                )
+            }
+        );
+        ensure!(
+            query.get(start..end) == Some("?"),
+            error::InternalSnafu {
+                err_msg: format!(
+                    "Prepared statement placeholder span maps to {:?} instead of '?'",
+                    query.get(start..end)
+                )
+            }
+        );
+
+        edits.push((start, end, param.clone()));
+    }
+
+    // Spans refer to the original query, so order the edits from the last
+    // placeholder to the first: rewriting one range then never shifts the byte
+    // offsets of the ranges that are still pending.
+    edits.sort_unstable_by(|lhs, rhs| rhs.0.cmp(&lhs.0));
+    for pair in edits.windows(2) {
+        // `pair[1]` starts no later than `pair[0]`, so it must end at or before it.
+        ensure!(
+            pair[1].1 <= pair[0].0,
+            error::InternalSnafu {
+                err_msg: "Overlapping placeholder spans in prepared statement".to_string()
+            }
+        );
+    }
+
+    for (start, end, param) in edits {
+        query.replace_range(start..end, &param);
+    }
+
+    Ok(query)
+}
+
+/// Converts a 1-based `(line, column)` location into a byte offset of `query`.
+///
+/// Columns advance by Rust `char`s rather than bytes (as sqlparser spans do),
+/// and only `\n` starts a new line. Returns `None` for a zero line or column and
+/// for a location that lies outside `query`; the position just past the last
+/// character is valid and maps to `query.len()`.
+fn location_to_byte_offset(query: &str, line: u64, column: u64) -> Option<usize> {
+    if line == 0 || column == 0 {
+        return None;
+    }
+
+    let target = (line, column);
+    let mut cursor = (1u64, 1u64);
+    for (offset, ch) in query.char_indices() {
+        if cursor == target {
+            return Some(offset);
+        }
+
+        cursor = match ch {
+            '\n' => (cursor.0 + 1, 1),
+            _ => (cursor.0, cursor.1 + 1),
+        };
+    }
+
+    // The exclusive end location of a trailing placeholder points just past
+    // the last character, for example the end span of `SELECT ?`.
+    if cursor == target {
+        Some(query.len())
+    } else {
+        None
+    }
 }
 
 fn format_duration(duration: Duration) -> String {
@@ -778,20 +897,42 @@ fn dummy_params(index: usize) -> Result<Vec<Column>> {
 }
 
 /// Parameters that the client must provide when executing the prepared statement.
-fn prepared_params(param_types: &HashMap<String, Option<ConcreteDataType>>) -> Result<Vec<Column>> {
-    let mut params = Vec::with_capacity(param_types.len());
+///
+/// One column is produced for each placeholder index in `1..param_num`. An
+/// index with no known type in `param_types` is described with the null
+/// datatype, so the number of returned columns does not depend on inference.
+fn prepared_params(
+    param_types: &HashMap<String, Option<ConcreteDataType>>,
+    param_num: usize,
+) -> Result<Vec<Column>> {
+    // `saturating_sub` keeps a zero `param_num` from underflowing.
+    let mut params = Vec::with_capacity(param_num.saturating_sub(1));
 
     // Placeholder index starts from 1
-    for index in 1..=param_types.len() {
-        if let Some(Some(t)) = param_types.get(&format_placeholder(index)) {
-            let column = create_mysql_column(t, "")?;
-            params.push(column);
-        }
+    for i in 1..param_num {
+        let column = match param_types.get(&format_placeholder(i)) {
+            Some(Some(t)) => create_mysql_column(t, "")?,
+            // Missing entry or entry without a type.
+            _ => create_mysql_column(&ConcreteDataType::null_datatype(), "")?,
+        };
+        params.push(column);
     }
 
     Ok(params)
 }
 
+/// Returns whether `param_types` has exactly one entry per placeholder index in
+/// `1..param_num` and each of those entries carries a known type.
+fn all_params_have_types(
+    param_types: &HashMap<String, Option<ConcreteDataType>>,
+    param_num: usize,
+) -> bool {
+    // `checked_sub` yields `None` for a zero `param_num` instead of underflowing,
+    // so that case is reported as `false`.
+    param_num.checked_sub(1) == Some(param_types.len())
+        && (1..param_num).all(|i| matches!(param_types.get(&format_placeholder(i)), Some(Some(_))))
+}
+
 #[cfg(test)]
 mod tests {
     use std::sync::Arc;
@@ -852,6 +984,122 @@ mod tests {
         )
     }
 
+    fn statement_with_transformed_placeholders(query: &str) -> Statement {
+        let mut statements =
+            ParserContext::create_with_dialect(query, &MySqlDialect {}, ParseOptions::default())
+                .unwrap();
+        assert_eq!(statements.len(), 1);
+        transform_placeholders_with_count(statements.remove(0)).0
+    }
+
+    #[test]
+    fn test_prepared_params_keep_unknown_type_placeholders() {
+        let mut param_types = HashMap::new();
+        param_types.insert(format_placeholder(1), None);
+        param_types.insert(
+            format_placeholder(2),
+            Some(ConcreteDataType::int32_datatype()),
+        );
+
+        let params = prepared_params(&param_types, 3).unwrap();
+        assert_eq!(params.len(), 2);
+        assert!(!all_params_have_types(&param_types, 3));
+    }
+
+    #[test]
+    fn test_replace_params_by_placeholder_span() {
+        let query = "SELECT ?, ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'$2 should stay'".to_string(), "'value'".to_string()];
+
+        assert_eq!(
+            "SELECT '$2 should stay', 'value'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT ?, ?, ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec![
+            "'much longer than a placeholder'".to_string(),
+            "0".to_string(),
+            "'also much longer than a placeholder'".to_string(),
+        ];
+
+        assert_eq!(
+            "SELECT 'much longer than a placeholder', 0, 'also much longer than a placeholder'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT '$1', \"$2\", `$3`, ?, ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'1'".to_string(), "'2'".to_string()];
+
+        assert_eq!(
+            "SELECT '$1', \"$2\", `$3`, '1', '2'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT /* ? */ ? -- ?\n, ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'first'".to_string(), "'second'".to_string()];
+
+        assert_eq!(
+            "SELECT /* ? */ 'first' -- ?\n, 'second'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT '中文', ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'value'".to_string()];
+
+        assert_eq!(
+            "SELECT '中文', 'value'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT '中文',\n  ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'value'".to_string()];
+
+        assert_eq!(
+            "SELECT '中文',\n  'value'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT 'x'\r\n, ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'crlf'".to_string()];
+
+        assert_eq!(
+            "SELECT 'x'\r\n, 'crlf'",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SELECT\t?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["NULL".to_string()];
+
+        assert_eq!("SELECT\tNULL", replace_params(params, stmt, query).unwrap());
+
+        let query = "SELECT CAST(? AS INT64), ? + (SELECT ?)".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["1".to_string(), "2".to_string(), "3".to_string()];
+
+        assert_eq!(
+            "SELECT CAST(1 AS INT64), 2 + (SELECT 3)",
+            replace_params(params, stmt, query).unwrap()
+        );
+
+        let query = "SET time_zone = ?".to_string();
+        let stmt = statement_with_transformed_placeholders(&query);
+        let params = vec!["'UTC'".to_string()];
+
+        assert_eq!(
+            "SET time_zone = 'UTC'",
+            replace_params(params, stmt, query).unwrap()
+        );
+    }
+
     #[tokio::test]
     async fn test_prepare_federated_query() {
         let mut shim = create_shim();
diff --git a/src/servers/src/mysql/helper.rs b/src/servers/src/mysql/helper.rs
index 2ee2421892..c4c072b007 100644
--- a/src/servers/src/mysql/helper.rs
+++ b/src/servers/src/mysql/helper.rs
@@ -23,6 +23,7 @@ use datatypes::prelude::ConcreteDataType;
 use datatypes::schema::ColumnSchema;
 use datatypes::types::TimestampType;
 use datatypes::value::{self, Value};
+#[cfg(test)]
 use itertools::Itertools;
 use opensrv_mysql::{ParamValue, ValueInner, to_naive_datetime};
 use snafu::ResultExt;
@@ -31,6 +32,17 @@ use sql::statements::statement::Statement;
 
 use crate::error::{self, Result};
 
+/// Location of a prepared-statement placeholder in the original SQL text.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) struct PlaceholderSpan {
+    /// 1-based placeholder index.
+    pub(crate) index: usize,
+    pub(crate) start_line: u64,
+    pub(crate) start_column: u64,
+    pub(crate) end_line: u64,
+    pub(crate) end_column: u64,
+}
+
 /// Returns the placeholder string "$i".
 pub fn format_placeholder(i: usize) -> String {
     format!("${}", i)
@@ -38,6 +50,7 @@ pub fn format_placeholder(i: usize) -> String {
 
 /// Replace all the "?" placeholder into "$i" in SQL,
 /// returns the new SQL and the last placeholder index.
+#[cfg(test)]
 pub fn replace_placeholders(query: &str) -> (String, usize) {
     let query_parts = query.split('?').collect::<Vec<_>>();
     let parts_len = query_parts.len();
@@ -58,27 +71,60 @@ pub fn replace_placeholders(query: &str) -> (String, usize) {
     (query, index + 1)
 }
 
-/// Transform all the "?" placeholder into "$i".
-/// Only works for Insert,Query and Delete statements.
-pub fn transform_placeholders(stmt: Statement) -> Statement {
-    match stmt {
-        Statement::Query(mut query) => {
-            visit_placeholders(&mut query.inner);
-            Statement::Query(query)
-        }
-        Statement::Insert(mut insert) => {
-            visit_placeholders(&mut insert.inner);
-            Statement::Insert(insert)
-        }
-        Statement::Delete(mut delete) => {
-            visit_placeholders(&mut delete.inner);
-            Statement::Delete(delete)
-        }
-        stmt => stmt,
-    }
+/// Rewrites every "?" placeholder of `stmt` into "$i" and returns the statement
+/// together with the number of placeholders that were rewritten.
+pub fn transform_placeholders_with_count(mut stmt: Statement) -> (Statement, usize) {
+    let rewritten = visit_placeholders(&mut stmt);
+    (stmt, rewritten)
 }
 
-fn visit_placeholders<V>(v: &mut V)
+/// Returns the source span of every "$i" placeholder in `stmt`.
+pub(crate) fn placeholder_spans(mut stmt: Statement) -> Vec<PlaceholderSpan> {
+    let mut found = Vec::new();
+    collect_placeholder_spans(&mut stmt, &mut found);
+    found
+}
+
+/// Appends a [`PlaceholderSpan`] to `spans` for each placeholder expression in
+/// `v` whose text is "$" followed by a positive index.
+fn collect_placeholder_spans<V>(v: &mut V, spans: &mut Vec<PlaceholderSpan>)
+where
+    V: VisitMut,
+{
+    let _ = visit_expressions_mut(v, |expr| {
+        let Expr::Value(ValueWithSpan {
+            value: ValueExpr::Placeholder(text),
+            span,
+        }) = expr
+        else {
+            return ControlFlow::<()>::Continue(());
+        };
+
+        if let Some(index) = placeholder_index(text) {
+            spans.push(PlaceholderSpan {
+                index,
+                start_line: span.start.line,
+                start_column: span.start.column,
+                end_line: span.end.line,
+                end_column: span.end.column,
+            });
+        }
+        ControlFlow::<()>::Continue(())
+    });
+}
+
+/// Parses the 1-based index out of a "$i" placeholder.
+///
+/// Returns `None` when `s` does not start with "$", when the remainder is not a
+/// `usize`, or when the index is zero.
+fn placeholder_index(s: &str) -> Option<usize> {
+    match s.strip_prefix('$')?.parse::<usize>() {
+        Ok(index) if index > 0 => Some(index),
+        _ => None,
+    }
+}
+
+fn visit_placeholders<V>(v: &mut V) -> usize
 where
     V: VisitMut,
 {
@@ -88,12 +125,14 @@ where
             value: ValueExpr::Placeholder(s),
             ..
         }) = expr
+            && s == "?"
         {
             *s = format_placeholder(index);
             index += 1;
         }
         ControlFlow::<()>::Continue(())
     });
+    index - 1
 }
 
 /// Convert [`ParamValue`] into [`Value`] according to param type.
@@ -340,33 +379,52 @@ mod tests {
     #[test]
     fn test_transform_placeholders() {
         let insert = parse_sql("insert into demo values(?,?,?)");
-        let Statement::Insert(insert) = transform_placeholders(insert) else {
+        let (stmt, count) = transform_placeholders_with_count(insert);
+        let Statement::Insert(insert) = stmt else {
             unreachable!()
         };
         assert_eq!(
             "INSERT INTO demo VALUES ($1, $2, $3)",
             insert.inner.to_string()
         );
+        assert_eq!(3, count);
 
         let delete = parse_sql("delete from demo where host=? and idc=?");
-        let Statement::Delete(delete) = transform_placeholders(delete) else {
+        let (stmt, count) = transform_placeholders_with_count(delete);
+        let Statement::Delete(delete) = stmt else {
             unreachable!()
         };
         assert_eq!(
             "DELETE FROM demo WHERE host = $1 AND idc = $2",
             delete.inner.to_string()
         );
+        assert_eq!(2, count);
 
         let select = parse_sql(
             "select * from demo where host=? and idc in (select idc from idcs where name=?) and cpu>?",
         );
-        let Statement::Query(select) = transform_placeholders(select) else {
+        let (stmt, count) = transform_placeholders_with_count(select);
+        let Statement::Query(select) = stmt else {
             unreachable!()
         };
         assert_eq!(
             "SELECT * FROM demo WHERE host = $1 AND idc IN (SELECT idc FROM idcs WHERE name = $2) AND cpu > $3",
             select.inner.to_string()
         );
+        assert_eq!(3, count);
+
+        let select = parse_sql("select '?', ?");
+        let (stmt, count) = transform_placeholders_with_count(select);
+        let Statement::Query(select) = stmt else {
+            unreachable!()
+        };
+        assert_eq!("SELECT '?', $1", select.inner.to_string());
+        assert_eq!(1, count);
+
+        let set = parse_sql("set time_zone = ?");
+        let (stmt, count) = transform_placeholders_with_count(set);
+        assert_eq!("SET time_zone = $1", stmt.to_string());
+        assert_eq!(1, count);
     }
 
     #[test]
diff --git a/src/servers/tests/interceptor.rs b/src/servers/tests/interceptor.rs
index 7712c90332..fd516a0dfe 100644
--- a/src/servers/tests/interceptor.rs
+++ b/src/servers/tests/interceptor.rs
@@ -90,6 +90,7 @@ impl PromQueryInterceptor for NoopInterceptor {
     fn pre_execute(
         &self,
         query: &PromQuery,
+        _expr: &promql_parser::parser::Expr,
         _plan: Option<&LogicalPlan>,
         _query_ctx: QueryContextRef,
     ) -> std::result::Result<(), Self::Error> {
@@ -121,7 +122,13 @@ fn test_prom_interceptor() {
         ..Default::default()
     };
 
-    let fail = PromQueryInterceptor::pre_execute(&di, &query, None, ctx.clone());
+    let fail = PromQueryInterceptor::pre_execute(
+        &di,
+        &query,
+        &promql_parser::parser::parse(&query.query).unwrap(),
+        None,
+        ctx.clone(),
+    );
     assert!(fail.is_err());
 
     let output = Output::new_with_affected_rows(1);
diff --git a/src/servers/tests/mysql/mysql_server_test.rs b/src/servers/tests/mysql/mysql_server_test.rs
index e0cb086dda..888ac92fc3 100644
--- a/src/servers/tests/mysql/mysql_server_test.rs
+++ b/src/servers/tests/mysql/mysql_server_test.rs
@@ -516,6 +516,172 @@ async fn test_query_prepared() -> Result<()> {
         _ => unreachable!(),
     }
 
+    // Regression test for #8142: LIMIT ? should work in prepared statements.
+    // The LIMIT placeholder should be inferred as Int64 so the MySQL prepare
+    // response advertises the correct parameter count.
+    {
+        let stmt = connection
+            .prep("SELECT uint32s FROM all_datatypes LIMIT ?")
+            .await
+            .unwrap();
+        let rows: Vec<Row> = connection
+            .exec(stmt, vec![mysql_async::Value::Int(1)])
+            .await
+            .unwrap();
+        assert_eq!(rows.len(), 1, "LIMIT 1 should return 1 row");
+    }
+
+    // Also cover mixed placeholders: the WHERE placeholder is inferred from
+    // the column type and the LIMIT placeholder is inferred from its context.
+    {
+        let stmt = connection
+            .prep("SELECT uint32s FROM all_datatypes WHERE uint32s >= ? LIMIT ?")
+            .await
+            .unwrap();
+        let rows: Vec<Row> = connection
+            .exec(
+                stmt,
+                vec![mysql_async::Value::UInt(0), mysql_async::Value::UInt(1)],
+            )
+            .await
+            .unwrap();
+        assert_eq!(rows.len(), 1, "LIMIT 1 should return 1 row");
+    }
+
+    // Untyped placeholders should still be advertised in the MySQL prepare
+    // response. This used to fail on the client side because the server
+    // reported 0 parameters for `SELECT ?`.
+    {
+        let stmt = connection.prep("SELECT ?").await.unwrap();
+        assert_eq!(stmt.num_params(), 1);
+
+        let row: Option<String> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(b"can't".to_vec())])
+            .await
+            .unwrap();
+        assert_eq!(row, Some("can't".to_string()));
+
+        let stmt = connection.prep("SELECT ?").await.unwrap();
+        let row: Option<String> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(b"a\\'b".to_vec())])
+            .await
+            .unwrap();
+        assert_eq!(row, Some("a\\'b".to_string()));
+
+        let stmt = connection.prep("SELECT ?").await.unwrap();
+        let row: Option<Vec<u8>> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(vec![0xFF, 0xFE])])
+            .await
+            .unwrap();
+        assert_eq!(row, Some(vec![0xFF, 0xFE]));
+
+        let stmt = connection.prep("SELECT ?").await.unwrap();
+        let row: Option<Option<String>> = connection
+            .exec_first(stmt, vec![mysql_async::Value::NULL])
+            .await
+            .unwrap();
+        assert_eq!(row, Some(None));
+    }
+
+    // Values inserted into the SQL text must not be processed again while
+    // replacing later placeholders.
+    {
+        let stmt = connection.prep("SELECT ?, ?").await.unwrap();
+        assert_eq!(stmt.num_params(), 2);
+
+        let row: Option<(String, String)> = connection
+            .exec_first(
+                stmt,
+                vec![
+                    mysql_async::Value::Bytes(b"keep $2".to_vec()),
+                    mysql_async::Value::Bytes(b"second".to_vec()),
+                ],
+            )
+            .await
+            .unwrap();
+        assert_eq!(row, Some(("keep $2".to_string(), "second".to_string())));
+    }
+
+    // Non-placeholder question marks inside string literals must not affect
+    // the advertised prepare parameter count.
+    {
+        let stmt = connection.prep("SELECT '?', ?").await.unwrap();
+        assert_eq!(stmt.num_params(), 1);
+
+        let row: Option<(String, String)> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(b"actual".to_vec())])
+            .await
+            .unwrap();
+        assert_eq!(row, Some(("?".to_string(), "actual".to_string())));
+
+        let stmt = connection.prep("SELECT '$1', ?").await.unwrap();
+        assert_eq!(stmt.num_params(), 1);
+
+        let row: Option<(String, String)> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(b"actual".to_vec())])
+            .await
+            .unwrap();
+        assert_eq!(row, Some(("$1".to_string(), "actual".to_string())));
+
+        let stmt = connection.prep("SELECT /* ? */ ? -- ?\n").await.unwrap();
+        assert_eq!(stmt.num_params(), 1);
+
+        let row: Option<String> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(b"commented".to_vec())])
+            .await
+            .unwrap();
+        assert_eq!(row, Some("commented".to_string()));
+
+        let stmt = connection.prep("SELECT '中文', ?").await.unwrap();
+        assert_eq!(stmt.num_params(), 1);
+
+        let row: Option<(String, String)> = connection
+            .exec_first(stmt, vec![mysql_async::Value::Bytes(b"actual".to_vec())])
+            .await
+            .unwrap();
+        assert_eq!(row, Some(("中文".to_string(), "actual".to_string())));
+    }
+
+    // Also cover mixed known and unknown placeholders. The projection
+    // placeholder is untyped, while the WHERE placeholder is inferred from the
+    // column type. The prepare response must advertise both parameters.
+    {
+        let stmt = connection
+            .prep("SELECT ?, uint32s FROM all_datatypes WHERE uint32s >= ?")
+            .await
+            .unwrap();
+        assert_eq!(stmt.num_params(), 2);
+
+        let rows: Vec<Row> = connection
+            .exec(
+                stmt,
+                vec![
+                    mysql_async::Value::Bytes(b"unknown".to_vec()),
+                    mysql_async::Value::UInt(0),
+                ],
+            )
+            .await
+            .unwrap();
+        assert!(!rows.is_empty());
+    }
+
+    // LIMIT placeholders used to be a common case where DataFusion did not
+    // infer a parameter type. The prepare response must still advertise the
+    // parameter and execution must substitute it correctly.
+    {
+        let stmt = connection
+            .prep("SELECT uint32s FROM all_datatypes ORDER BY uint32s LIMIT ?")
+            .await
+            .unwrap();
+        assert_eq!(stmt.num_params(), 1);
+
+        let rows: Vec<Row> = connection
+            .exec(stmt, vec![mysql_async::Value::UInt(1)])
+            .await
+            .unwrap();
+        assert_eq!(rows.len(), 1);
+    }
+
     Ok(())
 }
 
diff --git a/src/sql/src/parsers/alter_parser.rs b/src/sql/src/parsers/alter_parser.rs
index e5e1575a20..4aa571005a 100644
--- a/src/sql/src/parsers/alter_parser.rs
+++ b/src/sql/src/parsers/alter_parser.rs
@@ -134,6 +134,7 @@ impl ParserContext<'_> {
                     self.parse_alter_table_merge_partition()?
                 } else {
                     match w.keyword {
+                        Keyword::PARTITION => self.parse_alter_table_partition()?,
                         Keyword::ADD => self.parse_alter_table_add()?,
                         Keyword::DROP => {
                             let _ = self.parser.next_token();
@@ -174,7 +175,7 @@ impl ParserContext<'_> {
                             AlterTableOperation::SetTableOptions { options }
                         }
                         _ => self.expected(
-                            "ADD or DROP or MODIFY or RENAME or SET or REPARTITION or SPLIT or MERGE after ALTER TABLE",
+                            "ADD or DROP or MODIFY or RENAME or SET or UNSET or REPARTITION or SPLIT or MERGE or PARTITION after ALTER TABLE",
                             self.parser.peek_token(),
                         )?,
                     }
@@ -218,6 +219,25 @@ impl ParserContext<'_> {
         })
     }
 
+    /// Parses the `PARTITION ON COLUMNS (...) (...)` operation of `ALTER TABLE`.
+    ///
+    /// Consumes the current token, delegates to `parse_partition_on_columns`, and
+    /// rejects an empty list of partition expressions with a syntax error.
+    fn parse_alter_table_partition(&mut self) -> Result<AlterTableOperation> {
+        // Presumably the `PARTITION` keyword the caller matched on; it is skipped here.
+        let _ = self.parser.next_token();
+        let partitions = self.parse_partition_on_columns()?;
+
+        if !partitions.exprs.is_empty() {
+            return Ok(AlterTableOperation::Partition { partitions });
+        }
+
+        Err(ParserError::ParserError(
+            "PARTITION ON COLUMNS requires at least one partition expression".to_string(),
+        ))
+        .context(error::SyntaxSnafu)
+    }
+
     fn parse_alter_table_split_partition(&mut self) -> Result<AlterTableOperation> {
         let _ = self.parser.next_token();
         self.parser
@@ -976,6 +990,100 @@ ALTER TABLE t REPARTITION (
         }
     }
 
+    #[test]
+    fn test_parse_alter_table_partition_on_columns() {
+        let sql = r#"
+ALTER TABLE sensor_readings PARTITION ON COLUMNS (device_id, area) (
+  device_id < 100 AND area < 'South',
+  device_id < 100 AND area >= 'South',
+  device_id >= 100 AND area <= 'East',
+  device_id >= 100 AND area > 'East'
+);"#;
+        let mut result =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
+                .unwrap();
+        assert_eq!(1, result.len());
+
+        let statement = result.remove(0);
+        assert_matches!(statement, Statement::AlterTable { .. });
+        if let Statement::AlterTable(alter_table) = statement {
+            assert_matches!(
+                alter_table.alter_operation(),
+                AlterTableOperation::Partition { .. }
+            );
+
+            if let AlterTableOperation::Partition { partitions } = alter_table.alter_operation() {
+                assert_eq!(partitions.column_list.len(), 2);
+                assert_eq!(partitions.column_list[0].value, "device_id");
+                assert_eq!(partitions.column_list[1].value, "area");
+                assert_eq!(partitions.exprs.len(), 4);
+                assert_eq!(
+                    partitions.exprs[0].to_string(),
+                    "device_id < 100 AND area < 'South'"
+                );
+                assert_eq!(
+                    partitions.exprs[3].to_string(),
+                    "device_id >= 100 AND area > 'East'"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_parse_alter_table_partition_on_columns_with_options() {
+        let sql = r#"
+ALTER TABLE sensor_readings PARTITION ON COLUMNS (device_id) (
+  device_id < 100,
+  device_id >= 100
+) WITH (
+  TIMEOUT = '5m',
+  WAIT = false
+);"#;
+        let mut result =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
+                .unwrap();
+        assert_eq!(1, result.len());
+
+        let statement = result.remove(0);
+        assert_matches!(statement, Statement::AlterTable { .. });
+        if let Statement::AlterTable(alter_table) = statement {
+            assert_matches!(
+                alter_table.alter_operation(),
+                AlterTableOperation::Partition { .. }
+            );
+            let options = alter_table.options().to_str_map();
+            assert_eq!(options.get("timeout").unwrap(), &"5m");
+            assert_eq!(options.get("wait").unwrap(), &"false");
+            assert_eq!(options.len(), 2);
+        }
+    }
+
+    #[test]
+    fn test_parse_alter_table_partition_on_columns_empty_columns() {
+        let sql = r#"
+ALTER TABLE sensor_readings PARTITION ON COLUMNS () (
+  device_id < 100
+);"#;
+        let result =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
+
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_parse_alter_table_partition_on_columns_empty_exprs() {
+        let sql = r#"
+ALTER TABLE sensor_readings PARTITION ON COLUMNS (device_id) ();"#;
+        let result =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
+                .unwrap_err();
+
+        assert_eq!(
+            result.output_msg(),
+            "Invalid SQL syntax: sql parser error: PARTITION ON COLUMNS requires at least one partition expression"
+        );
+    }
+
     #[test]
     fn test_parse_alter_table_split_partition() {
         let sql = r#"
@@ -1274,7 +1382,7 @@ ALTER TABLE metrics REPARTITION
         let err = result.output_msg();
         assert_eq!(
             err,
-            "Invalid SQL syntax: sql parser error: Expected ADD or DROP or MODIFY or RENAME or SET or REPARTITION or SPLIT or MERGE after ALTER TABLE, found: table_t"
+            "Invalid SQL syntax: sql parser error: Expected ADD or DROP or MODIFY or RENAME or SET or UNSET or REPARTITION or SPLIT or MERGE or PARTITION after ALTER TABLE, found: table_t"
         );
 
         let sql = "ALTER TABLE test_table RENAME table_t";
diff --git a/src/sql/src/parsers/create_parser.rs b/src/sql/src/parsers/create_parser.rs
index a82590c603..b2614bb8d5 100644
--- a/src/sql/src/parsers/create_parser.rs
+++ b/src/sql/src/parsers/create_parser.rs
@@ -502,6 +502,12 @@ impl<'a> ParserContext<'a> {
         if !self.parser.parse_keyword(Keyword::PARTITION) {
             return Ok(None);
         }
+
+        self.parse_partition_on_columns().map(Some)
+    }
+
+    /// Parses the "ON COLUMNS (...) (...)" part after "PARTITION".
+    pub(crate) fn parse_partition_on_columns(&mut self) -> Result<Partitions> {
         self.parser
             .expect_keywords(&[Keyword::ON, Keyword::COLUMNS])
             .context(error::UnexpectedSnafu {
@@ -520,7 +526,7 @@ impl<'a> ParserContext<'a> {
 
         let exprs = self.parse_comma_separated(Self::parse_partition_entry)?;
 
-        Ok(Some(Partitions { column_list, exprs }))
+        Ok(Partitions { column_list, exprs })
     }
 
     fn parse_partition_entry(&mut self) -> Result<Expr> {
diff --git a/src/sql/src/statements/alter.rs b/src/sql/src/statements/alter.rs
index ab35e5bd34..72182a4b60 100644
--- a/src/sql/src/statements/alter.rs
+++ b/src/sql/src/statements/alter.rs
@@ -26,6 +26,7 @@ use sqlparser::ast::{ColumnDef, DataType, Expr, Ident, ObjectName, TableConstrai
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::statements::OptionMap;
+use crate::statements::create::Partitions;
 
 #[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)]
 pub struct AlterTable {
@@ -119,6 +120,10 @@ pub enum AlterTableOperation {
     Repartition {
         operation: RepartitionOperation,
     },
+    /// `PARTITION ON COLUMNS (...) (...)`
+    Partition {
+        partitions: Partitions,
+    },
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)]
@@ -248,6 +253,9 @@ impl Display for AlterTableOperation {
             AlterTableOperation::Repartition { operation } => {
                 write!(f, "REPARTITION {operation}")
             }
+            AlterTableOperation::Partition { partitions } => {
+                write!(f, "{partitions}")
+            }
             AlterTableOperation::SetIndex { options } => match options {
                 SetIndexOperation::Fulltext {
                     column_name,
diff --git a/src/standalone/src/procedure.rs b/src/standalone/src/procedure.rs
index 144a56be44..853221e698 100644
--- a/src/standalone/src/procedure.rs
+++ b/src/standalone/src/procedure.rs
@@ -17,7 +17,7 @@ use std::time::Duration;
 
 use common_error::ext::BoxedError;
 use common_meta::ddl::DdlContext;
-use common_meta::ddl_manager::RepartitionProcedureFactory;
+use common_meta::ddl_manager::{RepartitionProcedureFactory, RepartitionSource};
 use common_meta::key::runtime_switch::RuntimeSwitchManager;
 use common_meta::kv_backend::KvBackendRef;
 use common_meta::state_store::KvStateStore;
@@ -66,7 +66,7 @@ impl RepartitionProcedureFactory for StandaloneRepartitionProcedureFactory {
         _ddl_ctx: &DdlContext,
         _table_name: TableName,
         _table_id: TableId,
-        _from_exprs: Vec<String>,
+        _source: RepartitionSource,
         _to_exprs: Vec<String>,
         _timeout: Option<Duration>,
     ) -> std::result::Result<BoxedProcedure, BoxedError> {
diff --git a/src/store-api/src/lib.rs b/src/store-api/src/lib.rs
index cb39875d74..f97e348842 100644
--- a/src/store-api/src/lib.rs
+++ b/src/store-api/src/lib.rs
@@ -23,6 +23,7 @@ mod metrics;
 pub mod mito_engine_options;
 pub mod path_utils;
 pub mod region_engine;
+pub mod region_info;
 pub mod region_request;
 pub mod sst_entry;
 pub mod storage;
diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs
index b235fcffc7..e5ab05e5e7 100644
--- a/src/store-api/src/region_engine.rs
+++ b/src/store-api/src/region_engine.rs
@@ -483,7 +483,10 @@ pub type BatchResponses = Vec<(RegionId, Result<RegionResponse, BoxedError>)>;
 /// Represents the statistics of a region.
 #[derive(Debug, Deserialize, Serialize, Default)]
 pub struct RegionStatistic {
-    /// The number of rows
+    /// The number of rows stored in SST files owned by this region plus rows in memtables.
+    ///
+    /// Rows from SST files referenced from other regions, for example after repartition,
+    /// are not counted to avoid table-level double counting when summing region statistics.
     #[serde(default)]
     pub num_rows: u64,
     /// The size of memtable in bytes.
@@ -492,11 +495,17 @@ pub struct RegionStatistic {
     pub wal_size: u64,
     /// The size of manifest in bytes.
     pub manifest_size: u64,
-    /// The size of SST data files in bytes.
+    /// The size of SST data files owned by this region in bytes.
+    ///
+    /// SST files referenced from other regions, for example after repartition, are not counted.
     pub sst_size: u64,
-    /// The num of SST files.
+    /// The number of SST files owned by this region.
+    ///
+    /// SST files referenced from other regions, for example after repartition, are not counted.
     pub sst_num: u64,
-    /// The size of SST index files in bytes.
+    /// The size of SST index files owned by this region in bytes.
+    ///
+    /// SST index files referenced from other regions, for example after repartition, are not counted.
     #[serde(default)]
     pub index_size: u64,
     /// The details of the region.
diff --git a/src/store-api/src/region_info.rs b/src/store-api/src/region_info.rs
new file mode 100644
index 0000000000..2d099d2734
--- /dev/null
+++ b/src/store-api/src/region_info.rs
@@ -0,0 +1,360 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_recordbatch::DfRecordBatch;
+use datafusion_common::DataFusionError;
+use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, LogicalTableSource};
+use datatypes::arrow::array::{
+    ArrayRef, BooleanArray, StringArray, UInt8Array, UInt32Array, UInt64Array,
+};
+use datatypes::arrow::error::ArrowError;
+use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
+use serde::{Deserialize, Serialize};
+
+use crate::storage::{RegionGroup, RegionId, RegionNumber, RegionSeq, ScanRequest, TableId};
+
+/// Runtime and manifest information of one region; one row of the inspection output.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct RegionInfoEntry {
+    /// The region id; exported as its `u64` representation.
+    pub region_id: RegionId,
+    /// The id of the table this region belongs to.
+    pub table_id: TableId,
+    /// The region number inside the table.
+    pub region_number: RegionNumber,
+    /// The region group; exported as a `UInt8` column.
+    pub region_group: RegionGroup,
+    /// The region sequence inside the group.
+    pub region_sequence: RegionSeq,
+    /// The full runtime role/state label, e.g. `Leader(Writable)`.
+    pub state: String,
+    /// The coarse region role, e.g. `Leader` or `Follower`.
+    pub role: String,
+    /// Whether the region accepts writes.
+    pub writable: bool,
+    /// The committed sequence of the region.
+    pub committed_sequence: u64,
+    /// The latest sequence that has been persisted into SSTs; `None` is exported as null.
+    pub flushed_sequence: Option<u64>,
+    /// The manifest version of the region.
+    pub manifest_version: u64,
+    /// Human-readable compaction time window, e.g. `1h`; `None` is exported as null.
+    pub compaction_time_window: Option<String>,
+    /// Region options encoded as JSON.
+    pub region_options: String,
+    /// SST format used by the region, e.g. `flat` or `primary_key`.
+    pub sst_format: String,
+    /// Id of the datanode that reports the row; `None` is exported as null.
+    pub node_id: Option<u64>,
+}
+
+impl RegionInfoEntry {
+    /// Returns the schema shared by all region info record batches.
+    ///
+    /// `flushed_sequence`, `compaction_time_window` and `node_id` are the only nullable columns.
+    pub fn schema() -> SchemaRef {
+        use datatypes::prelude::ConcreteDataType as Ty;
+
+        // (column name, data type, nullable) in output column order.
+        let columns = [
+            ("region_id", Ty::uint64_datatype(), false),
+            ("table_id", Ty::uint32_datatype(), false),
+            ("region_number", Ty::uint32_datatype(), false),
+            ("region_group", Ty::uint8_datatype(), false),
+            ("region_sequence", Ty::uint32_datatype(), false),
+            ("state", Ty::string_datatype(), false),
+            ("role", Ty::string_datatype(), false),
+            ("writable", Ty::boolean_datatype(), false),
+            ("committed_sequence", Ty::uint64_datatype(), false),
+            ("flushed_sequence", Ty::uint64_datatype(), true),
+            ("manifest_version", Ty::uint64_datatype(), false),
+            ("compaction_time_window", Ty::string_datatype(), true),
+            ("region_options", Ty::string_datatype(), false),
+            ("sst_format", Ty::string_datatype(), false),
+            ("node_id", Ty::uint64_datatype(), true),
+        ]
+        .into_iter()
+        .map(|(name, data_type, nullable)| ColumnSchema::new(name, data_type, nullable))
+        .collect::<Vec<_>>();
+
+        Arc::new(Schema::new(columns))
+    }
+
+    /// Converts a list of region info entries to a record batch.
+    ///
+    /// Columns are emitted in the order declared by [`Self::schema`]; `None` values of the
+    /// optional fields become nulls.
+    pub fn to_record_batch(entries: &[Self]) -> Result<DfRecordBatch, ArrowError> {
+        let columns: Vec<ArrayRef> = vec![
+            Arc::new(UInt64Array::from_iter_values(entries.iter().map(|e| e.region_id.as_u64()))),
+            Arc::new(UInt32Array::from_iter_values(entries.iter().map(|e| e.table_id))),
+            Arc::new(UInt32Array::from_iter_values(entries.iter().map(|e| e.region_number))),
+            Arc::new(UInt8Array::from_iter_values(entries.iter().map(|e| e.region_group))),
+            Arc::new(UInt32Array::from_iter_values(entries.iter().map(|e| e.region_sequence))),
+            Arc::new(StringArray::from_iter_values(entries.iter().map(|e| e.state.as_str()))),
+            Arc::new(StringArray::from_iter_values(entries.iter().map(|e| e.role.as_str()))),
+            Arc::new(BooleanArray::from_iter(entries.iter().map(|e| e.writable))),
+            Arc::new(UInt64Array::from_iter_values(entries.iter().map(|e| e.committed_sequence))),
+            Arc::new(UInt64Array::from_iter(entries.iter().map(|e| e.flushed_sequence))),
+            Arc::new(UInt64Array::from_iter_values(entries.iter().map(|e| e.manifest_version))),
+            Arc::new(StringArray::from_iter(
+                entries.iter().map(|e| e.compaction_time_window.as_ref()),
+            )),
+            Arc::new(StringArray::from_iter_values(
+                entries.iter().map(|e| e.region_options.as_str()),
+            )),
+            Arc::new(StringArray::from_iter_values(entries.iter().map(|e| e.sst_format.as_str()))),
+            Arc::new(UInt64Array::from_iter(entries.iter().map(|e| e.node_id))),
+        ];
+
+        DfRecordBatch::try_new(Self::schema().arrow_schema().clone(), columns)
+    }
+
+    /// Reserved internal inspect table name for region info.
+    pub fn reserved_table_name_for_inspection() -> &'static str {
+        "__inspect/__mito/__region_info"
+    }
+
+    /// Builds a logical plan for scanning region info entries.
+    ///
+    /// The plan scans the reserved inspection table with the requested projection, then applies
+    /// each filter of the request in order, and finally the limit when one is set.
+    pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
+        let ScanRequest { projection_input, filters, limit, .. } = scan_request;
+
+        let source = Arc::new(LogicalTableSource::new(Self::schema().arrow_schema().clone()));
+        let table_name = Self::reserved_table_name_for_inspection();
+        let projection = projection_input.map(|input| input.projection);
+        let scan = LogicalPlanBuilder::scan(table_name, source, projection)?;
+
+        let filtered = filters
+            .into_iter()
+            .try_fold(scan, |plan, filter| plan.filter(filter))?;
+        let limited = match limit {
+            Some(fetch) => filtered.limit(0, Some(fetch))?,
+            None => filtered,
+        };
+
+        limited.build()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use datafusion_common::TableReference;
+    use datafusion_expr::{LogicalPlan, Operator, binary_expr, col, lit};
+    use datatypes::arrow::array::{
+        Array, BooleanArray, StringArray, UInt8Array, UInt32Array, UInt64Array,
+    };
+
+    use super::*;
+    use crate::storage::{RegionId, ScanRequest};
+
+    /// Checks the column order of the schema and the nullability of selected columns.
+    #[test]
+    fn test_region_info_schema() {
+        let expected_names = [
+            "region_id",
+            "table_id",
+            "region_number",
+            "region_group",
+            "region_sequence",
+            "state",
+            "role",
+            "writable",
+            "committed_sequence",
+            "flushed_sequence",
+            "manifest_version",
+            "compaction_time_window",
+            "region_options",
+            "sst_format",
+            "node_id",
+        ];
+        let schema = RegionInfoEntry::schema();
+        let columns = schema.column_schemas();
+
+        let names: Vec<_> = columns.iter().map(|c| c.name.as_str()).collect();
+        assert_eq!(names, expected_names);
+
+        assert!(!columns[0].is_nullable());
+        assert!(!columns[8].is_nullable());
+        assert!(columns[9].is_nullable());
+        assert!(columns[11].is_nullable());
+        assert!(columns[14].is_nullable());
+    }
+
+    /// Converts two entries with `to_record_batch` and checks selected columns, including nulls.
+    #[test]
+    fn test_region_info_to_record_batch() {
+        let region_id1 = RegionId::with_group_and_seq(10, 1, 20);
+        let region_id2 = RegionId::with_group_and_seq(11, 0, 21);
+        let entries = vec![
+            RegionInfoEntry {
+                region_id: region_id1,
+                table_id: region_id1.table_id(),
+                region_number: region_id1.region_number(),
+                region_group: region_id1.region_group(),
+                region_sequence: region_id1.region_sequence(),
+                state: "Leader(Writable)".to_string(),
+                role: "Leader".to_string(),
+                writable: true,
+                committed_sequence: 42,
+                flushed_sequence: Some(41),
+                manifest_version: 7,
+                compaction_time_window: Some("1h".to_string()),
+                region_options: "{\"sst_format\":\"flat\"}".to_string(),
+                sst_format: "flat".to_string(),
+                node_id: Some(3),
+            },
+            RegionInfoEntry {
+                region_id: region_id2,
+                table_id: region_id2.table_id(),
+                region_number: region_id2.region_number(),
+                region_group: region_id2.region_group(),
+                region_sequence: region_id2.region_sequence(),
+                state: "Follower".to_string(),
+                role: "Follower".to_string(),
+                writable: false,
+                committed_sequence: 9,
+                flushed_sequence: None,
+                manifest_version: 2,
+                compaction_time_window: None,
+                region_options: "{}".to_string(),
+                sst_format: "primary_key".to_string(),
+                node_id: None,
+            },
+        ];
+        let batch = RegionInfoEntry::to_record_batch(&entries).unwrap();
+        assert_eq!(batch.num_rows(), 2);
+        // Columns are looked up by position, following the order of `RegionInfoEntry::schema()`.
+        let region_ids = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap();
+        assert_eq!(region_id1.as_u64(), region_ids.value(0));
+        assert_eq!(region_id2.as_u64(), region_ids.value(1));
+
+        let table_ids = batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<UInt32Array>()
+            .unwrap();
+        assert_eq!(10, table_ids.value(0));
+        assert_eq!(11, table_ids.value(1));
+
+        let region_groups = batch
+            .column(3)
+            .as_any()
+            .downcast_ref::<UInt8Array>()
+            .unwrap();
+        assert_eq!(1, region_groups.value(0));
+        assert_eq!(0, region_groups.value(1));
+
+        let states = batch
+            .column(5)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!("Leader(Writable)", states.value(0));
+        assert_eq!("Follower", states.value(1));
+
+        let writable = batch
+            .column(7)
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap();
+        assert!(writable.value(0));
+        assert!(!writable.value(1));
+
+        let committed_sequences = batch
+            .column(8)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap();
+        assert_eq!(42, committed_sequences.value(0));
+        assert_eq!(9, committed_sequences.value(1));
+        // Optional fields: `Some` values are kept, `None` values become nulls.
+        let flushed_sequences = batch
+            .column(9)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap();
+        assert_eq!(41, flushed_sequences.value(0));
+        assert!(flushed_sequences.is_null(1));
+
+        let compaction_time_windows = batch
+            .column(11)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        assert_eq!("1h", compaction_time_windows.value(0));
+        assert!(compaction_time_windows.is_null(1));
+
+        let node_ids = batch
+            .column(14)
+            .as_any()
+            .downcast_ref::<UInt64Array>()
+            .unwrap();
+        assert_eq!(3, node_ids.value(0));
+        assert!(node_ids.is_null(1));
+    }
+
+    /// Verifies that the projection, filter and limit of a request all end up in the built plan.
+    #[test]
+    fn test_region_info_build_plan() {
+        let projected_columns: Vec<usize> = vec![0, 5, 7, 11];
+        let writable_only = binary_expr(col("writable"), Operator::Eq, lit(true));
+        let request = ScanRequest {
+            projection_input: Some(projected_columns.clone().into()),
+            filters: vec![writable_only],
+            limit: Some(10),
+            ..Default::default()
+        };
+
+        let plan = RegionInfoEntry::build_plan(request).unwrap();
+        let (scan, has_filter, has_limit) = extract_scan(&plan);
+        assert!(has_filter);
+        assert!(has_limit);
+        let expected_table = RegionInfoEntry::reserved_table_name_for_inspection();
+        assert_eq!(scan.table_name, TableReference::bare(expected_table));
+        assert_eq!(scan.projection, Some(projected_columns));
+
+        let fields = scan.projected_schema.fields();
+        assert_eq!(fields.len(), 4);
+        assert_eq!(fields[0].name(), "region_id");
+        assert_eq!(fields[1].name(), "state");
+        assert_eq!(fields[2].name(), "writable");
+        assert_eq!(fields[3].name(), "compaction_time_window");
+    }
+
+    /// Descends through `Filter` and `Limit` nodes until the `TableScan` is reached.
+    ///
+    /// Returns the scan together with two flags telling whether a filter node and a limit
+    /// node were passed on the way down; panics on any other kind of plan node.
+    fn extract_scan(plan: &LogicalPlan) -> (&datafusion_expr::logical_plan::TableScan, bool, bool) {
+        let (mut has_filter, mut has_limit) = (false, false);
+        let mut node = plan;
+        loop {
+            match node {
+                LogicalPlan::Filter(filter) => (has_filter, node) = (true, &*filter.input),
+                LogicalPlan::Limit(limit) => (has_limit, node) = (true, &*limit.input),
+                LogicalPlan::TableScan(scan) => return (scan, has_filter, has_limit),
+                other => panic!("unexpected plan: {other:?}"),
+            }
+        }
+    }
+}
diff --git a/src/store-api/src/storage/requests.rs b/src/store-api/src/storage/requests.rs
index 57b2ca8e88..b27119d881 100644
--- a/src/store-api/src/storage/requests.rs
+++ b/src/store-api/src/storage/requests.rs
@@ -124,6 +124,9 @@ pub struct ScanRequest {
     /// Optional constraint on the minimal sequence number in the SST files.
     /// If set, only the SST files that contain sequences greater than this value will be scanned.
     pub sst_min_sequence: Option<SequenceNumber>,
+    /// Whether to skip all SST files.
+    /// This is stronger than `sst_min_sequence` and also skips SST files without sequence metadata.
+    pub skip_sst_files: bool,
     /// Whether to bind the effective snapshot upper bound when opening the scan.
     pub snapshot_on_scan: bool,
     /// Optional hint for the distribution of time-series data.
@@ -211,6 +214,14 @@ impl Display for ScanRequest {
                 sst_min_sequence
             )?;
         }
+        if self.skip_sst_files {
+            write!(
+                f,
+                "{}skip_sst_files: {}",
+                delimiter.as_str(),
+                self.skip_sst_files
+            )?;
+        }
         if self.snapshot_on_scan {
             write!(
                 f,
@@ -321,5 +332,11 @@ mod tests {
             request.to_string(),
             "ScanRequest { snapshot_on_scan: true }"
         );
+
+        let request = ScanRequest {
+            skip_sst_files: true,
+            ..Default::default()
+        };
+        assert_eq!(request.to_string(), "ScanRequest { skip_sst_files: true }");
     }
 }
diff --git a/src/table/src/predicate.rs b/src/table/src/predicate.rs
index e559e2c296..29b1043049 100644
--- a/src/table/src/predicate.rs
+++ b/src/table/src/predicate.rs
@@ -41,7 +41,7 @@ mod stats;
 /// In theory, it should be converted to a timestamp scalar value by `TypeConversionRule`.
 macro_rules! return_none_if_utf8 {
     ($lit: ident) => {
-        if matches!($lit, ScalarValue::Utf8(_)) {
+        if is_string_timestamp_literal($lit) {
             warn!(
                 "Unexpected ScalarValue::Utf8 in time range predicate: {:?}. Maybe it's an implicit bug, please report it to https://github.com/GreptimeTeam/greptimedb/issues",
                 $lit
@@ -53,6 +53,13 @@ macro_rules! return_none_if_utf8 {
     };
 }
 
+/// Returns `true` when `scalar` is one of the string variants (`Utf8`, `LargeUtf8`, `Utf8View`),
+/// whatever the wrapped value is.
+pub fn is_string_timestamp_literal(scalar: &ScalarValue) -> bool {
+    use ScalarValue::{LargeUtf8, Utf8, Utf8View};
+    matches!(scalar, Utf8(_) | LargeUtf8(_) | Utf8View(_))
+}
+
 /// Reference-counted pointer to a list of logical exprs and a list of dynamic filter physical exprs.
 #[derive(Debug, Clone, Default)]
 pub struct Predicate {
diff --git a/tests-integration/src/grpc.rs b/tests-integration/src/grpc.rs
index 181e698e87..7c2148e130 100644
--- a/tests-integration/src/grpc.rs
+++ b/tests-integration/src/grpc.rs
@@ -54,13 +54,19 @@ mod test {
     use api::v1::{
         AddColumn, AddColumns, AlterTableExpr, Column, ColumnDataType, ColumnDataTypeExtension,
         ColumnDef, CreateDatabaseExpr, CreateTableExpr, DdlRequest, DeleteRequest, DeleteRequests,
-        DropTableExpr, InsertRequest, InsertRequests, QueryRequest, SemanticType,
+        DropTableExpr, InsertIntoPlan, InsertRequest, InsertRequests, QueryRequest, SemanticType,
         VectorTypeExtension, alter_table_expr,
     };
+    use auth::{
+        DefaultPermissionChecker, Identity, Password, PermissionCheckerRef, UserProvider,
+        static_user_provider_from_option,
+    };
     use client::OutputData;
+    use common_base::Plugins;
     use common_catalog::consts::MITO_ENGINE;
     use common_meta::rpc::router::region_distribution;
     use common_query::Output;
+    use common_query::logical_plan::breakup_insert_plan;
     use common_recordbatch::RecordBatches;
     use frontend::instance::Instance;
     use query::parser::QueryLanguageParser;
@@ -129,6 +135,82 @@ mod test {
             .unwrap()
     }
 
+    /// An `InsertIntoPlan` gRPC query issued by the `readonly` user must fail and write nothing.
+    #[tokio::test(flavor = "multi_thread")]
+    async fn test_grpc_insert_into_plan_rejects_readonly_user() {
+        let plugins = Plugins::new();
+        plugins.insert::<PermissionCheckerRef>(DefaultPermissionChecker::arc());
+        let standalone =
+            GreptimeDbStandaloneBuilder::new("test_grpc_insert_into_plan_rejects_readonly_user")
+                .with_plugin(plugins)
+                .build()
+                .await;
+        let instance = standalone.fe_instance();
+        let table_name = "grpc_insert_into_plan_auth";
+
+        create_table(
+            instance,
+            format!("CREATE TABLE {table_name} (host STRING, val DOUBLE, ts TIMESTAMP TIME INDEX)"),
+        )
+        .await;
+        // Plan the INSERT statement and encode its insert plan for the gRPC request.
+        let stmt = QueryLanguageParser::parse_sql(
+            &format!("INSERT INTO {table_name} VALUES ('readonly-bypass', 42.0, 1000)"),
+            &QueryContext::arc(),
+        )
+        .unwrap();
+        let plan = instance
+            .statement_executor()
+            .plan(&stmt, QueryContext::arc())
+            .await
+            .unwrap();
+        let (table_name, insert_plan) = breakup_insert_plan(&plan, "greptime", "public").unwrap();
+        let logical_plan = DFLogicalSubstraitConvertor
+            .encode(&insert_plan, DefaultSerializer)
+            .unwrap()
+            .to_vec();
+
+        let request = Request::Query(QueryRequest {
+            query: Some(Query::InsertIntoPlan(InsertIntoPlan {
+                table_name: Some(table_name),
+                logical_plan,
+            })),
+        });
+        let ctx = QueryContext::arc();
+        let provider =
+            static_user_provider_from_option("static_user_provider:cmd:readonly:ro=readonly_pwd")
+                .unwrap();
+        let readonly_user = provider
+            .authenticate(
+                Identity::UserId("readonly", None),
+                Password::PlainText("readonly_pwd".to_string().into()),
+            )
+            .await
+            .unwrap();
+        ctx.set_current_user(readonly_user);
+        // The query is expected to fail with an error that mentions "not authorized"...
+        let err = GrpcQueryHandler::do_query(instance.as_ref(), request, ctx)
+            .await
+            .unwrap_err();
+        let err_msg = format!("{err:?}");
+        assert!(
+            err_msg.contains("not authorized"),
+            "unexpected error: {err_msg}"
+        );
+        // ...and the target table must still contain no rows afterwards.
+        query_and_expect(
+            instance,
+            "SELECT count(*) FROM grpc_insert_into_plan_auth",
+            "\
++----------+
+| count(*) |
++----------+
+| 0        |
++----------+",
+        )
+        .await;
+    }
+
     async fn test_handle_multi_ddl_request(instance: &Instance) {
         let request = Request::Ddl(DdlRequest {
             expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
diff --git a/tests-integration/src/instance.rs b/tests-integration/src/instance.rs
index b11d5c32c6..17435a3b7a 100644
--- a/tests-integration/src/instance.rs
+++ b/tests-integration/src/instance.rs
@@ -323,7 +323,7 @@ mod tests {
 
             fn pre_execute(
                 &self,
-                _statement: &Statement,
+                _statement: Option<&Statement>,
                 _plan: Option<&LogicalPlan>,
                 _query_ctx: QueryContextRef,
             ) -> Result<()> {
@@ -396,7 +396,7 @@ mod tests {
 
             fn pre_execute(
                 &self,
-                _statement: &Statement,
+                _statement: Option<&Statement>,
                 _plan: Option<&LogicalPlan>,
                 _query_ctx: QueryContextRef,
             ) -> Result<()> {
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index f17b78a7e5..7f411cdec2 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -1719,10 +1719,11 @@ fn drop_lines_with_inconsistent_results(input: String) -> String {
         "vector_cache_size =",
         "page_cache_size =",
         "selector_result_cache_size =",
+        "range_result_cache_size =",
+        "prefilter_result_cache_size =",
         "metadata_cache_size =",
         "content_cache_size =",
         "result_cache_size =",
-        "range_result_cache_size =",
         "name =",
         "recovery_parallelism =",
         "max_background_index_builds =",
diff --git a/tests-integration/tests/jsonbench.rs b/tests-integration/tests/jsonbench.rs
index 55cfcd53f0..9e8cabf3e0 100644
--- a/tests-integration/tests/jsonbench.rs
+++ b/tests-integration/tests/jsonbench.rs
@@ -25,8 +25,6 @@ use servers::server::ServerHandlers;
 use tests_integration::standalone::GreptimeDbStandaloneBuilder;
 use tests_integration::test_util::execute_sql_and_expect;
 
-// TODO(LFC): Unignore the test when JSON2 is ready.
-#[ignore]
 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
 async fn test_load_jsonbench_data_by_pipeline() -> io::Result<()> {
     common_telemetry::init_default_ut_logging();
@@ -123,8 +121,6 @@ transform:
     assert!(response.starts_with(pattern));
 }
 
-// TODO(LFC): Unignore the test when JSON2 is ready.
-#[ignore]
 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
 async fn test_load_jsonbench_data_by_sql() -> io::Result<()> {
     common_telemetry::init_default_ut_logging();
@@ -153,16 +149,10 @@ async fn query_data(frontend: &Arc<Instance>) -> io::Result<()> {
 +----------+"#;
     execute_sql_and_expect(frontend, sql, expected).await;
 
-    let sql = "SELECT * FROM bluesky ORDER BY time_us";
-    let expected = fs::read_to_string(find_workspace_path(
-        "tests-integration/resources/jsonbench-select-all.txt",
-    ))?;
-    execute_sql_and_expect(frontend, sql, &expected).await;
-
     // query 1:
     let sql = "
 SELECT
-    json_get_string(data, '$.commit.collection') AS event, count() AS count
+    data.commit.collection AS event, count() AS count
 FROM bluesky
 GROUP BY event
 ORDER BY count DESC, event ASC";
@@ -180,13 +170,12 @@ ORDER BY count DESC, event ASC";
     // query 2:
     let sql = "
 SELECT
-    json_get_string(data, '$.commit.collection') AS event,
+    data.commit.collection AS event,
     count() AS count,
-    count(DISTINCT json_get_string(data, '$.did')) AS users
+    count(DISTINCT data.did) AS users
 FROM bluesky
 WHERE
-    (json_get_string(data, '$.kind') = 'commit') AND
-    (json_get_string(data, '$.commit.operation') = 'create')
+    data.kind = 'commit' AND data.commit.operation = 'create'
 GROUP BY event
 ORDER BY count DESC, event ASC";
     let expected = r#"
@@ -203,15 +192,14 @@ ORDER BY count DESC, event ASC";
     // query 3:
     let sql = "
 SELECT
-    json_get_string(data, '$.commit.collection') AS event,
-    date_part('hour', to_timestamp_micros(json_get_int(data, '$.time_us'))) as hour_of_day,
+    data.commit.collection AS event,
+    date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day,
     count() AS count
 FROM bluesky
 WHERE
-    (json_get_string(data, '$.kind') = 'commit') AND
-    (json_get_string(data, '$.commit.operation') = 'create') AND
-    json_get_string(data, '$.commit.collection') IN
-        ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
+    data.kind = 'commit' AND
+    data.commit.operation = 'create' AND
+    data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
 GROUP BY event, hour_of_day
 ORDER BY hour_of_day, event";
     let expected = r#"
@@ -227,13 +215,13 @@ ORDER BY hour_of_day, event";
     // query 4:
     let sql = "
 SELECT
-    json_get_string(data, '$.did') as user_id,
-    min(to_timestamp_micros(json_get_int(data, '$.time_us'))) AS first_post_ts
+    data.did::String as user_id,
+    min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
 FROM bluesky
 WHERE
-    (json_get_string(data, '$.kind') = 'commit') AND
-    (json_get_string(data, '$.commit.operation') = 'create') AND
-    (json_get_string(data, '$.commit.collection') = 'app.bsky.feed.post')
+    data.kind = 'commit' AND
+    data.commit.operation = 'create' AND
+    data.commit.collection = 'app.bsky.feed.post'
 GROUP BY user_id
 ORDER BY first_post_ts ASC, user_id DESC
 LIMIT 3";
@@ -250,17 +238,17 @@ LIMIT 3";
     // query 5:
     let sql = "
 SELECT
-    json_get_string(data, '$.did') as user_id,
+    data.did::String as user_id,
     date_part(
         'epoch',
-        max(to_timestamp_micros(json_get_int(data, '$.time_us'))) -
-        min(to_timestamp_micros(json_get_int(data, '$.time_us')))
+        max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
+          min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
     ) AS activity_span
 FROM bluesky
 WHERE
-    (json_get_string(data, '$.kind') = 'commit') AND
-    (json_get_string(data, '$.commit.operation') = 'create') AND
-    (json_get_string(data, '$.commit.collection') = 'app.bsky.feed.post')
+    data.kind = 'commit' AND
+    data.commit.operation = 'create' AND
+    data.commit.collection = 'app.bsky.feed.post'
 GROUP BY user_id
 ORDER BY activity_span DESC, user_id DESC
 LIMIT 3";
@@ -304,30 +292,21 @@ async fn insert_data_by_sql(frontend: &Arc<Instance>) -> io::Result<()> {
 async fn desc_table(frontend: &Arc<Instance>) {
     let sql = "DESC TABLE bluesky";
     let expected = r#"
-+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
-| Column  | Type                                                                                                                                           | Key | Null | Default | Semantic Type |
-+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
-| data    | Json<{"_raw":"<String>","commit.collection":"<String>","commit.operation":"<String>","did":"<String>","kind":"<String>","time_us":"<Number>"}> |     | YES  |         | FIELD         |
-| time_us | TimestampMicrosecond                                                                                                                           | PRI | NO   |         | TIMESTAMP     |
-+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+"#;
++---------+----------------------+-----+------+---------+---------------+
+| Column  | Type                 | Key | Null | Default | Semantic Type |
++---------+----------------------+-----+------+---------+---------------+
+| data    | Json2{}              |     | YES  |         | FIELD         |
+| time_us | TimestampMicrosecond | PRI | NO   |         | TIMESTAMP     |
++---------+----------------------+-----+------+---------+---------------+"#;
     execute_sql_and_expect(frontend, sql, expected).await;
 }
 
 async fn create_table(frontend: &Arc<Instance>) {
     let sql = r#"
 CREATE TABLE bluesky (
-  "data" JSON (
-    format = "partial",
-    fields = Struct<
-      kind String,
-      "commit.operation" String,
-      "commit.collection" String,
-      did String,
-      time_us Bigint
-    >,
-  ),
+  "data" JSON2,
   time_us TimestampMicrosecond TIME INDEX,
-)
+) WITH ('append_mode' = 'true', 'sst_format' = 'flat')
 "#;
     execute_sql_and_expect(frontend, sql, "Affected Rows: 0").await;
 }
diff --git a/tests-integration/tests/repartition.rs b/tests-integration/tests/repartition.rs
index 50893cc7a6..ef59d1b910 100644
--- a/tests-integration/tests/repartition.rs
+++ b/tests-integration/tests/repartition.rs
@@ -55,6 +55,24 @@ macro_rules! repartition_tests {
                         }
                     }
 
+                    #[tokio::test(flavor = "multi_thread")]
+                    async fn [< test_partition_unpartitioned_mito >]() {
+                        let store_type = tests_integration::test_util::StorageType::$service;
+                        if store_type.test_on() {
+                            common_telemetry::init_default_ut_logging();
+                            $crate::repartition::test_partition_unpartitioned_mito(store_type).await;
+                        }
+                    }
+
+                    #[tokio::test(flavor = "multi_thread")]
+                    async fn [< test_partition_unpartitioned_metric >]() {
+                        let store_type = tests_integration::test_util::StorageType::$service;
+                        if store_type.test_on() {
+                            common_telemetry::init_default_ut_logging();
+                            $crate::repartition::test_partition_unpartitioned_metric(store_type).await;
+                        }
+                    }
+
                     #[tokio::test(flavor = "multi_thread")]
                     async fn [< test_repartition_metric >]() {
                         let store_type = tests_integration::test_util::StorageType::$service;
@@ -78,6 +96,274 @@ macro_rules! repartition_tests {
     };
 }
 
+/// Partitions an initially unpartitioned mito table via `ALTER TABLE ... PARTITION ON
+/// COLUMNS`, then checks that rows written before and after the change stay readable and
+/// that `information_schema.partitions` lists the new partition rules.
+pub async fn test_partition_unpartitioned_mito(store_type: StorageType) {
+    info!("test_partition_unpartitioned_mito: store_type: {store_type:?}");
+    let cluster_name = "test_partition_unpartitioned_mito";
+    let (store_config, _guard) = get_test_store_config(&store_type);
+    let datanodes = 3u32;
+    let mut builder = GreptimeDbClusterBuilder::new(cluster_name).await;
+    if matches!(store_type, StorageType::File) {
+        let home_dir = create_temp_dir("test_partition_unpartitioned_mito_data_home");
+        builder = builder.with_shared_home_dir(Arc::new(home_dir));
+    }
+
+    let cluster = builder
+        .with_datanodes(datanodes)
+        .with_store_config(store_config)
+        .with_datanode_wal_config(DatanodeWalConfig::Noop)
+        .build(true)
+        .await;
+
+    let query_ctx = QueryContext::arc();
+    let instance = cluster.fe_instance();
+
+    let sql = r#"
+        CREATE TABLE `partition_unpartitioned_mito_table`(
+          `id` INT,
+          `city` STRING,
+          `ts` TIMESTAMP TIME INDEX,
+          PRIMARY KEY(`id`, `city`)
+        ) ENGINE = mito;
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let sql = r#"
+        INSERT INTO `partition_unpartitioned_mito_table` VALUES
+          (1, 'New York', '2022-01-01 00:00:00'),
+          (10, 'Paris', '2022-01-01 00:00:00'),
+          (20, 'Beijing', '2022-01-01 00:00:00');
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let sql = r#"
+        ALTER TABLE `partition_unpartitioned_mito_table` PARTITION ON COLUMNS (`id`) (
+          `id` < 10,
+          `id` >= 10 AND `id` < 20,
+          `id` >= 20
+        );
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+    // Wait for cache invalidation.
+    tokio::time::sleep(Duration::from_millis(500)).await;
+
+    let result = run_sql(
+        instance,
+        "SELECT * FROM `partition_unpartitioned_mito_table` ORDER BY `id`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    let expected = "\
++----+----------+---------------------+
+| id | city     | ts                  |
++----+----------+---------------------+
+| 1  | New York | 2022-01-01T00:00:00 |
+| 10 | Paris    | 2022-01-01T00:00:00 |
+| 20 | Beijing  | 2022-01-01T00:00:00 |
++----+----------+---------------------+";
+    check_output_stream(result.data, expected).await;
+
+    let result = run_sql(
+        instance,
+        "\
+SELECT partition_expression, partition_description \
+FROM information_schema.partitions \
+WHERE table_name = 'partition_unpartitioned_mito_table' \
+ORDER BY partition_ordinal_position;",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    let expected_partitions = r#"+----------------------+-----------------------+
+| partition_expression | partition_description |
++----------------------+-----------------------+
+| id                   | id < 10               |
+| id                   | id >= 10 AND id < 20  |
+| id                   | id >= 20              |
++----------------------+-----------------------+"#;
+    check_output_stream(result.data, expected_partitions).await;
+
+    let sql = r#"
+        INSERT INTO `partition_unpartitioned_mito_table` VALUES
+          (5, 'London', '2022-01-02 00:00:00'),
+          (15, 'Tokyo', '2022-01-02 00:00:00'),
+          (25, 'Shanghai', '2022-01-02 00:00:00');
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let result = run_sql(
+        instance,
+        "SELECT * FROM `partition_unpartitioned_mito_table` ORDER BY `id`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    let expected = "\
++----+----------+---------------------+
+| id | city     | ts                  |
++----+----------+---------------------+
+| 1  | New York | 2022-01-01T00:00:00 |
+| 5  | London   | 2022-01-02T00:00:00 |
+| 10 | Paris    | 2022-01-01T00:00:00 |
+| 15 | Tokyo    | 2022-01-02T00:00:00 |
+| 20 | Beijing  | 2022-01-01T00:00:00 |
+| 25 | Shanghai | 2022-01-02T00:00:00 |
++----+----------+---------------------+";
+    check_output_stream(result.data, expected).await;
+
+    run_sql(
+        instance,
+        "DROP TABLE `partition_unpartitioned_mito_table`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+}
+
+/// Partitions an unpartitioned metric-engine physical table via `ALTER TABLE ... PARTITION
+/// ON COLUMNS`, then checks that rows of its logical table written before and after the
+/// change stay readable and that `information_schema.partitions` lists the new rules.
+pub async fn test_partition_unpartitioned_metric(store_type: StorageType) {
+    info!("test_partition_unpartitioned_metric: store_type: {store_type:?}");
+    let cluster_name = "test_partition_unpartitioned_metric";
+    let (store_config, _guard) = get_test_store_config(&store_type);
+    let datanodes = 3u32;
+    let mut builder = GreptimeDbClusterBuilder::new(cluster_name).await;
+    if matches!(store_type, StorageType::File) {
+        let home_dir = create_temp_dir("test_partition_unpartitioned_metric_data_home");
+        builder = builder.with_shared_home_dir(Arc::new(home_dir));
+    }
+
+    let cluster = builder
+        .with_datanodes(datanodes)
+        .with_store_config(store_config)
+        .with_datanode_wal_config(DatanodeWalConfig::Noop)
+        .build(true)
+        .await;
+
+    let query_ctx = QueryContext::arc();
+    let instance = cluster.fe_instance();
+
+    let sql = r#"
+        CREATE TABLE `partition_unpartitioned_metric_phy`(
+          `ts` TIMESTAMP TIME INDEX,
+          `val` DOUBLE,
+          `host` STRING PRIMARY KEY
+        ) ENGINE = metric
+        WITH (
+          "physical_metric_table" = "true"
+        );
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let sql = r#"
+        CREATE TABLE `partition_unpartitioned_metric_log`(
+          `ts` TIMESTAMP TIME INDEX,
+          `val` DOUBLE,
+          `host` STRING PRIMARY KEY
+        ) ENGINE = metric
+        WITH (
+          "on_physical_table" = "partition_unpartitioned_metric_phy"
+        );
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let sql = r#"
+        INSERT INTO `partition_unpartitioned_metric_log` (`host`, `ts`, `val`) VALUES
+          ('a_host', '2022-01-01 00:00:00', 1),
+          ('z_host', '2022-01-01 00:00:00', 2);
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let sql = r#"
+        ALTER TABLE `partition_unpartitioned_metric_phy` PARTITION ON COLUMNS (`host`) (
+          `host` < 'm',
+          `host` >= 'm'
+        );
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+    // Wait for cache invalidation.
+    tokio::time::sleep(Duration::from_millis(500)).await;
+
+    let result = run_sql(
+        instance,
+        "SELECT * FROM `partition_unpartitioned_metric_log` ORDER BY `host`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    let expected = "\
++--------+---------------------+-----+
+| host   | ts                  | val |
++--------+---------------------+-----+
+| a_host | 2022-01-01T00:00:00 | 1.0 |
+| z_host | 2022-01-01T00:00:00 | 2.0 |
++--------+---------------------+-----+";
+    check_output_stream(result.data, expected).await;
+
+    let result = run_sql(
+        instance,
+        "\
+SELECT partition_expression, partition_description \
+FROM information_schema.partitions \
+WHERE table_name = 'partition_unpartitioned_metric_phy' \
+ORDER BY partition_ordinal_position;",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    let expected_partitions = r#"+----------------------+-----------------------+
+| partition_expression | partition_description |
++----------------------+-----------------------+
+| host                 | host < m              |
+| host                 | host >= m             |
++----------------------+-----------------------+"#;
+    check_output_stream(result.data, expected_partitions).await;
+
+    let sql = r#"
+        INSERT INTO `partition_unpartitioned_metric_log` (`host`, `ts`, `val`) VALUES
+          ('b_host', '2022-01-02 00:00:00', 3),
+          ('x_host', '2022-01-02 00:00:00', 4);
+    "#;
+    run_sql(instance, sql, query_ctx.clone()).await.unwrap();
+
+    let result = run_sql(
+        instance,
+        "SELECT * FROM `partition_unpartitioned_metric_log` ORDER BY `host`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    let expected = "\
++--------+---------------------+-----+
+| host   | ts                  | val |
++--------+---------------------+-----+
+| a_host | 2022-01-01T00:00:00 | 1.0 |
+| b_host | 2022-01-02T00:00:00 | 3.0 |
+| x_host | 2022-01-02T00:00:00 | 4.0 |
+| z_host | 2022-01-01T00:00:00 | 2.0 |
++--------+---------------------+-----+";
+    check_output_stream(result.data, expected).await;
+
+    run_sql(
+        instance,
+        "DROP TABLE `partition_unpartitioned_metric_log`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+    run_sql(
+        instance,
+        "DROP TABLE `partition_unpartitioned_metric_phy`",
+        query_ctx.clone(),
+    )
+    .await
+    .unwrap();
+}
+
 async fn trigger_table_gc(metasrv: &Arc<Metasrv>, table_name: &str) {
     info!("triggering table gc for table: {}", table_name);
     let table_metadata_manager = metasrv.table_metadata_manager();
diff --git a/tests/cases/standalone/common/alter/change_col_type_skipping_index.result b/tests/cases/standalone/common/alter/change_col_type_skipping_index.result
new file mode 100644
index 0000000000..e19cb455a5
--- /dev/null
+++ b/tests/cases/standalone/common/alter/change_col_type_skipping_index.result
@@ -0,0 +1,47 @@
+-- Regression test for skip index with column type change.
+CREATE TABLE monitoring_data_skip (
+    host STRING SKIPPING INDEX,
+    `region` STRING,
+    cpu_usage DOUBLE SKIPPING INDEX,
+    `timestamp` TIMESTAMP TIME INDEX
+) WITH ('append_mode'='true');
+
+Affected Rows: 0
+
+INSERT INTO monitoring_data_skip (host, region, cpu_usage, `timestamp`) VALUES
+('web-01', 'us-east', 12.5, '2026-05-06 10:00:00'),
+('web-01', 'us-east', 15.2, '2026-05-06 10:01:00'),
+('web-02', 'us-east', 23.7, '2026-05-06 10:01:00'),
+('db-01', 'us-east', 45.0, '2026-05-06 10:02:00'),
+('db-02', 'us-west', 82.2, '2026-05-06 10:02:00'),
+('cache-01', 'eu-central', 55.4, '2026-05-06 10:02:00'),
+('queue-01', 'ap-south', 99.1, '2026-05-06 10:02:00');
+
+Affected Rows: 7
+
+ADMIN FLUSH_TABLE('monitoring_data_skip');
+
++-------------------------------------------+
+| ADMIN FLUSH_TABLE('monitoring_data_skip') |
++-------------------------------------------+
+| 0                                         |
++-------------------------------------------+
+
+ALTER TABLE monitoring_data_skip
+MODIFY COLUMN cpu_usage STRING;
+
+Affected Rows: 0
+
+SELECT host, region, cpu_usage, `timestamp` FROM monitoring_data_skip
+WHERE cpu_usage = '23.7';
+
++--------+---------+-----------+---------------------+
+| host   | region  | cpu_usage | timestamp           |
++--------+---------+-----------+---------------------+
+| web-02 | us-east | 23.7      | 2026-05-06T10:01:00 |
++--------+---------+-----------+---------------------+
+
+DROP TABLE monitoring_data_skip;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/alter/change_col_type_skipping_index.sql b/tests/cases/standalone/common/alter/change_col_type_skipping_index.sql
new file mode 100644
index 0000000000..8a7e3b431d
--- /dev/null
+++ b/tests/cases/standalone/common/alter/change_col_type_skipping_index.sql
@@ -0,0 +1,26 @@
+-- Regression test for skip index with column type change.
+CREATE TABLE monitoring_data_skip (
+    host STRING SKIPPING INDEX,
+    `region` STRING,
+    cpu_usage DOUBLE SKIPPING INDEX,
+    `timestamp` TIMESTAMP TIME INDEX
+) WITH ('append_mode'='true');
+
+INSERT INTO monitoring_data_skip (host, region, cpu_usage, `timestamp`) VALUES
+('web-01', 'us-east', 12.5, '2026-05-06 10:00:00'),
+('web-01', 'us-east', 15.2, '2026-05-06 10:01:00'),
+('web-02', 'us-east', 23.7, '2026-05-06 10:01:00'),
+('db-01', 'us-east', 45.0, '2026-05-06 10:02:00'),
+('db-02', 'us-west', 82.2, '2026-05-06 10:02:00'),
+('cache-01', 'eu-central', 55.4, '2026-05-06 10:02:00'),
+('queue-01', 'ap-south', 99.1, '2026-05-06 10:02:00');
+
+ADMIN FLUSH_TABLE('monitoring_data_skip');
+
+ALTER TABLE monitoring_data_skip
+MODIFY COLUMN cpu_usage STRING;
+
+SELECT host, region, cpu_usage, `timestamp` FROM monitoring_data_skip
+WHERE cpu_usage = '23.7';
+
+DROP TABLE monitoring_data_skip;
diff --git a/tests/cases/standalone/common/flow/flow_incremental_aggr.result b/tests/cases/standalone/common/flow/flow_incremental_aggr.result
new file mode 100644
index 0000000000..bb66d5362c
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_incremental_aggr.result
@@ -0,0 +1,119 @@
+CREATE TABLE incremental_aggr_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+) WITH (
+    append_mode = 'true'
+);
+
+Affected Rows: 0
+
+CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
+SELECT
+    sum(n) AS total,
+    min(n) AS min_n,
+    max(n) AS max_n,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    incremental_aggr_input
+GROUP BY
+    time_window;
+
+Affected Rows: 0
+
+INSERT INTO incremental_aggr_input VALUES
+    (1, 10, '2024-01-01 00:00:00'),
+    (2, 20, '2024-01-01 00:00:30');
+
+Affected Rows: 2
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('incremental_aggr_flow');
+
++-------------------------------------------+
+| ADMIN FLUSH_FLOW('incremental_aggr_flow') |
++-------------------------------------------+
+|  FLOW_FLUSHED  |
++-------------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM incremental_aggr_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 30    | 10    | 20    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- Move already checkpointed source rows into SST. The next incremental run
+-- must still read only the memtable delta and must not merge these old SST
+-- rows again.
+ADMIN FLUSH_TABLE('incremental_aggr_input');
+
++---------------------------------------------+
+| ADMIN FLUSH_TABLE('incremental_aggr_input') |
++---------------------------------------------+
+| 0                                           |
++---------------------------------------------+
+
+-- Insert more rows into the same time window. An incremental-safe flow should
+-- merge the delta aggregate with the existing sink aggregate state.
+INSERT INTO incremental_aggr_input VALUES
+    (3, 30, '2024-01-01 00:00:15'),
+    (4, 40, '2024-01-01 00:00:45');
+
+Affected Rows: 2
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('incremental_aggr_flow');
+
++-------------------------------------------+
+| ADMIN FLUSH_FLOW('incremental_aggr_flow') |
++-------------------------------------------+
+|  FLOW_FLUSHED  |
++-------------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM incremental_aggr_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 100   | 10    | 40    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- Insert a row into a new time window to cover append of a new aggregate key.
+INSERT INTO incremental_aggr_input VALUES
+    (5, 50, '2024-01-01 00:01:00');
+
+Affected Rows: 1
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('incremental_aggr_flow');
+
++-------------------------------------------+
+| ADMIN FLUSH_FLOW('incremental_aggr_flow') |
++-------------------------------------------+
+|  FLOW_FLUSHED  |
++-------------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM incremental_aggr_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 100   | 10    | 40    | 2024-01-01T00:00:00 |
+| 50    | 50    | 50    | 2024-01-01T00:01:00 |
++-------+-------+-------+---------------------+
+
+DROP FLOW incremental_aggr_flow;
+
+Affected Rows: 0
+
+DROP TABLE incremental_aggr_input;
+
+Affected Rows: 0
+
+DROP TABLE incremental_aggr_sink;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/flow/flow_incremental_aggr.sql b/tests/cases/standalone/common/flow/flow_incremental_aggr.sql
new file mode 100644
index 0000000000..51dd431fef
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_incremental_aggr.sql
@@ -0,0 +1,57 @@
+CREATE TABLE incremental_aggr_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+) WITH (
+    append_mode = 'true'
+);
+
+CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
+SELECT
+    sum(n) AS total,
+    min(n) AS min_n,
+    max(n) AS max_n,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    incremental_aggr_input
+GROUP BY
+    time_window;
+
+INSERT INTO incremental_aggr_input VALUES
+    (1, 10, '2024-01-01 00:00:00'),
+    (2, 20, '2024-01-01 00:00:30');
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('incremental_aggr_flow');
+
+SELECT total, min_n, max_n, time_window FROM incremental_aggr_sink ORDER BY time_window;
+
+-- Move already checkpointed source rows into SST. The next incremental run
+-- must still read only the memtable delta and must not merge these old SST
+-- rows again.
+ADMIN FLUSH_TABLE('incremental_aggr_input');
+
+-- Insert more rows into the same time window. An incremental-safe flow should
+-- merge the delta aggregate with the existing sink aggregate state.
+INSERT INTO incremental_aggr_input VALUES
+    (3, 30, '2024-01-01 00:00:15'),
+    (4, 40, '2024-01-01 00:00:45');
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('incremental_aggr_flow');
+
+SELECT total, min_n, max_n, time_window FROM incremental_aggr_sink ORDER BY time_window;
+
+-- Insert a row into a new time window to cover append of a new aggregate key.
+INSERT INTO incremental_aggr_input VALUES
+    (5, 50, '2024-01-01 00:01:00');
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('incremental_aggr_flow');
+
+SELECT total, min_n, max_n, time_window FROM incremental_aggr_sink ORDER BY time_window;
+
+DROP FLOW incremental_aggr_flow;
+DROP TABLE incremental_aggr_input;
+DROP TABLE incremental_aggr_sink;
diff --git a/tests/cases/standalone/common/flow/flow_incremental_memtable.result b/tests/cases/standalone/common/flow/flow_incremental_memtable.result
new file mode 100644
index 0000000000..1e452b21ad
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_incremental_memtable.result
@@ -0,0 +1,132 @@
+-- Validate that a flow performing an incremental aggregate read only reads memtable
+-- data and does NOT re-read source rows that have already been flushed to SST after
+-- a previous checkpoint.
+CREATE TABLE flow_incr_memtable_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+) WITH (
+    append_mode = 'true'
+);
+
+Affected Rows: 0
+
+CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
+SELECT
+    sum(n) AS total,
+    min(n) AS min_n,
+    max(n) AS max_n,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    flow_incr_memtable_input
+GROUP BY
+    time_window;
+
+Affected Rows: 0
+
+-- ==== Phase 1: initial insert + checkpoint ====
+INSERT INTO flow_incr_memtable_input VALUES
+    (1, 10, '2024-01-01 00:00:00'),
+    (2, 20, '2024-01-01 00:00:30');
+
+Affected Rows: 2
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_memtable');
+
++----------------------------------------+
+| ADMIN FLUSH_FLOW('flow_incr_memtable') |
++----------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_memtable_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 30    | 10    | 20    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- ==== Phase 2: flush sink and source tables to SST ====
+-- The next incremental run must still read the flushed sink aggregate state,
+-- while skipping already-checkpointed source SST files.
+ADMIN FLUSH_TABLE('flow_incr_memtable_sink');
+
++----------------------------------------------+
+| ADMIN FLUSH_TABLE('flow_incr_memtable_sink') |
++----------------------------------------------+
+| 0                                            |
++----------------------------------------------+
+
+ADMIN FLUSH_TABLE('flow_incr_memtable_input');
+
++-----------------------------------------------+
+| ADMIN FLUSH_TABLE('flow_incr_memtable_input') |
++-----------------------------------------------+
+| 0                                             |
++-----------------------------------------------+
+
+-- ==== Phase 3: empty incremental window ====
+-- Flush the flow without inserting any new source rows to verify that
+-- the incremental read correctly handles the case where no new memtable
+-- data exists.
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_memtable');
+
++----------------------------------------+
+| ADMIN FLUSH_FLOW('flow_incr_memtable') |
++----------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_memtable_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 30    | 10    | 20    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- ==== Phase 4: insert new delta within the same time window ====
+INSERT INTO flow_incr_memtable_input VALUES
+    (3, 30, '2024-01-01 00:00:15'),
+    (4, 40, '2024-01-01 00:00:45');
+
+Affected Rows: 2
+
+-- ==== Phase 5: flush flow again (incremental read) ====
+-- The flow must only read the new memtable delta and merge with the existing
+-- sink aggregate. If it mistakenly re-reads the SST, the result will be
+-- inflated (initial data counted twice).
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_memtable');
+
++----------------------------------------+
+| ADMIN FLUSH_FLOW('flow_incr_memtable') |
++----------------------------------------+
+|  FLOW_FLUSHED  |
++----------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_memtable_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 100   | 10    | 40    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- Clean up
+DROP FLOW flow_incr_memtable;
+
+Affected Rows: 0
+
+DROP TABLE flow_incr_memtable_input;
+
+Affected Rows: 0
+
+DROP TABLE flow_incr_memtable_sink;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/flow/flow_incremental_memtable.sql b/tests/cases/standalone/common/flow/flow_incremental_memtable.sql
new file mode 100644
index 0000000000..66dccbb8b3
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_incremental_memtable.sql
@@ -0,0 +1,66 @@
+-- Validate that a flow performing an incremental aggregate read only reads memtable
+-- data and does NOT re-read source rows that have already been flushed to SST after
+-- a previous checkpoint.
+CREATE TABLE flow_incr_memtable_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+) WITH (
+    append_mode = 'true'
+);
+
+CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
+SELECT
+    sum(n) AS total,
+    min(n) AS min_n,
+    max(n) AS max_n,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    flow_incr_memtable_input
+GROUP BY
+    time_window;
+
+-- ==== Phase 1: initial insert + checkpoint ====
+INSERT INTO flow_incr_memtable_input VALUES
+    (1, 10, '2024-01-01 00:00:00'),
+    (2, 20, '2024-01-01 00:00:30');
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_memtable');
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_memtable_sink ORDER BY time_window;
+
+-- ==== Phase 2: flush sink and source tables to SST ====
+-- The next incremental run must still read the flushed sink aggregate state,
+-- while skipping already-checkpointed source SST files.
+ADMIN FLUSH_TABLE('flow_incr_memtable_sink');
+ADMIN FLUSH_TABLE('flow_incr_memtable_input');
+
+-- ==== Phase 3: empty incremental window ====
+-- Flush the flow without inserting any new source rows to verify that
+-- the incremental read correctly handles the case where no new memtable
+-- data exists.
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_memtable');
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_memtable_sink ORDER BY time_window;
+
+-- ==== Phase 4: insert new delta within the same time window ====
+INSERT INTO flow_incr_memtable_input VALUES
+    (3, 30, '2024-01-01 00:00:15'),
+    (4, 40, '2024-01-01 00:00:45');
+
+-- ==== Phase 5: flush flow again (incremental read) ====
+-- The flow must only read the new memtable delta and merge with the existing
+-- sink aggregate. If it mistakenly re-reads the SST, the result will be
+-- inflated (initial data counted twice).
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_memtable');
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_memtable_sink ORDER BY time_window;
+
+-- Clean up
+DROP FLOW flow_incr_memtable;
+DROP TABLE flow_incr_memtable_input;
+DROP TABLE flow_incr_memtable_sink;
diff --git a/tests/cases/standalone/common/flow/flow_incremental_partitioned.result b/tests/cases/standalone/common/flow/flow_incremental_partitioned.result
new file mode 100644
index 0000000000..b56b390abd
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_incremental_partitioned.result
@@ -0,0 +1,108 @@
+-- Validate that a flow performing an incremental aggregate read on a
+-- partitioned source table (multiple regions) only reads memtable data
+-- and does NOT re-read source rows that have already been flushed to SST.
+CREATE TABLE flow_incr_part_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+)
+PARTITION ON COLUMNS (host_id) (
+    host_id < 3,
+    host_id >= 3
+)
+WITH (
+    append_mode = 'true'
+);
+
+Affected Rows: 0
+
+CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
+SELECT
+    sum(n) AS total,
+    min(n) AS min_n,
+    max(n) AS max_n,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    flow_incr_part_input
+GROUP BY
+    time_window;
+
+Affected Rows: 0
+
+-- ==== Phase 1: initial insert across both partitions ====
+INSERT INTO flow_incr_part_input VALUES
+    (1, 10, '2024-01-01 00:00:00'),
+    (4, 20, '2024-01-01 00:00:30');
+
+Affected Rows: 2
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_part');
+
++------------------------------------+
+| ADMIN FLUSH_FLOW('flow_incr_part') |
++------------------------------------+
+|  FLOW_FLUSHED  |
++------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_part_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 30    | 10    | 20    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- ==== Phase 2: flush source table to SST ====
+-- Move already checkpointed source rows into SST so the next incremental run
+-- must skip them.
+ADMIN FLUSH_TABLE('flow_incr_part_input');
+
++-------------------------------------------+
+| ADMIN FLUSH_TABLE('flow_incr_part_input') |
++-------------------------------------------+
+| 0                                         |
++-------------------------------------------+
+
+-- ==== Phase 3: insert new delta across both partitions, same time window ====
+INSERT INTO flow_incr_part_input VALUES
+    (2, 30, '2024-01-01 00:00:15'),
+    (3, 40, '2024-01-01 00:00:45');
+
+Affected Rows: 2
+
+-- ==== Phase 4: flush flow again (incremental read) ====
+-- The flow must only read the new memtable delta from both regions and merge
+-- with the existing sink aggregate. If it mistakenly re-reads the SST, the
+-- result will be inflated (initial data counted twice).
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_part');
+
++------------------------------------+
+| ADMIN FLUSH_FLOW('flow_incr_part') |
++------------------------------------+
+|  FLOW_FLUSHED  |
++------------------------------------+
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_part_sink ORDER BY time_window;
+
++-------+-------+-------+---------------------+
+| total | min_n | max_n | time_window         |
++-------+-------+-------+---------------------+
+| 100   | 10    | 40    | 2024-01-01T00:00:00 |
++-------+-------+-------+---------------------+
+
+-- Clean up
+DROP FLOW flow_incr_part;
+
+Affected Rows: 0
+
+DROP TABLE flow_incr_part_input;
+
+Affected Rows: 0
+
+DROP TABLE flow_incr_part_sink;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql b/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql
new file mode 100644
index 0000000000..234c9b9085
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql
@@ -0,0 +1,61 @@
+-- Validate that a flow performing an incremental aggregate read on a
+-- partitioned source table (multiple regions) only reads memtable data
+-- and does NOT re-read source rows that have already been flushed to SST.
+CREATE TABLE flow_incr_part_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+)
+PARTITION ON COLUMNS (host_id) (
+    host_id < 3,
+    host_id >= 3
+)
+WITH (
+    append_mode = 'true'
+);
+
+CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
+SELECT
+    sum(n) AS total,
+    min(n) AS min_n,
+    max(n) AS max_n,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    flow_incr_part_input
+GROUP BY
+    time_window;
+
+-- ==== Phase 1: initial insert across both partitions ====
+INSERT INTO flow_incr_part_input VALUES
+    (1, 10, '2024-01-01 00:00:00'),
+    (4, 20, '2024-01-01 00:00:30');
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_part');
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_part_sink ORDER BY time_window;
+
+-- ==== Phase 2: flush source table to SST ====
+-- Move already checkpointed source rows into SST so the next incremental run
+-- must skip them.
+ADMIN FLUSH_TABLE('flow_incr_part_input');
+
+-- ==== Phase 3: insert new delta across both partitions, same time window ====
+INSERT INTO flow_incr_part_input VALUES
+    (2, 30, '2024-01-01 00:00:15'),
+    (3, 40, '2024-01-01 00:00:45');
+
+-- ==== Phase 4: flush flow again (incremental read) ====
+-- The flow must only read the new memtable delta from both regions and merge
+-- with the existing sink aggregate. If it mistakenly re-reads the SST, the
+-- result will be inflated (initial data counted twice).
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_incr_part');
+
+SELECT total, min_n, max_n, time_window FROM flow_incr_part_sink ORDER BY time_window;
+
+-- Clean up
+DROP FLOW flow_incr_part;
+DROP TABLE flow_incr_part_input;
+DROP TABLE flow_incr_part_sink;
diff --git a/tests/cases/standalone/common/flow/flow_pending.result b/tests/cases/standalone/common/flow/flow_pending.result
new file mode 100644
index 0000000000..d6fe01b38a
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_pending.result
@@ -0,0 +1,52 @@
+CREATE FLOW pending_without_defer
+SINK TO pending_sink
+AS SELECT val FROM pending_source;
+
+Error: 1004(InvalidArguments), Invalid SQL, error: missing source tables for flow 'pending_without_defer'; use WITH (defer_on_missing_source = true) to create a pending flow
+
+CREATE FLOW pending_with_defer
+SINK TO pending_sink
+WITH (defer_on_missing_source = true)
+AS SELECT val FROM pending_source WHERE val > 10;
+
+Affected Rows: 0
+
+SHOW CREATE FLOW pending_with_defer;
+
++--------------------+--------------------------------------------------+
+| Flow               | Create Flow                                      |
++--------------------+--------------------------------------------------+
+| pending_with_defer | CREATE FLOW IF NOT EXISTS pending_with_defer     |
+|                    | SINK TO public.pending_sink                      |
+|                    | WITH (defer_on_missing_source = 'true')          |
+|                    | AS SELECT val FROM pending_source WHERE val > 10 |
++--------------------+--------------------------------------------------+
+
+SELECT
+    flow_definition,
+    source_table_ids,
+    source_table_names,
+    flownode_ids,
+    options LIKE '%"defer_on_missing_source":"true"%' AS has_defer_option,
+    options LIKE '%"flow_type":"batching"%' AS has_flow_type_option
+FROM INFORMATION_SCHEMA.FLOWS
+WHERE flow_name = 'pending_with_defer';
+
++--------------------------------------------------+------------------+--------------------+--------------+------------------+----------------------+
+| flow_definition                                  | source_table_ids | source_table_names | flownode_ids | has_defer_option | has_flow_type_option |
++--------------------------------------------------+------------------+--------------------+--------------+------------------+----------------------+
+| CREATE FLOW IF NOT EXISTS pending_with_defer     | []               |                    | {}           | true             | true                 |
+| SINK TO public.pending_sink                      |                  |                    |              |                  |                      |
+| WITH (defer_on_missing_source = 'true')          |                  |                    |              |                  |                      |
+| AS SELECT val FROM pending_source WHERE val > 10 |                  |                    |              |                  |                      |
++--------------------------------------------------+------------------+--------------------+--------------+------------------+----------------------+
+
+DROP FLOW pending_with_defer;
+
+Affected Rows: 0
+
+SELECT flow_name FROM INFORMATION_SCHEMA.FLOWS WHERE flow_name = 'pending_with_defer';
+
+++
+++
+
diff --git a/tests/cases/standalone/common/flow/flow_pending.sql b/tests/cases/standalone/common/flow/flow_pending.sql
new file mode 100644
index 0000000000..498f5b2782
--- /dev/null
+++ b/tests/cases/standalone/common/flow/flow_pending.sql
@@ -0,0 +1,24 @@
+CREATE FLOW pending_without_defer
+SINK TO pending_sink
+AS SELECT val FROM pending_source;
+
+CREATE FLOW pending_with_defer
+SINK TO pending_sink
+WITH (defer_on_missing_source = true)
+AS SELECT val FROM pending_source WHERE val > 10;
+
+SHOW CREATE FLOW pending_with_defer;
+
+SELECT
+    flow_definition,
+    source_table_ids,
+    source_table_names,
+    flownode_ids,
+    options LIKE '%"defer_on_missing_source":"true"%' AS has_defer_option,
+    options LIKE '%"flow_type":"batching"%' AS has_flow_type_option
+FROM INFORMATION_SCHEMA.FLOWS
+WHERE flow_name = 'pending_with_defer';
+
+DROP FLOW pending_with_defer;
+
+SELECT flow_name FROM INFORMATION_SCHEMA.FLOWS WHERE flow_name = 'pending_with_defer';
diff --git a/tests/cases/standalone/common/flow/flow_rebuild.result b/tests/cases/standalone/common/flow/flow_rebuild.result
index db2f314b32..bd2bf9c892 100644
--- a/tests/cases/standalone/common/flow/flow_rebuild.result
+++ b/tests/cases/standalone/common/flow/flow_rebuild.result
@@ -273,6 +273,7 @@ ADMIN FLUSH_FLOW('test_wildcard_basic');
 |  FLOW_FLUSHED  |
 +-----------------------------------------+
 
+-- SQLNESS SLEEP 3s
 SELECT wildcard FROM out_basic;
 
 +----------+
diff --git a/tests/cases/standalone/common/flow/flow_rebuild.sql b/tests/cases/standalone/common/flow/flow_rebuild.sql
index c86c781d5d..4f30c80ea2 100644
--- a/tests/cases/standalone/common/flow/flow_rebuild.sql
+++ b/tests/cases/standalone/common/flow/flow_rebuild.sql
@@ -151,6 +151,7 @@ VALUES
 -- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
 ADMIN FLUSH_FLOW('test_wildcard_basic');
 
+-- SQLNESS SLEEP 3s
 SELECT wildcard FROM out_basic;
 
 -- test again, this time with db restart
diff --git a/tests/cases/standalone/common/information_schema/region_info.result b/tests/cases/standalone/common/information_schema/region_info.result
new file mode 100644
index 0000000000..ed9a07add6
--- /dev/null
+++ b/tests/cases/standalone/common/information_schema/region_info.result
@@ -0,0 +1,63 @@
+DESC TABLE information_schema.region_info;
+
++------------------------+---------+-----+------+---------+---------------+
+| Column                 | Type    | Key | Null | Default | Semantic Type |
++------------------------+---------+-----+------+---------+---------------+
+| region_id              | UInt64  |     | NO   |         | FIELD         |
+| table_id               | UInt32  |     | NO   |         | FIELD         |
+| region_number          | UInt32  |     | NO   |         | FIELD         |
+| region_group           | UInt8   |     | NO   |         | FIELD         |
+| region_sequence        | UInt32  |     | NO   |         | FIELD         |
+| state                  | String  |     | NO   |         | FIELD         |
+| role                   | String  |     | NO   |         | FIELD         |
+| writable               | Boolean |     | NO   |         | FIELD         |
+| committed_sequence     | UInt64  |     | NO   |         | FIELD         |
+| flushed_sequence       | UInt64  |     | YES  |         | FIELD         |
+| manifest_version       | UInt64  |     | NO   |         | FIELD         |
+| compaction_time_window | String  |     | YES  |         | FIELD         |
+| region_options         | String  |     | NO   |         | FIELD         |
+| sst_format             | String  |     | NO   |         | FIELD         |
+| node_id                | UInt64  |     | YES  |         | FIELD         |
++------------------------+---------+-----+------+---------+---------------+
+
+CREATE TABLE region_info_case (
+  a INT PRIMARY KEY,
+  ts TIMESTAMP TIME INDEX,
+)
+WITH ("sst_format" = "flat");
+
+Affected Rows: 0
+
+INSERT INTO region_info_case VALUES (1, 1), (2, 2);
+
+Affected Rows: 2
+
+ADMIN FLUSH_TABLE('region_info_case');
+
++---------------------------------------+
+| ADMIN FLUSH_TABLE('region_info_case') |
++---------------------------------------+
+| 0                                     |
++---------------------------------------+
+
+-- SQLNESS REPLACE (\s+\d+\s+) <NUM>
+-- SQLNESS REPLACE (\{".*"\}) <JSON>
+-- SQLNESS REPLACE (-{40,}) ----------------
+-- SQLNESS REPLACE (region_options\s+\|) region_options |
+SELECT region_id, state, role, writable, committed_sequence, flushed_sequence, manifest_version, compaction_time_window, region_options, sst_format
+FROM information_schema.region_info
+WHERE region_id IN (
+  SELECT region_id FROM information_schema.region_peers WHERE table_name = 'region_info_case'
+)
+ORDER BY region_id;
+
++---------------+------------------+--------+----------+--------------------+------------------+------------------+------------------------+----------------+------------+
+| region_id     | state            | role   | writable | committed_sequence | flushed_sequence | manifest_version | compaction_time_window | region_options | sst_format |
++---------------+------------------+--------+----------+--------------------+------------------+------------------+------------------------+----------------+------------+
+|<NUM>| Leader(Writable) | Leader | true     |<NUM>|<NUM>|<NUM>|                        | <JSON> | flat       |
++---------------+------------------+--------+----------+--------------------+------------------+------------------+------------------------+----------------+------------+
+
+DROP TABLE region_info_case;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/information_schema/region_info.sql b/tests/cases/standalone/common/information_schema/region_info.sql
new file mode 100644
index 0000000000..1aa682393f
--- /dev/null
+++ b/tests/cases/standalone/common/information_schema/region_info.sql
@@ -0,0 +1,24 @@
+DESC TABLE information_schema.region_info;
+
+CREATE TABLE region_info_case (
+  a INT PRIMARY KEY,
+  ts TIMESTAMP TIME INDEX,
+)
+WITH ("sst_format" = "flat");
+
+INSERT INTO region_info_case VALUES (1, 1), (2, 2);
+
+ADMIN FLUSH_TABLE('region_info_case');
+
+-- SQLNESS REPLACE (\s+\d+\s+) <NUM>
+-- SQLNESS REPLACE (\{".*"\}) <JSON>
+-- SQLNESS REPLACE (-{40,}) ----------------
+-- SQLNESS REPLACE (region_options\s+\|) region_options |
+SELECT region_id, state, role, writable, committed_sequence, flushed_sequence, manifest_version, compaction_time_window, region_options, sst_format
+FROM information_schema.region_info
+WHERE region_id IN (
+  SELECT region_id FROM information_schema.region_peers WHERE table_name = 'region_info_case'
+)
+ORDER BY region_id;
+
+DROP TABLE region_info_case;
diff --git a/tests/cases/standalone/common/insert/physical_metric_table_insert.result b/tests/cases/standalone/common/insert/physical_metric_table_insert.result
new file mode 100644
index 0000000000..63b2997209
--- /dev/null
+++ b/tests/cases/standalone/common/insert/physical_metric_table_insert.result
@@ -0,0 +1,28 @@
+CREATE TABLE IF NOT EXISTS demo_metric_table (
+  label STRING NULL,
+  ts TIMESTAMP(3) NOT NULL,
+  val DOUBLE NULL,
+  TIME INDEX (ts),
+  PRIMARY KEY (label)
+)
+PARTITION ON COLUMNS (label) (
+  label < 'M',
+  label >= 'M'
+)
+ENGINE=metric
+WITH(
+  physical_metric_table = 'true',
+  skip_wal = 'true'
+);
+
+Affected Rows: 0
+
+INSERT INTO demo_metric_table (label, ts, val)
+VALUES ('A', '2026-05-19 00:00:00', 1.0);
+
+Error: 1001(Unsupported), Write request to physical region is forbidden
+
+DROP TABLE demo_metric_table;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/insert/physical_metric_table_insert.sql b/tests/cases/standalone/common/insert/physical_metric_table_insert.sql
new file mode 100644
index 0000000000..0be352986f
--- /dev/null
+++ b/tests/cases/standalone/common/insert/physical_metric_table_insert.sql
@@ -0,0 +1,21 @@
+CREATE TABLE IF NOT EXISTS demo_metric_table (
+  label STRING NULL,
+  ts TIMESTAMP(3) NOT NULL,
+  val DOUBLE NULL,
+  TIME INDEX (ts),
+  PRIMARY KEY (label)
+)
+PARTITION ON COLUMNS (label) (
+  label < 'M',
+  label >= 'M'
+)
+ENGINE=metric
+WITH(
+  physical_metric_table = 'true',
+  skip_wal = 'true'
+);
+
+INSERT INTO demo_metric_table (label, ts, val)
+VALUES ('A', '2026-05-19 00:00:00', 1.0);
+
+DROP TABLE demo_metric_table;
diff --git a/tests/cases/standalone/common/promql/encode_substrait.result b/tests/cases/standalone/common/promql/encode_substrait.result
index a154d9e5a2..deb72317e9 100644
--- a/tests/cases/standalone/common/promql/encode_substrait.result
+++ b/tests/cases/standalone/common/promql/encode_substrait.result
@@ -16,24 +16,26 @@ tql explain (0, 100, '1s')
       tag_a="ffa",
     }[1h])[12h:1h];
 
-+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| plan_type     | plan                                                                                                                                                                                |
-+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| logical_plan  | MergeScan [is_placeholder=false, remote_input=[                                                                                                                                     |
-|               | PromRangeManipulate: req range=[0..100000], interval=[1000], eval range=[43200000], time index=[ts], values=["prom_increase(ts_range,val,ts,Int64(3600000))"]                       |
-|               |   Filter: prom_increase(ts_range,val,ts,Int64(3600000)) IS NOT NULL                                                                                                                 |
-|               |     Projection: count_total.ts, prom_increase(ts_range, val, count_total.ts, Int64(3600000)) AS prom_increase(ts_range,val,ts,Int64(3600000)), count_total.tag_a, count_total.tag_b |
-|               |       PromRangeManipulate: req range=[-39600000..100000], interval=[3600000], eval range=[3600000], time index=[ts], values=["val"]                                                 |
-|               |         PromSeriesNormalize: offset=[0], time index=[ts], filter NaN: [true]                                                                                                        |
-|               |           PromSeriesDivide: tags=["tag_a", "tag_b"]                                                                                                                                 |
-|               |             Sort: count_total.tag_a ASC NULLS FIRST, count_total.tag_b ASC NULLS FIRST, count_total.ts ASC NULLS FIRST                                                              |
-|               |               Filter: count_total.tag_a = Utf8("ffa") AND count_total.ts >= TimestampMillisecond(-43199999, None) AND count_total.ts <= TimestampMillisecond(100000, None)          |
-|               |                 TableScan: count_total                                                                                                                                              |
-|               | ]]                                                                                                                                                                                  |
-| physical_plan | CooperativeExec                                                                                                                                                                     |
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| plan_type     | plan                                                                                                                                                                                    |
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan  | MergeScan [is_placeholder=false, remote_input=[                                                                                                                                         |
+|               | PromRangeManipulate: req range=[0..100000], interval=[1000], eval range=[43200000], time index=[ts], values=["prom_increase(ts_range,val,ts,Int64(3600000))"]                           |
+|               |   PromSeriesDivide: tags=["tag_a", "tag_b"]                                                                                                                                             |
+|               |     Sort: count_total.tag_a ASC NULLS FIRST, count_total.tag_b ASC NULLS FIRST, count_total.ts ASC NULLS FIRST                                                                          |
+|               |       Filter: prom_increase(ts_range,val,ts,Int64(3600000)) IS NOT NULL                                                                                                                 |
+|               |         Projection: count_total.ts, prom_increase(ts_range, val, count_total.ts, Int64(3600000)) AS prom_increase(ts_range,val,ts,Int64(3600000)), count_total.tag_a, count_total.tag_b |
+|               |           PromRangeManipulate: req range=[-39600000..100000], interval=[3600000], eval range=[3600000], time index=[ts], values=["val"]                                                 |
+|               |             PromSeriesNormalize: offset=[0], time index=[ts], filter NaN: [true]                                                                                                        |
+|               |               PromSeriesDivide: tags=["tag_a", "tag_b"]                                                                                                                                 |
+|               |                 Sort: count_total.tag_a ASC NULLS FIRST, count_total.tag_b ASC NULLS FIRST, count_total.ts ASC NULLS FIRST                                                              |
+|               |                   Filter: count_total.tag_a = Utf8("ffa") AND count_total.ts >= TimestampMillisecond(-43199999, None) AND count_total.ts <= TimestampMillisecond(100000, None)          |
+|               |                     TableScan: count_total                                                                                                                                              |
+|               | ]]                                                                                                                                                                                      |
+| physical_plan | CooperativeExec                                                                                                                                                                         |
 |               |   MergeScanExec: REDACTED
-|               |                                                                                                                                                                                     |
-+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+|               |                                                                                                                                                                                         |
++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
 
 tql eval (0, 100, '1s') 
     increase(count_total{
diff --git a/tests/cases/standalone/common/promql/histogram_quantile_binary_op.result b/tests/cases/standalone/common/promql/histogram_quantile_binary_op.result
new file mode 100644
index 0000000000..df21957356
--- /dev/null
+++ b/tests/cases/standalone/common/promql/histogram_quantile_binary_op.result
@@ -0,0 +1,91 @@
+-- Reproduce https://github.com/GreptimeTeam/greptimedb/issues/8144
+-- Binary comparison/arithmetic applied to a histogram_quantile() result.
+create table http_request_duration_seconds_bucket (
+    ts timestamp time index,
+    le string,
+    pod string,
+    val double,
+    primary key (pod, le),
+);
+
+Affected Rows: 0
+
+insert into http_request_duration_seconds_bucket values
+    (2900000, "0.01", "pod-a", 10),
+    (2900000, "0.05", "pod-a", 20),
+    (2900000, "0.1", "pod-a", 30),
+    (2900000, "+Inf", "pod-a", 40),
+    (3000000, "0.01", "pod-a", 20),
+    (3000000, "0.05", "pod-a", 50),
+    (3000000, "0.1", "pod-a", 80),
+    (3000000, "+Inf", "pod-a", 100),
+    (2900000, "0.01", "pod-b", 5),
+    (2900000, "0.05", "pod-b", 8),
+    (2900000, "0.1", "pod-b", 12),
+    (2900000, "+Inf", "pod-b", 15),
+    (3000000, "0.01", "pod-b", 10),
+    (3000000, "0.05", "pod-b", 25),
+    (3000000, "0.1", "pod-b", 45),
+    (3000000, "+Inf", "pod-b", 60);
+
+Affected Rows: 16
+
+-- histogram_quantile alone
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m])));
+
++-------+---------------------+-----------------------------------------------+
+| pod   | ts                  | sum(prom_rate(ts_range,val,ts,Int64(300000))) |
++-------+---------------------+-----------------------------------------------+
+| pod-a | 1970-01-01T00:50:00 | 0.05                                          |
+| pod-b | 1970-01-01T00:50:00 | 0.062499999999999986                          |
++-------+---------------------+-----------------------------------------------+
+
+-- comparison filter
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) >= 0.02;
+
++-------+---------------------+-----------------------------------------------+
+| pod   | ts                  | sum(prom_rate(ts_range,val,ts,Int64(300000))) |
++-------+---------------------+-----------------------------------------------+
+| pod-a | 1970-01-01T00:50:00 | 0.05                                          |
+| pod-b | 1970-01-01T00:50:00 | 0.062499999999999986                          |
++-------+---------------------+-----------------------------------------------+
+
+-- arithmetic
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) + 0;
+
++-------+---------------------+------------------------------------------------------------+
+| pod   | ts                  | sum(prom_rate(ts_range,val,ts,Int64(300000))) + Float64(0) |
++-------+---------------------+------------------------------------------------------------+
+| pod-a | 1970-01-01T00:50:00 | 0.05                                                       |
+| pod-b | 1970-01-01T00:50:00 | 0.062499999999999986                                       |
++-------+---------------------+------------------------------------------------------------+
+
+-- bool modifier
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) >= bool 0.02;
+
++-------+---------------------+----------------------------------------------------------------+
+| pod   | ts                  | sum(prom_rate(ts_range,val,ts,Int64(300000))) >= Float64(0.02) |
++-------+---------------------+----------------------------------------------------------------+
+| pod-a | 1970-01-01T00:50:00 | 1.0                                                            |
+| pod-b | 1970-01-01T00:50:00 | 1.0                                                            |
++-------+---------------------+----------------------------------------------------------------+
+
+-- subquery
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') count_over_time((histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) >= 0.02)[10m:1m]);
+
++---------------------+------------------------------------------------------------------------------+-------+
+| ts                  | prom_count_over_time(ts_range,sum(prom_rate(ts_range,val,ts,Int64(300000)))) | pod   |
++---------------------+------------------------------------------------------------------------------+-------+
+| 1970-01-01T00:50:00 | 1.0                                                                          | pod-a |
+| 1970-01-01T00:50:00 | 1.0                                                                          | pod-b |
++---------------------+------------------------------------------------------------------------------+-------+
+
+drop table http_request_duration_seconds_bucket;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/promql/histogram_quantile_binary_op.sql b/tests/cases/standalone/common/promql/histogram_quantile_binary_op.sql
new file mode 100644
index 0000000000..d6e936eae5
--- /dev/null
+++ b/tests/cases/standalone/common/promql/histogram_quantile_binary_op.sql
@@ -0,0 +1,50 @@
+-- Reproduce https://github.com/GreptimeTeam/greptimedb/issues/8144
+-- Binary comparison/arithmetic applied to a histogram_quantile() result.
+
+create table http_request_duration_seconds_bucket (
+    ts timestamp time index,
+    le string,
+    pod string,
+    val double,
+    primary key (pod, le),
+);
+
+insert into http_request_duration_seconds_bucket values
+    (2900000, "0.01", "pod-a", 10),
+    (2900000, "0.05", "pod-a", 20),
+    (2900000, "0.1", "pod-a", 30),
+    (2900000, "+Inf", "pod-a", 40),
+    (3000000, "0.01", "pod-a", 20),
+    (3000000, "0.05", "pod-a", 50),
+    (3000000, "0.1", "pod-a", 80),
+    (3000000, "+Inf", "pod-a", 100),
+    (2900000, "0.01", "pod-b", 5),
+    (2900000, "0.05", "pod-b", 8),
+    (2900000, "0.1", "pod-b", 12),
+    (2900000, "+Inf", "pod-b", 15),
+    (3000000, "0.01", "pod-b", 10),
+    (3000000, "0.05", "pod-b", 25),
+    (3000000, "0.1", "pod-b", 45),
+    (3000000, "+Inf", "pod-b", 60);
+
+-- histogram_quantile alone
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m])));
+
+-- comparison filter
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) >= 0.02;
+
+-- arithmetic
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) + 0;
+
+-- bool modifier
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) >= bool 0.02;
+
+-- subquery
+-- SQLNESS SORT_RESULT 3 1
+tql eval (3000, 3000, '1s') count_over_time((histogram_quantile(0.5, sum by (le, pod) (rate(http_request_duration_seconds_bucket[5m]))) >= 0.02)[10m:1m]);
+
+drop table http_request_duration_seconds_bucket;
diff --git a/tests/cases/standalone/common/promql/tsid_binary_join_regression.result b/tests/cases/standalone/common/promql/tsid_binary_join_regression.result
index 3640291dc3..d414eb6bba 100644
--- a/tests/cases/standalone/common/promql/tsid_binary_join_regression.result
+++ b/tests/cases/standalone/common/promql/tsid_binary_join_regression.result
@@ -71,11 +71,11 @@ TQL ANALYZE (0, 5, '5s') tsid_binary_join_left / tsid_binary_join_right;
 | stage | node | plan_|
 +-+-+-+
 | 0_| 0_|_ProjectionExec: expr=[host@2 as host, job@3 as job, ts@5 as ts, __tsid@4 as __tsid, greptime_value@0 / greptime_value@1 as tsid_binary_join_left.greptime_value / tsid_binary_join_right.greptime_value] REDACTED
-|_|_|_HashJoinExec: mode=Partitioned, join_type=Inner, on=[(__tsid@1, __tsid@3), (ts@2, ts@4)], projection=[greptime_value@0, greptime_value@3, host@4, job@5, __tsid@6, ts@7], NullsEqual: true REDACTED
-|_|_|_RepartitionExec: partitioning=Hash([__tsid@1, ts@2],REDACTED
+|_|_|_HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__tsid@1, __tsid@3), (ts@2, ts@4)], projection=[greptime_value@0, greptime_value@3, host@4, job@5, __tsid@6, ts@7], NullsEqual: true REDACTED
+|_|_|_CoalescePartitionsExec REDACTED
 |_|_|_ProjectionExec: expr=[greptime_value@0 as greptime_value, __tsid@3 as __tsid, ts@4 as ts] REDACTED
 |_|_|_MergeScanExec: REDACTED
-|_|_|_RepartitionExec: partitioning=Hash([__tsid@3, ts@4],REDACTED
+|_|_|_CooperativeExec REDACTED
 |_|_|_MergeScanExec: REDACTED
 |_|_|_|
 | 1_| 0_|_PromInstantManipulateExec: range=[0..5000], lookback=[300000], interval=[5000], time index=[ts] REDACTED
@@ -189,11 +189,11 @@ TQL ANALYZE (0, 5, '5s') tsid_binary_join_left > bool tsid_binary_join_right;
 | stage | node | plan_|
 +-+-+-+
 | 0_| 0_|_ProjectionExec: expr=[host@2 as host, job@3 as job, ts@5 as ts, __tsid@4 as __tsid, CAST(greptime_value@1 < greptime_value@0 AS Float64) as tsid_binary_join_left.greptime_value > tsid_binary_join_right.greptime_value] REDACTED
-|_|_|_HashJoinExec: mode=Partitioned, join_type=Inner, on=[(__tsid@1, __tsid@3), (ts@2, ts@4)], projection=[greptime_value@0, greptime_value@3, host@4, job@5, __tsid@6, ts@7], NullsEqual: true REDACTED
-|_|_|_RepartitionExec: partitioning=Hash([__tsid@1, ts@2],REDACTED
+|_|_|_HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__tsid@1, __tsid@3), (ts@2, ts@4)], projection=[greptime_value@0, greptime_value@3, host@4, job@5, __tsid@6, ts@7], NullsEqual: true REDACTED
+|_|_|_CoalescePartitionsExec REDACTED
 |_|_|_ProjectionExec: expr=[greptime_value@0 as greptime_value, __tsid@3 as __tsid, ts@4 as ts] REDACTED
 |_|_|_MergeScanExec: REDACTED
-|_|_|_RepartitionExec: partitioning=Hash([__tsid@3, ts@4],REDACTED
+|_|_|_CooperativeExec REDACTED
 |_|_|_MergeScanExec: REDACTED
 |_|_|_|
 | 1_| 0_|_PromInstantManipulateExec: range=[0..5000], lookback=[300000], interval=[5000], time index=[ts] REDACTED
diff --git a/tests/cases/standalone/common/show/show_databases_tables.result b/tests/cases/standalone/common/show/show_databases_tables.result
index d817227392..e816e989d9 100644
--- a/tests/cases/standalone/common/show/show_databases_tables.result
+++ b/tests/cases/standalone/common/show/show_databases_tables.result
@@ -49,6 +49,7 @@ SHOW TABLES;
 | process_list                          |
 | profiling                             |
 | referential_constraints               |
+| region_info                           |
 | region_peers                          |
 | region_statistics                     |
 | routines                              |
@@ -99,6 +100,7 @@ SHOW FULL TABLES;
 | process_list                          | LOCAL TEMPORARY |
 | profiling                             | LOCAL TEMPORARY |
 | referential_constraints               | LOCAL TEMPORARY |
+| region_info                           | LOCAL TEMPORARY |
 | region_peers                          | LOCAL TEMPORARY |
 | region_statistics                     | LOCAL TEMPORARY |
 | routines                              | LOCAL TEMPORARY |
@@ -143,6 +145,7 @@ SHOW TABLE STATUS;
 |process_list||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
 |profiling||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
 |referential_constraints||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
+|region_info||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
 |region_peers||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
 |region_statistics||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
 |routines||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0|||
diff --git a/tests/cases/standalone/common/system/information_schema.result b/tests/cases/standalone/common/system/information_schema.result
index 38f3ea52a4..6cff6e2ce9 100644
--- a/tests/cases/standalone/common/system/information_schema.result
+++ b/tests/cases/standalone/common/system/information_schema.result
@@ -36,6 +36,7 @@ order by table_schema, table_name;
 |greptime|information_schema|process_list|LOCALTEMPORARY|36|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
 |greptime|information_schema|profiling|LOCALTEMPORARY|19|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
 |greptime|information_schema|referential_constraints|LOCALTEMPORARY|20|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
+|greptime|information_schema|region_info|LOCALTEMPORARY|41|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
 |greptime|information_schema|region_peers|LOCALTEMPORARY|29|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
 |greptime|information_schema|region_statistics|LOCALTEMPORARY|35|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
 |greptime|information_schema|routines|LOCALTEMPORARY|21|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y|
@@ -316,6 +317,21 @@ select * from information_schema.columns order by table_schema, table_name, colu
 | greptime      | information_schema | referential_constraints               | unique_constraint_name            | 6                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
 | greptime      | information_schema | referential_constraints               | unique_constraint_schema          | 5                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
 | greptime      | information_schema | referential_constraints               | update_rule                       | 8                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
+| greptime      | information_schema | region_info                           | committed_sequence                | 9                |                          |                        | 20                | 0             |                    |                    |                |            |       | select,insert |                       | UInt64               | bigint unsigned     | FIELD         |                | No          | bigint unsigned     |                |        |
+| greptime      | information_schema | region_info                           | compaction_time_window            | 12               | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | Yes         | string              |                |        |
+| greptime      | information_schema | region_info                           | flushed_sequence                  | 10               |                          |                        | 20                | 0             |                    |                    |                |            |       | select,insert |                       | UInt64               | bigint unsigned     | FIELD         |                | Yes         | bigint unsigned     |                |        |
+| greptime      | information_schema | region_info                           | manifest_version                  | 11               |                          |                        | 20                | 0             |                    |                    |                |            |       | select,insert |                       | UInt64               | bigint unsigned     | FIELD         |                | No          | bigint unsigned     |                |        |
+| greptime      | information_schema | region_info                           | node_id                           | 15               |                          |                        | 20                | 0             |                    |                    |                |            |       | select,insert |                       | UInt64               | bigint unsigned     | FIELD         |                | Yes         | bigint unsigned     |                |        |
+| greptime      | information_schema | region_info                           | region_group                      | 4                |                          |                        | 3                 | 0             |                    |                    |                |            |       | select,insert |                       | UInt8                | tinyint unsigned    | FIELD         |                | No          | tinyint unsigned    |                |        |
+| greptime      | information_schema | region_info                           | region_id                         | 1                |                          |                        | 20                | 0             |                    |                    |                |            |       | select,insert |                       | UInt64               | bigint unsigned     | FIELD         |                | No          | bigint unsigned     |                |        |
+| greptime      | information_schema | region_info                           | region_number                     | 3                |                          |                        | 10                | 0             |                    |                    |                |            |       | select,insert |                       | UInt32               | int unsigned        | FIELD         |                | No          | int unsigned        |                |        |
+| greptime      | information_schema | region_info                           | region_options                    | 13               | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
+| greptime      | information_schema | region_info                           | region_sequence                   | 5                |                          |                        | 10                | 0             |                    |                    |                |            |       | select,insert |                       | UInt32               | int unsigned        | FIELD         |                | No          | int unsigned        |                |        |
+| greptime      | information_schema | region_info                           | role                              | 7                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
+| greptime      | information_schema | region_info                           | sst_format                        | 14               | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
+| greptime      | information_schema | region_info                           | state                             | 6                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | No          | string              |                |        |
+| greptime      | information_schema | region_info                           | table_id                          | 2                |                          |                        | 10                | 0             |                    |                    |                |            |       | select,insert |                       | UInt32               | int unsigned        | FIELD         |                | No          | int unsigned        |                |        |
+| greptime      | information_schema | region_info                           | writable                          | 8                |                          |                        |                   |               |                    |                    |                |            |       | select,insert |                       | Boolean              | boolean             | FIELD         |                | No          | boolean             |                |        |
 | greptime      | information_schema | region_peers                          | down_seconds                      | 9                |                          |                        | 19                | 0             |                    |                    |                |            |       | select,insert |                       | Int64                | bigint              | FIELD         |                | Yes         | bigint              |                |        |
 | greptime      | information_schema | region_peers                          | is_leader                         | 7                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | Yes         | string              |                |        |
 | greptime      | information_schema | region_peers                          | peer_addr                         | 6                | 2147483647               | 2147483647             |                   |               |                    | utf8               | utf8_bin       |            |       | select,insert |                       | String               | string              | FIELD         |                | Yes         | string              |                |        |
diff --git a/tests/cases/standalone/common/tql-explain-analyze/explain.result b/tests/cases/standalone/common/tql-explain-analyze/explain.result
index 65532d738b..e60a6b74f6 100644
--- a/tests/cases/standalone/common/tql-explain-analyze/explain.result
+++ b/tests/cases/standalone/common/tql-explain-analyze/explain.result
@@ -182,6 +182,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
 | physical_plan after FilterPushdown_| SAME TEXT AS ABOVE_|
 | physical_plan after parallelize_scan_| SAME TEXT AS ABOVE_|
 | physical_plan after PassDistributionRule_| SAME TEXT AS ABOVE_|
+| physical_plan after PromqlTsidNarrowJoin_| SAME TEXT AS ABOVE_|
 | physical_plan after EnforceSorting_| SAME TEXT AS ABOVE_|
 | physical_plan after EnforceDistribution_| SAME TEXT AS ABOVE_|
 | physical_plan after CombinePartialFinalAggregate_| SAME TEXT AS ABOVE_|
@@ -332,6 +333,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test AS series;
 | physical_plan after FilterPushdown_| SAME TEXT AS ABOVE_|
 | physical_plan after parallelize_scan_| SAME TEXT AS ABOVE_|
 | physical_plan after PassDistributionRule_| SAME TEXT AS ABOVE_|
+| physical_plan after PromqlTsidNarrowJoin_| SAME TEXT AS ABOVE_|
 | physical_plan after EnforceSorting_| SAME TEXT AS ABOVE_|
 | physical_plan after EnforceDistribution_| SAME TEXT AS ABOVE_|
 | physical_plan after CombinePartialFinalAggregate_| SAME TEXT AS ABOVE_|
@@ -654,6 +656,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test_nano;
 | physical_plan after FilterPushdown_| SAME TEXT AS ABOVE_|
 | physical_plan after parallelize_scan_| SAME TEXT AS ABOVE_|
 | physical_plan after PassDistributionRule_| SAME TEXT AS ABOVE_|
+| physical_plan after PromqlTsidNarrowJoin_| SAME TEXT AS ABOVE_|
 | physical_plan after EnforceSorting_| OutputRequirementExec: order_by=[], dist_by=Unspecified_|
 |_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[j]_|
 |_|_PromSeriesDivideExec: tags=["k"]_|
diff --git a/tests/cases/standalone/common/types/json/json2.result b/tests/cases/standalone/common/types/json/json2.result
index fdae802f3b..7de73f2a78 100644
--- a/tests/cases/standalone/common/types/json/json2.result
+++ b/tests/cases/standalone/common/types/json/json2.result
@@ -111,12 +111,29 @@ select j.a.b from json2_table order by ts;
 | -4                                  |
 |                                     |
 |                                     |
-| "s7"                                |
+| s7                                  |
 | 8                                   |
 |                                     |
 | 10                                  |
 +-------------------------------------+
 
+select j.a, j.a.x from json2_table order by ts;
+
++-----------------------------------+-------------------------------------+
+| json_get(json2_table.j,Utf8("a")) | json_get(json2_table.j,Utf8("a.x")) |
++-----------------------------------+-------------------------------------+
+| {"b":1}                           |                                     |
+| {"b":-2}                          |                                     |
+| {"b":3}                           |                                     |
+| {"b":-4}                          |                                     |
+|                                   |                                     |
+|                                   |                                     |
+| {"b":"s7"}                        |                                     |
+| {"b":8}                           |                                     |
+| {"b":null,"x":true}               | true                                |
+| {"b":10,"x":null}                 | null                                |
++-----------------------------------+-------------------------------------+
+
 select j.c, j.y from json2_table order by ts;
 
 +-----------------------------------+-----------------------------------+
@@ -129,11 +146,49 @@ select j.c, j.y from json2_table order by ts;
 | s5                                |                                   |
 | s6                                |                                   |
 | [1]                               |                                   |
-| "s8"                              |                                   |
+| s8                                |                                   |
 | s9                                |                                   |
 |                                   | false                             |
 +-----------------------------------+-----------------------------------+
 
+select j from json2_table order by ts;
+
+Error: 3001(EngineExecuteQuery), Failed to align JSON array, reason: Invalid argument error: use StructArray::try_new_with_length or StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly
+
+select * from json2_table order by ts;
+
+Error: 3001(EngineExecuteQuery), Failed to align JSON array, reason: Invalid argument error: use StructArray::try_new_with_length or StructArray::new_empty_fields to create a struct array with no fields so that the length can be set correctly
+
+select j.a.b + 1 from json2_table order by ts;
+
++------------------------------------------------------------+
+| json_get(json2_table.j,Utf8("a.b"),Int64(NULL)) + Int64(1) |
++------------------------------------------------------------+
+| 2                                                          |
+| -1                                                         |
+| 4                                                          |
+| -3                                                         |
+|                                                            |
+|                                                            |
+|                                                            |
+| 9                                                          |
+|                                                            |
+| 11                                                         |
++------------------------------------------------------------+
+
+select abs(j.a.b) from json2_table order by ts;
+
+Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String No function matches the given name and argument types 'abs(Utf8View)'. You might need to add explicit type casts.
+	Candidate functions:
+	abs(Numeric(1))
+
+-- "j.c" is of type "String", "abs" is expected to be all "null"s.
+select abs(j.c) from json2_table order by ts;
+
+Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Function 'abs' expects NativeType::Numeric but received NativeType::String No function matches the given name and argument types 'abs(Utf8View)'. You might need to add explicit type casts.
+	Candidate functions:
+	abs(Numeric(1))
+
 select j.d from json2_table order by ts;
 
 +-----------------------------------+
diff --git a/tests/cases/standalone/common/types/json/json2.sql b/tests/cases/standalone/common/types/json/json2.sql
index 57e113f8be..cb8df2f8b9 100644
--- a/tests/cases/standalone/common/types/json/json2.sql
+++ b/tests/cases/standalone/common/types/json/json2.sql
@@ -42,8 +42,21 @@ explain select j.a.x::bool from json2_table;
 
 select j.a.b from json2_table order by ts;
 
+select j.a, j.a.x from json2_table order by ts;
+
 select j.c, j.y from json2_table order by ts;
 
+select j from json2_table order by ts;
+
+select * from json2_table order by ts;
+
+select j.a.b + 1 from json2_table order by ts;
+
+select abs(j.a.b) from json2_table order by ts;
+
+-- "j.c" is of type "String", "abs" is expected to be all "null"s.
+select abs(j.c) from json2_table order by ts;
+
 select j.d from json2_table order by ts;
 
 drop table json2_table;
diff --git a/tests/cases/standalone/common/types/json/jsonbench.result b/tests/cases/standalone/common/types/json/jsonbench.result
new file mode 100644
index 0000000000..bc039e0b08
--- /dev/null
+++ b/tests/cases/standalone/common/types/json/jsonbench.result
@@ -0,0 +1,180 @@
+CREATE TABLE bluesky (
+    `data`  JSON2,
+    time_us TimestampMicrosecond TIME INDEX
+) WITH ('append_mode' = 'true', 'sst_format' = 'flat');
+
+Affected Rows: 0
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349000167,
+        '{"did":"did:plc:yj3sjq3blzpynh27cumnp5ks","time_us":1732206349000167,"kind":"commit","commit":{"rev":"3lbhtytnn2k2f","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtyteurk2y","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.095Z","langs":["en"],"reply":{"parent":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"},"root":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"}},"text":"aaaaah.  LIght shines in a corner of WTF...."},"cid":"bafyreidblutgvj75o4q4akzyyejedjj6l3it6hgqwee6jpwv2wqph5fsgm"}}');
+
+Affected Rows: 1
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349000644,
+        '{"did":"did:plc:3i4xf2v4wcnyktgv6satke64","time_us":1732206349000644,"kind":"commit","commit":{"rev":"3lbhuvzds6d2a","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhuvzdked2a","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.221Z","subject":{"cid":"bafyreidjvrcmckkm765mct5fph36x7kupkfo35rjklbf2k76xkzwyiauge","uri":"at://did:plc:azrv4rcbws6kmcga4fsbphg2/app.bsky.feed.post/3lbgjdpbiec2l"}},"cid":"bafyreia5l5vrkh5oj4cjyhcqby2dprhyvcyofo2q5562tijlae2pzih23m"}}');
+
+Affected Rows: 1
+
+ADMIN flush_table('bluesky');
+
++------------------------------+
+| ADMIN flush_table('bluesky') |
++------------------------------+
+| 0                            |
++------------------------------+
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349001108,
+        '{"did":"did:plc:gccfnqqizz4urhchsaie6jft","time_us":1732206349001108,"kind":"commit","commit":{"rev":"3lbhuvze3gi2u","operation":"create","collection":"app.bsky.graph.follow","rkey":"3lbhuvzdtmi2u","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:27:40.923Z","subject":"did:plc:r7cdh4sgzqbfdc6wcdxxti7c"},"cid":"bafyreiew2p6cgirfaj45qoenm4fgumib7xoloclrap3jgkz5es7g7kby3i"}}');
+
+Affected Rows: 1
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349001372,
+        '{"did":"did:plc:msxqf3twq7abtdw7dbfskphk","time_us":1732206349001372,"kind":"commit","commit":{"rev":"3lbhueija5p22","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhueiizcx22","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:15:58.232Z","subject":{"cid":"bafyreiavpshyqzrlo5m7fqodjhs6jevweqnif4phasiwimv4a7mnsqi2fe","uri":"at://did:plc:fusulxqc52zbrc75fi6xrcof/app.bsky.feed.post/3lbhskq5zn22f"}},"cid":"bafyreidjix4dauj2afjlbzmhj3a7gwftcevvmmy6edww6vrjdbst26rkby"}}');
+
+Affected Rows: 1
+
+ADMIN flush_table('bluesky');
+
++------------------------------+
+| ADMIN flush_table('bluesky') |
++------------------------------+
+| 0                            |
++------------------------------+
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349001905,
+        '{"did":"did:plc:l5o3qjrmfztir54cpwlv2eme","time_us":1732206349001905,"kind":"commit","commit":{"rev":"3lbhtytohxc2o","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtytjqzk2q","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.254Z","langs":["en"],"reply":{"parent":{"cid":"bafyreih35fe2jj3gchmgk4amold4l6sfxd2sby5wrg3jrws5fkdypxrbg4","uri":"at://did:plc:6wx2gg5yqgvmlu35r6y3bk6d/app.bsky.feed.post/3lbhtj2eb4s2o"},"root":{"cid":"bafyreifipyt3vctd4ptuoicvio7rbr5xvjv4afwuggnd2prnmn55mu6luu","uri":"at://did:plc:474ldquxwzrlcvjhhbbk2wte/app.bsky.feed.post/3lbhdzrynik27"}},"text":"okay i take mine back because I hadn’t heard this one yet^^"},"cid":"bafyreigzdsdne3z2xxcakgisieyj7y47hj6eg7lj6v4q25ah5q2qotu5ku"}}');
+
+Affected Rows: 1
+
+ADMIN compact_table('bluesky', 'swcs', '86400');
+
++-------------------------------------------------+
+| ADMIN compact_table('bluesky', 'swcs', '86400') |
++-------------------------------------------------+
+| 0                                               |
++-------------------------------------------------+
+
+SELECT count(*) FROM bluesky;
+
++----------+
+| count(*) |
++----------+
+| 5        |
++----------+
+
+-- Query 1:
+SELECT data.commit.collection AS event,
+       count() AS count
+FROM bluesky
+GROUP BY event
+ORDER BY count DESC, event ASC;
+
++-----------------------+-------+
+| event                 | count |
++-----------------------+-------+
+| app.bsky.feed.like    | 2     |
+| app.bsky.feed.post    | 2     |
+| app.bsky.graph.follow | 1     |
++-----------------------+-------+
+
+-- Query 2:
+SELECT data.commit.collection AS event,
+       count() AS count,
+       count(DISTINCT data.did) AS users
+FROM bluesky
+WHERE data.kind = 'commit' AND data.commit.operation = 'create'
+GROUP BY event
+ORDER BY count DESC, event ASC;
+
++-----------------------+-------+-------+
+| event                 | count | users |
++-----------------------+-------+-------+
+| app.bsky.feed.like    | 2     | 2     |
+| app.bsky.feed.post    | 2     | 2     |
+| app.bsky.graph.follow | 1     | 1     |
++-----------------------+-------+-------+
+
+-- Query 3:
+SELECT data.commit.collection AS event,
+       date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day,
+       count() AS count
+FROM bluesky
+WHERE data.kind = 'commit'
+  AND data.commit.operation = 'create'
+  AND data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
+GROUP BY event, hour_of_day
+ORDER BY hour_of_day, event;
+
++--------------------+-------------+-------+
+| event              | hour_of_day | count |
++--------------------+-------------+-------+
+| app.bsky.feed.like | 16          | 2     |
+| app.bsky.feed.post | 16          | 2     |
++--------------------+-------------+-------+
+
+-- Query 4:
+SELECT data.did::String as user_id,
+       min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
+FROM bluesky
+WHERE data.kind = 'commit'
+  AND data.commit.operation = 'create'
+  AND data.commit.collection = 'app.bsky.feed.post'
+GROUP BY user_id
+ORDER BY first_post_ts ASC, user_id DESC
+LIMIT 3;
+
++----------------------------------+----------------------------+
+| user_id                          | first_post_ts              |
++----------------------------------+----------------------------+
+| did:plc:yj3sjq3blzpynh27cumnp5ks | 2024-11-21T16:25:49.000167 |
+| did:plc:l5o3qjrmfztir54cpwlv2eme | 2024-11-21T16:25:49.001905 |
++----------------------------------+----------------------------+
+
+-- Query 5:
+SELECT data.did::String as user_id,
+       date_part(
+           'epoch',
+           max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
+             min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
+       ) AS activity_span
+FROM bluesky
+WHERE data.kind = 'commit'
+  AND data.commit.operation = 'create'
+  AND data.commit.collection = 'app.bsky.feed.post'
+GROUP BY user_id
+ORDER BY activity_span DESC, user_id DESC
+LIMIT 3;
+
++----------------------------------+---------------+
+| user_id                          | activity_span |
++----------------------------------+---------------+
+| did:plc:yj3sjq3blzpynh27cumnp5ks | 0.0           |
+| did:plc:l5o3qjrmfztir54cpwlv2eme | 0.0           |
++----------------------------------+---------------+
+
+-- SQLNESS REPLACE (peers.*) REDACTED
+EXPLAIN
+SELECT date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day
+FROM bluesky;
+
++---------------+-------------------------------------------------------------------------------------------------------------------------------+
+| plan_type     | plan                                                                                                                          |
++---------------+-------------------------------------------------------------------------------------------------------------------------------+
+| logical_plan  | MergeScan [is_placeholder=false, remote_input=[                                                                               |
+|               | Projection: date_part(Utf8("hour"), to_timestamp_micros(json_get(bluesky.data, Utf8("time_us"), Int64(NULL)))) AS hour_of_day |
+|               |   TableScan: bluesky                                                                                                          |
+|               | ]]                                                                                                                            |
+| physical_plan | CooperativeExec                                                                                                               |
+|               |   MergeScanExec: REDACTED
+|               |                                                                                                                               |
++---------------+-------------------------------------------------------------------------------------------------------------------------------+
+
+DROP TABLE bluesky;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/types/json/jsonbench.sql b/tests/cases/standalone/common/types/json/jsonbench.sql
new file mode 100644
index 0000000000..8d25605ded
--- /dev/null
+++ b/tests/cases/standalone/common/types/json/jsonbench.sql
@@ -0,0 +1,92 @@
+CREATE TABLE bluesky (
+    `data`  JSON2,
+    time_us TimestampMicrosecond TIME INDEX
+) WITH ('append_mode' = 'true', 'sst_format' = 'flat');
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349000167,
+        '{"did":"did:plc:yj3sjq3blzpynh27cumnp5ks","time_us":1732206349000167,"kind":"commit","commit":{"rev":"3lbhtytnn2k2f","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtyteurk2y","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.095Z","langs":["en"],"reply":{"parent":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"},"root":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"}},"text":"aaaaah.  LIght shines in a corner of WTF...."},"cid":"bafyreidblutgvj75o4q4akzyyejedjj6l3it6hgqwee6jpwv2wqph5fsgm"}}');
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349000644,
+        '{"did":"did:plc:3i4xf2v4wcnyktgv6satke64","time_us":1732206349000644,"kind":"commit","commit":{"rev":"3lbhuvzds6d2a","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhuvzdked2a","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.221Z","subject":{"cid":"bafyreidjvrcmckkm765mct5fph36x7kupkfo35rjklbf2k76xkzwyiauge","uri":"at://did:plc:azrv4rcbws6kmcga4fsbphg2/app.bsky.feed.post/3lbgjdpbiec2l"}},"cid":"bafyreia5l5vrkh5oj4cjyhcqby2dprhyvcyofo2q5562tijlae2pzih23m"}}');
+
+ADMIN flush_table('bluesky');
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349001108,
+        '{"did":"did:plc:gccfnqqizz4urhchsaie6jft","time_us":1732206349001108,"kind":"commit","commit":{"rev":"3lbhuvze3gi2u","operation":"create","collection":"app.bsky.graph.follow","rkey":"3lbhuvzdtmi2u","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:27:40.923Z","subject":"did:plc:r7cdh4sgzqbfdc6wcdxxti7c"},"cid":"bafyreiew2p6cgirfaj45qoenm4fgumib7xoloclrap3jgkz5es7g7kby3i"}}');
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349001372,
+        '{"did":"did:plc:msxqf3twq7abtdw7dbfskphk","time_us":1732206349001372,"kind":"commit","commit":{"rev":"3lbhueija5p22","operation":"create","collection":"app.bsky.feed.like","rkey":"3lbhueiizcx22","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:15:58.232Z","subject":{"cid":"bafyreiavpshyqzrlo5m7fqodjhs6jevweqnif4phasiwimv4a7mnsqi2fe","uri":"at://did:plc:fusulxqc52zbrc75fi6xrcof/app.bsky.feed.post/3lbhskq5zn22f"}},"cid":"bafyreidjix4dauj2afjlbzmhj3a7gwftcevvmmy6edww6vrjdbst26rkby"}}');
+
+ADMIN flush_table('bluesky');
+
+INSERT INTO bluesky (time_us, data)
+VALUES (1732206349001905,
+        '{"did":"did:plc:l5o3qjrmfztir54cpwlv2eme","time_us":1732206349001905,"kind":"commit","commit":{"rev":"3lbhtytohxc2o","operation":"create","collection":"app.bsky.feed.post","rkey":"3lbhtytjqzk2q","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.254Z","langs":["en"],"reply":{"parent":{"cid":"bafyreih35fe2jj3gchmgk4amold4l6sfxd2sby5wrg3jrws5fkdypxrbg4","uri":"at://did:plc:6wx2gg5yqgvmlu35r6y3bk6d/app.bsky.feed.post/3lbhtj2eb4s2o"},"root":{"cid":"bafyreifipyt3vctd4ptuoicvio7rbr5xvjv4afwuggnd2prnmn55mu6luu","uri":"at://did:plc:474ldquxwzrlcvjhhbbk2wte/app.bsky.feed.post/3lbhdzrynik27"}},"text":"okay i take mine back because I hadn’t heard this one yet^^"},"cid":"bafyreigzdsdne3z2xxcakgisieyj7y47hj6eg7lj6v4q25ah5q2qotu5ku"}}');
+
+ADMIN compact_table('bluesky', 'swcs', '86400');
+
+SELECT count(*) FROM bluesky;
+
+-- Query 1:
+SELECT data.commit.collection AS event,
+       count() AS count
+FROM bluesky
+GROUP BY event
+ORDER BY count DESC, event ASC;
+
+-- Query 2:
+SELECT data.commit.collection AS event,
+       count() AS count,
+       count(DISTINCT data.did) AS users
+FROM bluesky
+WHERE data.kind = 'commit' AND data.commit.operation = 'create'
+GROUP BY event
+ORDER BY count DESC, event ASC;
+
+-- Query 3:
+SELECT data.commit.collection AS event,
+       date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day,
+       count() AS count
+FROM bluesky
+WHERE data.kind = 'commit'
+  AND data.commit.operation = 'create'
+  AND data.commit.collection in ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
+GROUP BY event, hour_of_day
+ORDER BY hour_of_day, event;
+
+-- Query 4:
+SELECT data.did::String as user_id,
+       min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) AS first_post_ts
+FROM bluesky
+WHERE data.kind = 'commit'
+  AND data.commit.operation = 'create'
+  AND data.commit.collection = 'app.bsky.feed.post'
+GROUP BY user_id
+ORDER BY first_post_ts ASC, user_id DESC
+LIMIT 3;
+
+-- Query 5:
+SELECT data.did::String as user_id,
+       date_part(
+           'epoch',
+           max(to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) -
+             min(to_timestamp_micros(arrow_cast(data.time_us, 'Int64')))
+       ) AS activity_span
+FROM bluesky
+WHERE data.kind = 'commit'
+  AND data.commit.operation = 'create'
+  AND data.commit.collection = 'app.bsky.feed.post'
+GROUP BY user_id
+ORDER BY activity_span DESC, user_id DESC
+LIMIT 3;
+
+-- SQLNESS REPLACE (peers.*) REDACTED
+EXPLAIN
+SELECT date_part('hour', to_timestamp_micros(arrow_cast(data.time_us, 'Int64'))) as hour_of_day
+FROM bluesky;
+
+DROP TABLE bluesky;
diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result
index 76b9838628..4674f83c98 100644
--- a/tests/cases/standalone/common/view/create.result
+++ b/tests/cases/standalone/common/view/create.result
@@ -116,6 +116,7 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
 |greptime|information_schema|process_list|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|information_schema|profiling|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|information_schema|referential_constraints|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
+|greptime|information_schema|region_info|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|information_schema|region_peers|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|information_schema|region_statistics|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|
 |greptime|information_schema|routines|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y|