Merge branch 'main' into feat/flush-hook-extension-point

2026-06-03 05:40:40 +00:00 · 2026-05-29 12:35:10 -07:00
parent b1497ee450 869a584f8a
commit 655f3a959c
166 changed files with 15992 additions and 1955 deletions
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -669,18 +669,28 @@ jobs:
          - name: "Basic"
            opts: ""
            kafka: false
+            postgres: false
+            mysql: false
          - name: "Remote WAL"
            opts: "-w kafka -k 127.0.0.1:9092"
            kafka: true
+            postgres: false
+            mysql: false
          - name: "PostgreSQL KvBackend"
-            opts: "--setup-pg"
+            opts: "--setup-pg postgresql://greptimedb:admin@127.0.0.1:5432/postgres"
            kafka: false
-          - name: "MySQL Kvbackend"
-            opts: "--setup-mysql"
+            postgres: true
+            mysql: false
+          - name: "MySQL KvBackend"
+            opts: "--setup-mysql mysql://greptimedb:admin@127.0.0.1:3306/mysql"
            kafka: false
+            postgres: false
+            mysql: true
          - name: "Flat format"
            opts: "--enable-flat-format"
            kafka: false
+            postgres: false
+            mysql: false
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
@@ -688,9 +698,19 @@ jobs:
          persist-credentials: false

      - if: matrix.mode.kafka
-        name: Setup kafka server
+        name: Setup Kafka
        working-directory: tests-integration/fixtures
-        run:  ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
+        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait kafka
+
+      - if: matrix.mode.postgres
+        name: Setup PostgreSQL
+        working-directory: tests-integration/fixtures
+        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait postgres
+
+      - if: matrix.mode.mysql
+        name: Setup MySQL
+        working-directory: tests-integration/fixtures
+        run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait mysql

      - name: Download pre-built binaries
        uses: actions/download-artifact@v4
--- a/.github/workflows/nightly-jsonbench.yaml
+++ b/.github/workflows/nightly-jsonbench.yaml
@@ -0,0 +1,162 @@
+name: Nightly JSONBench
+
+on:
+  schedule:
+    # Trigger at 00:00 Asia/Shanghai, Monday to Friday (16:00 UTC, Sunday to Thursday).
+    - cron: "0 16 * * 0-4"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  allocate-runner:
+    name: Allocate runner
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
+    runs-on: ubuntu-latest
+    outputs:
+      linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
+
+      # The following EC2 resource IDs are used later to release the runner.
+      linux-arm64-ec2-runner-label: ${{ steps.start-linux-arm64-runner.outputs.label }}
+      linux-arm64-ec2-runner-instance-id: ${{ steps.start-linux-arm64-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Allocate Linux ARM64 runner
+        uses: ./.github/actions/start-runner
+        id: start-linux-arm64-runner
+        with:
+          runner: ${{ vars.DEFAULT_ARM64_RUNNER }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.EC2_RUNNER_REGION }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          image-id: ${{ vars.EC2_RUNNER_LINUX_ARM64_IMAGE_ID }}
+          security-group-id: ${{ vars.EC2_RUNNER_SECURITY_GROUP_ID }}
+          subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }}
+
+  jsonbench:
+    name: Run JSONBench
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
+    needs: [ allocate-runner ]
+    runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }}
+    timeout-minutes: 120
+    env:
+      JSONBENCH_DATA_DIR: /home/runner/data/bluesky
+      JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - uses: arduino/setup-protoc@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: "nightly-jsonbench"
+          cache-all-crates: "true"
+          save-if: ${{ github.ref == 'refs/heads/main' }}
+
+      - name: Build GreptimeDB
+        run: cargo build --profile nightly --bin greptime
+
+      - name: Reclaim disk space
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          mkdir -p "${RUNNER_TEMP}/greptimedb-bin"
+          cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime"
+          chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime"
+
+          rm -rf ./target
+
+      - name: Run JSONBench
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          cd "${RUNNER_TEMP}"
+          cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime
+          chmod +x ./greptime
+
+          export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal
+          export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data
+          export GREPTIMEDB_STANDALONE__LOGGING__DIR=greptimedb_data/logs
+          export GREPTIMEDB_STANDALONE__LOGGING__APPEND_STDOUT=false
+          export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB
+          export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s
+
+          ./greptime standalone start > greptimedb.log 2>&1 &
+          greptime_pid=$!
+          trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT
+
+          until curl -s --fail -o /dev/null http://localhost:4000/health; do
+            if ! kill -0 "${greptime_pid}" 2>/dev/null; then
+              cat greptimedb.log
+              exit 1
+            fi
+            sleep 1
+          done
+
+          git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
+          cp ./greptime JSONBench/greptimedb/greptime
+
+          cd JSONBench/greptimedb
+          ./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
+
+      - name: Upload JSONBench results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: jsonbench-results
+          path: |
+            ${{ runner.temp }}/greptimedb.log
+            ${{ runner.temp }}/JSONBench/greptimedb/*.log
+            ${{ runner.temp }}/JSONBench/greptimedb/*.total_size
+            ${{ runner.temp }}/JSONBench/greptimedb/*.data_size
+            ${{ runner.temp }}/JSONBench/greptimedb/*.index_size
+            ${{ runner.temp }}/JSONBench/greptimedb/*.count
+            ${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime
+            ${{ runner.temp }}/JSONBench/greptimedb/*.query_results
+          if-no-files-found: ignore
+          retention-days: 7
+
+  stop-linux-arm64-runner:
+    name: Stop Linux ARM64 runner
+    # Always runs as the last job in the workflow so the runner is released even if earlier jobs fail.
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    needs: [
+      allocate-runner,
+      jsonbench,
+    ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Stop Linux ARM64 runner
+        uses: ./.github/actions/stop-runner
+        with:
+          label: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-label }}
+          ec2-instance-id: ${{ needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.EC2_RUNNER_REGION }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1321,9 +1321,9 @@ dependencies = [

 [[package]]
 name = "bitpacking"
-version = "0.9.2"
+version = "0.9.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92"
+checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019"
 dependencies = [
 "crunchy",
 ]
@@ -1832,7 +1832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7d8d1efd5109b9c1cd3b7966bd071cdfb53bb6eb0b22a473a68c2f70a11a1eb"
 dependencies = [
 "parse-zoneinfo",
- "phf_codegen",
+ "phf_codegen 0.12.1",
 "phf_shared 0.12.1",
 "uncased",
 ]
@@ -4380,6 +4380,12 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "datasketches"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745"
+
 [[package]]
 name = "datatypes"
 version = "1.1.0"
@@ -5486,12 +5492,12 @@ dependencies = [

 [[package]]
 name = "fs4"
-version = "0.8.4"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8"
+checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4"
 dependencies = [
- "rustix 0.38.44",
- "windows-sys 0.52.0",
+ "rustix 1.0.7",
+ "windows-sys 0.59.0",
 ]

 [[package]]
@@ -5820,7 +5826,7 @@ dependencies = [
 [[package]]
 name = "greptime-proto"
 version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=dfd2a6d7d3d9c718cb159fcf9abae144b74fc503#dfd2a6d7d3d9c718cb159fcf9abae144b74fc503"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=7224c2ad6d11db612fbdb621c36135fc37ffce35#7224c2ad6d11db612fbdb621c36135fc37ffce35"
 dependencies = [
 "prost 0.14.1",
 "prost-types 0.14.1",
@@ -6564,27 +6570,37 @@ checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed"

 [[package]]
 name = "include-flate"
-version = "0.3.0"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e"
+checksum = "23e233413926ef735f7d87024466cfda5a4b87467730846bd82ea7d504121347"
 dependencies = [
 "include-flate-codegen",
- "lazy_static",
- "libflate",
+ "include-flate-compress",
 ]

 [[package]]
 name = "include-flate-codegen"
-version = "0.2.0"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7"
+checksum = "5e7148f24ef8922cc0e5574ebb908729ccdd3a110c440a45165733fedadd9969"
 dependencies = [
- "libflate",
+ "include-flate-compress",
+ "proc-macro-error2",
 "proc-macro2",
 "quote",
 "syn 2.0.117",
 ]

+[[package]]
+name = "include-flate-compress"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74783a9ed407e844e99d5e7a57bd650acbfa124cf6e97ffd790ba59d8ab8e7ff"
+dependencies = [
+ "libflate",
+ "zstd",
+]
+
 [[package]]
 name = "include_dir"
 version = "0.7.4"
@@ -6918,25 +6934,25 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"

 [[package]]
 name = "jieba-macros"
-version = "0.8.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6105f38f083bb1a79ad523bd32fa0d8ffcb6abd2fc4da9da203c32bca5b6ace3"
+checksum = "661344b2412fb00aee1841d2405c9a31f7c91cf6e578a8e953647c43dd1a8b0a"
 dependencies = [
- "phf_codegen",
+ "phf_codegen 0.13.1",
 ]

 [[package]]
 name = "jieba-rs"
-version = "0.8.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "47982a320106da83b0c5d6aec0fb83e109f0132b69670b063adaa6fa5b4f3f4a"
+checksum = "d7ef90d6209fcff084a01b488c4199d882e3764b15ff0e7a6b5d7efaa46e1e4f"
 dependencies = [
 "cedarwood",
- "fxhash",
 "include-flate",
 "jieba-macros",
- "phf 0.12.1",
+ "phf 0.13.1",
 "regex",
+ "rustc-hash 2.1.1",
 ]

 [[package]]
@@ -7483,25 +7499,25 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc"

 [[package]]
 name = "libflate"
-version = "2.1.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e"
+checksum = "cd96e993e5f3368b0cb8497dae6c860c22af8ff18388c61c6c0b86c58d86b5df"
 dependencies = [
 "adler32",
- "core2",
 "crc32fast",
 "dary_heap",
 "libflate_lz77",
+ "no_std_io2",
 ]

 [[package]]
 name = "libflate_lz77"
-version = "2.1.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
+checksum = "ff7a10e427698aef6eef269482776debfef63384d30f13aad39a1a95e0e098fd"
 dependencies = [
- "core2",
- "hashbrown 0.14.5",
+ "hashbrown 0.16.1",
+ "no_std_io2",
 "rle-decode-fast",
 ]

@@ -7816,6 +7832,15 @@ dependencies = [
 "hashbrown 0.15.4",
 ]

+[[package]]
+name = "lru"
+version = "0.16.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39"
+dependencies = [
+ "hashbrown 0.16.1",
+]
+
 [[package]]
 name = "lru-slab"
 version = "0.1.2"
@@ -8299,6 +8324,7 @@ dependencies = [
 "either",
 "futures",
 "greptime-proto",
+ "humantime",
 "humantime-serde",
 "index",
 "itertools 0.14.0",
@@ -8434,7 +8460,7 @@ dependencies = [
 "flate2",
 "io-enum",
 "libc",
- "lru",
+ "lru 0.12.5",
 "mysql_common 0.34.1",
 "named_pipe",
 "pem",
@@ -8497,7 +8523,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "keyed_priority_queue",
- "lru",
+ "lru 0.12.5",
 "mysql_common 0.34.1",
 "pem",
 "percent-encoding",
@@ -8695,6 +8721,15 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "no_std_io2"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "nohash"
 version = "0.2.0"
@@ -9635,6 +9670,15 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "ordered-float"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.4.3"
@@ -10122,6 +10166,15 @@ checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7"
 dependencies = [
 "phf_macros",
 "phf_shared 0.12.1",
+]
+
+[[package]]
+name = "phf"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
+dependencies = [
+ "phf_shared 0.13.1",
 "serde",
 ]

@@ -10131,10 +10184,20 @@ version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "efbdcb6f01d193b17f0b9c3360fa7e0e620991b193ff08702f78b3ce365d7e61"
 dependencies = [
- "phf_generator",
+ "phf_generator 0.12.1",
 "phf_shared 0.12.1",
 ]

+[[package]]
+name = "phf_codegen"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
+dependencies = [
+ "phf_generator 0.13.1",
+ "phf_shared 0.13.1",
+]
+
 [[package]]
 name = "phf_generator"
 version = "0.12.1"
@@ -10145,13 +10208,23 @@ dependencies = [
 "phf_shared 0.12.1",
 ]

+[[package]]
+name = "phf_generator"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
+dependencies = [
+ "fastrand",
+ "phf_shared 0.13.1",
+]
+
 [[package]]
 name = "phf_macros"
 version = "0.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368"
 dependencies = [
- "phf_generator",
+ "phf_generator 0.12.1",
 "phf_shared 0.12.1",
 "proc-macro2",
 "quote",
@@ -10178,6 +10251,15 @@ dependencies = [
 "uncased",
 ]

+[[package]]
+name = "phf_shared"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
+dependencies = [
+ "siphasher",
+]
+
 [[package]]
 name = "pin-project"
 version = "1.1.10"
@@ -11415,16 +11497,6 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba"

-[[package]]
-name = "rand_distr"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
-dependencies = [
- "num-traits",
- "rand 0.8.5",
-]
-
 [[package]]
 name = "rand_xorshift"
 version = "0.4.0"
@@ -12705,6 +12777,7 @@ dependencies = [
 "metric-engine",
 "mime_guess",
 "mysql_async",
+ "mysql_common 0.34.1",
 "notify",
 "object-pool",
 "once_cell",
@@ -12960,9 +13033,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"

 [[package]]
 name = "sketches-ddsketch"
-version = "0.3.0"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a"
+checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea"
 dependencies = [
 "serde",
 ]
@@ -13863,9 +13936,9 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"

 [[package]]
 name = "tantivy"
-version = "0.24.2"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43"
+checksum = "edde6a10743fff00a4e1a8c9ef020bf5f3cbad301b7d2d39f2b07f123c4eac07"
 dependencies = [
 "aho-corasick",
 "arc-swap",
@@ -13876,17 +13949,17 @@ dependencies = [
 "census",
 "crc32fast",
 "crossbeam-channel",
+ "datasketches",
 "downcast-rs",
 "fastdivide",
 "fnv",
 "fs4",
 "htmlescape",
- "hyperloglogplus",
 "itertools 0.14.0",
 "levenshtein_automata",
 "log",
- "lru",
- "lz4_flex 0.11.6",
+ "lru 0.16.4",
+ "lz4_flex 0.13.1",
 "measure_time",
 "memmap2",
 "once_cell",
@@ -13909,6 +13982,7 @@ dependencies = [
 "tempfile",
 "thiserror 2.0.17",
 "time",
+ "typetag",
 "uuid",
 "winapi",
 "zstd",
@@ -13916,18 +13990,18 @@ dependencies = [

 [[package]]
 name = "tantivy-bitpacker"
-version = "0.8.0"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494"
+checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4"
 dependencies = [
 "bitpacking",
 ]

 [[package]]
 name = "tantivy-columnar"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344"
+checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc"
 dependencies = [
 "downcast-rs",
 "fastdivide",
@@ -13941,9 +14015,9 @@ dependencies = [

 [[package]]
 name = "tantivy-common"
-version = "0.9.0"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f"
+checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -13965,9 +14039,9 @@ dependencies = [

 [[package]]
 name = "tantivy-jieba"
-version = "0.16.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3b08147cc130e323ecc522117927b198bec617fe1df562a0b6449905858d0363"
+checksum = "3392170e86f1c387170aba7d171a466ffdc98a8b55b006e19ac64b123a7b690a"
 dependencies = [
 "jieba-rs",
 "lazy_static",
@@ -13976,20 +14050,22 @@ dependencies = [

 [[package]]
 name = "tantivy-query-grammar"
-version = "0.24.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a"
+checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82"
 dependencies = [
+ "fnv",
 "nom 7.1.3",
+ "ordered-float 5.3.0",
 "serde",
 "serde_json",
 ]

 [[package]]
 name = "tantivy-sstable"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416"
+checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606"
 dependencies = [
 "futures-util",
 "itertools 0.14.0",
@@ -14001,20 +14077,19 @@ dependencies = [

 [[package]]
 name = "tantivy-stacker"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1"
+checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951"
 dependencies = [
 "murmurhash32",
- "rand_distr",
 "tantivy-common",
 ]

 [[package]]
 name = "tantivy-tokenizer-api"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d"
+checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98"
 dependencies = [
 "serde",
 ]
@@ -15017,9 +15092,9 @@ checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"

 [[package]]
 name = "typetag"
-version = "0.2.20"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f22b40dd7bfe8c14230cf9702081366421890435b2d625fa92b4acc4c3de6f"
+checksum = "c5a897b12c6c1151ad0b138b8db50252dc301f93bc3b027db05eec82aeed298c"
 dependencies = [
 "erased-serde",
 "inventory",
@@ -15030,9 +15105,9 @@ dependencies = [

 [[package]]
 name = "typetag-impl"
-version = "0.2.20"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35f5380909ffc31b4de4f4bdf96b877175a016aa2ca98cee39fcfd8c4d53d952"
+checksum = "cf808357c6ed7e13ba0f3277ec8d8f21b2d501274895104263985330c726c1c5"
 dependencies = [
 "proc-macro2",
 "quote",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -158,7 +158,7 @@ fs2 = "0.4"
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "dfd2a6d7d3d9c718cb159fcf9abae144b74fc503" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "7224c2ad6d11db612fbdb621c36135fc37ffce35" }
 hex = "0.4"
 http = "1"
 humantime = "2.1"
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
 <h2 align="center">One database for metrics, logs, and traces<br/>
 replacing Prometheus, Loki, and Elasticsearch</h2>

->  The unified OpenTelemetry backend — with SQL + PromQL on object storage.
+> The unified OpenTelemetry backend — with SQL + PromQL on object storage.

 <div align="center">
 <h3 align="center">
@@ -30,11 +30,11 @@ replacing Prometheus, Loki, and Elasticsearch</h2>
 <a href="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml">
 <img src="https://github.com/GreptimeTeam/greptimedb/actions/workflows/develop.yml/badge.svg" alt="GitHub Actions"/>
 </a>
-<a href="https://codecov.io/gh/GrepTimeTeam/greptimedb">
-<img src="https://codecov.io/gh/GrepTimeTeam/greptimedb/branch/main/graph/badge.svg?token=FITFDI3J3C" alt="Codecov"/>
+<a href="https://codecov.io/gh/GreptimeTeam/greptimedb">
+<img src="https://codecov.io/gh/GreptimeTeam/greptimedb/branch/main/graph/badge.svg?token=FITFDI3J3C" alt="Codecov"/>
 </a>
-<a href="https://github.com/greptimeTeam/greptimedb/blob/main/LICENSE">
-<img src="https://img.shields.io/github/license/greptimeTeam/greptimedb" alt="License"/>
+<a href="https://github.com/GreptimeTeam/greptimedb/blob/main/LICENSE">
+<img src="https://img.shields.io/github/license/GreptimeTeam/greptimedb" alt="License"/>
 </a>

 <br/>
@@ -51,7 +51,8 @@ replacing Prometheus, Loki, and Elasticsearch</h2>
 </div>

 - [Introduction](#introduction)
- [⭐ Key Features](#features)
+- [Overview](#overview)
+- [Features](#features)
 - [How GreptimeDB Compares](#how-greptimedb-compares)
 - [Architecture](#architecture)
 - [Try GreptimeDB](#try-greptimedb)
@@ -69,37 +70,47 @@ replacing Prometheus, Loki, and Elasticsearch</h2>

 **GreptimeDB** is an open-source observability database built for [Observability 2.0](https://docs.greptime.com/user-guide/concepts/observability-2/) — treating metrics, logs, and traces as one unified data model (wide events) instead of three separate pillars.

-Use it as the single OpenTelemetry backend — replacing Prometheus, Loki, and Elasticsearch with one database built on object storage. Query with SQL and PromQL, scale without pain, cut costs up to 50x.
+Use it as the single OpenTelemetry backend — replacing Prometheus, Loki, and Elasticsearch with one database built on object storage. Query with SQL and PromQL, scale without pain, cut costs up to 50×.
+
+## Overview
+
+A quick overview of what GreptimeDB ingests, how it connects to other systems, and what its distributed engine lets you do.
+
+<p align="center">
+  <a href="https://github.com/GreptimeTeam/greptimedb/raw/main/docs/overview.png" target="_blank" rel="noopener">
+    <img alt="GreptimeDB Overview" src="docs/overview.png" width="900px">
+  </a>
+</p>

 ## Features

-|   Feature  | Description |
-| --------- | ----------- |
-| Drop-in replacement | [PromQL](https://docs.greptime.com/user-guide/query-data/promql/), [Prometheus remote write](https://docs.greptime.com/user-guide/ingest-data/for-observability/prometheus/), [Jaeger](https://docs.greptime.com/user-guide/query-data/jaeger/), and [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) native. Use as your single backend for all three signals, or migrate one at a time.|
-| 50x lower cost | Object storage (S3, GCS, Azure Blob etc.) as [primary storage](https://docs.greptime.com/user-guide/deployments-administration/configuration/#storage-options). Compute-storage separation scales without pain.|
-| SQL + PromQL | Monitor with [PromQL](https://docs.greptime.com/user-guide/query-data/promql), analyze with [SQL](https://docs.greptime.com/user-guide/query-data/sql). One database replaces Prometheus + your data warehouse.|
-| Sub-second at PB-EB scale | Columnar engine with [fulltext, inverted, and skipping indexes](https://docs.greptime.com/user-guide/manage-data/data-index). Written in Rust.|
+| Feature | Description |
+|---------|-------------|
+| **Observability 2.0 native** | Logs, metrics, and traces in one engine with [SQL + PromQL](https://docs.greptime.com/user-guide/query-data/overview/). Native [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/), [Prometheus remote write](https://docs.greptime.com/user-guide/ingest-data/for-observability/prometheus/), and [Jaeger](https://docs.greptime.com/user-guide/query-data/jaeger/). Migrate one signal at a time, or use as a single backend. |
+| **Elastic compute-storage separation** | Scale reads independently with horizontal replicas. Serve high-concurrency workloads from dashboards, alerting, and AI agents — without resharding or data migration. |
+| **Sub-second on PB–EB-scale data** | Columnar engine with [fulltext, inverted, and skipping indexes](https://docs.greptime.com/user-guide/manage-data/data-index). Written in Rust. Designed for high-concurrency point queries, not just analytical scans. |
+| **50× lower cost** | Object storage (S3, GCS, Azure Blob) as [primary storage](https://docs.greptime.com/user-guide/deployments-administration/configuration/#storage-options), with a tiered cache (memory + local disk) to keep writes and queries fast. |

-  ✅ **Perfect for:**
-  * Replacing Prometheus + Loki + Elasticsearch with one database
+**Perfect for:**
+  * Replacing Prometheus + Loki + Elasticsearch with a single observability backend
  * Scaling past Prometheus — high cardinality, long-term storage, no Thanos/Mimir overhead
-  * Cutting observability costs with object storage (up to 50x savings on traces, 30% on logs)
-  * AI/LLM observability — store and analyze high-volume conversation data, agent traces, and token metrics via [OpenTelemetry GenAI conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)
+  * AI/agent workloads — store GenAI telemetry ([OTel GenAI conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/)), and serve high-concurrency reads from SRE/developer agents via horizontal read replicas
+  * Cutting observability costs with object storage (up to 50× savings on traces, 30% on logs)
  * Edge-to-cloud observability with unified APIs on resource-constrained devices

-> **Why Observability 2.0?** The three-pillar model (separate databases for metrics, logs, traces) creates data silos and operational complexity. GreptimeDB treats all observability data as timestamped wide events in a single columnar engine — enabling cross-signal SQL JOINs, eliminating redundant infrastructure, and naturally supporting emerging workloads like AI agent observability. Read more: [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).
+> **Why Observability 2.0?** Three separate databases for metrics, logs, and traces means three storage layers, three query languages, and three sets of dashboards. GreptimeDB stores all three as timestamped wide events in one columnar engine — JOIN across signals in SQL, run one stack instead of three, and ingest AI agent telemetry the same way. Read more: [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database).

 Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb).

 ## How GreptimeDB Compares

-| Feature | GreptimeDB | Prometheus / Thanos / Mimir | Grafana Loki | Elasticsearch |
+| Capability | GreptimeDB | Prometheus / Thanos / Mimir | Grafana Loki | Elasticsearch |
 |---|---|---|---|---|
 | Data types | Metrics, logs, traces | Metrics only | Logs only | Logs, traces |
 | Query language | SQL + PromQL | PromQL | LogQL | Query DSL |
 | Storage | Native object storage (S3, etc.) | Local disk + object storage (Thanos/Mimir) | Object storage (chunks) | Local disk |
 | Scaling | Compute-storage separation, stateless nodes | Federation / Thanos / Mimir — multi-component, ops heavy | Stateless + object storage | Shard-based, ops heavy |
-| Cost efficiency | Up to 50x lower storage | High at scale | Moderate | High (inverted index overhead) |
+| Cost efficiency | Up to 50× lower storage cost | High at scale | Moderate | High (inverted index overhead) |
 | OpenTelemetry | Native (metrics + logs + traces) | Partial (metrics only) | Partial (logs only) | Via instrumentation |

 **Benchmarks:**
@@ -110,19 +121,26 @@ Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why
 ## Architecture

 GreptimeDB can run in two modes:
-* **Standalone Mode** - Single binary for development and small deployments
-* **Distributed Mode** - Separate components for production scale:
-  - Frontend: Query processing and protocol handling
-  - Datanode: Data storage and retrieval
-  - Metasrv: Metadata management and coordination
-  
-Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB:
-  <img alt="GreptimeDB System Overview" src="docs/architecture.png">
+* **Standalone** — single binary for development and small deployments.
+* **Distributed** — four components, each independently scalable:
+  - **Frontend** — protocol entry (OTel, Prometheus, MySQL/PostgreSQL, gRPC, ingestion APIs for Elasticsearch/InfluxDB/Loki) and the distributed query engine. Stateless, scales horizontally.
+  - **Datanode** — region engine with WAL, memtable, SST, cache, compaction, and indexes. Persists data to object storage. Elastic.
+  - **Metasrv** — metadata, routing, repartitioning, autopilot, and security. Backed by a pluggable KV layer (etcd or RDS).
+  - **Flownode** (optional) — continuous flow computation (streaming and materialized views).
+
+For deeper coverage, see the [architecture doc](https://docs.greptime.com/contributor-guide/overview/#architecture) or [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview).
+
+<a href="https://github.com/GreptimeTeam/greptimedb/raw/main/docs/architecture.png" target="_blank" rel="noopener">
+  <img alt="GreptimeDB System Overview" src="https://cdn.jsdelivr.net/gh/GreptimeTeam/greptimedb@main/docs/architecture.png">
+</a>

 ## Try GreptimeDB

-```shell
-docker pull greptime/greptimedb
+**For AI agents** — paste this prompt into your agent:
+
+```text
+Read https://docs.greptime.com/SKILL.md and follow the instructions
+to deploy, configure, ingest, and query GreptimeDB.
 ```

 ```shell
@@ -131,7 +149,7 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \
  --name greptime --rm \
  greptime/greptimedb:latest standalone start \
  --http-addr 0.0.0.0:4000 \
-  --grpc-bind-addr 0.0.0.0:4001 \
+  --rpc-bind-addr 0.0.0.0:4001 \
  --mysql-addr 0.0.0.0:4002 \
  --postgres-addr 0.0.0.0:4003
 ```
@@ -153,20 +171,30 @@ Read more in the [full Install Guide](https://docs.greptime.com/getting-started/
 ## Build From Source

 **Prerequisites:**
-* [Rust toolchain](https://www.rust-lang.org/tools/install) (nightly)
+* [Rust toolchain](https://www.rust-lang.org/tools/install) — nightly, pinned by [`rust-toolchain.toml`](https://github.com/GreptimeTeam/greptimedb/blob/main/rust-toolchain.toml)
 * [Protobuf compiler](https://grpc.io/docs/protoc-installation/) (>= 3.15)
-* C/C++ building essentials, including `gcc`/`g++`/`autoconf` and glibc library (eg. `libc6-dev` on Ubuntu and `glibc-devel` on Fedora)
-* Python toolchain (optional): Required only if using some test scripts.
+* C/C++ build essentials: `gcc` / `g++` / `autoconf` and the glibc dev package (`libc6-dev` on Ubuntu, `glibc-devel` on Fedora)
+* Python toolchain (optional, only for some test scripts)

-**Build and Run:**
+**Build and run:**
 ```bash
-make
-cargo run -- standalone start
+make                          # build greptime binary
+cargo run -- standalone start # start in standalone mode
 ```

+**Common dev commands:**
+```bash
+make fmt            # format Rust code
+make clippy         # lint (fails on warnings)
+make test           # unit + integration tests (uses cargo-nextest)
+make sqlness-test   # SQL regression tests
+```
+
+See the [Contribution Guidelines](CONTRIBUTING.md) for the full developer workflow.
+
 ## Tools & Extensions

- **Kubernetes**: [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator)
+- **Kubernetes**: [GreptimeDB Operator](https://github.com/GreptimeTeam/greptimedb-operator)
 - **Helm Charts**: [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts)
 - **Dashboard**: [Web UI](https://github.com/GreptimeTeam/dashboard)
 - **gRPC Ingester**: [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust), [.NET](https://github.com/GreptimeTeam/greptimedb-ingester-dotnet)
@@ -175,18 +203,11 @@ cargo run -- standalone start

 ## Project Status

-> **Status:** [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) — generally available and production-ready! 🎉
+GreptimeDB is at [v1.0 GA](https://github.com/GreptimeTeam/greptimedb/releases/tag/v1.0.0) with stable APIs and regular releases. It runs in production at scale — [OceanBase Cloud](https://greptime.com/blogs/2025-07-22-user-case-obcloud-log-management-greptimedb) operates 80+ GreptimeDB clusters managing 300 TB of logs, cutting log storage cost by 60% after migrating from Grafana Loki. See more in [case studies](https://greptime.com/blogs/?category=Use%20Case).

- Deployed in production handling billions of data points daily
- Stable APIs, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version))
+Read the [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026), or browse the [version reference](https://docs.greptime.com/nightly/reference/about-greptimedb-version).

-GreptimeDB v1.0 marks a major milestone — stable APIs, production readiness, and proven performance at scale.
-
-**Learn more:** [v1.0 highlights](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) and [2026 roadmap](https://greptime.com/blogs/2026-02-11-greptimedb-roadmap-2026).
-
-For production use, we recommend v1.0 or later.
-
-If you find this project useful, a ⭐ would mean a lot to us!
+If GreptimeDB is useful to you, please star the repo.

 [![Star History Chart](https://api.star-history.com/svg?repos=GreptimeTeam/GreptimeDB&type=Date)](https://www.star-history.com/#GreptimeTeam/GreptimeDB&Date)

@@ -216,15 +237,19 @@ We offer enterprise add-ons, services, training, and consulting.

 ## Contributing

- Read our [Contribution Guidelines](https://github.com/GreptimeTeam/greptimedb/blob/main/CONTRIBUTING.md).
+- Read our [Contribution Guidelines](CONTRIBUTING.md).
 - Explore [Internal Concepts](https://docs.greptime.com/contributor-guide/overview.html) and [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb).
 - Pick up a [good first issue](https://github.com/GreptimeTeam/greptimedb/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and join the #contributors [Slack](https://greptime.com/slack) channel.

 ## Acknowledgement

-Special thanks to all contributors! See [AUTHORS.md](https://github.com/GreptimeTeam/greptimedb/blob/main/AUTHOR.md).
+Special thanks to all contributors! See [AUTHOR.md](AUTHOR.md).

 - Uses [Apache Arrow™](https://arrow.apache.org/) (memory model)
 - [Apache Parquet™](https://parquet.apache.org/) (file storage)
- [Apache DataFusion™](https://arrow.apache.org/datafusion/) (query engine)
+- [Apache DataFusion™](https://datafusion.apache.org/) (query engine)
 - [Apache OpenDAL™](https://opendal.apache.org/) (data access abstraction)
+
+---
+
+*All trademarks, logos, and brand names referenced in this README and in the Overview diagram are the property of their respective owners. Their use is for identification purposes only and does not imply endorsement or affiliation.*
--- a/config/config.md
+++ b/config/config.md
@@ -155,6 +155,8 @@
 | `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
 | `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
 | `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.range_result_cache_size` | String | Auto | Cache size for flat range scan results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.prefilter_result_cache_size` | String | Auto | Cache size for prefilter results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
 | `region_engine.mito.enable_write_cache` | Bool | `false` | Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. |
 | `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
 | `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
@@ -543,6 +545,8 @@
 | `region_engine.mito.vector_cache_size` | String | Auto | Cache size for vectors and arrow arrays. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
 | `region_engine.mito.page_cache_size` | String | Auto | Cache size for pages of SST row groups. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/8 of OS memory. |
 | `region_engine.mito.selector_result_cache_size` | String | Auto | Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.range_result_cache_size` | String | Auto | Cache size for flat range scan results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/16 of OS memory with a max limitation of 512MB. |
+| `region_engine.mito.prefilter_result_cache_size` | String | Auto | Cache size for prefilter results. Setting it to 0 to disable the cache.<br/>If not set, it's default to 1/32 of OS memory with a max limitation of 128MB. |
 | `region_engine.mito.enable_write_cache` | Bool | `false` | Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance. |
 | `region_engine.mito.write_cache_path` | String | `""` | File system path for write cache, defaults to `{data_home}`. |
 | `region_engine.mito.write_cache_size` | String | `5GiB` | Capacity for write cache. If your disk space is sufficient, it is recommended to set it larger. |
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -480,6 +480,16 @@ auto_flush_interval = "1h"
 ## @toml2docs:none-default="Auto"
 #+ selector_result_cache_size = "512MB"

+## Cache size for flat range scan results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
+## @toml2docs:none-default="Auto"
+#+ range_result_cache_size = "512MB"
+
+## Cache size for prefilter results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
+## @toml2docs:none-default="Auto"
+#+ prefilter_result_cache_size = "128MB"
+
 ## Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance.
 enable_write_cache = false

--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -599,6 +599,16 @@ auto_flush_interval = "1h"
 ## @toml2docs:none-default="Auto"
 #+ selector_result_cache_size = "512MB"

+## Cache size for flat range scan results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/16 of OS memory with a max limitation of 512MB.
+## @toml2docs:none-default="Auto"
+#+ range_result_cache_size = "512MB"
+
+## Cache size for prefilter results. Setting it to 0 to disable the cache.
+## If not set, it's default to 1/32 of OS memory with a max limitation of 128MB.
+## @toml2docs:none-default="Auto"
+#+ prefilter_result_cache_size = "128MB"
+
 ## Whether to enable the write cache, it's enabled by default when using object storage. It is recommended to enable it when using object storage for better performance.
 enable_write_cache = false

--- a/docs/architecture.png
+++ b/docs/architecture.png
--- a/docs/overview.png
+++ b/docs/overview.png
--- a/src/auth/src/permission.rs
+++ b/src/auth/src/permission.rs
@@ -16,6 +16,7 @@ use std::fmt::Debug;
 use std::sync::Arc;

 use api::v1::greptime_request::Request;
+use api::v1::query_request::Query;
 use common_telemetry::debug;
 use sql::statements::statement::Statement;

@@ -42,10 +43,12 @@ impl<'a> PermissionReq<'a> {
    /// Returns true if the permission request is for read operations.
    pub fn is_readonly(&self) -> bool {
        match self {
-            PermissionReq::GrpcRequest(Request::Query(_))
-            | PermissionReq::PromQuery
-            | PermissionReq::LogQuery
-            | PermissionReq::PromStoreRead => true,
+            PermissionReq::GrpcRequest(Request::Query(query_request)) => {
+                !matches!(query_request.query, Some(Query::InsertIntoPlan(_)))
+            }
+            PermissionReq::PromQuery | PermissionReq::LogQuery | PermissionReq::PromStoreRead => {
+                true
+            }
            PermissionReq::SqlStatement(stmt) => stmt.is_readonly(),

            PermissionReq::GrpcRequest(_)
@@ -196,4 +199,14 @@ mod tests {
        assert!(matches!(read_result, PermissionResp::Reject));
        assert!(matches!(write_result, PermissionResp::Allow));
    }
+
+    #[test]
+    fn test_grpc_insert_into_plan_is_write_request() {
+        let request = Request::Query(api::v1::QueryRequest {
+            query: Some(Query::InsertIntoPlan(api::v1::InsertIntoPlan::default())),
+        });
+        let req = PermissionReq::GrpcRequest(&request);
+
+        assert!(req.is_write());
+    }
 }
--- a/src/catalog/src/system_schema/information_schema.rs
+++ b/src/catalog/src/system_schema/information_schema.rs
@@ -20,6 +20,7 @@ pub mod key_column_usage;
 mod partitions;
 mod procedure_info;
 pub mod process_list;
+mod region_info;
 pub mod region_peers;
 mod region_statistics;
 pub mod schemata;
@@ -47,6 +48,8 @@ use datatypes::schema::SchemaRef;
 use lazy_static::lazy_static;
 use paste::paste;
 use process_list::InformationSchemaProcessList;
+use region_info::InformationSchemaRegionInfo;
+use store_api::region_info::RegionInfoEntry;
 use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry};
 use store_api::storage::{ScanRequest, TableId};
 use table::TableRef;
@@ -242,6 +245,9 @@ impl SystemSchemaProviderInner for InformationSchemaProvider {
                    self.catalog_manager.clone(),
                ),
            ) as _),
+            REGION_INFO => Some(Arc::new(InformationSchemaRegionInfo::new(
+                self.catalog_manager.clone(),
+            )) as _),
            PROCESS_LIST => self
                .process_manager
                .as_ref()
@@ -320,6 +326,10 @@ impl InformationSchemaProvider {
                REGION_STATISTICS.to_string(),
                self.build_table(REGION_STATISTICS).unwrap(),
            );
+            tables.insert(
+                REGION_INFO.to_string(),
+                self.build_table(REGION_INFO).unwrap(),
+            );
            tables.insert(
                SSTS_MANIFEST.to_string(),
                self.build_table(SSTS_MANIFEST).unwrap(),
@@ -447,6 +457,8 @@ pub enum DatanodeInspectKind {
    SstStorage,
    /// List index metadata collected from manifest
    SstIndexMeta,
+    /// List region runtime and manifest info
+    RegionInfo,
 }

 impl DatanodeInspectRequest {
@@ -456,6 +468,7 @@ impl DatanodeInspectRequest {
            DatanodeInspectKind::SstManifest => ManifestSstEntry::build_plan(self.scan),
            DatanodeInspectKind::SstStorage => StorageSstEntry::build_plan(self.scan),
            DatanodeInspectKind::SstIndexMeta => PuffinIndexMetaEntry::build_plan(self.scan),
+            DatanodeInspectKind::RegionInfo => RegionInfoEntry::build_plan(self.scan),
        }
    }
 }
@@ -488,3 +501,28 @@ impl InformationExtension for NoopInformationExtension {
        Ok(common_recordbatch::RecordBatches::empty().as_stream())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use store_api::region_info::RegionInfoEntry;
+
+    use super::*;
+
+    #[test]
+    fn test_datanode_inspect_region_info_build_plan() {
+        let plan = DatanodeInspectRequest {
+            kind: DatanodeInspectKind::RegionInfo,
+            scan: ScanRequest::default(),
+        }
+        .build_plan()
+        .unwrap();
+
+        let LogicalPlan::TableScan(scan) = plan else {
+            panic!("expected table scan");
+        };
+        assert_eq!(
+            scan.table_name.to_string(),
+            RegionInfoEntry::reserved_table_name_for_inspection()
+        );
+    }
+}
--- a/src/catalog/src/system_schema/information_schema/region_info.rs
+++ b/src/catalog/src/system_schema/information_schema/region_info.rs
@@ -0,0 +1,86 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::{Arc, Weak};
+
+use common_catalog::consts::INFORMATION_SCHEMA_REGION_INFO_TABLE_ID;
+use common_error::ext::BoxedError;
+use common_recordbatch::SendableRecordBatchStream;
+use common_recordbatch::adapter::AsyncRecordBatchStreamAdapter;
+use datatypes::schema::SchemaRef;
+use snafu::ResultExt;
+use store_api::region_info::RegionInfoEntry;
+use store_api::storage::{ScanRequest, TableId};
+
+use crate::CatalogManager;
+use crate::error::{ProjectSchemaSnafu, Result};
+use crate::information_schema::{
+    DatanodeInspectKind, DatanodeInspectRequest, InformationTable, REGION_INFO,
+};
+use crate::system_schema::utils;
+
+/// The `region_info` information schema table; scans are served by a `DatanodeInspectKind::RegionInfo` inspect request.
+pub struct InformationSchemaRegionInfo {
+    schema: SchemaRef,
+    catalog_manager: Weak<dyn CatalogManager>,
+}
+
+impl InformationSchemaRegionInfo {
+    pub(super) fn new(catalog_manager: Weak<dyn CatalogManager>) -> Self {
+        Self {
+            schema: RegionInfoEntry::schema(),
+            catalog_manager,
+        }
+    }
+}
+
+impl InformationTable for InformationSchemaRegionInfo {
+    fn table_id(&self) -> TableId {
+        INFORMATION_SCHEMA_REGION_INFO_TABLE_ID
+    }
+
+    fn table_name(&self) -> &'static str {
+        REGION_INFO
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
+        let schema = if let Some(p) = request.projection_indices() {
+            Arc::new(self.schema.try_project(p).context(ProjectSchemaSnafu)?)
+        } else {
+            self.schema.clone()
+        };
+
+        let info_ext = utils::information_extension(&self.catalog_manager)?;
+        let req = DatanodeInspectRequest {
+            kind: DatanodeInspectKind::RegionInfo,
+            scan: request,
+        };
+
+        let future = async move {
+            info_ext
+                .inspect_datanode(req)
+                .await
+                .map_err(BoxedError::new)
+                .context(common_recordbatch::error::ExternalSnafu)
+        };
+        Ok(Box::pin(AsyncRecordBatchStreamAdapter::new(
+            schema,
+            Box::pin(future),
+        )))
+    }
+}
--- a/src/catalog/src/system_schema/information_schema/table_names.rs
+++ b/src/catalog/src/system_schema/information_schema/table_names.rs
@@ -45,6 +45,7 @@ pub const CLUSTER_INFO: &str = "cluster_info";
 pub const VIEWS: &str = "views";
 pub const FLOWS: &str = "flows";
 pub const PROCEDURE_INFO: &str = "procedure_info";
+pub const REGION_INFO: &str = "region_info";
 pub const REGION_STATISTICS: &str = "region_statistics";
 pub const PROCESS_LIST: &str = "process_list";
 pub const SSTS_MANIFEST: &str = "ssts_manifest";
--- a/src/catalog/src/table_source/dummy_catalog.rs
+++ b/src/catalog/src/table_source/dummy_catalog.rs
@@ -22,6 +22,7 @@ use async_trait::async_trait;
 use common_catalog::format_full_table_name;
 use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider};
 use datafusion::datasource::TableProvider;
+use session::context::QueryContextRef;
 use snafu::OptionExt;
 use table::table::adapter::DfTableProviderAdapter;

@@ -32,12 +33,27 @@ use crate::error::TableNotExistSnafu;
 #[derive(Clone)]
 pub struct DummyCatalogList {
    catalog_manager: CatalogManagerRef,
+    query_ctx: Option<QueryContextRef>,
 }

 impl DummyCatalogList {
-    /// Creates a new catalog list with the given catalog manager.
+    /// Creates a new catalog list with the given catalog manager (no query context).
    pub fn new(catalog_manager: CatalogManagerRef) -> Self {
-        Self { catalog_manager }
+        Self {
+            catalog_manager,
+            query_ctx: None,
+        }
+    }
+
+    /// Creates a new catalog list with the given catalog manager and query context.
+    pub fn new_with_query_ctx(
+        catalog_manager: CatalogManagerRef,
+        query_ctx: QueryContextRef,
+    ) -> Self {
+        Self {
+            catalog_manager,
+            query_ctx: Some(query_ctx),
+        }
    }
 }

@@ -68,6 +84,7 @@ impl CatalogProviderList for DummyCatalogList {
        Some(Arc::new(DummyCatalogProvider {
            catalog_name: catalog_name.to_string(),
            catalog_manager: self.catalog_manager.clone(),
+            query_ctx: self.query_ctx.clone(),
        }))
    }
 }
@@ -77,6 +94,7 @@ impl CatalogProviderList for DummyCatalogList {
 struct DummyCatalogProvider {
    catalog_name: String,
    catalog_manager: CatalogManagerRef,
+    query_ctx: Option<QueryContextRef>,
 }

 impl CatalogProvider for DummyCatalogProvider {
@@ -93,6 +111,7 @@ impl CatalogProvider for DummyCatalogProvider {
            catalog_name: self.catalog_name.clone(),
            schema_name: schema_name.to_string(),
            catalog_manager: self.catalog_manager.clone(),
+            query_ctx: self.query_ctx.clone(),
        }))
    }
 }
@@ -111,6 +130,7 @@ struct DummySchemaProvider {
    catalog_name: String,
    schema_name: String,
    catalog_manager: CatalogManagerRef,
+    query_ctx: Option<QueryContextRef>,
 }

 #[async_trait]
@@ -126,7 +146,12 @@ impl SchemaProvider for DummySchemaProvider {
    async fn table(&self, name: &str) -> datafusion::error::Result<Option<Arc<dyn TableProvider>>> {
        let table = self
            .catalog_manager
-            .table(&self.catalog_name, &self.schema_name, name, None)
+            .table(
+                &self.catalog_name,
+                &self.schema_name,
+                name,
+                self.query_ctx.as_deref(),
+            )
            .await?
            .with_context(|| TableNotExistSnafu {
                table: format_full_table_name(&self.catalog_name, &self.schema_name, name),
--- a/src/cli/src/data/export_v2/command.rs
+++ b/src/cli/src/data/export_v2/command.rs
@@ -15,6 +15,7 @@
 //! Export V2 CLI commands.

 use std::collections::HashSet;
+use std::io::{self, Write};
 use std::time::Duration;

 use async_trait::async_trait;
@@ -28,7 +29,7 @@ use crate::Tool;
 use crate::common::ObjectStoreConfig;
 use crate::data::export_v2::coordinator::export_data;
 use crate::data::export_v2::error::{
-    ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu,
+    ChunkTimeWindowRequiresBoundsSnafu, DatabaseSnafu, EmptyResultSnafu, IoSnafu,
    ManifestVersionMismatchSnafu, Result, ResumeConfigMismatchSnafu, SchemaOnlyArgsNotAllowedSnafu,
    SchemaOnlyModeMismatchSnafu, SnapshotVerifyFailedSnafu, UnexpectedValueTypeSnafu,
 };
@@ -38,7 +39,9 @@ use crate::data::export_v2::manifest::{
 };
 use crate::data::export_v2::schema::{DDL_DIR, SCHEMA_DIR, SCHEMAS_FILE};
 use crate::data::path::{data_dir_for_schema_chunk, ddl_path_for_schema};
-use crate::data::snapshot_storage::{OpenDalStorage, SnapshotStorage, validate_uri};
+use crate::data::snapshot_storage::{
+    OpenDalStorage, SnapshotStorage, validate_snapshot_uri, validate_uri,
+};
 use crate::data::sql::{escape_sql_identifier, escape_sql_literal};
 use crate::database::{DatabaseClient, parse_proxy_opts};

@@ -51,6 +54,8 @@ pub enum ExportV2Command {
    List(ExportListCommand),
    /// Verify snapshot integrity.
    Verify(ExportVerifyCommand),
+    /// Delete a snapshot and all data under it.
+    Delete(ExportDeleteCommand),
 }

 impl ExportV2Command {
@@ -59,6 +64,7 @@ impl ExportV2Command {
            ExportV2Command::Create(cmd) => cmd.build().await,
            ExportV2Command::List(cmd) => cmd.build().await,
            ExportV2Command::Verify(cmd) => cmd.build().await,
+            ExportV2Command::Delete(cmd) => cmd.build().await,
        }
    }
 }
@@ -172,6 +178,75 @@ impl ExportVerify {
    }
 }

+/// Delete a snapshot and all data under it.
+#[derive(Debug, Parser)]
+pub struct ExportDeleteCommand {
+    /// Snapshot storage location (e.g., s3://bucket/path, file:///tmp/backup).
+    #[clap(long)]
+    snapshot: String,
+
+    /// Skip interactive confirmation.
+    #[clap(long = "no-confirm", alias = "yes")]
+    skip_confirmation: bool,
+
+    /// Object store configuration for remote storage backends.
+    #[clap(flatten)]
+    storage: ObjectStoreConfig,
+}
+
+impl ExportDeleteCommand {
+    pub async fn build(&self) -> std::result::Result<Box<dyn Tool>, BoxedError> {
+        validate_snapshot_uri(&self.snapshot).map_err(BoxedError::new)?;
+        let storage =
+            OpenDalStorage::from_uri(&self.snapshot, &self.storage).map_err(BoxedError::new)?;
+
+        Ok(Box::new(ExportDelete {
+            snapshot: self.snapshot.clone(),
+            skip_confirmation: self.skip_confirmation,
+            storage,
+        }))
+    }
+}
+
+/// Tool built by `ExportDeleteCommand`: prints a manifest summary, confirms unless skipped, then deletes the snapshot.
+pub struct ExportDelete {
+    snapshot: String, // snapshot URI as passed on the command line; used for display
+    skip_confirmation: bool, // when true, the confirmation callback is never invoked
+    storage: OpenDalStorage, // storage handle opened from `snapshot`
+}
+
+#[async_trait]
+impl Tool for ExportDelete {
+    async fn do_work(&self) -> std::result::Result<(), BoxedError> {
+        self.run().await.map_err(BoxedError::new)
+    }
+}
+
+impl ExportDelete {
+    async fn run(&self) -> Result<()> { // entry point used by `do_work`: confirms via `confirm_delete`
+        self.run_with_confirmation(confirm_delete).await
+    }
+
+    async fn run_with_confirmation<F>(&self, confirm: F) -> Result<()> // confirmation step is injected as a closure
+    where
+        F: FnOnce(&str) -> Result<bool>,
+    {
+        let manifest = self.storage.read_manifest().await?; // a manifest read error aborts before anything is deleted
+        print_delete_summary(&self.snapshot, &manifest);
+
+        if !self.skip_confirmation && !confirm(&self.snapshot)? { // short-circuit: `confirm` is not called when skipping
+            println!("Deletion cancelled.");
+            return Ok(()); // declining is reported as success, not as an error
+        }
+
+        println!("Deleting snapshot...");
+        self.storage.delete_snapshot().await?;
+        println!("Snapshot deleted successfully.");
+
+        Ok(())
+    }
+}
+
 /// Create a new snapshot.
 #[derive(Debug, Parser)]
 pub struct ExportCreateCommand {
@@ -1239,6 +1314,79 @@ fn print_verify_report(snapshot: &str, report: &VerifyReport) {
    );
 }

+fn print_delete_summary(snapshot: &str, manifest: &Manifest) {
+    println!("Snapshot: {}", manifest.snapshot_id);
+    println!("  Location: {}", snapshot);
+    println!(
+        "  Created:  {} UTC",
+        manifest.created_at.format("%Y-%m-%d %H:%M:%S")
+    );
+    println!("  Catalog:  {}", manifest.catalog);
+    println!("  Schemas:  {}", manifest.schemas.join(", "));
+    println!("  Chunks:   {}", format_delete_chunks(manifest));
+}
+
+fn format_delete_chunks(manifest: &Manifest) -> String {
+    if manifest.schema_only {
+        return "0 (schema-only)".to_string();
+    }
+
+    let summary = summarize_chunks(manifest);
+    if manifest.is_complete() {
+        format!("{} (all processed)", summary.total)
+    } else {
+        format!(
+            "{} ({} completed, {} skipped, {} pending, {} in_progress, {} failed)",
+            summary.total,
+            summary.completed,
+            summary.skipped,
+            summary.pending,
+            summary.in_progress,
+            summary.failed
+        )
+    }
+}
+
+fn confirm_delete(snapshot: &str) -> Result<bool> { // prints a warning and prompt to stdout, then reads one line from stdin
+    println!();
+    println!(
+        "Warning: this removes the entire snapshot directory/prefix, not only files listed in manifest."
+    );
+    println!("This will permanently delete all data under:");
+    println!("  {}", display_snapshot_prefix(snapshot));
+    print!("Type 'yes' to confirm deletion: ");
+    io::stdout().flush().map_err(|error| { // the prompt has no trailing newline; flush so it is written before reading stdin
+        IoSnafu {
+            operation: "flushing delete confirmation prompt",
+            error,
+        }
+        .build()
+    })?;
+
+    let mut input = String::new();
+    io::stdin().read_line(&mut input).map_err(|error| {
+        IoSnafu {
+            operation: "reading delete confirmation",
+            error,
+        }
+        .build()
+    })?;
+
+    Ok(delete_confirmation_matches(&input)) // Ok(false) for any answer the matcher rejects
+}
+
+fn delete_confirmation_matches(input: &str) -> bool {
+    input.trim() == "yes" // exact, case-sensitive match once surrounding whitespace (incl. the newline) is trimmed
+}
+
+fn display_snapshot_prefix(snapshot: &str) -> String {
+    if snapshot.ends_with('/') {
+        snapshot.to_string()
+    } else {
+        format!("{}/", snapshot)
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use chrono::TimeZone;
@@ -1563,6 +1711,7 @@ mod tests {
        );
        assert_eq!(snapshot_status(&complete), "complete");
        assert_eq!(format_list_chunks(&complete), "2/2");
+        assert_eq!(format_delete_chunks(&complete), "2 (all processed)");

        let incomplete = test_manifest(
            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
@@ -1571,6 +1720,150 @@ mod tests {
        );
        assert_eq!(snapshot_status(&incomplete), "incomplete");
        assert_eq!(format_list_chunks(&incomplete), "1/2");
+        assert_eq!(
+            format_delete_chunks(&incomplete),
+            "2 (1 completed, 0 skipped, 1 pending, 0 in_progress, 0 failed)"
+        );
+    }
+
+    // Building the delete command for a bare bucket URI (no object prefix)
+    // must fail with an error mentioning "non-empty path".
+    #[tokio::test]
+    async fn test_delete_build_rejects_bucket_root_uri() {
+        let cmd = ExportDeleteCommand::parse_from([
+            "export-v2-delete",
+            "--snapshot",
+            "s3://bucket",
+            "--no-confirm",
+        ]);
+
+        let error = cmd.build().await.err().unwrap().to_string();
+        assert!(error.contains("non-empty path"));
+    }
+
+    // `--no-confirm` and `--yes` are both accepted and both set
+    // `skip_confirmation` on the parsed command.
+    #[test]
+    fn test_delete_skip_confirmation_aliases() {
+        let no_confirm = ExportDeleteCommand::parse_from([
+            "export-v2-delete",
+            "--snapshot",
+            "s3://bucket/snapshot",
+            "--no-confirm",
+        ]);
+        assert!(no_confirm.skip_confirmation);
+
+        let yes = ExportDeleteCommand::parse_from([
+            "export-v2-delete",
+            "--snapshot",
+            "s3://bucket/snapshot",
+            "--yes",
+        ]);
+        assert!(yes.skip_confirmation);
+    }
+
+    // With confirmation skipped, the snapshot's manifest and schema file are
+    // removed while a sibling directory next to the snapshot is left intact.
+    #[tokio::test]
+    async fn test_delete_snapshot_with_no_confirm_removes_snapshot_contents() {
+        let parent = tempdir().unwrap();
+        let snapshot = parent.path().join("snapshot");
+        let sibling = parent.path().join("sibling");
+        std::fs::create_dir_all(&snapshot).unwrap();
+        std::fs::create_dir_all(&sibling).unwrap();
+        std::fs::write(sibling.join("keep.txt"), b"keep").unwrap();
+        write_root_manifest(
+            &snapshot,
+            test_manifest(
+                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+                true,
+                true,
+            ),
+        );
+        write_snapshot_file(&snapshot, "schema/schemas.json", b"[]");
+
+        let uri = Url::from_directory_path(&snapshot).unwrap().to_string();
+        let delete = ExportDelete {
+            snapshot: uri,
+            skip_confirmation: true,
+            storage: file_storage_for_dir(&snapshot),
+        };
+
+        // The confirmation callback panics if invoked, proving it is bypassed.
+        delete
+            .run_with_confirmation(|_| unreachable!())
+            .await
+            .unwrap();
+
+        assert!(!snapshot.join(MANIFEST_FILE).exists());
+        assert!(!snapshot.join("schema/schemas.json").exists());
+        assert!(sibling.join("keep.txt").exists());
+    }
+
+    // A directory without a manifest is reported as "Snapshot not found" and
+    // the directory itself is not removed.
+    #[tokio::test]
+    async fn test_delete_snapshot_requires_manifest() {
+        let dir = tempdir().unwrap();
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let delete = ExportDelete {
+            snapshot: uri,
+            skip_confirmation: true,
+            storage: file_storage_for_dir(dir.path()),
+        };
+
+        let error = delete
+            .run_with_confirmation(|_| unreachable!())
+            .await
+            .err()
+            .unwrap()
+            .to_string();
+
+        assert!(error.contains("Snapshot not found"));
+        assert!(dir.path().exists());
+    }
+
+    // When the confirmation callback answers `Ok(false)`, the run succeeds
+    // but nothing is deleted: manifest and schema file are still present.
+    #[tokio::test]
+    async fn test_delete_snapshot_cancels_without_exact_confirmation() {
+        let dir = tempdir().unwrap();
+        write_root_manifest(
+            dir.path(),
+            test_manifest(
+                chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+                true,
+                true,
+            ),
+        );
+        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
+        let uri = Url::from_directory_path(dir.path()).unwrap().to_string();
+        let delete = ExportDelete {
+            snapshot: uri.clone(),
+            skip_confirmation: false,
+            storage: file_storage_for_dir(dir.path()),
+        };
+
+        delete
+            .run_with_confirmation(|snapshot| {
+                // The callback receives the snapshot URI the command was built with.
+                assert_eq!(snapshot, uri);
+                Ok(false)
+            })
+            .await
+            .unwrap();
+
+        assert!(dir.path().join(MANIFEST_FILE).exists());
+        assert!(dir.path().join("schema/schemas.json").exists());
+    }
+
+    // Only lowercase "yes" (surrounding whitespace ignored) confirms deletion;
+    // other casings, abbreviations and longer answers are rejected.
+    #[test]
+    fn test_delete_confirmation_requires_exact_yes() {
+        assert!(delete_confirmation_matches("yes"));
+        assert!(delete_confirmation_matches(" yes\n"));
+        assert!(!delete_confirmation_matches("YES"));
+        assert!(!delete_confirmation_matches("y"));
+        assert!(!delete_confirmation_matches("yes please"));
+    }
+
+    // The displayed prefix always ends with exactly one trailing slash,
+    // whether or not the input already had one.
+    #[test]
+    fn test_display_snapshot_prefix_adds_trailing_slash() {
+        assert_eq!(
+            display_snapshot_prefix("s3://bucket/snapshot"),
+            "s3://bucket/snapshot/"
+        );
+        assert_eq!(
+            display_snapshot_prefix("s3://bucket/snapshot/"),
+            "s3://bucket/snapshot/"
+        );
    }

    #[tokio::test]
--- a/src/cli/src/data/export_v2/error.rs
+++ b/src/cli/src/data/export_v2/error.rs
@@ -71,6 +71,14 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display("I/O error while {}: {}", operation, error))]
+    Io {
+        operation: &'static str,
+        error: std::io::Error,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display(
        "Cannot resume snapshot with a different schema_only mode (existing: {}, requested: {}). Use --force to recreate.",
        existing_schema_only,
@@ -223,6 +231,8 @@ impl ErrorExt for Error {
            | Error::UnexpectedValueType { .. }
            | Error::UrlParse { .. } => StatusCode::Internal,

+            Error::Io { .. } => StatusCode::External,
+
            Error::Database { error, .. } => error.status_code(),

            Error::SnapshotNotFound { .. } => StatusCode::InvalidArguments,
--- a/src/cli/src/data/snapshot_storage.rs
+++ b/src/cli/src/data/snapshot_storage.rs
@@ -18,6 +18,7 @@
 //! to various storage backends (S3, OSS, GCS, Azure Blob, local filesystem).

 use std::collections::BTreeSet;
+use std::path::Component;

 use async_trait::async_trait;
 use futures::TryStreamExt;
@@ -131,6 +132,92 @@ pub fn validate_uri(uri: &str) -> Result<StorageScheme> {
    StorageScheme::from_uri(uri)
 }

+/// Validates a URI for snapshot-scoped destructive operations.
+///
+/// Unlike read-only parent scans, destructive commands must target a concrete
+/// snapshot directory instead of a bucket/container root or filesystem root.
+/// Remote storage buckets/containers already provide namespace isolation, so a
+/// non-empty object prefix is enough; local filesystem paths require at least
+/// two non-root path segments to avoid deleting broad system directories.
+///
+/// Returns the detected [`StorageScheme`] on success. Any failure from
+/// `validate_uri`, the query/fragment check, or the scheme-specific check is
+/// propagated to the caller.
+pub fn validate_snapshot_uri(uri: &str) -> Result<StorageScheme> {
+    let scheme = validate_uri(uri)?;
+    // Query strings and fragments are rejected for every scheme.
+    reject_query_or_fragment(uri)?;
+    match scheme {
+        StorageScheme::File => validate_file_snapshot_uri(uri)?,
+        StorageScheme::S3 | StorageScheme::Oss | StorageScheme::Gcs | StorageScheme::Azblob => {
+            // The extracted location is discarded; the call is made only for
+            // its validation result.
+            // NOTE(review): the `false` argument presumably disallows a
+            // bucket/container root - confirm against
+            // `extract_remote_location_with_root_policy`.
+            extract_remote_location_with_root_policy(uri, false)?;
+        }
+    }
+    Ok(scheme)
+}
+
+/// Fails with an invalid-URI error when `uri` carries a query string or a
+/// fragment; a URI that cannot be parsed is reported as a URL parse error.
+fn reject_query_or_fragment(uri: &str) -> Result<()> {
+    let parsed = Url::parse(uri).context(UrlParseSnafu)?;
+    let is_plain = parsed.query().is_none() && parsed.fragment().is_none();
+    if is_plain {
+        return Ok(());
+    }
+
+    InvalidUriSnafu {
+        uri,
+        reason: "snapshot URI must not include query or fragment",
+    }
+    .fail()
+}
+
+/// Path-shape guard for `file://` snapshot URIs used by destructive commands.
+///
+/// Rejects URIs whose raw text contains a `.` or `..` segment, then extracts
+/// the filesystem path and requires at least two `Component::Normal` segments.
+/// Returns an invalid-URI error describing the violated rule; errors from
+/// `extract_file_path_from_uri` are propagated unchanged.
+fn validate_file_snapshot_uri(uri: &str) -> Result<()> {
+    // Checked on the raw URI first, before any path extraction happens.
+    if has_explicit_dot_segment(uri) {
+        return InvalidUriSnafu {
+            uri,
+            reason: "file snapshot URI must not contain '.' or '..' path segments",
+        }
+        .fail();
+    }
+
+    let path = extract_file_path_from_uri(uri)?;
+    let mut normal_component_count = 0;
+
+    // This is only a path-shape guard for destructive operations. It does not
+    // resolve symlinks. Drive prefixes and root separators also do not count
+    // toward depth; delete still relies on the manifest check and explicit
+    // confirmation before removing the rooted storage prefix.
+    for component in std::path::Path::new(&path).components() {
+        match component {
+            Component::Normal(_) => normal_component_count += 1,
+            // Second line of defense for dot segments that survive into the
+            // extracted path.
+            Component::CurDir | Component::ParentDir => {
+                return InvalidUriSnafu {
+                    uri,
+                    reason: "file snapshot URI must not contain '.' or '..' path segments",
+                }
+                .fail();
+            }
+            Component::Prefix(_) | Component::RootDir => {}
+        }
+    }
+
+    if normal_component_count < 2 {
+        return InvalidUriSnafu {
+            uri,
+            reason: "file snapshot URI must point to a directory at least two levels deep",
+        }
+        .fail();
+    }
+
+    Ok(())
+}
+
+/// Reports whether the raw URI text, ignoring everything from the first `?`
+/// or `#` onwards, contains a `/`-separated segment equal to `.` or `..`.
+fn has_explicit_dot_segment(uri: &str) -> bool {
+    // Defense in depth: inspect the raw text before `Url::to_file_path()` can
+    // normalize dot segments away. The `Path::components()` check in
+    // `validate_file_snapshot_uri` still runs because URL decoding can
+    // reintroduce them.
+    let path_end = uri
+        .find(|c: char| matches!(c, '?' | '#'))
+        .unwrap_or(uri.len());
+
+    uri[..path_end]
+        .split('/')
+        .any(|segment| matches!(segment, "." | ".."))
+}
+
 fn schema_index_path() -> String {
    format!("{}/{}", SCHEMA_DIR, SCHEMAS_FILE)
 }
@@ -708,6 +795,43 @@ mod tests {
        assert!(OpenDalStorage::from_parent_uri("s3://bucket", &storage).is_ok());
    }

+    // Bucket/container roots, URIs with a query or fragment, shallow local
+    // paths, and paths ending in dot segments are all rejected.
+    #[test]
+    fn test_validate_snapshot_uri_rejects_dangerous_roots() {
+        assert!(validate_snapshot_uri("s3://bucket").is_err());
+        assert!(validate_snapshot_uri("s3://bucket/").is_err());
+        assert!(validate_snapshot_uri("oss://bucket").is_err());
+        assert!(validate_snapshot_uri("gs://bucket").is_err());
+        assert!(validate_snapshot_uri("azblob://container").is_err());
+        assert!(validate_snapshot_uri("s3://bucket/snapshot?version=1").is_err());
+        assert!(validate_snapshot_uri("file:///tmp/backup#fragment").is_err());
+        assert!(validate_snapshot_uri("file:///").is_err());
+        assert!(validate_snapshot_uri("file:///tmp").is_err());
+        assert!(validate_snapshot_uri("file:///tmp/backup/.").is_err());
+        assert!(validate_snapshot_uri("file:///tmp/backup/..").is_err());
+    }
+
+    // A remote URI with an object prefix and a local directory nested under a
+    // temp dir both validate and report the matching scheme.
+    #[test]
+    fn test_validate_snapshot_uri_accepts_snapshot_paths() {
+        assert_eq!(
+            validate_snapshot_uri("s3://bucket/snapshots/prod").unwrap(),
+            StorageScheme::S3
+        );
+
+        let dir = tempdir().unwrap();
+        let snapshot = dir.path().join("snapshot");
+        std::fs::create_dir_all(&snapshot).unwrap();
+        let uri = Url::from_directory_path(snapshot).unwrap().to_string();
+        assert_eq!(validate_snapshot_uri(&uri).unwrap(), StorageScheme::File);
+    }
+
+    // On Windows the drive prefix does not count toward the required depth:
+    // two directory segments below `C:` are needed before a URI is accepted.
+    #[cfg(windows)]
+    #[test]
+    fn test_validate_snapshot_uri_windows_drive_prefix_depth() {
+        assert!(validate_snapshot_uri("file:///C:/").is_err());
+        assert!(validate_snapshot_uri("file:///C:/Users").is_err());
+        assert!(validate_snapshot_uri("file:///C:/Users/snapshot").is_ok());
+    }
+
    #[cfg(not(windows))]
    #[test]
    fn test_extract_path_from_uri_unix_examples() {
--- a/src/cmd/src/datanode/objbench.rs
+++ b/src/cmd/src/datanode/objbench.rs
@@ -588,6 +588,8 @@ async fn build_cache_manager(
            .vector_cache_size(config.vector_cache_size.as_bytes())
            .page_cache_size(config.page_cache_size.as_bytes())
            .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
+            .range_result_cache_size(config.range_result_cache_size.as_bytes())
+            .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
            .index_metadata_size(config.index.metadata_cache_size.as_bytes())
            .index_content_size(config.index.content_cache_size.as_bytes())
            .index_content_page_size(config.index.content_cache_page_size.as_bytes())
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -20,6 +20,7 @@ use std::{fs, path};

 use async_trait::async_trait;
 use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry};
+use catalog::CatalogManagerRef;
 use catalog::information_schema::InformationExtensionRef;
 use catalog::kvbackend::{CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder};
 use catalog::process_manager::ProcessManager;
@@ -28,7 +29,8 @@ use common_base::Plugins;
 use common_catalog::consts::{MIN_USER_FLOW_ID, MIN_USER_TABLE_ID};
 use common_config::{Configurable, metadata_store_dir};
 use common_error::ext::BoxedError;
-use common_meta::cache::LayeredCacheRegistryBuilder;
+use common_meta::DatanodeId;
+use common_meta::cache::{LayeredCacheRegistryBuilder, LayeredCacheRegistryRef};
 use common_meta::ddl::flow_meta::FlowMetadataAllocator;
 use common_meta::ddl::table_meta::TableMetadataAllocator;
 use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl};
@@ -53,8 +55,8 @@ use datanode::config::DatanodeOptions;
 use datanode::datanode::{Datanode, DatanodeBuilder};
 use datanode::region_server::RegionServer;
 use flow::{
-    FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient, FrontendInvoker,
-    GrpcQueryHandlerWithBoxedError,
+    FlowDualEngineRef, FlownodeBuilder, FlownodeInstance, FlownodeOptions, FrontendClient,
+    FrontendInvoker, GrpcQueryHandlerWithBoxedError,
 };
 use frontend::frontend::Frontend;
 use frontend::instance::StandaloneDatanodeManager;
@@ -124,8 +126,8 @@ pub struct Instance {
    frontend: Frontend,
    flownode: FlownodeInstance,
    procedure_manager: ProcedureManagerRef,
-    wal_provider: WalProviderRef,
    leader_services_controller: Box<dyn StandaloneLeaderServicesController>,
+    leader_services_context: LeaderServicesContext,
    // Keep the logging guard to prevent the worker from being dropped.
    _guard: Vec<WorkerGuard>,
 }
@@ -159,11 +161,7 @@ impl App for Instance {
        self.datanode.start_telemetry();

        self.leader_services_controller
-            .start(
-                self.procedure_manager.clone(),
-                self.wal_provider.clone(),
-                self.datanode.region_server(),
-            )
+            .start(self.leader_services_context.clone())
            .await?;

        plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
@@ -379,6 +377,8 @@ impl StartCommand {
        opts.grpc.detect_server_addr();
        let fe_opts = opts.frontend_options();
        let dn_opts = opts.datanode_options();
+        let node_id = dn_opts.node_id;
+        let init_regions_parallelism = dn_opts.init_regions_parallelism;

        plugins::setup_frontend_plugins(&mut plugins, &plugin_opts, &fe_opts)
            .await
@@ -491,21 +491,18 @@ impl StartCommand {
            .await
            .map_err(BoxedError::new)
            .context(error::OtherSnafu)?;
+        let flow_engine = flownode.flow_engine();

        // set the ref to query for the local flow state
        {
            information_extension
-                .set_flow_engine(flownode.flow_engine())
+                .set_flow_engine(flow_engine.clone())
                .await;
        }

        let node_manager = creator
            .node_manager_creator
-            .create(
-                &kv_backend,
-                datanode.region_server(),
-                flownode.flow_engine(),
-            )
+            .create(&kv_backend, datanode.region_server(), flow_engine.clone())
            .await?;

        let table_id_allocator = creator.table_id_allocator_creator.create(&kv_backend);
@@ -596,7 +593,7 @@ impl StartCommand {
            .await;

        // set the frontend invoker for flownode
-        let flow_streaming_engine = flownode.flow_engine().streaming_engine();
+        let flow_streaming_engine = flow_engine.streaming_engine();
        // flow server need to be able to use frontend to write insert requests back
        let invoker = FrontendInvoker::build_from(
            flow_streaming_engine.clone(),
@@ -620,14 +617,27 @@ impl StartCommand {
            servers,
            heartbeat_task: None,
        };
+        let leader_services_context = LeaderServicesContext {
+            procedure_manager: procedure_manager.clone(),
+            wal_provider: wal_provider.clone(),
+            region_server: datanode.region_server(),
+            kv_backend: kv_backend.clone(),
+            cache_registry: layered_cache_registry,
+            catalog_manager,
+            flow_engine,
+            frontend_client,
+            node_id,
+            init_regions_parallelism,
+            plugin_options: plugin_opts,
+        };

        let instance = Instance {
            datanode,
            frontend,
            flownode,
            procedure_manager,
-            wal_provider,
            leader_services_controller: creator.leader_services_controller,
+            leader_services_context,
            _guard: vec![],
        };
        let result = InstanceCreatorResult {
@@ -743,16 +753,11 @@ impl ProcedureExecutorCreator for DefaultProcedureExecutorCreator {

 #[async_trait]
 pub trait StandaloneLeaderServicesController: Send + Sync {
-    /// Starts services that manage standalone metadata or WAL state.
+    /// Starts leader services that manage standalone metadata or WAL state.
    ///
    /// The default implementation starts the procedure manager and WAL provider
    /// during instance startup.
-    async fn start(
-        &self,
-        procedure_manager: ProcedureManagerRef,
-        wal_provider: WalProviderRef,
-        region_server: RegionServer,
-    ) -> Result<()>;
+    async fn start(&self, context: LeaderServicesContext) -> Result<()>;

    /// Stops services started by [`StandaloneLeaderServicesController::start`].
    async fn stop(
@@ -762,21 +767,42 @@ pub trait StandaloneLeaderServicesController: Send + Sync {
    ) -> Result<()>;
 }

+/// Additional runtime handles for custom leader-service controllers.
+///
+/// The default standalone startup only needs to start/stop the procedure
+/// manager and WAL provider. Some embedders need to do more work around
+/// leader-service startup, for example reconciling metadata-backed runtime
+/// state before publishing writable leadership. Grouping those handles here
+/// keeps `Instance` small and avoids expanding
+/// [`StandaloneLeaderServicesController::start`] every time a custom lifecycle
+/// needs one more standalone component.
+#[derive(Clone)]
+pub struct LeaderServicesContext {
+    /// Procedure manager; the default controller starts it first.
+    pub procedure_manager: ProcedureManagerRef,
+    /// WAL provider; the default controller starts it after the procedure
+    /// manager.
+    pub wal_provider: WalProviderRef,
+    /// Region server handle obtained from the standalone datanode.
+    pub region_server: RegionServer,
+    /// Key-value backend shared with the rest of standalone startup.
+    pub kv_backend: KvBackendRef,
+    /// Layered cache registry built during standalone startup.
+    pub cache_registry: LayeredCacheRegistryRef,
+    /// Catalog manager built during standalone startup.
+    pub catalog_manager: CatalogManagerRef,
+    /// Flow engine taken from the embedded flownode.
+    pub flow_engine: FlowDualEngineRef,
+    /// Frontend client created during standalone startup.
+    pub frontend_client: Arc<FrontendClient>,
+    /// `node_id` copied from the datanode options; `None` when unset there.
+    pub node_id: Option<DatanodeId>,
+    /// `init_regions_parallelism` copied from the datanode options.
+    pub init_regions_parallelism: usize,
+    /// Plugin options the standalone instance was started with.
+    pub plugin_options: Vec<PluginOptions>,
+}
+
 pub struct DefaultStandaloneLeaderServicesController;

 #[async_trait]
 impl StandaloneLeaderServicesController for DefaultStandaloneLeaderServicesController {
-    async fn start(
-        &self,
-        procedure_manager: ProcedureManagerRef,
-        wal_provider: WalProviderRef,
-        _region_server: RegionServer,
-    ) -> Result<()> {
-        procedure_manager
+    async fn start(&self, context: LeaderServicesContext) -> Result<()> {
+        context
+            .procedure_manager
            .start()
            .await
            .context(error::StartProcedureManagerSnafu)?;
-        wal_provider
+        context
+            .wal_provider
            .start()
            .await
            .context(error::StartWalProviderSnafu)
--- a/src/common/catalog/src/consts.rs
+++ b/src/common/catalog/src/consts.rs
@@ -112,6 +112,8 @@ pub const INFORMATION_SCHEMA_SSTS_STORAGE_TABLE_ID: u32 = 38;
 pub const INFORMATION_SCHEMA_SSTS_INDEX_META_TABLE_ID: u32 = 39;
 /// id for information_schema.alerts
 pub const INFORMATION_SCHEMA_ALERTS_TABLE_ID: u32 = 40;
+/// id for information_schema.region_info
+pub const INFORMATION_SCHEMA_REGION_INFO_TABLE_ID: u32 = 41;

 // ----- End of information_schema tables -----

--- a/src/common/function/src/scalars/json/json_get_rewriter.rs
+++ b/src/common/function/src/scalars/json/json_get_rewriter.rs
@@ -59,7 +59,10 @@ impl FunctionRewrite for JsonGetRewriter {
 //   json_get(column, path, <data_type>)
 // )
 fn inject_type_from_cast_expr(cast: Cast) -> Result<Transformed<Expr>> {
-    let Cast { expr, data_type } = cast;
+    let Cast {
+        expr,
+        mut data_type,
+    } = cast;

    let mut json_get = match *expr {
        Expr::ScalarFunction(f)
@@ -75,6 +78,9 @@ fn inject_type_from_cast_expr(cast: Cast) -> Result<Transformed<Expr>> {
        }
    };

+    if data_type.is_string() {
+        data_type = DataType::Utf8View;
+    }
    let with_type = ScalarValue::try_new_null(&data_type).map(|x| Expr::Literal(x, None))?;
    json_get.args.push(with_type);
    Ok(Transformed::yes(Expr::ScalarFunction(json_get)))
--- a/src/common/meta/src/cache/container.rs
+++ b/src/common/meta/src/cache/container.rs
@@ -196,8 +196,8 @@ where
 #[async_trait::async_trait]
 impl<K, V> CacheInvalidator for CacheContainer<K, V, CacheIdent>
 where
-    K: Send + Sync,
-    V: Send + Sync,
+    K: Hash + Eq + Send + Sync + 'static,
+    V: Clone + Send + Sync + 'static,
 {
    async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
        let idents = caches
@@ -211,6 +211,12 @@ where

        Ok(())
    }
+
+    /// Clears this container by bumping its version and then invalidating the
+    /// whole underlying cache.
+    fn invalidate_all(&self) -> Result<()> {
+        // The version is bumped before the cache is cleared.
+        // NOTE(review): presumably this lets in-flight loads notice they raced
+        // with the invalidation - confirm against `inc_version`.
+        self.inc_version();
+        self.cache.invalidate_all();
+        Ok(())
+    }
 }

 impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>
--- a/src/common/meta/src/cache/flow/table_flownode.rs
+++ b/src/common/meta/src/cache/flow/table_flownode.rs
@@ -210,7 +210,7 @@ mod tests {
    use crate::cache::flow::table_flownode::{FlowIdent, new_table_flownode_set_cache};
    use crate::instruction::{CacheIdent, CreateFlow, DropFlow};
    use crate::key::flow::FlowMetadataManager;
-    use crate::key::flow::flow_info::FlowInfoValue;
+    use crate::key::flow::flow_info::{FlowInfoValue, FlowStatus};
    use crate::key::flow::flow_route::FlowRouteValue;
    use crate::kv_backend::memory::MemoryKvBackend;
    use crate::peer::Peer;
@@ -242,11 +242,14 @@ mod tests {
                    catalog_name: DEFAULT_CATALOG_NAME.to_string(),
                    query_context: None,
                    flow_name: "my_flow".to_string(),
+                    all_source_table_names: vec![],
+                    unresolved_source_table_names: vec![],
                    raw_sql: "sql".to_string(),
                    expire_after: Some(300),
                    eval_interval_secs: None,
                    comment: "comment".to_string(),
                    options: Default::default(),
+                    status: FlowStatus::Active,
                    created_time: chrono::Utc::now(),
                    updated_time: chrono::Utc::now(),
                },
--- a/src/common/meta/src/cache/registry.rs
+++ b/src/common/meta/src/cache/registry.rs
@@ -67,6 +67,13 @@ impl CacheInvalidator for LayeredCacheRegistry {
        }
        results.into_iter().collect::<Result<Vec<_>>>().map(|_| ())
    }
+
+    /// Invalidates every layer in registration order, returning the first
+    /// error encountered without visiting the remaining layers.
+    fn invalidate_all(&self) -> Result<()> {
+        self.layers
+            .iter()
+            .try_for_each(|layer| layer.invalidate_all())
+    }
 }

 impl LayeredCacheRegistry {
@@ -124,6 +131,13 @@ impl CacheInvalidator for CacheRegistry {
            .collect::<Result<Vec<_>>>()?;
        Ok(())
    }
+
+    /// Invalidates every registered cache in order, returning the first error
+    /// encountered without visiting the remaining caches.
+    fn invalidate_all(&self) -> Result<()> {
+        self.indexes
+            .iter()
+            .try_for_each(|cache| cache.invalidate_all())
+    }
 }

 impl CacheRegistry {
@@ -149,6 +163,8 @@ mod tests {

    use crate::cache::registry::CacheRegistryBuilder;
    use crate::cache::*;
+    use crate::cache_invalidator::{CacheInvalidator, Context};
+    use crate::error::Result;
    use crate::instruction::CacheIdent;

    fn always_true_filter(_: &CacheIdent) -> bool {
@@ -259,4 +275,91 @@ mod tests {
            .unwrap();
        assert_eq!(cache.name(), "string_cache");
    }
+
+    // Populates two caches, registers both in one `CacheRegistry`, and checks
+    // that `invalidate_all` on the registry empties each of them.
+    #[tokio::test]
+    async fn test_registry_invalidate_all() {
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let i32_cache = Arc::new(test_i32_cache("i32_cache", invalidator));
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let string_cache = Arc::new(test_cache("string_cache", invalidator));
+
+        // Warm both caches so there is something to invalidate.
+        i32_cache.get(1).await.unwrap();
+        string_cache.get_by_ref("foo").await.unwrap();
+        assert!(i32_cache.contains_key(&1));
+        assert!(string_cache.contains_key("foo"));
+
+        let registry = CacheRegistryBuilder::default()
+            .add_cache(i32_cache.clone())
+            .add_cache(string_cache.clone())
+            .build();
+
+        registry.invalidate_all().unwrap();
+
+        assert!(!i32_cache.contains_key(&1));
+        assert!(!string_cache.contains_key("foo"));
+    }
+
+    // Test-only invalidator that checks the order in which `invalidate_all`
+    // reaches it: each call takes the next value of the shared counter and
+    // asserts it equals `expected_order`.
+    struct LayerOrderInvalidator {
+        expected_order: i32,
+        order: Arc<AtomicI32>,
+    }
+
+    #[async_trait::async_trait]
+    impl CacheInvalidator for LayerOrderInvalidator {
+        // Targeted invalidation is irrelevant to the ordering check.
+        async fn invalidate(&self, _ctx: &Context, _caches: &[CacheIdent]) -> Result<()> {
+            Ok(())
+        }
+
+        fn invalidate_all(&self) -> Result<()> {
+            // `fetch_add` returns the value before the increment, i.e. this
+            // call's position in the overall sequence.
+            let previous = self.order.fetch_add(1, Ordering::Relaxed);
+            assert_eq!(self.expected_order, previous);
+            Ok(())
+        }
+    }
+
+    // Builds two registry layers, each holding an order probe plus a real
+    // cache, and checks that `invalidate_all` on the layered registry visits
+    // the layers in insertion order and empties both caches.
+    #[tokio::test]
+    async fn test_layered_registry_invalidate_all() {
+        let order = Arc::new(AtomicI32::new(0));
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let first_layer_cache = Arc::new(test_cache("first_layer_cache", invalidator));
+        let first_layer_order = Arc::new(LayerOrderInvalidator {
+            expected_order: 0,
+            order: order.clone(),
+        });
+        let first_layer = CacheRegistryBuilder::default()
+            .add_cache(first_layer_order)
+            .add_cache(first_layer_cache.clone())
+            .build();
+
+        let invalidator: Invalidator<_, String, CacheIdent> =
+            Box::new(|_, _| Box::pin(async { Ok(()) }));
+        let second_layer_cache = Arc::new(test_i32_cache("second_layer_cache", invalidator));
+        let second_layer_order = Arc::new(LayerOrderInvalidator {
+            expected_order: 1,
+            order: order.clone(),
+        });
+        let second_layer = CacheRegistryBuilder::default()
+            .add_cache(second_layer_order)
+            .add_cache(second_layer_cache.clone())
+            .build();
+
+        // Warm both caches so there is something to invalidate.
+        first_layer_cache.get_by_ref("foo").await.unwrap();
+        second_layer_cache.get(1).await.unwrap();
+        assert!(first_layer_cache.contains_key("foo"));
+        assert!(second_layer_cache.contains_key(&1));
+
+        let registry = LayeredCacheRegistryBuilder::default()
+            .add_cache_registry(first_layer)
+            .add_cache_registry(second_layer)
+            .build();
+
+        registry.invalidate_all().unwrap();
+
+        // Both probes ran exactly once.
+        assert_eq!(2, order.load(Ordering::Relaxed));
+        assert!(!first_layer_cache.contains_key("foo"));
+        assert!(!second_layer_cache.contains_key(&1));
+    }
 }
--- a/src/common/meta/src/cache_invalidator.rs
+++ b/src/common/meta/src/cache_invalidator.rs
@@ -55,6 +55,13 @@ pub struct Context {
 pub trait CacheInvalidator: Send + Sync {
    async fn invalidate(&self, ctx: &Context, caches: &[CacheIdent]) -> Result<()>;

+    /// Invalidates every cache entry owned by this invalidator.
+    ///
+    /// This method is required so each implementer explicitly decides how
+    /// full-cache invalidation should behave. Implementations that intentionally
+    /// do nothing must document why a no-op is safe.
+    fn invalidate_all(&self) -> Result<()>;
+
    fn name(&self) -> &'static str {
        std::any::type_name::<Self>()
    }
@@ -69,6 +76,11 @@ impl CacheInvalidator for DummyCacheInvalidator {
    async fn invalidate(&self, _ctx: &Context, _caches: &[CacheIdent]) -> Result<()> {
        Ok(())
    }
+
+    /// No-op full invalidation; always returns `Ok(())`.
+    fn invalidate_all(&self) -> Result<()> {
+        // Dummy invalidator owns no cache state, so there is nothing to clear.
+        Ok(())
+    }
 }

 #[async_trait::async_trait]
@@ -157,4 +169,11 @@ where
        }
        Ok(())
    }
+
+    /// Intentional no-op full invalidation; always returns `Ok(())`.
+    fn invalidate_all(&self) -> Result<()> {
+        // KvCacheInvalidator only knows how to invalidate explicit metadata
+        // keys. There is no safe generic way to enumerate or clear the backend
+        // keyspace, so full invalidation is intentionally a no-op here.
+        Ok(())
+    }
 }
--- a/src/common/meta/src/ddl/create_flow.rs
+++ b/src/common/meta/src/ddl/create_flow.rs
@@ -14,7 +14,7 @@

 mod metadata;

-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
 use std::fmt;

 use api::v1::ExpireAfter;
@@ -34,13 +34,14 @@ use serde::{Deserialize, Serialize};
 use snafu::{ResultExt, ensure};
 use strum::AsRefStr;
 use table::metadata::TableId;
+use table::table_name::TableName;

 use crate::cache_invalidator::Context;
 use crate::ddl::DdlContext;
 use crate::ddl::utils::{add_peer_context_if_needed, map_to_procedure_error};
 use crate::error::{self, Result, UnexpectedSnafu};
 use crate::instruction::{CacheIdent, CreateFlow, DropFlow};
-use crate::key::flow::flow_info::FlowInfoValue;
+use crate::key::flow::flow_info::{FlowInfoValue, FlowStatus};
 use crate::key::flow::flow_route::FlowRouteValue;
 use crate::key::table_name::TableNameKey;
 use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId};
@@ -67,6 +68,7 @@ impl CreateFlowProcedure {
                flow_id: None,
                peers: vec![],
                source_table_ids: vec![],
+                unresolved_source_table_names: vec![],
                flow_context: query_context.into(), // Convert to FlowQueryContext
                state: CreateFlowState::Prepare,
                prev_flow_info_value: None,
@@ -89,6 +91,8 @@ impl CreateFlowProcedure {
        let create_if_not_exists = self.data.task.create_if_not_exists;
        let or_replace = self.data.task.or_replace;

+        validate_flow_options(&self.data.task)?;
+
        let flow_name_value = self
            .context
            .flow_metadata_manager
@@ -167,6 +171,21 @@ impl CreateFlowProcedure {
        }

        self.collect_source_tables().await?;
+        ensure!(
+            self.data.unresolved_source_table_names.is_empty()
+                || defer_on_missing_source(&self.data.task)?,
+            error::UnsupportedSnafu {
+                operation: format!(
+                    "Create flow with missing source tables requires WITH ('{DEFER_ON_MISSING_SOURCE_KEY}'='true'): {}",
+                    self.data
+                        .unresolved_source_table_names
+                        .iter()
+                        .map(ToString::to_string)
+                        .join(", ")
+                )
+            }
+        );
+        self.ensure_supported_replace_transition()?;

        // Validate that source and sink tables are not the same
        let sink_table_name = &self.data.task.sink_table_name;
@@ -189,13 +208,38 @@ impl CreateFlowProcedure {
        if self.data.flow_id.is_none() {
            self.allocate_flow_id().await?;
        }
-        self.data.state = CreateFlowState::CreateFlows;
-        // determine flow type
        self.data.flow_type = Some(get_flow_type_from_options(&self.data.task)?);

+        self.data.state = if self.data.is_pending() {
+            self.data.peers.clear();
+            CreateFlowState::CreateMetadata
+        } else {
+            CreateFlowState::CreateFlows
+        };
+
        Ok(Status::executing(true))
    }

+    /// Rejects `OR REPLACE` when the stored flow and the new definition differ in
+    /// pending-ness; a no-op when not replacing or no previous flow info is set.
+    fn ensure_supported_replace_transition(&self) -> Result<()> {
+        // Only a replace that captured an existing flow has a transition to check.
+        let previous = match self.data.prev_flow_info_value.as_ref() {
+            Some(previous) if self.data.task.or_replace => previous,
+            _ => return Ok(()),
+        };
+
+        let was_pending = previous.get_inner_ref().is_pending();
+        ensure!(
+            was_pending == self.data.is_pending(),
+            error::UnsupportedSnafu {
+                operation: "Replacing between pending and active flow states is not supported yet"
+            }
+        );
+
+        Ok(())
+    }
+
    async fn on_flownode_create_flows(&mut self) -> Result<Status> {
        // Safety: must be allocated.
        let mut create_flow = Vec::with_capacity(self.data.peers.len());
@@ -365,6 +409,61 @@ pub fn get_flow_type_from_options(flow_task: &CreateFlowTask) -> Result<FlowType
    }
 }

+/// The flow option key for creating pending flow metadata when source tables do not exist.
+pub const DEFER_ON_MISSING_SOURCE_KEY: &str = "defer_on_missing_source";
+
+/// Reads the `defer_on_missing_source` flow option from `flow_task`.
+///
+/// Returns `false` when the option is absent. A present value is trimmed and
+/// lowercased before being parsed as a `bool`.
+///
+/// # Errors
+/// Returns an `Unexpected` error quoting the raw value when it is not a boolean.
+pub fn defer_on_missing_source(flow_task: &CreateFlowTask) -> Result<bool> {
+    let Some(raw) = flow_task.flow_options.get(DEFER_ON_MISSING_SOURCE_KEY) else {
+        return Ok(false);
+    };
+
+    match raw.trim().to_ascii_lowercase().parse::<bool>() {
+        Ok(enabled) => Ok(enabled),
+        // Report the value exactly as it was written, not its normalized form.
+        Err(_) => error::UnexpectedSnafu {
+            err_msg: format!("Invalid flow option '{DEFER_ON_MISSING_SOURCE_KEY}': {raw}"),
+        }
+        .fail(),
+    }
+}
+
+/// Rejects unknown flow option keys, then runs the defer and flow-type parsers.
+pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> {
+    let known_keys = [DEFER_ON_MISSING_SOURCE_KEY, FlowType::FLOW_TYPE_KEY];
+    let unknown_key = flow_task
+        .flow_options
+        .keys()
+        .find(|key| !known_keys.contains(&key.as_str()));
+    if let Some(unknown) = unknown_key {
+        return UnexpectedSnafu {
+            err_msg: format!(
+                "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}"
+            ),
+        }
+        .fail();
+    }
+
+    defer_on_missing_source(flow_task)?;
+    get_flow_type_from_options(flow_task).map(|_| ())
+}
+
+fn user_runtime_flow_options(options: &HashMap<String, String>) -> HashMap<String, String> {
+    let mut options = options.clone();
+    options.remove(DEFER_ON_MISSING_SOURCE_KEY);
+    options
+}
+
+fn metadata_flow_options(options: &HashMap<String, String>) -> HashMap<String, String> {
+    options.clone()
+}
+
 /// The state of [CreateFlowProcedure].
 #[derive(Debug, Clone, Serialize, Deserialize, AsRefStr, PartialEq)]
 pub enum CreateFlowState {
@@ -411,6 +510,8 @@ pub struct CreateFlowData {
    pub(crate) flow_id: Option<FlowId>,
    pub(crate) peers: Vec<Peer>,
    pub(crate) source_table_ids: Vec<TableId>,
+    #[serde(default)]
+    pub(crate) unresolved_source_table_names: Vec<TableName>,
    /// Use alias for backward compatibility with QueryContext serialized data
    #[serde(alias = "query_context")]
    pub(crate) flow_context: FlowQueryContext,
@@ -424,6 +525,16 @@ pub struct CreateFlowData {
    pub(crate) flow_type: Option<FlowType>,
 }

+impl CreateFlowData {
+    pub(crate) fn is_pending(&self) -> bool {
+        !self.unresolved_source_table_names.is_empty()
+    }
+
+    pub(crate) fn is_active(&self) -> bool {
+        !self.is_pending()
+    }
+}
+
 impl From<&CreateFlowData> for CreateRequest {
    fn from(value: &CreateFlowData) -> Self {
        let flow_id = value.flow_id.unwrap();
@@ -446,7 +557,7 @@ impl From<&CreateFlowData> for CreateRequest {
                .map(|seconds| api::v1::EvalInterval { seconds }),
            comment: value.task.comment.clone(),
            sql: value.task.sql.clone(),
-            flow_options: value.task.flow_options.clone(),
+            flow_options: user_runtime_flow_options(&value.task.flow_options),
        };

        let flow_type = value.flow_type.unwrap_or_default().to_string();
@@ -466,9 +577,9 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
            eval_interval_secs: eval_interval,
            comment,
            sql,
-            flow_options: mut options,
            ..
        } = value.task.clone();
+        let mut options = metadata_flow_options(&value.task.flow_options);

        let flownode_ids = value
            .peers
@@ -484,7 +595,7 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
            .collect::<Vec<_>>();

        let flow_type = value.flow_type.unwrap_or_default().to_string();
-        options.insert("flow_type".to_string(), flow_type);
+        options.insert(FlowType::FLOW_TYPE_KEY.to_string(), flow_type);

        let mut create_time = chrono::Utc::now();
        if let Some(prev_flow_value) = value.prev_flow_info_value.as_ref()
@@ -495,6 +606,8 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa

        let flow_info: FlowInfoValue = FlowInfoValue {
            source_table_ids: value.source_table_ids.clone(),
+            all_source_table_names: value.task.source_table_names.clone(),
+            unresolved_source_table_names: value.unresolved_source_table_names.clone(),
            sink_table_name,
            flownode_ids,
            catalog_name,
@@ -506,6 +619,11 @@ impl From<&CreateFlowData> for (FlowInfoValue, Vec<(FlowPartitionId, FlowRouteVa
            eval_interval_secs: eval_interval,
            comment,
            options,
+            status: if value.is_active() {
+                FlowStatus::Active
+            } else {
+                FlowStatus::PendingSources
+            },
            created_time: create_time,
            updated_time: chrono::Utc::now(),
        };
--- a/src/common/meta/src/ddl/create_flow/metadata.rs
+++ b/src/common/meta/src/ddl/create_flow/metadata.rs
@@ -12,10 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use snafu::OptionExt;
-
 use crate::ddl::create_flow::CreateFlowProcedure;
-use crate::error::{self, Result};
+use crate::error::Result;
 use crate::key::table_name::TableNameKey;

 impl CreateFlowProcedure {
@@ -34,9 +32,8 @@ impl CreateFlowProcedure {
        Ok(())
    }

-    /// Ensures all source tables exist and collects source table ids
+    /// Collects source table ids and keeps track of missing tables.
    pub(crate) async fn collect_source_tables(&mut self) -> Result<()> {
-        // Ensures all source tables exist.
        let keys = self
            .data
            .task
@@ -52,22 +49,24 @@ impl CreateFlowProcedure {
            .batch_get(keys)
            .await?;

-        let source_table_ids = self
+        let mut resolved = Vec::with_capacity(self.data.task.source_table_names.len());
+        let mut unresolved = Vec::new();
+
+        for (name, table_id) in self
            .data
            .task
            .source_table_names
            .iter()
            .zip(source_table_ids)
-            .map(|(name, table_id)| {
-                Ok(table_id
-                    .with_context(|| error::TableNotFoundSnafu {
-                        table_name: name.to_string(),
-                    })?
-                    .table_id())
-            })
-            .collect::<Result<Vec<_>>>()?;
+        {
+            match table_id {
+                Some(table_id) => resolved.push(table_id.table_id()),
+                None => unresolved.push(name.clone()),
+            }
+        }

-        self.data.source_table_ids = source_table_ids;
+        self.data.source_table_ids = resolved;
+        self.data.unresolved_source_table_names = unresolved;
        Ok(())
    }
 }
--- a/src/common/meta/src/ddl/drop_flow/metadata.rs
+++ b/src/common/meta/src/ddl/drop_flow/metadata.rs
@@ -43,7 +43,7 @@ impl DropFlowProcedure {
            .map(|(_, value)| value)
            .collect::<Vec<_>>();
        ensure!(
-            !flow_route_values.is_empty(),
+            flow_info_value.is_pending() || !flow_route_values.is_empty(),
            error::FlowRouteNotFoundSnafu {
                flow_name: format_full_flow_name(catalog_name, flow_name),
            }
--- a/src/common/meta/src/ddl/tests/create_flow.rs
+++ b/src/common/meta/src/ddl/tests/create_flow.rs
@@ -16,12 +16,17 @@ use std::assert_matches;
 use std::collections::HashMap;
 use std::sync::Arc;

+use api::v1::flow::CreateRequest;
 use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+use common_procedure::Status;
 use common_procedure_test::execute_procedure_until_done;
 use table::table_name::TableName;

 use crate::ddl::DdlContext;
-use crate::ddl::create_flow::{CreateFlowData, CreateFlowProcedure, CreateFlowState, FlowType};
+use crate::ddl::create_flow::{
+    CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType,
+    defer_on_missing_source,
+};
 use crate::ddl::test_util::create_table::test_create_table_task;
 use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
 use crate::error;
@@ -63,6 +68,11 @@ pub(crate) fn test_create_flow_task(
    }
 }

+fn enable_defer_on_missing_source(task: &mut CreateFlowTask) {
+    task.flow_options
+        .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string());
+}
+
 #[tokio::test]
 async fn test_create_flow_source_table_not_found() {
    let source_table_names = vec![TableName::new(
@@ -78,7 +88,261 @@ async fn test_create_flow_source_table_not_found() {
    let query_ctx = test_query_context();
    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
    let err = procedure.on_prepare().await.unwrap_err();
-    assert_matches!(err, error::Error::TableNotFound { .. });
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("requires WITH ('defer_on_missing_source'='true')")
+    );
+}
+
+#[tokio::test]
+async fn test_create_pending_flow_source_table_not_found_with_defer() {
+    let source_table_names = vec![TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "my_table",
+    )];
+    let sink_table_name =
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", source_table_names, sink_table_name, false);
+    enable_defer_on_missing_source(&mut task);
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let status = procedure.on_prepare().await.unwrap();
+    assert_matches!(status, Status::Executing { persist: true, .. });
+    assert_eq!(procedure.data.unresolved_source_table_names.len(), 1);
+    assert_eq!(procedure.data.source_table_ids, Vec::<u32>::new());
+
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(flow_info.source_table_ids(), Vec::<u32>::new());
+    assert_eq!(
+        flow_info
+            .options()
+            .get(DEFER_ON_MISSING_SOURCE_KEY)
+            .map(String::as_str),
+        Some("true")
+    );
+}
+
+#[tokio::test]
+async fn test_create_pending_flow_source_table_not_found_with_defer_false() {
+    let source_table_names = vec![TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "my_table",
+    )];
+    let sink_table_name =
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
+    let mut task = test_create_flow_task("my_flow", source_table_names, sink_table_name, false);
+    task.flow_options
+        .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "false".to_string());
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("requires WITH ('defer_on_missing_source'='true')")
+    );
+}
+
+#[tokio::test]
+async fn test_create_pending_flow_records_partial_source_resolution() {
+    let existing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_existing_source_table",
+    );
+    let missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "partial_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let existing_table_id = 3026;
+    let create_table_task =
+        test_create_table_task("partial_existing_source_table", existing_table_id);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let mut task = test_create_flow_task(
+        "partial_pending_flow",
+        vec![existing_source.clone(), missing_source.clone()],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut task);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let status = procedure.on_prepare().await.unwrap();
+    assert_matches!(status, Status::Executing { persist: true, .. });
+    assert_eq!(procedure.data.source_table_ids, vec![existing_table_id]);
+    assert_eq!(
+        procedure.data.unresolved_source_table_names,
+        vec![missing_source.clone()]
+    );
+
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(flow_info.is_pending());
+    assert_eq!(flow_info.source_table_ids(), &[existing_table_id]);
+    let expected_all_sources = vec![existing_source, missing_source.clone()];
+    assert_eq!(
+        flow_info.all_source_table_names(),
+        expected_all_sources.as_slice()
+    );
+    assert_eq!(flow_info.unresolved_source_table_names(), &[missing_source]);
+    assert!(flow_info.flownode_ids().is_empty());
+}
+
+#[test]
+fn test_defer_on_missing_source_defaults_false() {
+    let task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+
+    assert!(!defer_on_missing_source(&task).unwrap());
+}
+
+#[test]
+fn test_defer_on_missing_source_true() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options
+        .insert(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string());
+
+    assert!(defer_on_missing_source(&task).unwrap());
+}
+
+#[test]
+fn test_defer_on_missing_source_invalid_value() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options.insert(
+        DEFER_ON_MISSING_SOURCE_KEY.to_string(),
+        "invalid".to_string(),
+    );
+
+    let err = defer_on_missing_source(&task).unwrap_err();
+    assert!(
+        err.to_string()
+            .contains("Invalid flow option 'defer_on_missing_source': invalid")
+    );
+}
+
+#[tokio::test]
+async fn test_create_flow_rejects_unknown_option_in_meta_task() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options
+        .insert("unknown_option".to_string(), "value".to_string());
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context);
+
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unexpected { .. });
+    assert!(
+        err.to_string()
+            .contains("Unknown flow option 'unknown_option'")
+    );
+}
+
+#[test]
+fn test_create_request_strips_defer_on_missing_source_runtime_option() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    enable_defer_on_missing_source(&mut task);
+
+    let data = CreateFlowData {
+        state: CreateFlowState::CreateFlows,
+        task,
+        flow_id: Some(1024),
+        peers: vec![],
+        source_table_ids: vec![],
+        unresolved_source_table_names: vec![],
+        flow_context: FlowQueryContext {
+            catalog: DEFAULT_CATALOG_NAME.to_string(),
+            schema: DEFAULT_SCHEMA_NAME.to_string(),
+            timezone: "UTC".to_string(),
+            extensions: HashMap::new(),
+            channel: 0,
+            snapshot_seqs: HashMap::new(),
+            sst_min_sequences: HashMap::new(),
+        },
+        prev_flow_info_value: None,
+        did_replace: false,
+        flow_type: Some(FlowType::Batching),
+    };
+
+    let request: CreateRequest = (&data).into();
+
+    assert!(
+        !request
+            .flow_options
+            .contains_key(DEFER_ON_MISSING_SOURCE_KEY)
+    );
+    assert_eq!(
+        request
+            .flow_options
+            .get(FlowType::FLOW_TYPE_KEY)
+            .map(String::as_str),
+        Some(FlowType::BATCHING)
+    );
 }

 pub(crate) async fn create_test_flow(
@@ -101,6 +365,27 @@ pub(crate) async fn create_test_flow(
    *flow_id
 }

+/// Builds a create-flow task with the defer option enabled, runs
+/// [`CreateFlowProcedure`] until it is done, and returns the resulting flow id.
+///
+/// Panics (via `unwrap`) if the procedure does not produce a [`FlowId`] output.
+pub(crate) async fn create_test_pending_flow(
+    ddl_context: &DdlContext,
+    flow_name: &str,
+    source_table_names: Vec<TableName>,
+    sink_table_name: TableName,
+) -> FlowId {
+    // The owned arguments are not used again, so move them instead of cloning.
+    let mut task = test_create_flow_task(flow_name, source_table_names, sink_table_name, false);
+    enable_defer_on_missing_source(&mut task);
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(task, query_ctx, ddl_context.clone());
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let flow_id = output.downcast_ref::<FlowId>().unwrap();
+
+    *flow_id
+}
+
 #[tokio::test]
 async fn test_create_flow() {
    let table_id = 1024;
@@ -154,6 +439,201 @@ async fn test_create_flow() {
    assert_matches!(err, error::Error::FlowAlreadyExists { .. });
 }

+#[tokio::test]
+async fn test_replace_pending_flow_with_active_flow_is_unsupported() {
+    let source_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let pending_flow_id = create_test_pending_flow(
+        &ddl_context,
+        "replace_pending_flow",
+        vec![source_table_name.clone()],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let pending_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(pending_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(pending_flow.is_pending());
+    assert!(pending_flow.flownode_ids().is_empty());
+
+    let create_table_task = test_create_table_task("replace_pending_source_table", 1026);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let mut replace_task = test_create_flow_task(
+        "replace_pending_flow",
+        vec![source_table_name],
+        sink_table_name,
+        false,
+    );
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("Replacing between pending and active flow states")
+    );
+}
+
+#[tokio::test]
+async fn test_replace_active_flow_with_pending_flow_is_unsupported() {
+    let existing_source_table = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_active_source_table",
+    );
+    let missing_source_table = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_active_sink_table",
+    );
+
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let create_table_task = test_create_table_task("replace_active_source_table", 2026);
+    ddl_context
+        .table_metadata_manager
+        .create_table_metadata(
+            create_table_task.table_info.clone(),
+            TableRouteValue::physical(vec![]),
+            HashMap::new(),
+        )
+        .await
+        .unwrap();
+
+    let _flow_id = create_test_flow(
+        &ddl_context,
+        "replace_active_flow_to_pending",
+        vec![existing_source_table],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let mut replace_task = test_create_flow_task(
+        "replace_active_flow_to_pending",
+        vec![missing_source_table],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut replace_task);
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::Unsupported { .. });
+    assert!(
+        err.to_string()
+            .contains("Replacing between pending and active flow states")
+    );
+}
+
+#[tokio::test]
+async fn test_replace_pending_flow_with_pending_flow_updates_metadata() {
+    let first_missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_first_missing_source",
+    );
+    let second_missing_source = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_second_missing_source",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "replace_pending_to_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let original_flow_id = create_test_pending_flow(
+        &ddl_context,
+        "replace_pending_to_pending_flow",
+        vec![first_missing_source.clone()],
+        sink_table_name.clone(),
+    )
+    .await;
+
+    let original_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(original_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(original_flow.is_pending());
+    assert_eq!(
+        original_flow.unresolved_source_table_names(),
+        &[first_missing_source]
+    );
+    assert!(original_flow.flownode_ids().is_empty());
+
+    let mut replace_task = test_create_flow_task(
+        "replace_pending_to_pending_flow",
+        vec![second_missing_source.clone()],
+        sink_table_name,
+        false,
+    );
+    enable_defer_on_missing_source(&mut replace_task);
+    replace_task.or_replace = true;
+    let query_ctx = test_query_context();
+    let mut procedure = CreateFlowProcedure::new(replace_task, query_ctx, ddl_context.clone());
+    let output = execute_procedure_until_done(&mut procedure).await.unwrap();
+    let replaced_flow_id = *output.downcast_ref::<FlowId>().unwrap();
+    assert_eq!(replaced_flow_id, original_flow_id);
+
+    let replaced_flow = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(replaced_flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(replaced_flow.is_pending());
+    assert_eq!(replaced_flow.source_table_ids(), Vec::<u32>::new());
+    assert_eq!(
+        replaced_flow.unresolved_source_table_names(),
+        std::slice::from_ref(&second_missing_source)
+    );
+    assert_eq!(
+        replaced_flow.all_source_table_names(),
+        &[second_missing_source]
+    );
+    assert!(replaced_flow.flownode_ids().is_empty());
+}
+
 #[tokio::test]
 async fn test_create_flow_same_source_and_sink_table() {
    let table_id = 1024;
@@ -228,6 +708,7 @@ fn test_create_flow_data_serialization_backward_compatibility() {
        "flow_id": null,
        "peers": [],
        "source_table_ids": [],
+        "unresolved_source_table_names": [],
        "query_context": {
            "current_catalog": "old_catalog",
            "current_schema": "old_schema",
@@ -265,6 +746,7 @@ fn test_create_flow_data_new_format_serialization() {
        flow_id: None,
        peers: vec![],
        source_table_ids: vec![],
+        unresolved_source_table_names: vec![],
        flow_context,
        prev_flow_info_value: None,
        did_replace: false,
@@ -327,6 +809,7 @@ fn test_flow_info_conversion_with_flow_context() {
        flow_id: Some(123),
        peers: vec![],
        source_table_ids: vec![456, 789],
+        unresolved_source_table_names: vec![],
        flow_context,
        prev_flow_info_value: None,
        did_replace: false,
--- a/src/common/meta/src/ddl/tests/drop_flow.rs
+++ b/src/common/meta/src/ddl/tests/drop_flow.rs
@@ -23,7 +23,7 @@ use table::table_name::TableName;
 use crate::ddl::drop_flow::DropFlowProcedure;
 use crate::ddl::test_util::create_table::test_create_table_task;
 use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
-use crate::ddl::tests::create_flow::create_test_flow;
+use crate::ddl::tests::create_flow::{create_test_flow, create_test_pending_flow};
 use crate::error;
 use crate::key::table_route::TableRouteValue;
 use crate::rpc::ddl::DropFlowTask;
@@ -91,3 +91,45 @@ async fn test_drop_flow() {
    let err = procedure.on_prepare().await.unwrap_err();
    assert_matches!(err, error::Error::FlowNotFound { .. });
 }
+
+#[tokio::test]
+async fn test_drop_pending_flow_without_routes() {
+    let source_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "drop_pending_missing_source_table",
+    );
+    let sink_table_name = TableName::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        "drop_pending_sink_table",
+    );
+    let node_manager = Arc::new(MockFlownodeManager::new(NaiveFlownodeHandler));
+    let ddl_context = new_ddl_context(node_manager);
+
+    let flow_id = create_test_pending_flow(
+        &ddl_context,
+        "drop_pending_flow",
+        vec![source_table_name],
+        sink_table_name,
+    )
+    .await;
+    let flow_info = ddl_context
+        .flow_metadata_manager
+        .flow_info_manager()
+        .get(flow_id)
+        .await
+        .unwrap()
+        .unwrap();
+    assert!(flow_info.is_pending());
+    assert!(flow_info.flownode_ids().is_empty());
+
+    let task = test_drop_flow_task("drop_pending_flow", flow_id, false);
+    let mut procedure = DropFlowProcedure::new(task, ddl_context.clone());
+    execute_procedure_until_done(&mut procedure).await;
+
+    let task = test_drop_flow_task("drop_pending_flow", flow_id, false);
+    let mut procedure = DropFlowProcedure::new(task, ddl_context);
+    let err = procedure.on_prepare().await.unwrap_err();
+    assert_matches!(err, error::Error::FlowNotFound { .. });
+}
--- a/src/common/meta/src/ddl_manager.rs
+++ b/src/common/meta/src/ddl_manager.rs
@@ -15,8 +15,9 @@
 use std::sync::Arc;
 use std::time::Duration;

-use api::v1::Repartition;
 use api::v1::alter_table_expr::Kind;
+use api::v1::repartition::Source as PbRepartitionSource;
+use api::v1::{PartitionExprs, Repartition};
 use common_error::ext::BoxedError;
 use common_procedure::{
    BoxedProcedure, BoxedProcedureLoader, Output, ProcedureId, ProcedureManagerRef,
@@ -151,13 +152,18 @@ macro_rules! procedure_loader {

 pub type RepartitionProcedureFactoryRef = Arc<dyn RepartitionProcedureFactory>;

+pub enum RepartitionSource {
+    Partitioned { exprs: Vec<String> },
+    Unpartitioned { partition_columns: Vec<String> },
+}
+
 pub trait RepartitionProcedureFactory: Send + Sync {
    fn create(
        &self,
        ddl_ctx: &DdlContext,
        table_name: TableName,
        table_id: TableId,
-        from_exprs: Vec<String>,
+        source: RepartitionSource,
        to_exprs: Vec<String>,
        timeout: Option<Duration>,
    ) -> std::result::Result<BoxedProcedure, BoxedError>;
@@ -280,22 +286,38 @@ impl DdlManager {
        &self,
        table_id: TableId,
        table_name: TableName,
-        Repartition {
-            from_partition_exprs,
-            into_partition_exprs,
-        }: Repartition,
+        repartition: Repartition,
        wait: bool,
        timeout: Duration,
    ) -> Result<(ProcedureId, Option<Output>)> {
        let context = self.create_context();

+        let into_partition_exprs = repartition.into_partition_exprs;
+        let source = repartition.source;
+
+        let source = match source {
+            Some(PbRepartitionSource::PartitionExprs(PartitionExprs { exprs })) => {
+                RepartitionSource::Partitioned { exprs }
+            }
+            Some(PbRepartitionSource::Unpartitioned(source)) => RepartitionSource::Unpartitioned {
+                partition_columns: source.partition_columns,
+            },
+            None => {
+                // Reads the deprecated field for backward compatibility with old persisted DDL tasks.
+                #[allow(deprecated)]
+                RepartitionSource::Partitioned {
+                    exprs: repartition.from_partition_exprs,
+                }
+            }
+        };
+
        let procedure = self
            .repartition_procedure_factory
            .create(
                &context,
                table_name,
                table_id,
-                from_partition_exprs,
+                source,
                into_partition_exprs,
                Some(timeout),
            )
@@ -1108,7 +1130,7 @@ mod tests {
    use crate::ddl::table_meta::TableMetadataAllocator;
    use crate::ddl::truncate_table::TruncateTableProcedure;
    use crate::ddl::{DdlContext, NoopRegionFailureDetectorControl};
-    use crate::ddl_manager::RepartitionProcedureFactory;
+    use crate::ddl_manager::{RepartitionProcedureFactory, RepartitionSource};
    use crate::key::TableMetadataManager;
    use crate::key::flow::FlowMetadataManager;
    use crate::kv_backend::memory::MemoryKvBackend;
@@ -1146,7 +1168,7 @@ mod tests {
            _ddl_ctx: &DdlContext,
            _table_name: TableName,
            _table_id: TableId,
-            _from_exprs: Vec<String>,
+            _source: RepartitionSource,
            _to_exprs: Vec<String>,
            _timeout: Option<Duration>,
        ) -> std::result::Result<BoxedProcedure, BoxedError> {
--- a/src/common/meta/src/key/flow.rs
+++ b/src/common/meta/src/key/flow.rs
@@ -459,6 +459,7 @@ mod tests {

    use super::*;
    use crate::FlownodeId;
+    use crate::key::flow::flow_info::FlowStatus;
    use crate::key::flow::table_flow::TableFlowKey;
    use crate::key::node_address::{NodeAddressKey, NodeAddressValue};
    use crate::key::{FlowPartitionId, MetadataValue};
@@ -522,6 +523,8 @@ mod tests {
            query_context: None,
            flow_name: flow_name.to_string(),
            source_table_ids,
+            all_source_table_names: vec![],
+            unresolved_source_table_names: vec![],
            sink_table_name,
            flownode_ids,
            raw_sql: "raw".to_string(),
@@ -529,6 +532,7 @@ mod tests {
            eval_interval_secs: None,
            comment: "hi".to_string(),
            options: Default::default(),
+            status: FlowStatus::Active,
            created_time: chrono::Utc::now(),
            updated_time: chrono::Utc::now(),
        }
@@ -774,6 +778,8 @@ mod tests {
            query_context: None,
            flow_name: "flow".to_string(),
            source_table_ids: vec![1024, 1025, 1026],
+            all_source_table_names: vec![],
+            unresolved_source_table_names: vec![],
            sink_table_name: another_sink_table_name,
            flownode_ids: [(0, 1u64)].into(),
            raw_sql: "raw".to_string(),
@@ -781,6 +787,7 @@ mod tests {
            eval_interval_secs: None,
            comment: "hi".to_string(),
            options: Default::default(),
+            status: FlowStatus::Active,
            created_time: chrono::Utc::now(),
            updated_time: chrono::Utc::now(),
        };
@@ -1151,6 +1158,8 @@ mod tests {
            query_context: None,
            flow_name: "flow".to_string(),
            source_table_ids: vec![1024, 1025, 1026],
+            all_source_table_names: vec![],
+            unresolved_source_table_names: vec![],
            sink_table_name: another_sink_table_name,
            flownode_ids: [(0, 1u64)].into(),
            raw_sql: "raw".to_string(),
@@ -1158,6 +1167,7 @@ mod tests {
            eval_interval_secs: None,
            comment: "hi".to_string(),
            options: Default::default(),
+            status: FlowStatus::Active,
            created_time: chrono::Utc::now(),
            updated_time: chrono::Utc::now(),
        };
--- a/src/common/meta/src/key/flow/flow_info.rs
+++ b/src/common/meta/src/key/flow/flow_info.rs
@@ -16,6 +16,8 @@ use std::collections::{BTreeMap, HashMap};
 use std::sync::Arc;

 use chrono::{DateTime, Utc};
+use futures::TryStreamExt;
+use futures::stream::BoxStream;
 use lazy_static::lazy_static;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
@@ -27,12 +29,27 @@ use crate::FlownodeId;
 use crate::error::{self, Result};
 use crate::key::flow::FlowScoped;
 use crate::key::txn_helper::TxnOpGetResponseSet;
-use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId, MetadataKey, MetadataValue};
+use crate::key::{
+    BytesAdapter, DeserializedValueWithBytes, FlowId, FlowPartitionId, MetadataKey, MetadataValue,
+};
 use crate::kv_backend::KvBackendRef;
 use crate::kv_backend::txn::{Compare, CompareOp, Txn, TxnOp};
+use crate::range_stream::{DEFAULT_PAGE_SIZE, PaginationStream};
+use crate::rpc::KeyValue;
+use crate::rpc::store::RangeRequest;

 pub const FLOW_INFO_KEY_PREFIX: &str = "info";

+/// The lifecycle status of a flow stored in metadata.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub enum FlowStatus {
+    /// The flow metadata exists, but at least one source table did not exist at create time.
+    PendingSources,
+    /// The flow has resolved source tables and can be scheduled on flownodes.
+    #[default] // `FlowInfoValue::status` is `#[serde(default)]`, so values lacking it load as `Active`
+    Active,
+}
+
 lazy_static! {
    static ref FLOW_INFO_KEY_PATTERN: Regex =
        Regex::new(&format!("^{FLOW_INFO_KEY_PREFIX}/([0-9]+)$")).unwrap();
@@ -114,7 +131,12 @@ impl<'a> MetadataKey<'a, FlowInfoKeyInner> for FlowInfoKeyInner {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct FlowInfoValue {
    /// The source tables used by the flow.
+    #[serde(default)]
    pub source_table_ids: Vec<TableId>,
+    #[serde(default)]
+    pub all_source_table_names: Vec<TableName>,
+    #[serde(default)]
+    pub unresolved_source_table_names: Vec<TableName>,
    /// The sink table used by the flow.
    pub sink_table_name: TableName,
    /// Which flow nodes this flow is running on.
@@ -145,6 +167,8 @@ pub struct FlowInfoValue {
    pub comment: String,
    /// The options.
    pub options: HashMap<String, String>,
+    #[serde(default)]
+    pub status: FlowStatus,
    /// The created time
    #[serde(default)]
    pub created_time: DateTime<Utc>,
@@ -154,6 +178,14 @@ pub struct FlowInfoValue {
 }

 impl FlowInfoValue {
+    pub fn is_pending(&self) -> bool {
+        matches!(self.status, FlowStatus::PendingSources) // status is `PendingSources`
+    }
+
+    pub fn is_active(&self) -> bool {
+        matches!(self.status, FlowStatus::Active) // status is `Active`
+    }
+
    /// Returns the `flownode_id`.
    pub fn flownode_ids(&self) -> &BTreeMap<FlowPartitionId, FlownodeId> {
        &self.flownode_ids
@@ -173,6 +205,14 @@ impl FlowInfoValue {
        &self.source_table_ids
    }

+    pub fn all_source_table_names(&self) -> &[TableName] { // borrows the stored list as a slice
+        &self.all_source_table_names
+    }
+
+    pub fn unresolved_source_table_names(&self) -> &[TableName] { // borrows the stored list as a slice
+        &self.unresolved_source_table_names
+    }
+
    pub fn catalog_name(&self) -> &String {
        &self.catalog_name
    }
@@ -209,6 +249,10 @@ impl FlowInfoValue {
        &self.options
    }

+    pub fn status(&self) -> &FlowStatus { // see also `is_pending` / `is_active` for boolean checks
+        &self.status
+    }
+
    pub fn created_time(&self) -> &DateTime<Utc> {
        &self.created_time
    }
@@ -225,6 +269,12 @@ pub struct FlowInfoManager {
    kv_backend: KvBackendRef,
 }

+/// Decodes a raw [`KeyValue`] into a flow info key/value pair; the key is parsed before the value.
+pub fn flow_info_decoder(kv: KeyValue) -> Result<(FlowInfoKey, FlowInfoValue)> {
+    let flow_key = FlowInfoKey::from_bytes(&kv.key)?;
+    Ok((flow_key, FlowInfoValue::try_from_raw_value(&kv.value)?))
+}
+
 impl FlowInfoManager {
    /// Returns a new [FlowInfoManager].
    pub fn new(kv_backend: KvBackendRef) -> Self {
@@ -254,6 +304,23 @@ impl FlowInfoManager {
            .transpose()
    }

+    pub fn flow_infos(&self) -> BoxStream<'static, Result<(FlowId, FlowInfoValue)>> { // streams all flow infos
+        let start_key = FlowScoped::new(BytesAdapter::from( // "info/" wrapped in the flow scope
+            format!("{FLOW_INFO_KEY_PREFIX}/").into_bytes(),
+        ))
+        .to_bytes();
+        let req = RangeRequest::new().with_prefix(start_key); // prefix scan over that key range
+        let stream = PaginationStream::new( // reads `DEFAULT_PAGE_SIZE` entries per page
+            self.kv_backend.clone(),
+            req,
+            DEFAULT_PAGE_SIZE,
+            flow_info_decoder, // turns each raw kv into (FlowInfoKey, FlowInfoValue)
+        )
+        .into_stream();
+
+        Box::pin(stream.map_ok(|(key, value)| (key.flow_id(), value))) // keep only the id from the key
+    }
+
    /// Builds a create flow transaction.
    /// It is expected that the `__flow/info/{flow_id}` wasn't occupied.
    /// Otherwise, the transaction will retrieve existing value.
--- a/src/common/procedure/src/local.rs
+++ b/src/common/procedure/src/local.rs
@@ -24,7 +24,7 @@ use async_trait::async_trait;
 use backon::ExponentialBuilder;
 use common_error::ext::BoxedError;
 use common_event_recorder::EventRecorderRef;
-use common_runtime::{RepeatedTask, TaskFunction};
+use common_runtime::{JoinHandle, RepeatedTask, TaskFunction};
 use common_telemetry::tracing_context::{FutureExt, TracingContext};
 use common_telemetry::{error, info, tracing};
 use snafu::{OptionExt, ResultExt, ensure};
@@ -254,6 +254,8 @@ pub(crate) struct ManagerContext {
    running_procedures: Mutex<HashSet<ProcedureId>>,
    /// Ids and finished time of finished procedures.
    finished_procedures: Mutex<VecDeque<(ProcedureId, Instant)>>,
+    /// Runner tasks of procedures.
+    runner_tasks: Mutex<HashMap<ProcedureId, JoinHandle<()>>>,
    /// Running flag.
    running: Arc<AtomicBool>,
    /// Poison manager.
@@ -310,6 +312,7 @@ impl ManagerContext {
            procedures: RwLock::new(HashMap::new()),
            running_procedures: Mutex::new(HashSet::new()),
            finished_procedures: Mutex::new(VecDeque::new()),
+            runner_tasks: Mutex::new(HashMap::new()),
            running: Arc::new(AtomicBool::new(false)),
            poison_manager,
        }
@@ -329,6 +332,76 @@ impl ManagerContext {
        self.running.store(false, Ordering::Relaxed);
    }

+    fn reset_runtime_state(&self) { // drops all in-memory procedure bookkeeping held by this context
+        self.procedures.write().unwrap().clear();
+        self.running_procedures.lock().unwrap().clear();
+        self.finished_procedures.lock().unwrap().clear();
+        for handle in self
+            .runner_tasks
+            .lock()
+            .unwrap()
+            .drain()
+            .map(|(_, handle)| handle)
+        {
+            handle.abort(); // abort is requested but not awaited here (contrast `abort_runner_tasks`)
+        }
+        self.key_lock.clear();
+        self.dynamic_key_lock.clear();
+    }
+
+    fn spawn_runner_task<F>(&self, procedure_id: ProcedureId, spawn: F) -> bool // true iff `spawn` ran
+    where
+        F: FnOnce() -> JoinHandle<()>,
+    {
+        let mut tasks = self.runner_tasks.lock().unwrap(); // held across the check, spawn and insert
+        if !self.running() {
+            return false; // manager not running: `spawn` is never invoked
+        }
+
+        let handle = spawn();
+        let _ = tasks.insert(procedure_id, handle); // any previous handle for this id is discarded
+        true
+    }
+
+    fn remove_procedure(&self, procedure_id: ProcedureId) { // forgets the id in both tracking sets
+        self.procedures.write().unwrap().remove(&procedure_id);
+        self.running_procedures
+            .lock()
+            .unwrap()
+            .remove(&procedure_id);
+    }
+
+    pub(crate) fn remove_runner_task(&self, procedure_id: ProcedureId) { // called from `Runner::drop`
+        let _ = self.runner_tasks.lock().unwrap().remove(&procedure_id); // handle is dropped, no abort call
+    }
+
+    fn take_runner_tasks(&self) -> Vec<JoinHandle<()>> { // empties the map and returns its handles
+        self.runner_tasks
+            .lock()
+            .unwrap()
+            .drain()
+            .map(|(_, handle)| handle) // procedure ids are discarded
+            .collect()
+    }
+
+    async fn abort_runner_tasks(&self) { // aborts every tracked runner task, then awaits each one
+        let handles = self.take_runner_tasks();
+
+        for handle in &handles { // request cancellation of all tasks before awaiting any
+            handle.abort();
+        }
+
+        for handle in handles {
+            if let Err(e) = handle.await
+                && !e.is_cancelled() // cancelled join errors are not logged; only other failures are
+            {
+                error!(
+                    e; "Procedure runner task exits unexpectedly during stop",
+                );
+            }
+        }
+    }
+
    /// Return `ProcedureManager` is running.
    pub(crate) fn running(&self) -> bool {
        self.running.load(Ordering::Relaxed)
@@ -675,17 +748,25 @@ impl LocalManager {

        let tracing_context = TracingContext::from_current_span();

-        let _handle = common_runtime::spawn_global(async move {
-            let span = tracing_context.attach(tracing::info_span!(
-            "LocalManager::submit_root_procedure",
-                procedure_name = %runner.meta.type_name,
-                procedure_id = %runner.meta.id,
-            ));
-            // Run the root procedure.
-            // The task was moved to another runtime for execution.
-            // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
-            runner.run().trace(span).await;
-        });
+        ensure!(
+            self.manager_ctx.spawn_runner_task(procedure_id, || {
+                common_runtime::spawn_global(async move {
+                    let span = tracing_context.attach(tracing::info_span!(
+                    "LocalManager::submit_root_procedure",
+                        procedure_name = %runner.meta.type_name,
+                        procedure_id = %runner.meta.id,
+                    ));
+                    // Run the root procedure.
+                    // The task was moved to another runtime for execution.
+                    // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
+                    runner.run().trace(span).await;
+                })
+            }),
+            {
+                self.manager_ctx.remove_procedure(procedure_id);
+                ManagerNotStartSnafu
+            }
+        );

        Ok(watcher)
    }
@@ -822,6 +903,7 @@ impl ProcedureManager for LocalManager {

        *task = Some(task_inner);

+        self.manager_ctx.reset_runtime_state();
        self.manager_ctx.start();

        info!("LocalManager is start.");
@@ -830,14 +912,18 @@ impl ProcedureManager for LocalManager {
    }

    async fn stop(&self) -> Result<()> {
-        let mut task = self.remove_outdated_meta_task.lock().await;
-
-        if let Some(task) = task.take() {
-            task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)?;
-        }
-
        self.manager_ctx.stop();

+        let mut task = self.remove_outdated_meta_task.lock().await;
+        if let Some(task) = task.take()
+            && let Err(e) = task.stop().await.context(StopRemoveOutdatedMetaTaskSnafu)
+        {
+            error!(e; "Failed to stop remove outdated meta task");
+        };
+
+        self.manager_ctx.abort_runner_tasks().await;
+        self.manager_ctx.reset_runtime_state();
+
        info!("LocalManager is stopped.");

        Ok(())
@@ -921,10 +1007,12 @@ pub(crate) mod test_util {
 #[cfg(test)]
 mod tests {
    use std::assert_matches;
+    use std::sync::atomic::{AtomicBool, Ordering as AtomicOrdering};

    use common_error::mock::MockError;
    use common_error::status_code::StatusCode;
    use common_test_util::temp_dir::create_temp_dir;
+    use tokio::sync::oneshot;
    use tokio::time::timeout;

    use super::*;
@@ -954,6 +1042,67 @@ mod tests {
        assert!(ctx.state(meta.id).unwrap().is_done());
    }

+    #[test]
+    fn test_reset_runtime_state() { // fills each piece of runtime state, then checks reset empties it
+        let ctx = new_test_manager_context();
+        ctx.set_running(); // `spawn_runner_task` refuses to spawn unless the context is running
+        let mut meta = test_util::procedure_meta_for_test();
+        meta.lock_key = LockKey::single_exclusive("test.reset_runtime_state");
+        let meta = Arc::new(meta);
+        let procedure_id = meta.id;
+
+        assert!(ctx.try_insert_procedure(meta.clone()));
+        ctx.finished_procedures
+            .lock()
+            .unwrap()
+            .push_back((procedure_id, Instant::now()));
+        ctx.spawn_runner_task(procedure_id, || {
+            common_runtime::spawn_global(std::future::pending::<()>()) // never completes on its own
+        });
+
+        drop(
+            ctx.key_lock
+                .try_write("test.reset_runtime_state".to_string()),
+        );
+        drop(
+            ctx.dynamic_key_lock
+                .try_write("test.reset_runtime_state.dynamic".to_string()),
+        );
+        assert!(ctx.contains_procedure(procedure_id));
+        assert_eq!(1, ctx.running_procedures.lock().unwrap().len());
+        assert_eq!(1, ctx.finished_procedures.lock().unwrap().len());
+        assert_eq!(1, ctx.runner_tasks.lock().unwrap().len());
+        assert_eq!(1, ctx.key_lock.len());
+        assert_eq!(1, ctx.dynamic_key_lock.len());
+
+        ctx.reset_runtime_state();
+
+        assert!(!ctx.contains_procedure(procedure_id)); // everything populated above must now be gone
+        assert!(ctx.running_procedures.lock().unwrap().is_empty());
+        assert!(ctx.finished_procedures.lock().unwrap().is_empty());
+        assert!(ctx.runner_tasks.lock().unwrap().is_empty());
+        assert!(ctx.key_lock.is_empty());
+        assert!(ctx.dynamic_key_lock.is_empty());
+    }
+
+    #[test]
+    fn test_spawn_runner_task_not_started_after_stop() { // context is never set running in this test
+        let ctx = new_test_manager_context();
+        let procedure_id = ProcedureId::random();
+
+        let spawned = Arc::new(AtomicBool::new(false)); // flipped only if the task body actually runs
+        let spawned_in_task = spawned.clone();
+        let started = ctx.spawn_runner_task(procedure_id, || {
+            common_runtime::spawn_global(async move {
+                spawned_in_task.store(true, AtomicOrdering::Relaxed);
+            })
+        });
+
+        assert!(!started); // the spawn closure must have been rejected
+        assert!(!spawned.load(AtomicOrdering::Relaxed));
+        assert!(ctx.runner_tasks.lock().unwrap().is_empty());
+    }
+
    #[test]
    fn test_manager_context_insert_duplicate() {
        let ctx = new_test_manager_context();
@@ -1046,6 +1195,105 @@ mod tests {
        }
    }

+    #[derive(Debug)]
+    struct BlockingProcedure { // test-only procedure; see its `Procedure` and `Drop` impls
+        started_tx: Option<oneshot::Sender<()>>, // taken and fired on the first `execute` call
+        dropped: Arc<AtomicBool>, // set to true by the `Drop` impl
+        lock_key: LockKey, // returned verbatim from `lock_key()`
+    }
+
+    impl Drop for BlockingProcedure { // lets the test observe that the procedure was destroyed
+        fn drop(&mut self) {
+            self.dropped.store(true, AtomicOrdering::Relaxed);
+        }
+    }
+
+    #[async_trait]
+    impl Procedure for BlockingProcedure { // `execute` signals that it started and then never returns
+        fn type_name(&self) -> &str {
+            "BlockingProcedure"
+        }
+
+        async fn execute(&mut self, _ctx: &Context) -> Result<Status> {
+            if let Some(tx) = self.started_tx.take() { // `take` ensures the signal is sent at most once
+                let _ = tx.send(()); // send result ignored
+            }
+            std::future::pending::<Result<Status>>().await // never resolves
+        }
+
+        fn dump(&self) -> Result<String> {
+            Ok(String::new()) // nothing to persist for this test procedure
+        }
+
+        fn lock_key(&self) -> LockKey {
+            self.lock_key.clone()
+        }
+    }
+
+    #[tokio::test]
+    async fn test_stop_aborts_runner_and_resets_runtime_state() { // stop() must tear down a blocked runner
+        let dir = create_temp_dir("stop_aborts_runner_and_resets_runtime_state");
+        let config = ManagerConfig::default();
+        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
+        let poison_manager = Arc::new(InMemoryPoisonStore::new());
+        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
+        manager.start().await.unwrap();
+
+        let procedure_id = ProcedureId::random();
+        let (started_tx, started_rx) = oneshot::channel();
+        let dropped = Arc::new(AtomicBool::new(false));
+        let procedure = BlockingProcedure {
+            started_tx: Some(started_tx),
+            dropped: dropped.clone(),
+            lock_key: LockKey::single_exclusive("test.stop_aborts_runner"),
+        };
+
+        manager
+            .submit(ProcedureWithId {
+                id: procedure_id,
+                procedure: Box::new(procedure),
+            })
+            .await
+            .unwrap();
+        timeout(Duration::from_secs(5), started_rx) // wait until `execute` has signalled it started
+            .await
+            .unwrap()
+            .unwrap();
+
+        assert!(manager.manager_ctx.contains_procedure(procedure_id)); // state while the runner is blocked
+        assert_eq!(
+            1,
+            manager.manager_ctx.running_procedures.lock().unwrap().len()
+        );
+        assert_eq!(1, manager.manager_ctx.runner_tasks.lock().unwrap().len());
+        assert_eq!(1, manager.manager_ctx.key_lock.len());
+
+        manager.stop().await.unwrap();
+
+        assert!(dropped.load(AtomicOrdering::Relaxed)); // the procedure value itself was dropped by stop
+        assert!(!manager.manager_ctx.running());
+        assert!(!manager.manager_ctx.contains_procedure(procedure_id));
+        assert!(
+            manager
+                .manager_ctx
+                .running_procedures
+                .lock()
+                .unwrap()
+                .is_empty()
+        );
+        assert!(
+            manager
+                .manager_ctx
+                .finished_procedures
+                .lock()
+                .unwrap()
+                .is_empty()
+        );
+        assert!(manager.manager_ctx.runner_tasks.lock().unwrap().is_empty());
+        assert!(manager.manager_ctx.key_lock.is_empty());
+        assert!(manager.manager_ctx.dynamic_key_lock.is_empty());
+    }
+
    #[test]
    fn test_register_loader() {
        let dir = create_temp_dir("register");
@@ -1439,7 +1687,7 @@ mod tests {
        let state_store = Arc::new(ObjectStateStore::new(test_util::new_object_store(&dir)));
        let poison_manager = Arc::new(InMemoryPoisonStore::new());
        let manager = LocalManager::new(config, state_store, poison_manager, None, None);
-        manager.manager_ctx.set_running();
+        manager.start().await.unwrap();

        manager
            .manager_ctx
@@ -1447,7 +1695,6 @@ mod tests {
            .lock()
            .unwrap()
            .insert(ProcedureId::random());
-        manager.start().await.unwrap();

        // Submit a new procedure should fail.
        let mut procedure = ProcedureToLoad::new("submit");
--- a/src/common/procedure/src/local/runner.rs
+++ b/src/common/procedure/src/local/runner.rs
@@ -20,6 +20,7 @@ use backon::{BackoffBuilder, ExponentialBuilder};
 use common_error::ext::PlainError;
 use common_error::status_code::StatusCode;
 use common_event_recorder::EventRecorderRef;
+use common_telemetry::tracing::warn;
 use common_telemetry::tracing_context::{FutureExt, TracingContext};
 use common_telemetry::{debug, error, info, tracing};
 use rand::Rng;
@@ -480,6 +481,15 @@ impl Runner {
        procedure_state: ProcedureState,
        procedure: BoxedProcedure,
    ) {
+        if !self.running() {
+            warn!(
+                "ProcedureManager is not running, skip submitting subprocedure {}-{}",
+                procedure.type_name(),
+                procedure_id
+            );
+            return;
+        }
+
        if self.manager_ctx.contains_procedure(procedure_id) {
            // If the parent has already submitted this procedure, don't submit it again.
            return;
@@ -520,23 +530,29 @@ impl Runner {
            procedure_id,
        );

-        // Add the id of the subprocedure to the metadata.
-        self.meta.push_child(procedure_id);
        let parent_id = self.meta.id;

        let tracing_context = TracingContext::from_current_span();
-        let _handle = common_runtime::spawn_global(async move {
-            let span = tracing_context.attach(tracing::info_span!(
-                "LocalManager::submit_subprocedure",
-                procedure_name = %runner.meta.type_name,
-                procedure_id = %runner.meta.id,
-                parent_id = %parent_id,
-            ));
-            // Run the root procedure.
-            // The task was moved to another runtime for execution.
-            // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
-            runner.run().trace(span).await
-        });
+        if !self.manager_ctx.spawn_runner_task(procedure_id, || {
+            common_runtime::spawn_global(async move {
+                let span = tracing_context.attach(tracing::info_span!(
+                    "LocalManager::submit_subprocedure",
+                    procedure_name = %runner.meta.type_name,
+                    procedure_id = %runner.meta.id,
+                    parent_id = %parent_id,
+                ));
+                // Run the subprocedure.
+                // The task was moved to another runtime for execution.
+                // In order not to interrupt tracing, a span needs to be created to continue tracing the current task.
+                runner.run().trace(span).await
+            })
+        }) {
+            self.manager_ctx.remove_procedure(procedure_id);
+            return;
+        }
+
+        // Add the id of the subprocedure to the metadata.
+        self.meta.push_child(procedure_id);
    }

    /// Extend the retry time to wait for the next retry.
@@ -702,6 +718,12 @@ impl Runner {
    }
 }

+impl Drop for Runner { // unregisters this runner's handle from the manager context's `runner_tasks`
+    fn drop(&mut self) {
+        self.manager_ctx.remove_runner_task(self.meta.id);
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::assert_matches;
--- a/src/common/procedure/src/rwlock.rs
+++ b/src/common/procedure/src/rwlock.rs
@@ -106,6 +106,13 @@ where
            locks.remove(key);
        }
    }
+
+    /// Clears all key locks.
+    ///
+    /// Callers must ensure no tasks are holding or waiting for these locks.
+    pub fn clear(&self) {
+        self.inner.lock().unwrap().clear(); // empties the inner collection; panics if the mutex is poisoned
+    }
 }

 #[cfg(test)]
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -314,6 +314,7 @@ impl RegionServer {
        let ctx = request.header.as_ref().map(|h| h.into());
        let query_ctx = Arc::new(ctx.unwrap_or_else(|| QueryContextBuilder::default().build()));

+        let region_id = request.region_id;
        let injector_builder = NameAwareDataSourceInjectorBuilder::from_plan(&request.plan)
            .context(DataFusionSnafu)?;
        let mut injector = injector_builder
@@ -326,7 +327,6 @@ impl RegionServer {
            .context(DataFusionSnafu)?
            .data;

-        let region_id = request.region_id;
        let stream = self
            .inner
            .handle_read(QueryRequest { plan, ..request }, query_ctx.clone())
@@ -837,14 +837,13 @@ fn wrap_flow_region_watermark_stream(
    region_id: RegionId,
    query_ctx: &QueryContextRef,
 ) -> SendableRecordBatchStream {
-    let Some(seq) = should_collect_region_watermark_from_extensions(&query_ctx.extensions())
-        .then(|| query_ctx.get_snapshot(region_id.as_u64()))
-        .flatten()
-    else {
-        return stream;
-    };
-
-    Box::pin(RegionWatermarkStream::new(stream, region_id, seq))
+    if should_collect_region_watermark_from_extensions(&query_ctx.extensions())
+        && let Some(seq) = query_ctx.get_snapshot(region_id.as_u64())
+    {
+        Box::pin(RegionWatermarkStream::new(stream, region_id, seq)) as SendableRecordBatchStream
+    } else {
+        stream
+    }
 }

 /// Wraps a region read stream so terminal metrics can carry the scan-open watermark.
--- a/src/datanode/src/region_server/catalog.rs
+++ b/src/datanode/src/region_server/catalog.rs
@@ -27,6 +27,7 @@ use datafusion_expr::{LogicalPlan, TableSource};
 use futures::TryStreamExt;
 use session::context::QueryContextRef;
 use snafu::{OptionExt, ResultExt};
+use store_api::region_info::RegionInfoEntry;
 use store_api::sst_entry::{ManifestSstEntry, PuffinIndexMetaEntry, StorageSstEntry};
 use store_api::storage::RegionId;

@@ -41,6 +42,7 @@ enum InternalTableKind {
    InspectSstManifest,
    InspectSstStorage,
    InspectSstIndexMeta,
+    InspectRegionInfo,
 }

 impl InternalTableKind {
@@ -55,6 +57,9 @@ impl InternalTableKind {
        if name.eq_ignore_ascii_case(PuffinIndexMetaEntry::reserved_table_name_for_inspection()) {
            return Some(Self::InspectSstIndexMeta);
        }
+        if name.eq_ignore_ascii_case(RegionInfoEntry::reserved_table_name_for_inspection()) {
+            return Some(Self::InspectRegionInfo);
+        }
        None
    }

@@ -64,6 +69,7 @@ impl InternalTableKind {
            Self::InspectSstManifest => server.inspect_sst_manifest_provider().await,
            Self::InspectSstStorage => server.inspect_sst_storage_provider().await,
            Self::InspectSstIndexMeta => server.inspect_sst_index_meta_provider().await,
+            Self::InspectRegionInfo => server.inspect_region_info_provider().await,
        }
    }
 }
@@ -128,6 +134,25 @@ impl RegionServer {
        let table = MemTable::try_new(schema, vec![vec![batch]]).context(DataFusionSnafu)?;
        Ok(Arc::new(table))
    }
+
+    /// Expose region info across the engine as an in-memory table.
+    pub async fn inspect_region_info_provider(&self) -> Result<Arc<dyn TableProvider>> {
+        let mito = { // scoped so the read guard is released before the `.await` below
+            let guard = self.inner.mito_engine.read().unwrap();
+            guard.as_ref().cloned().context(UnexpectedSnafu {
+                violated: "mito engine not available",
+            })?
+        };
+
+        let entries = mito.all_region_infos().await;
+        let schema = RegionInfoEntry::schema().arrow_schema().clone();
+        let batch = RegionInfoEntry::to_record_batch(&entries) // all entries go into one record batch
+            .map_err(DataFusionError::from)
+            .context(DataFusionSnafu)?;
+
+        let table = MemTable::try_new(schema, vec![vec![batch]]).context(DataFusionSnafu)?;
+        Ok(Arc::new(table))
+    }
 }

 /// A catalog list that resolves `TableProvider` by table name:
@@ -347,6 +372,7 @@ mod tests {
    use datatypes::arrow::array::Int32Array;
    use datatypes::arrow::datatypes::{DataType, Field, Schema};
    use datatypes::arrow::record_batch::RecordBatch;
+    use store_api::region_info::RegionInfoEntry;

    use super::*; // bring rewrite() into scope

@@ -409,6 +435,18 @@ mod tests {
            b3.reserved_table_needed,
            vec![InternalTableKind::InspectSstManifest]
        );
+
+        let region_info = RegionInfoEntry::reserved_table_name_for_inspection();
+        let plan4 = table_scan(Some(region_info), &schema, None)
+            .unwrap()
+            .build()
+            .unwrap();
+        let b4 = NameAwareDataSourceInjectorBuilder::from_plan(&plan4).unwrap();
+        assert!(!b4.need_region_provider);
+        assert_eq!(
+            b4.reserved_table_needed,
+            vec![InternalTableKind::InspectRegionInfo]
+        );
    }

    #[test]
@@ -445,6 +483,39 @@ mod tests {
        }
    }

+    #[test]
+    fn test_rewriter_replaces_with_region_info_reserved_source() { // scan of the reserved name gets our source
+        let schema = test_schema();
+        let table_name = RegionInfoEntry::reserved_table_name_for_inspection();
+        let plan = table_scan(Some(table_name), &schema, None)
+            .unwrap()
+            .build()
+            .unwrap();
+
+        let provider = empty_mem_table();
+        let source = provider_as_source(provider);
+
+        let mut injector = NameAwareDataSourceInjector {
+            reserved_sources: {
+                let mut m = HashMap::new();
+                m.insert(InternalTableKind::InspectRegionInfo, source.clone());
+                m
+            },
+            region_source: None, // only the reserved source is registered in this test
+        };
+
+        let transformed = plan.rewrite(&mut injector).unwrap();
+        let new_plan = transformed.data;
+
+        if let LogicalPlan::TableScan(scan) = new_plan {
+            let src_ptr = Arc::as_ptr(&scan.source);
+            let want_ptr = Arc::as_ptr(&source);
+            assert!(std::ptr::eq(src_ptr, want_ptr)); // pointer identity: the exact injected source is used
+        } else {
+            panic!("expected TableScan after rewrite");
+        }
+    }
+
    #[test]
    fn test_rewriter_replaces_with_region_source_for_normal() {
        let schema = test_schema();
--- a/src/datanode/src/utils.rs
+++ b/src/datanode/src/utils.rs
@@ -29,10 +29,28 @@ use tracing::info;
 use crate::error::{GetMetadataSnafu, Result};

 /// The requests to open regions.
-pub(crate) struct RegionOpenRequests {
-    pub leader_regions: Vec<(RegionId, RegionOpenRequest)>,
+pub struct RegionOpenRequests {
+    pub(crate) leader_regions: Vec<(RegionId, RegionOpenRequest)>,
    #[cfg(feature = "enterprise")]
-    pub follower_regions: Vec<(RegionId, RegionOpenRequest)>,
+    pub(crate) follower_regions: Vec<(RegionId, RegionOpenRequest)>,
+}
+
+impl RegionOpenRequests {
+    /// Splits into `(leader, follower)` regions; follower is empty without feature `enterprise`.
+    #[allow(clippy::type_complexity)]
+    pub fn into_parts(
+        self,
+    ) -> (
+        Vec<(RegionId, RegionOpenRequest)>,
+        Vec<(RegionId, RegionOpenRequest)>,
+    ) {
+        let leader_regions = self.leader_regions;
+        #[cfg(feature = "enterprise")]
+        let follower_regions = self.follower_regions;
+        #[cfg(not(feature = "enterprise"))]
+        let follower_regions = Vec::new();
+        (leader_regions, follower_regions)
+    }
 }

 fn group_region_by_topic(
@@ -58,7 +76,8 @@ fn get_replay_checkpoint(
    })
 }

-pub(crate) async fn build_region_open_requests(
+/// Builds region-open requests from persisted metadata.
+pub async fn build_region_open_requests(
    node_id: DatanodeId,
    kv_backend: KvBackendRef,
 ) -> Result<RegionOpenRequests> {
--- a/src/datatypes/src/json.rs
+++ b/src/datatypes/src/json.rs
@@ -26,12 +26,12 @@ use std::sync::Arc;

 use serde::{Deserialize, Serialize};
 use serde_json::{Map, Value as Json};
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ResultExt};

 use crate::error::{self, InvalidJsonSnafu, Result, SerializeSnafu};
 use crate::json::value::{JsonValue, JsonVariant};
 use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType};
-use crate::types::{StructField, StructType};
+use crate::types::{JsonType, StructField, StructType};
 use crate::value::{ListValue, StructValue, Value};

 /// The configuration of JSON encoding
@@ -305,33 +305,47 @@ fn encode_json_array_with_context<'a>(
 ) -> Result<JsonValue> {
    let json_array_len = json_array.len();
    let mut items = Vec::with_capacity(json_array_len);
-    let mut element_type = item_type.cloned();

    for (index, value) in json_array.into_iter().enumerate() {
        let array_context = context.with_key(&index.to_string());
-        let item_value =
-            encode_json_value_with_context(value, element_type.as_ref(), &array_context)?;
-        let item_type = item_value.json_type().native_type().clone();
-        items.push(item_value.into_variant());
-
-        // Determine the common type for the list
-        if let Some(current_type) = &element_type {
-            // It's valid for json array to have different types of items, for example,
-            // ["a string", 1]. However, the `JsonValue` will be converted to Arrow list array,
-            // which requires all items have exactly same type. So we forbid the different types
-            // case here. Besides, it's not common for items in a json array to differ. So I think
-            // we are good here.
-            ensure!(
-                item_type == *current_type,
-                error::InvalidJsonSnafu {
-                    value: "all items in json array must have the same type"
-                }
-            );
-        } else {
-            element_type = Some(item_type);
-        }
+        let item_value = encode_json_value_with_context(value, None, &array_context)?;
+        items.push(item_value);
    }

+    // Per the JSON specification, an array may hold items of different types, for example
+    // ["a string", 1]. However, the `JsonValue` is eventually converted to an Arrow list array,
+    // which requires all items to have exactly the same type. So we merge the possibly
+    // differing item types into one unified type, and align every item value to it.
+
+    let provided_item_type = item_type.map(|x| JsonType::new_json2(x.clone()));
+    let merged_item_type = if let Some((first, rests)) = items.split_first() {
+        let mut merged = first.json_type().clone();
+        for rest in rests.iter().map(|x| x.json_type()) {
+            if matches!(merged.native_type(), JsonNativeType::Variant) {
+                break;
+            }
+            merged.merge(rest)?;
+        }
+        Some(merged)
+    } else {
+        None
+    };
+    let unified_item_type = match (provided_item_type, merged_item_type) {
+        (Some(mut x), Some(y)) => {
+            x.merge(&y)?;
+            Some(x)
+        }
+        (x, y) => x.or(y),
+    };
+    if let Some(unified_item_type) = unified_item_type {
+        for item in &mut items {
+            item.try_align(&unified_item_type)?;
+        }
+    }
+    let items = items
+        .into_iter()
+        .map(|x| x.into_variant())
+        .collect::<Vec<_>>();
    Ok(JsonValue::new(JsonVariant::Array(items)))
 }

@@ -1050,11 +1064,8 @@ mod tests {
    fn test_encode_json_array_mixed_types() {
        let json = json!([1, "hello", true, 3.15]);
        let settings = JsonStructureSettings::Structured(None);
-        let result = settings.encode_with_type(json, None);
-        assert_eq!(
-            result.unwrap_err().to_string(),
-            "Invalid JSON: all items in json array must have the same type"
-        );
+        let value = settings.encode_with_type(json, None).unwrap();
+        assert_eq!(value.data_type().to_string(), r#"Json2["<Variant>"]"#);
    }

    #[test]
@@ -1276,12 +1287,12 @@ mod tests {
    #[test]
    fn test_encode_json_array_with_item_type() {
        let json = json!([1, 2, 3]);
-        let item_type = Arc::new(ConcreteDataType::uint64_datatype());
+        let item_type = Arc::new(ConcreteDataType::int64_datatype());
        let settings = JsonStructureSettings::Structured(None);
        let result = settings
            .encode_with_type(
                json,
-                Some(&JsonNativeType::Array(Box::new(JsonNativeType::u64()))),
+                Some(&JsonNativeType::Array(Box::new(JsonNativeType::i64()))),
            )
            .unwrap()
            .into_json_inner()
@@ -1289,9 +1300,9 @@ mod tests {

        if let Value::List(list_value) = result {
            assert_eq!(list_value.items().len(), 3);
-            assert_eq!(list_value.items()[0], Value::UInt64(1));
-            assert_eq!(list_value.items()[1], Value::UInt64(2));
-            assert_eq!(list_value.items()[2], Value::UInt64(3));
+            assert_eq!(list_value.items()[0], Value::Int64(1));
+            assert_eq!(list_value.items()[1], Value::Int64(2));
+            assert_eq!(list_value.items()[2], Value::Int64(3));
            assert_eq!(list_value.datatype(), item_type);
        } else {
            panic!("Expected List value");
@@ -2249,10 +2260,10 @@ mod tests {
            )])),
        );

-        let decoded_struct = settings.decode_struct(array_struct);
+        let decoded_struct = settings.decode_struct(array_struct).unwrap();
        assert_eq!(
-            decoded_struct.unwrap_err().to_string(),
-            "Invalid JSON: all items in json array must have the same type"
+            format!("{decoded_struct:?}"),
+            r#"StructValue { items: [List(ListValue { items: [Binary(Bytes(b"1")), Binary(Bytes(b"\"hello\"")), Binary(Bytes(b"true")), Binary(Bytes(b"3.15"))], datatype: Binary(BinaryType { repr_type: Binary }) })], fields: StructType { fields: [StructField { name: "value", data_type: List(ListType { item_type: Binary(BinaryType { repr_type: Binary }) }), nullable: true, metadata: {} }] } }"#
        );
    }

--- a/src/datatypes/src/json/value.rs
+++ b/src/datatypes/src/json/value.rs
@@ -65,6 +65,14 @@ impl JsonNumber {
            JsonNumber::Float(n) => n.0,
        }
    }
+
+    fn native_type(&self) -> JsonNativeType {
+        match self {
+            JsonNumber::PosInt(_) => JsonNativeType::u64(),
+            JsonNumber::NegInt(_) => JsonNativeType::i64(),
+            JsonNumber::Float(_) => JsonNativeType::f64(),
+        }
+    }
 }

 impl From<u64> for JsonNumber {
@@ -147,26 +155,14 @@ impl JsonVariant {
        match self {
            JsonVariant::Null => JsonNativeType::Null,
            JsonVariant::Bool(_) => JsonNativeType::Bool,
-            JsonVariant::Number(n) => match n {
-                JsonNumber::PosInt(_) => JsonNativeType::u64(),
-                JsonNumber::NegInt(_) => JsonNativeType::i64(),
-                JsonNumber::Float(_) => JsonNativeType::f64(),
-            },
+            JsonVariant::Number(n) => n.native_type(),
            JsonVariant::String(_) => JsonNativeType::String,
            JsonVariant::Array(array) => {
-                let item_type = if let Some(first) = array.first() {
-                    first.native_type()
-                } else {
-                    JsonNativeType::Null
-                };
-                JsonNativeType::Array(Box::new(item_type))
+                json_array_native_type(array.iter().map(JsonVariant::native_type))
+            }
+            JsonVariant::Object(object) => {
+                json_object_native_type(object.iter().map(|(k, v)| (k, v.native_type())))
            }
-            JsonVariant::Object(object) => JsonNativeType::Object(
-                object
-                    .iter()
-                    .map(|(k, v)| (k.clone(), v.native_type()))
-                    .collect(),
-            ),
            JsonVariant::Variant(_) => JsonNativeType::Variant,
        }
    }
@@ -469,6 +465,7 @@ impl JsonValue {
                        .collect::<Result<_>>()?,
                ),

+                (JsonVariant::Object(kvs), _) if kvs.is_empty() => JsonVariant::Null,
                (JsonVariant::Object(mut kvs), JsonNativeType::Object(expected)) => {
                    ensure!(
                        expected.keys().len() >= kvs.keys().len()
@@ -517,7 +514,7 @@ impl JsonValue {

        let x = std::mem::take(&mut self.json_variant);
        self.json_variant = helper(x, expected.native_type())?;
-        self.json_type = OnceLock::from(expected.clone());
+        self.json_type = OnceLock::new();
        Ok(())
    }
 }
@@ -623,35 +620,55 @@ pub enum JsonVariantRef<'a> {
 }

 impl JsonVariantRef<'_> {
-    fn json_type(&self) -> JsonType {
-        fn native_type(v: &JsonVariantRef<'_>) -> JsonNativeType {
-            match v {
-                JsonVariantRef::Null => JsonNativeType::Null,
-                JsonVariantRef::Bool(_) => JsonNativeType::Bool,
-                JsonVariantRef::Number(n) => match n {
-                    JsonNumber::PosInt(_) => JsonNativeType::u64(),
-                    JsonNumber::NegInt(_) => JsonNativeType::i64(),
-                    JsonNumber::Float(_) => JsonNativeType::f64(),
-                },
-                JsonVariantRef::String(_) => JsonNativeType::String,
-                JsonVariantRef::Array(array) => {
-                    let item_type = if let Some(first) = array.first() {
-                        native_type(first)
-                    } else {
-                        JsonNativeType::Null
-                    };
-                    JsonNativeType::Array(Box::new(item_type))
-                }
-                JsonVariantRef::Object(object) => JsonNativeType::Object(
-                    object
-                        .iter()
-                        .map(|(k, v)| (k.to_string(), native_type(v)))
-                        .collect(),
-                ),
-                JsonVariantRef::Variant(_) => JsonNativeType::Variant,
+    fn native_type(&self) -> JsonNativeType {
+        match self {
+            JsonVariantRef::Null => JsonNativeType::Null,
+            JsonVariantRef::Bool(_) => JsonNativeType::Bool,
+            JsonVariantRef::Number(n) => n.native_type(),
+            JsonVariantRef::String(_) => JsonNativeType::String,
+            JsonVariantRef::Array(array) => {
+                json_array_native_type(array.iter().map(JsonVariantRef::native_type))
            }
+            JsonVariantRef::Object(object) => {
+                json_object_native_type(object.iter().map(|(k, v)| (*k, v.native_type())))
+            }
+            JsonVariantRef::Variant(_) => JsonNativeType::Variant,
        }
-        JsonType::new_json2(native_type(self))
+    }
+
+    fn json_type(&self) -> JsonType {
+        JsonType::new_json2(self.native_type())
+    }
+}
+
+fn json_array_native_type<I>(items: I) -> JsonNativeType
+where
+    I: IntoIterator<Item = JsonNativeType>,
+{
+    let mut iter = items.into_iter();
+    let mut item_type = match iter.next() {
+        Some(t) => t,
+        None => return JsonNativeType::Array(Box::new(JsonNativeType::Null)),
+    };
+    for x in iter {
+        if matches!(item_type, JsonNativeType::Variant) {
+            break;
+        }
+        item_type.merge(&x);
+    }
+    JsonNativeType::Array(Box::new(item_type))
+}
+
+fn json_object_native_type<I, K>(fields: I) -> JsonNativeType
+where
+    I: IntoIterator<Item = (K, JsonNativeType)>,
+    K: Into<String>,
+{
+    let mut fields = fields.into_iter().peekable();
+    if fields.peek().is_none() {
+        JsonNativeType::Null
+    } else {
+        JsonNativeType::Object(fields.map(|(k, v)| (k.into(), v)).collect())
    }
 }

@@ -941,7 +958,6 @@ mod tests {
                ("name".to_string(), JsonVariant::Null),
            ])))
        );
-        assert_eq!(value.json_type(), &expected);

        // Object alignment should fail if the expected type misses any field from the value.
        let expected = JsonType::new_json2(JsonNativeType::Object(JsonObjectType::from([(
--- a/src/datatypes/src/types/json_type.rs
+++ b/src/datatypes/src/types/json_type.rs
@@ -115,6 +115,14 @@ impl JsonNativeType {
            (JsonNativeType::Null, that) => that.clone(),
            (this, JsonNativeType::Null) => this,
            (this, that) if this == *that => this,
+
+            (JsonNativeType::Number(x), JsonNativeType::Number(y)) => {
+                JsonNativeType::Number(match (x, y) {
+                    (x, y) if x == *y => x,
+                    (JsonNumberType::F64, _) | (_, JsonNumberType::F64) => JsonNumberType::F64,
+                    _ => JsonNumberType::I64,
+                })
+            }
            _ => JsonNativeType::Variant,
        };
    }
@@ -128,7 +136,7 @@ impl JsonNativeType {
                JsonNumberType::I64 => ArrowDataType::Int64,
                JsonNumberType::F64 => ArrowDataType::Float64,
            },
-            JsonNativeType::String => ArrowDataType::Utf8,
+            JsonNativeType::String => ArrowDataType::Utf8View,
            JsonNativeType::Array(array) => {
                ArrowDataType::List(Arc::new(Field::new("item", array.as_arrow_type(), true)))
            }
@@ -822,7 +830,7 @@ mod tests {
        test(
            "1.5",
            &mut JsonType::new_json2(JsonNativeType::i64()),
-            Ok(r#""<Variant>""#),
+            Ok(r#""<Number>""#),
        )?;

        // Object merge should preserve existing fields and append missing fields.
--- a/src/datatypes/src/vectors/json/array.rs
+++ b/src/datatypes/src/vectors/json/array.rs
@@ -17,16 +17,24 @@ use std::sync::Arc;

 use arrow::compute;
 use arrow::util::display::{ArrayFormatter, FormatOptions};
+use arrow_array::builder::{
+    ArrayBuilder, BooleanBuilder, Float64Builder, Int64Builder, NullBuilder, StringViewBuilder,
+    make_builder,
+};
 use arrow_array::cast::AsArray;
 use arrow_array::types::{Float64Type, Int64Type, UInt64Type};
 use arrow_array::{Array, ArrayRef, GenericListArray, ListArray, StructArray, new_null_array};
 use arrow_schema::{DataType, FieldRef};
+use common_telemetry::debug;
 use serde_json::Value;
 use snafu::{OptionExt, ResultExt};

-use crate::arrow_array::{StringArray, binary_array_value, string_array_value};
+use crate::arrow_array::{
+    MutableBinaryArray, StringViewArray, binary_array_value, string_array_value,
+};
 use crate::error::{
-    AlignJsonArraySnafu, ArrowComputeSnafu, DeserializeSnafu, InvalidJsonSnafu, Result,
+    AlignJsonArraySnafu, ArrowComputeSnafu, CastTypeSnafu, DeserializeSnafu, InvalidJsonSnafu,
+    Result, SerializeSnafu,
 };

 pub struct JsonArray<'a> {
@@ -101,6 +109,12 @@ impl JsonArray<'_> {
            return Ok(self.inner.clone());
        }

+        debug!(
+            "Try aligning JSON array {} to data type {}",
+            self.inner.data_type(),
+            expect
+        );
+
        let struct_array = self.inner.as_struct_opt().context(AlignJsonArraySnafu {
            reason: "expect struct array",
        })?;
@@ -178,11 +192,23 @@ impl JsonArray<'_> {
    }

    fn try_cast(&self, to_type: &DataType) -> Result<ArrayRef> {
-        if compute::can_cast_types(self.inner.data_type(), to_type) {
+        let from_type = self.inner.data_type();
+        if from_type == to_type {
+            return Ok(self.inner.clone());
+        }
+
+        if from_type.is_binary() && !to_type.is_binary() {
+            return self.decode_variant(to_type);
+        }
+
+        if !from_type.is_binary() && to_type.is_binary() {
+            return self.encode_variant();
+        }
+
+        if compute::can_cast_types(from_type, to_type) {
            return compute::cast(&self.inner, to_type).context(ArrowComputeSnafu);
        }

-        // TODO(LFC): Cast according to `to_type` instead of formatting to String here.
        let formatter = ArrayFormatter::try_new(&self.inner, &FormatOptions::default())
            .context(ArrowComputeSnafu)?;
        let values = (0..self.inner.len())
@@ -192,7 +218,91 @@ impl JsonArray<'_> {
                    .then(|| formatter.value(i).to_string())
            })
            .collect::<Vec<_>>();
-        Ok(Arc::new(StringArray::from(values)))
+        Ok(Arc::new(StringViewArray::from(values)))
+    }
+
+    fn encode_variant(&self) -> Result<ArrayRef> {
+        let len = self.inner.len();
+        let mut encoded = Vec::with_capacity(len);
+        let mut total_bytes = 0;
+
+        for i in 0..len {
+            let value = self.try_get_value(i)?;
+            if value.is_null() {
+                encoded.push(None);
+            } else {
+                let bytes = serde_json::to_vec(&value).context(SerializeSnafu)?;
+                total_bytes += bytes.len();
+                encoded.push(Some(bytes));
+            }
+        }
+
+        let mut builder = MutableBinaryArray::with_capacity(len, total_bytes);
+        for value in encoded {
+            builder.append_option(value);
+        }
+        Ok(Arc::new(builder.finish()))
+    }
+
+    fn decode_variant(&self, to_type: &DataType) -> Result<ArrayRef> {
+        fn downcast_builder<'a, T: ArrayBuilder>(
+            builder: &'a mut dyn ArrayBuilder,
+            to_type: &DataType,
+        ) -> Result<&'a mut T> {
+            builder
+                .as_any_mut()
+                .downcast_mut::<T>()
+                .with_context(|| CastTypeSnafu {
+                    msg: format!("Expect ArrayBuilder is of type {to_type}"),
+                })
+        }
+
+        let mut builder = make_builder(to_type, self.inner.len());
+        if to_type.is_null() {
+            downcast_builder::<NullBuilder>(builder.as_mut(), to_type)?
+                .append_nulls(self.inner.len());
+        } else {
+            match to_type {
+                DataType::Boolean => {
+                    let b = downcast_builder::<BooleanBuilder>(builder.as_mut(), to_type)?;
+                    for i in 0..self.inner.len() {
+                        b.append_option(self.try_get_value(i)?.as_bool());
+                    }
+                }
+                DataType::Int64 => {
+                    let b = downcast_builder::<Int64Builder>(builder.as_mut(), to_type)?;
+                    for i in 0..self.inner.len() {
+                        b.append_option(self.try_get_value(i)?.as_i64());
+                    }
+                }
+                DataType::Float64 => {
+                    let b = downcast_builder::<Float64Builder>(builder.as_mut(), to_type)?;
+                    for i in 0..self.inner.len() {
+                        b.append_option(self.try_get_value(i)?.as_f64());
+                    }
+                }
+                DataType::Utf8View => {
+                    let b = downcast_builder::<StringViewBuilder>(builder.as_mut(), to_type)?;
+                    for i in 0..self.inner.len() {
+                        let v = self.try_get_value(i)?;
+                        if v.is_null() {
+                            b.append_null();
+                        } else if let Some(s) = v.as_str() {
+                            b.append_value(s);
+                        } else {
+                            b.append_value(v.to_string());
+                        }
+                    }
+                }
+                _ => {
+                    return CastTypeSnafu {
+                        msg: format!("Cannot cast JSON value to {to_type}"),
+                    }
+                    .fail();
+                }
+            }
+        }
+        Ok(builder.finish())
    }
 }

@@ -231,7 +341,9 @@ impl<'a> From<&'a ArrayRef> for JsonArray<'a> {
 #[cfg(test)]
 mod test {
    use arrow_array::types::Int64Type;
-    use arrow_array::{BinaryArray, BooleanArray, Float64Array, Int32Array, Int64Array, ListArray};
+    use arrow_array::{
+        BinaryArray, BooleanArray, Float64Array, Int32Array, Int64Array, ListArray, StringArray,
+    };
    use arrow_schema::{Field, Fields};
    use serde_json::json;

--- a/src/datatypes/src/vectors/json/builder.rs
+++ b/src/datatypes/src/vectors/json/builder.rs
@@ -89,7 +89,9 @@ impl MutableVector for JsonVectorBuilder {
            .fail();
        };
        let json_type = value.json_type();
-        self.merged_type.merge(json_type)?;
+        if !self.merged_type.is_include(json_type) {
+            self.merged_type.merge(json_type)?;
+        }

        let value = JsonValue::new(JsonVariant::from(value.variant().clone()));
        self.values.push(value);
--- a/src/flow/src/adapter/flownode_impl.rs
+++ b/src/flow/src/adapter/flownode_impl.rs
@@ -465,6 +465,11 @@ impl FlowDualEngine {
        Ok(())
    }

+    /// Reconciles in-memory flow tasks from persisted metadata.
+    pub async fn reconcile_flows_from_metadata(&self) -> Result<(), Error> {
+        self.check_flow_consistent(true, true).await
+    }
+
    /// TODO(discord9): also add a `exists` api using flow metadata manager's `exists` method
    async fn flow_exist_in_metadata(&self, flow_id: FlowId) -> Result<bool, Error> {
        self.flow_metadata_manager
--- a/src/flow/src/batching_mode.rs
+++ b/src/flow/src/batching_mode.rs
@@ -20,12 +20,15 @@ use common_grpc::channel_manager::ClientTlsOption;
 use serde::{Deserialize, Serialize};
 use session::ReadPreference;

+mod checkpoint;
 pub(crate) mod engine;
 pub(crate) mod frontend_client;
+mod incremental_filter;
 mod state;
+mod table_creator;
 mod task;
 mod time_window;
-mod utils;
+pub(crate) mod utils;

 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 pub struct BatchingModeOptions {
--- a/src/flow/src/batching_mode/checkpoint.rs
+++ b/src/flow/src/batching_mode/checkpoint.rs
@@ -0,0 +1,127 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::batching_mode::state::CheckpointMode;
+
+pub(super) const CHECKPOINT_DECISION_ADVANCE: &str = "advance";
+pub(super) const CHECKPOINT_DECISION_FALLBACK: &str = "fallback";
+pub(super) const CHECKPOINT_REASON_NONE: &str = "none";
+
+/// Why the task fell back to full snapshot mode.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum FlowQueryFallbackReason {
+    /// The query result did not include a region-watermark map at all.
+    MissingRegionWatermark,
+    /// Some participating regions could not prove safe advancement against
+    /// both the returned watermarks and the checkpoint map.
+    IncompleteRegionWatermark,
+    /// The query only covered part of the dirty backlog, so global checkpoints
+    /// cannot advance yet. Incremental SQL drains all dirty windows before
+    /// checkpoint advancement; this primarily protects scoped full-snapshot
+    /// runs capped by the per-query dirty-window limit.
+    DirtyBacklogPending,
+    /// The datanode detected a stale incremental cursor and the Flow
+    /// must recompute from scratch.
+    StaleCursor,
+    /// A non-stale-cursor query failure; the Flow resets to full snapshot
+    /// to avoid cascading errors.
+    IncrementalQueryFailure,
+    /// Incremental mode has been permanently disabled for this Flow
+    /// (e.g. because the query shape is not incrementally safe).
+    IncrementalDisabled,
+}
+
+impl FlowQueryFallbackReason {
+    pub(super) fn as_label(self) -> &'static str {
+        match self {
+            Self::MissingRegionWatermark => "missing_region_watermark",
+            Self::IncompleteRegionWatermark => "incomplete_region_watermark",
+            Self::DirtyBacklogPending => "dirty_backlog_pending",
+            Self::StaleCursor => "stale_cursor",
+            Self::IncrementalQueryFailure => "incremental_query_failure",
+            Self::IncrementalDisabled => "incremental_disabled",
+        }
+    }
+}
+
+/// Decision produced by `BatchingTask::apply_query_result_to_state` after
+/// each Flow query execution. Describes whether the task advanced its
+/// checkpoint state or fell back to full snapshot, and why.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(super) enum FlowCheckpointDecision {
+    /// FullSnapshot → Incremental transition.
+    ///
+    /// The query exercised every participating region, all returned valid
+    /// watermarks, and the checkpoint map was populated from scratch.
+    /// Subsequent executions will use incremental after-seqs.
+    AdvancedFromFullSnapshot {
+        participating_regions: usize,
+        watermarks: usize,
+    },
+    /// Existing Incremental → Incremental (in-place advancement).
+    ///
+    /// A subset of participating regions advanced their watermarks. The
+    /// task stays in incremental mode with an updated checkpoint map.
+    AdvancedIncremental {
+        participating_regions: usize,
+        watermarks: usize,
+    },
+    /// Any mode → FullSnapshot.
+    ///
+    /// Watermark information was incomplete, a participating region was
+    /// absent from the existing checkpoint map, the task has permanently
+    /// disabled incremental mode, or the query itself failed. The task
+    /// resets to full snapshot semantics for the next execution.
+    FallbackToFullSnapshot {
+        previous_mode: CheckpointMode,
+        reason: FlowQueryFallbackReason,
+    },
+}
+
+impl FlowCheckpointDecision {
+    pub(super) fn mode_label(self) -> &'static str {
+        match self {
+            Self::AdvancedFromFullSnapshot { .. } => {
+                checkpoint_mode_label(CheckpointMode::FullSnapshot)
+            }
+            Self::AdvancedIncremental { .. } => checkpoint_mode_label(CheckpointMode::Incremental),
+            Self::FallbackToFullSnapshot { previous_mode, .. } => {
+                checkpoint_mode_label(previous_mode)
+            }
+        }
+    }
+
+    pub(super) fn decision_label(self) -> &'static str {
+        match self {
+            Self::AdvancedFromFullSnapshot { .. } | Self::AdvancedIncremental { .. } => {
+                CHECKPOINT_DECISION_ADVANCE
+            }
+            Self::FallbackToFullSnapshot { .. } => CHECKPOINT_DECISION_FALLBACK,
+        }
+    }
+
+    pub(super) fn reason_label(self) -> &'static str {
+        match self {
+            Self::FallbackToFullSnapshot { reason, .. } => reason.as_label(),
+            _ => CHECKPOINT_REASON_NONE,
+        }
+    }
+}
+
+pub(super) fn checkpoint_mode_label(mode: CheckpointMode) -> &'static str {
+    match mode {
+        CheckpointMode::FullSnapshot => "full_snapshot",
+        CheckpointMode::Incremental => "incremental",
+    }
+}
--- a/src/flow/src/batching_mode/engine.rs
+++ b/src/flow/src/batching_mode/engine.rs
@@ -59,8 +59,7 @@ use crate::{CreateFlowArgs, Error, FlowId, TableName};
 ///
 /// TODO(discord9): determine how to configure refresh rate
 pub struct BatchingEngine {
-    tasks: RwLock<BTreeMap<FlowId, BatchingTask>>,
-    shutdown_txs: RwLock<BTreeMap<FlowId, oneshot::Sender<()>>>,
+    runtime: RwLock<FlowRuntimeRegistry>,
    /// frontend client for insert request
    pub(crate) frontend_client: Arc<FrontendClient>,
    flow_metadata_manager: FlowMetadataManagerRef,
@@ -72,6 +71,51 @@ pub struct BatchingEngine {
    pub(crate) batch_opts: Arc<BatchingModeOptions>,
 }

+#[derive(Default)]
+struct FlowRuntimeRegistry {
+    tasks: BTreeMap<FlowId, BatchingTask>,
+    shutdown_txs: BTreeMap<FlowId, oneshot::Sender<()>>,
+}
+
+impl FlowRuntimeRegistry {
+    fn insert(
+        &mut self,
+        flow_id: FlowId,
+        task: BatchingTask,
+        shutdown_tx: oneshot::Sender<()>,
+    ) -> (Option<BatchingTask>, Option<oneshot::Sender<()>>) {
+        (
+            self.tasks.insert(flow_id, task),
+            self.shutdown_txs.insert(flow_id, shutdown_tx),
+        )
+    }
+
+    fn remove(&mut self, flow_id: FlowId) -> Option<(BatchingTask, Option<oneshot::Sender<()>>)> {
+        let task = self.tasks.remove(&flow_id)?;
+        let shutdown_tx = self.shutdown_txs.remove(&flow_id);
+        Some((task, shutdown_tx))
+    }
+
+    fn remove_if_current(
+        &mut self,
+        flow_id: FlowId,
+        task: &BatchingTask,
+    ) -> (Option<BatchingTask>, Option<oneshot::Sender<()>>) {
+        if self
+            .tasks
+            .get(&flow_id)
+            .is_some_and(|current| Arc::ptr_eq(&current.state, &task.state))
+        {
+            let Some((removed_task, removed_shutdown_tx)) = self.remove(flow_id) else {
+                return (None, None);
+            };
+            (Some(removed_task), removed_shutdown_tx)
+        } else {
+            (None, None)
+        }
+    }
+}
+
 impl BatchingEngine {
    pub fn new(
        frontend_client: Arc<FrontendClient>,
@@ -82,8 +126,7 @@ impl BatchingEngine {
        batch_opts: BatchingModeOptions,
    ) -> Self {
        Self {
-            tasks: Default::default(),
-            shutdown_txs: Default::default(),
+            runtime: Default::default(),
            frontend_client,
            flow_metadata_manager,
            table_meta,
@@ -95,8 +138,9 @@ impl BatchingEngine {

    /// Returns last execution timestamps (millisecond) for all batching flows.
    pub async fn get_last_exec_time_map(&self) -> BTreeMap<FlowId, i64> {
-        let tasks = self.tasks.read().await;
-        tasks
+        let runtime = self.runtime.read().await;
+        runtime
+            .tasks
            .iter()
            .filter_map(|(flow_id, task)| {
                task.last_execution_time_millis()
@@ -151,10 +195,17 @@ impl BatchingEngine {

        let group_by_table_name = Arc::new(group_by_table_name);

+        let tasks = self
+            .runtime
+            .read()
+            .await
+            .tasks
+            .values()
+            .cloned()
+            .collect::<Vec<_>>();
        let mut handles = Vec::new();
-        let tasks = self.tasks.read().await;

-        for (_flow_id, task) in tasks.iter() {
+        for task in tasks {
            let src_table_names = &task.config.source_table_names;

            if src_table_names
@@ -204,7 +255,6 @@ impl BatchingEngine {
            });
            handles.push(handle);
        }
-        drop(tasks);
        for handle in handles {
            match handle.await {
                Err(e) => {
@@ -274,9 +324,16 @@ impl BatchingEngine {

        let group_by_table_name = Arc::new(group_by_table_name);

+        let tasks = self
+            .runtime
+            .read()
+            .await
+            .tasks
+            .values()
+            .cloned()
+            .collect::<Vec<_>>();
        let mut handles = Vec::new();
-        let tasks = self.tasks.read().await;
-        for (_flow_id, task) in tasks.iter() {
+        for task in tasks {
            let src_table_names = &task.config.source_table_names;

            if src_table_names
@@ -327,8 +384,6 @@ impl BatchingEngine {
                }
            }
        }
-        drop(tasks);
-
        Ok(())
    }
 }
@@ -390,7 +445,7 @@ impl BatchingEngine {

        // or replace logic
        {
-            let is_exist = self.tasks.read().await.contains_key(&flow_id);
+            let is_exist = self.runtime.read().await.tasks.contains_key(&flow_id);
            match (create_if_not_exists, or_replace, is_exist) {
                // if replace, ignore that old flow exists
                (_, true, true) => {
@@ -521,17 +576,60 @@ impl BatchingEngine {
        // check execute once first to detect any error early
        task.check_or_create_sink_table(&engine, &frontend).await?;

+        let (start_tx, start_rx) = oneshot::channel();
+
        // TODO(discord9): use time wheel or what for better
        let handle = common_runtime::spawn_global(async move {
-            task_inner.start_executing_loop(engine, frontend).await;
+            if start_rx.await.is_ok() {
+                task_inner.start_executing_loop(engine, frontend).await;
+            }
        });
        task.state.write().unwrap().task_handle = Some(handle);
+        let task_for_rollback = task.clone();

-        // only replace here not earlier because we want the old one intact if something went wrong before this line
-        let replaced_old_task_opt = self.tasks.write().await.insert(flow_id, task);
-        drop(replaced_old_task_opt);
+        // Only replace here, not earlier, because we want the old one intact if
+        // something went wrong before this line. Keep the task and shutdown
+        // sender in one registry lock so create/remove can't observe one
+        // without the other.
+        let (replaced_old_task_opt, replaced_old_shutdown_tx) = {
+            let mut runtime = self.runtime.write().await;

-        self.shutdown_txs.write().await.insert(flow_id, tx);
+            let is_exist = runtime.tasks.contains_key(&flow_id);
+            match (create_if_not_exists, or_replace, is_exist) {
+                (_, true, true) => {
+                    info!(
+                        "Replacing flow with id={} after final registry check",
+                        flow_id
+                    );
+                }
+                (false, false, true) => {
+                    abort_flow_task(flow_id, Some(task), "unregistered");
+                    return FlowAlreadyExistSnafu { id: flow_id }.fail();
+                }
+                (true, false, true) => {
+                    info!(
+                        "Flow with id={} already exists at final registry check, do nothing",
+                        flow_id
+                    );
+                    abort_flow_task(flow_id, Some(task), "unregistered");
+                    return Ok(None);
+                }
+                (_, _, false) => (),
+            }
+
+            runtime.insert(flow_id, task, tx)
+        };
+
+        notify_flow_shutdown(flow_id, replaced_old_shutdown_tx, "replaced");
+        abort_flow_task(flow_id, replaced_old_task_opt, "replaced");
+        if start_tx.send(()).is_err() {
+            self.rollback_flow_runtime_if_current(flow_id, &task_for_rollback)
+                .await;
+            UnexpectedSnafu {
+                reason: format!("Failed to start flow {flow_id} due to task already dropped"),
+            }
+            .fail()?;
+        }

        Ok(Some(flow_id))
    }
@@ -662,21 +760,25 @@ impl BatchingEngine {
    }

    pub async fn remove_flow_inner(&self, flow_id: FlowId) -> Result<(), Error> {
-        if self.tasks.write().await.remove(&flow_id).is_none() {
-            warn!("Flow {flow_id} not found in tasks");
-            FlowNotFoundSnafu { id: flow_id }.fail()?;
-        }
-        let Some(tx) = self.shutdown_txs.write().await.remove(&flow_id) else {
+        let (task, shutdown_tx) = {
+            let mut runtime = self.runtime.write().await;
+            let Some((task, shutdown_tx)) = runtime.remove(flow_id) else {
+                warn!("Flow {flow_id} not found in tasks");
+                FlowNotFoundSnafu { id: flow_id }.fail()?
+            };
+            (task, shutdown_tx)
+        };
+
+        let had_shutdown_tx = notify_flow_shutdown(flow_id, shutdown_tx, "removed");
+        abort_flow_task(flow_id, Some(task), "removed");
+
+        if !had_shutdown_tx {
            UnexpectedSnafu {
                reason: format!("Can't found shutdown tx for flow {flow_id}"),
            }
            .fail()?
-        };
-        if tx.send(()).is_err() {
-            warn!(
-                "Fail to shutdown flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?"
-            )
        }
+
        Ok(())
    }

@@ -688,7 +790,7 @@ impl BatchingEngine {
        // this is only useful for the case when we are flushing the flow right after inserting data into it
        // TODO(discord9): find a better way to ensure the data is ready, maybe inform flownode from frontend?
        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
-        let task = self.tasks.read().await.get(&flow_id).cloned();
+        let task = self.runtime.read().await.tasks.get(&flow_id).cloned();
        let task = task.with_context(|| FlowNotFoundSnafu { id: flow_id })?;

        let time_window_size = task
@@ -713,7 +815,7 @@ impl BatchingEngine {
            )
            .await?;

-        let affected_rows = res.map(|(r, _)| r).unwrap_or_default() as usize;
+        let affected_rows = res.map(|(r, _)| r).unwrap_or_default();
        debug!(
            "Successfully flush flow {flow_id}, affected rows={}",
            affected_rows
@@ -723,8 +825,46 @@ impl BatchingEngine {

    /// Determine if the batching mode flow task exists with given flow id
    pub async fn flow_exist_inner(&self, flow_id: FlowId) -> bool {
-        self.tasks.read().await.contains_key(&flow_id)
+        self.runtime.read().await.tasks.contains_key(&flow_id)
    }
+
+    /// Rolls back a registration: removes `task` if it is still the current
+    /// entry for `flow_id`, then notifies its shutdown channel and aborts it.
+    async fn rollback_flow_runtime_if_current(&self, flow_id: FlowId, task: &BatchingTask) {
+        // The write guard is a statement temporary, so it is gone before the calls below.
+        let (stale_task, stale_shutdown_tx) =
+            self.runtime.write().await.remove_if_current(flow_id, task);
+        notify_flow_shutdown(flow_id, stale_shutdown_tx, "rolled back");
+        abort_flow_task(flow_id, stale_task, "rolled back");
+    }
+}
+
+/// Fires the shutdown signal for `flow_id` when a sender is available.
+///
+/// The result says whether `tx` was present, not whether delivery succeeded:
+/// a dropped receiver is only logged, with `action` naming the operation.
+fn notify_flow_shutdown(flow_id: FlowId, tx: Option<oneshot::Sender<()>>, action: &str) -> bool {
+    let delivery = tx.map(|sender| sender.send(()));
+    if matches!(delivery, Some(Err(_))) {
+        warn!(
+            "Fail to shutdown {action} flow {flow_id} due to receiver already dropped, maybe flow {flow_id} is already dropped?"
+        );
+    }
+    delivery.is_some()
+}
+
+/// Aborts the task handle stored in `task`'s state; `true` only if one was taken.
+fn abort_flow_task(flow_id: FlowId, task: Option<BatchingTask>, action: &str) -> bool {
+    let handle = task
+        .as_ref()
+        .and_then(|task| task.state.write().unwrap().task_handle.take());
+    let Some(handle) = handle else {
+        return false;
+    };
+
+    handle.abort();
+    debug!("Aborted {action} flow task {flow_id}");
+    true
 }

 impl FlowEngine for BatchingEngine {
@@ -741,7 +881,14 @@ impl FlowEngine for BatchingEngine {
        Ok(self.flow_exist_inner(flow_id).await)
    }
    async fn list_flows(&self) -> Result<impl IntoIterator<Item = FlowId>, Error> {
-        Ok(self.tasks.read().await.keys().cloned().collect::<Vec<_>>())
+        Ok(self
+            .runtime
+            .read()
+            .await
+            .tasks
+            .keys()
+            .cloned()
+            .collect::<Vec<_>>())
    }
    async fn handle_flow_inserts(
        &self,
@@ -756,3 +903,241 @@ impl FlowEngine for BatchingEngine {
        self.handle_mark_dirty_time_window(req).await
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use catalog::memory::new_memory_catalog_manager;
+    use common_meta::key::TableMetadataManager;
+    use common_meta::key::flow::FlowMetadataManager;
+    use common_meta::kv_backend::memory::MemoryKvBackend;
+    use query::options::QueryOptions;
+    use session::context::QueryContext;
+
+    use super::*;
+    use crate::test_utils::create_test_query_engine;
+
+    /// Guard that reports its own drop by sending on the wrapped channel.
+    struct DropNotify(Option<oneshot::Sender<()>>);
+
+    impl Drop for DropNotify {
+        fn drop(&mut self) {
+            // The receiver may already be gone; a failed send is ignored.
+            let _ = self.0.take().map(|tx| tx.send(()));
+        }
+    }
+
+    async fn new_test_engine() -> BatchingEngine {
+        let kv_backend = Arc::new(MemoryKvBackend::new());
+        let table_meta = Arc::new(TableMetadataManager::new(kv_backend.clone()));
+        table_meta.init().await.unwrap();
+        let flow_meta = Arc::new(FlowMetadataManager::new(kv_backend));
+        let catalog_manager = new_memory_catalog_manager().unwrap();
+        let query_engine = create_test_query_engine();
+        let (frontend_client, _handler) =
+            FrontendClient::from_empty_grpc_handler(QueryOptions::default());
+
+        BatchingEngine::new(
+            Arc::new(frontend_client),
+            query_engine,
+            flow_meta,
+            table_meta,
+            catalog_manager,
+            BatchingModeOptions::default(),
+        )
+    }
+
+    async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) {
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+        let plan = sql_to_df_plan(
+            ctx.clone(),
+            query_engine.clone(),
+            "SELECT number, ts FROM numbers_with_ts",
+            true,
+        )
+        .await
+        .unwrap();
+        let (tx, rx) = oneshot::channel();
+
+        let task = BatchingTask::try_new(TaskArgs {
+            flow_id,
+            query: "SELECT number, ts FROM numbers_with_ts",
+            plan,
+            time_window_expr: None,
+            expire_after: None,
+            sink_table_name: [
+                "greptime".to_string(),
+                "public".to_string(),
+                "sink".to_string(),
+            ],
+            source_table_names: vec![[
+                "greptime".to_string(),
+                "public".to_string(),
+                "numbers_with_ts".to_string(),
+            ]],
+            query_ctx: ctx,
+            catalog_manager: query_engine.engine_state().catalog_manager().clone(),
+            shutdown_rx: rx,
+            batch_opts: Arc::new(BatchingModeOptions::default()),
+            flow_eval_interval: None,
+        })
+        .unwrap();
+
+        (task, tx)
+    }
+
+    async fn install_abort_observed_handle(task: &BatchingTask) -> oneshot::Receiver<()> {
+        let (drop_tx, drop_rx) = oneshot::channel();
+        let (entered_tx, entered_rx) = oneshot::channel();
+        let handle = tokio::spawn(async move {
+            let _guard = DropNotify(Some(drop_tx));
+            let _ = entered_tx.send(());
+            std::future::pending::<()>().await;
+        });
+        task.state.write().unwrap().task_handle = Some(handle);
+        tokio::time::timeout(Duration::from_secs(1), entered_rx)
+            .await
+            .expect("test task handle should start")
+            .expect("test task handle should report start");
+        drop_rx
+    }
+
+    #[tokio::test]
+    async fn test_notify_flow_shutdown_sends_signal() {
+        let (tx, rx) = oneshot::channel();
+
+        assert!(notify_flow_shutdown(42, Some(tx), "test"));
+
+        rx.await.expect("replaced flow should receive shutdown");
+    }
+
+    #[test]
+    fn test_notify_flow_shutdown_accepts_missing_sender() {
+        assert!(!notify_flow_shutdown(42, None, "test"));
+    }
+
+    #[tokio::test]
+    async fn test_abort_flow_task_aborts_handle() {
+        let (task, _shutdown_tx) = new_test_task(42).await;
+        let drop_rx = install_abort_observed_handle(&task).await;
+
+        assert!(abort_flow_task(42, Some(task), "test"));
+
+        tokio::time::timeout(Duration::from_secs(1), drop_rx)
+            .await
+            .expect("aborted task should be dropped")
+            .expect("drop notifier should fire");
+    }
+
+    #[tokio::test]
+    async fn test_remove_flow_inner_aborts_registered_task() {
+        let engine = new_test_engine().await;
+        let (task, shutdown_tx) = new_test_task(42).await;
+        let drop_rx = install_abort_observed_handle(&task).await;
+
+        engine.runtime.write().await.insert(42, task, shutdown_tx);
+
+        engine.remove_flow_inner(42).await.unwrap();
+
+        tokio::time::timeout(Duration::from_secs(1), drop_rx)
+            .await
+            .expect("removed task should be dropped")
+            .expect("drop notifier should fire");
+        assert!(!engine.flow_exist_inner(42).await);
+        assert!(!engine.runtime.read().await.shutdown_txs.contains_key(&42));
+    }
+
+    #[tokio::test]
+    async fn test_or_replace_flow_runtime_replaces_old_handles_and_keeps_new_task() {
+        let engine = new_test_engine().await;
+        let (old_task, old_shutdown_tx) = new_test_task(42).await;
+        let old_task_identity = old_task.clone();
+        let old_drop_rx = install_abort_observed_handle(&old_task).await;
+        let (new_task, new_shutdown_tx) = new_test_task(42).await;
+        let new_task_identity = new_task.clone();
+
+        engine
+            .runtime
+            .write()
+            .await
+            .insert(42, old_task, old_shutdown_tx);
+        let (replaced_old_task, replaced_old_shutdown_tx) =
+            engine
+                .runtime
+                .write()
+                .await
+                .insert(42, new_task, new_shutdown_tx);
+
+        let replaced_old_task = replaced_old_task.expect("old task should be returned");
+        assert!(Arc::ptr_eq(
+            &replaced_old_task.state,
+            &old_task_identity.state
+        ));
+        assert!(notify_flow_shutdown(
+            42,
+            replaced_old_shutdown_tx,
+            "replaced"
+        ));
+        old_task_identity
+            .state
+            .write()
+            .unwrap()
+            .shutdown_rx
+            .try_recv()
+            .expect("old shutdown receiver should receive signal");
+        assert!(abort_flow_task(42, Some(replaced_old_task), "replaced"));
+
+        tokio::time::timeout(Duration::from_secs(1), old_drop_rx)
+            .await
+            .expect("replaced task should be dropped")
+            .expect("drop notifier should fire");
+
+        let runtime = engine.runtime.read().await;
+        assert_eq!(1, runtime.tasks.len());
+        assert_eq!(1, runtime.shutdown_txs.len());
+        let registered_task = runtime.tasks.get(&42).expect("new task should remain");
+        assert!(Arc::ptr_eq(
+            &registered_task.state,
+            &new_task_identity.state
+        ));
+        assert!(runtime.shutdown_txs.contains_key(&42));
+        assert!(matches!(
+            new_task_identity
+                .state
+                .write()
+                .unwrap()
+                .shutdown_rx
+                .try_recv(),
+            Err(oneshot::error::TryRecvError::Empty)
+        ));
+    }
+
+    #[tokio::test]
+    async fn test_rollback_flow_runtime_if_current_removes_matching_task_only() {
+        let engine = new_test_engine().await;
+        let (old_task, _old_shutdown_tx) = new_test_task(42).await;
+        let (current_task, current_shutdown_tx) = new_test_task(42).await;
+        let current_task_identity = current_task.clone();
+
+        engine
+            .runtime
+            .write()
+            .await
+            .insert(42, current_task, current_shutdown_tx);
+
+        engine.rollback_flow_runtime_if_current(42, &old_task).await;
+
+        let registered_task = engine.runtime.read().await.tasks.get(&42).cloned().unwrap();
+        assert!(Arc::ptr_eq(
+            &registered_task.state,
+            &current_task_identity.state
+        ));
+        assert!(engine.runtime.read().await.shutdown_txs.contains_key(&42));
+
+        engine
+            .rollback_flow_runtime_if_current(42, &current_task_identity)
+            .await;
+        assert!(!engine.flow_exist_inner(42).await);
+        assert!(!engine.runtime.read().await.shutdown_txs.contains_key(&42));
+    }
+}
--- a/src/flow/src/batching_mode/frontend_client.rs
+++ b/src/flow/src/batching_mode/frontend_client.rs
@@ -20,15 +20,17 @@ use std::sync::{Arc, Mutex, Weak};
 use api::v1::greptime_request::Request;
 use api::v1::query_request::Query;
 use api::v1::{CreateTableExpr, QueryRequest};
-use client::{Client, Database};
+use client::{Client, Database, OutputWithMetrics};
 use common_error::ext::BoxedError;
 use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_client_tls_config};
 use common_meta::peer::{Peer, PeerDiscovery};
-use common_query::Output;
+use common_query::{Output, OutputData};
+use common_recordbatch::adapter::{RecordBatchMetrics, RegionWatermarkEntry};
 use common_telemetry::warn;
 use meta_client::client::MetaClient;
 use query::datafusion::QUERY_PARALLELISM_HINT;
-use query::options::QueryOptions;
+use query::metrics::terminal_recordbatch_metrics_from_plan;
+use query::options::{FlowQueryExtensions, QueryOptions};
 use rand::rng;
 use rand::seq::SliceRandom;
 use servers::query_handler::grpc::GrpcQueryHandler;
@@ -196,9 +198,6 @@ impl DatabaseWithPeer {
 }

 impl FrontendClient {
-    // TODO: support more fine-grained load balancing strategies for frontend
-    // selection, such as AZ (availability zone) awareness, to prefer frontends
-    // in the same zone as the flownode and reduce cross-AZ latency.
    /// scan for available frontend from metadata
    pub(crate) async fn scan_for_frontend(&self) -> Result<Vec<Peer>, Error> {
        let Self::Distributed { meta_client, .. } = self else {
@@ -341,6 +340,83 @@ impl FrontendClient {
        }
    }

+    /// Sends `request` to a frontend and returns the output with its metrics.
+    ///
+    /// `extensions` are validated up front and forwarded with the query.
+    /// `peer_desc` is set as soon as the serving peer is known.
+    pub(crate) async fn query_with_terminal_metrics(
+        &self,
+        catalog: &str,
+        schema: &str,
+        request: QueryRequest,
+        extensions: &[(&str, &str)],
+        peer_desc: &mut Option<PeerDesc>,
+    ) -> Result<OutputWithMetrics, Error> {
+        let flow_extensions = build_flow_extensions(extensions)?;
+        match self {
+            FrontendClient::Distributed {
+                query, batch_opts, ..
+            } => {
+                let parallelism = query.parallelism.to_string();
+                let hints = vec![
+                    (QUERY_PARALLELISM_HINT, parallelism.as_str()),
+                    (READ_PREFERENCE_HINT, batch_opts.read_preference.as_ref()),
+                ];
+                let db = self.get_random_active_frontend(catalog, schema).await?;
+                *peer_desc = Some(PeerDesc::Dist {
+                    peer: db.peer.clone(),
+                });
+                db.database
+                    .query_with_terminal_metrics_and_flow_extensions(request, &hints, extensions)
+                    .await
+                    .map_err(BoxedError::new)
+                    .context(ExternalSnafu)
+            }
+            FrontendClient::Standalone {
+                database_client,
+                query,
+            } => {
+                *peer_desc = Some(PeerDesc::Standalone);
+                let mut ctx_extensions = HashMap::from([(
+                    QUERY_PARALLELISM_HINT.to_string(),
+                    query.parallelism.to_string(),
+                )]);
+                for &(key, value) in extensions {
+                    ctx_extensions.insert(key.to_string(), value.to_string());
+                }
+                let ctx = QueryContextBuilder::default()
+                    .current_catalog(catalog.to_string())
+                    .current_schema(schema.to_string())
+                    .extensions(ctx_extensions)
+                    .build();
+                let ctx = Arc::new(ctx);
+                let handler = database_client
+                    .handler
+                    .lock()
+                    .map_err(|e| {
+                        UnexpectedSnafu {
+                            reason: format!("Failed to lock database client: {e}"),
+                        }
+                        .build()
+                    })?
+                    .as_ref()
+                    .context(UnexpectedSnafu {
+                        reason: "Standalone's frontend instance is not set",
+                    })?
+                    .upgrade()
+                    .context(UnexpectedSnafu {
+                        reason: "Failed to upgrade database client",
+                    })?;
+                let output = handler
+                    .do_query(Request::Query(request), ctx.clone())
+                    .await
+                    .map_err(BoxedError::new)
+                    .context(ExternalSnafu)?;
+                Ok(wrap_standalone_output_with_terminal_metrics(output, &flow_extensions, &ctx))
+            }
+        }
+    }
+
    /// Handle a request to frontend
    pub(crate) async fn handle(
        &self,
@@ -426,22 +502,83 @@ impl FrontendClient {
    }
 }

+/// Parses `extensions` as flow query extensions; a missing result becomes the default value.
+fn build_flow_extensions(extensions: &[(&str, &str)]) -> Result<FlowQueryExtensions, Error> {
+    let raw: HashMap<String, String> = extensions
+        .iter()
+        .map(|&(key, value)| (key.to_string(), value.to_string()))
+        .collect();
+    let parsed = FlowQueryExtensions::parse_flow_extensions(&raw)
+        .map_err(BoxedError::new)
+        .context(ExternalSnafu)?;
+    Ok(parsed.unwrap_or_default())
+}
+
+/// Wraps `output`; non-stream results get watermark metrics if the extensions request them.
+fn wrap_standalone_output_with_terminal_metrics(
+    output: Output,
+    flow_extensions: &FlowQueryExtensions,
+    query_ctx: &QueryContextRef,
+) -> OutputWithMetrics {
+    if !flow_extensions.should_collect_region_watermark()
+        || matches!(&output.data, OutputData::Stream(_))
+    {
+        return OutputWithMetrics::from_output(output);
+    }
+    let terminal_metrics = output
+        .meta
+        .plan
+        .clone()
+        .and_then(terminal_recordbatch_metrics_from_plan)
+        .or_else(|| terminal_recordbatch_metrics_from_snapshots(query_ctx));
+    let wrapped = OutputWithMetrics::from_output(output);
+    if terminal_metrics.is_some() {
+        wrapped.metrics.update(terminal_metrics);
+    }
+    wrapped
+}
+
+/// Builds metrics that carry one watermark entry per snapshot bound in
+/// `query_ctx`, ordered by region id; `None` when no snapshot is bound.
+fn terminal_recordbatch_metrics_from_snapshots(
+    query_ctx: &QueryContextRef,
+) -> Option<RecordBatchMetrics> {
+    let mut entries: Vec<RegionWatermarkEntry> = Vec::new();
+    for (region_id, watermark) in query_ctx.snapshots() {
+        entries.push(RegionWatermarkEntry {
+            region_id,
+            watermark: Some(watermark),
+        });
+    }
+    if entries.is_empty() {
+        return None;
+    }
+    entries.sort_by_key(|entry| entry.region_id);
+    Some(RecordBatchMetrics {
+        region_watermarks: entries,
+        ..Default::default()
+    })
+}
+
 /// Describe a peer of frontend
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) enum PeerDesc {
+    /// The query failed before a frontend peer was selected.
+    #[default]
+    Unknown,
    /// Distributed mode's frontend peer address
    Dist {
        /// frontend peer address
        peer: Peer,
    },
    /// Standalone mode
-    #[default]
    Standalone,
 }

 impl std::fmt::Display for PeerDesc {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
+            PeerDesc::Unknown => write!(f, "unknown"),
            PeerDesc::Dist { peer } => write!(f, "{}", peer.addr),
            PeerDesc::Standalone => write!(f, "standalone"),
        }
@@ -450,9 +587,17 @@ impl std::fmt::Display for PeerDesc {

 #[cfg(test)]
 mod tests {
+    use std::pin::Pin;
+    use std::task::{Context, Poll};
    use std::time::Duration;

-    use common_query::Output;
+    use common_query::{Output, OutputData};
+    use common_recordbatch::adapter::RecordBatchMetrics;
+    use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream};
+    use datatypes::prelude::{ConcreteDataType, VectorRef};
+    use datatypes::schema::{ColumnSchema, Schema};
+    use datatypes::vectors::Int32Vector;
+    use futures::StreamExt;
    use tokio::time::timeout;

    use super::*;
@@ -460,6 +605,58 @@ mod tests {
    #[derive(Debug)]
    struct NoopHandler;

+    struct MockMetricsStream {
+        schema: datatypes::schema::SchemaRef,
+        batch: Option<RecordBatch>,
+        metrics: RecordBatchMetrics,
+        terminal_metrics_only: bool,
+    }
+
+    impl futures::Stream for MockMetricsStream {
+        type Item = common_recordbatch::error::Result<RecordBatch>;
+
+        /// Yields the stored batch once, then reports the end of the stream.
+        fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+            Poll::Ready(self.batch.take().map(Ok))
+        }
+
+        /// Exact hint: one item while the batch is still stored, zero afterwards.
+        fn size_hint(&self) -> (usize, Option<usize>) {
+            let remaining = usize::from(self.batch.is_some());
+            (remaining, Some(remaining))
+        }
+    }
+
+    impl RecordBatchStream for MockMetricsStream {
+        fn name(&self) -> &str {
+            "MockMetricsStream"
+        }
+
+        fn schema(&self) -> datatypes::schema::SchemaRef {
+            self.schema.clone()
+        }
+
+        fn output_ordering(&self) -> Option<&[OrderOption]> {
+            None
+        }
+
+        /// Reports the configured metrics; with `terminal_metrics_only` set they
+        /// are withheld (`None`) for as long as the batch has not been taken.
+        fn metrics(&self) -> Option<RecordBatchMetrics> {
+            let hidden = self.terminal_metrics_only && self.batch.is_some();
+            (!hidden).then(|| self.metrics.clone())
+        }
+    }
+
+    #[derive(Debug)]
+    struct MetricsHandler;
+
+    #[derive(Debug)]
+    struct ExtensionAwareHandler;
+
+    #[derive(Debug)]
+    struct SnapshotBindingHandler;
+
    #[async_trait::async_trait]
    impl GrpcQueryHandlerWithBoxedError for NoopHandler {
        async fn do_query(
@@ -471,6 +668,63 @@ mod tests {
        }
    }

+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for MetricsHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            _ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
+                "v",
+                ConcreteDataType::int32_datatype(),
+                false,
+            )]));
+            let batch = RecordBatch::new(
+                schema.clone(),
+                vec![Arc::new(Int32Vector::from_slice([1, 2])) as VectorRef],
+            )
+            .unwrap();
+            Ok(Output::new_with_stream(Box::pin(MockMetricsStream {
+                schema,
+                batch: Some(batch),
+                metrics: RecordBatchMetrics {
+                    region_watermarks: vec![common_recordbatch::adapter::RegionWatermarkEntry {
+                        region_id: 42,
+                        watermark: Some(99),
+                    }],
+                    ..Default::default()
+                },
+                terminal_metrics_only: true,
+            })))
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for ExtensionAwareHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            assert_eq!(ctx.extension("flow.return_region_seq"), Some("true"));
+            Ok(Output::new_with_affected_rows(1))
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl GrpcQueryHandlerWithBoxedError for SnapshotBindingHandler {
+        async fn do_query(
+            &self,
+            _query: Request,
+            ctx: QueryContextRef,
+        ) -> std::result::Result<Output, BoxedError> {
+            assert_eq!(ctx.extension("flow.return_region_seq"), Some("true"));
+            ctx.set_snapshot(42, 99);
+            Ok(Output::new_with_affected_rows(1))
+        }
+    }
+
    #[tokio::test]
    async fn wait_initialized() {
        let (client, handler_mut) =
@@ -516,4 +770,117 @@ mod tests {
                .is_ok()
        );
    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_tracks_watermark_in_standalone_mode() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(MetricsHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let result = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("select 1".to_string())),
+                },
+                &[],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap();
+        assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+        let terminal_metrics = result.metrics.clone();
+        assert!(!result.metrics.is_ready());
+        assert!(terminal_metrics.get().is_none());
+
+        let OutputData::Stream(mut stream) = result.output.data else {
+            panic!("expected stream output");
+        };
+        while stream.next().await.is_some() {}
+
+        assert!(terminal_metrics.is_ready());
+        assert_eq!(
+            terminal_metrics.region_watermark_map(),
+            Some(HashMap::from([(42_u64, 99_u64)]))
+        );
+    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_forwards_flow_extensions_in_standalone_mode() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(ExtensionAwareHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let result = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("insert into t select 1".to_string())),
+                },
+                &[("flow.return_region_seq", "true")],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap();
+        assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+        assert!(result.metrics.is_ready());
+        assert!(result.region_watermark_map().is_none());
+    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_uses_standalone_snapshot_bounds() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(SnapshotBindingHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let result = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("insert into t select * from src".to_string())),
+                },
+                &[("flow.return_region_seq", "true")],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap();
+        assert!(matches!(peer_desc, Some(PeerDesc::Standalone)));
+
+        assert!(result.metrics.is_ready());
+        assert_eq!(
+            result.region_watermark_map(),
+            Some(HashMap::from([(42, 99)]))
+        );
+    }
+
+    #[tokio::test]
+    async fn test_query_with_terminal_metrics_rejects_invalid_flow_extensions() {
+        let handler: Arc<dyn GrpcQueryHandlerWithBoxedError> = Arc::new(NoopHandler);
+        let client =
+            FrontendClient::from_grpc_handler(Arc::downgrade(&handler), QueryOptions::default());
+        let mut peer_desc = None;
+
+        let err = client
+            .query_with_terminal_metrics(
+                "greptime",
+                "public",
+                QueryRequest {
+                    query: Some(Query::Sql("select 1".to_string())),
+                },
+                &[("flow.return_region_seq", "not-a-bool")],
+                &mut peer_desc,
+            )
+            .await
+            .unwrap_err();
+
+        assert!(format!("{err:?}").contains("Invalid value for flow.return_region_seq"));
+    }
 }
--- a/src/flow/src/batching_mode/incremental_filter.rs
+++ b/src/flow/src/batching_mode/incremental_filter.rs
@@ -0,0 +1,245 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use common_telemetry::tracing::debug;
+use datafusion_expr::Expr;
+use datatypes::schema::Schema;
+
+use crate::batching_mode::state::FilterExprInfo;
+use crate::batching_mode::utils::IncrementalAggregateAnalysis;
+use crate::{Error, FlowId};
+
+/// Builds a predicate that limits a sink-table column to the dirty time ranges
+/// recorded in `dirty_filter`.
+///
+/// Returns `Ok(None)` when `dirty_filter` is `None`, or when
+/// `infer_sink_time_window_filter_col` cannot pick a single sink column.
+/// Otherwise the result is whatever `FilterExprInfo::predicate_for_col` yields
+/// for the inferred column, including its error.
+pub(super) fn build_sink_dirty_time_window_filter_expr(
+    flow_id: FlowId,
+    analysis: &IncrementalAggregateAnalysis,
+    sink_schema: &Schema,
+    dirty_filter: Option<&FilterExprInfo>,
+) -> Result<Option<Expr>, Error> {
+    let Some(dirty_filter) = dirty_filter else {
+        return Ok(None);
+    };
+
+    let Some(sink_filter_col) =
+        infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, dirty_filter)
+    else {
+        return Ok(None);
+    };
+
+    dirty_filter.predicate_for_col(&sink_filter_col)
+}
+
+/// Chooses the sink-table column that the dirty-window predicate should target.
+///
+/// A column qualifies only if its name appears in `analysis.group_key_names` and
+/// `sink_schema` has a timestamp-typed column with that name. The lookup proceeds
+/// in this order:
+///
+/// 1. No group keys at all: return `None`.
+/// 2. `dirty_filter.col_name` qualifies: return it.
+/// 3. Exactly one group key qualifies: return that key.
+/// 4. Zero or several group keys qualify: log at debug level and return `None`.
+///
+/// `flow_id` is only used in the debug messages.
+fn infer_sink_time_window_filter_col(
+    flow_id: FlowId,
+    analysis: &IncrementalAggregateAnalysis,
+    sink_schema: &Schema,
+    dirty_filter: &FilterExprInfo,
+) -> Option<String> {
+    // Without group keys no column can qualify, so skip the schema lookups.
+    if analysis.group_key_names.is_empty() {
+        return None;
+    }
+
+    // True when `name` is both a group key and a timestamp column in the sink schema.
+    let is_timestamp_group_key = |name: &str| {
+        analysis.group_key_names.iter().any(|key| key == name)
+            && sink_schema
+                .column_schema_by_name(name)
+                .is_some_and(|col| col.data_type.is_timestamp())
+    };
+
+    // Prefer the dirty filter's own column when it qualifies.
+    if is_timestamp_group_key(&dirty_filter.col_name) {
+        return Some(dirty_filter.col_name.clone());
+    }
+
+    // Otherwise accept a group key only if it is the sole timestamp-typed one.
+    let candidates = analysis
+        .group_key_names
+        .iter()
+        .filter(|name| is_timestamp_group_key(name))
+        .cloned()
+        .collect::<Vec<_>>();
+
+    match candidates.as_slice() {
+        [name] => Some(name.clone()),
+        [] => {
+            debug!(
+                "Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}",
+                flow_id, analysis.group_key_names
+            );
+            None
+        }
+        _ => {
+            debug!(
+                "Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}",
+                flow_id, candidates
+            );
+            None
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use datatypes::prelude::ConcreteDataType;
+    use datatypes::schema::ColumnSchema;
+    use pretty_assertions::assert_eq;
+
+    use super::*;
+    use crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL;
+    use crate::batching_mode::state::FilterExprInfo;
+    use crate::batching_mode::utils::IncrementalAggregateAnalysis;
+
+    fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis {
+        IncrementalAggregateAnalysis {
+            group_key_names: group_key_names
+                .into_iter()
+                .map(|name| name.to_string())
+                .collect(),
+            merge_columns: vec![],
+            literal_columns: vec![],
+            output_field_names: vec![],
+            unsupported_exprs: vec![],
+        }
+    }
+
+    fn test_dirty_filter(col_name: &str) -> FilterExprInfo {
+        FilterExprInfo {
+            expr: datafusion_expr::col(col_name),
+            col_name: col_name.to_string(),
+            time_ranges: vec![],
+            window_size: chrono::Duration::seconds(1),
+        }
+    }
+
+    fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema {
+        Schema::new(
+            columns
+                .into_iter()
+                .map(|(name, data_type)| ColumnSchema::new(name, data_type, true))
+                .collect(),
+        )
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() {
+        let analysis = test_analysis_with_group_keys(vec!["ts", "host"]);
+        let sink_schema = test_sink_schema(vec![
+            ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
+            ("host", ConcreteDataType::string_datatype()),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            Some("ts".to_string()),
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() {
+        let analysis = test_analysis_with_group_keys(vec!["host", "time_window"]);
+        let sink_schema = test_sink_schema(vec![
+            ("host", ConcreteDataType::string_datatype()),
+            (
+                "time_window",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+            (
+                AUTO_CREATED_UPDATE_AT_TS_COL,
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            Some("time_window".to_string()),
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_skips_global_aggregate() {
+        let analysis = test_analysis_with_group_keys(vec![]);
+        let sink_schema = test_sink_schema(vec![
+            ("number", ConcreteDataType::uint32_datatype()),
+            (
+                "time_window",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            None,
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() {
+        let analysis = test_analysis_with_group_keys(vec!["host", "device"]);
+        let sink_schema = test_sink_schema(vec![
+            ("host", ConcreteDataType::string_datatype()),
+            ("device", ConcreteDataType::string_datatype()),
+            (
+                AUTO_CREATED_UPDATE_AT_TS_COL,
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("ts");
+
+        assert_eq!(
+            None,
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+
+    #[test]
+    fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() {
+        let analysis = test_analysis_with_group_keys(vec!["ts", "time_window"]);
+        let sink_schema = test_sink_schema(vec![
+            ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
+            (
+                "time_window",
+                ConcreteDataType::timestamp_millisecond_datatype(),
+            ),
+        ]);
+        let dirty_filter = test_dirty_filter("source_ts");
+
+        assert_eq!(
+            None,
+            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
+        );
+    }
+}
--- a/src/flow/src/batching_mode/state.rs
+++ b/src/flow/src/batching_mode/state.rs
@@ -13,8 +13,9 @@
 // limitations under the License.

 //! Batching mode task state, which changes frequently
+//! (for example its dirty time windows and per-region incremental checkpoints).

-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::time::Duration;

 use common_telemetry::debug;
@@ -49,6 +50,14 @@ pub struct TaskState {
    /// Dirty Time windows need to be updated
    /// mapping of `start -> end` and non-overlapping
    pub(crate) dirty_time_windows: DirtyTimeWindows,
+    checkpoint_mode: CheckpointMode,
+    /// Region id -> last consumed watermark sequence. Incremental scans use
+    /// this as the next lower sequence bound for each source region.
+    checkpoints: BTreeMap<u64, u64>,
+    /// Once set, the task will never attempt incremental mode again.
+    /// Set when the flow's query shape is deterministically incompatible
+    /// with incremental execution (e.g. unsupported aggregate expressions).
+    incremental_disabled: bool,
    exec_state: ExecState,
    /// Shutdown receiver
    pub(crate) shutdown_rx: oneshot::Receiver<()>,
@@ -63,6 +72,9 @@ impl TaskState {
            last_query_duration: Duration::from_secs(0),
            last_exec_time_millis: None,
            dirty_time_windows: Default::default(),
+            checkpoint_mode: CheckpointMode::FullSnapshot,
+            checkpoints: Default::default(),
+            incremental_disabled: false,
            exec_state: ExecState::Idle,
            shutdown_rx,
            task_handle: None,
@@ -84,6 +96,84 @@ impl TaskState {
        self.last_exec_time_millis
    }

+    pub fn checkpoint_mode(&self) -> CheckpointMode {
+        self.checkpoint_mode
+    }
+
+    pub fn checkpoints(&self) -> &BTreeMap<u64, u64> {
+        &self.checkpoints
+    }
+
+    pub fn is_incremental_disabled(&self) -> bool {
+        self.incremental_disabled
+    }
+
+    /// Permanently disable incremental mode for this task and
+    /// immediately fall back to full snapshot for the current cycle.
+    pub fn disable_incremental(&mut self) {
+        self.incremental_disabled = true;
+        self.mark_full_snapshot();
+    }
+
+    pub fn mark_full_snapshot(&mut self) {
+        self.checkpoint_mode = CheckpointMode::FullSnapshot;
+    }
+
+    pub fn advance_checkpoints(&mut self, watermark_map: HashMap<u64, u64>) {
+        self.checkpoints = watermark_map.into_iter().collect();
+        if !self.incremental_disabled {
+            self.checkpoint_mode = CheckpointMode::Incremental;
+        }
+    }
+
+    pub fn advance_incremental_checkpoints_with_participation(
+        &mut self,
+        participating_regions: &BTreeSet<u64>,
+        watermark_map: HashMap<u64, u64>,
+    ) {
+        for region_id in participating_regions {
+            if let Some(seq) = watermark_map.get(region_id) {
+                self.checkpoints.insert(*region_id, *seq);
+            }
+        }
+        if !self.incremental_disabled {
+            self.checkpoint_mode = CheckpointMode::Incremental;
+        }
+    }
+
+    pub fn can_advance_full_snapshot_checkpoints(
+        &self,
+        participating_regions: &BTreeSet<u64>,
+        watermark_map: &HashMap<u64, u64>,
+    ) -> bool {
+        !participating_regions.is_empty()
+            && participating_regions.len() == watermark_map.len()
+            && participating_regions
+                .iter()
+                .all(|region_id| watermark_map.contains_key(region_id))
+    }
+
+    pub fn can_advance_incremental_checkpoints_with_participation(
+        &self,
+        participating_regions: &BTreeSet<u64>,
+        watermark_map: &HashMap<u64, u64>,
+    ) -> bool {
+        !self.incremental_disabled
+            && !self.checkpoints.is_empty()
+            && !participating_regions.is_empty()
+            && participating_regions.len() == watermark_map.len()
+            && participating_regions
+                .iter()
+                .all(|region_id| self.checkpoints.contains_key(region_id))
+            && participating_regions.iter().all(|region_id| {
+                let checkpoint = self.checkpoints.get(region_id);
+                watermark_map
+                    .get(region_id)
+                    .zip(checkpoint)
+                    .is_some_and(|(seq, checkpoint)| seq >= checkpoint)
+            })
+    }
+
    /// Compute the next query delay based on the time window size or the last query duration.
    /// Aiming to avoid too frequent queries. But also not too long delay.
    ///
@@ -94,6 +184,10 @@ impl TaskState {
    /// if current the dirty time range is longer than one query can handle,
    /// execute immediately to faster clean up dirty time windows.
    ///
+    /// If `prefer_short_incremental_cadence` is true, run incremental queries
+    /// more often when there is no large dirty backlog. This only reduces the
+    /// chance of hitting a stale cursor after flush; it is not required for
+    /// correctness.
    pub fn get_next_start_query_time(
        &self,
        flow_id: FlowId,
@@ -101,6 +195,7 @@ impl TaskState {
        min_refresh_duration: Duration,
        max_timeout: Option<Duration>,
        max_filter_num_per_query: usize,
+        prefer_short_incremental_cadence: bool,
    ) -> Instant {
        // = last query duration, capped by [max(min_run_interval, time_window_size), max_timeout], note at most `max_timeout`
        let lower = time_window_size.unwrap_or(min_refresh_duration);
@@ -119,7 +214,20 @@ impl TaskState {
        // if dirty time range is more than one query can handle, execute immediately
        // to faster clean up dirty time windows
        if cur_dirty_window_size < max_query_update_range {
-            self.last_update_time + next_duration
+            if prefer_short_incremental_cadence {
+                // Run incremental queries sooner than the normal time-window
+                // cadence, while still backing off by at least the previous
+                // query duration and respecting the max-timeout cap.
+                let next_duration = self.last_query_duration.max(min_refresh_duration);
+                let next_duration = if let Some(max_timeout) = max_timeout {
+                    next_duration.min(max_timeout)
+                } else {
+                    next_duration
+                };
+                self.last_update_time + next_duration
+            } else {
+                self.last_update_time + next_duration
+            }
        } else {
            // if dirty time windows can't be clean up in one query, execute immediately to faster
            // clean up dirty time windows
@@ -199,12 +307,42 @@ impl DirtyTimeWindows {
    }

    pub fn add_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
-        self.windows.insert(start, end);
+        self.add_or_merge_window(start, end);
    }

    pub fn add_windows(&mut self, time_ranges: Vec<(Timestamp, Timestamp)>) {
        for (start, end) in time_ranges {
-            self.windows.insert(start, Some(end));
+            self.add_or_merge_window(start, Some(end));
+        }
+    }
+
+    /// Add all dirty markers from another dirty-window set.
+    pub fn add_dirty_windows(&mut self, dirty_windows: &DirtyTimeWindows) {
+        for (start, end) in &dirty_windows.windows {
+            self.add_or_merge_window(*start, *end);
+        }
+    }
+
+    fn add_or_merge_window(&mut self, start: Timestamp, end: Option<Timestamp>) {
+        self.windows
+            .entry(start)
+            .and_modify(|current_end| {
+                *current_end = Self::union_window_end(*current_end, end);
+            })
+            .or_insert(end);
+    }
+
+    fn union_window_end(
+        current_end: Option<Timestamp>,
+        incoming_end: Option<Timestamp>,
+    ) -> Option<Timestamp> {
+        match (current_end, incoming_end) {
+            (Some(current), Some(incoming)) => Some(current.max(incoming)),
+            // `None` is a dirty marker without a known upper bound.  When one
+            // side has a concrete end, keep it so merging a restored snapshot
+            // never shrinks an already-known dirty range with the same start.
+            (Some(end), None) | (None, Some(end)) => Some(end),
+            (None, None) => None,
        }
    }

@@ -216,7 +354,7 @@ impl DirtyTimeWindows {
    /// Set windows to be dirty, only useful for full aggr without time window
    /// to mark some new data is inserted
    pub fn set_dirty(&mut self) {
-        self.windows.insert(Timestamp::new_second(0), None);
+        self.add_or_merge_window(Timestamp::new_second(0), None);
    }

    /// Number of dirty windows.
@@ -283,7 +421,7 @@ impl DirtyTimeWindows {
        );
        self.merge_dirty_time_windows(window_size, expire_lower_bound)?;

-        if self.windows.len() > self.max_filter_num_per_query {
+        if self.windows.len() > window_cnt {
            let first_time_window = self.windows.first_key_value();
            let last_time_window = self.windows.last_key_value();

@@ -292,7 +430,7 @@ impl DirtyTimeWindows {
                    "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. Time window expr={:?}, expire_after={:?}, first_time_window={:?}, last_time_window={:?}, the original query: {:?}",
                    task_ctx.config.flow_id,
                    self.windows.len(),
-                    self.max_filter_num_per_query,
+                    window_cnt,
                    task_ctx.config.time_window_expr,
                    task_ctx.config.expire_after,
                    first_time_window,
@@ -304,7 +442,7 @@ impl DirtyTimeWindows {
                    "Flow id = {:?}, too many time windows: {}, only the first {} are taken for this query, the group by expression might be wrong. first_time_window={:?}, last_time_window={:?}",
                    flow_id,
                    self.windows.len(),
-                    self.max_filter_num_per_query,
+                    window_cnt,
                    first_time_window,
                    last_time_window
                )
@@ -559,6 +697,12 @@ enum ExecState {
    Executing,
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum CheckpointMode {
+    FullSnapshot,
+    Incremental,
+}
+
 /// Filter Expression's information
 #[derive(Debug, Clone)]
 pub struct FilterExprInfo {
@@ -576,6 +720,28 @@ impl FilterExprInfo {
                acc + end.sub(start).unwrap_or(chrono::Duration::zero())
            })
    }
+
+    pub fn predicate_for_col(
+        &self,
+        col_name: &str,
+    ) -> Result<Option<datafusion_expr::Expr>, Error> {
+        use datafusion_common::Column;
+        use datafusion_expr::{Expr, lit};
+
+        let mut expr_lst = Vec::with_capacity(self.time_ranges.len());
+        for (start, end) in &self.time_ranges {
+            let lower = to_df_literal(*start)?;
+            let upper = to_df_literal(*end)?;
+            let filter_col = || Expr::Column(Column::new_unqualified(col_name));
+            expr_lst.push(
+                filter_col()
+                    .gt_eq(lit(lower))
+                    .and(filter_col().lt(lit(upper))),
+            );
+        }
+
+        Ok(expr_lst.into_iter().reduce(|a, b| a.or(b)))
+    }
 }

 #[cfg(test)]
@@ -820,4 +986,370 @@ mod test {
            }
        }
    }
+
+    #[test]
+    fn test_task_state_checkpoint_mode_and_advancement() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert!(state.checkpoints().is_empty());
+
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+        );
+
+        state.mark_full_snapshot();
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+        );
+    }
+
+    #[test]
+    fn test_disable_incremental_persists_full_snapshot_mode() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+
+        assert!(!state.is_incremental_disabled());
+
+        // After disable, mode becomes FullSnapshot and flag is set.
+        state.disable_incremental();
+        assert!(state.is_incremental_disabled());
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+
+        // `advance_checkpoints` will NOT transition to Incremental when disabled.
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 10_u64), (2_u64, 20_u64)])
+        );
+
+        // `mark_full_snapshot` does not re-enable incremental.
+        state.mark_full_snapshot();
+        assert!(state.is_incremental_disabled());
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+    }
+
+    #[test]
+    fn test_full_snapshot_checkpoint_advancement_requires_participating_regions() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let state = TaskState::new(query_ctx, rx);
+
+        assert!(!state.can_advance_full_snapshot_checkpoints(&BTreeSet::new(), &HashMap::new()));
+        assert!(!state.can_advance_full_snapshot_checkpoints(
+            &BTreeSet::from([1_u64, 2_u64]),
+            &HashMap::from([(1_u64, 10_u64)]),
+        ));
+        assert!(state.can_advance_full_snapshot_checkpoints(
+            &BTreeSet::from([1_u64, 2_u64]),
+            &HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]),
+        ));
+    }
+
+    #[test]
+    fn test_incremental_checkpoint_advancement_requires_participation_alignment() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+        state.advance_checkpoints(HashMap::from([(1_u64, 10_u64), (2_u64, 20_u64)]));
+
+        assert!(
+            state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64]),
+                &HashMap::from([(1_u64, 11_u64)]),
+            )
+        );
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64, 2_u64]),
+                &HashMap::from([(1_u64, 11_u64)]),
+            )
+        );
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([3_u64]),
+                &HashMap::from([(3_u64, 11_u64)]),
+            )
+        );
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64]),
+                &HashMap::from([(1_u64, 9_u64)]),
+            )
+        );
+        assert!(
+            state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64, 2_u64]),
+                &HashMap::from([(1_u64, 11_u64), (2_u64, 21_u64)]),
+            )
+        );
+
+        state.disable_incremental();
+        assert!(
+            !state.can_advance_incremental_checkpoints_with_participation(
+                &BTreeSet::from([1_u64, 2_u64]),
+                &HashMap::from([(1_u64, 12_u64), (2_u64, 22_u64)]),
+            )
+        );
+    }
+
+    #[test]
+    fn test_incremental_checkpoint_advancement_merges_participating_subset() {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+        state.advance_checkpoints(HashMap::from([
+            (1_u64, 10_u64),
+            (2_u64, 20_u64),
+            (3_u64, 30_u64),
+        ]));
+
+        state.advance_incremental_checkpoints_with_participation(
+            &BTreeSet::from([1_u64, 3_u64]),
+            HashMap::from([(1_u64, 12_u64), (3_u64, 35_u64)]),
+        );
+
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::Incremental);
+        assert_eq!(
+            state.checkpoints(),
+            &BTreeMap::from([(1_u64, 12_u64), (2_u64, 20_u64), (3_u64, 35_u64)])
+        );
+    }
+
+    #[test]
+    fn test_filter_expr_info_predicate_for_col_empty_ranges() {
+        let filter = FilterExprInfo {
+            expr: datafusion_expr::col("ts"),
+            col_name: "ts".to_string(),
+            time_ranges: vec![],
+            window_size: chrono::Duration::seconds(1),
+        };
+
+        assert!(filter.predicate_for_col("time_window").unwrap().is_none());
+    }
+
+    #[test]
+    fn test_filter_expr_info_predicate_for_col_single_range() {
+        let filter = FilterExprInfo {
+            expr: datafusion_expr::col("ts"),
+            col_name: "ts".to_string(),
+            time_ranges: vec![(Timestamp::new_second(0), Timestamp::new_second(1))],
+            window_size: chrono::Duration::seconds(1),
+        };
+
+        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
+        let unparser = datafusion::sql::unparser::Unparser::default();
+        assert_eq!(
+            "((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP)))",
+            unparser.expr_to_sql(&predicate).unwrap().to_string()
+        );
+    }
+
+    #[test]
+    fn test_filter_expr_info_predicate_for_col_multiple_ranges() {
+        let filter = FilterExprInfo {
+            expr: datafusion_expr::col("ts"),
+            col_name: "ts".to_string(),
+            time_ranges: vec![
+                (Timestamp::new_second(0), Timestamp::new_second(1)),
+                (Timestamp::new_second(10), Timestamp::new_second(11)),
+            ],
+            window_size: chrono::Duration::seconds(1),
+        };
+
+        let predicate = filter.predicate_for_col("time_window").unwrap().unwrap();
+        let unparser = datafusion::sql::unparser::Unparser::default();
+        assert_eq!(
+            "(((time_window >= CAST('1970-01-01 00:00:00' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:01' AS TIMESTAMP))) OR ((time_window >= CAST('1970-01-01 00:00:10' AS TIMESTAMP)) AND (time_window < CAST('1970-01-01 00:00:11' AS TIMESTAMP))))",
+            unparser.expr_to_sql(&predicate).unwrap().to_string()
+        );
+    }
+
+    /// Helper: create a `TaskState` whose `last_update_time` is a known duration in the past.
+    fn state_with_past_update(age: Duration) -> TaskState {
+        let query_ctx = QueryContext::arc();
+        let (_tx, rx) = tokio::sync::oneshot::channel();
+        let mut state = TaskState::new(query_ctx, rx);
+        state.last_update_time = Instant::now() - age;
+        state
+    }
+
+    #[test]
+    fn test_short_incremental_cadence_uses_min_refresh() {
+        // When prefer_short_incremental_cadence is true and dirty backlog is manageable,
+        // the next start time should be last_update_time + min_refresh (short cadence),
+        // ignoring the longer time_window_size.
+        let state = state_with_past_update(Duration::from_secs(10));
+
+        let time_window_size = Some(Duration::from_secs(60)); // large window
+        let min_refresh = Duration::from_secs(5);
+        let flow_id = 1;
+
+        let result = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            None,
+            20,
+            true, // prefer_short_incremental_cadence
+        );
+
+        // With short cadence, result should be last_update_time + min_refresh.
+        let expected = state.last_update_time + min_refresh;
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_short_incremental_cadence_respects_last_query_duration() {
+        let mut state = state_with_past_update(Duration::from_secs(10));
+        state.last_query_duration = Duration::from_secs(20);
+
+        let time_window_size = Some(Duration::from_secs(60));
+        let min_refresh = Duration::from_secs(5);
+        let flow_id = 1;
+
+        let result = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            None,
+            20,
+            true,
+        );
+
+        assert_eq!(result, state.last_update_time + state.last_query_duration);
+    }
+
+    #[test]
+    fn test_short_incremental_cadence_respects_max_timeout() {
+        let mut state = state_with_past_update(Duration::from_secs(10));
+        state.last_query_duration = Duration::from_secs(20);
+
+        let time_window_size = Some(Duration::from_secs(60));
+        let min_refresh = Duration::from_secs(30);
+        let max_timeout = Duration::from_secs(5);
+        let flow_id = 1;
+
+        let result = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            Some(max_timeout),
+            20,
+            true,
+        );
+
+        assert_eq!(result, state.last_update_time + max_timeout);
+    }
+
+    #[test]
+    fn test_full_snapshot_ignores_short_cadence() {
+        // When prefer_short_incremental_cadence is false (full snapshot mode),
+        // the normal long-cadence based on time_window_size applies.
+        let mut state = state_with_past_update(Duration::from_secs(10));
+        // Make last_query_duration small so the lower bound (time_window_size) dominates.
+        state.last_query_duration = Duration::from_secs(1);
+
+        let time_window_size = Some(Duration::from_secs(60)); // large window
+        let min_refresh = Duration::from_secs(5);
+        let flow_id = 1;
+
+        let result = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            None,
+            20,
+            false, // prefer_short_incremental_cadence = false
+        );
+
+        // With normal cadence, result should be last_update_time + time_window_size
+        // (since last_query_duration < time_window_size).
+        let expected = state.last_update_time + Duration::from_secs(60);
+        assert_eq!(result, expected);
+    }
+
+    #[test]
+    fn test_dirty_window_overflow_schedules_immediately_even_with_short_cadence() {
+        // Dirty-window overflow must always schedule immediately,
+        // regardless of prefer_short_incremental_cadence.
+        let mut state = state_with_past_update(Duration::from_secs(10));
+        // Create a very large dirty backlog.
+        state
+            .dirty_time_windows
+            .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(3600)));
+
+        let time_window_size = Some(Duration::from_secs(1)); // tiny window => overflow
+        let min_refresh = Duration::from_secs(5);
+        let flow_id = 1;
+
+        // With short cadence flag.
+        let result = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            None,
+            1, // max 1 filter => tiny capacity
+            true,
+        );
+        assert!(
+            result <= Instant::now(),
+            "dirty overflow should schedule immediately"
+        );
+
+        // Without short cadence flag — same behavior.
+        let result2 = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            None,
+            1,
+            false,
+        );
+        assert!(
+            result2 <= Instant::now(),
+            "dirty overflow should schedule immediately"
+        );
+    }
+
+    #[test]
+    fn test_incremental_disabled_ignores_short_cadence() {
+        // Whether the short cadence is requested at all is decided by the
+        // caller-side guard (checkpoint_mode + !is_incremental_disabled); that
+        // guard is not exercised here.
+        //
+        // This test simulates the guard's outcome when incremental mode is
+        // disabled (or the task is in FullSnapshot mode): the caller passes
+        // prefer_short_incremental_cadence = false, and with a manageable dirty
+        // backlog the next start time must follow the normal long cadence based
+        // on time_window_size.
+        let mut state = state_with_past_update(Duration::from_secs(10));
+        state.last_query_duration = Duration::from_secs(1);
+
+        let time_window_size = Some(Duration::from_secs(60));
+        let min_refresh = Duration::from_secs(5);
+        let flow_id = 1;
+
+        let result = state.get_next_start_query_time(
+            flow_id,
+            &time_window_size,
+            min_refresh,
+            None,
+            20,
+            false, // prefer_short_incremental_cadence = false
+        );
+
+        // With normal cadence, result should be last_update_time + time_window_size.
+        let expected = state.last_update_time + Duration::from_secs(60);
+        assert_eq!(result, expected);
+    }
 }
--- a/src/flow/src/batching_mode/table_creator.rs
+++ b/src/flow/src/batching_mode/table_creator.rs
@@ -0,0 +1,381 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use api::v1::CreateTableExpr;
+use datafusion_common::tree_node::TreeNode;
+use datafusion_expr::LogicalPlan;
+use datatypes::prelude::ConcreteDataType;
+use datatypes::schema::ColumnSchema;
+use operator::expr_helper::column_schemas_to_defs;
+use snafu::ResultExt;
+
+use crate::Error;
+use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+use crate::batching_mode::utils::FindGroupByFinalName;
+use crate::error::{ConvertColumnSchemaSnafu, DatafusionSnafu};
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum QueryType {
+    /// A TQL query: sink value columns are created as float64 and tag columns as string.
+    Tql,
+    /// A SQL query: sink columns keep the plan's types and an update-at column is appended.
+    Sql,
+}
+
+// TODO(discord9): for now no default value is set for auto added column for compatibility reason with streaming mode, but this might change in favor of simpler code?
+/// Builds the `CreateTableExpr` used to auto-create the sink table for a flow.
+///
+/// Columns follow the output schema of `plan`, in order. The time index and
+/// primary keys come from the group-by keys when the plan has an aggregate
+/// (`build_pk_from_aggr`), otherwise from the output schema
+/// (`build_by_sql_schema` / `build_by_tql_schema`).
+///
+/// Auto-added columns:
+/// * SQL only: a nullable `AUTO_CREATED_UPDATE_AT_TS_COL` timestamp column.
+/// * When no time index column was found: a non-null
+///   `AUTO_CREATED_PLACEHOLDER_TS_COL` column that serves as the time index.
+///
+/// For TQL, value columns are created as `DOUBLE NULL` and tag columns as
+/// `STRING NULL`, regardless of their type in the plan.
+///
+/// # Errors
+///
+/// Returns an error if the plan cannot be visited for group-by keys or if the
+/// column schemas cannot be converted into column definitions.
+pub(super) fn create_table_with_expr(
+    plan: &LogicalPlan,
+    sink_table_name: &[String; 3],
+    query_type: &QueryType,
+) -> Result<CreateTableExpr, Error> {
+    // Group-by keys take precedence for both query types; the schema-based
+    // fallback covers plans without an aggregate node (e.g. some TQL queries).
+    let table_def = match build_pk_from_aggr(plan)? {
+        Some(def) => def,
+        None => match query_type {
+            QueryType::Sql => build_by_sql_schema(plan)?,
+            QueryType::Tql => build_by_tql_schema(plan)?,
+        },
+    };
+    let first_time_stamp = table_def.ts_col;
+    let primary_keys = table_def.pks;
+
+    let mut column_schemas = Vec::new();
+    for field in plan.schema().fields() {
+        let name = field.name();
+        let ty = ConcreteDataType::from_arrow_type(field.data_type());
+        // Compare by reference so no `String` is cloned per column.
+        let is_time_index = first_time_stamp.as_ref() == Some(name);
+        let is_tag = primary_keys.contains(name);
+        let col_schema = match query_type {
+            // TQL value column: always created as `DOUBLE NULL`.
+            QueryType::Tql if !is_tag && !is_time_index => {
+                ColumnSchema::new(name, ConcreteDataType::float64_datatype(), true)
+            }
+            // TQL tag column: always created as `STRING NULL`.
+            QueryType::Tql if is_tag => {
+                ColumnSchema::new(name, ConcreteDataType::string_datatype(), true)
+            }
+            // SQL columns and the TQL time index keep the type from the plan.
+            _ if is_time_index => ColumnSchema::new(name, ty, false).with_time_index(true),
+            _ => ColumnSchema::new(name, ty, true),
+        };
+        column_schemas.push(col_schema);
+    }
+
+    if query_type == &QueryType::Sql {
+        let update_at_schema = ColumnSchema::new(
+            AUTO_CREATED_UPDATE_AT_TS_COL,
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            true,
+        );
+        column_schemas.push(update_at_schema);
+    }
+
+    // No usable timestamp column: append the placeholder column and make it the
+    // time index.
+    let time_index = if let Some(time_index) = first_time_stamp {
+        time_index
+    } else {
+        column_schemas.push(
+            ColumnSchema::new(
+                AUTO_CREATED_PLACEHOLDER_TS_COL,
+                ConcreteDataType::timestamp_millisecond_datatype(),
+                false,
+            )
+            .with_time_index(true),
+        );
+        AUTO_CREATED_PLACEHOLDER_TS_COL.to_string()
+    };
+
+    let column_defs =
+        column_schemas_to_defs(column_schemas, &primary_keys).context(ConvertColumnSchemaSnafu)?;
+    Ok(CreateTableExpr {
+        catalog_name: sink_table_name[0].clone(),
+        schema_name: sink_table_name[1].clone(),
+        table_name: sink_table_name[2].clone(),
+        desc: "Auto created table by flow engine".to_string(),
+        column_defs,
+        time_index,
+        primary_keys,
+        create_if_not_exists: true,
+        table_options: Default::default(),
+        table_id: None,
+        engine: "mito".to_string(),
+    })
+}
+
+/// Schema-only table definition for SQL plans: the first timestamp column of
+/// the output schema becomes the time index and no primary key is set.
+fn build_by_sql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
+    let ts_col = plan
+        .schema()
+        .fields()
+        .iter()
+        .find(|field| ConcreteDataType::from_arrow_type(field.data_type()).is_timestamp())
+        .map(|field| field.name().clone());
+    Ok(TableDef {
+        ts_col,
+        pks: Vec::new(),
+    })
+}
+
+/// Schema-only table definition for TQL plans.
+///
+/// The first timestamp column of the output schema is the time-index
+/// candidate, and every string column becomes a primary key (tag). Columns of
+/// any other type are not part of the returned definition. This function
+/// currently never returns `Err`.
+fn build_by_tql_schema(plan: &LogicalPlan) -> Result<TableDef, Error> {
+    let mut ts_col: Option<String> = None;
+    let mut pks: Vec<String> = Vec::new();
+
+    // One pass over the output schema collects both pieces of information.
+    for field in plan.schema().fields() {
+        let name = field.name();
+        let data_type = ConcreteDataType::from_arrow_type(field.data_type());
+
+        // Only the first timestamp column is kept.
+        if ts_col.is_none() && data_type.is_timestamp() {
+            ts_col = Some(name.clone());
+        }
+        // Every string column is treated as a tag.
+        if data_type.is_string() {
+            pks.push(name.clone());
+        }
+    }
+
+    Ok(TableDef { ts_col, pks })
+}
+
+struct TableDef { // time index and primary keys chosen for an auto-created sink table
+    ts_col: Option<String>, // time index column name, `None` if none was picked
+    pks: Vec<String>, // primary key column names
+}
+
+/// Derives the time index and primary keys of an auto-created sink table from
+/// the group-by keys of `plan`.
+///
+/// # Returns
+///
+/// * `Ok(None)` - no aggregation was found in the plan
+/// * `Ok(Some(TableDef))` - `ts_col` is the first timestamp group-by column (or, with no
+///   group-by keys at all, the first output timestamp column); `pks` are the other keys
+fn build_pk_from_aggr(plan: &LogicalPlan) -> Result<Option<TableDef>, Error> {
+    let fields = plan.schema().fields();
+    let mut pk_names = FindGroupByFinalName::default();
+
+    plan.visit(&mut pk_names)
+        .with_context(|_| DatafusionSnafu {
+            context: format!("Can't find aggr expr in plan {plan:?}"),
+        })?;
+
+    let Some(pk_final_names) = pk_names.get_group_expr_names() else {
+        return Ok(None);
+    };
+    // No group-by keys: no primary key; use the first timestamp column of the output schema.
+    if pk_final_names.is_empty() {
+        let first_ts_col = fields
+            .iter()
+            .find(|f| ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp())
+            .map(|f| f.name().clone());
+        return Ok(Some(TableDef {
+            ts_col: first_ts_col,
+            pks: vec![],
+        }));
+    }
+
+    let all_pk_cols: Vec<_> = fields
+        .iter()
+        .filter(|f| pk_final_names.contains(f.name()))
+        .map(|f| f.name().clone())
+        .collect();
+    // Auto-created tables use the first timestamp column in the group-by keys
+    // as the time index. It is possible that timestamp columns appear only as
+    // aggregate outputs (for example `max(ts)`) and are not group-by keys; in
+    // that case `first_time_stamp` stays `None` and the caller falls back to a
+    // placeholder time index column.
+    let first_time_stamp = fields
+        .iter()
+        .find(|f| {
+            all_pk_cols.contains(f.name())
+                && ConcreteDataType::from_arrow_type(f.data_type()).is_timestamp()
+        })
+        .map(|f| f.name().clone());
+
+    let all_pk_cols: Vec<_> = all_pk_cols
+        .into_iter()
+        .filter(|col| first_time_stamp.as_ref() != Some(col))
+        .collect();
+
+    Ok(Some(TableDef {
+        ts_col: first_time_stamp,
+        pks: all_pk_cols,
+    }))
+}
+
+#[cfg(test)]
+mod test {
+    use api::v1::column_def::try_as_column_schema;
+    use datatypes::prelude::ConcreteDataType;
+    use datatypes::schema::ColumnSchema;
+    use pretty_assertions::assert_eq;
+    use session::context::QueryContext;
+
+    use super::*;
+    use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+    use crate::batching_mode::utils::sql_to_df_plan;
+    use crate::test_utils::create_test_query_engine;
+
+    #[tokio::test]
+    async fn test_gen_create_table_sql() {
+        let query_engine = create_test_query_engine();
+        let ctx = QueryContext::arc();
+        struct TestCase {
+            sql: String,
+            sink_table_name: String,
+            column_schemas: Vec<ColumnSchema>,
+            primary_keys: Vec<String>,
+            time_index: String,
+        }
+
+        let update_at_schema = ColumnSchema::new(
+            AUTO_CREATED_UPDATE_AT_TS_COL,
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            true,
+        );
+
+        let ts_placeholder_schema = ColumnSchema::new(
+            AUTO_CREATED_PLACEHOLDER_TS_COL,
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true);
+
+        let testcases = vec![
+            TestCase { // no aggregate: first timestamp column is the time index, no primary key
+                sql: "SELECT number, ts FROM numbers_with_ts".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+                    ColumnSchema::new(
+                        "ts",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        false,
+                    )
+                    .with_time_index(true),
+                    update_at_schema.clone(),
+                ],
+                primary_keys: vec![],
+                time_index: "ts".to_string(),
+            },
+            TestCase { // timestamp only as aggregate output: placeholder time index, key `number`
+                sql: "SELECT number, max(ts) FROM numbers_with_ts GROUP BY number".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+                    ColumnSchema::new(
+                        "max(numbers_with_ts.ts)",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        true,
+                    ),
+                    update_at_schema.clone(),
+                    ts_placeholder_schema.clone(),
+                ],
+                primary_keys: vec!["number".to_string()],
+                time_index: AUTO_CREATED_PLACEHOLDER_TS_COL.to_string(),
+            },
+            TestCase { // grouped by the timestamp only: `ts` is the time index, no primary key
+                sql: "SELECT max(number), ts FROM numbers_with_ts GROUP BY ts".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new(
+                        "max(numbers_with_ts.number)",
+                        ConcreteDataType::uint32_datatype(),
+                        true,
+                    ),
+                    ColumnSchema::new(
+                        "ts",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        false,
+                    )
+                    .with_time_index(true),
+                    update_at_schema.clone(),
+                ],
+                primary_keys: vec![],
+                time_index: "ts".to_string(),
+            },
+            TestCase { // grouped by timestamp and `number`: `ts` is the time index, key `number`
+                sql: "SELECT number, ts FROM numbers_with_ts GROUP BY ts, number".to_string(),
+                sink_table_name: "new_table".to_string(),
+                column_schemas: vec![
+                    ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+                    ColumnSchema::new(
+                        "ts",
+                        ConcreteDataType::timestamp_millisecond_datatype(),
+                        false,
+                    )
+                    .with_time_index(true),
+                    update_at_schema.clone(),
+                ],
+                primary_keys: vec!["number".to_string()],
+                time_index: "ts".to_string(),
+            },
+        ];
+
+        for tc in testcases {
+            let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), &tc.sql, true)
+                .await
+                .unwrap();
+            let expr = create_table_with_expr(
+                &plan,
+                &[
+                    "greptime".to_string(),
+                    "public".to_string(),
+                    tc.sink_table_name.clone(),
+                ],
+                &QueryType::Sql,
+            )
+            .unwrap();
+            // TODO(discord9): assert the whole expr; only columns, primary keys and time index are checked
+            let column_schemas = expr
+                .column_defs
+                .iter()
+                .map(|c| try_as_column_schema(c).unwrap())
+                .collect::<Vec<_>>();
+            assert_eq!(tc.column_schemas, column_schemas, "{:?}", tc.sql);
+            assert_eq!(tc.primary_keys, expr.primary_keys, "{:?}", tc.sql);
+            assert_eq!(tc.time_index, expr.time_index, "{:?}", tc.sql);
+        }
+    }
+}
--- a/src/flow/src/batching_mode/task.rs
+++ b/src/flow/src/batching_mode/task.rs
--- a/src/flow/src/batching_mode/task/ckpt.rs
+++ b/src/flow/src/batching_mode/task/ckpt.rs
@@ -0,0 +1,181 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Duration;
+
+use client::OutputWithMetrics;
+use common_error::ext::ErrorExt;
+use common_error::status_code::StatusCode;
+use common_telemetry::tracing::warn;
+use common_telemetry::{debug, info};
+
+use crate::batching_mode::checkpoint::{
+    FlowCheckpointDecision, FlowQueryFallbackReason, checkpoint_mode_label,
+};
+use crate::batching_mode::state::{CheckpointMode, TaskState};
+use crate::batching_mode::task::BatchingTask;
+use crate::metrics::{
+    METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT, METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT,
+};
+use crate::{Error, FlowId};
+
+impl BatchingTask {
+    pub(super) fn query_failure_reason(err: &Error) -> FlowQueryFallbackReason {
+        if err.status_code() == StatusCode::RequestOutdated {
+            FlowQueryFallbackReason::StaleCursor
+        } else {
+            FlowQueryFallbackReason::IncrementalQueryFailure
+        }
+    }
+
+    /// Records a failed query; an incremental-mode task is marked full snapshot and reported.
+    pub(super) fn apply_query_failure_to_state(
+        state: &mut TaskState,
+        elapsed: Duration,
+        reason: FlowQueryFallbackReason,
+    ) -> Option<FlowCheckpointDecision> {
+        state.after_query_exec(elapsed, false);
+        let previous_mode = state.checkpoint_mode();
+        if previous_mode != CheckpointMode::Incremental {
+            return None;
+        }
+        state.mark_full_snapshot();
+        Some(FlowCheckpointDecision::FallbackToFullSnapshot {
+            previous_mode,
+            reason,
+        })
+    }
+
+    /// Applies a successful query result to `state` and returns the checkpoint decision.
+    /// Falls back to full snapshot when the caller disallows advancing, when the
+    /// result lacks participating regions or a watermark map, or when the state
+    /// rejects the reported watermarks; otherwise advances per checkpoint mode.
+    pub(super) fn apply_query_result_to_state(
+        state: &mut TaskState,
+        res: &OutputWithMetrics,
+        elapsed: Duration,
+        can_advance_checkpoints: bool,
+    ) -> FlowCheckpointDecision {
+        state.after_query_exec(elapsed, true);
+        let checkpoint_mode = state.checkpoint_mode();
+        // Shared exit for every path that gives up on advancing this round.
+        let fall_back = |state: &mut TaskState, reason| {
+            state.mark_full_snapshot();
+            FlowCheckpointDecision::FallbackToFullSnapshot {
+                previous_mode: checkpoint_mode,
+                reason,
+            }
+        };
+        if !can_advance_checkpoints {
+            return fall_back(state, FlowQueryFallbackReason::DirtyBacklogPending);
+        }
+
+        let (Some(participating_regions), Some(watermark_map)) =
+            (res.participating_regions(), res.region_watermark_map())
+        else {
+            return fall_back(state, FlowQueryFallbackReason::MissingRegionWatermark);
+        };
+
+        let watermarks_usable = match checkpoint_mode {
+            CheckpointMode::FullSnapshot => state
+                .can_advance_full_snapshot_checkpoints(&participating_regions, &watermark_map),
+            CheckpointMode::Incremental => state
+                .can_advance_incremental_checkpoints_with_participation(
+                    &participating_regions,
+                    &watermark_map,
+                ),
+        };
+        if !watermarks_usable {
+            return fall_back(state, FlowQueryFallbackReason::IncompleteRegionWatermark);
+        }
+
+        // Counts are taken before `watermark_map` is moved into the state.
+        let region_count = participating_regions.len();
+        let watermark_count = watermark_map.len();
+        match checkpoint_mode {
+            CheckpointMode::FullSnapshot => {
+                state.advance_checkpoints(watermark_map);
+                if state.is_incremental_disabled() {
+                    FlowCheckpointDecision::FallbackToFullSnapshot {
+                        previous_mode: CheckpointMode::FullSnapshot,
+                        reason: FlowQueryFallbackReason::IncrementalDisabled,
+                    }
+                } else {
+                    FlowCheckpointDecision::AdvancedFromFullSnapshot {
+                        participating_regions: region_count,
+                        watermarks: watermark_count,
+                    }
+                }
+            }
+            CheckpointMode::Incremental => {
+                state.advance_incremental_checkpoints_with_participation(
+                    &participating_regions,
+                    watermark_map,
+                );
+                FlowCheckpointDecision::AdvancedIncremental {
+                    participating_regions: region_count,
+                    watermarks: watermark_count,
+                }
+            }
+        }
+    }
+
+    pub(super) fn record_checkpoint_decision(flow_id: FlowId, decision: FlowCheckpointDecision) {
+        let flow_id = flow_id.to_string(); // reused for the metric label and the log lines
+        METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT
+            .with_label_values(&[
+                flow_id.as_str(),
+                decision.mode_label(),
+                decision.decision_label(),
+                decision.reason_label(),
+            ])
+            .inc(); // counted once per recorded decision
+
+        match decision { // the log level depends on the kind of decision
+            FlowCheckpointDecision::AdvancedFromFullSnapshot {
+                participating_regions,
+                watermarks,
+            } => { // logged at info level
+                info!(
+                    "Flow {flow_id} switched to incremental mode after full snapshot, participating_regions={participating_regions}, watermarks={watermarks}"
+                );
+            }
+            FlowCheckpointDecision::AdvancedIncremental {
+                participating_regions,
+                watermarks,
+            } => { // logged at debug level
+                debug!(
+                    "Flow {flow_id} advanced incremental checkpoints, participating_regions={participating_regions}, watermarks={watermarks}"
+                );
+            }
+            FlowCheckpointDecision::FallbackToFullSnapshot {
+                previous_mode,
+                reason,
+            } => { // logged as a warning, with the previous mode and the reason label
+                warn!(
+                    "Flow {flow_id} switched to full snapshot mode, previous_mode={}, reason={}",
+                    checkpoint_mode_label(previous_mode),
+                    reason.as_label()
+                );
+            }
+        }
+    }
+
+    pub(super) fn record_query_mode(flow_id: FlowId, mode: CheckpointMode) {
+        let flow_id = flow_id.to_string(); // label values are passed as strings
+        METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT
+            .with_label_values(&[flow_id.as_str(), checkpoint_mode_label(mode)])
+            .inc(); // one count per call, labelled by flow id and checkpoint mode
+    }
+}
--- a/src/flow/src/batching_mode/task/inc.rs
+++ b/src/flow/src/batching_mode/task/inc.rs
@@ -0,0 +1,252 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_error::ext::BoxedError;
+use common_telemetry::debug;
+use common_telemetry::tracing::warn;
+use datafusion_expr::{DmlStatement, LogicalPlan};
+use query::options::{
+    FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY,
+    FLOW_SINK_TABLE_ID,
+};
+use snafu::ResultExt;
+use table::metadata::TableId;
+
+use crate::Error;
+use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr;
+use crate::batching_mode::state::{CheckpointMode, FilterExprInfo};
+use crate::batching_mode::table_creator::QueryType;
+use crate::batching_mode::task::BatchingTask;
+use crate::batching_mode::utils::{
+    analyze_incremental_aggregate_plan, get_table_info_df_schema,
+    rewrite_incremental_aggregate_with_sink_merge,
+};
+use crate::error::{ExternalSnafu, UnexpectedSnafu};
+
+impl BatchingTask {
+    /// Looks up the sink table through the catalog manager and returns its table id.
+    ///
+    /// Fails with an `Unexpected` error when the sink table does not exist.
+    async fn sink_table_id(&self) -> Result<TableId, Error> {
+        let config = &self.config;
+        let sink = &config.sink_table_name;
+        let maybe_table = config
+            .catalog_manager
+            .table(&sink[0], &sink[1], &sink[2], None)
+            .await
+            .map_err(BoxedError::new)
+            .context(ExternalSnafu)?;
+
+        let Some(table) = maybe_table else {
+            return UnexpectedSnafu {
+                reason: format!(
+                    "Flow {} cannot build incremental extensions because sink table {:?} was not found",
+                    config.flow_id, sink
+                ),
+            }
+            .fail();
+        };
+        Ok(table.table_info().table_id())
+    }
+
+    /// For incremental-mode SQL queries, attempt to prepare an executable plan
+    /// that is safe for incremental scan extensions.
+    ///
+    /// Returns `Some(plan)` when incremental extensions are safe: either a
+    /// rewritten delta-LEFT-JOIN-sink merge plan, or the original plan for plain
+    /// GROUP BY queries with no aggregate merge columns. Returns `None` when the
+    /// caller should run the original plan without incremental extensions; on
+    /// that path this may permanently disable incremental mode (unsupported plan
+    /// shape) or mark the state full-snapshot for this round (rewrite failure).
+    pub(super) async fn prepare_plan_for_incremental(
+        &self,
+        plan: &LogicalPlan,
+        dirty_filter: Option<&FilterExprInfo>,
+    ) -> Result<Option<LogicalPlan>, Error> {
+        let is_incremental_sql = {
+            let state = self.state.read().unwrap();
+            if state.is_incremental_disabled() {
+                return Ok(None);
+            }
+            state.checkpoint_mode() == CheckpointMode::Incremental
+                && matches!(self.config.query_type, QueryType::Sql)
+        };
+
+        if !is_incremental_sql {
+            return Ok(None);
+        }
+
+        // Extract the inner query plan from the DML wrapper. Plans that are not
+        // DML statements bypass the rewrite and leave the checkpoint mode
+        // untouched. The inner plan is only read, so it is borrowed, not cloned.
+        let LogicalPlan::Dml(dml) = plan else {
+            return Ok(None);
+        };
+        let inner_plan: &LogicalPlan = dml.input.as_ref();
+
+        // Analyze the plan for incremental rewritability.
+        // Incremental reads currently require aggregate / group-by plans that
+        // can be rewritten into a delta-left-join-sink merge. Non-aggregate SQL
+        // (projection, filter, or other non-aggregate shapes) stays full-snapshot
+        // until separately supported, and incremental mode is permanently
+        // disabled for this flow.
+        let Some(analysis) = analyze_incremental_aggregate_plan(inner_plan)? else {
+            warn!(
+                "Flow {} incremental mode but plan is not an aggregate query; \
+                 permanently disabling incremental for this flow",
+                self.config.flow_id
+            );
+            self.state.write().unwrap().disable_incremental();
+            return Ok(None);
+        };
+
+        if !analysis.unsupported_exprs.is_empty() {
+            warn!(
+                "Flow {} incremental aggregate contains unsupported expressions {:?}; \
+                 permanently disabling incremental for this flow",
+                self.config.flow_id, analysis.unsupported_exprs
+            );
+            self.state.write().unwrap().disable_incremental();
+            return Ok(None);
+        }
+
+        // Plain GROUP BY without aggregate expressions has no values to
+        // merge between delta and sink. The incremental delta scan emits
+        // changed groups, and sink primary-key write semantics make this
+        // idempotent; no explicit left-join rewrite is needed.
+        if analysis.merge_columns.is_empty() {
+            return Ok(Some(plan.clone()));
+        }
+
+        // Fetch sink table for the merge rewrite.
+        // Transient errors (catalog, schema, filter, or rewrite) should not
+        // permanently disable incremental mode. Instead, we fall back to a
+        // full-snapshot plan for this round while keeping incremental retryable.
+        let sink_table = match get_table_info_df_schema(
+            self.config.catalog_manager.clone(),
+            self.config.sink_table_name.clone(),
+        )
+        .await
+        {
+            Ok((table, _)) => table,
+            Err(err) => {
+                warn!(
+                    "Flow {} failed to fetch sink table for incremental rewrite; \
+                     falling back to full snapshot for this round: {:?}",
+                    self.config.flow_id, err
+                );
+                self.state.write().unwrap().mark_full_snapshot();
+                return Ok(None);
+            }
+        };
+        let sink_schema = sink_table.table_info().meta.schema.clone();
+        let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr(
+            self.config.flow_id,
+            &analysis,
+            &sink_schema,
+            dirty_filter,
+        ) {
+            Ok(filter) => filter,
+            Err(err) => {
+                warn!(
+                    "Flow {} failed to build sink dirty time window filter; \
+                     falling back to full snapshot for this round: {:?}",
+                    self.config.flow_id, err
+                );
+                self.state.write().unwrap().mark_full_snapshot();
+                return Ok(None);
+            }
+        };
+
+        let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge(
+            inner_plan,
+            &analysis,
+            sink_table,
+            &self.config.sink_table_name,
+            sink_dirty_filter,
+        )
+        .await
+        {
+            Ok(plan) => plan,
+            Err(err) => {
+                warn!(
+                    "Flow {} failed to rewrite incremental aggregate with sink merge; \
+                     falling back to full snapshot for this round: {:?}",
+                    self.config.flow_id, err
+                );
+                self.state.write().unwrap().mark_full_snapshot();
+                return Ok(None);
+            }
+        };
+
+        // Rebuild the DML wrapper around the rewritten inner plan. `dml` is the
+        // statement destructured at the top, so the target table and operation
+        // are carried over unchanged and no second match on `plan` (with an
+        // `unreachable!` arm) is needed.
+        let rewritten = LogicalPlan::Dml(DmlStatement::new(
+            dml.table_name.clone(),
+            dml.target.clone(),
+            dml.op.clone(),
+            Arc::new(rewritten_inner),
+        ));
+
+        debug!(
+            "Flow {} rewrote incremental SQL aggregate query with sink merge",
+            self.config.flow_id
+        );
+
+        Ok(Some(rewritten))
+    }
+
+    pub(super) async fn build_flow_query_extensions(
+        &self,
+        incremental_safe: bool,
+        can_advance_checkpoints: bool,
+    ) -> Result<Vec<(&'static str, String)>, Error> {
+        let mut extensions = vec![("flow.return_region_seq", "true".to_string())]; // always sent
+
+        let incremental_checkpoints_json = {
+            let state = self.state.read().unwrap(); // guard is dropped at the end of this block
+            if incremental_safe
+                && can_advance_checkpoints
+                && !state.is_incremental_disabled()
+                && state.checkpoint_mode() == CheckpointMode::Incremental
+                && !state.checkpoints().is_empty() // no checkpoints, nothing to send
+            {
+                Some(serde_json::to_string(state.checkpoints()).map_err(|err| {
+                    UnexpectedSnafu {
+                        reason: format!("Failed to serialize checkpoint map: {err}"),
+                    }
+                    .build()
+                })?)
+            } else {
+                None
+            }
+        };
+
+        if let Some(checkpoints_json) = incremental_checkpoints_json {
+            let sink_table_id = self.sink_table_id().await?; // looked up only on this path
+            extensions.push((FLOW_SINK_TABLE_ID, sink_table_id.to_string()));
+            extensions.push((
+                FLOW_INCREMENTAL_MODE,
+                FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY.to_string(),
+            ));
+            extensions.push((FLOW_INCREMENTAL_AFTER_SEQS, checkpoints_json));
+        }
+
+        Ok(extensions)
+    }
+}
--- a/src/flow/src/batching_mode/task/test.rs
+++ b/src/flow/src/batching_mode/task/test.rs
--- a/src/flow/src/batching_mode/utils.rs
+++ b/src/flow/src/batching_mode/utils.rs
@@ -278,7 +278,7 @@ fn collect_output_projection_info(plan: &LogicalPlan) -> OutputProjectionInfo {
                    let mut col_names = Vec::new();
                    find_column_names(&alias.expr, &mut col_names);
                    match col_names.len() {
-                        0 if matches!(alias.expr.as_ref(), Expr::Literal(_, _)) => {
+                        0 if is_passthrough_output_column(&alias_name, alias.expr.as_ref()) => {
                            projection_info.literal_columns.insert(alias_name);
                        }
                        1 => {
@@ -315,10 +315,38 @@ fn collect_output_projection_info(plan: &LogicalPlan) -> OutputProjectionInfo {
        }
    }

+    if projection_info
+        .output_field_names
+        .iter()
+        .any(|name| name == AUTO_CREATED_PLACEHOLDER_TS_COL)
+    {
+        projection_info
+            .literal_columns
+            .insert(AUTO_CREATED_PLACEHOLDER_TS_COL.to_string());
+    }
+
    projection_info.output_aliases = output_aliases;
    projection_info
 }

+/// Decides whether an aliased output expression may be recorded as a literal
+/// (pass-through) column; `collect_output_projection_info` calls this for
+/// aliases whose expression contains no column references.
+///
+/// Plain literals always qualify. Beyond that, the alias
+/// `AUTO_CREATED_UPDATE_AT_TS_COL` qualifies when its expression equals
+/// `now()`, and `AUTO_CREATED_PLACEHOLDER_TS_COL` qualifies when its expression
+/// is a literal, possibly wrapped in casts.
+fn is_passthrough_output_column(alias_name: &str, expr: &Expr) -> bool {
+    if let Expr::Literal(_, _) = expr {
+        return true;
+    }
+
+    match alias_name {
+        AUTO_CREATED_UPDATE_AT_TS_COL => expr == &datafusion::prelude::now(),
+        AUTO_CREATED_PLACEHOLDER_TS_COL => is_literal_or_cast_literal(expr),
+        _ => false,
+    }
+}
+
+/// Returns `true` when `expr` is a literal, or a chain of `Cast` / `TryCast`
+/// nodes whose innermost expression is a literal.
+fn is_literal_or_cast_literal(expr: &Expr) -> bool {
+    let mut current = expr;
+    loop {
+        // Peel one cast layer per iteration until something else is reached.
+        current = match current {
+            Expr::Literal(_, _) => return true,
+            Expr::Cast(cast) => cast.expr.as_ref(),
+            Expr::TryCast(cast) => cast.expr.as_ref(),
+            _ => return false,
+        };
+    }
+}
+
 fn merge_op_for_aggregate_expr(aggr_expr: &Expr) -> Result<IncrementalAggregateMergeOp, String> {
    let Some(aggr_func) = get_aggr_func(aggr_expr) else {
        return Err(aggr_expr.to_string());
@@ -385,6 +413,11 @@ fn find_uncovered_output_fields(
            !group_key_names.contains(*name)
                && !merge_column_names.contains(*name)
                && !projection_info.literal_columns.contains(*name)
+                // Auto-created sink columns injected by ColumnMatcherRewriter
+                // are not part of the original aggregate semantics and must
+                // not prevent incremental aggregate rewrites.
+                && name.as_str() != AUTO_CREATED_UPDATE_AT_TS_COL
+                && name.as_str() != AUTO_CREATED_PLACEHOLDER_TS_COL
        })
        .cloned()
        .collect()
@@ -536,7 +569,8 @@ pub fn analyze_incremental_aggregate_plan(
 ///
 /// ```text
 /// delta = SELECT ts, number FROM <delta_plan> AS __flow_delta
-/// sink  = SELECT ts, number FROM <sink_table> AS __flow_sink
+/// sink_scan = SELECT * FROM <sink_table> [WHERE <sink_dirty_filter>]
+/// sink  = SELECT ts, number FROM sink_scan AS __flow_sink
 /// SELECT
 ///   CASE
 ///     WHEN __flow_sink.number IS NULL THEN __flow_delta.number
@@ -548,11 +582,17 @@ pub fn analyze_incremental_aggregate_plan(
 /// LEFT JOIN sink
 ///   ON __flow_delta.ts IS NOT DISTINCT FROM __flow_sink.ts
 /// ```
+///
+/// If `sink_dirty_filter` is provided, it is applied to the sink table scan
+/// before projection, aliasing, and the left join. Because the filter runs
+/// before the `__flow_sink` alias exists, the predicate must reference the raw
+/// sink table columns by their unqualified names.
 pub async fn rewrite_incremental_aggregate_with_sink_merge(
    delta_plan: &LogicalPlan,
    analysis: &IncrementalAggregateAnalysis,
    sink_table: TableRef,
    sink_table_name: &TableName,
+    sink_dirty_filter: Option<Expr>,
 ) -> Result<LogicalPlan, Error> {
    ensure!(
        analysis.unsupported_exprs.is_empty(),
@@ -637,7 +677,22 @@ pub async fn rewrite_incremental_aggregate_with_sink_merge(
        .cloned()
        .map(unqualified_col)
        .collect::<Vec<_>>();
-    let sink_selected = LogicalPlanBuilder::from(sink_scan)
+    let sink_input = if let Some(predicate) = sink_dirty_filter {
+        LogicalPlanBuilder::from(sink_scan)
+            .filter(predicate)
+            .with_context(|_| DatafusionSnafu {
+                context: "Failed to filter sink table scan for incremental sink merge".to_string(),
+            })?
+            .build()
+            .with_context(|_| DatafusionSnafu {
+                context: "Failed to build filtered sink plan for incremental sink merge"
+                    .to_string(),
+            })?
+    } else {
+        sink_scan
+    };
+
+    let sink_selected = LogicalPlanBuilder::from(sink_input)
        .project(sink_selected_exprs)
        .with_context(|_| DatafusionSnafu {
            context: "Failed to project sink table scan for incremental sink merge".to_string(),
--- a/src/flow/src/batching_mode/utils/test.rs
+++ b/src/flow/src/batching_mode/utils/test.rs
@@ -15,10 +15,13 @@
 use std::sync::Arc;

 use common_recordbatch::RecordBatch;
+use common_time::Timestamp;
 use datafusion_common::tree_node::TreeNode as _;
 use datafusion_expr::GroupingSet;
-use datatypes::prelude::{ConcreteDataType, Scalar, VectorRef};
+use datatypes::prelude::{ConcreteDataType, MutableVector, Scalar, ScalarVectorBuilder, VectorRef};
 use datatypes::schema::{ColumnSchema, Schema};
+use datatypes::timestamp::TimestampMillisecond;
+use datatypes::vectors::TimestampMillisecondVectorBuilder;
 use pretty_assertions::assert_eq;
 use query::query_engine::DefaultSerializer;
 use session::context::QueryContext;
@@ -26,6 +29,7 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
 use table::test_util::MemTable;

 use super::*;
+use crate::batching_mode::state::FilterExprInfo;
 use crate::test_utils::create_test_query_engine;

 fn u32_table(table_name: &str, columns: Vec<&str>, rows: usize) -> TableRef {
@@ -50,6 +54,30 @@ fn empty_u32_table(table_name: &str, columns: Vec<&str>) -> TableRef {
    u32_table(table_name, columns, 0)
 }

+/// Builds an in-memory test table named `table_name` with one row:
+/// `time_window` (non-null millisecond timestamp, time index) = 0 and
+/// `number` (nullable u32) = 1.
+fn time_window_u32_table(table_name: &str) -> TableRef {
+    let time_window_col = ColumnSchema::new(
+        "time_window",
+        ConcreteDataType::timestamp_millisecond_datatype(),
+        false,
+    )
+    .with_time_index(true);
+    let number_col = ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true);
+    let schema = Arc::new(Schema::new(vec![time_window_col, number_col]));
+
+    let mut ts_builder = TimestampMillisecondVectorBuilder::with_capacity(1);
+    ts_builder.push(Some(TimestampMillisecond::new(0)));
+    let time_window_values = ts_builder.to_vector_cloned();
+    let number_values = Arc::new(<u32 as Scalar>::VectorType::from_vec(vec![1])) as VectorRef;
+
+    let batch = RecordBatch::new(schema, vec![time_window_values, number_values]).unwrap();
+    MemTable::table(table_name, batch)
+}
+
 fn assert_same_logical_plan(actual: &LogicalPlan, expected: &LogicalPlan) {
    assert_eq!(
        format!("{}", expected.display_indent()),
@@ -84,6 +112,29 @@ fn expected_left_join_rewrite(
    sink_selected_exprs: Vec<Expr>,
    join_keys: (Vec<Column>, Vec<Column>),
    projection_exprs: Vec<Expr>,
+) -> LogicalPlan {
+    expected_left_join_rewrite_with_sink_filter(
+        delta_plan,
+        sink_table,
+        sink_table_name,
+        delta_selected_exprs,
+        sink_selected_exprs,
+        None,
+        join_keys,
+        projection_exprs,
+    )
+}
+
+#[allow(clippy::too_many_arguments)]
+fn expected_left_join_rewrite_with_sink_filter(
+    delta_plan: &LogicalPlan,
+    sink_table: TableRef,
+    sink_table_name: &TableName,
+    delta_selected_exprs: Vec<Expr>,
+    sink_selected_exprs: Vec<Expr>,
+    sink_filter: Option<Expr>,
+    join_keys: (Vec<Column>, Vec<Column>),
+    projection_exprs: Vec<Expr>,
 ) -> LogicalPlan {
    let delta_alias = "__flow_delta";
    let sink_alias = "__flow_sink";
@@ -94,7 +145,17 @@ fn expected_left_join_rewrite(
        .unwrap()
        .build()
        .unwrap();
-    let sink_selected = LogicalPlanBuilder::from(test_sink_scan(sink_table, sink_table_name))
+    let sink_scan = test_sink_scan(sink_table, sink_table_name);
+    let sink_input = if let Some(predicate) = sink_filter {
+        LogicalPlanBuilder::from(sink_scan)
+            .filter(predicate)
+            .unwrap()
+            .build()
+            .unwrap()
+    } else {
+        sink_scan
+    };
+    let sink_selected = LogicalPlanBuilder::from(sink_input)
        .project(sink_selected_exprs)
        .unwrap()
        .alias(sink_alias)
@@ -576,6 +637,44 @@ async fn test_analyze_incremental_aggregate_plan_keeps_aliases_for_multiple_aggr
    }));
 }

+// Auto-created sink columns (`now()` under the update-at alias and a cast
+// literal under the placeholder alias) must be classified as literal columns
+// and must not be reported as unsupported expressions.
+#[tokio::test]
+async fn test_analyze_incremental_aggregate_plan_allows_auto_created_sink_columns() {
+    let sql = format!(
+        "SELECT max(number) AS total, ts, now() AS {update_at}, CAST('1970-01-01 00:00:00' AS TIMESTAMP) AS {placeholder} FROM numbers_with_ts GROUP BY ts",
+        update_at = AUTO_CREATED_UPDATE_AT_TS_COL,
+        placeholder = AUTO_CREATED_PLACEHOLDER_TS_COL
+    );
+    let engine = create_test_query_engine();
+    let plan = sql_to_df_plan(QueryContext::arc(), engine, &sql, false)
+        .await
+        .unwrap();
+
+    let analysis = analyze_incremental_aggregate_plan(&plan).unwrap().unwrap();
+    assert!(
+        analysis.unsupported_exprs.is_empty(),
+        "auto-created sink columns should not disable incremental analysis: {:?}",
+        analysis.unsupported_exprs
+    );
+
+    // Both auto-created aliases must have been recorded as literal columns.
+    let has_literal_column =
+        |expected: &str| analysis.literal_columns.iter().any(|name| name == expected);
+    assert!(has_literal_column(AUTO_CREATED_UPDATE_AT_TS_COL));
+    assert!(has_literal_column(AUTO_CREATED_PLACEHOLDER_TS_COL));
+
+    // Only `max(number) AS total` is a merge column.
+    assert_eq!(analysis.merge_columns.len(), 1);
+    let merge_column = &analysis.merge_columns[0];
+    assert_eq!(merge_column.output_field_name, "total");
+    assert_eq!(merge_column.merge_op, IncrementalAggregateMergeOp::Max);
+}
+
 #[tokio::test]
 async fn test_analyze_incremental_aggregate_plan_allows_where_before_aggregate() {
    let query_engine = create_test_query_engine();
@@ -641,6 +740,7 @@ async fn test_rewrite_incremental_aggregate_allows_alias_wrapped_scan() {
            "public".to_string(),
            "alias_wrapped_sink".to_string(),
        ],
+        None,
    )
    .await
    .unwrap();
@@ -887,6 +987,7 @@ async fn test_analyze_incremental_aggregate_plan_allows_literal_outputs() {
        &analysis,
        sink_table.clone(),
        &sink_table_name,
+        None,
    )
    .await
    .unwrap();
@@ -975,6 +1076,7 @@ async fn test_rewrite_incremental_aggregate_preserves_non_identifier_aliases() {
            "public".to_string(),
            "non_identifier_alias_sink".to_string(),
        ],
+        None,
    )
    .await
    .unwrap();
@@ -1161,6 +1263,7 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
        &analysis,
        sink_table.clone(),
        &sink_table_name,
+        None,
    )
    .await
    .unwrap();
@@ -1183,6 +1286,67 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
    assert_same_logical_plan(&rewritten, &expected);
 }

+#[tokio::test]
+async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() {
+    // This verifies the rewrite placement when callers supply an already
+    // inferred sink dirty-window predicate. The task-level inference rules are
+    // covered by `infer_sink_time_window_filter_col` tests in task.rs.
+    let query_engine = create_test_query_engine();
+    let ctx = QueryContext::arc();
+    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
+    let plan = sql_to_df_plan(ctx.clone(), query_engine.clone(), sql, false)
+        .await
+        .unwrap();
+    let analysis = analyze_incremental_aggregate_plan(&plan).unwrap().unwrap();
+    // Sink table whose columns match the query output: `time_window`, `number`.
+    let sink_table = time_window_u32_table("time_window_sink");
+    let sink_table_name = [
+        "greptime".to_string(),
+        "public".to_string(),
+        "time_window_sink".to_string(),
+    ];
+    // Dirty-window info declared on the source column `ts`: one range from
+    // 0 ms to 1000 ms with a 1 second window.
+    let dirty_filter = FilterExprInfo {
+        expr: unqualified_col("ts"),
+        col_name: "ts".to_string(),
+        time_ranges: vec![(
+            Timestamp::new_millisecond(0),
+            Timestamp::new_millisecond(1000),
+        )],
+        window_size: chrono::Duration::seconds(1),
+    };
+    // Presumably re-targets the dirty ranges at the sink's `time_window`
+    // column (TODO confirm against `FilterExprInfo::predicate_for_col`). The
+    // two unwraps require the call to succeed and to yield a predicate.
+    let sink_filter = dirty_filter
+        .predicate_for_col("time_window")
+        .unwrap()
+        .unwrap();
+
+    let rewritten = rewrite_incremental_aggregate_with_sink_merge(
+        &plan,
+        &analysis,
+        sink_table.clone(),
+        &sink_table_name,
+        Some(sink_filter.clone()),
+    )
+    .await
+    .unwrap();
+
+    // The expected plan is built with the same predicate, so the comparison
+    // below checks that the rewrite filters the sink scan before the sink
+    // projection and alias, exactly where the expected-plan helper puts it.
+    let expected = expected_left_join_rewrite_with_sink_filter(
+        &plan,
+        sink_table,
+        &sink_table_name,
+        vec![unqualified_col("time_window"), unqualified_col("number")],
+        vec![unqualified_col("time_window"), unqualified_col("number")],
+        Some(sink_filter),
+        (
+            vec![qualified_column("__flow_delta", "time_window")],
+            vec![qualified_column("__flow_sink", "time_window")],
+        ),
+        vec![
+            max_merge_expr("number"),
+            qualified_col("__flow_delta", "time_window").alias("time_window"),
+        ],
+    );
+    assert_same_logical_plan(&rewritten, &expected);
+}
+
 #[tokio::test]
 async fn test_analyze_incremental_aggregate_plan_rejects_global_aggregate() {
    let query_engine = create_test_query_engine();
@@ -1230,6 +1394,7 @@ async fn test_rewrite_incremental_aggregate_rejects_empty_group_keys() {
        &analysis,
        sink_table,
        &sink_table_name,
+        None,
    )
    .await
    .unwrap_err();
@@ -1261,6 +1426,7 @@ async fn test_rewrite_incremental_aggregate_preserves_raw_aggregate_field_name()
        &analysis,
        sink_table.clone(),
        &sink_table_name,
+        None,
    )
    .await
    .unwrap();
--- a/src/flow/src/metrics.rs
+++ b/src/flow/src/metrics.rs
@@ -87,6 +87,20 @@ lazy_static! {
            &["flow_id"],
        )
        .unwrap();
+    // Counter of checkpoint state-machine decisions in the batching engine,
+    // labelled by `flow_id`, `mode`, `decision` and `reason`.
+    pub static ref METRIC_FLOW_BATCHING_ENGINE_CHECKPOINT_DECISION_CNT: IntCounterVec =
+        register_int_counter_vec!(
+            "greptime_flow_batching_checkpoint_decision_count",
+            "flow batching checkpoint state-machine decisions",
+            &["flow_id", "mode", "decision", "reason"],
+        )
+        .unwrap();
+    // Counter of batching-engine query attempts, labelled by `flow_id` and the
+    // checkpoint `mode` the attempt ran in.
+    pub static ref METRIC_FLOW_BATCHING_ENGINE_QUERY_MODE_CNT: IntCounterVec =
+        register_int_counter_vec!(
+            "greptime_flow_batching_query_mode_count",
+            "flow batching query attempts by checkpoint mode",
+            &["flow_id", "mode"],
+        )
+        .unwrap();
    pub static ref METRIC_FLOW_RUN_INTERVAL_MS: IntGauge =
        register_int_gauge!("greptime_flow_run_interval_ms", "flow run interval in ms").unwrap();
    pub static ref METRIC_FLOW_ROWS: IntCounterVec = register_int_counter_vec!(
--- a/src/frontend/src/instance.rs
+++ b/src/frontend/src/instance.rs
@@ -303,7 +303,7 @@ impl Instance {
                    .await
            }
            _ => {
-                query_interceptor.pre_execute(&stmt, None, query_ctx.clone())?;
+                query_interceptor.pre_execute(Some(&stmt), None, query_ctx.clone())?;
                self.statement_executor
                    .execute_sql(stmt, query_ctx)
                    .await
@@ -326,7 +326,7 @@ impl Instance {
        let QueryStatement::Sql(stmt) = stmt else {
            unreachable!()
        };
-        query_interceptor.pre_execute(&stmt, Some(&plan), query_ctx.clone())?;
+        query_interceptor.pre_execute(Some(&stmt), Some(&plan), query_ctx.clone())?;

        self.statement_executor
            .exec_plan(plan, query_ctx.clone())
@@ -344,7 +344,11 @@ impl Instance {
            .statement_executor
            .plan_tql(tql.clone(), query_ctx)
            .await?;
-        query_interceptor.pre_execute(&Statement::Tql(tql), Some(&plan), query_ctx.clone())?;
+        query_interceptor.pre_execute(
+            Some(&Statement::Tql(tql)),
+            Some(&plan),
+            query_ctx.clone(),
+        )?;
        self.statement_executor
            .exec_plan(plan, query_ctx.clone())
            .await
@@ -649,9 +653,7 @@ impl Instance {
        let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
        let query_interceptor = query_interceptor_opt.as_ref();

-        if let Some(ref s) = stmt {
-            query_interceptor.pre_execute(s, Some(&plan), query_ctx.clone())?;
-        }
+        query_interceptor.pre_execute(stmt.as_ref(), Some(&plan), query_ctx.clone())?;

        let query = stmt
            .as_ref()
@@ -880,7 +882,11 @@ impl PrometheusHandler for Instance {
            .map_err(BoxedError::new)
            .context(ExecuteQuerySnafu)?;

-        interceptor.pre_execute(query, Some(&plan), query_ctx.clone())?;
+        let QueryStatement::Promql(eval_stmt, _) = &stmt else {
+            unreachable!("query is parsed from promql");
+        };
+
+        interceptor.pre_execute(query, &eval_stmt.expr, Some(&plan), query_ctx.clone())?;

        // Take the EvalStmt from the original QueryStatement and use it to create the CatalogQueryStatement.
        let query_statement = if let QueryStatement::Promql(eval_stmt, alias) = stmt {
@@ -892,7 +898,7 @@ impl PrometheusHandler for Instance {
            }
            .fail();
        };
-        let query = query_statement.to_string();
+        let raw_query = query_statement.to_string();

        let slow_query_timer = self
            .slow_query_options
@@ -912,7 +918,7 @@ impl PrometheusHandler for Instance {
        let ticket = self.process_manager.register_query(
            query_ctx.current_catalog().to_string(),
            vec![query_ctx.current_schema()],
-            query,
+            raw_query,
            query_ctx.conn_info().to_string(),
            Some(query_ctx.process_id()),
            slow_query_timer,
@@ -1204,14 +1210,19 @@ fn should_track_plan_process(stmt: Option<&Statement>, plan: &LogicalPlan) -> bo
 #[cfg(test)]
 mod tests {
    use std::collections::HashMap;
+    use std::future::Future;
+    use std::pin::Pin;
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Arc, Barrier};
+    use std::task::{Context, Poll};
    use std::thread;
    use std::time::{Duration, Instant};

    use api::v1::meta::{ProcedureDetailResponse, ReconcileRequest, ReconcileResponse};
    use catalog::process_manager::ProcessManager;
    use common_base::Plugins;
+    use common_error::ext::{BoxedError, PlainError};
+    use common_error::status_code::StatusCode;
    use common_meta::cache::LayeredCacheRegistryBuilder;
    use common_meta::kv_backend::memory::MemoryKvBackend;
    use common_meta::procedure_executor::{ExecutorContext, ProcedureExecutor};
@@ -1220,23 +1231,142 @@ mod tests {
        MigrateRegionRequest, MigrateRegionResponse, ProcedureStateResponse,
    };
    use common_query::Output;
+    use common_recordbatch::{
+        OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream,
+    };
    use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
    use datafusion_expr::dml::InsertOp;
    use datafusion_expr::{LogicalPlanBuilder, LogicalTableSource};
    use datatypes::prelude::ConcreteDataType;
-    use datatypes::schema::{ColumnSchema, Schema as GtSchema};
+    use datatypes::schema::{ColumnSchema, Schema as GtSchema, SchemaRef as GtSchemaRef};
    use query::query_engine::options::QueryOptions;
    use session::context::{Channel, ConnInfo, QueryContext, QueryContextBuilder};
+    use snafu::{Location, Snafu};
    use sql::dialect::GreptimeDbDialect;
+    use store_api::data_source::DataSource;
+    use store_api::storage::ScanRequest;
    use strfmt::Format;
-    use table::metadata::{TableInfoBuilder, TableMetaBuilder};
+    use table::metadata::{FilterPushDownType, TableInfo, TableInfoBuilder, TableMetaBuilder};
    use table::test_util::EmptyTable;
+    use table::{Table, TableRef};
    use tokio::sync::{mpsc, oneshot};

    use super::*;
    use crate::frontend::FrontendOptions;
    use crate::instance::builder::FrontendBuilder;

+    /// Errors produced by the test helpers and tests in this module.
+    ///
+    /// Each variant names one failure point (building fixtures, running SQL,
+    /// waiting on a channel or task), so tests can return `TestResult` and
+    /// propagate failures with `?` instead of unwrapping.
+    #[derive(Debug, Snafu)]
+    enum TestError {
+        #[snafu(display("Failed to build test cache registry"))]
+        BuildCacheRegistry {
+            source: cache::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to build test table meta for table: {table_name}"))]
+        BuildTableMeta {
+            table_name: String,
+            source: table::metadata::TableMetaBuilderError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to build test table info for table: {table_name}"))]
+        BuildTableInfo {
+            table_name: String,
+            source: table::metadata::TableInfoBuilderError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to register test table: {table_name}"))]
+        RegisterTable {
+            table_name: String,
+            source: catalog::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to build test frontend instance"))]
+        BuildFrontend {
+            source: crate::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Expected exactly one output for SQL `{sql}`, got {actual}"))]
+        UnexpectedOutputCount {
+            sql: String,
+            actual: usize,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to execute SQL `{sql}`"))]
+        ExecuteSql {
+            sql: String,
+            source: crate::error::Error,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Timed out waiting for insert-select start notification"))]
+        InsertStartTimeout {
+            source: tokio::time::error::Elapsed,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Insert-select start notification channel closed"))]
+        InsertStartChannelClosed {
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Failed to release blocking insert-select interceptor"))]
+        ReleaseBlockedInsert {
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Timed out waiting for insert-select source to be polled"))]
+        SourcePollTimeout {
+            source: tokio::time::error::Elapsed,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Insert-select source poll notification channel closed"))]
+        SourcePollChannelClosed {
+            source: oneshot::error::RecvError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Timed out waiting for insert task to finish"))]
+        InsertTaskTimeout {
+            source: tokio::time::error::Elapsed,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Insert task panicked"))]
+        InsertTaskPanic {
+            source: tokio::task::JoinError,
+            #[snafu(implicit)]
+            location: Location,
+        },
+
+        #[snafu(display("Expected insert-select to be cancelled"))]
+        InsertSelectNotCancelled {
+            #[snafu(implicit)]
+            location: Location,
+        },
+    }
+
+    /// Result alias used by the test helpers; the error type is `TestError`.
+    type TestResult<T> = std::result::Result<T, TestError>;
+
    fn parse_one_sql(sql: &str) -> Statement {
        parse_stmt(sql, &GreptimeDbDialect {}).unwrap().remove(0)
    }
@@ -1270,11 +1400,11 @@ mod tests {

        fn pre_execute(
            &self,
-            statement: &Statement,
+            statement: Option<&Statement>,
            _plan: Option<&LogicalPlan>,
            _query_ctx: QueryContextRef,
        ) -> Result<()> {
-            let Statement::Insert(insert) = statement else {
+            let Some(Statement::Insert(insert)) = statement else {
                return Ok(());
            };
            if !insert.has_non_values_query_source() {
@@ -1292,6 +1422,70 @@ mod tests {
        }
    }

+    /// Record batch stream that never yields a batch.
+    ///
+    /// The first poll fires `polled_tx` (when present). Every poll then waits
+    /// on `finish_rx`; the matching sender is stored in `_finish_tx` and is
+    /// never sent on in the code shown here, so the receiver is not expected
+    /// to resolve while the stream is alive.
+    struct PendingRecordBatchStream {
+        schema: GtSchemaRef,
+        // One-shot notification sent on the first `poll_next` call.
+        polled_tx: Option<oneshot::Sender<()>>,
+        // Kept alongside `finish_rx` so the channel stays open.
+        _finish_tx: oneshot::Sender<()>,
+        finish_rx: Pin<Box<oneshot::Receiver<()>>>,
+    }
+
+    impl RecordBatchStream for PendingRecordBatchStream {
+        fn schema(&self) -> GtSchemaRef {
+            self.schema.clone()
+        }
+
+        fn output_ordering(&self) -> Option<&[OrderOption]> {
+            None
+        }
+
+        fn metrics(&self) -> Option<common_recordbatch::adapter::RecordBatchMetrics> {
+            None
+        }
+    }
+
+    impl Stream for PendingRecordBatchStream {
+        type Item = common_recordbatch::error::Result<RecordBatch>;
+
+        /// Signals the first poll, then mirrors the readiness of `finish_rx`:
+        /// pending while it is pending, end-of-stream once it resolves.
+        fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+            if let Some(first_poll_notifier) = self.polled_tx.take() {
+                // The listener may already be gone; that is not an error here.
+                let _ = first_poll_notifier.send(());
+            }
+
+            self.finish_rx.as_mut().poll(cx).map(|_| None)
+        }
+    }
+
+    impl Unpin for PendingRecordBatchStream {}
+
+    /// Data source that serves `PendingRecordBatchStream`s.
+    ///
+    /// `polled_tx` is handed to the first stream created by `get_stream`;
+    /// later streams receive `None` because the sender has already been taken.
+    struct PendingDataSource {
+        schema: GtSchemaRef,
+        polled_tx: std::sync::Mutex<Option<oneshot::Sender<()>>>,
+    }
+
+    impl DataSource for PendingDataSource {
+        /// Creates a `PendingRecordBatchStream`, ignoring the scan request.
+        ///
+        /// Returns an error only when the `polled_tx` mutex is poisoned.
+        fn get_stream(
+            &self,
+            _request: ScanRequest,
+        ) -> std::result::Result<SendableRecordBatchStream, BoxedError> {
+            let first_poll_notifier = self
+                .polled_tx
+                .lock()
+                .map_err(|_| {
+                    BoxedError::new(PlainError::new(
+                        "pending data source lock poisoned".to_string(),
+                        StatusCode::Unexpected,
+                    ))
+                })?
+                .take();
+            let (finish_tx, finish_rx) = oneshot::channel();
+
+            let stream = PendingRecordBatchStream {
+                schema: self.schema.clone(),
+                polled_tx: first_poll_notifier,
+                _finish_tx: finish_tx,
+                finish_rx: Box::pin(finish_rx),
+            };
+            Ok(Box::pin(stream))
+        }
+    }
+
    struct NoopProcedureExecutor;

    #[async_trait::async_trait]
@@ -1353,18 +1547,18 @@ mod tests {

    fn test_cache_registry(
        kv_backend: common_meta::kv_backend::KvBackendRef,
-    ) -> common_meta::cache::LayeredCacheRegistryRef {
-        Arc::new(
+    ) -> TestResult<common_meta::cache::LayeredCacheRegistryRef> {
+        Ok(Arc::new(
            cache::with_default_composite_cache_registry(
                LayeredCacheRegistryBuilder::default()
                    .add_cache_registry(cache::build_fundamental_cache_registry(kv_backend)),
            )
-            .unwrap()
+            .context(BuildCacheRegistrySnafu)?
            .build(),
-        )
+        ))
    }

-    fn test_table(table_id: u32, table_name: &str) -> table::TableRef {
+    fn test_table_info(table_id: u32, table_name: &str) -> TestResult<TableInfo> {
        let schema = Arc::new(GtSchema::new(vec![
            ColumnSchema::new("id", ConcreteDataType::int32_datatype(), false),
            ColumnSchema::new(
@@ -1380,36 +1574,85 @@ mod tests {
            .value_indices(vec![1])
            .next_column_id(1024)
            .build()
-            .unwrap();
-        let table_info = TableInfoBuilder::new(table_name, table_meta)
+            .with_context(|_| BuildTableMetaSnafu {
+                table_name: table_name.to_string(),
+            })?;
+
+        TableInfoBuilder::new(table_name, table_meta)
            .table_id(table_id)
            .build()
-            .unwrap();
+            .with_context(|_| BuildTableInfoSnafu {
+                table_name: table_name.to_string(),
+            })
+    }

-        EmptyTable::from_table_info(&table_info)
+    /// Builds an `EmptyTable` from the table info produced by `test_table_info`.
+    fn test_table(table_id: u32, table_name: &str) -> TestResult<table::TableRef> {
+        test_table_info(table_id, table_name).map(|info| EmptyTable::from_table_info(&info))
+    }
+
+    /// Builds a table backed by `PendingDataSource`, using the table info from
+    /// `test_table_info`. `polled_tx` is forwarded to the data source.
+    fn pending_table(
+        table_id: u32,
+        table_name: &str,
+        polled_tx: oneshot::Sender<()>,
+    ) -> TestResult<table::TableRef> {
+        let info = test_table_info(table_id, table_name)?;
+        let source = PendingDataSource {
+            schema: info.meta.schema.clone(),
+            polled_tx: std::sync::Mutex::new(Some(polled_tx)),
+        };
+        let table = Table::new(
+            Arc::new(info),
+            FilterPushDownType::Unsupported,
+            Arc::new(source),
+        );
+
+        Ok(Arc::new(table))
+    }
+
+    /// Builds a test frontend `Instance` from the given source and target
+    /// tables, passing a fresh `Plugins::new()` to
+    /// `test_instance_with_plugins`.
+    async fn test_instance_with_tables(
+        source_table: TableRef,
+        target_table: TableRef,
+    ) -> TestResult<Instance> {
+        test_instance_with_plugins(source_table, target_table, Plugins::new()).await
     }

    async fn test_instance_with_insert_select_interceptor(
        interceptor: SqlQueryInterceptorRef<Error>,
-    ) -> Instance {
+    ) -> TestResult<Instance> {
+        let plugins = Plugins::new();
+        plugins.insert::<SqlQueryInterceptorRef<Error>>(interceptor);
+
+        test_instance_with_plugins(
+            test_table(1024, "source")?,
+            test_table(1025, "target")?,
+            plugins,
+        )
+        .await
+    }
+
+    async fn test_instance_with_plugins(
+        source_table: TableRef,
+        target_table: TableRef,
+        plugins: Plugins,
+    ) -> TestResult<Instance> {
        let kv_backend = Arc::new(MemoryKvBackend::new());
        let process_manager = Arc::new(ProcessManager::new("test-frontend".to_string(), None));
-        let catalog_manager =
-            catalog::memory::MemoryCatalogManager::new_with_table(test_table(1024, "source"));
+        let catalog_manager = catalog::memory::MemoryCatalogManager::new_with_table(source_table);
+        let target_table_name = "target";
        catalog_manager
            .register_table_sync(catalog::RegisterTableRequest {
                catalog: "greptime".to_string(),
                schema: "public".to_string(),
-                table_name: "target".to_string(),
+                table_name: target_table_name.to_string(),
                table_id: 1025,
-                table: test_table(1025, "target"),
+                table: target_table,
            })
-            .unwrap();
+            .with_context(|_| RegisterTableSnafu {
+                table_name: target_table_name.to_string(),
+            })?;
        catalog_manager.register_process_list_table(process_manager.clone());

-        let cache_registry = test_cache_registry(kv_backend.clone());
-        let plugins = Plugins::new();
-        plugins.insert::<SqlQueryInterceptorRef<Error>>(interceptor);
+        let cache_registry = test_cache_registry(kv_backend.clone())?;

        FrontendBuilder::new(
            FrontendOptions::default(),
@@ -1423,17 +1666,25 @@ mod tests {
        .with_plugin(plugins)
        .try_build()
        .await
-        .unwrap()
+        .context(BuildFrontendSnafu)
    }

    async fn execute_one_sql(
        instance: &Instance,
        sql: &str,
        query_ctx: QueryContextRef,
-    ) -> Result<Output> {
+    ) -> TestResult<Output> {
        let mut results = instance.do_query_inner(sql, query_ctx).await;
-        assert_eq!(1, results.len());
-        results.remove(0)
+        ensure!(
+            results.len() == 1,
+            UnexpectedOutputCountSnafu {
+                sql: sql.to_string(),
+                actual: results.len(),
+            }
+        );
+        results.remove(0).with_context(|_| ExecuteSqlSnafu {
+            sql: sql.to_string(),
+        })
    }

    #[test]
@@ -1588,12 +1839,12 @@ mod tests {
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
-    async fn test_insert_select_is_visible_in_show_processlist() {
+    async fn test_insert_select_is_visible_in_show_processlist() -> TestResult<()> {
        let insert_sql = "INSERT INTO target SELECT * FROM source";
        let (started_tx, mut started_rx) = mpsc::unbounded_channel();
        let (finish_tx, finish_rx) = oneshot::channel();
        let interceptor = Arc::new(BlockingInsertSelectInterceptor::new(started_tx, finish_rx));
-        let instance = Arc::new(test_instance_with_insert_select_interceptor(interceptor).await);
+        let instance = Arc::new(test_instance_with_insert_select_interceptor(interceptor).await?);

        let insert_task = tokio::spawn({
            let instance = instance.clone();
@@ -1602,20 +1853,77 @@ mod tests {

        tokio::time::timeout(Duration::from_secs(5), started_rx.recv())
            .await
-            .unwrap()
-            .unwrap();
+            .context(InsertStartTimeoutSnafu)?
+            .context(InsertStartChannelClosedSnafu)?;

-        let output = execute_one_sql(&instance, "SHOW PROCESSLIST", test_query_ctx(43))
-            .await
-            .unwrap();
+        let output = execute_one_sql(&instance, "SHOW PROCESSLIST", test_query_ctx(43)).await?;
        let process_list = output.data.pretty_print().await;
        assert!(
            process_list.contains(insert_sql),
            "process list did not contain running insert:\n{process_list}"
        );

-        finish_tx.send(()).unwrap();
-        insert_task.await.unwrap().unwrap();
+        finish_tx
+            .send(())
+            .map_err(|_| ReleaseBlockedInsertSnafu.build())?;
+        insert_task.await.context(InsertTaskPanicSnafu)??;
+
+        Ok(())
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn test_kill_query_cancels_insert_select() -> TestResult<()> {
+        assert_kill_cancels_insert_select("KILL QUERY 4242").await
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+    async fn test_kill_process_id_cancels_insert_select() -> TestResult<()> {
+        assert_kill_cancels_insert_select("KILL 'test-frontend/4242'").await
+    }
+
+    async fn assert_kill_cancels_insert_select(kill_sql: &str) -> TestResult<()> {
+        let insert_sql = "INSERT INTO target SELECT * FROM source";
+        let (source_polled_tx, source_polled_rx) = oneshot::channel();
+        let instance = Arc::new(
+            test_instance_with_tables(
+                pending_table(1024, "source", source_polled_tx)?,
+                test_table(1025, "target")?,
+            )
+            .await?,
+        );
+
+        let insert_task = tokio::spawn({
+            let instance = instance.clone();
+            async move { execute_one_sql(&instance, insert_sql, test_query_ctx(4242)).await }
+        });
+
+        tokio::time::timeout(Duration::from_secs(5), source_polled_rx)
+            .await
+            .context(SourcePollTimeoutSnafu)?
+            .context(SourcePollChannelClosedSnafu)?;
+
+        let output = execute_one_sql(&instance, kill_sql, test_query_ctx(43)).await?;
+        assert!(matches!(output.data, OutputData::AffectedRows(1)));
+
+        let insert_result = tokio::time::timeout(Duration::from_secs(5), insert_task)
+            .await
+            .context(InsertTaskTimeoutSnafu)?
+            .context(InsertTaskPanicSnafu)?;
+        let err = match insert_result {
+            Ok(_) => return InsertSelectNotCancelledSnafu.fail(),
+            Err(TestError::ExecuteSql { source, .. }) => source,
+            Err(err) => return Err(err),
+        };
+        assert_eq!(StatusCode::Cancelled, err.status_code());
+
+        let output = execute_one_sql(&instance, "SHOW PROCESSLIST", test_query_ctx(43)).await?;
+        let process_list = output.data.pretty_print().await;
+        assert!(
+            !process_list.contains(insert_sql),
+            "process list still contains killed insert:\n{process_list}"
+        );
+
+        Ok(())
    }

    fn insert_dml_plan() -> LogicalPlan {
--- a/src/frontend/src/instance/grpc.rs
+++ b/src/frontend/src/instance/grpc.rs
@@ -121,8 +121,9 @@ impl GrpcQueryHandler for Instance {
                                .context(PlanStatementSnafu)?;

                            let dummy_catalog_list =
-                                Arc::new(catalog::table_source::dummy_catalog::DummyCatalogList::new(
+                                Arc::new(catalog::table_source::dummy_catalog::DummyCatalogList::new_with_query_ctx(
                                    self.catalog_manager().clone(),
+                                    ctx.clone(),
                                ));

                            let logical_plan = plan_decoder
@@ -416,10 +417,12 @@ impl Instance {
            .new_plan_decoder()
            .context(PlanStatementSnafu)?;

-        let dummy_catalog_list =
-            Arc::new(catalog::table_source::dummy_catalog::DummyCatalogList::new(
+        let dummy_catalog_list = Arc::new(
+            catalog::table_source::dummy_catalog::DummyCatalogList::new_with_query_ctx(
                self.catalog_manager().clone(),
-            ));
+                ctx.clone(),
+            ),
+        );

        // no optimize yet since we still need to add stuff
        let logical_plan = plan_decoder
--- a/src/index/Cargo.toml
+++ b/src/index/Cargo.toml
@@ -26,7 +26,7 @@ fst.workspace = true
 futures.workspace = true
 greptime-proto.workspace = true
 itertools.workspace = true
-jieba-rs = "0.8"
+jieba-rs = "0.10"
 lazy_static.workspace = true
 mockall.workspace = true
 nalgebra.workspace = true
@@ -40,8 +40,8 @@ serde.workspace = true
 serde_json.workspace = true
 snafu.workspace = true
 store-api.workspace = true
-tantivy = { version = "0.24", features = ["zstd-compression"] }
-tantivy-jieba = "0.16"
+tantivy = { version = "0.26", features = ["zstd-compression"] }
+tantivy-jieba = "0.20"
 tokio.workspace = true
 tokio-util.workspace = true
 usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
--- a/src/index/benches/tokenizer_bench.rs
+++ b/src/index/benches/tokenizer_bench.rs
@@ -12,8 +12,79 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
-use index::fulltext_index::tokenizer::{EnglishTokenizer, Tokenizer};
+use std::collections::HashMap;
+use std::hint::black_box;
+use std::path::PathBuf;
+use std::time::Duration;
+
+use async_trait::async_trait;
+use criterion::{BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use futures::AsyncRead;
+use index::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
+use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer};
+use index::fulltext_index::{Analyzer, Config};
+use puffin::puffin_manager::{PuffinWriter, PutOptions};
+
+const CHINESE_TOKENIZER_TEXTS: &[(&str, &str)] = &[
+    ("short", "登录手机号。中国农业银行。"),
+    (
+        "mixed_log",
+        "2025-08-01 21:09:28 用户登录失败 trace_id=abc_123 dynamic_key=mobile_login 中国农业银行接口返回超时。",
+    ),
+    (
+        "product_search",
+        "哈基米哦南北绿豆，噢马自立曼波。装电视台，中国中央广播电视台。压不缩，笑不活。",
+    ),
+    (
+        "long_news",
+        "中国农业银行发布公告称，手机银行登录服务完成升级。多个地区用户反馈查询速度提升，后台监控显示核心链路延迟下降，异常请求自动重试次数减少。系统继续保留 trace_id、request_id 和 dynamic_key 等字段用于排查问题。",
+    ),
+];
+
+const CHINESE_INDEX_DOCS: &[&str] = &[
+    "登录手机号，中国农业银行手机银行接口返回成功。",
+    "用户登录失败，trace_id=abc_123，dynamic_key=mobile_login。",
+    "中国中央广播电视台发布新的节目预告。",
+    "装电视台的时候遇到压不缩的问题。",
+    "哈基米哦南北绿豆，噢马自立曼波。",
+    "后台监控显示核心链路延迟下降。",
+    "系统保留 request_id 用于排查问题。",
+    "中文全文索引需要兼顾召回率和 token 数量。",
+];
+
+struct NoopPuffinWriter;
+
+#[async_trait]
+impl PuffinWriter for NoopPuffinWriter {
+    async fn put_blob<R>(
+        &mut self,
+        _key: &str,
+        _raw_data: R,
+        _options: PutOptions,
+        _properties: HashMap<String, String>,
+    ) -> puffin::error::Result<u64>
+    where
+        R: AsyncRead + Send,
+    {
+        unreachable!("tantivy fulltext benchmark only writes directory blobs")
+    }
+
+    async fn put_dir(
+        &mut self,
+        _key: &str,
+        _dir: PathBuf,
+        _options: PutOptions,
+        _properties: HashMap<String, String>,
+    ) -> puffin::error::Result<u64> {
+        Ok(0)
+    }
+
+    fn set_footer_lz4_compressed(&mut self, _lz4_compressed: bool) {}
+
+    async fn finish(self) -> puffin::error::Result<u64> {
+        Ok(0)
+    }
+}

 fn bench_english_tokenizer(c: &mut Criterion) {
    let tokenizer = EnglishTokenizer;
@@ -86,5 +157,104 @@ fn bench_english_tokenizer(c: &mut Criterion) {
    repeat_group.finish();
 }

-criterion_group!(benches, bench_english_tokenizer);
+fn bench_chinese_tokenizer(c: &mut Criterion) {
+    let tokenizer = ChineseTokenizer;
+    let mut group = c.benchmark_group("chinese_tokenizer");
+
+    for (name, text) in CHINESE_TOKENIZER_TEXTS {
+        group.throughput(Throughput::Bytes(text.len() as u64));
+        group.bench_with_input(BenchmarkId::new("tokenize", name), text, |b, text| {
+            b.iter(|| black_box(tokenizer.tokenize(black_box(text))))
+        });
+    }
+
+    group.finish();
+
+    let mut repeat_group = c.benchmark_group("chinese_tokenizer_repeated");
+    let sample_text = CHINESE_TOKENIZER_TEXTS
+        .iter()
+        .find(|(name, _)| *name == "mixed_log")
+        .map(|(_, text)| *text)
+        .expect("mixed_log sample must exist");
+
+    for repeat_count in [10, 100, 1000] {
+        repeat_group.bench_with_input(
+            BenchmarkId::new("repeated_tokenize", repeat_count),
+            &repeat_count,
+            |b, &repeat_count| {
+                b.iter(|| {
+                    for _ in 0..repeat_count {
+                        black_box(tokenizer.tokenize(black_box(sample_text)));
+                    }
+                })
+            },
+        );
+    }
+
+    repeat_group.finish();
+}
+
+fn bench_tantivy_chinese_fulltext_index(c: &mut Criterion) {
+    let runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("failed to create Tokio runtime");
+    let config = Config {
+        analyzer: Analyzer::Chinese,
+        case_sensitive: false,
+    };
+    let mut group = c.benchmark_group("tantivy_chinese_fulltext_index");
+    group.sample_size(10);
+    group.measurement_time(Duration::from_secs(10));
+
+    for doc_count in [32usize, 256usize] {
+        group.throughput(Throughput::Elements(doc_count as u64));
+        group.bench_with_input(
+            BenchmarkId::new("build_commit", doc_count),
+            &doc_count,
+            |b, &doc_count| {
+                b.iter_batched(
+                    tempfile::tempdir,
+                    |dir| {
+                        let dir = dir.expect("failed to create temp dir");
+                        runtime.block_on(async {
+                            let mut creator =
+                                TantivyFulltextIndexCreator::new(dir.path(), config, 64 << 20)
+                                    .await
+                                    .expect("failed to create tantivy fulltext index");
+                            for idx in 0..doc_count {
+                                let text = CHINESE_INDEX_DOCS[idx % CHINESE_INDEX_DOCS.len()];
+                                creator
+                                    .push_text(black_box(text))
+                                    .await
+                                    .expect("failed to push text");
+                            }
+                            let mut puffin_writer = NoopPuffinWriter;
+                            creator
+                                .finish(
+                                    &mut puffin_writer,
+                                    "tantivy_chinese_fulltext_index",
+                                    PutOptions::default(),
+                                )
+                                .await
+                                .expect("failed to commit tantivy fulltext index");
+                        });
+                        // Return the temp dir so Criterion drops it after timing the routine.
+                        dir
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_english_tokenizer,
+    bench_chinese_tokenizer,
+    bench_tantivy_chinese_fulltext_index
+);
 criterion_main!(benches);
--- a/src/index/src/fulltext_index.rs
+++ b/src/index/src/fulltext_index.rs
@@ -52,7 +52,7 @@ impl Config {
    fn build_tantivy_tokenizer(&self) -> TokenizerManager {
        let mut builder = match self.analyzer {
            Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
-            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
+            Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer::new()).dynamic(),
        };

        if !self.case_sensitive {
--- a/src/index/src/fulltext_index/tokenizer.rs
+++ b/src/index/src/fulltext_index/tokenizer.rs
@@ -98,7 +98,8 @@ impl Tokenizer for ChineseTokenizer {
            let mut tokens = JIEBA
                .cut_for_search(text, true)
                .into_iter()
-                .filter(|s| is_indexable_token(s))
+                .map(|token| token.word)
+                .filter(|token| is_indexable_token(token))
                .collect::<Vec<_>>();

            let english = EnglishTokenizer {};
@@ -336,10 +337,26 @@ mod tests {
        let text = "哈基米哦南北绿豆，噢马自立曼波。登录手机号。中国农业银行。装电视台，中国中央广播电视台。压不缩，笑不活。";

        let default_tokens = tokenizer.tokenize(text);
-        let cut_hmm_false = JIEBA.cut(text, false);
-        let cut_hmm_true = JIEBA.cut(text, true);
-        let cut_for_search_hmm_false = JIEBA.cut_for_search(text, false);
-        let cut_for_search_hmm_true = JIEBA.cut_for_search(text, true);
+        let cut_hmm_false = JIEBA
+            .cut(text, false)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_hmm_true = JIEBA
+            .cut(text, true)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_for_search_hmm_false = JIEBA
+            .cut_for_search(text, false)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();
+        let cut_for_search_hmm_true = JIEBA
+            .cut_for_search(text, true)
+            .into_iter()
+            .map(|token| token.word)
+            .collect::<Vec<_>>();

        assert_eq!(
            default_tokens,
--- a/src/meta-srv/src/cache_invalidator.rs
+++ b/src/meta-srv/src/cache_invalidator.rs
@@ -84,4 +84,11 @@ impl CacheInvalidator for MetasrvCacheInvalidator {
        let instruction = Instruction::InvalidateCaches(caches.to_vec());
        self.broadcast(ctx, instruction).await
    }
+
+    fn invalidate_all(&self) -> MetaResult<()> {
+        // Intentionally a no-op: this invalidator only broadcasts concrete
+        // cache identifiers to remote nodes, and the heartbeat instruction
+        // protocol has no global invalidate-all message to send instead.
+        Ok(())
+    }
 }
--- a/src/meta-srv/src/handler.rs
+++ b/src/meta-srv/src/handler.rs
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashSet};
 use std::fmt::{Debug, Display};
-use std::ops::Range;
+use std::ops::Bound;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};

@@ -136,6 +137,26 @@ pub struct PusherId {
    pub id: u64,
 }

+impl PartialEq for PusherId {
+    fn eq(&self, other: &Self) -> bool {
+        (self.role as i32, self.id) == (other.role as i32, other.id)
+    }
+}
+
+impl Eq for PusherId {}
+
+impl PartialOrd for PusherId {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for PusherId {
+    fn cmp(&self, other: &Self) -> Ordering {
+        (self.role as i32, self.id).cmp(&(other.role as i32, other.id))
+    }
+}
+
 impl Debug for PusherId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:?}-{}", self.role, self.id)
@@ -153,8 +174,11 @@ impl PusherId {
        Self { role, id }
    }

-    pub fn string_key(&self) -> String {
-        format!("{}-{}", self.role as i32, self.id)
+    fn role_range(role: Role) -> (Bound<Self>, Bound<Self>) {
+        (
+            Bound::Included(Self::new(role, u64::MIN)),
+            Bound::Included(Self::new(role, u64::MAX)),
+        )
    }
 }

@@ -214,7 +238,7 @@ impl Pusher {

 /// The group of heartbeat pushers.
 #[derive(Clone, Default)]
-pub struct Pushers(Arc<RwLock<BTreeMap<String, Pusher>>>);
+pub struct Pushers(Arc<RwLock<BTreeMap<PusherId, Pusher>>>);

 impl Pushers {
    async fn push(
@@ -222,11 +246,12 @@ impl Pushers {
        pusher_id: PusherId,
        mailbox_message: MailboxMessage,
    ) -> Result<DeregisterSignalReceiver> {
-        let pusher_id = pusher_id.string_key();
        let pushers = self.0.read().await;
        let pusher = pushers
            .get(&pusher_id)
-            .context(error::PusherNotFoundSnafu { pusher_id })?;
+            .with_context(|| error::PusherNotFoundSnafu {
+                pusher_id: pusher_id.to_string(),
+            })?;

        pusher
            .push(HeartbeatResponse {
@@ -239,14 +264,10 @@ impl Pushers {
        Ok(pusher.deregister_signal_receiver.clone())
    }

-    async fn broadcast(
-        &self,
-        range: Range<String>,
-        mailbox_message: &MailboxMessage,
-    ) -> Result<()> {
+    async fn broadcast(&self, role: Role, mailbox_message: &MailboxMessage) -> Result<()> {
        let pushers = self.0.read().await;
        let pushers = pushers
-            .range(range)
+            .range(PusherId::role_range(role))
            .map(|(_, value)| value)
            .collect::<Vec<_>>();
        let mut results = Vec::with_capacity(pushers.len());
@@ -271,21 +292,12 @@ impl Pushers {
        Ok(())
    }

-    pub(crate) async fn insert(&self, pusher_id: String, pusher: Pusher) -> Option<Pusher> {
+    pub(crate) async fn insert(&self, pusher_id: PusherId, pusher: Pusher) -> Option<Pusher> {
        self.0.write().await.insert(pusher_id, pusher)
    }

-    async fn remove(&self, pusher_id: &str) -> Option<Pusher> {
-        self.0.write().await.remove(pusher_id)
-    }
-
-    pub(crate) async fn clear(&self) -> Vec<String> {
-        let mut pushers = self.0.write().await;
-        let keys = pushers.keys().cloned().collect::<Vec<_>>();
-        if !keys.is_empty() {
-            pushers.clear();
-        }
-        keys
+    async fn remove(&self, pusher_id: PusherId) -> Option<Pusher> {
+        self.0.write().await.remove(&pusher_id)
    }
 }

@@ -317,17 +329,24 @@ impl HeartbeatHandlerGroup {
    pub async fn register_pusher(&self, pusher_id: PusherId, pusher: Pusher) {
        METRIC_META_HEARTBEAT_CONNECTION_NUM.inc();
        info!("Pusher register: {}", pusher_id);
-        let _ = self.pushers.insert(pusher_id.string_key(), pusher).await;
+        let _ = self.pushers.insert(pusher_id, pusher).await;
    }

    /// Deregisters the heartbeat response [`Pusher`] with the given key from the group.
    pub async fn deregister_push(&self, pusher_id: PusherId) {
-        info!("Pusher unregister: {}", pusher_id);
-        if self.pushers.remove(&pusher_id.string_key()).await.is_some() {
+        if self.pushers.remove(pusher_id).await.is_some() {
+            info!("Pusher unregister: {}", pusher_id);
            METRIC_META_HEARTBEAT_CONNECTION_NUM.dec();
        }
    }

+    /// Test-only: returns whether the group contains the heartbeat response
+    /// [`Pusher`] registered under `pusher_id`.
+    #[cfg(test)]
+    pub async fn contains_pusher(&self, pusher_id: &PusherId) -> bool {
+        self.pushers.0.read().await.contains_key(pusher_id)
+    }
+
    /// Returns the [`Pushers`] of the group.
    pub fn pushers(&self) -> Pushers {
        self.pushers.clone()
@@ -533,7 +552,7 @@ impl Mailbox for HeartbeatMailbox {
    }

    async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()> {
-        self.pushers.broadcast(ch.pusher_range(), msg).await
+        self.pushers.broadcast(ch.role(), msg).await
    }

    async fn on_recv(&self, id: MessageId, maybe_msg: Result<MailboxMessage>) -> Result<()> {
@@ -550,14 +569,6 @@ impl Mailbox for HeartbeatMailbox {

        Ok(())
    }
-
-    async fn reset(&self) {
-        let keys = self.pushers.clear().await;
-        if !keys.is_empty() {
-            info!("Reset mailbox, deregister pushers: {:?}", keys);
-            METRIC_META_HEARTBEAT_CONNECTION_NUM.sub(keys.len() as i64);
-        }
-    }
 }

 /// The builder to build the group of heartbeat handlers.
@@ -871,6 +882,7 @@ impl HeartbeatHandlerGroupBuilderCustomizer for DefaultHeartbeatHandlerGroupBuil
 mod tests {

    use std::assert_matches;
+    use std::collections::BTreeMap;
    use std::sync::Arc;
    use std::time::Duration;

@@ -946,6 +958,62 @@ mod tests {
        (mailbox, receiver)
    }

+    #[test]
+    fn test_pusher_id_role_range() {
+        let mut pushers = BTreeMap::new();
+        pushers.insert(PusherId::new(Role::Datanode, u64::MAX), "datanode");
+        pushers.insert(PusherId::new(Role::Frontend, u64::MIN), "frontend-min");
+        pushers.insert(PusherId::new(Role::Frontend, u64::MAX), "frontend-max");
+        pushers.insert(PusherId::new(Role::Flownode, u64::MIN), "flownode");
+
+        let frontend_pushers = pushers
+            .range(PusherId::role_range(Role::Frontend))
+            .map(|(_, value)| *value)
+            .collect::<Vec<_>>();
+
+        assert_eq!(frontend_pushers, vec!["frontend-min", "frontend-max"]);
+    }
+
+    #[tokio::test]
+    async fn test_pushers_broadcast_by_role() {
+        let pushers = Pushers::default();
+        let (datanode_tx, mut datanode_rx) = mpsc::channel(1);
+        let (frontend_tx, mut frontend_rx) = mpsc::channel(1);
+        let (flownode_tx, mut flownode_rx) = mpsc::channel(1);
+
+        pushers
+            .insert(
+                PusherId::new(Role::Datanode, u64::MAX),
+                Pusher::new(datanode_tx),
+            )
+            .await;
+        pushers
+            .insert(PusherId::new(Role::Frontend, 1), Pusher::new(frontend_tx))
+            .await;
+        pushers
+            .insert(
+                PusherId::new(Role::Flownode, u64::MIN),
+                Pusher::new(flownode_tx),
+            )
+            .await;
+
+        let msg = MailboxMessage {
+            id: 42,
+            subject: "broadcast-test".to_string(),
+            timestamp_millis: 123,
+            ..Default::default()
+        };
+
+        pushers.broadcast(Role::Frontend, &msg).await.unwrap();
+
+        let received = frontend_rx.recv().await.unwrap().unwrap();
+        let mailbox_message = received.mailbox_message.unwrap();
+        assert_eq!(mailbox_message.id, 0);
+        assert_eq!(mailbox_message.subject, "broadcast-test");
+        assert!(datanode_rx.try_recv().is_err());
+        assert!(flownode_rx.try_recv().is_err());
+    }
+
    #[test]
    fn test_handler_group_builder() {
        let group = HeartbeatHandlerGroupBuilder::new(Pushers::default())
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -512,7 +512,6 @@ pub struct MetaStateHandler {
    greptimedb_telemetry_task: Arc<GreptimeDBTelemetryTask>,
    leader_cached_kv_backend: Arc<LeaderCachedKvBackend>,
    leadership_change_notifier: LeadershipChangeNotifier,
-    mailbox: MailboxRef,
    state: StateRef,
 }

@@ -536,9 +535,6 @@ impl MetaStateHandler {
    pub async fn on_leader_stop(&self) {
        self.state.write().unwrap().next_state(become_follower());

-        // Enforces the mailbox to clear all pushers.
-        // The remaining heartbeat connections will be closed by the remote peer or keep-alive detection.
-        self.mailbox.reset().await;
        self.leadership_change_notifier
            .notify_on_leader_stop()
            .await;
@@ -667,7 +663,6 @@ impl Metasrv {
                state: self.state.clone(),
                leader_cached_kv_backend: leader_cached_kv_backend.clone(),
                leadership_change_notifier,
-                mailbox: self.mailbox.clone(),
            };
            let _handle = common_runtime::spawn_global(async move {
                loop {
--- a/src/meta-srv/src/procedure/repartition.rs
+++ b/src/meta-srv/src/procedure/repartition.rs
@@ -20,6 +20,7 @@ pub mod group;
 pub mod plan;
 pub mod repartition_end;
 pub mod repartition_start;
+pub mod update_partition_metadata;
 pub mod utils;

 use std::any::Any;
@@ -32,7 +33,7 @@ use common_meta::cache_invalidator::CacheInvalidatorRef;
 use common_meta::ddl::DdlContext;
 use common_meta::ddl::allocator::region_routes::RegionRoutesAllocatorRef;
 use common_meta::ddl::allocator::wal_options::WalOptionsAllocatorRef;
-use common_meta::ddl_manager::RepartitionProcedureFactory;
+use common_meta::ddl_manager::{RepartitionProcedureFactory, RepartitionSource};
 use common_meta::instruction::CacheIdent;
 use common_meta::key::datanode_table::RegionInfo;
 use common_meta::key::table_info::TableInfoValue;
@@ -62,7 +63,8 @@ use crate::procedure::repartition::group::{
    Context as RepartitionGroupContext, RepartitionGroupProcedure, region_routes,
 };
 use crate::procedure::repartition::plan::RepartitionPlanEntry;
-use crate::procedure::repartition::repartition_start::RepartitionStart;
+use crate::procedure::repartition::repartition_start::{RepartitionFrom, RepartitionStart};
+use crate::procedure::repartition::update_partition_metadata::PartitionMetadataUpdate;
 use crate::procedure::repartition::utils::{
    get_datanode_table_value, rollback_group_metadata_routes,
 };
@@ -93,6 +95,9 @@ pub struct PersistentContext {
    /// The timeout for repartition operations.
    #[serde(with = "humantime_serde", default = "default_timeout")]
    pub timeout: Duration,
+    #[serde(default)]
+    /// Records table-level partition metadata added by this repartition.
+    pub partition_metadata_update: Option<PartitionMetadataUpdate>,
 }

 fn default_timeout() -> Duration {
@@ -121,6 +126,7 @@ impl PersistentContext {
            failed_procedures: vec![],
            unknown_procedures: vec![],
            timeout: timeout.unwrap_or_else(default_timeout),
+            partition_metadata_update: None,
        }
    }

@@ -317,7 +323,9 @@ impl Context {
    ///
    /// Abort:
    /// - Table info not found.
-    pub async fn get_table_info_value(&self) -> Result<TableInfoValue> {
+    pub async fn get_raw_table_info_value(
+        &self,
+    ) -> Result<DeserializedValueWithBytes<TableInfoValue>> {
        let table_id = self.persistent_ctx.table_id;
        let table_info_value = self
            .table_metadata_manager
@@ -328,11 +336,36 @@ impl Context {
            .with_context(|_| error::RetryLaterWithSourceSnafu {
                reason: format!("Failed to get table info for table: {}", table_id),
            })?
-            .context(error::TableInfoNotFoundSnafu { table_id })?
-            .into_inner();
+            .context(error::TableInfoNotFoundSnafu { table_id })?;
+
        Ok(table_info_value)
    }

+    /// Returns the inner [`TableInfoValue`] of [`Self::get_raw_table_info_value`].
+    pub async fn get_table_info_value(&self) -> Result<TableInfoValue> {
+        Ok(self.get_raw_table_info_value().await?.into_inner())
+    }
+
+    /// Updates the table info.
+    pub async fn update_table_info(
+        &self,
+        current_table_info_value: &DeserializedValueWithBytes<TableInfoValue>,
+        new_table_info_value: TableInfoValue,
+    ) -> Result<()> {
+        let table_id = self.persistent_ctx.table_id;
+        self.table_metadata_manager
+            .update_table_info(
+                current_table_info_value,
+                None,
+                new_table_info_value.table_info,
+            )
+            .await
+            .map_err(BoxedError::new)
+            .with_context(|_| error::RetryLaterWithSourceSnafu {
+                reason: format!("Failed to update table info for table: {}", table_id),
+            })
+    }
+
    /// Updates the table route.
    ///
    /// Retry:
@@ -469,12 +502,8 @@ struct RepartitionDataOwned {
 impl RepartitionProcedure {
    const TYPE_NAME: &'static str = "metasrv-procedure::Repartition";

-    pub fn new(
-        from_exprs: Vec<PartitionExpr>,
-        to_exprs: Vec<PartitionExpr>,
-        context: Context,
-    ) -> Self {
-        let state = Box::new(RepartitionStart::new(from_exprs, to_exprs));
+    pub fn new(from: RepartitionFrom, to_exprs: Vec<PartitionExpr>, context: Context) -> Self {
+        let state = Box::new(RepartitionStart::new(from, to_exprs));

        Self { state, context }
    }
@@ -492,24 +521,24 @@ impl RepartitionProcedure {
        Ok(Self { state, context })
    }

-    /// Returns whether parent rollback should remove this repartition's allocated regions.
+    /// Returns whether parent rollback should run.
    ///
-    /// This uses an "after AllocateRegion" semantic: once execution reaches
-    /// `AllocateRegion` or any later state, rollback must try to remove this round's
-    /// `allocated_region_ids` from table-route metadata when they exist.
-    ///
-    /// State flow:
-    /// `RepartitionStart -> AllocateRegion -> Dispatch -> Collect -> DeallocateRegion -> RepartitionEnd`
-    ///                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-    ///                     rollback allocated regions in metadata
+    /// This uses an "after repartition metadata update" semantic: once execution
+    /// reaches `UpdatePartitionMetadata` or any later rollback-active state,
+    /// rollback must try to clean metadata written by the repartition procedure.
    ///
    /// Notes:
-    /// - `RepartitionStart`: no-op, because allocation has not happened yet.
-    /// - `AllocateRegion` / `Dispatch` / `Collect`  rollback-active.
+    /// - `RepartitionStart`: no-op, because no metadata has been updated yet.
+    /// - `UpdatePartitionMetadata`: rollback table partition metadata.
+    /// - `AllocateRegion` / `Dispatch` / `Collect`: rollback table partition metadata
+    ///   and allocated region metadata.
    /// - `DeallocateRegion`: is not rollback-active.
    /// - `RepartitionEnd`: no-op.
-    fn should_rollback_allocated_regions(&self) -> bool {
-        self.state.as_any().is::<allocate_region::AllocateRegion>()
+    fn should_rollback(&self) -> bool {
+        self.state
+            .as_any()
+            .is::<update_partition_metadata::UpdatePartitionMetadata>()
+            || self.state.as_any().is::<allocate_region::AllocateRegion>()
            || self.state.as_any().is::<dispatch::Dispatch>()
            || self.state.as_any().is::<collect::Collect>()
    }
@@ -526,7 +555,7 @@ impl RepartitionProcedure {

    /// Returns allocated region ids that parent rollback should remove.
    ///
-    /// Rollback uses an "after AllocateRegion" semantic:
+    /// Rollback uses an "after region allocation" semantic:
    /// - in `AllocateRegion` and `Dispatch`, all allocated regions belong to the
    ///   current repartition attempt and must be cleaned up.
    /// - in `Collect`, only the plans referenced by failed or unknown
@@ -586,8 +615,47 @@ impl RepartitionProcedure {
        Ok(())
    }

+    /// Removes the partition key indices recorded in `partition_metadata_update`
+    /// from the table info. Returns without writing when no update was
+    /// recorded, the recorded indices are empty, or none of them are present.
+    async fn rollback_partition_metadata(&mut self) -> Result<()> {
+        let persistent_ctx = &self.context.persistent_ctx;
+        let added_indices = match persistent_ctx.partition_metadata_update.as_ref() {
+            Some(update) if !update.partition_key_indices.is_empty() => {
+                &update.partition_key_indices
+            }
+            _ => return Ok(()),
+        };
+
+        let table_info_value = self.context.get_raw_table_info_value().await?;
+        let current_indices = &table_info_value.table_info.meta.partition_key_indices;
+        let remaining_indices = current_indices
+            .iter()
+            .copied()
+            .filter(|idx| !added_indices.contains(idx))
+            .collect::<Vec<_>>();
+        // `filter` only drops entries, so equal lengths mean nothing changed.
+        if remaining_indices.len() == current_indices.len() {
+            return Ok(());
+        }
+
+        let mut restored_table_info = table_info_value.table_info.clone();
+        restored_table_info.meta.partition_key_indices = remaining_indices;
+        let restored_value = table_info_value.update(restored_table_info);
+        self.context
+            .update_table_info(&table_info_value, restored_value)
+            .await?;
+
+        // The table cache is deliberately left alone here. Until
+        // `rollback_inner` restores the table routes they may still carry
+        // partition expressions, and exposing cleared partition columns next
+        // to partitioned routes can build an inconsistent partition rule. The
+        // cache is invalidated once, after both metadata and routes are rolled back.
+        Ok(())
+    }
+
    async fn rollback_inner(&mut self, procedure_ctx: &ProcedureContext) -> Result<()> {
-        if !self.should_rollback_allocated_regions() {
+        if !self.should_rollback() {
            return Ok(());
        }

@@ -596,6 +664,8 @@ impl RepartitionProcedure {

        let table_lock = TableLock::Write(table_id).into();
        let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
+
+        self.rollback_partition_metadata().await?;
        let table_route_value = self.context.get_table_route_value().await?;
        let original_region_routes = region_routes(table_id, table_route_value.get_inner_ref())?;
        let mut current_region_routes = original_region_routes.clone();
@@ -738,20 +808,28 @@ impl RepartitionProcedureFactory for DefaultRepartitionProcedureFactory {
        ddl_ctx: &DdlContext,
        table_name: TableName,
        table_id: TableId,
-        from_exprs: Vec<String>,
+        source: RepartitionSource,
        to_exprs: Vec<String>,
        timeout: Option<Duration>,
    ) -> std::result::Result<BoxedProcedure, BoxedError> {
        let persistent_ctx = PersistentContext::new(table_name, table_id, timeout);
-        let from_exprs = from_exprs
-            .iter()
-            .map(|e| {
-                PartitionExpr::from_json_str(e)
-                    .context(error::DeserializePartitionExprSnafu)?
-                    .context(error::EmptyPartitionExprSnafu)
-            })
-            .collect::<Result<Vec<_>>>()
-            .map_err(BoxedError::new)?;
+        let from = match source {
+            RepartitionSource::Partitioned { exprs } => {
+                let exprs = exprs
+                    .iter()
+                    .map(|e| {
+                        PartitionExpr::from_json_str(e)
+                            .context(error::DeserializePartitionExprSnafu)?
+                            .context(error::EmptyPartitionExprSnafu)
+                    })
+                    .collect::<Result<Vec<_>>>()
+                    .map_err(BoxedError::new)?;
+                RepartitionFrom::Partitioned { exprs }
+            }
+            RepartitionSource::Unpartitioned { partition_columns } => {
+                RepartitionFrom::Unpartitioned { partition_columns }
+            }
+        };
        let to_exprs = to_exprs
            .iter()
            .map(|e| {
@@ -763,7 +841,7 @@ impl RepartitionProcedureFactory for DefaultRepartitionProcedureFactory {
            .map_err(BoxedError::new)?;

        let procedure = RepartitionProcedure::new(
-            from_exprs,
+            from,
            to_exprs,
            Context::new(
                ddl_ctx,
@@ -853,27 +931,30 @@ mod tests {
    use crate::procedure::repartition::deallocate_region::DeallocateRegion;
    use crate::procedure::repartition::dispatch::Dispatch;
    use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
    use crate::procedure::repartition::repartition_end::RepartitionEnd;
    use crate::procedure::repartition::test_util::{
        TestingEnv, assert_parent_state, current_parent_region_routes, extract_subprocedure_ids,
        new_parent_context, procedure_context_with_receivers, procedure_state_receiver, range_expr,
        test_region_route, test_region_wal_options,
    };
+    use crate::procedure::repartition::update_partition_metadata::{
+        PartitionMetadataUpdate, UpdatePartitionMetadata,
+    };

    fn test_plan(table_id: TableId) -> RepartitionPlanEntry {
        RepartitionPlanEntry {
            group_id: uuid::Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 1),
-                partition_expr: range_expr("x", 0, 100),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 1),
+                range_expr("x", 0, 100),
+            )],
            target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 1),
                    partition_expr: range_expr("x", 0, 50),
                },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 3),
                    partition_expr: range_expr("x", 50, 100),
                },
@@ -927,6 +1008,15 @@ mod tests {
            .unwrap()
    }

+    /// Reads the table info through `ctx` and returns the table's
+    /// `partition_key_indices`.
+    ///
+    /// Panics if the table info cannot be fetched.
+    async fn table_partition_key_indices(ctx: &Context) -> Vec<usize> {
+        let table_info_value = ctx.get_table_info_value().await.unwrap();
+        table_info_value.table_info.meta.partition_key_indices
+    }
+
    fn test_procedure(state: Box<dyn State>, context: Context) -> RepartitionProcedure {
        RepartitionProcedure { state, context }
    }
@@ -965,34 +1055,43 @@ mod tests {
    }

    #[test]
-    fn test_should_rollback_allocated_regions() {
+    fn test_should_rollback_after_metadata_update() {
        let env = TestingEnv::new();
        let table_id = 1024;

        let procedure = test_procedure(
-            Box::new(RepartitionStart::new(vec![], vec![])),
+            Box::new(RepartitionStart::new(
+                RepartitionFrom::Partitioned { exprs: vec![] },
+                vec![],
+            )),
            test_context(&env, table_id),
        );
-        assert!(!procedure.should_rollback_allocated_regions());
+        assert!(!procedure.should_rollback());
+
+        let procedure = test_procedure(
+            Box::new(UpdatePartitionMetadata::new(vec![])),
+            test_context(&env, table_id),
+        );
+        assert!(procedure.should_rollback());

        let procedure = test_procedure(
            Box::new(AllocateRegion::new(vec![])),
            test_context(&env, table_id),
        );
-        assert!(procedure.should_rollback_allocated_regions());
+        assert!(procedure.should_rollback());

        let procedure = test_procedure(Box::new(Dispatch), test_context(&env, table_id));
-        assert!(procedure.should_rollback_allocated_regions());
+        assert!(procedure.should_rollback());

        let procedure =
            test_procedure(Box::new(Collect::new(vec![])), test_context(&env, table_id));
-        assert!(procedure.should_rollback_allocated_regions());
+        assert!(procedure.should_rollback());

        let procedure = test_procedure(Box::new(DeallocateRegion), test_context(&env, table_id));
-        assert!(!procedure.should_rollback_allocated_regions());
+        assert!(!procedure.should_rollback());

        let procedure = test_procedure(Box::new(RepartitionEnd), test_context(&env, table_id));
-        assert!(!procedure.should_rollback_allocated_regions());
+        assert!(!procedure.should_rollback());
    }

    #[test]
@@ -1048,6 +1147,68 @@ mod tests {
        );
    }

+    #[test]
+    fn test_persistent_context_partition_metadata_update_serde_default() {
+        let json = r#"{
+            "catalog_name":"test_catalog",
+            "schema_name":"test_schema",
+            "table_name":"test_table",
+            "table_id":1024,
+            "plans":[],
+            "timeout":"120s"
+        }"#;
+        // The payload has no `partition_metadata_update` key at all.
+        let persistent_ctx: PersistentContext = serde_json::from_str(json).unwrap();
+        // Deserialization must still succeed, leaving the field as `None`.
+        assert!(persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    /// Rolling back from `UpdatePartitionMetadata` removes exactly the partition
+    /// key indices recorded in `partition_metadata_update` and leaves every other
+    /// index of the table info untouched.
+    #[tokio::test]
+    async fn test_repartition_rollback_removes_partition_metadata_indices() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let node_manager = Arc::new(MockDatanodeManager::new(UnexpectedErrorDatanodeHandler));
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+
+        // Seed the table info with two partition key indices.
+        let mut context = new_parent_context(&env, node_manager, table_id);
+        let current = context.get_raw_table_info_value().await.unwrap();
+        let mut table_info = current.table_info.clone();
+        table_info.meta.partition_key_indices = vec![0, 1];
+        context
+            .update_table_info(&current, current.update(table_info))
+            .await
+            .unwrap();
+        // Record index 0 as the partition metadata update of this procedure.
+        context.persistent_ctx.partition_metadata_update = Some(PartitionMetadataUpdate {
+            partition_key_indices: vec![0],
+        });
+        // `UpdatePartitionMetadata` is one of the states `should_rollback` accepts.
+        let mut procedure = RepartitionProcedure {
+            state: Box::new(UpdatePartitionMetadata::new(vec![])),
+            context,
+        };
+
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        // Only the recorded index is removed; index 1 is left in place.
+        assert_eq!(
+            table_partition_key_indices(&procedure.context).await,
+            vec![1]
+        );
+    }
+
    #[tokio::test]
    async fn test_repartition_rollback_removes_allocated_routes_from_dispatch() {
        let env = TestingEnv::new();
@@ -1209,16 +1370,16 @@ mod tests {
        );
        let succeeded_plan = RepartitionPlanEntry {
            group_id: Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 2),
-                partition_expr: range_expr("x", 100, 200),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 2),
+                range_expr("x", 100, 200),
+            )],
            target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 2),
                    partition_expr: range_expr("x", 100, 150),
                },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 4),
                    partition_expr: range_expr("x", 150, 200),
                },
@@ -1292,16 +1453,16 @@ mod tests {
        );
        let succeeded_plan = RepartitionPlanEntry {
            group_id: Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 2),
-                partition_expr: range_expr("x", 100, 200),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 2),
+                range_expr("x", 100, 200),
+            )],
            target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 2),
                    partition_expr: range_expr("x", 100, 150),
                },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 4),
                    partition_expr: range_expr("x", 150, 200),
                },
@@ -1567,16 +1728,16 @@ mod tests {
        let failed_merge_plan = RepartitionPlanEntry {
            group_id: Uuid::new_v4(),
            source_regions: vec![
-                RegionDescriptor {
-                    region_id: RegionId::new(table_id, 1),
-                    partition_expr: range_expr("x", 0, 100),
-                },
-                RegionDescriptor {
-                    region_id: RegionId::new(table_id, 2),
-                    partition_expr: range_expr("x", 100, 200),
-                },
+                SourceRegionDescriptor::partitioned(
+                    RegionId::new(table_id, 1),
+                    range_expr("x", 0, 100),
+                ),
+                SourceRegionDescriptor::partitioned(
+                    RegionId::new(table_id, 2),
+                    range_expr("x", 100, 200),
+                ),
            ],
-            target_regions: vec![RegionDescriptor {
+            target_regions: vec![TargetRegionDescriptor {
                region_id: RegionId::new(table_id, 1),
                partition_expr: range_expr("x", 0, 200),
            }],
@@ -1587,16 +1748,16 @@ mod tests {
        };
        let succeeded_split_plan = RepartitionPlanEntry {
            group_id: Uuid::new_v4(),
-            source_regions: vec![RegionDescriptor {
-                region_id: RegionId::new(table_id, 3),
-                partition_expr: range_expr("x", 200, 300),
-            }],
+            source_regions: vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 3),
+                range_expr("x", 200, 300),
+            )],
            target_regions: vec![
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 3),
                    partition_expr: range_expr("x", 200, 250),
                },
-                RegionDescriptor {
+                TargetRegionDescriptor {
                    region_id: RegionId::new(table_id, 4),
                    partition_expr: range_expr("x", 250, 300),
                },
@@ -1708,7 +1869,9 @@ mod tests {

        let context = new_parent_context(&env, node_manager, table_id);
        let mut procedure = RepartitionProcedure::new(
-            vec![range_expr("x", 0, 100)],
+            RepartitionFrom::Partitioned {
+                exprs: vec![range_expr("x", 0, 100)],
+            },
            vec![range_expr("x", 0, 50), range_expr("x", 50, 100)],
            context,
        );
@@ -1810,6 +1973,226 @@ mod tests {
        );
    }

+    #[tokio::test]
+    async fn test_repartition_procedure_flow_unpartitioned_failed_and_full_rollback() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        // Seed the table metadata with one region route for region 1.
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        // Repartition the unpartitioned table on `col1` into two range partitions.
+        let context = new_parent_context(&env, node_manager, table_id);
+        let to_exprs = vec![range_expr("col1", 0, 50), range_expr("col1", 50, 100)];
+        let mut procedure = RepartitionProcedure::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            to_exprs.clone(),
+            context,
+        );
+        // Step 1: record the partition key indices, then enter `UpdatePartitionMetadata`.
+        let start_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(start_status.need_persist());
+        assert_parent_state::<UpdatePartitionMetadata>(&procedure);
+        assert_eq!(
+            procedure
+                .context
+                .persistent_ctx
+                .partition_metadata_update
+                .as_ref()
+                .unwrap()
+                .partition_key_indices,
+            vec![0]
+        );
+        // Step 2: the indices are written to the table info, then enter `AllocateRegion`.
+        let update_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(update_status.need_persist());
+        assert_parent_state::<AllocateRegion>(&procedure);
+        assert_eq!(
+            table_partition_key_indices(&procedure.context).await,
+            vec![0]
+        );
+        // Step 3: one plan is stored in the context; the state stays `AllocateRegion`.
+        let build_allocate_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(build_allocate_status.need_persist());
+        assert_parent_state::<AllocateRegion>(&procedure);
+        assert_eq!(procedure.context.persistent_ctx.plans.len(), 1);
+        let plan = &procedure.context.persistent_ctx.plans[0];
+        assert_eq!(
+            plan.source_regions,
+            vec![SourceRegionDescriptor::Default {
+                region_id: RegionId::new(table_id, 1)
+            }]
+        );
+        assert_eq!(plan.target_regions.len(), 2);
+        assert_eq!(plan.target_regions[0].region_id, RegionId::new(table_id, 1));
+        assert_eq!(plan.target_regions[0].partition_expr, to_exprs[0]);
+        assert_eq!(
+            plan.allocated_region_ids,
+            vec![plan.target_regions[1].region_id]
+        );
+        assert!(plan.pending_deallocate_region_ids.is_empty());
+        assert_eq!(plan.transition_map, vec![vec![0, 1]]);
+        let target_regions = plan.target_regions.clone();
+        // Step 4: two routes now exist; the reused region still has an empty expression.
+        let execute_allocate_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        assert!(execute_allocate_status.need_persist());
+        assert_parent_state::<Dispatch>(&procedure);
+        let region_routes = current_parent_region_routes(&procedure.context).await;
+        assert_eq!(region_routes.len(), 2);
+        assert_eq!(
+            region_route_by_id(&region_routes, target_regions[0].region_id)
+                .region
+                .partition_expr(),
+            ""
+        );
+        assert_eq!(
+            region_route_by_id(&region_routes, target_regions[1].region_id)
+                .region
+                .partition_expr(),
+            to_exprs[1].as_json_str().unwrap()
+        );
+        // Step 5: dispatch yields one subprocedure and the parent moves to `Collect`.
+        let dispatch_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let subprocedure_ids = extract_subprocedure_ids(dispatch_status);
+        assert_eq!(subprocedure_ids.len(), 1);
+        assert_parent_state::<Collect>(&procedure);
+        // Step 6: a failed subprocedure makes `Collect` fail with a non-retryable error.
+        let failed_state = ProcedureState::failed(Arc::new(ProcedureError::external(
+            MockError::new(StatusCode::Internal),
+        )));
+        let collect_ctx = procedure_context_with_receivers(HashMap::from([(
+            subprocedure_ids[0],
+            procedure_state_receiver(failed_state),
+        )]));
+        let err = procedure.execute(&collect_ctx).await.unwrap_err();
+        assert!(!err.is_retry_later());
+        assert_parent_state::<Collect>(&procedure);
+        // Roll back while the procedure is still in the `Collect` state.
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        // Rollback leaves no partition key indices and only the original route.
+        assert!(
+            table_partition_key_indices(&procedure.context)
+                .await
+                .is_empty()
+        );
+        assert_eq!(
+            current_parent_region_routes(&procedure.context).await,
+            vec![test_region_route(RegionId::new(table_id, 1), "")]
+        );
+    }
+
+    /// Runs the unpartitioned flow up to a failed `Collect` and rolls back
+    /// twice, checking that the second rollback leaves the table partition key
+    /// indices and region routes exactly as the first one did.
+    #[tokio::test]
+    async fn test_repartition_procedure_flow_unpartitioned_rollback_is_idempotent() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+
+        // Repartition the unpartitioned table on `col1` into two ranges.
+        let context = new_parent_context(&env, node_manager, table_id);
+        let mut procedure = RepartitionProcedure::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![range_expr("col1", 0, 50), range_expr("col1", 50, 100)],
+            context,
+        );
+
+        // Run the first four steps; afterwards the key index and both routes exist.
+        for _ in 0..4 {
+            procedure
+                .execute(&TestingEnv::procedure_context())
+                .await
+                .unwrap();
+        }
+        assert_eq!(
+            table_partition_key_indices(&procedure.context).await,
+            vec![0]
+        );
+        assert_eq!(
+            current_parent_region_routes(&procedure.context).await.len(),
+            2
+        );
+
+        // The next step dispatches a single subprocedure and enters `Collect`.
+        let dispatch_status = procedure
+            .execute(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let subprocedure_ids = extract_subprocedure_ids(dispatch_status);
+        assert_eq!(subprocedure_ids.len(), 1);
+        assert_parent_state::<Collect>(&procedure);
+
+        // Report the subprocedure as failed so that `Collect` errors out.
+        let failed_state = ProcedureState::failed(Arc::new(ProcedureError::external(
+            MockError::new(StatusCode::Internal),
+        )));
+        let collect_ctx = procedure_context_with_receivers(HashMap::from([(
+            subprocedure_ids[0],
+            procedure_state_receiver(failed_state),
+        )]));
+        let collect_err = procedure.execute(&collect_ctx).await.unwrap_err();
+        assert!(!collect_err.is_retry_later());
+
+        // First rollback: snapshot the resulting metadata.
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let first_indices = table_partition_key_indices(&procedure.context).await;
+        let first_routes = current_parent_region_routes(&procedure.context).await;
+
+        // Second rollback: snapshot again for comparison.
+        procedure
+            .rollback(&TestingEnv::procedure_context())
+            .await
+            .unwrap();
+        let second_indices = table_partition_key_indices(&procedure.context).await;
+        let second_routes = current_parent_region_routes(&procedure.context).await;
+
+        // Both snapshots must match and equal the original unpartitioned metadata.
+        assert_eq!(first_indices, second_indices);
+        assert_eq!(first_routes, second_routes);
+        assert!(second_indices.is_empty());
+        assert_eq!(
+            second_routes,
+            vec![test_region_route(RegionId::new(table_id, 1), "")]
+        );
+    }
+
    #[tokio::test]
    async fn test_repartition_procedure_flow_split_allocate_retryable_then_resume() {
        common_telemetry::init_default_ut_logging();
@@ -1852,7 +2235,9 @@ mod tests {

        let context = new_parent_context(&env, node_manager, table_id);
        let mut procedure = RepartitionProcedure::new(
-            vec![range_expr("x", 0, 100)],
+            RepartitionFrom::Partitioned {
+                exprs: vec![range_expr("x", 0, 100)],
+            },
            vec![range_expr("x", 0, 50), range_expr("x", 50, 100)],
            context,
        );
--- a/src/meta-srv/src/procedure/repartition/allocate_region.rs
+++ b/src/meta-srv/src/procedure/repartition/allocate_region.rs
@@ -35,7 +35,7 @@ use tokio::time::Instant;
 use crate::error::{self, Result};
 use crate::procedure::repartition::dispatch::Dispatch;
 use crate::procedure::repartition::plan::{
-    AllocationPlanEntry, RegionDescriptor, RepartitionPlanEntry,
+    AllocationPlanEntry, RepartitionPlanEntry, TargetRegionDescriptor,
    convert_allocation_plan_to_repartition_plan,
 };
 use crate::procedure::repartition::{Context, State};
@@ -324,7 +324,7 @@ impl AllocateRegion {
    /// Collects all regions that need to be allocated from the repartition plan entries.
    fn collect_allocate_regions(
        repartition_plan_entries: &[RepartitionPlanEntry],
-    ) -> Vec<&RegionDescriptor> {
+    ) -> Vec<&TargetRegionDescriptor> {
        repartition_plan_entries
            .iter()
            .flat_map(|p| p.allocate_regions())
@@ -333,7 +333,7 @@ impl AllocateRegion {

    /// Prepares region allocation data: region numbers and their partition expressions.
    fn prepare_region_allocation_data(
-        allocate_regions: &[&RegionDescriptor],
+        allocate_regions: &[&TargetRegionDescriptor],
    ) -> Result<Vec<(RegionNumber, String)>> {
        allocate_regions
            .iter()
@@ -417,6 +417,7 @@ mod tests {

    use super::*;
    use crate::procedure::repartition::State;
+    use crate::procedure::repartition::plan::SourceRegionDescriptor;
    use crate::procedure::repartition::test_util::{
        TestingEnv, current_parent_region_routes, new_parent_context, range_expr,
        test_region_wal_options,
@@ -428,8 +429,21 @@ mod tests {
        col: &str,
        start: i64,
        end: i64,
-    ) -> RegionDescriptor {
-        RegionDescriptor {
+    ) -> SourceRegionDescriptor {
+        SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, region_number),
+            range_expr(col, start, end),
+        )
+    }
+
+    fn create_target_region_descriptor(
+        table_id: TableId,
+        region_number: u32,
+        col: &str,
+        start: i64,
+        end: i64,
+    ) -> TargetRegionDescriptor {
+        TargetRegionDescriptor {
            region_id: RegionId::new(table_id, region_number),
            partition_expr: range_expr(col, start, end),
        }
@@ -700,10 +714,10 @@ mod tests {
    fn test_prepare_region_allocation_data() {
        let table_id = 1024;
        let regions = [
-            create_region_descriptor(table_id, 10, "x", 0, 50),
-            create_region_descriptor(table_id, 11, "x", 50, 100),
+            create_target_region_descriptor(table_id, 10, "x", 0, 50),
+            create_target_region_descriptor(table_id, 11, "x", 50, 100),
        ];
-        let region_refs: Vec<&RegionDescriptor> = regions.iter().collect();
+        let region_refs: Vec<&TargetRegionDescriptor> = regions.iter().collect();

        let result = AllocateRegion::prepare_region_allocation_data(&region_refs).unwrap();

@@ -732,7 +746,7 @@ mod tests {
        ctx.persistent_ctx.plans = vec![RepartitionPlanEntry {
            group_id: Uuid::new_v4(),
            source_regions: vec![],
-            target_regions: vec![create_region_descriptor(table_id, 3, "x", 0, 100)],
+            target_regions: vec![create_target_region_descriptor(table_id, 3, "x", 0, 100)],
            allocated_region_ids: vec![RegionId::new(table_id, 3)],
            pending_deallocate_region_ids: vec![],
            transition_map: vec![],
--- a/src/meta-srv/src/procedure/repartition/dispatch.rs
+++ b/src/meta-srv/src/procedure/repartition/dispatch.rs
@@ -25,22 +25,22 @@ use store_api::storage::RegionId;
 use crate::error::Result;
 use crate::procedure::repartition::collect::{Collect, ProcedureMeta};
 use crate::procedure::repartition::group::RepartitionGroupProcedure;
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::procedure::repartition::{self, Context, State};

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Dispatch;

 pub(crate) fn build_region_mapping(
-    source_regions: &[RegionDescriptor],
-    target_regions: &[RegionDescriptor],
+    source_regions: &[SourceRegionDescriptor],
+    target_regions: &[TargetRegionDescriptor],
    transition_map: &[Vec<usize>],
 ) -> HashMap<RegionId, Vec<RegionId>> {
    transition_map
        .iter()
        .enumerate()
        .map(|(source_idx, indices)| {
-            let source_region = source_regions[source_idx].region_id;
+            let source_region = source_regions[source_idx].region_id();
            let target_regions = indices
                .iter()
                .map(|&target_idx| target_regions[target_idx].region_id)
--- a/src/meta-srv/src/procedure/repartition/group.rs
+++ b/src/meta-srv/src/procedure/repartition/group.rs
@@ -49,7 +49,7 @@ use uuid::Uuid;

 use crate::error::{self, Result};
 use crate::procedure::repartition::group::repartition_start::RepartitionStart;
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::procedure::repartition::utils::get_datanode_table_value;
 use crate::procedure::repartition::{self};
 use crate::service::mailbox::MailboxRef;
@@ -330,9 +330,9 @@ pub struct PersistentContext {
    /// The schema name of the repartition group.
    pub schema_name: String,
    /// The source regions of the repartition group.
-    pub sources: Vec<RegionDescriptor>,
+    pub sources: Vec<SourceRegionDescriptor>,
    /// The target regions of the repartition group.
-    pub targets: Vec<RegionDescriptor>,
+    pub targets: Vec<TargetRegionDescriptor>,
    /// For each `source region`, the corresponding
    /// `target regions` that overlap with it.
    pub region_mapping: HashMap<RegionId, Vec<RegionId>>,
@@ -360,8 +360,8 @@ impl PersistentContext {
        table_id: TableId,
        catalog_name: String,
        schema_name: String,
-        sources: Vec<RegionDescriptor>,
-        targets: Vec<RegionDescriptor>,
+        sources: Vec<SourceRegionDescriptor>,
+        targets: Vec<TargetRegionDescriptor>,
        region_mapping: HashMap<RegionId, Vec<RegionId>>,
        sync_region: bool,
        allocated_region_ids: Vec<RegionId>,
@@ -392,7 +392,7 @@ impl PersistentContext {
            SchemaLock::read(&self.catalog_name, &self.schema_name).into(),
        ]);
        for source in &self.sources {
-            lock_keys.push(RegionLock::Write(source.region_id).into());
+            lock_keys.push(RegionLock::Write(source.region_id()).into());
        }
        lock_keys
    }
--- a/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs
+++ b/src/meta-srv/src/procedure/repartition/group/apply_staging_manifest.rs
@@ -37,7 +37,7 @@ use crate::procedure::repartition::group::utils::{
    HandleMultipleResult, group_region_routes_by_peer, handle_multiple_results,
 };
 use crate::procedure::repartition::group::{Context, State};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::TargetRegionDescriptor;
 use crate::service::mailbox::{Channel, MailboxRef};

 #[derive(Debug, Serialize, Deserialize)]
@@ -75,7 +75,7 @@ impl ApplyStagingManifest {
    fn build_apply_staging_manifest_instructions(
        staging_manifest_paths: &HashMap<RegionId, String>,
        target_routes: &[RegionRoute],
-        targets: &[RegionDescriptor],
+        targets: &[TargetRegionDescriptor],
        central_region_id: RegionId,
    ) -> Result<ApplyStagingManifestInstructions> {
        let target_partition_expr_by_region = targets
--- a/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs
+++ b/src/meta-srv/src/procedure/repartition/group/enter_staging_region.rs
@@ -38,7 +38,7 @@ use crate::procedure::repartition::group::utils::{
    HandleMultipleResult, group_region_routes_by_peer, handle_multiple_results,
 };
 use crate::procedure::repartition::group::{Context, GroupId, GroupPrepareResult, State};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::TargetRegionDescriptor;
 use crate::procedure::utils::{self, ErrorStrategy};
 use crate::service::mailbox::{Channel, MailboxRef};

@@ -77,7 +77,7 @@ impl EnterStagingRegion {
    fn build_enter_staging_instructions(
        group_id: GroupId,
        prepare_result: &GroupPrepareResult,
-        targets: &[RegionDescriptor],
+        targets: &[TargetRegionDescriptor],
        pending_deallocate_region_ids: &[RegionId],
    ) -> Result<HashMap<Peer, Vec<common_meta::instruction::EnterStagingRegion>>> {
        let target_partition_expr_by_region = targets
@@ -454,7 +454,7 @@ mod tests {
    use crate::error::{self, Error};
    use crate::procedure::repartition::group::GroupPrepareResult;
    use crate::procedure::repartition::group::enter_staging_region::EnterStagingRegion;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::TargetRegionDescriptor;
    use crate::procedure::repartition::test_util::{
        TestingEnv, new_persistent_context, range_expr,
    };
@@ -720,13 +720,13 @@ mod tests {
        }
    }

-    fn test_targets() -> Vec<RegionDescriptor> {
+    fn test_targets() -> Vec<TargetRegionDescriptor> {
        vec![
-            RegionDescriptor {
+            TargetRegionDescriptor {
                region_id: RegionId::new(1024, 1),
                partition_expr: range_expr("x", 0, 10),
            },
-            RegionDescriptor {
+            TargetRegionDescriptor {
                region_id: RegionId::new(1024, 2),
                partition_expr: range_expr("x", 10, 20),
            },
--- a/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs
+++ b/src/meta-srv/src/procedure/repartition/group/remap_manifest.rs
@@ -30,7 +30,7 @@ use crate::error::{self, Result};
 use crate::handler::HeartbeatMailbox;
 use crate::procedure::repartition::group::apply_staging_manifest::ApplyStagingManifest;
 use crate::procedure::repartition::group::{Context, State};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::service::mailbox::{Channel, MailboxRef};

 #[derive(Debug, Serialize, Deserialize)]
@@ -98,8 +98,8 @@ impl State for RemapManifest {

 impl RemapManifest {
    fn build_remap_manifest_instructions(
-        source_regions: &[RegionDescriptor],
-        target_regions: &[RegionDescriptor],
+        source_regions: &[SourceRegionDescriptor],
+        target_regions: &[TargetRegionDescriptor],
        region_mapping: &HashMap<RegionId, Vec<RegionId>>,
        central_region_id: RegionId,
    ) -> Result<common_meta::instruction::RemapManifest> {
@@ -117,7 +117,7 @@ impl RemapManifest {

        Ok(common_meta::instruction::RemapManifest {
            region_id: central_region_id,
-            input_regions: source_regions.iter().map(|r| r.region_id).collect(),
+            input_regions: source_regions.iter().map(|r| r.region_id()).collect(),
            region_mapping: region_mapping.clone(),
            new_partition_exprs,
        })
--- a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs
+++ b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs
@@ -19,7 +19,7 @@ use common_meta::rpc::router::RegionRoute;
 use common_procedure::{Context as ProcedureContext, Status};
 use common_telemetry::debug;
 use serde::{Deserialize, Serialize};
-use snafu::{OptionExt, ResultExt, ensure};
+use snafu::{OptionExt, ensure};

 use crate::error::{self, Result};
 use crate::procedure::repartition::group::sync_region::SyncRegion;
@@ -27,21 +27,18 @@ use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
 use crate::procedure::repartition::group::{
    Context, GroupId, GroupPrepareResult, State, region_routes,
 };
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};

 #[derive(Debug, Serialize, Deserialize)]
 pub struct RepartitionStart;

-/// Ensures that the partition expression of the region route matches the partition expression of the region descriptor.
-fn ensure_region_route_expr_match(
+/// Ensures that the partition expression of the source region route matches the source descriptor.
+fn ensure_source_region_route_expr_match(
    region_route: &RegionRoute,
-    region_descriptor: &RegionDescriptor,
+    source: &SourceRegionDescriptor,
 ) -> Result<RegionRoute> {
    let actual = region_route.region.partition_expr();
-    let expected = region_descriptor
-        .partition_expr
-        .as_json_str()
-        .context(error::SerializePartitionExprSnafu)?;
+    let expected = source.route_expr_for_rollback()?;
    ensure!(
        actual == expected,
        error::PartitionExprMismatchSnafu {
@@ -60,8 +57,8 @@ impl RepartitionStart {
    fn ensure_route_present(
        group_id: GroupId,
        region_routes: &[RegionRoute],
-        sources: &[RegionDescriptor],
-        targets: &[RegionDescriptor],
+        sources: &[SourceRegionDescriptor],
+        targets: &[TargetRegionDescriptor],
    ) -> Result<GroupPrepareResult> {
        ensure!(
            !sources.is_empty(),
@@ -78,12 +75,12 @@ impl RepartitionStart {
            .iter()
            .map(|s| {
                region_routes_map
-                    .get(&s.region_id)
+                    .get(&s.region_id())
                    .context(error::RepartitionSourceRegionMissingSnafu {
                        group_id,
-                        region_id: s.region_id,
+                        region_id: s.region_id(),
                    })
-                    .and_then(|r| ensure_region_route_expr_match(r, s))
+                    .and_then(|r| ensure_source_region_route_expr_match(r, s))
            })
            .collect::<Result<Vec<_>>>()?;
        let target_region_routes = targets
@@ -109,7 +106,7 @@ impl RepartitionStart {
                }
            );
        }
-        let central_region = sources[0].region_id;
+        let central_region = sources[0].region_id();
        let central_region_datanode = source_region_routes[0]
            .leader_peer
            .as_ref()
@@ -216,16 +213,14 @@ mod tests {

    use crate::error::Error;
    use crate::procedure::repartition::group::repartition_start::RepartitionStart;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
    use crate::procedure::repartition::test_util::range_expr;

    #[test]
    fn test_ensure_route_present_missing_source_region() {
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(1024, 2),
            partition_expr: range_expr("x", 0, 10),
        };
@@ -249,11 +244,9 @@ mod tests {

    #[test]
    fn test_ensure_route_present_partition_expr_mismatch() {
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(1024, 2),
            partition_expr: range_expr("x", 0, 10),
        };
@@ -277,12 +270,69 @@ mod tests {
    }

    #[test]
-    fn test_ensure_route_present_missing_target_region() {
-        let source_region = RegionDescriptor {
+    fn test_ensure_route_present_default_source_matches_empty_partition_expr() {
+        let source_region = SourceRegionDescriptor::Default {
            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
        };
-        let target_region = RegionDescriptor {
+        let target_region = TargetRegionDescriptor {
+            region_id: RegionId::new(1024, 1),
+            partition_expr: range_expr("x", 0, 10),
+        };
+        let region_routes = vec![RegionRoute {
+            region: Region {
+                id: RegionId::new(1024, 1),
+                partition_expr: String::new(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }];
+
+        let result = RepartitionStart::ensure_route_present(
+            Uuid::new_v4(),
+            &region_routes,
+            &[source_region],
+            &[target_region],
+        );
+
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_ensure_route_present_default_source_rejects_non_empty_partition_expr() {
+        // A default source expects its route to carry an empty partition expression, so
+        // a route that still holds a range expression must be reported as a mismatch
+        // rather than accepted.
+        let region_id = RegionId::new(1024, 1);
+        let stale_route_expr = range_expr("x", 0, 100).as_json_str().unwrap();
+
+        let sources = [SourceRegionDescriptor::Default { region_id }];
+        let targets = [TargetRegionDescriptor {
+            region_id,
+            partition_expr: range_expr("x", 0, 10),
+        }];
+        let routes = [RegionRoute {
+            region: Region {
+                id: region_id,
+                partition_expr: stale_route_expr,
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }];
+
+        let result =
+            RepartitionStart::ensure_route_present(Uuid::new_v4(), &routes, &sources, &targets);
+
+        let err = result.unwrap_err();
+        assert_matches!(err, Error::PartitionExprMismatch { .. });
+    }
+
+    #[test]
+    fn test_ensure_route_present_missing_target_region() {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(1024, 2),
            partition_expr: range_expr("x", 0, 10),
        };
@@ -307,11 +357,9 @@ mod tests {

    #[test]
    fn test_ensure_route_present_legacy_partition_expr_source() {
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(1024, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region =
+            SourceRegionDescriptor::partitioned(RegionId::new(1024, 1), range_expr("x", 0, 100));
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(1024, 2),
            partition_expr: range_expr("x", 0, 10),
        };
--- a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs
+++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs
@@ -22,7 +22,7 @@ use snafu::{OptionExt, ResultExt};
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
 use crate::procedure::repartition::group::{Context, GroupId, region_routes};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};

 impl UpdateMetadata {
    /// Applies the new partition expressions for staging regions.
@@ -32,8 +32,8 @@ impl UpdateMetadata {
    /// - Source region not found.
    pub(crate) fn apply_staging_region_routes(
        group_id: GroupId,
-        sources: &[RegionDescriptor],
-        targets: &[RegionDescriptor],
+        sources: &[SourceRegionDescriptor],
+        targets: &[TargetRegionDescriptor],
        pending_deallocate_region_ids: &[store_api::storage::RegionId],
        current_region_routes: &[RegionRoute],
    ) -> Result<Vec<RegionRoute>> {
@@ -61,15 +61,16 @@ impl UpdateMetadata {
        }

        for source in sources {
-            let region_route = region_routes_map.get_mut(&source.region_id).context(
+            let region_id = source.region_id();
+            let region_route = region_routes_map.get_mut(&region_id).context(
                error::RepartitionSourceRegionMissingSnafu {
                    group_id,
-                    region_id: source.region_id,
+                    region_id,
                },
            )?;
            // Set leader staging state for the source region route.
            region_route.set_leader_staging();
-            if pending_deallocate_region_ids.contains(&source.region_id) {
+            if pending_deallocate_region_ids.contains(&region_id) {
                // When a region is pending deallocation, it should ignore all writes.
                region_route.set_ignore_all_writes();
            }
@@ -130,7 +131,7 @@ mod tests {
    use uuid::Uuid;

    use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
    use crate::procedure::repartition::test_util::range_expr;

    #[test]
@@ -166,11 +167,11 @@ mod tests {
                ..Default::default()
            },
        ];
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(table_id, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region = SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, 1),
+            range_expr("x", 0, 100),
+        );
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(table_id, 2),
            partition_expr: range_expr("x", 0, 10),
        };
@@ -196,6 +197,68 @@ mod tests {
        assert!(!new_region_routes[2].is_leader_staging());
    }

+    #[test]
+    fn test_generate_region_routes_with_reused_default_source_region() {
+        // The default region (1) is both the source and one of the targets; region 2 is
+        // the other target. Both routes start with an empty partition expression.
+        let table_id = 1024;
+        let default_region_id = RegionId::new(table_id, 1);
+        let second_region_id = RegionId::new(table_id, 2);
+
+        let unpartitioned_route = |id: RegionId| RegionRoute {
+            region: Region {
+                id,
+                partition_expr: String::new(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        };
+        let current_routes = [
+            unpartitioned_route(default_region_id),
+            unpartitioned_route(second_region_id),
+        ];
+
+        let sources = [SourceRegionDescriptor::Default {
+            region_id: default_region_id,
+        }];
+        let reused_expr = range_expr("x", 0, 10);
+        let second_expr = range_expr("x", 10, 20);
+        let targets = [
+            TargetRegionDescriptor {
+                region_id: default_region_id,
+                partition_expr: reused_expr.clone(),
+            },
+            TargetRegionDescriptor {
+                region_id: second_region_id,
+                partition_expr: second_expr.clone(),
+            },
+        ];
+
+        let staged = UpdateMetadata::apply_staging_region_routes(
+            Uuid::new_v4(),
+            &sources,
+            &targets,
+            &[],
+            &current_routes,
+        )
+        .unwrap();
+
+        // The reused default region takes the new expression and is not set to ignore writes.
+        assert_eq!(
+            staged[0].region.partition_expr,
+            reused_expr.as_json_str().unwrap()
+        );
+        assert!(staged[0].is_leader_staging());
+        assert!(!staged[0].is_ignore_all_writes());
+        // The other target region gets its own expression and is staged as well.
+        assert_eq!(
+            staged[1].region.partition_expr,
+            second_expr.as_json_str().unwrap()
+        );
+        assert!(staged[1].is_leader_staging());
+    }
+
    #[test]
    fn test_generate_region_routes_mark_pending_deallocate_reject_all_writes() {
        let group_id = Uuid::new_v4();
@@ -221,11 +284,11 @@ mod tests {
                ..Default::default()
            },
        ];
-        let source_region = RegionDescriptor {
-            region_id: pending_deallocate_region_id,
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region = SourceRegionDescriptor::partitioned(
+            pending_deallocate_region_id,
+            range_expr("x", 0, 100),
+        );
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(table_id, 2),
            partition_expr: range_expr("x", 0, 10),
        };
--- a/src/meta-srv/src/procedure/repartition/group/update_metadata/exit_staging_region.rs
+++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/exit_staging_region.rs
@@ -22,13 +22,13 @@ use snafu::{OptionExt, ResultExt};
 use crate::error::{self, Result};
 use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
 use crate::procedure::repartition::group::{Context, GroupId, region_routes};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};

 impl UpdateMetadata {
    pub(crate) fn exit_staging_region_routes(
        group_id: GroupId,
-        sources: &[RegionDescriptor],
-        targets: &[RegionDescriptor],
+        sources: &[SourceRegionDescriptor],
+        targets: &[TargetRegionDescriptor],
        current_region_routes: &[RegionRoute],
    ) -> Result<Vec<RegionRoute>> {
        let mut region_routes = current_region_routes.to_vec();
@@ -48,10 +48,11 @@ impl UpdateMetadata {
        }

        for source in sources {
-            let region_route = region_routes_map.get_mut(&source.region_id).context(
+            let region_id = source.region_id();
+            let region_route = region_routes_map.get_mut(&region_id).context(
                error::RepartitionSourceRegionMissingSnafu {
                    group_id,
-                    region_id: source.region_id,
+                    region_id,
                },
            )?;
            region_route.clear_leader_staging();
@@ -113,24 +114,25 @@ mod tests {
    use uuid::Uuid;

    use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
    use crate::procedure::repartition::test_util::range_expr;

    #[test]
    fn test_exit_staging_region_routes_keep_reject_all_writes() {
        let group_id = Uuid::new_v4();
        let table_id = 1024;
-        let source_region = RegionDescriptor {
-            region_id: RegionId::new(table_id, 1),
-            partition_expr: range_expr("x", 0, 100),
-        };
-        let target_region = RegionDescriptor {
+        let source_region = SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, 1),
+            range_expr("x", 0, 100),
+        );
+        let source_region_id = source_region.region_id();
+        let target_region = TargetRegionDescriptor {
            region_id: RegionId::new(table_id, 2),
            partition_expr: range_expr("x", 0, 50),
        };
        let mut source_route = RegionRoute {
            region: Region {
-                id: source_region.region_id,
+                id: source_region_id,
                partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
                ..Default::default()
            },
@@ -165,4 +167,40 @@ mod tests {
        assert!(!new_region_routes[1].is_leader_staging());
        assert!(new_region_routes[1].is_ignore_all_writes());
    }
+
+    #[test]
+    fn test_exit_staging_region_routes_with_reused_default_source_region() {
+        // The default region is reused as a target: its route is already staged with the
+        // target expression; exiting must clear staging and leave the expression as is.
+        let default_region_id = RegionId::new(1024, 1);
+        let sources = [SourceRegionDescriptor::Default {
+            region_id: default_region_id,
+        }];
+        let targets = [TargetRegionDescriptor {
+            region_id: default_region_id,
+            partition_expr: range_expr("x", 0, 50),
+        }];
+        let staged_expr_json = targets[0].partition_expr.as_json_str().unwrap();
+        let staged_routes = [RegionRoute {
+            region: Region {
+                id: default_region_id,
+                partition_expr: staged_expr_json.clone(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            leader_state: Some(LeaderState::Staging),
+            ..Default::default()
+        }];
+
+        let exited = UpdateMetadata::exit_staging_region_routes(
+            Uuid::new_v4(),
+            &sources,
+            &targets,
+            &staged_routes,
+        )
+        .unwrap();
+
+        assert!(!exited[0].is_leader_staging());
+        assert_eq!(exited[0].region.partition_expr, staged_expr_json);
+    }
 }
--- a/src/meta-srv/src/procedure/repartition/plan.rs
+++ b/src/meta-srv/src/procedure/repartition/plan.rs
@@ -16,17 +16,137 @@ use std::cmp::Ordering;

 use common_meta::rpc::router::RegionRoute;
 use partition::expr::PartitionExpr;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize};
+use snafu::ResultExt;
 use store_api::storage::{RegionId, RegionNumber, TableId};

+use crate::error::{self, Result};
 use crate::procedure::repartition::group::GroupId;

-/// Metadata describing a region involved in the plan.
+/// Metadata describing a source region involved in the plan.
+///
+/// Either an existing partitioned region or the default region of an unpartitioned
+/// table. `Deserialize` is hand-written so the legacy struct shape still decodes.
+#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
+pub enum SourceRegionDescriptor {
+    /// A regular partitioned source region.
+    Partitioned {
+        /// The region id of the source region.
+        region_id: RegionId,
+        /// The partition expression of the source region.
+        partition_expr: PartitionExpr,
+    },
+    /// The default source region of an unpartitioned table (no partition expression).
+    Default {
+        /// The region id of the default source region.
+        region_id: RegionId,
+    },
+}
+
+impl<'de> Deserialize<'de> for SourceRegionDescriptor {
+    /// Accepts the tagged enum form as well as the legacy untagged
+    /// `{region_id, partition_expr}` struct, which is read as `Partitioned`.
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum SourceRegionDescriptorRepr {
+            Tagged(SourceRegionDescriptorTaggedRepr),
+            Legacy(PartitionedSourceRegionDescriptor),
+        }
+
+        #[derive(Deserialize)]
+        enum SourceRegionDescriptorTaggedRepr {
+            Partitioned {
+                region_id: RegionId,
+                partition_expr: PartitionExpr,
+            },
+            Default {
+                region_id: RegionId,
+            },
+        }
+
+        #[derive(Deserialize)]
+        #[serde(deny_unknown_fields)]
+        struct PartitionedSourceRegionDescriptor {
+            region_id: RegionId,
+            partition_expr: PartitionExpr,
+        }
+
+        Ok(match SourceRegionDescriptorRepr::deserialize(deserializer)? {
+            SourceRegionDescriptorRepr::Tagged(tagged) => match tagged {
+                SourceRegionDescriptorTaggedRepr::Partitioned {
+                    region_id,
+                    partition_expr,
+                } => Self::partitioned(region_id, partition_expr),
+                SourceRegionDescriptorTaggedRepr::Default { region_id } => {
+                    Self::Default { region_id }
+                }
+            },
+            SourceRegionDescriptorRepr::Legacy(legacy) => {
+                Self::partitioned(legacy.region_id, legacy.partition_expr)
+            }
+        })
+    }
+}
+
+impl SourceRegionDescriptor {
+    /// Builds the `Partitioned` variant from a region id and its partition expression.
+    pub fn partitioned(region_id: RegionId, partition_expr: PartitionExpr) -> Self {
+        Self::Partitioned {
+            region_id,
+            partition_expr,
+        }
+    }
+
+    /// Returns the region id, which both variants carry.
+    pub fn region_id(&self) -> RegionId {
+        match self {
+            Self::Partitioned { region_id, .. } | Self::Default { region_id } => *region_id,
+        }
+    }
+
+    /// Returns the partition expression, or `None` for the default source region.
+    pub fn partition_expr(&self) -> Option<&PartitionExpr> {
+        if let Self::Partitioned { partition_expr, .. } = self {
+            Some(partition_expr)
+        } else {
+            None
+        }
+    }
+
+    /// Checks whether `route_expr` is the expression this source is expected to have in
+    /// its route.
+    ///
+    /// The expected value is the one produced by [`Self::route_expr_for_rollback`]: the
+    /// serialized partition expression, or an empty string for the default source region.
+    pub fn matches_route_expr(&self, route_expr: &str) -> Result<bool> {
+        let expected = self.route_expr_for_rollback()?;
+        Ok(expected == route_expr)
+    }
+
+    /// Returns the route partition expression to restore during rollback.
+    ///
+    /// This is `as_json_str()` of the partition expression for a partitioned source and
+    /// an empty string for the default source region.
+    pub fn route_expr_for_rollback(&self) -> Result<String> {
+        match self.partition_expr() {
+            None => Ok(String::new()),
+            Some(partition_expr) => partition_expr
+                .as_json_str()
+                .context(error::SerializePartitionExprSnafu),
+        }
+    }
+}
+
+/// Metadata describing a target region involved in the plan.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub struct RegionDescriptor {
-    /// The region id of the region involved in the plan.
+pub struct TargetRegionDescriptor {
+    /// The region id of the target region.
    pub region_id: RegionId,
-    /// The partition expression of the region.
+    /// The partition expression of the target region.
    pub partition_expr: PartitionExpr,
 }

@@ -37,7 +157,7 @@ pub struct AllocationPlanEntry {
    /// The group id for this plan entry.
    pub group_id: GroupId,
    /// Source region descriptors involved in the plan.
-    pub source_regions: Vec<RegionDescriptor>,
+    pub source_regions: Vec<SourceRegionDescriptor>,
    /// The target partition expressions for the new or changed regions.
    pub target_partition_exprs: Vec<PartitionExpr>,
    /// For each `source_regions[k]`, the corresponding vector contains global
@@ -52,9 +172,9 @@ pub struct RepartitionPlanEntry {
    /// The group id for this plan entry.
    pub group_id: GroupId,
    /// The source region descriptors involved in the plan.
-    pub source_regions: Vec<RegionDescriptor>,
+    pub source_regions: Vec<SourceRegionDescriptor>,
    /// The target region descriptors involved in the plan.
-    pub target_regions: Vec<RegionDescriptor>,
+    pub target_regions: Vec<TargetRegionDescriptor>,
    /// The region ids of the allocated regions.
    pub allocated_region_ids: Vec<RegionId>,
    /// The region ids of the regions that are pending deallocation.
@@ -69,7 +189,7 @@ pub struct RepartitionPlanEntry {

 impl RepartitionPlanEntry {
    /// Returns the target regions that are newly allocated.
-    pub(crate) fn allocate_regions(&self) -> Vec<&RegionDescriptor> {
+    pub(crate) fn allocate_regions(&self) -> Vec<&TargetRegionDescriptor> {
        self.target_regions
            .iter()
            .filter(|r| self.allocated_region_ids.contains(&r.region_id))
@@ -111,7 +231,7 @@ pub fn convert_allocation_plan_to_repartition_plan(
                .iter()
                .skip(source_regions.len())
                .map(|target_partition_expr| {
-                    let desc = RegionDescriptor {
+                    let desc = TargetRegionDescriptor {
                        region_id: RegionId::new(table_id, *next_region_number),
                        partition_expr: target_partition_expr.clone(),
                    };
@@ -128,10 +248,12 @@ pub fn convert_allocation_plan_to_repartition_plan(
            let target_regions = source_regions
                .iter()
                .zip(target_partition_exprs.iter())
-                .map(|(source_region, target_partition_expr)| RegionDescriptor {
-                    region_id: source_region.region_id,
-                    partition_expr: target_partition_expr.clone(),
-                })
+                .map(
+                    |(source_region, target_partition_expr)| TargetRegionDescriptor {
+                        region_id: source_region.region_id(),
+                        partition_expr: target_partition_expr.clone(),
+                    },
+                )
                .chain(pending_allocate_target_partition_exprs)
                .collect::<Vec<_>>();

@@ -149,10 +271,12 @@ pub fn convert_allocation_plan_to_repartition_plan(
            let target_regions = source_regions
                .iter()
                .zip(target_partition_exprs.iter())
-                .map(|(source_region, target_partition_expr)| RegionDescriptor {
-                    region_id: source_region.region_id,
-                    partition_expr: target_partition_expr.clone(),
-                })
+                .map(
+                    |(source_region, target_partition_expr)| TargetRegionDescriptor {
+                        region_id: source_region.region_id(),
+                        partition_expr: target_partition_expr.clone(),
+                    },
+                )
                .collect::<Vec<_>>();

            RepartitionPlanEntry {
@@ -171,16 +295,18 @@ pub fn convert_allocation_plan_to_repartition_plan(
                .iter()
                .take(target_partition_exprs.len())
                .zip(target_partition_exprs.iter())
-                .map(|(source_region, target_partition_expr)| RegionDescriptor {
-                    region_id: source_region.region_id,
-                    partition_expr: target_partition_expr.clone(),
-                })
+                .map(
+                    |(source_region, target_partition_expr)| TargetRegionDescriptor {
+                        region_id: source_region.region_id(),
+                        partition_expr: target_partition_expr.clone(),
+                    },
+                )
                .collect::<Vec<_>>();

            let pending_deallocate_region_ids = source_regions
                .iter()
                .skip(target_partition_exprs.len())
-                .map(|source_region| source_region.region_id)
+                .map(|source_region| source_region.region_id())
                .collect::<Vec<_>>();

            RepartitionPlanEntry {
@@ -210,11 +336,140 @@ mod tests {
        col: &str,
        start: i64,
        end: i64,
-    ) -> RegionDescriptor {
-        RegionDescriptor {
-            region_id: RegionId::new(table_id, region_number),
-            partition_expr: range_expr(col, start, end),
-        }
+    ) -> SourceRegionDescriptor {
+        SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, region_number),
+            range_expr(col, start, end),
+        )
+    }
+
+    #[test]
+    fn test_source_region_descriptor_deserializes_legacy_partitioned_shape() {
+        let table_id = 1024;
+        let region_id = RegionId::new(table_id, 1);
+        let partition_expr = range_expr("x", 0, 100);
+        let legacy_json = serde_json::json!({
+            "region_id": region_id,
+            "partition_expr": partition_expr,
+        });
+
+        let descriptor: SourceRegionDescriptor = serde_json::from_value(legacy_json).unwrap();
+
+        assert_eq!(
+            descriptor,
+            SourceRegionDescriptor::partitioned(region_id, partition_expr)
+        );
+    }
+
+    #[test]
+    fn test_source_region_descriptor_rejects_legacy_default_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let default_json = serde_json::json!({
+            "region_id": region_id,
+        });
+
+        let err = serde_json::from_value::<SourceRegionDescriptor>(default_json).unwrap_err();
+
+        assert!(err.to_string().contains("data did not match any variant"));
+    }
+
+    #[test]
+    fn test_source_region_descriptor_roundtrip_tagged_partitioned_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let partition_expr = range_expr("x", 0, 100);
+        let descriptor = SourceRegionDescriptor::partitioned(region_id, partition_expr.clone());
+
+        let value = serde_json::to_value(&descriptor).unwrap();
+        let decoded = serde_json::from_value::<SourceRegionDescriptor>(value.clone()).unwrap();
+
+        assert_eq!(
+            value,
+            serde_json::json!({
+                "Partitioned": {
+                    "region_id": region_id,
+                    "partition_expr": partition_expr,
+                }
+            })
+        );
+        assert_eq!(decoded, descriptor);
+    }
+
+    #[test]
+    fn test_source_region_descriptor_roundtrip_tagged_default_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let descriptor = SourceRegionDescriptor::Default { region_id };
+
+        let value = serde_json::to_value(&descriptor).unwrap();
+        let decoded = serde_json::from_value::<SourceRegionDescriptor>(value.clone()).unwrap();
+
+        assert_eq!(
+            value,
+            serde_json::json!({
+                "Default": {
+                    "region_id": region_id,
+                }
+            })
+        );
+        assert_eq!(decoded, descriptor);
+    }
+
+    #[test]
+    fn test_source_region_descriptor_rejects_invalid_partition_expr_shape() {
+        let region_id = RegionId::new(1024, 1);
+        let invalid_json = serde_json::json!({
+            "region_id": region_id,
+            "partition_expr": 42,
+        });
+
+        let err = serde_json::from_value::<SourceRegionDescriptor>(invalid_json).unwrap_err();
+
+        assert!(err.to_string().contains("data did not match any variant"));
+    }
+
+    #[test]
+    fn test_repartition_plan_entry_deserializes_legacy_source_regions() {
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let source_region_id = RegionId::new(table_id, 1);
+        let target_region_id = RegionId::new(table_id, 2);
+        let source_partition_expr = range_expr("x", 0, 100);
+        let target_partition_expr = range_expr("x", 0, 50);
+        let legacy_json = serde_json::json!({
+            "group_id": group_id,
+            "source_regions": [{
+                "region_id": source_region_id,
+                "partition_expr": source_partition_expr,
+            }],
+            "target_regions": [{
+                "region_id": target_region_id,
+                "partition_expr": target_partition_expr,
+            }],
+            "allocated_region_ids": [target_region_id],
+            "pending_deallocate_region_ids": [],
+            "transition_map": [[0]],
+        });
+
+        let plan: RepartitionPlanEntry = serde_json::from_value(legacy_json).unwrap();
+
+        assert_eq!(plan.group_id, group_id);
+        assert_eq!(
+            plan.source_regions,
+            vec![SourceRegionDescriptor::partitioned(
+                source_region_id,
+                source_partition_expr
+            )]
+        );
+        assert_eq!(
+            plan.target_regions,
+            vec![TargetRegionDescriptor {
+                region_id: target_region_id,
+                partition_expr: target_partition_expr,
+            }]
+        );
+        assert_eq!(plan.allocated_region_ids, vec![target_region_id]);
+        assert!(plan.pending_deallocate_region_ids.is_empty());
+        assert_eq!(plan.transition_map, vec![vec![0]]);
+        assert!(plan.original_target_routes.is_empty());
    }

    #[test]
@@ -468,6 +723,55 @@ mod tests {
        assert_eq!(next_region_number, 6);
    }

+    #[test]
+    fn test_convert_plan_allocate_default_source_region() {
+        let group_id = Uuid::new_v4();
+        let table_id = 1024;
+        let mut next_region_number = 5;
+        let source_regions = vec![SourceRegionDescriptor::Default {
+            region_id: RegionId::new(table_id, 1),
+        }];
+        let target_partition_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+        let allocation_plan = AllocationPlanEntry {
+            group_id,
+            source_regions: source_regions.clone(),
+            target_partition_exprs: target_partition_exprs.clone(),
+            transition_map: vec![vec![0, 1]],
+        };
+
+        let result = convert_allocation_plan_to_repartition_plan(
+            table_id,
+            &mut next_region_number,
+            &allocation_plan,
+        );
+
+        assert_eq!(result.source_regions, source_regions);
+        assert_eq!(result.target_regions.len(), 2);
+        assert_eq!(
+            result.target_regions[0].region_id,
+            RegionId::new(table_id, 1)
+        );
+        assert_eq!(
+            result.target_regions[0].partition_expr,
+            target_partition_exprs[0]
+        );
+        assert_eq!(
+            result.target_regions[1].region_id,
+            RegionId::new(table_id, 5)
+        );
+        assert_eq!(
+            result.target_regions[1].partition_expr,
+            target_partition_exprs[1]
+        );
+        assert_eq!(
+            result.allocated_region_ids,
+            vec![RegionId::new(table_id, 5)]
+        );
+        assert!(result.pending_deallocate_region_ids.is_empty());
+        assert_eq!(result.transition_map, vec![vec![0, 1]]);
+        assert_eq!(next_region_number, 6);
+    }
+
    #[test]
    fn test_convert_plan_deallocate_to_single_region() {
        let group_id = Uuid::new_v4();
--- a/src/meta-srv/src/procedure/repartition/repartition_start.rs
+++ b/src/meta-srv/src/procedure/repartition/repartition_start.rs
@@ -17,31 +17,69 @@ use std::any::Any;
 use common_meta::key::table_route::PhysicalTableRouteValue;
 use common_procedure::{Context as ProcedureContext, Status};
 use common_telemetry::debug;
+use partition::collider::Collider;
 use partition::expr::PartitionExpr;
 use partition::subtask::{self, RepartitionSubtask};
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize};
 use snafu::{OptionExt, ResultExt, ensure};
 use tokio::time::Instant;
 use uuid::Uuid;

 use crate::error::{self, Result};
 use crate::procedure::repartition::allocate_region::AllocateRegion;
-use crate::procedure::repartition::plan::{AllocationPlanEntry, RegionDescriptor};
+use crate::procedure::repartition::plan::{AllocationPlanEntry, SourceRegionDescriptor};
 use crate::procedure::repartition::repartition_end::RepartitionEnd;
+use crate::procedure::repartition::update_partition_metadata::{
+    PartitionMetadataUpdate, UpdatePartitionMetadata,
+};
 use crate::procedure::repartition::{Context, State};

+#[derive(Debug, Clone, Serialize)]
+pub enum RepartitionFrom {
+    Partitioned { exprs: Vec<PartitionExpr> },
+    Unpartitioned { partition_columns: Vec<String> },
+}
+
+impl<'de> Deserialize<'de> for RepartitionFrom {
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        #[derive(Deserialize)]
+        enum CurrentRepartitionFrom {
+            Partitioned { exprs: Vec<PartitionExpr> },
+            Unpartitioned { partition_columns: Vec<String> },
+        }
+
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum RepartitionFromRepr {
+            Current(CurrentRepartitionFrom),
+            Legacy(Vec<PartitionExpr>),
+        }
+
+        match RepartitionFromRepr::deserialize(deserializer)? {
+            RepartitionFromRepr::Current(CurrentRepartitionFrom::Partitioned { exprs }) => {
+                Ok(Self::Partitioned { exprs })
+            }
+            RepartitionFromRepr::Current(CurrentRepartitionFrom::Unpartitioned {
+                partition_columns,
+            }) => Ok(Self::Unpartitioned { partition_columns }),
+            RepartitionFromRepr::Legacy(exprs) => Ok(Self::Partitioned { exprs }),
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct RepartitionStart {
-    from_exprs: Vec<PartitionExpr>,
+    #[serde(alias = "from_exprs")]
+    from: RepartitionFrom,
    to_exprs: Vec<PartitionExpr>,
 }

 impl RepartitionStart {
-    pub fn new(from_exprs: Vec<PartitionExpr>, to_exprs: Vec<PartitionExpr>) -> Self {
-        Self {
-            from_exprs,
-            to_exprs,
-        }
+    pub fn new(from: RepartitionFrom, to_exprs: Vec<PartitionExpr>) -> Self {
+        Self { from, to_exprs }
    }
 }

@@ -53,6 +91,13 @@ impl State for RepartitionStart {
        ctx: &mut Context,
        _: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)> {
+        ensure!(
+            !self.to_exprs.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "Repartition expects non-empty target partition expressions".to_string(),
+            }
+        );
+
        let timer = Instant::now();
        let (physical_table_id, table_route) = ctx
            .table_metadata_manager
@@ -71,7 +116,8 @@ impl State for RepartitionStart {
            }
        );

-        let plans = Self::build_plan(&table_route, &self.from_exprs, &self.to_exprs)?;
+        let from_exprs = self.prepare_from(ctx).await?;
+        let plans = Self::build_plan(&table_route, from_exprs, &self.to_exprs)?;
        let plan_count = plans.len();
        let total_source_regions: usize = plans.iter().map(|p| p.source_regions.len()).sum();
        let total_target_regions: usize =
@@ -90,10 +136,17 @@ impl State for RepartitionStart {
            return Ok((Box::new(RepartitionEnd), Status::done()));
        }

-        Ok((
-            Box::new(AllocateRegion::new(plans)),
-            Status::executing(false),
-        ))
+        if ctx.persistent_ctx.partition_metadata_update.is_some() {
+            Ok((
+                Box::new(UpdatePartitionMetadata::new(plans)),
+                Status::executing(true),
+            ))
+        } else {
+            Ok((
+                Box::new(AllocateRegion::new(plans)),
+                Status::executing(false),
+            ))
+        }
    }

    fn as_any(&self) -> &dyn Any {
@@ -102,13 +155,76 @@ impl State for RepartitionStart {
 }

 impl RepartitionStart {
+    async fn prepare_from<'a>(&'a self, ctx: &mut Context) -> Result<&'a [PartitionExpr]> {
+        match &self.from {
+            RepartitionFrom::Partitioned { exprs } => Ok(exprs),
+            RepartitionFrom::Unpartitioned { partition_columns } => {
+                Self::prepare_unpartitioned(ctx, partition_columns).await?;
+                Ok(&[])
+            }
+        }
+    }
+
+    async fn prepare_unpartitioned(ctx: &mut Context, partition_columns: &[String]) -> Result<()> {
+        if ctx.persistent_ctx.partition_metadata_update.is_some() {
+            return Ok(());
+        }
+
+        ensure!(
+            !partition_columns.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "Unpartitioned repartition expects non-empty partition columns"
+                    .to_string(),
+            }
+        );
+
+        let table_info_value = ctx.get_table_info_value().await?;
+        ensure!(
+            table_info_value
+                .table_info
+                .meta
+                .partition_key_indices
+                .is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg: format!(
+                    "Unpartitioned repartition expects an unpartitioned table, but table {} has partition key indices: {:?}",
+                    ctx.persistent_ctx.table_id,
+                    table_info_value.table_info.meta.partition_key_indices
+                ),
+            }
+        );
+
+        let schema = &table_info_value.table_info.meta.schema;
+        let partition_key_indices = partition_columns
+            .iter()
+            .map(|column_name| {
+                schema.column_index_by_name(column_name).with_context(|| {
+                    error::InvalidArgumentsSnafu {
+                        err_msg: format!(
+                            "Partition column {} not found in table {}",
+                            column_name, ctx.persistent_ctx.table_id
+                        ),
+                    }
+                })
+            })
+            .collect::<Result<Vec<_>>>()?;
+        ctx.persistent_ctx.partition_metadata_update =
+            Some(PartitionMetadataUpdate::new(partition_key_indices));
+
+        Ok(())
+    }
+
    pub(crate) fn build_plan(
        physical_route: &PhysicalTableRouteValue,
        from_exprs: &[PartitionExpr],
        to_exprs: &[PartitionExpr],
    ) -> Result<Vec<AllocationPlanEntry>> {
-        let subtasks = subtask::create_subtasks(from_exprs, to_exprs)
-            .context(error::RepartitionCreateSubtasksSnafu)?;
+        let subtasks = if from_exprs.is_empty() {
+            Self::default_source_subtasks(to_exprs)?
+        } else {
+            subtask::create_subtasks(from_exprs, to_exprs)
+                .context(error::RepartitionCreateSubtasksSnafu)?
+        };
        if subtasks.is_empty() {
            return Ok(vec![]);
        }
@@ -123,7 +239,7 @@ impl RepartitionStart {

    fn build_plan_entries(
        subtasks: Vec<RepartitionSubtask>,
-        source_index: &[RegionDescriptor],
+        source_index: &[SourceRegionDescriptor],
        target_exprs: &[PartitionExpr],
    ) -> Vec<AllocationPlanEntry> {
        subtasks
@@ -151,10 +267,32 @@ impl RepartitionStart {
            .collect::<Vec<_>>()
    }

+    fn default_source_subtasks(to_exprs: &[PartitionExpr]) -> Result<Vec<RepartitionSubtask>> {
+        ensure!(
+            !to_exprs.is_empty(),
+            error::UnexpectedSnafu {
+                violated: "Default source repartition expects non-empty target partition exprs",
+            }
+        );
+
+        Collider::new(to_exprs).context(error::RepartitionCreateSubtasksSnafu)?;
+
+        let to_expr_indices = (0..to_exprs.len()).collect::<Vec<_>>();
+        Ok(vec![RepartitionSubtask {
+            from_expr_indices: vec![0],
+            to_expr_indices: to_expr_indices.clone(),
+            transition_map: vec![to_expr_indices],
+        }])
+    }
+
    fn source_region_descriptors(
        from_exprs: &[PartitionExpr],
        physical_route: &PhysicalTableRouteValue,
-    ) -> Result<Vec<RegionDescriptor>> {
+    ) -> Result<Vec<SourceRegionDescriptor>> {
+        if from_exprs.is_empty() {
+            return Self::default_source_region_descriptors(physical_route);
+        }
+
        let existing_regions = physical_route
            .region_routes
            .iter()
@@ -178,13 +316,394 @@ impl RepartitionStart {
                        debug!("Failed to find matching region for partition expression: {}, existing regions: {:?}", expr_json, existing_regions);
                    })?;

-                Ok(RegionDescriptor {
-                    region_id: matched_region_id,
-                    partition_expr: expr.clone(),
-                })
+                Ok(SourceRegionDescriptor::partitioned(
+                    matched_region_id,
+                    expr.clone(),
+                ))
            })
            .collect::<Result<Vec<_>>>()?;

        Ok(descriptors)
    }
+
+    fn default_source_region_descriptors(
+        physical_route: &PhysicalTableRouteValue,
+    ) -> Result<Vec<SourceRegionDescriptor>> {
+        ensure!(
+            physical_route.region_routes.len() == 1,
+            error::UnexpectedSnafu {
+                violated: format!(
+                    "Default source repartition expects exactly one source region, but got {}",
+                    physical_route.region_routes.len()
+                ),
+            }
+        );
+        let source_region = &physical_route.region_routes[0].region;
+        ensure!(
+            source_region.partition_expr().is_empty(),
+            error::UnexpectedSnafu {
+                violated: format!(
+                    "Default source repartition expects an empty partition expr, but got {}",
+                    source_region.partition_expr()
+                ),
+            }
+        );
+
+        Ok(vec![SourceRegionDescriptor::Default {
+            region_id: source_region.id,
+        }])
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use common_meta::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
+    use common_meta::key::table_route::PhysicalTableRouteValue;
+    use common_meta::peer::Peer;
+    use common_meta::rpc::router::{Region, RegionRoute};
+    use common_meta::test_util::MockDatanodeManager;
+    use datatypes::prelude::Value;
+    use partition::expr::{Operand, RestrictedOp};
+    use store_api::storage::RegionId;
+
+    use super::*;
+    use crate::procedure::repartition::test_util::{
+        TestingEnv, new_parent_context, range_expr, test_region_route, test_region_wal_options,
+    };
+
+    fn physical_route(region_routes: Vec<RegionRoute>) -> PhysicalTableRouteValue {
+        PhysicalTableRouteValue::new(region_routes)
+    }
+
+    async fn new_test_context(env: &TestingEnv, table_id: u32) -> Context {
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        new_parent_context(env, node_manager, table_id)
+    }
+
+    #[test]
+    fn test_build_plan_with_default_source_region() {
+        let table_id = 1024;
+        let physical_route =
+            physical_route(vec![test_region_route(RegionId::new(table_id, 1), "")]);
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+
+        let plans = RepartitionStart::build_plan(&physical_route, &[], &to_exprs).unwrap();
+
+        assert_eq!(plans.len(), 1);
+        let plan = &plans[0];
+        assert_eq!(
+            plan.source_regions,
+            vec![SourceRegionDescriptor::Default {
+                region_id: RegionId::new(table_id, 1)
+            }]
+        );
+        assert_eq!(plan.target_partition_exprs, to_exprs);
+        assert_eq!(plan.transition_map, vec![vec![0, 1]]);
+    }
+
+    #[test]
+    fn test_build_plan_with_default_source_rejects_non_empty_partition_expr() {
+        let table_id = 1024;
+        let physical_route = physical_route(vec![test_region_route(
+            RegionId::new(table_id, 1),
+            &range_expr("x", 0, 100).as_json_str().unwrap(),
+        )]);
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+
+        let err = RepartitionStart::build_plan(&physical_route, &[], &to_exprs).unwrap_err();
+
+        assert!(err.to_string().contains("empty partition expr"));
+    }
+
+    #[test]
+    fn test_build_plan_with_default_source_rejects_multiple_regions() {
+        let table_id = 1024;
+        let physical_route = physical_route(vec![
+            test_region_route(RegionId::new(table_id, 1), ""),
+            test_region_route(RegionId::new(table_id, 2), ""),
+        ]);
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+
+        let err = RepartitionStart::build_plan(&physical_route, &[], &to_exprs).unwrap_err();
+
+        assert!(err.to_string().contains("exactly one source region"));
+    }
+
+    #[test]
+    fn test_build_plan_with_default_source_rejects_empty_targets() {
+        let table_id = 1024;
+        let physical_route =
+            physical_route(vec![test_region_route(RegionId::new(table_id, 1), "")]);
+
+        let err = RepartitionStart::build_plan(&physical_route, &[], &[]).unwrap_err();
+
+        assert!(err.to_string().contains("non-empty target partition exprs"));
+    }
+
+    #[test]
+    fn test_build_plan_with_default_source_rejects_invalid_targets() {
+        let table_id = 1024;
+        let physical_route =
+            physical_route(vec![test_region_route(RegionId::new(table_id, 1), "")]);
+        let invalid_to_expr = PartitionExpr::new(
+            Operand::Value(Value::Int64(1)),
+            RestrictedOp::Eq,
+            Operand::Value(Value::Int64(2)),
+        );
+
+        let err =
+            RepartitionStart::build_plan(&physical_route, &[], &[invalid_to_expr]).unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("Failed to create repartition subtasks")
+        );
+    }
+
+    #[test]
+    fn test_build_plan_keeps_partitioned_source_matching() {
+        let table_id = 1024;
+        let from_exprs = vec![range_expr("x", 0, 100)];
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+        let physical_route = physical_route(vec![RegionRoute {
+            region: Region {
+                id: RegionId::new(table_id, 1),
+                partition_expr: from_exprs[0].as_json_str().unwrap(),
+                ..Default::default()
+            },
+            leader_peer: Some(Peer::empty(1)),
+            ..Default::default()
+        }]);
+
+        let plans = RepartitionStart::build_plan(&physical_route, &from_exprs, &to_exprs).unwrap();
+
+        assert_eq!(plans.len(), 1);
+        assert_eq!(
+            plans[0].source_regions,
+            vec![SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 1),
+                from_exprs[0].clone()
+            )]
+        );
+    }
+
+    #[test]
+    fn test_repartition_start_deserializes_legacy_from_exprs() {
+        let from_exprs = vec![range_expr("x", 0, 100)];
+        let to_exprs = vec![range_expr("x", 0, 50), range_expr("x", 50, 100)];
+        let json = serde_json::json!({
+            "from_exprs": from_exprs,
+            "to_exprs": to_exprs,
+        })
+        .to_string();
+
+        let state: RepartitionStart = serde_json::from_str(&json).unwrap();
+
+        let RepartitionFrom::Partitioned { exprs } = state.from else {
+            panic!("expected partition source");
+        };
+        assert_eq!(exprs, vec![range_expr("x", 0, 100)]);
+    }
+
+    #[test]
+    fn test_repartition_start_deserializes_current_from() {
+        let state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+        let json = serde_json::to_string(&state).unwrap();
+
+        let state: RepartitionStart = serde_json::from_str(&json).unwrap();
+
+        let RepartitionFrom::Unpartitioned { partition_columns } = state.from else {
+            panic!("expected unpartitioned source");
+        };
+        assert_eq!(partition_columns, vec!["col1"]);
+    }
+
+    #[tokio::test]
+    async fn test_partitioned_source_does_not_initialize_partition_metadata_update() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(
+                RegionId::new(table_id, 1),
+                &range_expr("x", 0, 100).as_json_str().unwrap(),
+            )],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        let mut ctx = new_parent_context(&env, node_manager, table_id);
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Partitioned {
+                exprs: vec![range_expr("x", 0, 100)],
+            },
+            vec![range_expr("x", 0, 50), range_expr("x", 50, 100)],
+        );
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(!status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_unpartitioned_source_initializes_partition_metadata_update() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col2".to_string(), "col1".to_string()],
+            },
+            vec![range_expr("col2", 0, 50), range_expr("col2", 50, 100)],
+        );
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(status.need_persist());
+        assert!(next.as_any().is::<UpdatePartitionMetadata>());
+        assert_eq!(
+            ctx.persistent_ctx
+                .partition_metadata_update
+                .as_ref()
+                .unwrap()
+                .partition_key_indices,
+            vec![2, 0]
+        );
+    }
+
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_existing_partition_metadata() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let current = ctx.get_raw_table_info_value().await.unwrap();
+        let mut table_info = current.table_info.clone();
+        table_info.meta.partition_key_indices = vec![0];
+        ctx.update_table_info(&current, current.update(table_info))
+            .await
+            .unwrap();
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(err.to_string().contains("expects an unpartitioned table"));
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_repartition_start_rejects_empty_target_partition_exprs() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state =
+            RepartitionStart::new(RepartitionFrom::Partitioned { exprs: vec![] }, vec![]);
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("non-empty target partition expressions")
+        );
+    }
+
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_empty_target_partition_exprs() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["col1".to_string()],
+            },
+            vec![],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("non-empty target partition expressions")
+        );
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_empty_partition_columns() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec![],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(err.to_string().contains("non-empty partition columns"));
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_unpartitioned_source_rejects_missing_partition_column() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = RepartitionStart::new(
+            RepartitionFrom::Unpartitioned {
+                partition_columns: vec!["missing_col".to_string()],
+            },
+            vec![range_expr("col1", 0, 50)],
+        );
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(
+            err.to_string()
+                .contains("Partition column missing_col not found")
+        );
+        assert!(ctx.persistent_ctx.partition_metadata_update.is_none());
+    }
 }
--- a/src/meta-srv/src/procedure/repartition/test_util.rs
+++ b/src/meta-srv/src/procedure/repartition/test_util.rs
@@ -42,7 +42,7 @@ use uuid::Uuid;
 use crate::cache_invalidator::MetasrvCacheInvalidator;
 use crate::metasrv::MetasrvInfo;
 use crate::procedure::repartition::group::{Context, PersistentContext, VolatileContext};
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
 use crate::procedure::repartition::{
    Context as ParentContext, PersistentContext as ParentPersistentContext, RepartitionProcedure,
 };
@@ -177,8 +177,8 @@ pub fn test_region_wal_options(region_numbers: &[RegionNumber]) -> HashMap<Regio

 pub fn new_persistent_context(
    table_id: TableId,
-    sources: Vec<RegionDescriptor>,
-    targets: Vec<RegionDescriptor>,
+    sources: Vec<SourceRegionDescriptor>,
+    targets: Vec<TargetRegionDescriptor>,
 ) -> PersistentContext {
    PersistentContext {
        group_id: Uuid::new_v4(),
--- a/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs
+++ b/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs
@@ -0,0 +1,251 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_meta::lock_key::TableLock;
+use common_procedure::{Context as ProcedureContext, Status};
+use serde::{Deserialize, Serialize};
+use snafu::ensure;
+
+use crate::error::{self, Result};
+use crate::procedure::repartition::allocate_region::AllocateRegion;
+use crate::procedure::repartition::plan::AllocationPlanEntry;
+use crate::procedure::repartition::{Context, State};
+
+/// Pending change to a table's partition key indices.
+///
+/// Stored in the repartition procedure's persistent context and applied to the table
+/// info by [`UpdatePartitionMetadata`].
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PartitionMetadataUpdate {
+    /// Indices to write into the table info's `meta.partition_key_indices`.
+    pub partition_key_indices: Vec<usize>,
+}
+
+impl PartitionMetadataUpdate {
+    /// Creates an update that requests the given partition key indices.
+    pub fn new(partition_key_indices: Vec<usize>) -> Self {
+        Self {
+            partition_key_indices,
+        }
+    }
+}
+
+/// Repartition state that applies the pending [`PartitionMetadataUpdate`], if any,
+/// before region allocation.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct UpdatePartitionMetadata {
+    /// Plan entries handed unchanged to the following `AllocateRegion` state.
+    plan_entries: Vec<AllocationPlanEntry>,
+}
+
+impl UpdatePartitionMetadata {
+    /// Creates the state carrying the allocation plan for the next step.
+    pub fn new(plan_entries: Vec<AllocationPlanEntry>) -> Self {
+        UpdatePartitionMetadata { plan_entries }
+    }
+}
+
+#[async_trait::async_trait]
+#[typetag::serde]
+impl State for UpdatePartitionMetadata {
+    /// Writes the pending partition key indices into the table info, then hands over
+    /// to [`AllocateRegion`] with the same plan entries.
+    ///
+    /// - No pending update in the persistent context: returns
+    ///   `Status::executing(false)` without reading the table info.
+    /// - Table info already carries the requested indices: nothing is written.
+    /// - Table info carries no indices: the requested ones are stored.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArguments` when the pending indices are empty, or when the
+    /// table already has partition key indices that differ from the requested ones.
+    async fn next(
+        &mut self,
+        ctx: &mut Context,
+        procedure_ctx: &ProcedureContext,
+    ) -> Result<(Box<dyn State>, Status)> {
+        let requested_indices = match ctx.persistent_ctx.partition_metadata_update.as_ref() {
+            Some(update) => update.partition_key_indices.clone(),
+            None => {
+                return Ok((
+                    Box::new(AllocateRegion::new(self.plan_entries.clone())),
+                    Status::executing(false),
+                ));
+            }
+        };
+        ensure!(
+            !requested_indices.is_empty(),
+            error::InvalidArgumentsSnafu {
+                err_msg:
+                    "Repartition partition metadata update expects non-empty partition key indices"
+                        .to_string(),
+            }
+        );
+
+        let table_id = ctx.persistent_ctx.table_id;
+        // The value returned by `acquire_lock` is kept in `_guard` until this
+        // function returns.
+        let table_lock = TableLock::Write(table_id).into();
+        let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await;
+        let table_info_value = ctx.get_raw_table_info_value().await?;
+        let existing_indices = &table_info_value.table_info.meta.partition_key_indices;
+
+        // Identical indices mean there is nothing left to write.
+        if existing_indices != &requested_indices {
+            ensure!(
+                existing_indices.is_empty(),
+                error::InvalidArgumentsSnafu {
+                    err_msg: format!(
+                        "Repartition partition metadata update expects an unpartitioned table, but table {} has partition key indices: {:?}",
+                        table_id, existing_indices
+                    ),
+                }
+            );
+
+            let mut updated_table_info = table_info_value.table_info.clone();
+            updated_table_info.meta.partition_key_indices = requested_indices;
+            ctx.update_table_info(
+                &table_info_value,
+                table_info_value.update(updated_table_info),
+            )
+            .await?;
+            // We don't invalidate cache here because the subsequent AllocateRegion step
+            // will update the table route and invalidate the cache accordingly.
+        }
+
+        Ok((
+            Box::new(AllocateRegion::new(self.plan_entries.clone())),
+            Status::executing(true),
+        ))
+    }
+
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use common_meta::ddl::test_util::datanode_handler::NaiveDatanodeHandler;
+    use common_meta::test_util::MockDatanodeManager;
+    use store_api::storage::{RegionId, TableId};
+
+    use super::*;
+    use crate::procedure::repartition::plan::SourceRegionDescriptor;
+    use crate::procedure::repartition::test_util::{
+        TestingEnv, new_parent_context, range_expr, test_region_route, test_region_wal_options,
+    };
+
+    /// Creates physical table metadata with a single region route (region 1) and
+    /// returns a context whose pending update requests partition key indices `[0]`.
+    async fn new_test_context(env: &TestingEnv, table_id: TableId) -> Context {
+        env.create_physical_table_metadata_for_repartition(
+            table_id,
+            vec![test_region_route(RegionId::new(table_id, 1), "")],
+            test_region_wal_options(&[1]),
+        )
+        .await;
+        let node_manager = Arc::new(MockDatanodeManager::new(NaiveDatanodeHandler));
+        let mut ctx = new_parent_context(env, node_manager, table_id);
+        ctx.persistent_ctx.partition_metadata_update = Some(PartitionMetadataUpdate::new(vec![0]));
+        ctx
+    }
+
+    /// Overwrites the partition key indices stored in the table info.
+    async fn set_partition_key_indices(ctx: &Context, partition_key_indices: Vec<usize>) {
+        let current = ctx.get_raw_table_info_value().await.unwrap();
+        let mut table_info = current.table_info.clone();
+        table_info.meta.partition_key_indices = partition_key_indices;
+        ctx.update_table_info(&current, current.update(table_info))
+            .await
+            .unwrap();
+    }
+
+    /// Reads the partition key indices currently stored in the table info.
+    async fn partition_key_indices(ctx: &Context) -> Vec<usize> {
+        ctx.get_table_info_value()
+            .await
+            .unwrap()
+            .table_info
+            .meta
+            .partition_key_indices
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_applies_to_unpartitioned_table() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert_eq!(partition_key_indices(&ctx).await, vec![0]);
+    }
+
+    /// Without a pending update the step must go straight to `AllocateRegion`,
+    /// request no persistence and leave the table info untouched.
+    #[tokio::test]
+    async fn test_update_partition_metadata_skips_without_pending_update() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        ctx.persistent_ctx.partition_metadata_update = None;
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(!status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert!(partition_key_indices(&ctx).await.is_empty());
+    }
+
+    /// Running the step when the table info already has the requested indices
+    /// succeeds and keeps them unchanged.
+    #[tokio::test]
+    async fn test_update_partition_metadata_replay_is_noop() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        set_partition_key_indices(&ctx, vec![0]).await;
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+
+        let (next, status) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(status.need_persist());
+        assert!(next.as_any().is::<AllocateRegion>());
+        assert_eq!(partition_key_indices(&ctx).await, vec![0]);
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_rejects_empty_partition_key_indices() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        ctx.persistent_ctx.partition_metadata_update = Some(PartitionMetadataUpdate::new(vec![]));
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(err.to_string().contains("non-empty partition key indices"));
+        assert!(partition_key_indices(&ctx).await.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_update_partition_metadata_rejects_other_partition_keys() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        set_partition_key_indices(&ctx, vec![1]).await;
+        let mut state = UpdatePartitionMetadata::new(vec![]);
+
+        let err = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap_err();
+
+        assert!(err.to_string().contains("expects an unpartitioned table"));
+        assert_eq!(partition_key_indices(&ctx).await, vec![1]);
+    }
+
+    /// Runs the step with a non-empty plan and checks that it still transitions to
+    /// `AllocateRegion`.
+    #[tokio::test]
+    async fn test_update_partition_metadata_preserves_plan_entries() {
+        let env = TestingEnv::new();
+        let table_id = 1024;
+        let mut ctx = new_test_context(&env, table_id).await;
+        let plan_entries = vec![AllocationPlanEntry {
+            group_id: uuid::Uuid::new_v4(),
+            source_regions: vec![SourceRegionDescriptor::Default {
+                region_id: RegionId::new(table_id, 1),
+            }],
+            target_partition_exprs: vec![range_expr("x", 0, 10)],
+            transition_map: vec![vec![0]],
+        }];
+        let mut state = UpdatePartitionMetadata::new(plan_entries);
+
+        let (next, _) = state
+            .next(&mut ctx, &TestingEnv::procedure_context())
+            .await
+            .unwrap();
+
+        assert!(next.as_any().is::<AllocateRegion>());
+    }
+}
--- a/src/meta-srv/src/procedure/repartition/utils.rs
+++ b/src/meta-srv/src/procedure/repartition/utils.rs
@@ -23,7 +23,7 @@ use store_api::storage::{RegionId, RegionNumber, TableId};

 use crate::error::{self, Result};
 use crate::procedure::repartition::group::GroupId;
-use crate::procedure::repartition::plan::RegionDescriptor;
+use crate::procedure::repartition::plan::SourceRegionDescriptor;

 /// Returns the `datanode_table_value`
 ///
@@ -138,21 +138,23 @@ pub fn merge_and_validate_region_wal_options(
 /// restored here.
 pub fn rollback_group_metadata_routes(
    group_id: GroupId,
-    source_regions: &[RegionDescriptor],
+    source_regions: &[SourceRegionDescriptor],
    original_target_routes: &[RegionRoute],
    allocated_region_ids: &[RegionId],
    pending_deallocate_region_ids: &[RegionId],
    region_routes_map: &mut HashMap<RegionId, &mut RegionRoute>,
 ) -> Result<()> {
    for source in source_regions {
-        let region_route = region_routes_map.get_mut(&source.region_id).context(
+        let region_id = source.region_id();
+        let region_route = region_routes_map.get_mut(&region_id).context(
            error::RepartitionSourceRegionMissingSnafu {
                group_id,
-                region_id: source.region_id,
+                region_id,
            },
        )?;
        region_route.clear_leader_staging();
-        if pending_deallocate_region_ids.contains(&source.region_id) {
+        region_route.region.partition_expr = source.route_expr_for_rollback()?;
+        if pending_deallocate_region_ids.contains(&region_id) {
            region_route.clear_ignore_all_writes();
        }
    }
@@ -191,7 +193,7 @@ mod tests {

    use super::*;
    use crate::procedure::repartition::group::update_metadata::UpdateMetadata;
-    use crate::procedure::repartition::plan::RegionDescriptor;
+    use crate::procedure::repartition::plan::{SourceRegionDescriptor, TargetRegionDescriptor};
    use crate::procedure::repartition::test_util::range_expr;

    /// Helper function to create a Kafka WAL option string from a topic name.
@@ -242,7 +244,7 @@ mod tests {

    fn original_target_routes(
        region_routes: &[RegionRoute],
-        targets: &[RegionDescriptor],
+        targets: &[TargetRegionDescriptor],
    ) -> Vec<RegionRoute> {
        let target_ids = targets
            .iter()
@@ -380,16 +382,16 @@ mod tests {
            ),
            new_staged_region_route(RegionId::new(table_id, 3), "", None, false),
        ];
-        let sources = vec![RegionDescriptor {
-            region_id: RegionId::new(table_id, 1),
-            partition_expr: range_expr("x", 0, 100),
-        }];
+        let sources = vec![SourceRegionDescriptor::partitioned(
+            RegionId::new(table_id, 1),
+            range_expr("x", 0, 100),
+        )];
        let targets = vec![
-            RegionDescriptor {
+            TargetRegionDescriptor {
                region_id: RegionId::new(table_id, 1),
                partition_expr: range_expr("x", 0, 50),
            },
-            RegionDescriptor {
+            TargetRegionDescriptor {
                region_id: RegionId::new(table_id, 3),
                partition_expr: range_expr("x", 50, 100),
            },
@@ -420,6 +422,60 @@ mod tests {
        assert_eq!(applied_region_routes, original_region_routes);
    }

+    /// Rolling back a group whose source is a `SourceRegionDescriptor::Default` region
+    /// must reset that region's partition expression to the empty string and leave it
+    /// no longer marked as leader staging.
+    #[test]
+    fn test_rollback_group_metadata_routes_default_source_restores_empty_expr() {
+        let table_id = 1024;
+        let group_id = Uuid::new_v4();
+        let default_region_id = RegionId::new(table_id, 1);
+        let allocated_region_id = RegionId::new(table_id, 2);
+
+        let source_regions = vec![SourceRegionDescriptor::Default {
+            region_id: default_region_id,
+        }];
+        let target_regions: Vec<_> = [
+            (default_region_id, range_expr("x", 0, 50)),
+            (allocated_region_id, range_expr("x", 50, 100)),
+        ]
+        .into_iter()
+        .map(|(region_id, partition_expr)| TargetRegionDescriptor {
+            region_id,
+            partition_expr,
+        })
+        .collect();
+        let current_region_routes: Vec<_> = [default_region_id, allocated_region_id]
+            .into_iter()
+            .map(|region_id| new_staged_region_route(region_id, "", None, false))
+            .collect();
+        let original_target_routes = vec![current_region_routes[0].clone()];
+
+        // Staging first gives the default region the first target expression.
+        let mut applied_region_routes = UpdateMetadata::apply_staging_region_routes(
+            group_id,
+            &source_regions,
+            &target_regions,
+            &[],
+            &current_region_routes,
+        )
+        .unwrap();
+        let staged_expr = range_expr("x", 0, 50).as_json_str().unwrap();
+        assert_eq!(applied_region_routes[0].region.partition_expr, staged_expr);
+
+        {
+            // Scoped so the mutable borrows of the routes end before the assertions.
+            let mut region_routes_map: HashMap<_, _> = applied_region_routes
+                .iter_mut()
+                .map(|route| (route.region.id, route))
+                .collect();
+            rollback_group_metadata_routes(
+                group_id,
+                &source_regions,
+                &original_target_routes,
+                &[allocated_region_id],
+                &[],
+                &mut region_routes_map,
+            )
+            .unwrap();
+        }
+
+        let restored = &applied_region_routes[0];
+        assert_eq!(restored.region.partition_expr, "");
+        assert!(!restored.is_leader_staging());
+    }
+
    #[test]
    fn test_rollback_group_metadata_routes_merge_case_is_idempotent() {
        let group_id = Uuid::new_v4();
@@ -445,16 +501,16 @@ mod tests {
            ),
        ];
        let sources = vec![
-            RegionDescriptor {
-                region_id: RegionId::new(table_id, 1),
-                partition_expr: range_expr("x", 0, 100),
-            },
-            RegionDescriptor {
-                region_id: RegionId::new(table_id, 2),
-                partition_expr: range_expr("x", 100, 200),
-            },
+            SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 1),
+                range_expr("x", 0, 100),
+            ),
+            SourceRegionDescriptor::partitioned(
+                RegionId::new(table_id, 2),
+                range_expr("x", 100, 200),
+            ),
        ];
-        let targets = vec![RegionDescriptor {
+        let targets = vec![TargetRegionDescriptor {
            region_id: RegionId::new(table_id, 1),
            partition_expr: range_expr("x", 0, 200),
        }];
--- a/src/meta-srv/src/procedure/test_util.rs
+++ b/src/meta-srv/src/procedure/test_util.rs
@@ -66,7 +66,7 @@ impl MailboxContext {
    ) {
        let pusher_id = channel.pusher_id();
        let pusher = Pusher::new(tx);
-        let _ = self.pushers.insert(pusher_id.string_key(), pusher).await;
+        let _ = self.pushers.insert(pusher_id, pusher).await;
    }

    pub fn mailbox(&self) -> &MailboxRef {
--- a/src/meta-srv/src/service/heartbeat.rs
+++ b/src/meta-srv/src/service/heartbeat.rs
@@ -20,10 +20,12 @@ use api::v1::meta::{
    AskLeaderRequest, AskLeaderResponse, HeartbeatRequest, HeartbeatResponse, Peer, RequestHeader,
    ResponseHeader, Role, heartbeat_server,
 };
+use common_meta::election::LeaderChangeMessage;
 use common_telemetry::{debug, error, info, warn};
 use futures::StreamExt;
 use once_cell::sync::OnceCell;
 use snafu::{OptionExt, ResultExt};
+use tokio::sync::broadcast::error::RecvError;
 use tokio::sync::mpsc;
 use tokio::sync::mpsc::Sender;
 use tokio_stream::wrappers::ReceiverStream;
@@ -31,10 +33,282 @@ use tonic::{Request, Response, Status, Streaming};

 use crate::error::{self, Result};
 use crate::handler::{HeartbeatHandlerGroup, Pusher, PusherId};
-use crate::metasrv::{Context, Metasrv};
+use crate::metasrv::{Context, ElectionRef, Metasrv};
 use crate::metrics::METRIC_META_HEARTBEAT_RECV;
 use crate::service::{GrpcResult, GrpcStream};

+/// Item sent over the heartbeat response channel: a response, or the `Status` to
+/// report to the client.
+type HeartbeatResponseResult = std::result::Result<HeartbeatResponse, Status>;
+
+/// Source of incoming heartbeat requests consumed by `HeartbeatSession`.
+#[async_trait::async_trait]
+trait HeartbeatRequestStream {
+    /// Yields the next request or stream error, or `None` once the stream has ended.
+    async fn next(&mut self) -> Option<std::result::Result<HeartbeatRequest, Status>>;
+}
+
+/// [`HeartbeatRequestStream`] backed by a tonic `Streaming<HeartbeatRequest>`.
+struct TonicHeartbeatRequestStream {
+    inner: Streaming<HeartbeatRequest>,
+}
+
+impl TonicHeartbeatRequestStream {
+    /// Wraps the request stream of a heartbeat call.
+    fn new(inner: Streaming<HeartbeatRequest>) -> Self {
+        Self { inner }
+    }
+}
+
+#[async_trait::async_trait]
+impl HeartbeatRequestStream for TonicHeartbeatRequestStream {
+    /// Delegates to the wrapped stream's `next`.
+    async fn next(&mut self) -> Option<std::result::Result<HeartbeatRequest, Status>> {
+        self.inner.next().await
+    }
+}
+
+/// Outcome of waiting on a [`LeaderStepDown`] watcher.
+enum LeaderStepDownEvent {
+    /// A leader step-down was observed.
+    StepDown,
+    /// The watcher's event source was closed.
+    Closed,
+}
+
+/// Watcher that a heartbeat session awaits to learn that it has to stop.
+#[async_trait::async_trait]
+trait LeaderStepDown {
+    /// Resolves with the event that should end the session.
+    async fn wait(&mut self) -> LeaderStepDownEvent;
+}
+
+/// [`LeaderStepDown`] watcher fed by the election's leader-change broadcast channel.
+struct ElectionLeaderStepDown {
+    rx: tokio::sync::broadcast::Receiver<LeaderChangeMessage>,
+}
+
+impl ElectionLeaderStepDown {
+    /// Subscribes to leader changes of `election`.
+    fn new(election: ElectionRef) -> Self {
+        let rx = election.subscribe_leader_change();
+        Self { rx }
+    }
+}
+
+#[async_trait::async_trait]
+impl LeaderStepDown for ElectionLeaderStepDown {
+    /// Receives leader-change messages until a `StepDown` arrives or the channel is
+    /// closed. `Elected` messages are ignored and lag is only logged.
+    ///
+    /// NOTE(review): skipped messages are not inspected, so a lagged receiver could
+    /// miss a `StepDown` — confirm this is acceptable.
+    async fn wait(&mut self) -> LeaderStepDownEvent {
+        loop {
+            match self.rx.recv().await {
+                Ok(LeaderChangeMessage::StepDown(_)) => break LeaderStepDownEvent::StepDown,
+                Err(RecvError::Closed) => break LeaderStepDownEvent::Closed,
+                Ok(LeaderChangeMessage::Elected(_)) => continue,
+                Err(RecvError::Lagged(skipped)) => {
+                    warn!(
+                        "Leader step-down watcher lagged, skipped {} leader change events",
+                        skipped
+                    );
+                }
+            }
+        }
+    }
+}
+
+/// State of one heartbeat stream, built by `HeartbeatSession::init` from the first
+/// request and driven by `HeartbeatSession::run`.
+struct HeartbeatSession<R, L> {
+    /// Incoming heartbeat requests.
+    requests: R,
+    /// Channel used to send responses and errors for this stream.
+    tx: Sender<HeartbeatResponseResult>,
+    /// Optional leader step-down watcher; `run` takes it out before entering its loop.
+    leader_step_down: Option<L>,
+    /// Handles every request; also used to deregister the pusher on cleanup.
+    handler_group: Arc<HeartbeatHandlerGroup>,
+    /// Context cloned for each handled request.
+    ctx: Context,
+    /// Id returned by `register_pusher` for the first request's header.
+    sender_id: PusherId,
+}
+
+impl<R, L> HeartbeatSession<R, L>
+where
+    R: HeartbeatRequestStream,
+    L: LeaderStepDown,
+{
+    /// Initializes the heartbeat session from the first request of the stream.
+    ///
+    /// The first request's header is used to register a pusher for `tx`, and the
+    /// request itself is then handled as the handshake.
+    ///
+    /// Returns `None` when the stream ends before yielding a request, the first
+    /// message is an error, the first request has no header, or handling the
+    /// handshake asks to stop. In the last case the pusher is deregistered again.
+    async fn init(
+        mut requests: R,
+        tx: Sender<HeartbeatResponseResult>,
+        leader_step_down: Option<L>,
+        handler_group: Arc<HeartbeatHandlerGroup>,
+        ctx: Context,
+    ) -> Option<Self> {
+        let msg = requests.next().await?;
+
+        let req = match msg {
+            Ok(req) => req,
+            Err(err) => {
+                error!("Failed to receive the first heartbeat request, error: {err}");
+                // No pusher is registered yet, hence no sender id to report.
+                let _ = handle_request_stream_error(None, &tx, err).await;
+                return None;
+            }
+        };
+
+        let Some(header) = req.header.as_ref() else {
+            error!("Exit on malformed request: MissingRequestHeader");
+            let _ = tx
+                .send(Err(error::MissingRequestHeaderSnafu {}.build().into()))
+                .await;
+            return None;
+        };
+
+        let sender_id = register_pusher(&handler_group, header, tx.clone()).await;
+        let mut session = Self {
+            requests,
+            tx,
+            leader_step_down,
+            handler_group,
+            ctx,
+            sender_id,
+        };
+
+        // The first request is handled with the handshake flag set.
+        if session.handle_request(req, true).await {
+            Some(session)
+        } else {
+            session.cleanup().await;
+            None
+        }
+    }
+
+    /// Runs the heartbeat session until the request stream ends, a message asks to
+    /// stop, or the leader step-down watcher fires, then deregisters the pusher.
+    async fn run(mut self) {
+        // NOTE(review): presumably moved into a local so the `select!` branches do not
+        // both need to borrow `self` — confirm.
+        let mut leader_step_down = self.leader_step_down.take();
+
+        loop {
+            tokio::select! {
+                msg = self.requests.next() => {
+                    let Some(msg) = msg else {
+                        break;
+                    };
+
+                    if !self.handle_message(msg).await {
+                        break;
+                    }
+                }
+                // This branch is disabled when there is no watcher.
+                event = wait_leader_step_down(leader_step_down.as_mut()), if leader_step_down.is_some() => {
+                    match event {
+                        LeaderStepDownEvent::StepDown => {
+                            self.send_not_leader_error().await;
+                            break;
+                        }
+                        LeaderStepDownEvent::Closed => {
+                            warn!("Leader step-down watcher closed");
+                            self.send_election_unavailable_error().await;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        self.cleanup().await;
+    }
+
+    /// Handles the incoming message, and returns whether to continue the session.
+    ///
+    /// Requests are handled as non-handshake requests; stream errors go through
+    /// `handle_request_stream_error`.
+    async fn handle_message(&mut self, msg: std::result::Result<HeartbeatRequest, Status>) -> bool {
+        match msg {
+            Ok(req) => self.handle_request(req, false).await,
+            Err(err) => handle_request_stream_error(Some(self.sender_id), &self.tx, err).await,
+        }
+    }
+
+    /// Handles the incoming heartbeat request, and returns whether to continue the session.
+    ///
+    /// Increments `METRIC_META_HEARTBEAT_RECV` for this sender, runs the request
+    /// through the handler group and sends the outcome (response or error) on `tx`.
+    /// Returns `false` when `tx` is closed or when the response reports that this node
+    /// is not the leader; in the latter case a "not leader" error is sent as well.
+    async fn handle_request(&mut self, req: HeartbeatRequest, is_handshake: bool) -> bool {
+        debug!("Receiving heartbeat request: {:?}", req);
+
+        let sender_id = self.sender_id.to_string();
+        METRIC_META_HEARTBEAT_RECV
+            .with_label_values(&[sender_id.as_str()])
+            .inc();
+
+        let res = self
+            .handler_group
+            .handle(req, self.ctx.clone().with_handshake(is_handshake))
+            .await
+            .inspect_err(
+                |e| warn!(e; "Failed to handle heartbeat request, sender: {}", self.sender_id),
+            )
+            .map_err(|e| e.into());
+
+        // Read the flag before `res` is moved into the channel.
+        let is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader());
+
+        debug!("Sending heartbeat response: {:?}", res);
+
+        if self.tx.send(res).await.is_err() {
+            info!(
+                "ReceiverStream was dropped; shutting down, sender: {}",
+                self.sender_id
+            );
+            return false;
+        }
+
+        if is_not_leader {
+            warn!(
+                "Quit because it is no longer the leader, sender: {}",
+                self.sender_id
+            );
+            self.send_not_leader_error().await;
+            return false;
+        }
+
+        true
+    }
+
+    /// Sends an `aborted` status telling the client that this node is not the leader.
+    /// A failed send is ignored.
+    async fn send_not_leader_error(&mut self) {
+        let _ = self
+            .tx
+            .send(Err(Status::aborted(format!(
+                "The requested metasrv node is not leader, node addr: {}",
+                self.ctx.server_addr
+            ))))
+            .await;
+    }
+
+    /// Sends an `unavailable` status telling the client that this node is shutting
+    /// down. A failed send is ignored.
+    async fn send_election_unavailable_error(&mut self) {
+        let _ = self
+            .tx
+            .send(Err(Status::unavailable(format!(
+                "The requested metasrv node is shutting down, node addr: {}",
+                self.ctx.server_addr
+            ))))
+            .await;
+    }
+
+    /// Logs the end of the stream and deregisters this session's pusher; the result
+    /// of the deregistration is ignored.
+    async fn cleanup(&self) {
+        info!("Heartbeat stream closed, sender: {}", self.sender_id);
+        let _ = self.handler_group.deregister_push(self.sender_id).await;
+    }
+}
+
+/// Waits on `leader_step_down` when a watcher is supplied. With `None` the returned
+/// future never completes.
+async fn wait_leader_step_down<L>(leader_step_down: Option<&mut L>) -> LeaderStepDownEvent
+where
+    L: LeaderStepDown,
+{
+    let Some(watcher) = leader_step_down else {
+        return std::future::pending().await;
+    };
+    watcher.wait().await
+}
+
+/// Handles request stream error by logging and forwarding the error to the client if possible.
+///
+/// Returns `false` if the stream should be terminated.
+async fn handle_request_stream_error(
+    sender_id: Option<PusherId>,
+    tx: &Sender<HeartbeatResponseResult>,
+    err: Status,
+) -> bool {
+    if let Some(io_err) = error::match_for_io_error(&err)
+        && io_err.kind() == ErrorKind::BrokenPipe
+    {
+        error!("Client disconnected: broken pipe, sender: {:?}", sender_id);
+        return false;
+    }
+    error!(err; "Error while receiving heartbeat request, sender: {:?}", sender_id);
+
+    if tx.send(Err(err)).await.is_err() {
+        info!(
+            "Failed to forward heartbeat request stream error; response stream was dropped, sender: {:?}",
+            sender_id
+        );
+        return false;
+    }
+
+    true
+}
+
 #[async_trait::async_trait]
 impl heartbeat_server::Heartbeat for Metasrv {
    type HeartbeatStream = GrpcStream<HeartbeatResponse>;
@@ -43,88 +317,26 @@ impl heartbeat_server::Heartbeat for Metasrv {
        &self,
        req: Request<Streaming<HeartbeatRequest>>,
    ) -> GrpcResult<Self::HeartbeatStream> {
-        let mut in_stream = req.into_inner();
        let (tx, rx) = mpsc::channel(128);
        let handler_group = self.handler_group().context(error::UnexpectedSnafu {
            violated: "expected heartbeat handlers",
        })?;

        let ctx = self.new_ctx();
+        let requests = TonicHeartbeatRequestStream::new(req.into_inner());
        let _handle = common_runtime::spawn_global(async move {
-            let mut pusher_id = None;
-            while let Some(msg) = in_stream.next().await {
-                let mut is_not_leader = false;
-                match msg {
-                    Ok(req) => {
-                        debug!("Receiving heartbeat request: {:?}", req);
-
-                        let Some(header) = req.header.as_ref() else {
-                            error!("Exit on malformed request: MissingRequestHeader");
-                            let _ = tx
-                                .send(Err(error::MissingRequestHeaderSnafu {}.build().into()))
-                                .await;
-                            break;
-                        };
-
-                        let is_handshake = pusher_id.is_none();
-                        if is_handshake {
-                            pusher_id =
-                                Some(register_pusher(&handler_group, header, tx.clone()).await);
-                        }
-                        if let Some(k) = &pusher_id {
-                            METRIC_META_HEARTBEAT_RECV.with_label_values(&[&k.to_string()]);
-                        } else {
-                            METRIC_META_HEARTBEAT_RECV.with_label_values(&["none"]);
-                        }
-
-                        let res = handler_group
-                            .handle(req, ctx.clone().with_handshake(is_handshake))
-                            .await
-                            .inspect_err(|e| warn!(e; "Failed to handle heartbeat request, pusher: {pusher_id:?}", ))
-                            .map_err(|e| e.into());
-
-                        is_not_leader = res.as_ref().is_ok_and(|r| r.is_not_leader());
-
-                        debug!("Sending heartbeat response: {:?}", res);
-
-                        if tx.send(res).await.is_err() {
-                            info!("ReceiverStream was dropped; shutting down");
-                            break;
-                        }
-                    }
-                    Err(err) => {
-                        if let Some(io_err) = error::match_for_io_error(&err)
-                            && io_err.kind() == ErrorKind::BrokenPipe
-                        {
-                            // client disconnected in unexpected way
-                            error!("Client disconnected: broken pipe");
-                            break;
-                        }
-                        error!(err; "Sending heartbeat response error");
-
-                        if tx.send(Err(err)).await.is_err() {
-                            info!("ReceiverStream was dropped; shutting down");
-                            break;
-                        }
-                    }
-                }
-
-                if is_not_leader {
-                    warn!("Quit because it is no longer the leader");
-                    let _ = tx
-                        .send(Err(Status::aborted(format!(
-                            "The requested metasrv node is not leader, node addr: {}",
-                            ctx.server_addr
-                        ))))
-                        .await;
-                    break;
-                }
-            }
-
-            info!("Heartbeat stream closed: {pusher_id:?}");
-
-            if let Some(pusher_id) = pusher_id {
-                let _ = handler_group.deregister_push(pusher_id).await;
+            if let Some(session) = HeartbeatSession::init(
+                requests,
+                tx,
+                ctx.election
+                    .as_ref()
+                    .map(|r| ElectionLeaderStepDown::new(r.clone())),
+                handler_group,
+                ctx,
+            )
+            .await
+            {
+                session.run().await;
            }
        });

@@ -192,6 +404,7 @@ async fn register_pusher(

 #[cfg(test)]
 mod tests {
+    use std::collections::VecDeque;
    use std::sync::Arc;

    use api::v1::meta::heartbeat_server::Heartbeat;
@@ -199,12 +412,300 @@ mod tests {
    use common_meta::kv_backend::memory::MemoryKvBackend;
    use common_telemetry::tracing_context::W3cTrace;
    use servers::grpc::GrpcOptions;
-    use tonic::IntoRequest;
+    use tokio::sync::mpsc;
+    use tonic::{Code, IntoRequest};

-    use super::get_node_id;
+    use super::*;
+    use crate::handler::test_utils::TestEnv;
    use crate::metasrv::MetasrvOptions;
    use crate::metasrv::builder::MetasrvBuilder;

+    /// Scripted [`HeartbeatRequestStream`]: yields queued items front to back.
+    struct MockHeartbeatRequestStream {
+        messages: VecDeque<std::result::Result<HeartbeatRequest, Status>>,
+        pending_when_empty: bool,
+    }
+
+    impl MockHeartbeatRequestStream {
+        /// Stream that ends (`None`) once every scripted message was yielded.
+        fn new(messages: Vec<std::result::Result<HeartbeatRequest, Status>>) -> Self {
+            MockHeartbeatRequestStream {
+                pending_when_empty: false,
+                messages: VecDeque::from(messages),
+            }
+        }
+
+        /// Stream that never resolves again once every scripted message was yielded.
+        fn pending_after(messages: Vec<std::result::Result<HeartbeatRequest, Status>>) -> Self {
+            MockHeartbeatRequestStream {
+                pending_when_empty: true,
+                messages: VecDeque::from(messages),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl HeartbeatRequestStream for MockHeartbeatRequestStream {
+        async fn next(&mut self) -> Option<std::result::Result<HeartbeatRequest, Status>> {
+            match self.messages.pop_front() {
+                Some(message) => Some(message),
+                // Script drained: hang forever or signal end of stream.
+                None if self.pending_when_empty => std::future::pending().await,
+                None => None,
+            }
+        }
+    }
+    /// [`LeaderStepDown`] mock that resolves with one preset event.
+    struct MockLeaderStepDown {
+        event: Option<LeaderStepDownEvent>,
+    }
+
+    impl MockLeaderStepDown {
+        fn new(event: LeaderStepDownEvent) -> Self {
+            Self { event: Some(event) }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl LeaderStepDown for MockLeaderStepDown {
+        async fn wait(&mut self) -> LeaderStepDownEvent {
+            self.event.take().unwrap() // single-shot: a second call finds `None` and panics
+        }
+    }
+    /// Builds a request whose header carries `role` and `member_id`; the rest is default.
+    fn heartbeat_request(role: Role, member_id: u64) -> HeartbeatRequest {
+        HeartbeatRequest {
+            header: Some(RequestHeader {
+                role: role.into(),
+                member_id,
+                ..Default::default()
+            }),
+            ..Default::default()
+        }
+    }
+    /// Shorthand for [`PusherId::new`].
+    fn sender_id(role: Role, member_id: u64) -> PusherId {
+        PusherId::new(role, member_id)
+    }
+    /// Returns the context provided by a freshly created [`TestEnv`].
+    fn test_context() -> Context {
+        TestEnv::new().ctx()
+    }
+    /// Returns a default [`HeartbeatHandlerGroup`] behind an `Arc`.
+    fn test_handler_group() -> Arc<HeartbeatHandlerGroup> {
+        Arc::new(HeartbeatHandlerGroup::default())
+    }
+    /// Runs [`HeartbeatSession::init`] with the given parts and a fresh [`test_context`].
+    async fn init_session<L>(
+        requests: MockHeartbeatRequestStream,
+        tx: Sender<HeartbeatResponseResult>,
+        leader_step_down: Option<L>,
+        handler_group: Arc<HeartbeatHandlerGroup>,
+    ) -> Option<HeartbeatSession<MockHeartbeatRequestStream, L>>
+    where
+        L: LeaderStepDown,
+    {
+        HeartbeatSession::init(
+            requests,
+            tx,
+            leader_step_down,
+            handler_group,
+            test_context(),
+        )
+        .await
+    }
+    /// Receives the next item from `rx`; panics if the channel is closed and empty.
+    async fn recv_response(
+        rx: &mut mpsc::Receiver<HeartbeatResponseResult>,
+    ) -> HeartbeatResponseResult {
+        rx.recv().await.unwrap()
+    }
+    /// With an empty request stream, `init` yields no session and the pusher stays unregistered.
+    #[tokio::test]
+    async fn test_heartbeat_session_init_returns_none_on_empty_stream() {
+        let (tx, _rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let requests = MockHeartbeatRequestStream::new(vec![]);
+
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await;
+
+        assert!(session.is_none());
+        assert!(
+            !handler_group
+                .contains_pusher(&sender_id(Role::Datanode, 42))
+                .await
+        );
+    }
+    /// An error as the first stream item is forwarded to the channel; `init` yields no session.
+    #[tokio::test]
+    async fn test_heartbeat_session_init_forwards_first_stream_error() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let requests = MockHeartbeatRequestStream::new(vec![Err(Status::internal("boom"))]);
+
+        let session = init_session(requests, tx, None::<MockLeaderStepDown>, handler_group).await;
+
+        assert!(session.is_none());
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Internal, status.code());
+        assert_eq!("boom", status.message());
+    }
+    /// A first request without a header yields no session and an `InvalidArgument` response.
+    #[tokio::test]
+    async fn test_heartbeat_session_init_sends_error_on_missing_header() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let requests = MockHeartbeatRequestStream::new(vec![Ok(HeartbeatRequest::default())]);
+
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await;
+
+        assert!(session.is_none());
+        assert!(
+            !handler_group
+                .contains_pusher(&sender_id(Role::Datanode, 42))
+                .await
+        );
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::InvalidArgument, status.code());
+    }
+    /// A valid first request registers the pusher and yields a response with `heartbeat_config`.
+    #[tokio::test]
+    async fn test_heartbeat_session_init_registers_sender() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests =
+            MockHeartbeatRequestStream::new(vec![Ok(heartbeat_request(Role::Datanode, 42))]);
+
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await;
+
+        assert!(session.is_some());
+        assert!(handler_group.contains_pusher(&sender_id).await);
+
+        let response = recv_response(&mut rx).await.unwrap();
+        assert!(response.heartbeat_config.is_some());
+    }
+    /// Once the request stream ends, `run` returns and the pusher is no longer registered.
+    #[tokio::test]
+    async fn test_heartbeat_session_run_deregisters_sender_on_stream_close() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests =
+            MockHeartbeatRequestStream::new(vec![Ok(heartbeat_request(Role::Datanode, 42))]);
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+    /// A stream error after init is forwarded to the channel and the pusher is deregistered.
+    #[tokio::test]
+    async fn test_heartbeat_session_run_forwards_stream_error_after_init() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests = MockHeartbeatRequestStream::new(vec![
+            Ok(heartbeat_request(Role::Datanode, 42)),
+            Err(Status::unavailable("temporary")),
+        ]);
+        let session = init_session(
+            requests,
+            tx,
+            None::<MockLeaderStepDown>,
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Unavailable, status.code());
+        assert_eq!("temporary", status.message());
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+    /// A `StepDown` event on an idle stream sends `Aborted` and deregisters the pusher.
+    #[tokio::test]
+    async fn test_heartbeat_session_leader_step_down_sends_aborted_and_deregisters() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests = MockHeartbeatRequestStream::pending_after(vec![Ok(heartbeat_request(
+            Role::Datanode,
+            42,
+        ))]);
+        let session = init_session(
+            requests,
+            tx,
+            Some(MockLeaderStepDown::new(LeaderStepDownEvent::StepDown)),
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Aborted, status.code());
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+    /// A `Closed` event on an idle stream sends `Unavailable` and deregisters the pusher.
+    #[tokio::test]
+    async fn test_heartbeat_session_leader_watcher_closed_sends_unavailable_and_deregisters() {
+        let (tx, mut rx) = mpsc::channel(8);
+        let handler_group = test_handler_group();
+        let sender_id = sender_id(Role::Datanode, 42);
+        let requests = MockHeartbeatRequestStream::pending_after(vec![Ok(heartbeat_request(
+            Role::Datanode,
+            42,
+        ))]);
+        let session = init_session(
+            requests,
+            tx,
+            Some(MockLeaderStepDown::new(LeaderStepDownEvent::Closed)),
+            handler_group.clone(),
+        )
+        .await
+        .unwrap();
+        let _ = recv_response(&mut rx).await.unwrap();
+
+        session.run().await;
+
+        let status = recv_response(&mut rx).await.unwrap_err();
+        assert_eq!(Code::Unavailable, status.code());
+        assert!(!handler_group.contains_pusher(&sender_id).await);
+    }
+
    #[tokio::test]
    async fn test_ask_leader() {
        let kv_backend = Arc::new(MemoryKvBackend::new());
--- a/src/meta-srv/src/service/mailbox.rs
+++ b/src/meta-srv/src/service/mailbox.rs
@@ -13,7 +13,6 @@
 // limitations under the License.

 use std::fmt::{Display, Formatter};
-use std::ops::Range;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -69,20 +68,11 @@ pub enum BroadcastChannel {
 }

 impl BroadcastChannel {
-    pub(crate) fn pusher_range(&self) -> Range<String> {
+    pub(crate) fn role(&self) -> Role {
        match self {
-            BroadcastChannel::Datanode => Range {
-                start: format!("{}-", Role::Datanode as i32),
-                end: format!("{}-", Role::Frontend as i32),
-            },
-            BroadcastChannel::Frontend => Range {
-                start: format!("{}-", Role::Frontend as i32),
-                end: format!("{}-", Role::Flownode as i32),
-            },
-            BroadcastChannel::Flownode => Range {
-                start: format!("{}-", Role::Flownode as i32),
-                end: format!("{}-", Role::Flownode as i32 + 1),
-            },
+            BroadcastChannel::Datanode => Role::Datanode,
+            BroadcastChannel::Frontend => Role::Frontend,
+            BroadcastChannel::Flownode => Role::Flownode,
        }
    }
 }
@@ -207,9 +197,6 @@ pub trait Mailbox: Send + Sync {
    async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()>;

    async fn on_recv(&self, id: MessageId, maybe_msg: Result<MailboxMessage>) -> Result<()>;
-
-    /// Reset all pushers of the mailbox.
-    async fn reset(&self);
 }

 #[cfg(test)]
@@ -222,19 +209,10 @@ mod tests {
    use super::*;

    #[test]
-    fn test_channel_pusher_range() {
-        assert_eq!(
-            BroadcastChannel::Datanode.pusher_range(),
-            ("0-".to_string().."1-".to_string())
-        );
-        assert_eq!(
-            BroadcastChannel::Frontend.pusher_range(),
-            ("1-".to_string().."2-".to_string())
-        );
-        assert_eq!(
-            BroadcastChannel::Flownode.pusher_range(),
-            ("2-".to_string().."3-".to_string())
-        );
+    fn test_broadcast_channel_role() {
+        assert_eq!(BroadcastChannel::Datanode.role(), Role::Datanode);
+        assert_eq!(BroadcastChannel::Frontend.role(), Role::Frontend);
+        assert_eq!(BroadcastChannel::Flownode.role(), Role::Flownode);
    }

    #[tokio::test]
--- a/src/metric-engine/src/engine/put.rs
+++ b/src/metric-engine/src/engine/put.rs
@@ -31,7 +31,7 @@ use store_api::storage::{RegionId, TableId};

 use crate::engine::MetricEngineInner;
 use crate::error::{
-    ColumnNotFoundSnafu, CreateDefaultSnafu, ForbiddenPhysicalAlterSnafu, InvalidRequestSnafu,
+    ColumnNotFoundSnafu, CreateDefaultSnafu, ForbiddenPhysicalWriteSnafu, InvalidRequestSnafu,
    LogicalRegionNotFoundSnafu, PhysicalRegionNotFoundSnafu, Result, UnexpectedRequestSnafu,
    UnsupportedRegionRequestSnafu,
 };
@@ -55,7 +55,7 @@ impl MetricEngineInner {
            );
            FORBIDDEN_OPERATION_COUNT.inc();

-            ForbiddenPhysicalAlterSnafu.fail()
+            ForbiddenPhysicalWriteSnafu.fail()
        } else {
            self.put_logical_region(region_id, request).await
        }
@@ -86,18 +86,31 @@ impl MetricEngineInner {

        // Fast path: single request, no batching overhead
        if len == 1 {
-            let (logical_id, req) = requests.into_iter().next().unwrap();
-            return self.put_logical_region(logical_id, req).await;
+            let (region_id, req) = requests.into_iter().next().unwrap();
+            let is_putting_physical_region =
+                self.state.read().unwrap().exist_physical_region(region_id);
+            if is_putting_physical_region {
+                FORBIDDEN_OPERATION_COUNT.inc();
+                return ForbiddenPhysicalWriteSnafu.fail();
+            }
+
+            return self.put_logical_region(region_id, req).await;
        }

        let mut requests_per_physical: HashMap<RegionId, Vec<(RegionId, RegionPutRequest)>> =
            HashMap::new();
-        for (logical_region_id, request) in requests {
-            let physical_region_id = self.find_physical_region_id(logical_region_id)?;
+        for (region_id, request) in requests {
+            let is_putting_physical_region =
+                self.state.read().unwrap().exist_physical_region(region_id);
+            if is_putting_physical_region {
+                FORBIDDEN_OPERATION_COUNT.inc();
+                return ForbiddenPhysicalWriteSnafu.fail();
+            }
+            let physical_region_id = self.find_physical_region_id(region_id)?;
            requests_per_physical
                .entry(physical_region_id)
                .or_default()
-                .push((logical_region_id, request));
+                .push((region_id, request));
        }

        let mut total_affected_rows: AffectedRows = 0;
@@ -1226,6 +1239,84 @@ mod tests {
        assert_eq!(batches.iter().map(|b| b.num_rows()).sum::<usize>(), 0);
    }

+    #[tokio::test]
+    async fn test_batch_write_single_physical_region_forbidden() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let engine = env.metric();
+        // A one-element batch that targets the physical region itself; it must be rejected.
+        let physical_region_id = env.default_physical_region_id();
+        let schema = test_util::row_schema_with_tags(&["job"]);
+        let requests = vec![(
+            physical_region_id,
+            RegionPutRequest {
+                rows: Rows {
+                    schema,
+                    rows: test_util::build_rows(1, 1),
+                },
+                hint: None,
+                partition_expr_version: None,
+            },
+        )];
+
+        let err = engine
+            .inner
+            .put_regions_batch(requests.into_iter())
+            .await
+            .unwrap_err();
+
+        assert!(matches!(
+            err,
+            crate::error::Error::ForbiddenPhysicalWrite { .. }
+        ));
+    }
+    /// A batch mixing a logical-region write with a physical-region write fails as forbidden.
+    #[tokio::test]
+    async fn test_batch_write_physical_region_forbidden() {
+        let env = TestEnv::new().await;
+        env.init_metric_region().await;
+        let engine = env.metric();
+
+        let physical_region_id = env.default_physical_region_id();
+        let logical_region_id = env.default_logical_region_id();
+        let schema = test_util::row_schema_with_tags(&["job"]);
+        let requests = vec![
+            (
+                logical_region_id,
+                RegionPutRequest {
+                    rows: Rows {
+                        schema: schema.clone(),
+                        rows: test_util::build_rows(1, 1),
+                    },
+                    hint: None,
+                    partition_expr_version: None,
+                },
+            ),
+            (
+                physical_region_id,
+                RegionPutRequest {
+                    rows: Rows {
+                        schema,
+                        rows: test_util::build_rows(1, 1),
+                    },
+                    hint: None,
+                    partition_expr_version: None,
+                },
+            ),
+        ];
+
+        let err = engine
+            .inner
+            .put_regions_batch(requests.into_iter())
+            .await
+            .unwrap_err();
+
+        assert!(matches!(
+            err,
+            crate::error::Error::ForbiddenPhysicalWrite { .. }
+        ));
+    }
+
    #[tokio::test]
    async fn test_batch_write_single_request_fast_path() {
        let env = TestEnv::new().await;
--- a/src/metric-engine/src/error.rs
+++ b/src/metric-engine/src/error.rs
@@ -254,6 +254,12 @@ pub enum Error {
        location: Location,
    },

+    #[snafu(display("Write request to physical region is forbidden"))]
+    ForbiddenPhysicalWrite {
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display("Invalid region metadata"))]
    InvalidMetadata {
        source: store_api::metadata::MetadataError,
@@ -411,6 +417,7 @@ impl ErrorExt for Error {
            | CreateDefault { .. } => StatusCode::InvalidArguments,

            ForbiddenPhysicalAlter { .. }
+            | ForbiddenPhysicalWrite { .. }
            | UnsupportedRegionRequest { .. }
            | MissingFiles { .. } => StatusCode::Unsupported,

--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -53,6 +53,7 @@ dashmap.workspace = true
 dotenv.workspace = true
 either.workspace = true
 futures.workspace = true
+humantime.workspace = true
 humantime-serde.workspace = true
 index.workspace = true
 itertools.workspace = true
--- a/src/mito2/src/cache.rs
+++ b/src/mito2/src/cache.rs
@@ -30,6 +30,7 @@ use std::sync::Arc;
 use bytes::Bytes;
 use common_base::readable_size::ReadableSize;
 use common_telemetry::warn;
+use datatypes::arrow::buffer::BooleanBuffer;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::value::Value;
 use datatypes::vectors::VectorRef;
@@ -38,8 +39,10 @@ use index::result_cache::IndexResultCache;
 use moka::notification::RemovalCause;
 use moka::sync::Cache;
 use object_store::ObjectStore;
+use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
 use parquet::file::metadata::{FileMetaData, PageIndexPolicy, ParquetMetaData};
 use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef};
+use smallvec::SmallVec;
 use snafu::{OptionExt, ResultExt};
 use store_api::metadata::RegionMetadataRef;
 use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector};
@@ -74,6 +77,8 @@ const INDEX_TYPE: &str = "index";
 const SELECTOR_RESULT_TYPE: &str = "selector_result";
 /// Metrics type key for range scan result cache.
 const RANGE_RESULT_TYPE: &str = "range_result";
+/// Metrics type key for prefilter result cache.
+const PREFILTER_RESULT_TYPE: &str = "prefilter_result";
 const RANGE_RESULT_CONCAT_MEMORY_LIMIT: ReadableSize = ReadableSize::mb(512);
 const RANGE_RESULT_CONCAT_MEMORY_PERMIT: ReadableSize = ReadableSize::kb(1);

@@ -274,6 +279,117 @@ fn strip_region_metadata_from_parquet(parquet_metadata: ParquetMetaData) -> Parq
        .build()
 }

+/// Maps a moka [RemovalCause] to the label value used by the `CACHE_EVICTION` metric.
+fn removal_cause_str(cause: RemovalCause) -> &'static str {
+    match cause {
+        RemovalCause::Expired => "expired",
+        RemovalCause::Explicit => "explicit",
+        RemovalCause::Replaced => "replaced",
+        RemovalCause::Size => "size",
+    }
+}
+/// Hashable mirror of parquet's `RowSelector`, used inside [PrefilterKey].
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct PrefilterRowSelector {
+    row_count: usize,
+    skip: bool,
+}
+// `parquet::arrow::arrow_reader::RowSelector` does not implement `Hash`, but
+// prefilter cache keys must hash the upstream row-selection snapshot. Keep a
+// local hashable mirror of the two fields that define selector semantics.
+// TODO(yingwen): Remove this mirror if upstream `RowSelector` implements `Hash`.
+impl From<&RowSelector> for PrefilterRowSelector {
+    fn from(selector: &RowSelector) -> Self {
+        Self {
+            row_count: selector.row_count,
+            skip: selector.skip,
+        }
+    }
+}
+
+/// Key for a cached prefilter result.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) struct PrefilterKey {
+    file_id: FileId,
+    row_group_idx: u32,
+    row_selection: Option<Arc<Vec<PrefilterRowSelector>>>, // see `row_selection_snapshot()`
+    schema_version: u64,
+    filter_exprs: SmallVec<[String; 1]>,
+    mem_usage: usize, // estimated in `new()`; also part of the derived `Eq`/`Hash`
+}
+
+impl PrefilterKey {
+    /// Copies `row_selection` into a shareable list of hashable selectors.
+    /// Returns `None` when no row selection is given.
+    pub(crate) fn row_selection_snapshot(
+        row_selection: Option<&RowSelection>,
+    ) -> Option<Arc<Vec<PrefilterRowSelector>>> {
+        let snapshot = |selection: &RowSelection| -> Vec<PrefilterRowSelector> {
+            selection.iter().map(PrefilterRowSelector::from).collect()
+        };
+        row_selection.map(snapshot).map(Arc::new)
+    }
+
+    /// Builds a key and precomputes its estimated memory footprint from content sizes
+    /// (element counts and string lengths), not allocation capacities: `mem_usage` is part
+    /// of the derived `Eq`/`Hash`, so keys with equal contents must store the same value.
+    pub(crate) fn new(
+        file_id: FileId,
+        row_group_idx: u32,
+        row_selection: Option<Arc<Vec<PrefilterRowSelector>>>,
+        schema_version: u64,
+        filter_exprs: SmallVec<[String; 1]>,
+    ) -> Self {
+        let row_selection_bytes = row_selection
+            .as_ref()
+            .map(|selection| selection.len() * mem::size_of::<PrefilterRowSelector>())
+            .unwrap_or(0);
+        // More expressions than the inline capacity are necessarily stored on the heap.
+        let spilled_expr_bytes = if filter_exprs.len() > filter_exprs.inline_size() {
+            filter_exprs.len() * mem::size_of::<String>()
+        } else {
+            0
+        };
+        let expr_bytes = filter_exprs.iter().map(|s| s.len()).sum::<usize>();
+        let mem_usage =
+            mem::size_of::<Self>() + row_selection_bytes + spilled_expr_bytes + expr_bytes;
+        Self {
+            file_id,
+            row_group_idx,
+            row_selection,
+            schema_version,
+            filter_exprs,
+            mem_usage,
+        }
+    }
+
+    fn mem_usage(&self) -> usize {
+        self.mem_usage
+    }
+}
+
+type PrefilterResultCache = Cache<PrefilterKey, Arc<BooleanBuffer>>;
+/// Builds a cache bounded by `capacity` total entry weight; removals update cache metrics.
+fn new_prefilter_result_cache(capacity: u64) -> PrefilterResultCache {
+    Cache::builder()
+        .max_capacity(capacity)
+        .weigher(prefilter_result_cache_weight)
+        .eviction_listener(|k, v, cause| {
+            let size = prefilter_result_cache_weight(&k, &v);
+            CACHE_BYTES
+                .with_label_values(&[PREFILTER_RESULT_TYPE])
+                .sub(size.into());
+            CACHE_EVICTION
+                .with_label_values(&[PREFILTER_RESULT_TYPE, removal_cause_str(cause)])
+                .inc();
+        })
+        .build()
+}
+/// Entry weight: key estimate + `BooleanBuffer` struct + backing bytes (`as u32` truncates).
+fn prefilter_result_cache_weight(k: &PrefilterKey, v: &Arc<BooleanBuffer>) -> u32 {
+    (k.mem_usage() + mem::size_of::<BooleanBuffer>() + v.values().len()) as u32
+}
+
 /// Cache strategies that may only enable a subset of caches.
 #[derive(Clone)]
 pub enum CacheStrategy {
@@ -358,6 +474,23 @@ impl CacheStrategy {
        }
    }

+    /// Looks up `key` through [CacheManager::get_prefilter_result()].
+    /// Only [CacheStrategy::EnableAll] consults the cache; every other strategy returns None.
+    pub(crate) fn get_prefilter_result(&self, key: &PrefilterKey) -> Option<Arc<BooleanBuffer>> {
+        let CacheStrategy::EnableAll(cache_manager) = self else {
+            return None;
+        };
+        cache_manager.get_prefilter_result(key)
+    }
+    /// Stores `result` under `key` via [CacheManager::put_prefilter_result()].
+    /// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
+    pub(crate) fn put_prefilter_result(&self, key: PrefilterKey, result: Arc<BooleanBuffer>) {
+        let CacheStrategy::EnableAll(cache_manager) = self else {
+            return;
+        };
+        cache_manager.put_prefilter_result(key, result);
+    }
+
    /// Calls [CacheManager::remove_parquet_meta_data()].
    pub fn remove_parquet_meta_data(&self, file_id: RegionFileId) {
        match self {
@@ -610,6 +743,8 @@ pub struct CacheManager {
    range_result_memory_limiter: Arc<RangeResultMemoryLimiter>,
    /// Cache for index result.
    index_result_cache: Option<IndexResultCache>,
+    /// Cache for prefilter result.
+    prefilter_result_cache: Option<PrefilterResultCache>,
 }

 pub type CacheManagerRef = Arc<CacheManager>;
@@ -908,6 +1043,21 @@ impl CacheManager {
    pub(crate) fn index_result_cache(&self) -> Option<&IndexResultCache> {
        self.index_result_cache.as_ref()
    }
+    /// Gets the prefilter result cached for `key`; always `None` when the cache is disabled.
+    pub(crate) fn get_prefilter_result(&self, key: &PrefilterKey) -> Option<Arc<BooleanBuffer>> {
+        self.prefilter_result_cache
+            .as_ref()
+            .and_then(|cache| update_hit_miss(cache.get(key), PREFILTER_RESULT_TYPE))
+    }
+    /// Caches `result` under `key` and adds its weight to `CACHE_BYTES`; no-op if disabled.
+    pub(crate) fn put_prefilter_result(&self, key: PrefilterKey, result: Arc<BooleanBuffer>) {
+        if let Some(cache) = &self.prefilter_result_cache {
+            CACHE_BYTES
+                .with_label_values(&[PREFILTER_RESULT_TYPE])
+                .add(prefilter_result_cache_weight(&key, &result).into());
+            cache.insert(key, result);
+        }
+    }
 }

 /// Increases selector cache miss metrics.
@@ -930,6 +1080,7 @@ pub struct CacheManagerBuilder {
    index_content_size: u64,
    index_content_page_size: u64,
    index_result_cache_size: u64,
+    prefilter_result_cache_size: u64,
    puffin_metadata_size: u64,
    write_cache: Option<WriteCacheRef>,
    selector_result_cache_size: u64,
@@ -985,6 +1136,12 @@ impl CacheManagerBuilder {
        self
    }

+    /// Sets the cache size for prefilter results in bytes; `0` leaves the cache disabled.
+    pub fn prefilter_result_cache_size(mut self, bytes: u64) -> Self {
+        self.prefilter_result_cache_size = bytes;
+        self
+    }
+
    /// Sets cache size for puffin metadata.
    pub fn puffin_metadata_size(mut self, bytes: u64) -> Self {
        self.puffin_metadata_size = bytes;
@@ -1005,15 +1162,6 @@ impl CacheManagerBuilder {

    /// Builds the [CacheManager].
    pub fn build(self) -> CacheManager {
-        fn to_str(cause: RemovalCause) -> &'static str {
-            match cause {
-                RemovalCause::Expired => "expired",
-                RemovalCause::Explicit => "explicit",
-                RemovalCause::Replaced => "replaced",
-                RemovalCause::Size => "size",
-            }
-        }
-
        let sst_meta_cache = (self.sst_meta_cache_size != 0).then(|| {
            Cache::builder()
                .max_capacity(self.sst_meta_cache_size)
@@ -1024,7 +1172,7 @@ impl CacheManagerBuilder {
                        .with_label_values(&[SST_META_TYPE])
                        .sub(size.into());
                    CACHE_EVICTION
-                        .with_label_values(&[SST_META_TYPE, to_str(cause)])
+                        .with_label_values(&[SST_META_TYPE, removal_cause_str(cause)])
                        .inc();
                })
                .build()
@@ -1039,7 +1187,7 @@ impl CacheManagerBuilder {
                        .with_label_values(&[VECTOR_TYPE])
                        .sub(size.into());
                    CACHE_EVICTION
-                        .with_label_values(&[VECTOR_TYPE, to_str(cause)])
+                        .with_label_values(&[VECTOR_TYPE, removal_cause_str(cause)])
                        .inc();
                })
                .build()
@@ -1052,7 +1200,7 @@ impl CacheManagerBuilder {
                    let size = page_cache_weight(&k, &v);
                    CACHE_BYTES.with_label_values(&[PAGE_TYPE]).sub(size.into());
                    CACHE_EVICTION
-                        .with_label_values(&[PAGE_TYPE, to_str(cause)])
+                        .with_label_values(&[PAGE_TYPE, removal_cause_str(cause)])
                        .inc();
                })
                .build()
@@ -1073,6 +1221,8 @@ impl CacheManagerBuilder {
            .then(|| Arc::new(VectorIndexCache::new(self.index_content_size)));
        let index_result_cache = (self.index_result_cache_size != 0)
            .then(|| IndexResultCache::new(self.index_result_cache_size));
+        let prefilter_result_cache = (self.prefilter_result_cache_size != 0)
+            .then(|| new_prefilter_result_cache(self.prefilter_result_cache_size));
        let puffin_metadata_cache =
            PuffinMetadataCache::new(self.puffin_metadata_size, &CACHE_BYTES);
        let selector_result_cache = (self.selector_result_cache_size != 0).then(|| {
@@ -1085,7 +1235,7 @@ impl CacheManagerBuilder {
                        .with_label_values(&[SELECTOR_RESULT_TYPE])
                        .sub(size.into());
                    CACHE_EVICTION
-                        .with_label_values(&[SELECTOR_RESULT_TYPE, to_str(cause)])
+                        .with_label_values(&[SELECTOR_RESULT_TYPE, removal_cause_str(cause)])
                        .inc();
                })
                .build()
@@ -1100,7 +1250,7 @@ impl CacheManagerBuilder {
                        .with_label_values(&[RANGE_RESULT_TYPE])
                        .sub(size.into());
                    CACHE_EVICTION
-                        .with_label_values(&[RANGE_RESULT_TYPE, to_str(cause)])
+                        .with_label_values(&[RANGE_RESULT_TYPE, removal_cause_str(cause)])
                        .inc();
                })
                .build()
@@ -1123,6 +1273,7 @@ impl CacheManagerBuilder {
                RANGE_RESULT_CONCAT_MEMORY_PERMIT.as_bytes() as usize,
            )),
            index_result_cache,
+            prefilter_result_cache,
        }
    }
 }
@@ -1551,6 +1702,127 @@ mod tests {
        assert!(cache.get_selector_result(&key).is_some());
    }

+    #[test]
+    fn test_prefilter_result_cache() {
+        let disabled = CacheManager::builder().build();
+        let file_id = FileId::random();
+        let key = PrefilterKey::new(
+            file_id,
+            0,
+            None,
+            1,
+            SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()]),
+        );
+        let selection = Arc::new(BooleanBuffer::new_set(3));
+
+        disabled.put_prefilter_result(key.clone(), selection.clone());
+        assert!(disabled.get_prefilter_result(&key).is_none());
+
+        let cache = Arc::new(
+            CacheManager::builder()
+                .prefilter_result_cache_size(1000)
+                .build(),
+        );
+        assert!(cache.get_prefilter_result(&key).is_none());
+        cache.put_prefilter_result(key.clone(), selection.clone());
+        assert_eq!(
+            cache.get_prefilter_result(&key).unwrap().as_ref(),
+            selection.as_ref()
+        );
+
+        let enable_all = CacheStrategy::EnableAll(cache.clone());
+        assert!(enable_all.get_prefilter_result(&key).is_some());
+
+        let compaction = CacheStrategy::Compaction(cache.clone());
+        assert!(compaction.get_prefilter_result(&key).is_none());
+        compaction.put_prefilter_result(key.clone(), selection.clone());
+        assert!(cache.get_prefilter_result(&key).is_some());
+
+        let disabled_strategy = CacheStrategy::Disabled;
+        assert!(disabled_strategy.get_prefilter_result(&key).is_none());
+        disabled_strategy.put_prefilter_result(key.clone(), selection);
+        assert!(cache.get_prefilter_result(&key).is_some());
+    }
+
+    #[test]
+    fn test_prefilter_key_distinguishes_dimensions() {
+        let file_id = FileId::random();
+        let row_selection = RowSelection::from(vec![RowSelector::skip(1), RowSelector::select(3)]);
+        let other_row_selection =
+            RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(2)]);
+        let row_selection = PrefilterKey::row_selection_snapshot(Some(&row_selection));
+        let other_row_selection = PrefilterKey::row_selection_snapshot(Some(&other_row_selection));
+        let base = PrefilterKey::new(
+            file_id,
+            0,
+            row_selection.clone(),
+            1,
+            SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()]),
+        );
+
+        assert_ne!(
+            base,
+            PrefilterKey::new(
+                FileId::random(),
+                0,
+                row_selection.clone(),
+                1,
+                SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()])
+            )
+        );
+        assert_ne!(
+            base,
+            PrefilterKey::new(
+                file_id,
+                1,
+                row_selection.clone(),
+                1,
+                SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()])
+            )
+        );
+        assert_ne!(
+            base,
+            PrefilterKey::new(
+                file_id,
+                0,
+                other_row_selection,
+                1,
+                SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()])
+            )
+        );
+        assert_ne!(
+            base,
+            PrefilterKey::new(
+                file_id,
+                0,
+                row_selection.clone(),
+                1,
+                SmallVec::from_vec(vec!["tag_0 IN ([b])".to_string()])
+            )
+        );
+        assert_ne!(
+            base,
+            PrefilterKey::new(
+                file_id,
+                0,
+                row_selection.clone(),
+                2,
+                SmallVec::from_vec(vec!["tag_0 IN ([a])".to_string()])
+            )
+        );
+        let pk_group = PrefilterKey::new(
+            file_id,
+            0,
+            row_selection,
+            1,
+            SmallVec::from_vec(vec![
+                "tag_0 IN ([a])".to_string(),
+                "tag_1 IN ([x])".to_string(),
+            ]),
+        );
+        assert_ne!(base, pk_group);
+    }
+
    #[test]
    fn test_range_result_cache() {
        let cache = Arc::new(
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -117,6 +117,8 @@ pub struct MitoConfig {
    pub selector_result_cache_size: ReadableSize,
    /// Cache size for flat range scan results. Setting it to 0 to disable the cache.
    pub range_result_cache_size: ReadableSize,
+    /// Cache size for prefilter results. Setting it to 0 to disable the cache.
+    pub prefilter_result_cache_size: ReadableSize,
    /// Whether to enable the write cache.
    pub enable_write_cache: bool,
    /// File system path for write cache dir's root, defaults to `{data_home}`.
@@ -202,6 +204,7 @@ impl Default for MitoConfig {
            page_cache_size: ReadableSize::mb(512),
            selector_result_cache_size: ReadableSize::mb(512),
            range_result_cache_size: ReadableSize::mb(512),
+            prefilter_result_cache_size: ReadableSize::mb(128),
            enable_write_cache: false,
            write_cache_path: String::new(),
            write_cache_size: ReadableSize::gb(5),
@@ -330,6 +333,8 @@ impl MitoConfig {
        self.page_cache_size = page_cache_size;
        self.selector_result_cache_size = mem_cache_size;
        self.range_result_cache_size = mem_cache_size;
+        // Use a smaller cache size because prefilter results are usually small.
+        self.prefilter_result_cache_size = sst_meta_cache_size;

        self.index.adjust_buffer_and_cache_size(sys_memory);
    }
--- a/src/mito2/src/engine.rs
+++ b/src/mito2/src/engine.rs
@@ -118,6 +118,7 @@ use store_api::region_engine::{
    RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
    SyncRegionFromRequest, SyncRegionFromResponse,
 };
+use store_api::region_info::RegionInfoEntry;
 use store_api::region_request::{
    AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
 };
@@ -613,8 +614,10 @@ impl MitoEngine {
                            return Vec::new();
                        }
                    };
+                    // The index file path is derived from the physical file owner. After
+                    // repartition, `entry.region_id` is only the referring region.
                    let region_index_id = RegionIndexId::new(
-                        RegionFileId::new(entry.region_id, file_id),
+                        RegionFileId::new(entry.origin_region_id, file_id),
                        index_version,
                    );
                    let context = IndexEntryContext {
@@ -654,6 +657,16 @@ impl MitoEngine {
        results
    }

+    /// Lists region info entries of all regions in the engine.
+    pub async fn all_region_infos(&self) -> Vec<RegionInfoEntry> {
+        let node_id = self.inner.workers.file_ref_manager().node_id();
+        self.inner
+            .workers
+            .all_regions()
+            .map(|region| region.region_info_entry(node_id))
+            .collect()
+    }
+
    /// Lists all SSTs from the storage layer of all regions in the engine.
    pub fn all_ssts_from_storage(&self) -> impl Stream<Item = Result<StorageSstEntry>> {
        let node_id = self.inner.workers.file_ref_manager().node_id();
--- a/src/mito2/src/engine/basic_test.rs
+++ b/src/mito2/src/engine/basic_test.rs
@@ -978,6 +978,58 @@ async fn test_list_ssts_with_format(
    assert_eq!(debug_format, expected_storage_ssts, "{}", debug_format);
 }

+#[tokio::test]
+async fn test_all_region_infos() {
+    let mut env = TestEnv::with_prefix("all-region-infos").await;
+    let engine = env
+        .create_engine(MitoConfig {
+            default_flat_format: true,
+            ..Default::default()
+        })
+        .await;
+
+    let region_id = RegionId::new(1024, 7);
+    let request = CreateRequestBuilder::new().build();
+    let column_schemas = rows_schema(&request);
+    engine
+        .handle_request(region_id, RegionRequest::Create(request))
+        .await
+        .unwrap();
+
+    let rows = Rows {
+        schema: column_schemas,
+        rows: build_rows_for_key("region-info", 0, 3, 0),
+    };
+    put_rows(&engine, region_id, rows).await;
+    engine
+        .handle_request(
+            region_id,
+            RegionRequest::Flush(RegionFlushRequest::default()),
+        )
+        .await
+        .unwrap();
+
+    let entries = engine.all_region_infos().await;
+    let entry = entries
+        .iter()
+        .find(|entry| entry.region_id == region_id)
+        .expect("region info entry should exist");
+
+    assert_eq!(region_id.as_u64(), entry.region_id.as_u64());
+    assert_eq!(region_id.table_id(), entry.table_id);
+    assert_eq!(region_id.region_number(), entry.region_number);
+    assert_eq!(region_id.region_group(), entry.region_group);
+    assert_eq!(region_id.region_sequence(), entry.region_sequence);
+    assert!(!entry.state.is_empty());
+    assert_eq!("Leader", entry.role);
+    assert!(entry.writable);
+    assert_eq!(3, entry.committed_sequence);
+    assert_eq!(Some(3), entry.flushed_sequence);
+    assert!(entry.manifest_version > 0);
+    assert!(serde_json::from_str::<serde_json::Value>(&entry.region_options).is_ok());
+    assert_eq!("flat", entry.sst_format);
+}
+
 #[tokio::test]
 async fn test_all_index_metas_list_all_types() {
    test_all_index_metas_list_all_types_with_format(false, r#"
--- a/src/mito2/src/engine/scan_test.rs
+++ b/src/mito2/src/engine/scan_test.rs
@@ -100,7 +100,7 @@ async fn test_incremental_query_stale_error() {
            region_id,
            ScanRequest {
                memtable_min_sequence: Some(min_readable_seq),
-                sst_min_sequence: Some(u64::MAX),
+                skip_sst_files: true,
                ..Default::default()
            },
        )
--- a/src/mito2/src/read/range_cache.rs
+++ b/src/mito2/src/read/range_cache.rs
@@ -19,13 +19,20 @@ use std::sync::Arc;

 use async_stream::try_stream;
 use common_telemetry::warn;
+use common_time::Timestamp;
+use common_time::range::TimestampRange;
+use common_time::timestamp::TimeUnit;
+use datafusion_expr::expr::Expr;
+use datafusion_expr::{Between, BinaryExpr, Operator};
 use datatypes::arrow::compute::concat_batches;
 use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::prelude::ConcreteDataType;
+use datatypes::value::scalar_value_to_timestamp;
 use futures::TryStreamExt;
 use snafu::ResultExt;
 use store_api::region_engine::PartitionRange;
 use store_api::storage::{FileId, RegionId, TimeSeriesRowSelector};
+use table::predicate::is_string_timestamp_literal;
 use tokio::sync::{mpsc, oneshot};

 use crate::cache::CacheStrategy;
@@ -139,7 +146,6 @@ impl ScanRequestFingerprint {
            .unwrap_or(&[])
    }

-    #[allow(dead_code)]
    pub(crate) fn without_time_filters(&self) -> Self {
        Self {
            inner: Arc::clone(&self.inner),
@@ -266,6 +272,177 @@ pub(crate) fn collect_partition_range_row_groups(
    }
 }

+/// Returns the timestamp range where all time-only predicates are guaranteed true.
+///
+/// Returns `Some(min_to_max)` for empty input (vacuously true everywhere).
+/// Returns `None` if any expression contains an unsupported shape: `OR`, `NOT`,
+/// `IN`, non-literal RHS, unsupported operator, column-name mismatch, an `=`
+/// literal that cannot be represented exactly in the column unit, or overflow
+/// during bound adjustment.
+///
+/// This is intentionally stricter than `extract_time_range_from_expr` in
+/// `table::predicate`: lower bounds round up and upper bounds round down. If a
+/// partition's file-time range is contained by the returned range, every row in
+/// that partition satisfies the original time predicates.
+///
+/// `IsNull`/`IsNotNull` on the time index are not routed into `time_filters`
+/// today. If that changes, handle them here before stripping time filters from
+/// the cache key.
+pub(crate) fn implied_time_range_from_exprs(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    exprs: &[&Expr],
+) -> Option<TimestampRange> {
+    let mut acc = TimestampRange::min_to_max();
+    for expr in exprs {
+        let r = implied_time_range_from_expr(ts_col_name, ts_col_unit, expr)?;
+        acc = acc.and(&r);
+    }
+    Some(acc)
+}
+
+fn implied_time_range_from_expr(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    expr: &Expr,
+) -> Option<TimestampRange> {
+    match expr {
+        Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op {
+            Operator::And => {
+                let l = implied_time_range_from_expr(ts_col_name, ts_col_unit, left)?;
+                let r = implied_time_range_from_expr(ts_col_name, ts_col_unit, right)?;
+                Some(l.and(&r))
+            }
+            Operator::Eq | Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => {
+                implied_from_cmp(ts_col_name, ts_col_unit, left, *op, right)
+            }
+            // `OR` yields a union of ranges, but `TimestampRange::or` only gives the loose
+            // covering span, which may include timestamps that fail the predicate, so we
+            // refuse it. Any other operator is unsupported.
+            _ => None,
+        },
+        Expr::Between(Between {
+            expr,
+            negated,
+            low,
+            high,
+        }) => {
+            if *negated {
+                return None;
+            }
+            implied_from_between(ts_col_name, ts_col_unit, expr, low, high)
+        }
+        // Includes `IsNull`, `IsNotNull`, `Not`, `InList`, function calls, etc.
+        _ => None,
+    }
+}
+
+fn match_ts_column_literal<'a>(
+    ts_col_name: &str,
+    left: &'a Expr,
+    right: &'a Expr,
+) -> Option<(Timestamp, bool)> {
+    let (col, scalar, reverse) = match (left, right) {
+        (Expr::Column(c), Expr::Literal(s, _)) => (c, s, false),
+        (Expr::Literal(s, _), Expr::Column(c)) => (c, s, true),
+        _ => return None,
+    };
+    if col.name != ts_col_name {
+        return None;
+    }
+    // Reject string literals: their conversion needs a timezone we do not have,
+    // and the existing extractor in `table::predicate` rejects them too.
+    if is_string_timestamp_literal(scalar) {
+        return None;
+    }
+    scalar_value_to_timestamp(scalar, None).map(|t| (t, reverse))
+}
+
+fn implied_from_cmp(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    left: &Expr,
+    op: Operator,
+    right: &Expr,
+) -> Option<TimestampRange> {
+    let (ts, reverse) = match_ts_column_literal(ts_col_name, left, right)?;
+    // Normalize to "column OP literal".
+    let op = if reverse {
+        match op {
+            Operator::Lt => Operator::Gt,
+            Operator::LtEq => Operator::GtEq,
+            Operator::Gt => Operator::Lt,
+            Operator::GtEq => Operator::LtEq,
+            Operator::Eq => Operator::Eq,
+            _ => return None,
+        }
+    } else {
+        op
+    };
+
+    match op {
+        Operator::GtEq => {
+            // ts >= L. Round the lower bound up in the column unit.
+            let b = ts.convert_to_ceil(ts_col_unit)?;
+            Some(TimestampRange::from_start(b))
+        }
+        Operator::Gt => {
+            // ts > L. floor(L) + 1 is the tight lower bound in the column unit.
+            let v = ts.convert_to(ts_col_unit)?.value().checked_add(1)?;
+            Some(TimestampRange::from_start(Timestamp::new(v, ts_col_unit)))
+        }
+        Operator::LtEq => {
+            // ts <= L. Round the upper bound down in the column unit.
+            let b = ts.convert_to(ts_col_unit)?;
+            Some(TimestampRange::until_end(b, true))
+        }
+        Operator::Lt => {
+            // ts < L. `ts < ceil(L)` is the tight bound: equal to `ts < L` when
+            // L is exactly representable, and `ts <= floor(L)` otherwise.
+            let b = ts.convert_to_ceil(ts_col_unit)?;
+            Some(TimestampRange::until_end(b, false))
+        }
+        Operator::Eq => {
+            // ts = L. Only provable when L is exactly representable.
+            let f = ts.convert_to(ts_col_unit)?;
+            let c = ts.convert_to_ceil(ts_col_unit)?;
+            if f.value() != c.value() {
+                return None;
+            }
+            Some(TimestampRange::single(f))
+        }
+        _ => None,
+    }
+}
+
+fn implied_from_between(
+    ts_col_name: &str,
+    ts_col_unit: TimeUnit,
+    expr: &Expr,
+    low: &Expr,
+    high: &Expr,
+) -> Option<TimestampRange> {
+    let Expr::Column(c) = expr else {
+        return None;
+    };
+    if c.name != ts_col_name {
+        return None;
+    }
+    let (low_s, high_s) = match (low, high) {
+        (Expr::Literal(l, _), Expr::Literal(h, _)) => (l, h),
+        _ => return None,
+    };
+    if is_string_timestamp_literal(low_s) || is_string_timestamp_literal(high_s) {
+        return None;
+    }
+    let low_ts = scalar_value_to_timestamp(low_s, None)?;
+    let high_ts = scalar_value_to_timestamp(high_s, None)?;
+    // BETWEEN low AND high is equivalent to ts >= low AND ts <= high.
+    let lo = low_ts.convert_to_ceil(ts_col_unit)?;
+    let hi = high_ts.convert_to(ts_col_unit)?;
+    Some(TimestampRange::new_inclusive(Some(lo), Some(hi)))
+}
+
 /// Builds a cache key for the given partition range if it is eligible for caching.
 pub(crate) fn build_range_cache_key(
    stream_ctx: &StreamContext,
@@ -292,17 +469,36 @@ pub(crate) fn build_range_cache_key(
        return None;
    }

-    // TODO(yingwen): We used to call `fingerprint.without_time_filters()` when the query's
-    // `TimestampRange` fully covered the partition's `FileTimeRange`, so different queries that
-    // all enclosed the same partition could share a cache entry. The cover check turned out to
-    // be too coarse: it returned true in cases where the dropped time predicates would still
-    // have excluded rows, so the cache served results that should have been filtered. Reviving
-    // the optimization needs a per-predicate implication check that walks each time-only `Expr`
-    // (recursing through AND/OR/NOT) and proves the predicate is satisfied for every timestamp
-    // inside the partition's `FileTimeRange` — not the looser "does `extract_time_range_from_expr`
-    // return a range that covers the partition" used previously. Until then, always carry the
-    // full fingerprint so cache reuse stays correct.
-    let scan = fingerprint.clone();
+    // If the implied range covers this partition's `FileTimeRange`, drop
+    // time-only predicates from the cache key so that queries with different
+    // but equally-covering time bounds share an entry. `None` means some
+    // time-only predicate had an unsupported shape (e.g. `OR`), so we keep
+    // them in the key.
+    let range_meta = &stream_ctx.ranges[part_range.identifier];
+    let (file_min, file_max) = range_meta.time_range;
+    let covers = match &stream_ctx.scan_implied_time_range {
+        // An empty implied range can never cover a non-empty file range, so
+        // short-circuit. We also skip the unit asserts because
+        // `TimestampRange::empty()` uses `Timestamp::default()` (millisecond),
+        // which would falsely trip the asserts for non-ms time index columns.
+        Some(implied) if !implied.is_empty() => {
+            // The `contains` check is sound only when `file_min`/`file_max`
+            // share the implied range's unit (the time index column's unit).
+            // Mito stores time index values in that unit; assert to catch any
+            // future drift.
+            if let Some(ts) = implied.start().as_ref().or(implied.end().as_ref()) {
+                assert_eq!(file_min.unit(), ts.unit());
+                assert_eq!(file_max.unit(), ts.unit());
+            }
+            implied.contains(&file_min) && implied.contains(&file_max)
+        }
+        _ => false,
+    };
+    let scan = if covers {
+        fingerprint.without_time_filters()
+    } else {
+        fingerprint.clone()
+    };

    Some(RangeScanCacheKey {
        region_id: stream_ctx.input.region_metadata().region_id,
@@ -722,11 +918,16 @@ mod tests {
            num_rows: 10,
        };
        let partition_range = range_meta.new_partition_range(0);
-        let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input);
+        let (scan_fingerprint, scan_implied_time_range) =
+            match crate::read::scan_region::build_scan_fingerprint(&input) {
+                Some(b) => (Some(b.fingerprint), b.implied_time_range),
+                None => (None, None),
+            };
        let stream_ctx = StreamContext {
            input,
            ranges: vec![range_meta],
            scan_fingerprint,
+            scan_implied_time_range,
            query_start: Instant::now(),
        };

@@ -770,57 +971,312 @@ mod tests {
    }

    #[tokio::test]
-    async fn preserves_time_filters_when_query_covers_partition_range() {
-        assert_range_cache_filters(
-            vec![
-                col("ts").gt_eq(ts_lit(1000)),
-                col("ts").lt(ts_lit(2001)),
-                col("ts").is_not_null(),
-                col("k0").eq(lit("foo")),
-            ],
-            TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
-            (
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2000),
-            ),
-            vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
-            vec![col("ts").gt_eq(ts_lit(1000)), col("ts").lt(ts_lit(2001))],
-        )
-        .await;
+    async fn range_cache_time_filter_key_cases() {
+        let partition = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        struct Case {
+            filters: Vec<Expr>,
+            query_time_range: Option<TimestampRange>,
+            expected_filters: Vec<Expr>,
+            expected_time_filters: Vec<Expr>,
+        }
+
+        // Time filters are stripped only when their implied range fully covers
+        // the partition's file-time range. `is_not_null(ts)` stays in regular
+        // filters because it is not routed into `time_filters`.
+        for case in [
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(1000)),
+                    col("ts").lt(ts_lit(2001)),
+                    col("ts").is_not_null(),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(500)),
+                    col("ts").lt(ts_lit(3000)),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(500, 3000, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(1000)),
+                    col("ts").lt_eq(ts_lit(2000)),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(1000, 2001, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![
+                    col("ts").between(ts_lit(1000), ts_lit(2000)),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: TimestampRange::with_unit(1000, 2001, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![],
+            },
+            Case {
+                filters: vec![col("ts").gt_eq(ts_lit(1200)), col("k0").eq(lit("foo"))],
+                query_time_range: TimestampRange::with_unit(1200, 2001, TimeUnit::Millisecond),
+                expected_filters: vec![col("k0").eq(lit("foo"))],
+                expected_time_filters: vec![col("ts").gt_eq(ts_lit(1200))],
+            },
+            Case {
+                filters: vec![
+                    col("ts").gt_eq(ts_lit(1500)),
+                    col("ts").is_not_null(),
+                    col("k0").eq(lit("foo")),
+                ],
+                query_time_range: None,
+                expected_filters: vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
+                expected_time_filters: vec![col("ts").gt_eq(ts_lit(1500))],
+            },
+        ] {
+            assert_range_cache_filters(
+                case.filters,
+                case.query_time_range,
+                partition,
+                case.expected_filters,
+                case.expected_time_filters,
+            )
+            .await;
+        }
    }

    #[tokio::test]
-    async fn preserves_time_filters_when_query_does_not_cover_partition_range() {
-        assert_range_cache_filters(
-            vec![col("ts").gt_eq(ts_lit(1000)), col("k0").eq(lit("foo"))],
-            TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond),
-            (
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2000),
-            ),
-            vec![col("k0").eq(lit("foo"))],
-            vec![col("ts").gt_eq(ts_lit(1000))],
+    async fn two_distinct_queries_share_cache_key_when_both_cover() {
+        let partition_range = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        let (ctx_a, part_a) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(500)),
+                col("ts").lt(ts_lit(3000)),
+                col("k0").eq(lit("foo")),
+            ],
+            TimestampRange::with_unit(500, 3000, TimeUnit::Millisecond),
+            partition_range,
        )
        .await;
+        let (ctx_b, part_b) = new_stream_context(
+            vec![
+                col("ts").gt_eq(ts_lit(100)),
+                col("ts").lt(ts_lit(5000)),
+                col("k0").eq(lit("foo")),
+            ],
+            TimestampRange::with_unit(100, 5000, TimeUnit::Millisecond),
+            partition_range,
+        )
+        .await;
+
+        let key_a = build_range_cache_key(&ctx_a, &part_a).unwrap();
+        let key_b = build_range_cache_key(&ctx_b, &part_b).unwrap();
+        assert_eq!(key_a.scan, key_b.scan);
+        assert!(key_a.scan.time_filters().is_empty());
    }

    #[tokio::test]
-    async fn preserves_time_filters_when_query_has_no_time_range_limit() {
-        assert_range_cache_filters(
-            vec![
-                col("ts").gt_eq(ts_lit(1000)),
-                col("ts").is_not_null(),
-                col("k0").eq(lit("foo")),
-            ],
+    async fn disables_optimization_on_or_clause() {
+        let partition_range = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        let or_a = col("ts").gt_eq(ts_lit(1000)).or(col("ts").lt(ts_lit(500)));
+        let or_b = col("ts").gt_eq(ts_lit(900)).or(col("ts").lt(ts_lit(400)));
+
+        let (ctx_a, part_a) = new_stream_context(
+            vec![or_a.clone(), col("k0").eq(lit("foo"))],
            None,
-            (
-                Timestamp::new_millisecond(1000),
-                Timestamp::new_millisecond(2000),
-            ),
-            vec![col("k0").eq(lit("foo")), col("ts").is_not_null()],
-            vec![col("ts").gt_eq(ts_lit(1000))],
+            partition_range,
        )
        .await;
+        let (ctx_b, part_b) = new_stream_context(
+            vec![or_b.clone(), col("k0").eq(lit("foo"))],
+            None,
+            partition_range,
+        )
+        .await;
+
+        assert!(ctx_a.scan_implied_time_range.is_none());
+        let key_a = build_range_cache_key(&ctx_a, &part_a).unwrap();
+        let key_b = build_range_cache_key(&ctx_b, &part_b).unwrap();
+        assert_ne!(key_a.scan, key_b.scan);
+        assert_eq!(
+            key_a.scan.time_filters(),
+            normalized_exprs([or_a]).as_slice()
+        );
+    }
+
+    #[tokio::test]
+    async fn empty_implied_range_does_not_panic_on_non_ms_file_range() {
+        // Contradictory time predicates make the implied range empty; this test
+        // injects `TimestampRange::empty()` directly. Its sentinel timestamps use
+        // `Timestamp::default()` (ms), so without the `is_empty()` short-circuit
+        // the unit asserts would panic against a non-ms `range_meta.time_range`.
+        let partition = (
+            Timestamp::new_millisecond(1000),
+            Timestamp::new_millisecond(2000),
+        );
+
+        let (mut ctx, part_range) = new_stream_context(
+            vec![col("ts").gt_eq(ts_lit(1500)), col("k0").eq(lit("foo"))],
+            TimestampRange::with_unit(1500, 3000, TimeUnit::Millisecond),
+            partition,
+        )
+        .await;
+
+        ctx.scan_implied_time_range = Some(TimestampRange::empty());
+        ctx.ranges[0].time_range = (
+            Timestamp::new(1_000_000_000, TimeUnit::Nanosecond),
+            Timestamp::new(2_000_000_000, TimeUnit::Nanosecond),
+        );
+
+        let key = build_range_cache_key(&ctx, &part_range).unwrap();
+        // Empty implied range cannot cover, so time filters stay in the key.
+        assert!(!key.scan.time_filters().is_empty());
+    }
+
+    fn ms_ts(v: i64) -> Timestamp {
+        Timestamp::new_millisecond(v)
+    }
+
+    fn implied_ms(expr: Expr) -> Option<TimestampRange> {
+        implied_time_range_from_exprs("ts", TimeUnit::Millisecond, &[&expr])
+    }
+
+    #[test]
+    fn implied_time_range_supported_exprs() {
+        for (expr, expected) in [
+            (
+                col("ts").gt_eq(ts_lit(1000)),
+                Some(TimestampRange::from_start(ms_ts(1000))),
+            ),
+            (
+                col("ts").gt(ts_lit(1000)),
+                Some(TimestampRange::from_start(ms_ts(1001))),
+            ),
+            (
+                col("ts").lt_eq(ts_lit(2000)),
+                Some(TimestampRange::until_end(ms_ts(2000), true)),
+            ),
+            (
+                col("ts").lt(ts_lit(2000)),
+                Some(TimestampRange::until_end(ms_ts(2000), false)),
+            ),
+            (
+                col("ts").eq(ts_lit(1500)),
+                Some(TimestampRange::single(ms_ts(1500))),
+            ),
+            (
+                ts_lit(1000).lt_eq(col("ts")),
+                Some(TimestampRange::from_start(ms_ts(1000))),
+            ),
+            (
+                col("ts").between(ts_lit(1000), ts_lit(2000)),
+                Some(TimestampRange::new_inclusive(
+                    Some(ms_ts(1000)),
+                    Some(ms_ts(2000)),
+                )),
+            ),
+            (
+                col("ts")
+                    .gt_eq(ts_lit(1000))
+                    .and(col("ts").lt(ts_lit(2000))),
+                TimestampRange::with_unit(1000, 2000, TimeUnit::Millisecond),
+            ),
+            (
+                col("ts")
+                    .gt_eq(ts_lit(1000))
+                    .and(col("ts").lt(ts_lit(5000)))
+                    .and(col("ts").lt_eq(ts_lit(3000))),
+                TimestampRange::with_unit(1000, 3001, TimeUnit::Millisecond),
+            ),
+        ] {
+            assert_eq!(implied_ms(expr), expected);
+        }
+
+        assert_eq!(
+            implied_time_range_from_exprs("ts", TimeUnit::Millisecond, &[]),
+            Some(TimestampRange::min_to_max())
+        );
+    }
+
+    #[test]
+    fn implied_time_range_unsupported_exprs() {
+        let not_between = Expr::Between(Between {
+            expr: Box::new(col("ts")),
+            negated: true,
+            low: Box::new(ts_lit(1000)),
+            high: Box::new(ts_lit(2000)),
+        });
+
+        for expr in [
+            not_between,
+            col("ts").gt_eq(ts_lit(1000)).or(col("ts").lt(ts_lit(500))),
+            Expr::Not(Box::new(col("ts").gt_eq(ts_lit(1000)))),
+            col("ts").in_list(vec![ts_lit(1000), ts_lit(2000)], false),
+            col("ts").gt_eq(col("other")),
+            col("other_ts").gt_eq(ts_lit(1000)),
+        ] {
+            assert!(implied_ms(expr).is_none());
+        }
+    }
+
+    #[test]
+    fn implied_time_range_unit_conversion() {
+        let second_1 = lit(ScalarValue::TimestampSecond(Some(1), None));
+        let ns_1500 = lit(ScalarValue::TimestampNanosecond(Some(1_500_000_000), None));
+        let ns_1500_5 = lit(ScalarValue::TimestampNanosecond(Some(1_500_500_000), None));
+
+        for (expr, expected) in [
+            (
+                col("ts").gt_eq(second_1.clone()),
+                Some(TimestampRange::from_start(ms_ts(1000))),
+            ),
+            (
+                col("ts").lt_eq(second_1),
+                Some(TimestampRange::until_end(ms_ts(1000), true)),
+            ),
+            (
+                col("ts").eq(ns_1500),
+                Some(TimestampRange::single(ms_ts(1500))),
+            ),
+            (col("ts").eq(ns_1500_5.clone()), None),
+            (
+                col("ts").gt_eq(ns_1500_5.clone()),
+                Some(TimestampRange::from_start(ms_ts(1501))),
+            ),
+            (
+                col("ts").lt_eq(ns_1500_5.clone()),
+                Some(TimestampRange::until_end(ms_ts(1500), true)),
+            ),
+            (
+                col("ts").gt(ns_1500_5.clone()),
+                Some(TimestampRange::from_start(ms_ts(1501))),
+            ),
+            (
+                col("ts").lt(ns_1500_5),
+                Some(TimestampRange::until_end(ms_ts(1501), false)),
+            ),
+        ] {
+            assert_eq!(implied_ms(expr), expected);
+        }
    }

    #[test]
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -33,6 +33,7 @@ use datafusion_expr::Expr;
 use datafusion_expr::utils::expr_to_columns;
 use datatypes::schema::ext::ArrowSchemaExt;
 use futures::StreamExt;
+use itertools::Itertools;
 use partition::expr::PartitionExpr;
 use smallvec::SmallVec;
 use snafu::ResultExt;
@@ -57,7 +58,7 @@ use crate::metrics::READ_SST_COUNT;
 use crate::read::compat::{self, FlatCompatBatch};
 use crate::read::flat_projection::FlatProjectionMapper;
 use crate::read::range::{FileRangeBuilder, MemRangeBuilder, RangeMeta, RowGroupIndex};
-use crate::read::range_cache::ScanRequestFingerprint;
+use crate::read::range_cache::{ScanRequestFingerprint, implied_time_range_from_exprs};
 use crate::read::read_columns::{
    ReadColumns, merge, read_columns_from_predicate, read_columns_from_projection,
 };
@@ -436,7 +437,16 @@ impl ScanRegion {
            .schema
            .arrow_schema()
            .has_json_extension_field()
-            .then_some(&self.request.json_type_hint);
+            .then_some(&self.request.json_type_hint)
+            .inspect(|json_type_hint| {
+                debug!(
+                    "Concretized JSON type: {{{}}}",
+                    json_type_hint
+                        .iter()
+                        .map(|(k, v)| format!("{}: {}", k, v))
+                        .join(", ")
+                );
+            });
        let mapper = FlatProjectionMapper::new_with_read_columns(
            &self.version.metadata,
            projection,
@@ -446,26 +456,28 @@ impl ScanRegion {

        let ssts = &self.version.ssts;
        let mut files = Vec::new();
-        for level in ssts.levels() {
-            for file in level.files.values() {
-                let exceed_min_sequence = match (sst_min_sequence, file.meta_ref().sequence) {
-                    (Some(min_sequence), Some(file_sequence)) => file_sequence > min_sequence,
-                    // If the file's sequence is None (or actually is zero), it could mean the file
-                    // is generated and added to the region "directly". In this case, its data should
-                    // be considered as fresh as the memtable. So its sequence is treated greater than
-                    // the min_sequence, whatever the value of min_sequence is. Hence the default
-                    // "true" in this arm.
-                    (Some(_), None) => true,
-                    (None, _) => true,
-                };
+        if !self.request.skip_sst_files {
+            for level in ssts.levels() {
+                for file in level.files.values() {
+                    let exceed_min_sequence = match (sst_min_sequence, file.meta_ref().sequence) {
+                        (Some(min_sequence), Some(file_sequence)) => file_sequence > min_sequence,
+                        // If the file's sequence is None (or actually is zero), it could mean the file
+                        // is generated and added to the region "directly". In this case, its data should
+                        // be considered as fresh as the memtable. So its sequence is treated as greater than
+                        // the min_sequence, whatever the value of min_sequence is. Hence the default
+                        // "true" in this arm.
+                        (Some(_), None) => true,
+                        (None, _) => true,
+                    };

-                // Finds SST files in range.
-                if exceed_min_sequence && file_in_range(file, &time_range) {
-                    files.push(file.clone());
+                    // Finds SST files in range.
+                    if exceed_min_sequence && file_in_range(file, &time_range) {
+                        files.push(file.clone());
+                    }
+                    // There is no need to check and prune by the file's sequence here: the sequence number is usually
+                    // very new, so unless the timing is too good it wouldn't be in the file at all,
+                    // and the batch will be filtered out by the tree reader anyway.
                }
-                // There is no need to check and prune for file's sequence here as the sequence number is usually very new,
-                // unless the timing is too good, or the sequence number wouldn't be in file.
-                // and the batch will be filtered out by tree reader anyway.
            }
        }

@@ -569,7 +581,9 @@ impl ScanRegion {
            .with_vector_index_k(vector_index_k);

        #[cfg(feature = "enterprise")]
-        let input = if let Some(provider) = self.extension_range_provider {
+        let input = if !self.request.skip_sst_files
+            && let Some(provider) = self.extension_range_provider
+        {
            let ranges = provider
                .find_extension_ranges(self.version.flushed_sequence, time_range, &self.request)
                .await?;
@@ -1299,9 +1313,21 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
    }
 }

-/// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible
+/// Output of [build_scan_fingerprint]: the cache fingerprint plus the derived
+/// implied time range used to decide whether the cache key can drop the time
+/// predicates for a given partition (see `build_range_cache_key`).
+pub(crate) struct ScanFingerprintBundle {
+    pub(crate) fingerprint: ScanRequestFingerprint,
+    /// `Some(r)` = all time-only predicates are guaranteed true on `r` (in the
+    /// column's `TimeUnit`).
+    /// `None`    = at least one time-only predicate could not be proven (e.g.
+    /// `OR`), so the cache-key optimization is disabled for this scan.
+    pub(crate) implied_time_range: Option<TimestampRange>,
+}
+
+/// Builds a [ScanFingerprintBundle] from a [ScanInput] if the scan is eligible
 /// for partition range caching.
-pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
+pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanFingerprintBundle> {
    let eligible = !input.compaction
        && !input.files.is_empty()
        && matches!(input.cache_strategy, CacheStrategy::EnableAll(_));
@@ -1334,7 +1360,7 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
        .unwrap_or_default();

    let mut filters = Vec::new();
-    let mut time_filters = Vec::new();
+    let mut time_only_exprs: Vec<&Expr> = Vec::new();
    let mut has_tag_filter = false;
    let mut columns = HashSet::new();

@@ -1350,20 +1376,17 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
            _ => false,
        };

-        // TODO(yingwen): The split between `time_filters` and `filters` is currently inert
-        // because `build_range_cache_key()` always keeps both in the cache key. We used to
-        // strip `time_filters` when the query's `TimestampRange` covered the partition's
-        // `FileTimeRange`, but `extract_time_range_from_expr` is not precise enough to prove
-        // a time predicate is implied by that range (it can return a wider range than the
-        // predicate, and it does not analyze AND/OR shapes), which let the cache reuse rows
-        // that should have been filtered. Reviving the optimization needs a per-predicate
-        // implication check that walks each time-only `Expr` (recursing through AND/OR/NOT)
-        // and proves the predicate holds for every timestamp inside the partition's
-        // `FileTimeRange`; until then both buckets land in the fingerprint.
+        // Route time-only exprs that the legacy extractor recognizes into
+        // `time_only_exprs` so the implication walker
+        // (`implied_time_range_from_exprs`, called below) can attempt to drop
+        // them from the cache key when the partition's `FileTimeRange` is fully
+        // covered, then stringify them into the fingerprint's `time_filters`
+        // bucket. Time-only exprs that the extractor doesn't recognize stay in
+        // `filters` and never get stripped — conservatively correct.
        if is_time_only
            && extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some()
        {
-            time_filters.push(expr.to_string());
+            time_only_exprs.push(expr);
        } else {
            filters.push(expr.to_string());
        }
@@ -1374,31 +1397,38 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
        return None;
    }

+    let implied_time_range =
+        implied_time_range_from_exprs(&time_index_name, ts_col_unit, &time_only_exprs);
+    let mut time_filters: Vec<String> = time_only_exprs.iter().map(|e| e.to_string()).collect();
+
    // Ensure the filters are sorted for consistent fingerprinting.
    filters.sort_unstable();
    time_filters.sort_unstable();
    let read_columns = input.read_cols.clone();
-    Some(
-        crate::read::range_cache::ScanRequestFingerprintBuilder {
-            read_column_types: read_columns
-                .column_ids_iter()
-                .map(|id| {
-                    metadata
-                        .column_by_id(id)
-                        .map(|col| col.column_schema.data_type.clone())
-                })
-                .collect(),
-            read_columns,
-            filters,
-            time_filters,
-            series_row_selector: input.series_row_selector,
-            append_mode: input.append_mode,
-            filter_deleted: input.filter_deleted,
-            merge_mode: input.merge_mode,
-            partition_expr_version: metadata.partition_expr_version,
-        }
-        .build(),
-    )
+    let fingerprint = crate::read::range_cache::ScanRequestFingerprintBuilder {
+        read_column_types: read_columns
+            .column_ids_iter()
+            .map(|id| {
+                metadata
+                    .column_by_id(id)
+                    .map(|col| col.column_schema.data_type.clone())
+            })
+            .collect(),
+        read_columns,
+        filters,
+        time_filters,
+        series_row_selector: input.series_row_selector,
+        append_mode: input.append_mode,
+        filter_deleted: input.filter_deleted,
+        merge_mode: input.merge_mode,
+        partition_expr_version: metadata.partition_expr_version,
+    }
+    .build();
+
+    Some(ScanFingerprintBundle {
+        fingerprint,
+        implied_time_range,
+    })
 }

 /// Context shared by different streams from a scanner.
@@ -1412,6 +1442,13 @@ pub struct StreamContext {
    /// `None` when the scan is not eligible for caching.
    #[allow(dead_code)]
    pub(crate) scan_fingerprint: Option<ScanRequestFingerprint>,
+    /// Implied range of every time-only predicate, in the time index column's
+    /// `TimeUnit`. Used by `build_range_cache_key` to decide whether the
+    /// partition's `FileTimeRange` is fully covered (allowing `time_filters`
+    /// to be stripped from the cache key). `None` when caching is ineligible
+    /// or when the implication walker bailed on an unsupported shape (e.g.
+    /// `OR`).
+    pub(crate) scan_implied_time_range: Option<TimestampRange>,

    // Metrics:
    /// The start time of the query.
@@ -1424,12 +1461,16 @@ impl StreamContext {
        let query_start = input.query_start.unwrap_or_else(Instant::now);
        let ranges = RangeMeta::seq_scan_ranges(&input);
        READ_SST_COUNT.observe(input.num_files() as f64);
-        let scan_fingerprint = build_scan_fingerprint(&input);
+        let (scan_fingerprint, scan_implied_time_range) = match build_scan_fingerprint(&input) {
+            Some(b) => (Some(b.fingerprint), b.implied_time_range),
+            None => (None, None),
+        };

        Self {
            input,
            ranges,
            scan_fingerprint,
+            scan_implied_time_range,
            query_start,
        }
    }
@@ -1439,12 +1480,16 @@ impl StreamContext {
        let query_start = input.query_start.unwrap_or_else(Instant::now);
        let ranges = RangeMeta::unordered_scan_ranges(&input);
        READ_SST_COUNT.observe(input.num_files() as f64);
-        let scan_fingerprint = build_scan_fingerprint(&input);
+        let (scan_fingerprint, scan_implied_time_range) = match build_scan_fingerprint(&input) {
+            Some(b) => (Some(b.fingerprint), b.implied_time_range),
+            None => (None, None),
+        };

        Self {
            input,
            ranges,
            scan_fingerprint,
+            scan_implied_time_range,
            query_start,
        }
    }
@@ -1841,7 +1886,7 @@ mod tests {
            partition_expr_version: 0,
        }
        .build();
-        assert_eq!(expected, fingerprint);
+        assert_eq!(expected, fingerprint.fingerprint);
    }

    #[tokio::test]
@@ -1914,7 +1959,7 @@ mod tests {
            partition_expr_version: metadata.partition_expr_version,
        }
        .build();
-        assert_eq!(expected, fingerprint);
+        assert_eq!(expected, fingerprint.fingerprint);
        assert_ne!(0, metadata.partition_expr_version);
    }

--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -1375,6 +1375,7 @@ mod split_tests {
            input,
            ranges: vec![],
            scan_fingerprint: None,
+            scan_implied_time_range: None,
            query_start: std::time::Instant::now(),
        }
    }
@@ -1661,7 +1662,7 @@ where
    }
 }

-/// Splits the batch by timestamps.
+/// Splits the batch so each sub-batch has strictly increasing timestamps.
 ///
 /// # Panics
 /// Panics if the timestamp array is invalid.
@@ -1682,7 +1683,7 @@ pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeq
    offsets.push(0);
    let values = ts_values.values();
    for (i, &value) in values.iter().take(batch_rows - 1).enumerate() {
-        if value > values[i + 1] {
+        if value >= values[i + 1] {
            offsets.push(i + 1);
        }
    }
@@ -1755,6 +1756,7 @@ mod tests {
            input,
            ranges: Vec::new(),
            scan_fingerprint: None,
+            scan_implied_time_range: None,
            query_start: Instant::now(),
        })
    }
@@ -1949,4 +1951,76 @@ mod tests {
            compute_average_batch_size(std::iter::empty())
        );
    }
+
+    /// Builds a flat-format record batch whose time index column holds `timestamps`.
+    fn flat_ts_batch(timestamps: &[i64]) -> RecordBatch {
+        use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array};
+        use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+
+        let num_rows = timestamps.len();
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "ts",
+                DataType::Timestamp(TimeUnit::Millisecond, None),
+                false,
+            ),
+            Field::new("pk", DataType::UInt64, false),
+            Field::new("seq", DataType::UInt64, false),
+            Field::new("op", DataType::UInt8, false),
+        ]));
+        RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(TimestampMillisecondArray::from(timestamps.to_vec())),
+                Arc::new(UInt64Array::from(vec![0u64; num_rows])),
+                Arc::new(UInt64Array::from(vec![0u64; num_rows])),
+                Arc::new(UInt8Array::from(vec![0u8; num_rows])),
+            ],
+        )
+        .unwrap()
+    }
+
+    /// Splits `timestamps` and returns the time index values of each sub-batch.
+    fn split_ts(timestamps: &[i64]) -> Vec<Vec<i64>> {
+        let mut batches = VecDeque::new();
+        split_record_batch(flat_ts_batch(timestamps), &mut batches);
+        batches
+            .iter()
+            .map(|batch| {
+                let pos = time_index_column_index(batch.num_columns());
+                let (values, _) = timestamp_array_to_primitive(batch.column(pos)).unwrap();
+                values.values().to_vec()
+            })
+            .collect()
+    }
+
+    #[test]
+    fn test_split_record_batch_on_equal_timestamps() {
+        // Splits on both decreasing and equal timestamps.
+        assert_eq!(
+            split_ts(&[1, 2, 2, 3, 1]),
+            vec![vec![1, 2], vec![2, 3], vec![1]]
+        );
+        // A run of equal timestamps yields single-row sub-batches.
+        assert_eq!(split_ts(&[5, 5, 5]), vec![vec![5], vec![5], vec![5]]);
+        // Equal-ts run at the leading edge of the batch.
+        assert_eq!(split_ts(&[5, 5, 1, 2]), vec![vec![5], vec![5], vec![1, 2]]);
+        // Equal-ts run at the trailing edge of the batch.
+        assert_eq!(split_ts(&[1, 2, 5, 5]), vec![vec![1, 2, 5], vec![5]]);
+    }
+
+    #[test]
+    fn test_split_record_batch_on_decreasing_timestamps() {
+        assert_eq!(split_ts(&[1, 2, 3]), vec![vec![1, 2, 3]]);
+        assert_eq!(split_ts(&[1, 3, 2, 4]), vec![vec![1, 3], vec![2, 4]]);
+    }
+
+    #[test]
+    fn test_split_record_batch_empty_and_single_row() {
+        let mut batches = VecDeque::new();
+        split_record_batch(flat_ts_batch(&[]), &mut batches);
+        assert!(batches.is_empty());
+
+        assert_eq!(split_ts(&[42]), vec![vec![42]]);
+    }
 }
--- a/src/mito2/src/region.rs
+++ b/src/mito2/src/region.rs
@@ -37,6 +37,7 @@ use store_api::metadata::RegionMetadataRef;
 use store_api::region_engine::{
    RegionManifestInfo, RegionRole, RegionStatistic, SettableRegionRoleState,
 };
+use store_api::region_info::RegionInfoEntry;
 use store_api::region_request::{PathType, StagingPartitionDirective};
 use store_api::sst_entry::ManifestSstEntry;
 use store_api::storage::{FileId, RegionId, SequenceNumber};
@@ -111,6 +112,22 @@ impl RegionRoleState {
            RegionRoleState::Follower => None,
        }
    }
+
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            RegionRoleState::Follower => "Follower",
+            RegionRoleState::Leader(RegionLeaderState::Writable) => "Leader(Writable)",
+            RegionRoleState::Leader(RegionLeaderState::Staging) => "Leader(Staging)",
+            RegionRoleState::Leader(RegionLeaderState::EnteringStaging) => {
+                "Leader(EnteringStaging)"
+            }
+            RegionRoleState::Leader(RegionLeaderState::Altering) => "Leader(Altering)",
+            RegionRoleState::Leader(RegionLeaderState::Dropping) => "Leader(Dropping)",
+            RegionRoleState::Leader(RegionLeaderState::Truncating) => "Leader(Truncating)",
+            RegionRoleState::Leader(RegionLeaderState::Editing) => "Leader(Editing)",
+            RegionRoleState::Leader(RegionLeaderState::Downgrading) => "Leader(Downgrading)",
+        }
+    }
 }

 /// Metadata and runtime status of a region.
@@ -584,14 +601,14 @@ impl MitoRegion {
        let memtables = &version.memtables;
        let memtable_usage = (memtables.mutable_usage() + memtables.immutables_usage()) as u64;

-        let sst_usage = version.ssts.sst_usage();
-        let index_usage = version.ssts.index_usage();
+        let sst_usage = version.ssts.owned_sst_usage(self.region_id);
+        let index_usage = version.ssts.owned_index_usage(self.region_id);
        let flushed_entry_id = version.flushed_entry_id;

        let wal_usage = self.estimated_wal_usage(memtable_usage);
        let manifest_usage = self.stats.total_manifest_size();
-        let num_rows = version.ssts.num_rows() + version.memtables.num_rows();
-        let num_files = version.ssts.num_files();
+        let num_rows = version.ssts.owned_num_rows(self.region_id) + version.memtables.num_rows();
+        let num_files = version.ssts.owned_num_files(self.region_id);
        let manifest_version = self.stats.manifest_version();
        let file_removed_cnt = self.stats.file_removed_cnt();

@@ -648,6 +665,41 @@ impl MitoRegion {
        self.access_layer.clone()
    }

+    /// Builds the [RegionInfoEntry] for this region from its current version, state and role; `node_id` is stored as given.
+    pub(crate) fn region_info_entry(&self, node_id: Option<u64>) -> RegionInfoEntry {
+        let region_id = self.region_id;
+        let version = self.version();
+        let state = self.state();
+        let role = self.region_role();
+        let region_options = serde_json::to_string(&version.options)
+            .unwrap_or_else(|err| serde_json::json!({ "error": err.to_string() }).to_string());
+        let sst_format = match version.options.sst_format.unwrap_or_default() {
+            crate::sst::FormatType::PrimaryKey => "primary_key",
+            crate::sst::FormatType::Flat => "flat",
+        }
+        .to_string();
+
+        RegionInfoEntry {
+            region_id,
+            table_id: region_id.table_id(),
+            region_number: region_id.region_number(),
+            region_group: region_id.region_group(),
+            region_sequence: region_id.region_sequence(),
+            state: state.as_str().to_string(),
+            role: role.to_string(),
+            writable: self.is_writable(),
+            committed_sequence: self.find_committed_sequence(),
+            flushed_sequence: Some(self.flushed_sequence()).filter(|sequence| *sequence > 0),
+            manifest_version: self.stats.manifest_version(),
+            compaction_time_window: version
+                .compaction_time_window
+                .map(|duration| humantime::format_duration(duration).to_string()),
+            region_options,
+            sst_format,
+            node_id,
+        }
+    }
+
    /// Returns the SST entries of the region.
    pub async fn manifest_sst_entries(&self) -> Vec<ManifestSstEntry> {
        let table_dir = self.table_dir();
@@ -1623,6 +1675,23 @@ mod tests {
        assert!(AtomicCell::<RegionRoleState>::is_lock_free());
    }

+    #[test]
+    fn test_region_role_state_as_str() {
+        assert_eq!("Follower", RegionRoleState::Follower.as_str());
+        assert_eq!(
+            "Leader(Writable)",
+            RegionRoleState::Leader(RegionLeaderState::Writable).as_str()
+        );
+        assert_eq!(
+            "Leader(Staging)",
+            RegionRoleState::Leader(RegionLeaderState::Staging).as_str()
+        );
+        assert_eq!(
+            "Leader(Downgrading)",
+            RegionRoleState::Leader(RegionLeaderState::Downgrading).as_str()
+        );
+    }
+
    async fn build_test_region(env: &SchedulerEnv) -> MitoRegion {
        let builder = VersionControlBuilder::new();
        let version_control = Arc::new(builder.build());
--- a/src/mito2/src/sst/index/bloom_filter/applier.rs
+++ b/src/mito2/src/sst/index/bloom_filter/applier.rs
@@ -21,6 +21,7 @@ use std::time::Instant;

 use common_base::range_read::RangeReader;
 use common_telemetry::{tracing, warn};
+use datatypes::data_type::ConcreteDataType;
 use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
 use index::bloom_filter::reader::{
    BloomFilterReadMetrics, BloomFilterReader, BloomFilterReaderImpl,
@@ -30,6 +31,7 @@ use object_store::ObjectStore;
 use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
 use puffin::puffin_manager::{PuffinManager, PuffinReader};
 use snafu::ResultExt;
+use store_api::metadata::RegionMetadataRef;
 use store_api::region_request::PathType;
 use store_api::storage::ColumnId;

@@ -38,7 +40,6 @@ use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
 use crate::cache::index::bloom_filter_index::{
    BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader, Tag,
 };
-use crate::cache::index::result_cache::PredicateKey;
 use crate::error::{
    ApplyBloomFilterIndexSnafu, Error, MetadataSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu,
    Result,
@@ -133,10 +134,10 @@ pub struct BloomFilterIndexApplier {

    /// Bloom filter predicates.
    /// For each column, the value will be retained only if it contains __all__ predicates.
-    predicates: Arc<BTreeMap<ColumnId, Vec<InListPredicate>>>,
+    default_predicates: Arc<BTreeMap<ColumnId, Vec<InListPredicate>>>,

-    /// Predicate key. Used to identify the predicate and fetch result from cache.
-    predicate_key: PredicateKey,
+    /// Expected predicate column types from the latest region metadata.
+    expected_predicate_col_types: BTreeMap<ColumnId, ConcreteDataType>,
 }

 impl BloomFilterIndexApplier {
@@ -149,8 +150,9 @@ impl BloomFilterIndexApplier {
        object_store: ObjectStore,
        puffin_manager_factory: PuffinManagerFactory,
        predicates: BTreeMap<ColumnId, Vec<InListPredicate>>,
+        expected_predicate_col_types: BTreeMap<ColumnId, ConcreteDataType>,
    ) -> Self {
-        let predicates = Arc::new(predicates);
+        let default_predicates = Arc::new(predicates);
        Self {
            table_dir,
            path_type,
@@ -159,8 +161,8 @@ impl BloomFilterIndexApplier {
            puffin_manager_factory,
            puffin_metadata_cache: None,
            bloom_filter_index_cache: None,
-            predicate_key: PredicateKey::new_bloom(predicates.clone()),
-            predicates,
+            default_predicates,
+            expected_predicate_col_types,
        }
    }

@@ -207,6 +209,7 @@ impl BloomFilterIndexApplier {
        &self,
        file_id: RegionIndexId,
        file_size_hint: Option<u64>,
+        predicates: &BTreeMap<ColumnId, Vec<InListPredicate>>,
        row_groups: impl Iterator<Item = (usize, bool)>,
        mut metrics: Option<&mut BloomFilterIndexApplyMetrics>,
    ) -> Result<Vec<(usize, Vec<Range<usize>>)>> {
@@ -230,7 +233,7 @@ impl BloomFilterIndexApplier {
            .map(|(i, range)| (*i, vec![range.clone()]))
            .collect::<Vec<_>>();

-        for (column_id, predicates) in self.predicates.iter() {
+        for (column_id, predicates) in predicates {
            let blob = match self
                .blob_reader(file_id, *column_id, file_size_hint, metrics.as_deref_mut())
                .await?
@@ -438,9 +441,46 @@ impl BloomFilterIndexApplier {
        Ok(())
    }

-    /// Returns the predicate key.
-    pub fn predicate_key(&self) -> &PredicateKey {
-        &self.predicate_key
+    /// Returns the bloom filter predicates whose columns exist in the given SST metadata with the expected data type.
+    ///
+    /// Returns `None` when no compatible predicate remains for this SST.
+    pub fn compatible_predicate_for_sst(
+        &self,
+        sst_metadata: &RegionMetadataRef,
+    ) -> Option<Arc<BTreeMap<ColumnId, Vec<InListPredicate>>>> {
+        let mut has_type_mismatch = false;
+        let mut compatible_col_ids = Vec::new();
+
+        for (col_id, expected) in &self.expected_predicate_col_types {
+            let Some(sst_col) = sst_metadata.column_by_id(*col_id) else {
+                has_type_mismatch = true;
+                continue;
+            };
+
+            if sst_col.column_schema.data_type != *expected {
+                has_type_mismatch = true;
+                continue;
+            }
+
+            compatible_col_ids.push(*col_id);
+        }
+
+        if compatible_col_ids.is_empty() {
+            return None;
+        }
+
+        if !has_type_mismatch {
+            return Some(self.default_predicates.clone());
+        }
+
+        let mut compatible_predicates = BTreeMap::new();
+        for col_id in compatible_col_ids {
+            if let Some(predicates) = self.default_predicates.get(&col_id) {
+                compatible_predicates.insert(col_id, predicates.clone());
+            }
+        }
+
+        Some(Arc::new(compatible_predicates))
    }
 }

@@ -456,9 +496,12 @@ fn is_blob_not_found(err: &Error) -> bool {

 #[cfg(test)]
 mod tests {
+    use std::collections::BTreeSet;

    use datafusion_expr::{Expr, col, lit};
    use futures::future::BoxFuture;
+    use index::Bytes;
+    use object_store::services::Memory;
    use puffin::puffin_manager::PuffinWriter;
    use store_api::metadata::RegionMetadata;
    use store_api::storage::FileId;
@@ -470,6 +513,113 @@ mod tests {
        mock_object_store, mock_region_metadata, new_batch, new_intm_mgr,
    };

+    #[tokio::test]
+    async fn test_compatible_predicate_for_sst() {
+        let (_d, puffin_manager_factory) =
+            PuffinManagerFactory::new_for_test_async("test_plan_for_sst_basic_").await;
+        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
+        let table_dir = "table_dir".to_string();
+
+        let predicates = BTreeMap::from_iter([(
+            1,
+            vec![InListPredicate {
+                list: BTreeSet::from_iter([Bytes::from("foo")]),
+            }],
+        )]);
+        let expected_predicate_col_types =
+            BTreeMap::from_iter([(1, ConcreteDataType::string_datatype())]);
+
+        let applier = BloomFilterIndexApplier::new(
+            table_dir,
+            PathType::Bare,
+            object_store,
+            puffin_manager_factory,
+            predicates,
+            expected_predicate_col_types,
+        );
+        let predicates = applier.compatible_predicate_for_sst(&mock_region_metadata());
+        assert!(predicates.is_some());
+    }
+
+    #[tokio::test]
+    async fn test_compatible_predicate_for_sst_type_mismatch() {
+        let (_d, puffin_manager_factory) =
+            PuffinManagerFactory::new_for_test_async("test_plan_for_sst_type_mismatch_").await;
+        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
+        let table_dir = "table_dir".to_string();
+
+        let predicates = BTreeMap::from_iter([(
+            1,
+            vec![InListPredicate {
+                list: BTreeSet::from_iter([Bytes::from("foo")]),
+            }],
+        )]);
+        let expected_predicate_col_types =
+            BTreeMap::from_iter([(1, ConcreteDataType::int64_datatype())]);
+
+        let applier = BloomFilterIndexApplier::new(
+            table_dir,
+            PathType::Bare,
+            object_store,
+            puffin_manager_factory,
+            predicates,
+            expected_predicate_col_types,
+        );
+        let predicates = applier.compatible_predicate_for_sst(&mock_region_metadata());
+        assert!(predicates.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_compatible_predicate_for_sst_partial_type_mismatch() {
+        let (_d, puffin_manager_factory) =
+            PuffinManagerFactory::new_for_test_async("test_plan_for_sst_partial_mismatch_").await;
+        let object_store = ObjectStore::new(Memory::default()).unwrap().finish();
+        let table_dir = "table_dir".to_string();
+
+        // Column 1 (tag_str): expected string — matches SST (compatible).
+        // Column 3 (field_u64): expected int64 — SST has uint64 (mismatched).
+        let predicates = BTreeMap::from_iter([
+            (
+                1,
+                vec![InListPredicate {
+                    list: BTreeSet::from_iter([Bytes::from("foo")]),
+                }],
+            ),
+            (
+                3,
+                vec![InListPredicate {
+                    list: BTreeSet::from_iter([Bytes::from("bar")]),
+                }],
+            ),
+        ]);
+        let expected_predicate_col_types = BTreeMap::from_iter([
+            (1, ConcreteDataType::string_datatype()),
+            (3, ConcreteDataType::int64_datatype()), // intentional mismatch
+        ]);
+
+        let applier = BloomFilterIndexApplier::new(
+            table_dir,
+            PathType::Bare,
+            object_store,
+            puffin_manager_factory,
+            predicates,
+            expected_predicate_col_types,
+        );
+        let result = applier.compatible_predicate_for_sst(&mock_region_metadata());
+
+        // The subset containing only the compatible column must be returned.
+        let result = result.expect("expected Some with compatible subset");
+        assert!(
+            result.contains_key(&1),
+            "compatible column 1 must be present"
+        );
+        assert!(
+            !result.contains_key(&3),
+            "mismatched column 3 must be absent"
+        );
+        assert_eq!(result.len(), 1, "only the compatible predicate must remain");
+    }
+
    #[allow(clippy::type_complexity)]
    fn tester(
        table_dir: String,
@@ -496,8 +646,11 @@ mod tests {
                );

                let applier = builder.build(&exprs).unwrap().unwrap();
+                let predicates = applier
+                    .compatible_predicate_for_sst(&Arc::new(metadata.clone()))
+                    .unwrap();
                applier
-                    .apply(file_id, None, row_groups.into_iter(), None)
+                    .apply(file_id, None, &predicates, row_groups.into_iter(), None)
                    .await
                    .unwrap()
                    .into_iter()
--- a/Show More
+++ b/Show More