diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index dd9209ffcd..549d6300ea 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,23 +5,23 @@ * @GreptimeTeam/db-approver ## [Module] Database Engine -/src/index @zhongzc +/src/index @evenyag @discord9 @WenyXu /src/mito2 @evenyag @v0y4g3r @waynexia -/src/query @evenyag +/src/query @evenyag @waynexia @discord9 ## [Module] Distributed -/src/common/meta @MichaelScofield -/src/common/procedure @MichaelScofield -/src/meta-client @MichaelScofield -/src/meta-srv @MichaelScofield +/src/common/meta @MichaelScofield @WenyXu +/src/common/procedure @MichaelScofield @WenyXu +/src/meta-client @MichaelScofield @WenyXu +/src/meta-srv @MichaelScofield @WenyXu ## [Module] Write Ahead Log -/src/log-store @v0y4g3r -/src/store-api @v0y4g3r +/src/log-store @v0y4g3r @WenyXu +/src/store-api @v0y4g3r @evenyag ## [Module] Metrics Engine -/src/metric-engine @waynexia -/src/promql @waynexia +/src/metric-engine @waynexia @WenyXu +/src/promql @waynexia @evenyag @discord9 ## [Module] Flow -/src/flow @zhongzc @waynexia +/src/flow @discord9 @waynexia diff --git a/.github/actions/build-greptime-binary/action.yml b/.github/actions/build-greptime-binary/action.yml index ecbc05ed38..62ee9eb599 100644 --- a/.github/actions/build-greptime-binary/action.yml +++ b/.github/actions/build-greptime-binary/action.yml @@ -32,9 +32,23 @@ inputs: description: Image Registry required: false default: 'docker.io' + large-page-size: + description: Build GreptimeDB with large page size (65536). + required: false + default: 'false' + runs: using: composite steps: + - name: Set extra build environment variables + shell: bash + run: | + if [[ '${{ inputs.large-page-size }}' == 'true' ]]; then + echo 'EXTRA_BUILD_ENVS="JEMALLOC_SYS_WITH_LG_PAGE=16"' >> $GITHUB_ENV + else + echo 'EXTRA_BUILD_ENVS=' >> $GITHUB_ENV + fi + - name: Build greptime binary shell: bash if: ${{ inputs.build-android-artifacts == 'false' }} @@ -45,7 +59,8 @@ runs: FEATURES=${{ inputs.features }} \ BASE_IMAGE=${{ inputs.base-image }} \ IMAGE_NAMESPACE=${{ inputs.image-namespace }} \ - IMAGE_REGISTRY=${{ inputs.image-registry }} + IMAGE_REGISTRY=${{ inputs.image-registry }} \ + EXTRA_BUILD_ENVS=$EXTRA_BUILD_ENVS - name: Upload artifacts uses: ./.github/actions/upload-artifacts diff --git a/.github/actions/build-linux-artifacts/action.yml b/.github/actions/build-linux-artifacts/action.yml index 9c88b25075..3cb9c43955 100644 --- a/.github/actions/build-linux-artifacts/action.yml +++ b/.github/actions/build-linux-artifacts/action.yml @@ -27,6 +27,10 @@ inputs: description: Working directory to build the artifacts required: false default: . + large-page-size: + description: Build GreptimeDB with large page size (65536). + required: false + default: 'false' runs: using: composite steps: @@ -59,6 +63,7 @@ runs: working-dir: ${{ inputs.working-dir }} image-registry: ${{ inputs.image-registry }} image-namespace: ${{ inputs.image-namespace }} + large-page-size: ${{ inputs.large-page-size }} - name: Clean up the target directory # Clean up the target directory for the centos7 base image, or it will still use the objects of last build. 
shell: bash @@ -77,6 +82,7 @@ runs: working-dir: ${{ inputs.working-dir }} image-registry: ${{ inputs.image-registry }} image-namespace: ${{ inputs.image-namespace }} + large-page-size: ${{ inputs.large-page-size }} - name: Build greptime on android base image uses: ./.github/actions/build-greptime-binary @@ -89,3 +95,4 @@ runs: build-android-artifacts: true image-registry: ${{ inputs.image-registry }} image-namespace: ${{ inputs.image-namespace }} + large-page-size: ${{ inputs.large-page-size }} diff --git a/.github/scripts/update-helm-charts-version.sh b/.github/scripts/update-helm-charts-version.sh index d501ed8d02..e60e991846 100755 --- a/.github/scripts/update-helm-charts-version.sh +++ b/.github/scripts/update-helm-charts-version.sh @@ -39,8 +39,11 @@ update_helm_charts_version() { --body "This PR updates the GreptimeDB version." \ --base main \ --head $BRANCH_NAME \ - --reviewer zyy17 \ - --reviewer daviderli614 + --reviewer sunng87 \ + --reviewer daviderli614 \ + --reviewer killme2008 \ + --reviewer evenyag \ + --reviewer fengjiachun } update_helm_charts_version diff --git a/.github/scripts/update-homebrew-greptme-version.sh b/.github/scripts/update-homebrew-greptme-version.sh index 4abf4f2218..f474f19778 100755 --- a/.github/scripts/update-homebrew-greptme-version.sh +++ b/.github/scripts/update-homebrew-greptme-version.sh @@ -35,8 +35,11 @@ update_homebrew_greptime_version() { --body "This PR updates the GreptimeDB version." \ --base main \ --head $BRANCH_NAME \ - --reviewer zyy17 \ - --reviewer daviderli614 + --reviewer sunng87 \ + --reviewer daviderli614 \ + --reviewer killme2008 \ + --reviewer evenyag \ + --reviewer fengjiachun } update_homebrew_greptime_version diff --git a/.github/workflows/dev-build.yml b/.github/workflows/dev-build.yml index fad3e316e8..021867e4ed 100644 --- a/.github/workflows/dev-build.yml +++ b/.github/workflows/dev-build.yml @@ -4,10 +4,11 @@ name: GreptimeDB Development Build on: workflow_dispatch: # Allows you to run this workflow manually. inputs: - repository: - description: The public repository to build + large-page-size: + description: Build GreptimeDB with large page size (65536). + type: boolean required: false - default: GreptimeTeam/greptimedb + default: false commit: # Note: We only pull the source code and use the current workflow to build the artifacts. 
description: The commit to build required: true @@ -181,6 +182,7 @@ jobs: working-dir: ${{ env.CHECKOUT_GREPTIMEDB_PATH }} image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + large-page-size: ${{ inputs.large-page-size }} build-linux-arm64-artifacts: name: Build linux-arm64 artifacts @@ -214,6 +216,7 @@ jobs: working-dir: ${{ env.CHECKOUT_GREPTIMEDB_PATH }} image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + large-page-size: ${{ inputs.large-page-size }} release-images-to-dockerhub: name: Build and push images to DockerHub diff --git a/.github/workflows/multi-lang-tests.yml b/.github/workflows/multi-lang-tests.yml new file mode 100644 index 0000000000..6da0a658dd --- /dev/null +++ b/.github/workflows/multi-lang-tests.yml @@ -0,0 +1,57 @@ +name: Multi-language Integration Tests + +on: + push: + branches: + - main + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build-greptimedb: + if: ${{ github.repository == 'GreptimeTeam/greptimedb' }} + name: Build GreptimeDB binary + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - uses: arduino/setup-protoc@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions-rust-lang/setup-rust-toolchain@v1 + - uses: Swatinem/rust-cache@v2 + with: + shared-key: "multi-lang-build" + cache-all-crates: "true" + save-if: ${{ github.ref == 'refs/heads/main' }} + - name: Install cargo-gc-bin + shell: bash + run: cargo install cargo-gc-bin --force + - name: Build greptime binary + shell: bash + run: cargo gc -- --bin greptime --features "pg_kvbackend,mysql_kvbackend" + - name: Pack greptime binary + shell: bash + run: | + mkdir bin && \ + mv ./target/debug/greptime bin + - name: Print greptime binary info + run: ls -lh bin + - name: Upload greptime binary + uses: actions/upload-artifact@v4 + with: + name: greptime-bin + path: bin/ + retention-days: 1 + + run-multi-lang-tests: + name: Run Multi-language SDK Tests + needs: build-greptimedb + uses: ./.github/workflows/run-multi-lang-tests.yml + with: + artifact-name: greptime-bin diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml index 6640d1d3df..710a767334 100644 --- a/.github/workflows/nightly-build.yml +++ b/.github/workflows/nightly-build.yml @@ -174,6 +174,18 @@ jobs: image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + run-multi-lang-tests: + name: Run Multi-language SDK Tests + if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'schedule' }} + needs: [ + allocate-runners, + build-linux-amd64-artifacts, + ] + uses: ./.github/workflows/run-multi-lang-tests.yml + with: + artifact-name: greptime-linux-amd64-${{ needs.allocate-runners.outputs.version }} + artifact-is-tarball: true + release-images-to-dockerhub: name: Build and push images to DockerHub if: ${{ inputs.release_images || github.event_name == 'schedule' }} @@ -301,7 +313,8 @@ jobs: if: ${{ github.repository == 'GreptimeTeam/greptimedb' && always() }} # Not requiring successful dependent jobs, always run. 
name: Send notification to Greptime team needs: [ - release-images-to-dockerhub + release-images-to-dockerhub, + run-multi-lang-tests, ] runs-on: ubuntu-latest permissions: @@ -319,17 +332,17 @@ jobs: run: pnpm tsx bin/report-ci-failure.ts env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - CI_REPORT_STATUS: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' }} + CI_REPORT_STATUS: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' && (needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped') }} - name: Notify nightly build successful result uses: slackapi/slack-github-action@v1.23.0 - if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' }} + if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result == 'success' && (needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped') }} with: payload: | {"text": "GreptimeDB's ${{ env.NEXT_RELEASE_VERSION }} build has completed successfully."} - name: Notify nightly build failed result uses: slackapi/slack-github-action@v1.23.0 - if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result != 'success' }} + if: ${{ needs.release-images-to-dockerhub.outputs.nightly-build-result != 'success' || needs.run-multi-lang-tests.result == 'failure' }} with: payload: | {"text": "GreptimeDB's ${{ env.NEXT_RELEASE_VERSION }} build has failed, please check ${{ steps.report-ci-status.outputs.html_url }}."} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bc9da93b9c..25a9c02524 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -49,14 +49,9 @@ on: description: Do not run integration tests during the build type: boolean default: true - build_linux_amd64_artifacts: + build_linux_artifacts: type: boolean - description: Build linux-amd64 artifacts - required: false - default: false - build_linux_arm64_artifacts: - type: boolean - description: Build linux-arm64 artifacts + description: Build linux artifacts (both amd64 and arm64) required: false default: false build_macos_artifacts: @@ -144,7 +139,7 @@ jobs: ./.github/scripts/check-version.sh "${{ steps.create-version.outputs.version }}" - name: Allocate linux-amd64 runner - if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} + if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} uses: ./.github/actions/start-runner id: start-linux-amd64-runner with: @@ -158,7 +153,7 @@ jobs: subnet-id: ${{ vars.EC2_RUNNER_SUBNET_ID }} - name: Allocate linux-arm64 runner - if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} + if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} uses: ./.github/actions/start-runner id: start-linux-arm64-runner with: @@ -173,7 +168,7 @@ jobs: build-linux-amd64-artifacts: name: Build linux-amd64 artifacts - if: ${{ inputs.build_linux_amd64_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} + if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} needs: [ allocate-runners, ] @@ -195,7 +190,7 @@ jobs: build-linux-arm64-artifacts: name: Build linux-arm64 artifacts - if: ${{ inputs.build_linux_arm64_artifacts || github.event_name == 'push' || github.event_name == 
'schedule' }} + if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} needs: [ allocate-runners, ] @@ -215,6 +210,18 @@ jobs: image-registry: ${{ vars.ECR_IMAGE_REGISTRY }} image-namespace: ${{ vars.ECR_IMAGE_NAMESPACE }} + run-multi-lang-tests: + name: Run Multi-language SDK Tests + if: ${{ inputs.build_linux_artifacts || github.event_name == 'push' || github.event_name == 'schedule' }} + needs: [ + allocate-runners, + build-linux-amd64-artifacts, + ] + uses: ./.github/workflows/run-multi-lang-tests.yml + with: + artifact-name: greptime-linux-amd64-${{ needs.allocate-runners.outputs.version }} + artifact-is-tarball: true + build-macos-artifacts: name: Build macOS artifacts strategy: @@ -303,6 +310,7 @@ jobs: allocate-runners, build-linux-amd64-artifacts, build-linux-arm64-artifacts, + run-multi-lang-tests, ] runs-on: ubuntu-latest outputs: @@ -373,7 +381,18 @@ jobs: publish-github-release: name: Create GitHub release and upload artifacts - if: ${{ inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule' }} + # Use always() to run even when optional jobs (macos, windows) are skipped. + # Then check that required jobs succeeded and optional jobs didn't fail. + if: | + always() && + (inputs.publish_github_release || github.event_name == 'push' || github.event_name == 'schedule') && + needs.allocate-runners.result == 'success' && + (needs.build-linux-amd64-artifacts.result == 'success' || needs.build-linux-amd64-artifacts.result == 'skipped') && + (needs.build-linux-arm64-artifacts.result == 'success' || needs.build-linux-arm64-artifacts.result == 'skipped') && + (needs.build-macos-artifacts.result == 'success' || needs.build-macos-artifacts.result == 'skipped') && + (needs.build-windows-artifacts.result == 'success' || needs.build-windows-artifacts.result == 'skipped') && + (needs.release-images-to-dockerhub.result == 'success' || needs.release-images-to-dockerhub.result == 'skipped') && + (needs.run-multi-lang-tests.result == 'success' || needs.run-multi-lang-tests.result == 'skipped') needs: [ # The job have to wait for all the artifacts are built. 
allocate-runners, build-linux-amd64-artifacts, @@ -381,6 +400,7 @@ jobs: build-macos-artifacts, build-windows-artifacts, release-images-to-dockerhub, + run-multi-lang-tests, ] runs-on: ubuntu-latest steps: diff --git a/.github/workflows/run-multi-lang-tests.yml b/.github/workflows/run-multi-lang-tests.yml new file mode 100644 index 0000000000..f744d7a644 --- /dev/null +++ b/.github/workflows/run-multi-lang-tests.yml @@ -0,0 +1,194 @@ +# Reusable workflow for running multi-language SDK tests against GreptimeDB +# Used by: multi-lang-tests.yml, release.yml, nightly-build.yml +# Supports both direct binary artifacts and tarball artifacts + +name: Run Multi-language SDK Tests + +on: + workflow_call: + inputs: + artifact-name: + required: true + type: string + description: 'Name of the artifact containing greptime binary' + http-port: + required: false + type: string + default: '4000' + description: 'HTTP server port' + mysql-port: + required: false + type: string + default: '4002' + description: 'MySQL server port' + postgres-port: + required: false + type: string + default: '4003' + description: 'PostgreSQL server port' + db-name: + required: false + type: string + default: 'test_db' + description: 'Test database name' + username: + required: false + type: string + default: 'greptime_user' + description: 'Authentication username' + password: + required: false + type: string + default: 'greptime_pwd' + description: 'Authentication password' + timeout-minutes: + required: false + type: number + default: 30 + description: 'Job timeout in minutes' + artifact-is-tarball: + required: false + type: boolean + default: false + description: 'Whether the artifact is a tarball (tar.gz) that needs to be extracted' + +jobs: + run-tests: + name: Run Multi-language SDK Tests + runs-on: ubuntu-latest + timeout-minutes: ${{ inputs.timeout-minutes }} + steps: + - name: Checkout greptimedb-tests repository + uses: actions/checkout@v4 + with: + repository: GreptimeTeam/greptimedb-tests + persist-credentials: false + + - name: Download pre-built greptime binary + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.artifact-name }} + path: artifact + + - name: Setup greptime binary + run: | + mkdir -p bin + if [ "${{ inputs.artifact-is-tarball }}" = "true" ]; then + # Extract tarball and find greptime binary + tar -xzf artifact/*.tar.gz -C artifact + find artifact -name "greptime" -type f -exec cp {} bin/greptime \; + else + # Direct binary format + if [ -f artifact/greptime ]; then + cp artifact/greptime bin/greptime + else + cp artifact/* bin/greptime + fi + fi + chmod +x ./bin/greptime + ls -lh ./bin/greptime + ./bin/greptime --version + + - name: Setup Java 17 + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + cache: 'maven' + + - name: Setup Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: '3.8' + + - name: Setup Go 1.24 + uses: actions/setup-go@v5 + with: + go-version: '1.24' + cache: true + cache-dependency-path: go-tests/go.sum + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '18' + + - name: Install Python dependencies + run: | + pip install mysql-connector-python psycopg2-binary + python3 -c "import mysql.connector; print(f'mysql-connector-python {mysql.connector.__version__}')" + python3 -c "import psycopg2; print(f'psycopg2 {psycopg2.__version__}')" + + - name: Install Go dependencies + working-directory: go-tests + run: | + go mod download + go mod verify + go version + + - name: Kill existing GreptimeDB 
processes + run: | + pkill -f greptime || true + sleep 2 + + - name: Start GreptimeDB standalone + run: | + ./bin/greptime standalone start \ + --http-addr 0.0.0.0:${{ inputs.http-port }} \ + --rpc-addr 0.0.0.0:4001 \ + --mysql-addr 0.0.0.0:${{ inputs.mysql-port }} \ + --postgres-addr 0.0.0.0:${{ inputs.postgres-port }} \ + --user-provider=static_user_provider:cmd:${{ inputs.username }}=${{ inputs.password }} > /tmp/greptimedb.log 2>&1 & + + - name: Wait for GreptimeDB to be ready + run: | + echo "Waiting for GreptimeDB..." + for i in {1..60}; do + if curl -sf http://localhost:${{ inputs.http-port }}/health > /dev/null; then + echo "✅ GreptimeDB is ready" + exit 0 + fi + sleep 2 + done + echo "❌ GreptimeDB failed to start" + cat /tmp/greptimedb.log + exit 1 + + - name: Run multi-language tests + env: + DB_NAME: ${{ inputs.db-name }} + MYSQL_HOST: 127.0.0.1 + MYSQL_PORT: ${{ inputs.mysql-port }} + POSTGRES_HOST: 127.0.0.1 + POSTGRES_PORT: ${{ inputs.postgres-port }} + HTTP_HOST: 127.0.0.1 + HTTP_PORT: ${{ inputs.http-port }} + GREPTIME_USERNAME: ${{ inputs.username }} + GREPTIME_PASSWORD: ${{ inputs.password }} + run: | + chmod +x ./run_tests.sh + ./run_tests.sh + + - name: Collect logs on failure + if: failure() + run: | + echo "=== GreptimeDB Logs ===" + cat /tmp/greptimedb.log || true + + - name: Upload test logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: test-logs + path: | + /tmp/greptimedb.log + java-tests/target/surefire-reports/ + python-tests/.pytest_cache/ + go-tests/*.log + **/test-output/ + retention-days: 7 + + - name: Cleanup + if: always() + run: | + pkill -f greptime || true diff --git a/AUTHOR.md b/AUTHOR.md index 021d7b299f..633c535237 100644 --- a/AUTHOR.md +++ b/AUTHOR.md @@ -2,41 +2,41 @@ ## Individual Committers (in alphabetical order) -* [CookiePieWw](https://github.com/CookiePieWw) -* [etolbakov](https://github.com/etolbakov) -* [irenjj](https://github.com/irenjj) -* [KKould](https://github.com/KKould) -* [Lanqing Yang](https://github.com/lyang24) -* [NiwakaDev](https://github.com/NiwakaDev) -* [tisonkun](https://github.com/tisonkun) +- [apdong2022](https://github.com/apdong2022) +- [beryl678](https://github.com/beryl678) +- [CookiePieWw](https://github.com/CookiePieWw) +- [etolbakov](https://github.com/etolbakov) +- [irenjj](https://github.com/irenjj) +- [KKould](https://github.com/KKould) +- [Lanqing Yang](https://github.com/lyang24) +- [nicecui](https://github.com/nicecui) +- [NiwakaDev](https://github.com/NiwakaDev) +- [paomian](https://github.com/paomian) +- [tisonkun](https://github.com/tisonkun) +- [Wenjie0329](https://github.com/Wenjie0329) +- [zhaoyingnan01](https://github.com/zhaoyingnan01) +- [zhongzc](https://github.com/zhongzc) +- [ZonaHex](https://github.com/ZonaHex) +- [zyy17](https://github.com/zyy17) ## Team Members (in alphabetical order) -* [apdong2022](https://github.com/apdong2022) -* [beryl678](https://github.com/beryl678) -* [daviderli614](https://github.com/daviderli614) -* [discord9](https://github.com/discord9) -* [evenyag](https://github.com/evenyag) -* [fengjiachun](https://github.com/fengjiachun) -* [fengys1996](https://github.com/fengys1996) -* [GrepTime](https://github.com/GrepTime) -* [holalengyu](https://github.com/holalengyu) -* [killme2008](https://github.com/killme2008) -* [MichaelScofield](https://github.com/MichaelScofield) -* [nicecui](https://github.com/nicecui) -* [paomian](https://github.com/paomian) -* [shuiyisong](https://github.com/shuiyisong) -* 
[sunchanglong](https://github.com/sunchanglong) -* [sunng87](https://github.com/sunng87) -* [v0y4g3r](https://github.com/v0y4g3r) -* [waynexia](https://github.com/waynexia) -* [Wenjie0329](https://github.com/Wenjie0329) -* [WenyXu](https://github.com/WenyXu) -* [xtang](https://github.com/xtang) -* [zhaoyingnan01](https://github.com/zhaoyingnan01) -* [zhongzc](https://github.com/zhongzc) -* [ZonaHex](https://github.com/ZonaHex) -* [zyy17](https://github.com/zyy17) +- [daviderli614](https://github.com/daviderli614) +- [discord9](https://github.com/discord9) +- [evenyag](https://github.com/evenyag) +- [fengjiachun](https://github.com/fengjiachun) +- [fengys1996](https://github.com/fengys1996) +- [GrepTime](https://github.com/GrepTime) +- [holalengyu](https://github.com/holalengyu) +- [killme2008](https://github.com/killme2008) +- [MichaelScofield](https://github.com/MichaelScofield) +- [shuiyisong](https://github.com/shuiyisong) +- [sunchanglong](https://github.com/sunchanglong) +- [sunng87](https://github.com/sunng87) +- [v0y4g3r](https://github.com/v0y4g3r) +- [waynexia](https://github.com/waynexia) +- [WenyXu](https://github.com/WenyXu) +- [xtang](https://github.com/xtang) ## All Contributors diff --git a/Cargo.lock b/Cargo.lock index a4dce12b27..281cbabb5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -212,7 +212,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "arrow-schema", "common-base", @@ -733,17 +733,17 @@ dependencies = [ [[package]] name = "auth" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", "common-base", + "common-config", "common-error", "common-macro", "common-telemetry", "common-test-util", "digest", - "notify", "sha1", "snafu 0.8.6", "sql", @@ -1383,7 +1383,7 @@ dependencies = [ [[package]] name = "cache" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "catalog", "common-error", @@ -1418,7 +1418,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow", @@ -1763,7 +1763,7 @@ checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "cli" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-stream", "async-trait", @@ -1816,7 +1816,7 @@ dependencies = [ [[package]] name = "client" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arc-swap", @@ -1849,7 +1849,7 @@ dependencies = [ "snafu 0.8.6", "store-api", "substrait 0.37.3", - "substrait 1.0.0-beta.1", + "substrait 1.0.0-beta.2", "tokio", "tokio-stream", "tonic 0.13.1", @@ -1889,7 +1889,7 @@ dependencies = [ [[package]] name = "cmd" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "auth", @@ -1977,6 +1977,17 @@ dependencies = [ "unicode-width 0.2.1", ] +[[package]] +name = "codespan-reporting" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681" +dependencies = [ + "serde", + "termcolor", + "unicode-width 0.2.1", +] + [[package]] name = "colorchoice" version = "1.0.4" @@ -2012,7 +2023,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ 
"anymap2", "async-trait", @@ -2036,14 +2047,14 @@ dependencies = [ [[package]] name = "common-catalog" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "const_format", ] [[package]] name = "common-config" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-error", @@ -2055,6 +2066,7 @@ dependencies = [ "datanode", "humantime-serde", "meta-client", + "notify", "object-store", "serde", "serde_json", @@ -2067,7 +2079,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "arrow", "arrow-schema", @@ -2102,7 +2114,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "bigdecimal 0.4.8", "common-error", @@ -2115,7 +2127,7 @@ dependencies = [ [[package]] name = "common-error" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-macro", "http 1.3.1", @@ -2126,7 +2138,7 @@ dependencies = [ [[package]] name = "common-event-recorder" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -2148,7 +2160,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -2170,7 +2182,7 @@ dependencies = [ [[package]] name = "common-function" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -2208,6 +2220,7 @@ dependencies = [ "hyperloglogplus", "jsonb", "memchr", + "mito-codec", "nalgebra", "num", "num-traits", @@ -2229,7 +2242,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "common-runtime", @@ -2246,12 +2259,13 @@ dependencies = [ [[package]] name = "common-grpc" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-flight", "bytes", "common-base", + "common-config", "common-error", "common-macro", "common-recordbatch", @@ -2270,6 +2284,7 @@ dependencies = [ "serde", "serde_json", "snafu 0.8.6", + "tempfile", "tokio", "tokio-util", "tonic 0.13.1", @@ -2279,7 +2294,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "common-base", @@ -2299,7 +2314,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "greptime-proto", "once_cell", @@ -2310,7 +2325,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "anyhow", "common-error", @@ -2324,9 +2339,22 @@ dependencies = [ "tokio", ] +[[package]] +name = "common-memory-manager" +version = "1.0.0-beta.2" +dependencies = [ + "common-error", + "common-macro", + "common-telemetry", + "humantime", + "serde", + "snafu 0.8.6", + "tokio", +] + [[package]] name = "common-meta" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "anymap2", "api", @@ -2398,7 +2426,7 @@ dependencies = [ [[package]] name = "common-options" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-grpc", "humantime-serde", @@ -2407,11 +2435,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" [[package]] name = "common-pprof" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-error", "common-macro", @@ -2423,7 +2451,7 @@ 
dependencies = [ [[package]] name = "common-procedure" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-stream", @@ -2452,7 +2480,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "common-procedure", @@ -2462,7 +2490,7 @@ dependencies = [ [[package]] name = "common-query" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -2488,7 +2516,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "arc-swap", "common-base", @@ -2512,7 +2540,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "clap 4.5.40", @@ -2541,7 +2569,7 @@ dependencies = [ [[package]] name = "common-session" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "serde", "strum 0.27.1", @@ -2549,7 +2577,7 @@ dependencies = [ [[package]] name = "common-sql" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-decimal", @@ -2567,7 +2595,7 @@ dependencies = [ [[package]] name = "common-stat" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-runtime", @@ -2582,7 +2610,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "backtrace", "common-base", @@ -2611,7 +2639,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "client", "common-grpc", @@ -2624,7 +2652,7 @@ dependencies = [ [[package]] name = "common-time" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "arrow", "chrono", @@ -2642,7 +2670,7 @@ dependencies = [ [[package]] name = "common-version" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "build-data", "cargo-manifest", @@ -2653,7 +2681,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-base", "common-error", @@ -2676,7 +2704,7 @@ dependencies = [ [[package]] name = "common-workload" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "common-telemetry", "serde", @@ -2842,6 +2870,15 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "convert_case" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -3143,6 +3180,68 @@ dependencies = [ "cipher", ] +[[package]] +name = "cxx" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7620f6cfc4dcca21f2b085b7a890e16c60fd66f560cd69ee60594908dc72ab1" +dependencies = [ + "cc", + "cxx-build", + "cxxbridge-cmd", + "cxxbridge-flags", + "cxxbridge-macro", + "foldhash 0.2.0", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9bc1a22964ff6a355fbec24cf68266a0ed28f8b84c0864c386474ea3d0e479" +dependencies = [ + "cc", + "codespan-reporting 0.13.1", + "indexmap 2.11.4", + "proc-macro2", + "quote", + "scratch", + "syn 2.0.106", +] + +[[package]] +name = "cxxbridge-cmd" +version = "1.0.190" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f29a879d35f7906e3c9b77d7a1005a6a0787d330c09dfe4ffb5f617728cb44" +dependencies = [ + "clap 4.5.40", + "codespan-reporting 0.13.1", + "indexmap 2.11.4", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d67109015f93f683e364085aa6489a5b2118b4a40058482101d699936a7836d6" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.190" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d187e019e7b05a1f3e69a8396b70800ee867aa9fc2ab972761173ccee03742df" +dependencies = [ + "indexmap 2.11.4", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "darling" version = "0.14.4" @@ -3738,9 +3837,9 @@ dependencies = [ [[package]] name = "datafusion-pg-catalog" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15824c98ff2009c23b0398d441499b147f7c5ac0e5ee993e7a473d79040e3626" +checksum = "09bfd1feed7ed335227af0b65955ed825e467cf67fad6ecd089123202024cfd1" dependencies = [ "async-trait", "datafusion", @@ -3913,7 +4012,7 @@ dependencies = [ [[package]] name = "datanode" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-flight", @@ -3977,7 +4076,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "arrow", "arrow-array", @@ -4181,21 +4280,23 @@ dependencies = [ [[package]] name = "derive_more" -version = "1.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618" dependencies = [ "derive_more-impl", ] [[package]] name = "derive_more-impl" -version = "1.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" +checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b" dependencies = [ + "convert_case 0.10.0", "proc-macro2", "quote", + "rustc_version", "syn 2.0.106", "unicode-xid", ] @@ -4649,7 +4750,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-engine" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -4781,7 +4882,7 @@ checksum = "8bf7cc16383c4b8d58b9905a8509f02926ce3058053c056376248d958c9df1e8" [[package]] name = "flow" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow", @@ -4850,7 +4951,7 @@ dependencies = [ "sql", "store-api", "strum 0.27.1", - "substrait 1.0.0-beta.1", + "substrait 1.0.0-beta.2", "table", "tokio", "tonic 0.13.1", @@ -4888,6 +4989,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -4905,13 +5012,14 @@ checksum = "28dd6caf6059519a65843af8fe2a3ae298b14b80179855aeb4adc2c1934ee619" [[package]] name = "frontend" -version = "1.0.0-beta.1" +version = 
"1.0.0-beta.2" dependencies = [ "api", "arc-swap", "async-stream", "async-trait", "auth", + "axum 0.8.4", "bytes", "cache", "catalog", @@ -4946,9 +5054,11 @@ dependencies = [ "hostname 0.4.1", "humantime", "humantime-serde", + "hyper-util", "lazy_static", "log-query", "meta-client", + "meta-srv", "num_cpus", "opentelemetry-proto", "operator", @@ -4960,6 +5070,7 @@ dependencies = [ "prost 0.13.5", "query", "rand 0.9.1", + "reqwest", "serde", "serde_json", "servers", @@ -5348,7 +5459,7 @@ dependencies = [ [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=14b9dc40bdc8288742b0cefc7bb024303b7429ef#14b9dc40bdc8288742b0cefc7bb024303b7429ef" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0423fa30203187c75e2937a668df1da699c8b96c#0423fa30203187c75e2937a668df1da699c8b96c" dependencies = [ "prost 0.13.5", "prost-types 0.13.5", @@ -5484,7 +5595,7 @@ checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -6116,7 +6227,7 @@ dependencies = [ [[package]] name = "index" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "asynchronous-codec", @@ -6129,6 +6240,7 @@ dependencies = [ "common-telemetry", "common-test-util", "criterion 0.4.0", + "datatypes", "fastbloom", "fst", "futures", @@ -6137,6 +6249,7 @@ dependencies = [ "jieba-rs", "lazy_static", "mockall", + "nalgebra", "pin-project", "prost 0.13.5", "puffin", @@ -6154,6 +6267,7 @@ dependencies = [ "tempfile", "tokio", "tokio-util", + "usearch", "uuid", ] @@ -6985,6 +7099,15 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "link-cplusplus" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82" +dependencies = [ + "cc", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -7045,7 +7168,7 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "log-query" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "chrono", "common-error", @@ -7057,7 +7180,7 @@ dependencies = [ [[package]] name = "log-store" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-stream", "async-trait", @@ -7298,12 +7421,6 @@ dependencies = [ "digest", ] -[[package]] -name = "md5" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" - [[package]] name = "md5" version = "0.8.0" @@ -7364,7 +7481,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -7392,7 +7509,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -7440,7 +7557,9 @@ dependencies = [ "lazy_static", "local-ip-address", "once_cell", + "ordered-float 4.6.0", "parking_lot 0.12.4", + "partition", "prometheus", "prost 0.13.5", "rand 0.9.1", @@ -7490,7 +7609,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "aquamarine", @@ -7509,9 +7628,11 @@ dependencies = [ "common-test-util", "common-time", "common-wal", + "criterion 0.4.0", "datafusion", "datatypes", "futures-util", + "fxhash", "humantime-serde", 
"itertools 0.14.0", "lazy_static", @@ -7585,7 +7706,7 @@ dependencies = [ [[package]] name = "mito-codec" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "bytes", @@ -7610,7 +7731,7 @@ dependencies = [ [[package]] name = "mito2" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "aquamarine", @@ -7628,6 +7749,7 @@ dependencies = [ "common-function", "common-grpc", "common-macro", + "common-memory-manager", "common-meta", "common-query", "common-recordbatch", @@ -7649,6 +7771,7 @@ dependencies = [ "either", "futures", "greptime-proto", + "humantime", "humantime-serde", "index", "itertools 0.14.0", @@ -8348,7 +8471,7 @@ dependencies = [ [[package]] name = "object-store" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "anyhow", "bytes", @@ -8357,10 +8480,10 @@ dependencies = [ "common-macro", "common-telemetry", "common-test-util", + "derive_builder 0.20.2", "futures", "humantime-serde", "lazy_static", - "md5 0.7.0", "moka", "opendal", "prometheus", @@ -8527,7 +8650,7 @@ dependencies = [ [[package]] name = "opensrv-mysql" version = "0.8.0" -source = "git+https://github.com/datafuselabs/opensrv?rev=a1fb4da215c8693c7e4f62be249a01b7fec52997#a1fb4da215c8693c7e4f62be249a01b7fec52997" +source = "git+https://github.com/datafuselabs/opensrv?tag=v0.10.0#074bd8fb81da3c9e6d6a098a482f3380478b9c0b" dependencies = [ "async-trait", "byteorder", @@ -8633,7 +8756,7 @@ dependencies = [ [[package]] name = "operator" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -8659,6 +8782,7 @@ dependencies = [ "common-recordbatch", "common-runtime", "common-sql", + "common-stat", "common-telemetry", "common-test-util", "common-time", @@ -8670,6 +8794,7 @@ dependencies = [ "futures", "futures-util", "humantime", + "itertools 0.14.0", "jsonb", "lazy_static", "meta-client", @@ -8691,7 +8816,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 1.0.0-beta.1", + "substrait 1.0.0-beta.2", "table", "tokio", "tokio-util", @@ -8977,7 +9102,7 @@ dependencies = [ [[package]] name = "partition" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -9181,10 +9306,21 @@ dependencies = [ ] [[package]] -name = "pgwire" -version = "0.34.2" +name = "pg_interval" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f56a81b4fcc69016028f657a68f9b8e8a2a4b7d07684ca3298f2d3e7ff199ce" +checksum = "fe46640b465e284b048ef065cbed8ef17a622878d310c724578396b4cfd00df2" +dependencies = [ + "bytes", + "chrono", + "postgres-types", +] + +[[package]] +name = "pgwire" +version = "0.36.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70a2bcdcc4b20a88e0648778ecf00415bbd5b447742275439c22176835056f99" dependencies = [ "async-trait", "base64 0.22.1", @@ -9194,12 +9330,13 @@ dependencies = [ "futures", "hex", "lazy-regex", - "md5 0.8.0", + "md5", "postgres-types", "rand 0.9.1", "ring", "rust_decimal", "rustls-pki-types", + "ryu", "serde", "serde_json", "stringprep", @@ -9322,7 +9459,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -9478,9 +9615,10 @@ dependencies = [ [[package]] name = "plugins" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "auth", + "catalog", "clap 4.5.40", "cli", "common-base", @@ -9489,6 +9627,7 @@ dependencies = [ 
"datanode", "flow", "frontend", + "meta-client", "meta-srv", "serde", "snafu 0.8.6", @@ -9778,7 +9917,7 @@ dependencies = [ [[package]] name = "promql" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "async-trait", @@ -10061,7 +10200,7 @@ dependencies = [ [[package]] name = "puffin" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-compression 0.4.19", "async-trait", @@ -10103,7 +10242,7 @@ dependencies = [ [[package]] name = "query" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -10170,7 +10309,7 @@ dependencies = [ "sql", "sqlparser", "store-api", - "substrait 1.0.0-beta.1", + "substrait 1.0.0-beta.2", "table", "tokio", "tokio-stream", @@ -10813,7 +10952,7 @@ dependencies = [ [[package]] name = "rskafka" version = "0.6.0" -source = "git+https://github.com/WenyXu/rskafka.git?rev=7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76#7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76" +source = "git+https://github.com/GreptimeTeam/rskafka.git?rev=f5688f83e7da591cda3f2674c2408b4c0ed4ed50#f5688f83e7da591cda3f2674c2408b4c0ed4ed50" dependencies = [ "bytes", "chrono", @@ -11242,6 +11381,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scratch" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" + [[package]] name = "scrypt" version = "0.11.0" @@ -11506,7 +11651,7 @@ dependencies = [ [[package]] name = "servers" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -11583,6 +11728,7 @@ dependencies = [ "otel-arrow-rust", "parking_lot 0.12.4", "permutation", + "pg_interval", "pgwire", "pin-project", "pipeline", @@ -11624,6 +11770,7 @@ dependencies = [ "tower 0.5.2", "tower-http 0.6.6", "tracing", + "tracing-opentelemetry", "urlencoding", "uuid", "vrl", @@ -11632,7 +11779,7 @@ dependencies = [ [[package]] name = "session" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "ahash 0.8.12", "api", @@ -11966,7 +12113,7 @@ dependencies = [ [[package]] name = "sql" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-buffer", @@ -12026,7 +12173,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "clap 4.5.40", @@ -12303,7 +12450,7 @@ dependencies = [ [[package]] name = "standalone" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "catalog", @@ -12344,7 +12491,7 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "store-api" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "aquamarine", @@ -12557,7 +12704,7 @@ dependencies = [ [[package]] name = "substrait" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "async-trait", "bytes", @@ -12680,7 +12827,7 @@ dependencies = [ [[package]] name = "table" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "async-trait", @@ -12919,7 +13066,7 @@ dependencies = [ "getrandom 0.3.3", "once_cell", "rustix 1.0.7", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -12949,7 +13096,7 @@ checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" [[package]] name = "tests-fuzz" 
-version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "arbitrary", "async-trait", @@ -12993,7 +13140,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" dependencies = [ "api", "arrow-flight", @@ -13043,6 +13190,7 @@ dependencies = [ "loki-proto", "meta-client", "meta-srv", + "mito2", "moka", "mysql_async", "object-store", @@ -13067,7 +13215,7 @@ dependencies = [ "sqlx", "standalone", "store-api", - "substrait 1.0.0-beta.1", + "substrait 1.0.0-beta.2", "table", "tempfile", "time", @@ -14092,6 +14240,16 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "usearch" +version = "2.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc9fc5f872a3a4f9081d5f42624d788231b763e1846c829b9968a3755ac884d" +dependencies = [ + "cxx", + "cxx-build", +] + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -14231,7 +14389,7 @@ dependencies = [ "ciborium", "cidr", "clap 4.5.40", - "codespan-reporting", + "codespan-reporting 0.12.0", "community-id", "convert_case 0.7.1", "crc", diff --git a/Cargo.toml b/Cargo.toml index eedca63570..a8a09ae993 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "src/common/grpc-expr", "src/common/macro", "src/common/mem-prof", + "src/common/memory-manager", "src/common/meta", "src/common/options", "src/common/plugins", @@ -74,7 +75,7 @@ members = [ resolver = "2" [workspace.package] -version = "1.0.0-beta.1" +version = "1.0.0-beta.2" edition = "2024" license = "Apache-2.0" @@ -131,7 +132,7 @@ datafusion-functions = "50" datafusion-functions-aggregate-common = "50" datafusion-optimizer = "50" datafusion-orc = "0.5" -datafusion-pg-catalog = "0.12.1" +datafusion-pg-catalog = "0.12.3" datafusion-physical-expr = "50" datafusion-physical-plan = "50" datafusion-sql = "50" @@ -139,6 +140,7 @@ datafusion-substrait = "50" deadpool = "0.12" deadpool-postgres = "0.14" derive_builder = "0.20" +derive_more = { version = "2.1", features = ["full"] } dotenv = "0.15" either = "1.15" etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62df834f0cffda355eba96691fe1a9a332b75a7", features = [ @@ -148,7 +150,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "14b9dc40bdc8288742b0cefc7bb024303b7429ef" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0423fa30203187c75e2937a668df1da699c8b96c" } hex = "0.4" http = "1" humantime = "2.1" @@ -200,7 +202,8 @@ reqwest = { version = "0.12", default-features = false, features = [ "stream", "multipart", ] } -rskafka = { git = "https://github.com/WenyXu/rskafka.git", rev = "7b0f31ed39db049b4ee2e5f1e95b5a30be9baf76", features = [ +# Branch: feat/request-timeout +rskafka = { git = "https://github.com/GreptimeTeam/rskafka.git", rev = "f5688f83e7da591cda3f2674c2408b4c0ed4ed50", features = [ "transport-tls", ] } rstest = "0.25" @@ -234,6 +237,7 @@ tower = "0.5" tower-http = "0.6" tracing = "0.1" tracing-appender = "0.2" +tracing-opentelemetry = "0.31.0" tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] } typetag = "0.2" uuid = { version = "1.17", features = ["serde", "v4", "fast-rng"] } @@ -263,6 +267,7 @@ common-grpc = { path = 
"src/common/grpc" } common-grpc-expr = { path = "src/common/grpc-expr" } common-macro = { path = "src/common/macro" } common-mem-prof = { path = "src/common/mem-prof" } +common-memory-manager = { path = "src/common/memory-manager" } common-meta = { path = "src/common/meta" } common-options = { path = "src/common/options" } common-plugins = { path = "src/common/plugins" } diff --git a/Makefile b/Makefile index a200244029..91fb600d14 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,8 @@ CARGO_REGISTRY_CACHE ?= ${HOME}/.cargo/registry ARCH := $(shell uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/') OUTPUT_DIR := $(shell if [ "$(RELEASE)" = "true" ]; then echo "release"; elif [ ! -z "$(CARGO_PROFILE)" ]; then echo "$(CARGO_PROFILE)" ; else echo "debug"; fi) SQLNESS_OPTS ?= +EXTRA_BUILD_ENVS ?= +ASSEMBLED_EXTRA_BUILD_ENV := $(foreach var,$(EXTRA_BUILD_ENVS),-e $(var)) # The arguments for running integration tests. ETCD_VERSION ?= v3.5.9 @@ -83,6 +85,7 @@ build: ## Build debug version greptime. .PHONY: build-by-dev-builder build-by-dev-builder: ## Build greptime by dev-builder. docker run --network=host \ + ${ASSEMBLED_EXTRA_BUILD_ENV} \ -v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \ -w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \ make build \ diff --git a/README.md b/README.md index 017ec46b6a..6c83582a24 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@

- User Guide | + User Guide | API Docs | Roadmap 2025

@@ -66,17 +66,24 @@ ## Introduction -**GreptimeDB** is an open-source, cloud-native database purpose-built for the unified collection and analysis of observability data (metrics, logs, and traces). Whether you’re operating on the edge, in the cloud, or across hybrid environments, GreptimeDB empowers real-time insights at massive scale — all in one system. +**GreptimeDB** is an open-source, cloud-native database that unifies metrics, logs, and traces, enabling real-time observability at any scale — across edge, cloud, and hybrid environments. ## Features | Feature | Description | | --------- | ----------- | -| [Unified Observability Data](https://docs.greptime.com/user-guide/concepts/why-greptimedb) | Store metrics, logs, and traces as timestamped, contextual wide events. Query via [SQL](https://docs.greptime.com/user-guide/query-data/sql), [PromQL](https://docs.greptime.com/user-guide/query-data/promql), and [streaming](https://docs.greptime.com/user-guide/flow-computation/overview). | -| [High Performance & Cost Effective](https://docs.greptime.com/user-guide/manage-data/data-index) | Written in Rust, with a distributed query engine, [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index), and optimized columnar storage, delivering sub-second responses at PB scale. | -| [Cloud-Native Architecture](https://docs.greptime.com/user-guide/concepts/architecture) | Designed for [Kubernetes](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management), with compute/storage separation, native object storage (AWS S3, Azure Blob, etc.) and seamless cross-cloud access. | -| [Developer-Friendly](https://docs.greptime.com/user-guide/protocols/overview) | Access via SQL/PromQL interfaces, REST API, MySQL/PostgreSQL protocols, and popular ingestion [protocols](https://docs.greptime.com/user-guide/protocols/overview). | -| [Flexible Deployment](https://docs.greptime.com/user-guide/deployments-administration/overview) | Deploy anywhere: edge (including ARM/[Android](https://docs.greptime.com/user-guide/deployments-administration/run-on-android)) or cloud, with unified APIs and efficient data sync. | +| [All-in-One Observability](https://docs.greptime.com/user-guide/concepts/why-greptimedb) | OpenTelemetry-native platform unifying metrics, logs, and traces. Query via [SQL](https://docs.greptime.com/user-guide/query-data/sql), [PromQL](https://docs.greptime.com/user-guide/query-data/promql), and [Flow](https://docs.greptime.com/user-guide/flow-computation/overview). | +| [High Performance](https://docs.greptime.com/user-guide/manage-data/data-index) | Written in Rust with [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index) (inverted, fulltext, skipping, vector), delivering sub-second responses at PB scale. | +| [Cost Efficiency](https://docs.greptime.com/user-guide/concepts/architecture) | 50x lower operational and storage costs with compute-storage separation and native object storage (S3, Azure Blob, etc.). | +| [Cloud-Native & Scalable](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management) | Purpose-built for [Kubernetes](https://docs.greptime.com/user-guide/deployments-administration/deploy-on-kubernetes/greptimedb-operator-management) with unlimited cross-cloud scaling, handling hundreds of thousands of concurrent requests. 
| +| [Developer-Friendly](https://docs.greptime.com/user-guide/protocols/overview) | SQL/PromQL interfaces, built-in web dashboard, REST API, MySQL/PostgreSQL protocol compatibility, and native [OpenTelemetry](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) support. | +| [Flexible Deployment](https://docs.greptime.com/user-guide/deployments-administration/overview) | Deploy anywhere from ARM-based edge devices (including [Android](https://docs.greptime.com/user-guide/deployments-administration/run-on-android)) to cloud, with unified APIs and efficient data sync. | + + ✅ **Perfect for:** + - Unified observability stack replacing Prometheus + Loki + Tempo + - Large-scale metrics with high cardinality (millions to billions of time series) + - Large-scale observability platform requiring cost efficiency and scalability + - IoT and edge computing with resource and bandwidth constraints Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb) and [Observability 2.0 and the Database for It](https://greptime.com/blogs/2025-04-25-greptimedb-observability2-new-database). @@ -85,10 +92,10 @@ Learn more in [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why | Feature | GreptimeDB | Traditional TSDB | Log Stores | |----------------------------------|-----------------------|--------------------|-----------------| | Data Types | Metrics, Logs, Traces | Metrics only | Logs only | -| Query Language | SQL, PromQL, Streaming| Custom/PromQL | Custom/DSL | +| Query Language | SQL, PromQL | Custom/PromQL | Custom/DSL | | Deployment | Edge + Cloud | Cloud/On-prem | Mostly central | | Indexing & Performance | PB-Scale, Sub-second | Varies | Varies | -| Integration | REST, SQL, Common protocols | Varies | Varies | +| Integration | REST API, SQL, Common protocols | Varies | Varies | **Performance:** * [GreptimeDB tops JSONBench's billion-record cold run test!](https://greptime.com/blogs/2025-03-18-jsonbench-greptimedb-performance) @@ -98,8 +105,14 @@ Read [more benchmark reports](https://docs.greptime.com/user-guide/concepts/feat ## Architecture -* Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. -* [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB: +GreptimeDB can run in two modes: +* **Standalone Mode** - Single binary for development and small deployments +* **Distributed Mode** - Separate components for production scale: + - Frontend: Query processing and protocol handling + - Datanode: Data storage and retrieval + - Metasrv: Metadata management and coordination + +Read the [architecture](https://docs.greptime.com/contributor-guide/overview/#architecture) document. [DeepWiki](https://deepwiki.com/GreptimeTeam/greptimedb/1-overview) provides an in-depth look at GreptimeDB: GreptimeDB System Overview ## Try GreptimeDB @@ -119,7 +132,8 @@ docker run -p 127.0.0.1:4000-4003:4000-4003 \ --postgres-addr 0.0.0.0:4003 ``` Dashboard: [http://localhost:4000/dashboard](http://localhost:4000/dashboard) -[Full Install Guide](https://docs.greptime.com/getting-started/installation/overview) + +Read more in the [full Install Guide](https://docs.greptime.com/getting-started/installation/overview). **Troubleshooting:** * Cannot connect to the database? Ensure that ports `4000`, `4001`, `4002`, and `4003` are not blocked by a firewall or used by other services. 
@@ -148,21 +162,26 @@ cargo run -- standalone start ## Tools & Extensions -- **Kubernetes:** [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator) -- **Helm Charts:** [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts) -- **Dashboard:** [Web UI](https://github.com/GreptimeTeam/dashboard) -- **SDKs/Ingester:** [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust), [JS](https://github.com/GreptimeTeam/greptimedb-ingester-js) -- **Grafana**: [Official Dashboard](https://github.com/GreptimeTeam/greptimedb/blob/main/grafana/README.md) +- **Kubernetes**: [GreptimeDB Operator](https://github.com/GrepTimeTeam/greptimedb-operator) +- **Helm Charts**: [Greptime Helm Charts](https://github.com/GreptimeTeam/helm-charts) +- **Dashboard**: [Web UI](https://github.com/GreptimeTeam/dashboard) +- **gRPC Ingester**: [Go](https://github.com/GreptimeTeam/greptimedb-ingester-go), [Java](https://github.com/GreptimeTeam/greptimedb-ingester-java), [C++](https://github.com/GreptimeTeam/greptimedb-ingester-cpp), [Erlang](https://github.com/GreptimeTeam/greptimedb-ingester-erl), [Rust](https://github.com/GreptimeTeam/greptimedb-ingester-rust) +- **Grafana Data Source**: [GreptimeDB Grafana data source plugin](https://github.com/GreptimeTeam/greptimedb-grafana-datasource) +- **Grafana Dashboard**: [Official Dashboard for monitoring](https://github.com/GreptimeTeam/greptimedb/blob/main/grafana/README.md) ## Project Status -> **Status:** Beta. -> **GA (v1.0):** Targeted for mid 2025. +> **Status:** Beta — marching toward v1.0 GA! +> **GA (v1.0):** January 10, 2026 -- Being used in production by early adopters +- Deployed in production by open-source projects and commercial users - Stable, actively maintained, with regular releases ([version info](https://docs.greptime.com/nightly/reference/about-greptimedb-version)) - Suitable for evaluation and pilot deployments +GreptimeDB v1.0 represents a major milestone toward maturity — marking stable APIs, production readiness, and proven performance. + +**Roadmap:** Beta1 (Nov 10) → Beta2 (Nov 24) → RC1 (Dec 8) → GA (Jan 10, 2026), please read [v1.0 highlights and release plan](https://greptime.com/blogs/2025-11-05-greptimedb-v1-highlights) for details. + For production use, we recommend using the latest stable release. [![Star History Chart](https://api.star-history.com/svg?repos=GreptimeTeam/GreptimeDB&type=Date)](https://www.star-history.com/#GreptimeTeam/GreptimeDB&Date) @@ -203,5 +222,5 @@ Special thanks to all contributors! See [AUTHORS.md](https://github.com/Greptime - Uses [Apache Arrow™](https://arrow.apache.org/) (memory model) - [Apache Parquet™](https://parquet.apache.org/) (file storage) -- [Apache Arrow DataFusion™](https://arrow.apache.org/datafusion/) (query engine) +- [Apache DataFusion™](https://arrow.apache.org/datafusion/) (query engine) - [Apache OpenDAL™](https://opendal.apache.org/) (data access abstraction) diff --git a/config/config.md b/config/config.md index 58c491b4ad..e61c48c43f 100644 --- a/config/config.md +++ b/config/config.md @@ -108,9 +108,6 @@ | `storage` | -- | -- | The data storage options. | | `storage.data_home` | String | `./greptimedb_data` | The working home directory. 
| | `storage.type` | String | `File` | The storage type used to store the data.
- `File`: the data is stored in the local file system.
- `S3`: the data is stored in the S3 object storage.
- `Gcs`: the data is stored in the Google Cloud Storage.
- `Azblob`: the data is stored in the Azure Blob Storage.
- `Oss`: the data is stored in the Aliyun OSS. | -| `storage.enable_read_cache` | Bool | `true` | Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. | -| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.
A local file directory, defaults to `{data_home}`. An empty string means disabling. | -| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. | | `storage.bucket` | String | Unset | The S3 bucket name.
**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. | | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. | | `storage.access_key_id` | String | Unset | The access key id of the aws account.
It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
**It's only used when the storage type is `S3` and `Oss`**. | @@ -141,6 +138,8 @@ | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). | | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). | | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). | +| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. | +| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.
Options: "wait" (default, 10s), "wait()", "fail" | | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. | | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. | | `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size`. | @@ -154,6 +153,8 @@ | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. | | `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).
When enabled, index files are loaded into the write cache during region initialization,
which can improve query performance at the cost of longer startup times. | | `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).
The remaining capacity is used for data (parquet) files.
Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
1GiB is reserved for index files and 4GiB for data files. | +| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).
When disabled, cache refilling on read won't happen. | +| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | @@ -210,14 +211,6 @@ | `slow_query.record_type` | String | Unset | The record type of slow queries. It can be `system_table` or `log`. | | `slow_query.threshold` | String | Unset | The threshold of slow query. | | `slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. | -| `export_metrics` | -- | -- | The standalone can export its metrics and send to Prometheus compatible service (e.g. `greptimedb`) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.self_import` | -- | -- | For `standalone` mode, `self_import` is recommended to collect metrics generated by itself
You must create the database before enabling it. | -| `export_metrics.self_import.db` | String | Unset | -- | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -302,7 +295,6 @@ | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | -| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | @@ -335,12 +327,6 @@ | `slow_query.threshold` | String | `30s` | The threshold of slow query. It can be human readable time string, for example: `10s`, `100ms`, `1s`. | | `slow_query.sample_ratio` | Float | `1.0` | The sampling ratio of slow query log. The value should be in the range of (0, 1]. For example, `0.1` means 10% of the slow queries will be logged and `1.0` means all slow queries will be logged. | | `slow_query.ttl` | String | `90d` | The TTL of the `slow_queries` system table. Default is `90d` when `record_type` is `system_table`. | -| `export_metrics` | -- | -- | The frontend can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -354,7 +340,7 @@ | Key | Type | Default | Descriptions | | --- | -----| ------- | ----------- | | `data_home` | String | `./greptimedb_data` | The working home directory. | -| `store_addrs` | Array | -- | Store server address default to etcd store.
For postgres store, the format is:
"password=password dbname=postgres user=postgres host=localhost port=5432"
For etcd store, the format is:
"127.0.0.1:2379" | +| `store_addrs` | Array | -- | Store server address(es). The format depends on the selected backend.

For etcd: a list of "host:port" endpoints.
e.g. ["192.168.1.1:2379", "192.168.1.2:2379"]

For PostgreSQL: a connection string in libpq format or URI.
e.g.
- "host=localhost port=5432 user=postgres password= dbname=postgres"
- "postgresql://user:password@localhost:5432/mydb?connect_timeout=10"
For details, see https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html

For MySQL: a connection URL.
e.g. "mysql://user:password@localhost:3306/greptime_meta?ssl-mode=VERIFY_CA&ssl-ca=/path/to/ca.pem" | | `store_key_prefix` | String | `""` | If it's not empty, the metasrv will store all data with this key prefix. | | `backend` | String | `etcd_store` | The datastore for meta server.
Available values:
- `etcd_store` (default value)
- `memory_store`
- `postgres_store`
- `mysql_store` | | `meta_table_name` | String | `greptime_metakv` | Table name in RDS to store metadata. Effective when using an RDS kvbackend.
**Only used when backend is `postgres_store`.** | @@ -370,12 +356,11 @@ | `runtime` | -- | -- | The runtime options. | | `runtime.global_rt_size` | Integer | `8` | The number of threads to execute the runtime for global read operations. | | `runtime.compact_rt_size` | Integer | `4` | The number of threads to execute the runtime for global write operations. | -| `backend_tls` | -- | -- | TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends)
When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here | +| `backend_tls` | -- | -- | TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends)
When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here

Note: if TLS is configured in both this section and the `store_addrs` connection string, the
settings here will override the TLS settings in `store_addrs`. | | `backend_tls.mode` | String | `prefer` | TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html
- "disable" - No TLS
- "prefer" (default) - Try TLS, fallback to plain
- "require" - Require TLS
- "verify_ca" - Require TLS and verify CA
- "verify_full" - Require TLS and verify hostname | | `backend_tls.cert_path` | String | `""` | Path to client certificate file (for client authentication)
Like "/path/to/client.crt" | | `backend_tls.key_path` | String | `""` | Path to client private key file (for client authentication)
Like "/path/to/client.key" | | `backend_tls.ca_cert_path` | String | `""` | Path to CA certificate file (for server certificate verification)
Required when using custom CAs or self-signed certificates
Leave empty to use system root certificates only
Like "/path/to/ca.crt" | -| `backend_tls.watch` | Bool | `false` | Watch for certificate file changes and auto reload | | `grpc` | -- | -- | The gRPC server options. | | `grpc.bind_addr` | String | `127.0.0.1:3002` | The address to bind the gRPC server. | | `grpc.server_addr` | String | `127.0.0.1:3002` | The communication server address for the frontend and datanode to connect to metasrv.
If left empty or unset, the server will automatically use the IP address of the first network interface
on the host, with the same port number as the one specified in `bind_addr`. | @@ -430,12 +415,6 @@ | `logging.otlp_headers` | -- | -- | Additional OTLP headers, only valid when using OTLP http | | `logging.tracing_sample_ratio` | -- | Unset | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | -| `export_metrics` | -- | -- | The metasrv can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -478,7 +457,6 @@ | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | -| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | @@ -509,9 +487,6 @@ | `storage` | -- | -- | The data storage options. | | `storage.data_home` | String | `./greptimedb_data` | The working home directory. | | `storage.type` | String | `File` | The storage type used to store the data.
- `File`: the data is stored in the local file system.
- `S3`: the data is stored in the S3 object storage.
- `Gcs`: the data is stored in the Google Cloud Storage.
- `Azblob`: the data is stored in the Azure Blob Storage.
- `Oss`: the data is stored in the Aliyun OSS. | -| `storage.cache_path` | String | Unset | Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance.
A local file directory, defaults to `{data_home}`. An empty string means disabling. | -| `storage.enable_read_cache` | Bool | `true` | Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. | -| `storage.cache_capacity` | String | Unset | The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. | | `storage.bucket` | String | Unset | The S3 bucket name.
**It's only used when the storage type is `S3`, `Oss` and `Gcs`**. | | `storage.root` | String | Unset | The S3 data will be stored in the specified prefix, for example, `s3://${bucket}/${root}`.
**It's only used when the storage type is `S3`, `Oss` and `Azblob`**. | | `storage.access_key_id` | String | Unset | The access key id of the aws account.
It's **highly recommended** to use AWS IAM roles instead of hardcoding the access key id and secret key.
**It's only used when the storage type is `S3` and `Oss`**. | @@ -544,6 +519,8 @@ | `region_engine.mito.max_background_flushes` | Integer | Auto | Max number of running background flush jobs (default: 1/2 of cpu cores). | | `region_engine.mito.max_background_compactions` | Integer | Auto | Max number of running background compaction jobs (default: 1/4 of cpu cores). | | `region_engine.mito.max_background_purges` | Integer | Auto | Max number of running background purge jobs (default: number of cpu cores). | +| `region_engine.mito.experimental_compaction_memory_limit` | String | 0 | Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. | +| `region_engine.mito.experimental_compaction_on_exhausted` | String | wait | Behavior when compaction cannot acquire memory from the budget.
Options: "wait" (default, 10s), "wait()", "fail" | | `region_engine.mito.auto_flush_interval` | String | `1h` | Interval to auto flush a region if it has not flushed yet. | | `region_engine.mito.global_write_buffer_size` | String | Auto | Global write buffer size for all regions. If not set, it's default to 1/8 of OS memory with a max limitation of 1GB. | | `region_engine.mito.global_write_buffer_reject_size` | String | Auto | Global write buffer size threshold to reject write requests. If not set, it's default to 2 times of `global_write_buffer_size` | @@ -557,6 +534,8 @@ | `region_engine.mito.write_cache_ttl` | String | Unset | TTL for write cache. | | `region_engine.mito.preload_index_cache` | Bool | `true` | Preload index (puffin) files into cache on region open (default: true).
When enabled, index files are loaded into the write cache during region initialization,
which can improve query performance at the cost of longer startup times. | | `region_engine.mito.index_cache_percent` | Integer | `20` | Percentage of write cache capacity allocated for index (puffin) files (default: 20).
The remaining capacity is used for data (parquet) files.
Must be between 0 and 100 (exclusive). For example, with a 5GiB write cache and 20% allocation,
1GiB is reserved for index files and 4GiB for data files. | +| `region_engine.mito.enable_refill_cache_on_read` | Bool | `true` | Enable refilling cache on read operations (default: true).
When disabled, cache refilling on read won't happen. | +| `region_engine.mito.manifest_cache_size` | String | `256MB` | Capacity for manifest cache (default: 256MB). | | `region_engine.mito.sst_write_buffer_size` | String | `8MB` | Buffer size for SST writing. | | `region_engine.mito.parallel_scan_channel_size` | Integer | `32` | Capacity of the channel to send data from parallel scan tasks to the main task. | | `region_engine.mito.max_concurrent_scan_files` | Integer | `384` | Maximum number of SST files to scan concurrently. | @@ -608,12 +587,6 @@ | `logging.otlp_headers` | -- | -- | Additional OTLP headers, only valid when using OTLP http | | `logging.tracing_sample_ratio` | -- | Unset | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 | | `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- | -| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. | -| `export_metrics.enable` | Bool | `false` | whether enable export metrics. | -| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. | -| `export_metrics.remote_write` | -- | -- | -- | -| `export_metrics.remote_write.url` | String | `""` | The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. | -| `export_metrics.remote_write.headers` | InlineTable | -- | HTTP headers of Prometheus remote-write carry. | | `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. | | `tracing.tokio_console_addr` | String | Unset | The tokio console address. | | `memory` | -- | -- | The memory options. | @@ -656,7 +629,6 @@ | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | -| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index dda926e1cb..47b6cc8cec 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -99,9 +99,6 @@ metasrv_addrs = ["127.0.0.1:3002"] ## Operation timeout. timeout = "3s" -## Heartbeat timeout. -heartbeat_timeout = "500ms" - ## DDL timeout. ddl_timeout = "10s" @@ -284,18 +281,6 @@ data_home = "./greptimedb_data" ## - `Oss`: the data is stored in the Aliyun OSS. type = "File" -## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance. -## A local file directory, defaults to `{data_home}`. An empty string means disabling. -## @toml2docs:none-default -#+ cache_path = "" - -## Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. -#+ enable_read_cache = true - -## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. -## @toml2docs:none-default -cache_capacity = "5GiB" - ## The S3 bucket name. ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**. ## @toml2docs:none-default @@ -455,6 +440,15 @@ compress_manifest = false ## @toml2docs:none-default="Auto" #+ max_background_purges = 8 +## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. +## @toml2docs:none-default="0" +#+ experimental_compaction_memory_limit = "0" + +## Behavior when compaction cannot acquire memory from the budget. +## Options: "wait" (default, 10s), "wait()", "fail" +## @toml2docs:none-default="wait" +#+ experimental_compaction_on_exhausted = "wait" + ## Interval to auto flush a region if it has not flushed yet. auto_flush_interval = "1h" @@ -510,6 +504,13 @@ preload_index_cache = true ## 1GiB is reserved for index files and 4GiB for data files. index_cache_percent = 20 +## Enable refilling cache on read operations (default: true). +## When disabled, cache refilling on read won't happen. 
+enable_refill_cache_on_read = true + +## Capacity for manifest cache (default: 256MB). +manifest_cache_size = "256MB" + ## Buffer size for SST writing. sst_write_buffer_size = "8MB" @@ -712,21 +713,6 @@ otlp_export_protocol = "http" [logging.tracing_sample_ratio] default_ratio = 1.0 -## The datanode can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/config/flownode.example.toml b/config/flownode.example.toml index 4e44c1ecbb..b13acfc447 100644 --- a/config/flownode.example.toml +++ b/config/flownode.example.toml @@ -78,9 +78,6 @@ metasrv_addrs = ["127.0.0.1:3002"] ## Operation timeout. timeout = "3s" -## Heartbeat timeout. -heartbeat_timeout = "500ms" - ## DDL timeout. ddl_timeout = "10s" diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 04d763c18f..701cb0b087 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -226,9 +226,6 @@ metasrv_addrs = ["127.0.0.1:3002"] ## Operation timeout. timeout = "3s" -## Heartbeat timeout. -heartbeat_timeout = "500ms" - ## DDL timeout. ddl_timeout = "10s" @@ -329,21 +326,6 @@ sample_ratio = 1.0 ## The TTL of the `slow_queries` system table. Default is `90d` when `record_type` is `system_table`. ttl = "90d" -## The frontend can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index d7d5ace99c..7997383a52 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -1,11 +1,19 @@ ## The working home directory. data_home = "./greptimedb_data" -## Store server address default to etcd store. -## For postgres store, the format is: -## "password=password dbname=postgres user=postgres host=localhost port=5432" -## For etcd store, the format is: -## "127.0.0.1:2379" +## Store server address(es). The format depends on the selected backend. +## +## For etcd: a list of "host:port" endpoints. +## e.g. ["192.168.1.1:2379", "192.168.1.2:2379"] +## +## For PostgreSQL: a connection string in libpq format or URI. +## e.g. 
+## - "host=localhost port=5432 user=postgres password= dbname=postgres" +## - "postgresql://user:password@localhost:5432/mydb?connect_timeout=10" +## The detail see: https://docs.rs/tokio-postgres/latest/tokio_postgres/config/struct.Config.html +## +## For mysql store, the format is a MySQL connection URL. +## e.g. "mysql://user:password@localhost:3306/greptime_meta?ssl-mode=VERIFY_CA&ssl-ca=/path/to/ca.pem" store_addrs = ["127.0.0.1:2379"] ## If it's not empty, the metasrv will store all data with this key prefix. @@ -75,6 +83,9 @@ node_max_idle_time = "24hours" ## TLS configuration for kv store backend (applicable for etcd, PostgreSQL, and MySQL backends) ## When using etcd, PostgreSQL, or MySQL as metadata store, you can configure TLS here +## +## Note: if TLS is configured in both this section and the `store_addrs` connection string, the +## settings here will override the TLS settings in `store_addrs`. [backend_tls] ## TLS mode, refer to https://www.postgresql.org/docs/current/libpq-ssl.html ## - "disable" - No TLS @@ -98,9 +109,6 @@ key_path = "" ## Like "/path/to/ca.crt" ca_cert_path = "" -## Watch for certificate file changes and auto reload -watch = false - ## The gRPC server options. [grpc] ## The address to bind the gRPC server. @@ -323,21 +331,6 @@ otlp_export_protocol = "http" [logging.tracing_sample_ratio] default_ratio = 1.0 -## The metasrv can export its metrics and send to Prometheus compatible service (e.g. `greptimedb` itself) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 70e6e0888f..47be8f6334 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -388,18 +388,6 @@ data_home = "./greptimedb_data" ## - `Oss`: the data is stored in the Aliyun OSS. type = "File" -## Whether to enable read cache. If not set, the read cache will be enabled by default when using object storage. -#+ enable_read_cache = true - -## Read cache configuration for object storage such as 'S3' etc, it's configured by default when using object storage. It is recommended to configure it when using object storage for better performance. -## A local file directory, defaults to `{data_home}`. An empty string means disabling. -## @toml2docs:none-default -#+ cache_path = "" - -## The local file cache capacity in bytes. If your disk space is sufficient, it is recommended to set it larger. -## @toml2docs:none-default -cache_capacity = "5GiB" - ## The S3 bucket name. ## **It's only used when the storage type is `S3`, `Oss` and `Gcs`**. ## @toml2docs:none-default @@ -546,6 +534,15 @@ compress_manifest = false ## @toml2docs:none-default="Auto" #+ max_background_purges = 8 +## Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. 
+## @toml2docs:none-default="0" +#+ experimental_compaction_memory_limit = "0" + +## Behavior when compaction cannot acquire memory from the budget. +## Options: "wait" (default, 10s), "wait()", "fail" +## @toml2docs:none-default="wait" +#+ experimental_compaction_on_exhausted = "wait" + ## Interval to auto flush a region if it has not flushed yet. auto_flush_interval = "1h" @@ -601,6 +598,13 @@ preload_index_cache = true ## 1GiB is reserved for index files and 4GiB for data files. index_cache_percent = 20 +## Enable refilling cache on read operations (default: true). +## When disabled, cache refilling on read won't happen. +enable_refill_cache_on_read = true + +## Capacity for manifest cache (default: 256MB). +manifest_cache_size = "256MB" + ## Buffer size for SST writing. sst_write_buffer_size = "8MB" @@ -820,27 +824,6 @@ default_ratio = 1.0 ## @toml2docs:none-default #+ sample_ratio = 1.0 -## The standalone can export its metrics and send to Prometheus compatible service (e.g. `greptimedb`) from remote-write API. -## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. -[export_metrics] -## whether enable export metrics. -enable = false -## The interval of export metrics. -write_interval = "30s" - -## For `standalone` mode, `self_import` is recommended to collect metrics generated by itself -## You must create the database before enabling it. -[export_metrics.self_import] -## @toml2docs:none-default -db = "greptime_metrics" - -[export_metrics.remote_write] -## The prometheus remote write endpoint that the metrics send to. The url example can be: `http://127.0.0.1:4000/v1/prometheus/write?db=greptime_metrics`. -url = "" - -## HTTP headers of Prometheus remote-write carry. -headers = { } - ## The tracing options. Only effect when compiled with `tokio-console` feature. #+ [tracing] ## The tokio console address. diff --git a/docker/buildx/centos/Dockerfile b/docker/buildx/centos/Dockerfile index b7e822fac6..f5bbd15ad6 100644 --- a/docker/buildx/centos/Dockerfile +++ b/docker/buildx/centos/Dockerfile @@ -1,10 +1,10 @@ -FROM centos:7 as builder +FROM centos:7 AS builder ARG CARGO_PROFILE ARG FEATURES ARG OUTPUT_DIR -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 WORKDIR /greptimedb # Install dependencies @@ -22,7 +22,7 @@ RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ # Install Rust SHELL ["/bin/bash", "-c"] RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y -ENV PATH /usr/local/bin:/root/.cargo/bin/:$PATH +ENV PATH=/usr/local/bin:/root/.cargo/bin/:$PATH # Build the project in release mode. RUN --mount=target=.,rw \ @@ -33,7 +33,7 @@ RUN --mount=target=.,rw \ TARGET_DIR=/out/target # Export the binary to the clean image. 
-FROM centos:7 as base +FROM centos:7 AS base ARG OUTPUT_DIR @@ -45,7 +45,7 @@ RUN yum install -y epel-release \ WORKDIR /greptime COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV MALLOC_CONF="prof:true,prof_active:false" diff --git a/docker/buildx/distroless/Dockerfile b/docker/buildx/distroless/Dockerfile new file mode 100644 index 0000000000..b0f3af33e8 --- /dev/null +++ b/docker/buildx/distroless/Dockerfile @@ -0,0 +1,65 @@ +FROM ubuntu:22.04 AS builder + +ARG CARGO_PROFILE +ARG FEATURES +ARG OUTPUT_DIR + +ENV LANG=en_US.utf8 +WORKDIR /greptimedb + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y software-properties-common + +# Install dependencies. +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update && apt-get install -y \ + libssl-dev \ + protobuf-compiler \ + curl \ + git \ + build-essential \ + pkg-config + +# Install Rust. +SHELL ["/bin/bash", "-c"] +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y +ENV PATH=/root/.cargo/bin/:$PATH + +# Build the project in release mode. +RUN --mount=target=. \ + --mount=type=cache,target=/root/.cargo/registry \ + make build \ + CARGO_PROFILE=${CARGO_PROFILE} \ + FEATURES=${FEATURES} \ + TARGET_DIR=/out/target + +FROM ubuntu:22.04 AS libs + +ARG TARGETARCH + +# Copy required library dependencies based on architecture +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + cp /lib/x86_64-linux-gnu/libz.so.1.2.11 /lib/x86_64-linux-gnu/libz.so.1; \ + elif [ "$TARGETARCH" = "arm64" ]; then \ + cp /lib/aarch64-linux-gnu/libz.so.1.2.11 /lib/aarch64-linux-gnu/libz.so.1; \ + else \ + echo "Unsupported architecture: $TARGETARCH" && exit 1; \ + fi + +# Export the binary to the clean distroless image. +FROM gcr.io/distroless/cc-debian12:latest AS base + +ARG OUTPUT_DIR +ARG TARGETARCH + +# Copy required library dependencies +COPY --from=libs /lib /lib +COPY --from=busybox:stable /bin/busybox /bin/busybox + +WORKDIR /greptime +COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/greptime +ENV PATH=/greptime/bin/:$PATH + +ENV MALLOC_CONF="prof:true,prof_active:false" + +ENTRYPOINT ["greptime"] diff --git a/docker/buildx/ubuntu/Dockerfile b/docker/buildx/ubuntu/Dockerfile index 6306e04688..b6dc386da4 100644 --- a/docker/buildx/ubuntu/Dockerfile +++ b/docker/buildx/ubuntu/Dockerfile @@ -1,10 +1,10 @@ -FROM ubuntu:22.04 as builder +FROM ubuntu:22.04 AS builder ARG CARGO_PROFILE ARG FEATURES ARG OUTPUT_DIR -ENV LANG en_US.utf8 +ENV LANG=en_US.utf8 WORKDIR /greptimedb RUN apt-get update && \ @@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/var/cache/apt \ # Install Rust. SHELL ["/bin/bash", "-c"] RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --no-modify-path --default-toolchain none -y -ENV PATH /root/.cargo/bin/:$PATH +ENV PATH=/root/.cargo/bin/:$PATH # Build the project in release mode. RUN --mount=target=. \ @@ -35,7 +35,7 @@ RUN --mount=target=. \ # Export the binary to the clean image. # TODO(zyy17): Maybe should use the more secure container image. 
-FROM ubuntu:22.04 as base +FROM ubuntu:22.04 AS base ARG OUTPUT_DIR @@ -45,7 +45,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get \ WORKDIR /greptime COPY --from=builder /out/target/${OUTPUT_DIR}/greptime /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV MALLOC_CONF="prof:true,prof_active:false" diff --git a/docker/ci/centos/Dockerfile b/docker/ci/centos/Dockerfile index 480f2196b2..67efadd7dc 100644 --- a/docker/ci/centos/Dockerfile +++ b/docker/ci/centos/Dockerfile @@ -13,7 +13,7 @@ ARG TARGETARCH ADD $TARGETARCH/greptime /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV MALLOC_CONF="prof:true,prof_active:false" diff --git a/docker/ci/distroless/Dockerfile b/docker/ci/distroless/Dockerfile new file mode 100644 index 0000000000..f5e7ebd88e --- /dev/null +++ b/docker/ci/distroless/Dockerfile @@ -0,0 +1,40 @@ +FROM ubuntu:22.04 AS libs + +ARG TARGETARCH + +# Copy required library dependencies based on architecture +# TARGETARCH values: amd64, arm64 +# Ubuntu library paths: x86_64-linux-gnu, aarch64-linux-gnu +RUN if [ "$TARGETARCH" = "amd64" ]; then \ + mkdir -p /output/x86_64-linux-gnu && \ + cp /lib/x86_64-linux-gnu/libz.so.1.2.11 /output/x86_64-linux-gnu/libz.so.1; \ + elif [ "$TARGETARCH" = "arm64" ]; then \ + mkdir -p /output/aarch64-linux-gnu && \ + cp /lib/aarch64-linux-gnu/libz.so.1.2.11 /output/aarch64-linux-gnu/libz.so.1; \ + else \ + echo "Unsupported architecture: $TARGETARCH" && exit 1; \ + fi + +FROM gcr.io/distroless/cc-debian12:latest + +# The root path under which contains all the dependencies to build this Dockerfile. +ARG DOCKER_BUILD_ROOT=. +# The binary name of GreptimeDB executable. +# Defaults to "greptime", but sometimes in other projects it might be different. +ARG TARGET_BIN=greptime + +ARG TARGETARCH + +# Copy required library dependencies +COPY --from=libs /output /lib +COPY --from=busybox:stable /bin/busybox /bin/busybox + +ADD $TARGETARCH/$TARGET_BIN /greptime/bin/ + +ENV PATH=/greptime/bin/:$PATH + +ENV TARGET_BIN=$TARGET_BIN + +ENV MALLOC_CONF="prof:true,prof_active:false" + +ENTRYPOINT ["greptime"] diff --git a/docker/ci/ubuntu/Dockerfile b/docker/ci/ubuntu/Dockerfile index 046fd62972..c1a88e02c8 100644 --- a/docker/ci/ubuntu/Dockerfile +++ b/docker/ci/ubuntu/Dockerfile @@ -14,7 +14,7 @@ ARG TARGETARCH ADD $TARGETARCH/$TARGET_BIN /greptime/bin/ -ENV PATH /greptime/bin/:$PATH +ENV PATH=/greptime/bin/:$PATH ENV TARGET_BIN=$TARGET_BIN diff --git a/docs/how-to/how-to-change-log-level-on-the-fly.md b/docs/how-to/how-to-change-log-level-on-the-fly.md index 16a72bf6ae..c3bf2602a2 100644 --- a/docs/how-to/how-to-change-log-level-on-the-fly.md +++ b/docs/how-to/how-to-change-log-level-on-the-fly.md @@ -13,4 +13,19 @@ Log Level changed from Some("info") to "trace,flow=debug"% The data is a string in the format of `global_level,module1=level1,module2=level2,...` that follows the same rule of `RUST_LOG`. -The module is the module name of the log, and the level is the log level. The log level can be one of the following: `trace`, `debug`, `info`, `warn`, `error`, `off`(case insensitive). \ No newline at end of file +The module is the module name of the log, and the level is the log level. The log level can be one of the following: `trace`, `debug`, `info`, `warn`, `error`, `off`(case insensitive). 
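For instance, to keep the global level at `info` while enabling `debug` logging for the `flow` module, a request like the following can be sent. This is a minimal sketch: the `/debug/log_level` path and port `4000` are assumptions here, so use the endpoint documented earlier in this guide if it differs:

```bash
# Set the global level to info and the flow module to debug (endpoint path assumed).
curl --data "info,flow=debug" 127.0.0.1:4000/debug/log_level
```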
+ +# Enable/Disable Trace on the Fly + +## HTTP API + +example: +```bash +curl --data "true" 127.0.0.1:4000/debug/enable_trace +``` +And database will reply with something like: +``` +trace enabled% +``` + +Possible values are "true" or "false". diff --git a/docs/rfcs/2025-07-23-global-gc-worker.md b/docs/rfcs/2025-07-23-global-gc-worker.md index 69d1e3ac34..331ed01f38 100644 --- a/docs/rfcs/2025-07-23-global-gc-worker.md +++ b/docs/rfcs/2025-07-23-global-gc-worker.md @@ -106,6 +106,37 @@ This mechanism may be too complex to implement at once. We can consider a two-ph Also the read replica shouldn't be later in manifest version for more than the lingering time of obsolete files, otherwise it might ref to files that are already deleted by the GC worker. - need to upload tmp manifest to object storage, which may introduce additional complexity and potential performance overhead. But since long-running queries are typically not frequent, the performance impact is expected to be minimal. +one potential race condition with region-migration is illustrated below: + +```mermaid +sequenceDiagram + participant gc_worker as GC Worker(same dn as region 1) + participant region1 as Region 1 (Leader → Follower) + participant region2 as Region 2 (Follower → Leader) + participant region_dir as Region Directory + + gc_worker->>region1: Start GC, get region manifest + activate region1 + region1-->>gc_worker: Region 1 manifest + deactivate region1 + gc_worker->>region_dir: Scan region directory + + Note over region1,region2: Region Migration Occurs + region1-->>region2: Downgrade to Follower + region2-->>region1: Becomes Leader + + region2->>region_dir: Add new file + + gc_worker->>region_dir: Continue scanning + gc_worker-->>region_dir: Discovers new file + Note over gc_worker: New file not in Region 1's manifest + gc_worker->>gc_worker: Mark file as orphan(incorrectly) +``` +which could cause gc worker to incorrectly mark the new file as orphan and delete it, if config the lingering time for orphan files(files not mentioned anywhere(in used or unused)) is not long enough. + +A good enough solution could be to use lock to prevent gc worker to happen on the region if region migration is happening on the region, and vise versa. + +The race condition between gc worker and repartition also needs to be considered carefully. For now, acquiring lock for both region-migration and repartition during gc worker process could be a simple solution. 
## Conclusion and Rationale diff --git a/flake.lock b/flake.lock index e410fe9785..0bc3e6f283 100644 --- a/flake.lock +++ b/flake.lock @@ -8,11 +8,11 @@ "rust-analyzer-src": "rust-analyzer-src" }, "locked": { - "lastModified": 1760078406, - "narHash": "sha256-JeJK0ZA845PtkCHkfo4KjeI1mYrsr2s3cxBYKhF4BoE=", + "lastModified": 1765252472, + "narHash": "sha256-byMt/uMi7DJ8tRniFopDFZMO3leSjGp6GS4zWOFT+uQ=", "owner": "nix-community", "repo": "fenix", - "rev": "351277c60d104944122ee389cdf581c5ce2c6732", + "rev": "8456b985f6652e3eef0632ee9992b439735c5544", "type": "github" }, "original": { @@ -41,16 +41,16 @@ }, "nixpkgs": { "locked": { - "lastModified": 1759994382, - "narHash": "sha256-wSK+3UkalDZRVHGCRikZ//CyZUJWDJkBDTQX1+G77Ow=", + "lastModified": 1764983851, + "narHash": "sha256-y7RPKl/jJ/KAP/VKLMghMgXTlvNIJMHKskl8/Uuar7o=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "5da4a26309e796daa7ffca72df93dbe53b8164c7", + "rev": "d9bc5c7dceb30d8d6fafa10aeb6aa8a48c218454", "type": "github" }, "original": { "owner": "NixOS", - "ref": "nixos-25.05", + "ref": "nixos-25.11", "repo": "nixpkgs", "type": "github" } @@ -65,11 +65,11 @@ "rust-analyzer-src": { "flake": false, "locked": { - "lastModified": 1760014945, - "narHash": "sha256-ySdl7F9+oeWNHVrg3QL/brazqmJvYFEdpGnF3pyoDH8=", + "lastModified": 1765120009, + "narHash": "sha256-nG76b87rkaDzibWbnB5bYDm6a52b78A+fpm+03pqYIw=", "owner": "rust-lang", "repo": "rust-analyzer", - "rev": "90d2e1ce4dfe7dc49250a8b88a0f08ffdb9cb23f", + "rev": "5e3e9c4e61bba8a5e72134b9ffefbef8f531d008", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 555e4c714e..58c10465a0 100644 --- a/flake.nix +++ b/flake.nix @@ -2,7 +2,7 @@ description = "Development environment flake"; inputs = { - nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05"; + nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11"; fenix = { url = "github:nix-community/fenix"; inputs.nixpkgs.follows = "nixpkgs"; @@ -48,7 +48,7 @@ gnuplot ## for cargo bench ]; - LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath buildInputs; + buildInputs = buildInputs; NIX_HARDENING_ENABLE = ""; }; }); diff --git a/src/api/src/helper.rs b/src/api/src/helper.rs index da5fdcfeda..b64e6d0265 100644 --- a/src/api/src/helper.rs +++ b/src/api/src/helper.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; use common_decimal::Decimal128; @@ -20,13 +20,12 @@ use common_decimal::decimal128::{DECIMAL128_DEFAULT_SCALE, DECIMAL128_MAX_PRECIS use common_time::time::Time; use common_time::timestamp::TimeUnit; use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp}; +use datatypes::json::value::{JsonNumber, JsonValue, JsonValueRef, JsonVariant}; use datatypes::prelude::{ConcreteDataType, ValueRef}; use datatypes::types::{ - IntervalType, JsonFormat, StructField, StructType, TimeType, TimestampType, -}; -use datatypes::value::{ - ListValue, ListValueRef, OrderedF32, OrderedF64, StructValue, StructValueRef, Value, + IntervalType, JsonFormat, JsonType, StructField, StructType, TimeType, TimestampType, }; +use datatypes::value::{ListValueRef, OrderedF32, OrderedF64, StructValueRef, Value}; use datatypes::vectors::VectorRef; use greptime_proto::v1::column_data_type_extension::TypeExt; use greptime_proto::v1::ddl_request::Expr; @@ -34,9 +33,9 @@ use greptime_proto::v1::greptime_request::Request; use greptime_proto::v1::query_request::Query; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ - self, ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, JsonNativeTypeExtension, - JsonTypeExtension, ListTypeExtension, QueryRequest, Row, SemanticType, StructTypeExtension, - VectorTypeExtension, + self, ColumnDataTypeExtension, DdlRequest, DecimalTypeExtension, DictionaryTypeExtension, + JsonList, JsonNativeTypeExtension, JsonObject, JsonTypeExtension, ListTypeExtension, + QueryRequest, Row, SemanticType, StructTypeExtension, VectorTypeExtension, json_value, }; use paste::paste; use snafu::prelude::*; @@ -81,6 +80,10 @@ impl ColumnDataTypeWrapper { pub fn to_parts(&self) -> (ColumnDataType, Option) { (self.datatype, self.datatype_ext.clone()) } + + pub fn into_parts(self) -> (ColumnDataType, Option) { + (self.datatype, self.datatype_ext) + } } impl From for ConcreteDataType { @@ -126,6 +129,7 @@ impl From for ConcreteDataType { }; ConcreteDataType::json_native_datatype(inner_type.into()) } + None => ConcreteDataType::Json(JsonType::null()), _ => { // invalid state, type extension is missing or invalid ConcreteDataType::null_datatype() @@ -215,6 +219,26 @@ impl From for ConcreteDataType { ConcreteDataType::null_datatype() } } + ColumnDataType::Dictionary => { + if let Some(TypeExt::DictionaryType(d)) = datatype_wrapper + .datatype_ext + .as_ref() + .and_then(|datatype_ext| datatype_ext.type_ext.as_ref()) + { + let key_type = ColumnDataTypeWrapper { + datatype: d.key_datatype(), + datatype_ext: d.key_datatype_extension.clone().map(|ext| *ext), + }; + let value_type = ColumnDataTypeWrapper { + datatype: d.value_datatype(), + datatype_ext: d.value_datatype_extension.clone().map(|ext| *ext), + }; + ConcreteDataType::dictionary_datatype(key_type.into(), value_type.into()) + } else { + // invalid state: type extension not found + ConcreteDataType::null_datatype() + } + } } } } @@ -338,13 +362,30 @@ impl ColumnDataTypeWrapper { }), } } + + pub fn dictionary_datatype( + key_type: ColumnDataTypeWrapper, + value_type: ColumnDataTypeWrapper, + ) -> Self { + ColumnDataTypeWrapper { + datatype: ColumnDataType::Dictionary, + datatype_ext: Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DictionaryType(Box::new(DictionaryTypeExtension { + key_datatype: key_type.datatype().into(), + key_datatype_extension: key_type.datatype_ext.map(Box::new), + value_datatype: 
value_type.datatype().into(), + value_datatype_extension: value_type.datatype_ext.map(Box::new), + }))), + }), + } + } } impl TryFrom for ColumnDataTypeWrapper { type Error = error::Error; fn try_from(datatype: ConcreteDataType) -> Result { - let column_datatype = match datatype { + let column_datatype = match &datatype { ConcreteDataType::Boolean(_) => ColumnDataType::Boolean, ConcreteDataType::Int8(_) => ColumnDataType::Int8, ConcreteDataType::Int16(_) => ColumnDataType::Int16, @@ -381,9 +422,8 @@ impl TryFrom for ColumnDataTypeWrapper { ConcreteDataType::Vector(_) => ColumnDataType::Vector, ConcreteDataType::List(_) => ColumnDataType::List, ConcreteDataType::Struct(_) => ColumnDataType::Struct, - ConcreteDataType::Null(_) - | ConcreteDataType::Dictionary(_) - | ConcreteDataType::Duration(_) => { + ConcreteDataType::Dictionary(_) => ColumnDataType::Dictionary, + ConcreteDataType::Null(_) | ConcreteDataType::Duration(_) => { return error::IntoColumnDataTypeSnafu { from: datatype }.fail(); } }; @@ -404,16 +444,22 @@ impl TryFrom for ColumnDataTypeWrapper { JsonFormat::Jsonb => Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), }), - JsonFormat::Native(inner) => { - let inner_type = ColumnDataTypeWrapper::try_from(*inner.clone())?; - Some(ColumnDataTypeExtension { - type_ext: Some(TypeExt::JsonNativeType(Box::new( - JsonNativeTypeExtension { - datatype: inner_type.datatype.into(), - datatype_extension: inner_type.datatype_ext.map(Box::new), - }, - ))), - }) + JsonFormat::Native(native_type) => { + if native_type.is_null() { + None + } else { + let native_type = ConcreteDataType::from(native_type.as_ref()); + let (datatype, datatype_extension) = + ColumnDataTypeWrapper::try_from(native_type)?.into_parts(); + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonNativeType(Box::new( + JsonNativeTypeExtension { + datatype: datatype as i32, + datatype_extension: datatype_extension.map(Box::new), + }, + ))), + }) + } } } } else { @@ -463,6 +509,25 @@ impl TryFrom for ColumnDataTypeWrapper { None } } + ColumnDataType::Dictionary => { + if let ConcreteDataType::Dictionary(dict_type) = &datatype { + let key_type = ColumnDataTypeWrapper::try_from(dict_type.key_type().clone())?; + let value_type = + ColumnDataTypeWrapper::try_from(dict_type.value_type().clone())?; + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DictionaryType(Box::new( + DictionaryTypeExtension { + key_datatype: key_type.datatype.into(), + key_datatype_extension: key_type.datatype_ext.map(Box::new), + value_datatype: value_type.datatype.into(), + value_datatype_extension: value_type.datatype_ext.map(Box::new), + }, + ))), + }) + } else { + None + } + } _ => None, }; Ok(Self { @@ -601,6 +666,9 @@ pub fn values_with_capacity(datatype: ColumnDataType, capacity: usize) -> Values struct_values: Vec::with_capacity(capacity), ..Default::default() }, + ColumnDataType::Dictionary => Values { + ..Default::default() + }, } } @@ -640,6 +708,7 @@ fn ddl_request_type(request: &DdlRequest) -> &'static str { Some(Expr::CreateView(_)) => "ddl.create_view", Some(Expr::DropView(_)) => "ddl.drop_view", Some(Expr::AlterDatabase(_)) => "ddl.alter_database", + Some(Expr::CommentOn(_)) => "ddl.comment_on", None => "ddl.empty", } } @@ -801,21 +870,8 @@ pub fn pb_value_to_value_ref<'a>( } ValueData::JsonValue(inner_value) => { - let json_datatype_ext = datatype_ext - .as_ref() - .and_then(|ext| { - if let Some(TypeExt::JsonNativeType(l)) = &ext.type_ext { - Some(l) - } else { - None - } - 
}) - .expect("json value must contain datatype ext"); - - ValueRef::Json(Box::new(pb_value_to_value_ref( - inner_value, - json_datatype_ext.datatype_extension.as_deref(), - ))) + let value = decode_json_value(inner_value); + ValueRef::Json(Box::new(value)) } } } @@ -839,125 +895,64 @@ pub fn is_column_type_value_eq( .unwrap_or(false) } -/// Convert value into proto's value. -pub fn to_proto_value(value: Value) -> v1::Value { - match value { - Value::Null => v1::Value { value_data: None }, - Value::Boolean(v) => v1::Value { - value_data: Some(ValueData::BoolValue(v)), - }, - Value::UInt8(v) => v1::Value { - value_data: Some(ValueData::U8Value(v.into())), - }, - Value::UInt16(v) => v1::Value { - value_data: Some(ValueData::U16Value(v.into())), - }, - Value::UInt32(v) => v1::Value { - value_data: Some(ValueData::U32Value(v)), - }, - Value::UInt64(v) => v1::Value { - value_data: Some(ValueData::U64Value(v)), - }, - Value::Int8(v) => v1::Value { - value_data: Some(ValueData::I8Value(v.into())), - }, - Value::Int16(v) => v1::Value { - value_data: Some(ValueData::I16Value(v.into())), - }, - Value::Int32(v) => v1::Value { - value_data: Some(ValueData::I32Value(v)), - }, - Value::Int64(v) => v1::Value { - value_data: Some(ValueData::I64Value(v)), - }, - Value::Float32(v) => v1::Value { - value_data: Some(ValueData::F32Value(*v)), - }, - Value::Float64(v) => v1::Value { - value_data: Some(ValueData::F64Value(*v)), - }, - Value::String(v) => v1::Value { - value_data: Some(ValueData::StringValue(v.as_utf8().to_string())), - }, - Value::Binary(v) => v1::Value { - value_data: Some(ValueData::BinaryValue(v.to_vec())), - }, - Value::Date(v) => v1::Value { - value_data: Some(ValueData::DateValue(v.val())), - }, - Value::Timestamp(v) => match v.unit() { - TimeUnit::Second => v1::Value { - value_data: Some(ValueData::TimestampSecondValue(v.value())), - }, - TimeUnit::Millisecond => v1::Value { - value_data: Some(ValueData::TimestampMillisecondValue(v.value())), - }, - TimeUnit::Microsecond => v1::Value { - value_data: Some(ValueData::TimestampMicrosecondValue(v.value())), - }, - TimeUnit::Nanosecond => v1::Value { - value_data: Some(ValueData::TimestampNanosecondValue(v.value())), - }, - }, - Value::Time(v) => match v.unit() { - TimeUnit::Second => v1::Value { - value_data: Some(ValueData::TimeSecondValue(v.value())), - }, - TimeUnit::Millisecond => v1::Value { - value_data: Some(ValueData::TimeMillisecondValue(v.value())), - }, - TimeUnit::Microsecond => v1::Value { - value_data: Some(ValueData::TimeMicrosecondValue(v.value())), - }, - TimeUnit::Nanosecond => v1::Value { - value_data: Some(ValueData::TimeNanosecondValue(v.value())), - }, - }, - Value::IntervalYearMonth(v) => v1::Value { - value_data: Some(ValueData::IntervalYearMonthValue(v.to_i32())), - }, - Value::IntervalDayTime(v) => v1::Value { - value_data: Some(ValueData::IntervalDayTimeValue(v.to_i64())), - }, - Value::IntervalMonthDayNano(v) => v1::Value { - value_data: Some(ValueData::IntervalMonthDayNanoValue( - convert_month_day_nano_to_pb(v), - )), - }, - Value::Decimal128(v) => v1::Value { - value_data: Some(ValueData::Decimal128Value(convert_to_pb_decimal128(v))), - }, - Value::List(list_value) => v1::Value { - value_data: Some(ValueData::ListValue(v1::ListValue { - items: convert_list_to_pb_values(list_value), +fn encode_json_value(value: JsonValue) -> v1::JsonValue { + fn helper(json: JsonVariant) -> v1::JsonValue { + let value = match json { + JsonVariant::Null => None, + JsonVariant::Bool(x) => Some(json_value::Value::Boolean(x)), + 
JsonVariant::Number(x) => Some(match x { + JsonNumber::PosInt(i) => json_value::Value::Uint(i), + JsonNumber::NegInt(i) => json_value::Value::Int(i), + JsonNumber::Float(f) => json_value::Value::Float(f.0), + }), + JsonVariant::String(x) => Some(json_value::Value::Str(x)), + JsonVariant::Array(x) => Some(json_value::Value::Array(JsonList { + items: x.into_iter().map(helper).collect::>(), })), - }, - Value::Struct(struct_value) => v1::Value { - value_data: Some(ValueData::StructValue(v1::StructValue { - items: convert_struct_to_pb_values(struct_value), - })), - }, - Value::Json(v) => v1::Value { - value_data: Some(ValueData::JsonValue(Box::new(to_proto_value(*v)))), - }, - Value::Duration(_) => v1::Value { value_data: None }, + JsonVariant::Object(x) => { + let entries = x + .into_iter() + .map(|(key, v)| v1::json_object::Entry { + key, + value: Some(helper(v)), + }) + .collect::>(); + Some(json_value::Value::Object(JsonObject { entries })) + } + }; + v1::JsonValue { value } } + helper(value.into_variant()) } -fn convert_list_to_pb_values(list_value: ListValue) -> Vec { - list_value - .take_items() - .into_iter() - .map(to_proto_value) - .collect() -} - -fn convert_struct_to_pb_values(struct_value: StructValue) -> Vec { - struct_value - .take_items() - .into_iter() - .map(to_proto_value) - .collect() +fn decode_json_value(value: &v1::JsonValue) -> JsonValueRef<'_> { + let Some(value) = &value.value else { + return JsonValueRef::null(); + }; + match value { + json_value::Value::Boolean(x) => (*x).into(), + json_value::Value::Int(x) => (*x).into(), + json_value::Value::Uint(x) => (*x).into(), + json_value::Value::Float(x) => (*x).into(), + json_value::Value::Str(x) => (x.as_str()).into(), + json_value::Value::Array(array) => array + .items + .iter() + .map(|x| decode_json_value(x).into_variant()) + .collect::>() + .into(), + json_value::Value::Object(x) => x + .entries + .iter() + .filter_map(|entry| { + entry + .value + .as_ref() + .map(|v| (entry.key.as_str(), decode_json_value(v).into_variant())) + }) + .collect::>() + .into(), + } } /// Returns the [ColumnDataTypeWrapper] of the value. 
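The encode_json_value/decode_json_value pair above replaces the old scheme of nesting a full v1::Value inside ValueData::JsonValue: JSON payloads now map onto a dedicated v1::JsonValue oneof, with "null" represented as an absent value and objects encoded as key/value entries. The following self-contained sketch mirrors the shape of that recursion using simplified, hypothetical types (Json and PbJson stand in for JsonVariant and v1::JsonValue; they are not the real greptime-proto definitions, and the real decoder returns a borrowed JsonValueRef to avoid copies):

// Minimal sketch of the recursive JSON <-> proto mapping, assuming simplified types.
use std::collections::BTreeMap;

#[derive(Debug, Clone, PartialEq)]
enum Json {
    Null,
    Bool(bool),
    Int(i64),
    Str(String),
    Array(Vec<Json>),
    Object(BTreeMap<String, Json>),
}

// The proto side models "null" as an absent value, mirroring `JsonValue { value: None }`.
#[derive(Debug, Clone, PartialEq)]
struct PbJson {
    value: Option<PbValue>,
}

#[derive(Debug, Clone, PartialEq)]
enum PbValue {
    Bool(bool),
    Int(i64),
    Str(String),
    Array(Vec<PbJson>),
    // Object entries are (key, value) pairs, like `json_object::Entry`.
    Object(Vec<(String, PbJson)>),
}

fn encode(json: &Json) -> PbJson {
    let value = match json {
        Json::Null => None,
        Json::Bool(b) => Some(PbValue::Bool(*b)),
        Json::Int(i) => Some(PbValue::Int(*i)),
        Json::Str(s) => Some(PbValue::Str(s.clone())),
        Json::Array(items) => Some(PbValue::Array(items.iter().map(encode).collect())),
        Json::Object(map) => Some(PbValue::Object(
            map.iter().map(|(k, v)| (k.clone(), encode(v))).collect(),
        )),
    };
    PbJson { value }
}

fn decode(pb: &PbJson) -> Json {
    let Some(value) = &pb.value else {
        return Json::Null;
    };
    match value {
        PbValue::Bool(b) => Json::Bool(*b),
        PbValue::Int(i) => Json::Int(*i),
        PbValue::Str(s) => Json::Str(s.clone()),
        PbValue::Array(items) => Json::Array(items.iter().map(decode).collect()),
        PbValue::Object(entries) => {
            Json::Object(entries.iter().map(|(k, v)| (k.clone(), decode(v))).collect())
        }
    }
}

fn main() {
    let json = Json::Object(BTreeMap::from([
        ("ok".to_string(), Json::Bool(true)),
        ("tags".to_string(), Json::Array(vec![Json::Str("a".into()), Json::Null])),
    ]));
    // Round-tripping through the proto representation is lossless for this model,
    // which is what the test_encode_decode_json_value cases below also assert.
    assert_eq!(decode(&encode(&json)), json);
}

Using a keyed map on the decoded side also explains why the tests below expect object entries back in sorted key order regardless of insertion order.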
@@ -1006,14 +1001,14 @@ pub fn vectors_to_rows<'a>( let mut rows = vec![Row { values: vec![] }; row_count]; for column in columns { for (row_index, row) in rows.iter_mut().enumerate() { - row.values.push(value_to_grpc_value(column.get(row_index))) + row.values.push(to_grpc_value(column.get(row_index))) } } rows } -pub fn value_to_grpc_value(value: Value) -> GrpcValue { +pub fn to_grpc_value(value: Value) -> GrpcValue { GrpcValue { value_data: match value { Value::Null => None, @@ -1053,7 +1048,7 @@ pub fn value_to_grpc_value(value: Value) -> GrpcValue { let items = list_value .take_items() .into_iter() - .map(value_to_grpc_value) + .map(to_grpc_value) .collect(); Some(ValueData::ListValue(v1::ListValue { items })) } @@ -1061,13 +1056,11 @@ pub fn value_to_grpc_value(value: Value) -> GrpcValue { let items = struct_value .take_items() .into_iter() - .map(value_to_grpc_value) + .map(to_grpc_value) .collect(); Some(ValueData::StructValue(v1::StructValue { items })) } - Value::Json(inner_value) => Some(ValueData::JsonValue(Box::new(value_to_grpc_value( - *inner_value, - )))), + Value::Json(v) => Some(ValueData::JsonValue(encode_json_value(*v))), Value::Duration(_) => unreachable!(), }, } @@ -1163,6 +1156,7 @@ mod tests { use common_time::interval::IntervalUnit; use datatypes::scalars::ScalarVector; use datatypes::types::{Int8Type, Int32Type, UInt8Type, UInt32Type}; + use datatypes::value::{ListValue, StructValue}; use datatypes::vectors::{ BooleanVector, DateVector, Float32Vector, PrimitiveVector, StringVector, }; @@ -1259,6 +1253,9 @@ mod tests { let values = values_with_capacity(ColumnDataType::Json, 2); assert_eq!(2, values.json_values.capacity()); assert_eq!(2, values.string_values.capacity()); + + let values = values_with_capacity(ColumnDataType::Dictionary, 2); + assert!(values.bool_values.is_empty()); } #[test] @@ -1355,6 +1352,17 @@ mod tests { ConcreteDataType::list_datatype(Arc::new(ConcreteDataType::string_datatype())), ColumnDataTypeWrapper::list_datatype(ColumnDataTypeWrapper::string_datatype()).into() ); + assert_eq!( + ConcreteDataType::dictionary_datatype( + ConcreteDataType::int32_datatype(), + ConcreteDataType::string_datatype() + ), + ColumnDataTypeWrapper::dictionary_datatype( + ColumnDataTypeWrapper::int32_datatype(), + ColumnDataTypeWrapper::string_datatype() + ) + .into() + ); let struct_type = StructType::new(Arc::new(vec![ StructField::new("id".to_string(), ConcreteDataType::int64_datatype(), true), StructField::new( @@ -1525,6 +1533,18 @@ mod tests { ColumnDataTypeWrapper::vector_datatype(3), ConcreteDataType::vector_datatype(3).try_into().unwrap() ); + assert_eq!( + ColumnDataTypeWrapper::dictionary_datatype( + ColumnDataTypeWrapper::int32_datatype(), + ColumnDataTypeWrapper::string_datatype() + ), + ConcreteDataType::dictionary_datatype( + ConcreteDataType::int32_datatype(), + ConcreteDataType::string_datatype() + ) + .try_into() + .unwrap() + ); let result: Result = ConcreteDataType::null_datatype().try_into(); assert!(result.is_err()); @@ -1580,6 +1600,20 @@ mod tests { datatype_extension: Some(Box::new(ColumnDataTypeExtension { type_ext: Some(TypeExt::StructType(StructTypeExtension { fields: vec![ + v1::StructField { + name: "address".to_string(), + datatype: ColumnDataTypeWrapper::string_datatype() + .datatype() + .into(), + datatype_extension: None + }, + v1::StructField { + name: "age".to_string(), + datatype: ColumnDataTypeWrapper::int64_datatype() + .datatype() + .into(), + datatype_extension: None + }, v1::StructField { name: "id".to_string(), datatype: 
ColumnDataTypeWrapper::int64_datatype() @@ -1594,20 +1628,6 @@ mod tests { .into(), datatype_extension: None }, - v1::StructField { - name: "age".to_string(), - datatype: ColumnDataTypeWrapper::int32_datatype() - .datatype() - .into(), - datatype_extension: None - }, - v1::StructField { - name: "address".to_string(), - datatype: ColumnDataTypeWrapper::string_datatype() - .datatype() - .into(), - datatype_extension: None - } ] })) })) @@ -1740,7 +1760,7 @@ mod tests { Arc::new(ConcreteDataType::boolean_datatype()), )); - let pb_value = to_proto_value(value); + let pb_value = to_grpc_value(value); match pb_value.value_data.unwrap() { ValueData::ListValue(pb_list_value) => { @@ -1769,7 +1789,7 @@ mod tests { .unwrap(), ); - let pb_value = to_proto_value(value); + let pb_value = to_grpc_value(value); match pb_value.value_data.unwrap() { ValueData::StructValue(pb_struct_value) => { @@ -1778,4 +1798,199 @@ mod tests { _ => panic!("Unexpected value type"), } } + + #[test] + fn test_encode_decode_json_value() { + let json = JsonValue::null(); + let proto = encode_json_value(json.clone()); + assert!(proto.value.is_none()); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = true.into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Boolean(true))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = (-1i64).into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Int(-1))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = 1u64.into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Uint(1))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = 1.0f64.into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Float(1.0))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = "s".into(); + let proto = encode_json_value(json.clone()); + assert_eq!(proto.value, Some(json_value::Value::Str("s".to_string()))); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [1i64, 2, 3].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Array(JsonList { + items: vec![ + v1::JsonValue { + value: Some(json_value::Value::Int(1)) + }, + v1::JsonValue { + value: Some(json_value::Value::Int(2)) + }, + v1::JsonValue { + value: Some(json_value::Value::Int(3)) + } + ] + })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [(); 0].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Array(JsonList { items: vec![] })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [("k3", 3i64), ("k2", 2i64), ("k1", 1i64)].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Object(JsonObject { + entries: vec![ + v1::json_object::Entry { + key: "k1".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(1)) + }), + }, + v1::json_object::Entry { + key: "k2".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(2)) + 
}), + }, + v1::json_object::Entry { + key: "k3".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(3)) + }), + }, + ] + })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [("null", ()); 0].into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Object(JsonObject { entries: vec![] })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + + let json: JsonValue = [ + ("null", JsonVariant::from(())), + ("bool", false.into()), + ("list", ["hello", "world"].into()), + ( + "object", + [ + ("positive_i", JsonVariant::from(42u64)), + ("negative_i", (-42i64).into()), + ("nested", [("what", "blah")].into()), + ] + .into(), + ), + ] + .into(); + let proto = encode_json_value(json.clone()); + assert_eq!( + proto.value, + Some(json_value::Value::Object(JsonObject { + entries: vec![ + v1::json_object::Entry { + key: "bool".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Boolean(false)) + }), + }, + v1::json_object::Entry { + key: "list".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Array(JsonList { + items: vec![ + v1::JsonValue { + value: Some(json_value::Value::Str("hello".to_string())) + }, + v1::JsonValue { + value: Some(json_value::Value::Str("world".to_string())) + }, + ] + })) + }), + }, + v1::json_object::Entry { + key: "null".to_string(), + value: Some(v1::JsonValue { value: None }), + }, + v1::json_object::Entry { + key: "object".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Object(JsonObject { + entries: vec![ + v1::json_object::Entry { + key: "negative_i".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Int(-42)) + }), + }, + v1::json_object::Entry { + key: "nested".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Object(JsonObject { + entries: vec![v1::json_object::Entry { + key: "what".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Str( + "blah".to_string() + )) + }), + },] + })) + }), + }, + v1::json_object::Entry { + key: "positive_i".to_string(), + value: Some(v1::JsonValue { + value: Some(json_value::Value::Uint(42)) + }), + }, + ] + })) + }), + }, + ] + })) + ); + let value = decode_json_value(&proto); + assert_eq!(json.as_ref(), value); + } } diff --git a/src/auth/Cargo.toml b/src/auth/Cargo.toml index 905bd72373..9c91023da5 100644 --- a/src/auth/Cargo.toml +++ b/src/auth/Cargo.toml @@ -15,11 +15,11 @@ workspace = true api.workspace = true async-trait.workspace = true common-base.workspace = true +common-config.workspace = true common-error.workspace = true common-macro.workspace = true common-telemetry.workspace = true digest = "0.10" -notify.workspace = true sha1 = "0.10" snafu.workspace = true sql.workspace = true diff --git a/src/auth/src/error.rs b/src/auth/src/error.rs index d28a85f828..a8dfe7f629 100644 --- a/src/auth/src/error.rs +++ b/src/auth/src/error.rs @@ -75,11 +75,12 @@ pub enum Error { username: String, }, - #[snafu(display("Failed to initialize a watcher for file {}", path))] + #[snafu(display("Failed to initialize a file watcher"))] FileWatch { - path: String, #[snafu(source)] - error: notify::Error, + source: common_config::error::Error, + #[snafu(implicit)] + location: Location, }, #[snafu(display("User is not authorized to perform this action"))] diff --git a/src/auth/src/user_provider/watch_file_user_provider.rs 
b/src/auth/src/user_provider/watch_file_user_provider.rs index 4df17502b7..451efd5cc4 100644 --- a/src/auth/src/user_provider/watch_file_user_provider.rs +++ b/src/auth/src/user_provider/watch_file_user_provider.rs @@ -12,16 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::path::Path; -use std::sync::mpsc::channel; use std::sync::{Arc, Mutex}; use async_trait::async_trait; +use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig}; use common_telemetry::{info, warn}; -use notify::{EventKind, RecursiveMode, Watcher}; -use snafu::{ResultExt, ensure}; +use snafu::ResultExt; -use crate::error::{FileWatchSnafu, InvalidConfigSnafu, Result}; +use crate::error::{FileWatchSnafu, Result}; use crate::user_provider::{UserInfoMap, authenticate_with_credential, load_credential_from_file}; use crate::{Identity, Password, UserInfoRef, UserProvider}; @@ -41,61 +39,36 @@ impl WatchFileUserProvider { pub fn new(filepath: &str) -> Result { let credential = load_credential_from_file(filepath)?; let users = Arc::new(Mutex::new(credential)); - let this = WatchFileUserProvider { - users: users.clone(), - }; - let (tx, rx) = channel::>(); - let mut debouncer = - notify::recommended_watcher(tx).context(FileWatchSnafu { path: "" })?; - let mut dir = Path::new(filepath).to_path_buf(); - ensure!( - dir.pop(), - InvalidConfigSnafu { - value: filepath, - msg: "UserProvider path must be a file path", - } - ); - debouncer - .watch(&dir, RecursiveMode::NonRecursive) - .context(FileWatchSnafu { path: filepath })?; + let users_clone = users.clone(); + let filepath_owned = filepath.to_string(); - let filepath = filepath.to_string(); - std::thread::spawn(move || { - let filename = Path::new(&filepath).file_name(); - let _hold = debouncer; - while let Ok(res) = rx.recv() { - if let Ok(event) = res { - let is_this_file = event.paths.iter().any(|p| p.file_name() == filename); - let is_relevant_event = matches!( - event.kind, - EventKind::Modify(_) | EventKind::Create(_) | EventKind::Remove(_) + FileWatcherBuilder::new() + .watch_path(filepath) + .context(FileWatchSnafu)? 
+ .config(FileWatcherConfig::new()) + .spawn(move || match load_credential_from_file(&filepath_owned) { + Ok(credential) => { + let mut users = users_clone.lock().expect("users credential must be valid"); + #[cfg(not(test))] + info!("User provider file {} reloaded", &filepath_owned); + #[cfg(test)] + info!( + "User provider file {} reloaded: {:?}", + &filepath_owned, credential ); - if is_this_file && is_relevant_event { - info!(?event.kind, "User provider file {} changed", &filepath); - match load_credential_from_file(&filepath) { - Ok(credential) => { - let mut users = - users.lock().expect("users credential must be valid"); - #[cfg(not(test))] - info!("User provider file {filepath} reloaded"); - #[cfg(test)] - info!("User provider file {filepath} reloaded: {credential:?}"); - *users = credential; - } - Err(err) => { - warn!( - ?err, - "Fail to load credential from file {filepath}; keep the old one", - ) - } - } - } + *users = credential; } - } - }); + Err(err) => { + warn!( + ?err, + "Fail to load credential from file {}; keep the old one", &filepath_owned + ) + } + }) + .context(FileWatchSnafu)?; - Ok(this) + Ok(WatchFileUserProvider { users }) } } diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index 62674e2572..c41548082d 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -5,7 +5,6 @@ edition.workspace = true license.workspace = true [features] -enterprise = [] testing = [] [lints] diff --git a/src/catalog/src/kvbackend.rs b/src/catalog/src/kvbackend.rs index d7f32fc66d..334acc999c 100644 --- a/src/catalog/src/kvbackend.rs +++ b/src/catalog/src/kvbackend.rs @@ -12,13 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub use client::{CachedKvBackend, CachedKvBackendBuilder, MetaKvBackend}; - mod builder; mod client; mod manager; mod table_cache; -pub use builder::KvBackendCatalogManagerBuilder; +pub use builder::{ + CatalogManagerConfigurator, CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder, +}; +pub use client::{CachedKvBackend, CachedKvBackendBuilder, MetaKvBackend}; pub use manager::KvBackendCatalogManager; pub use table_cache::{TableCache, TableCacheRef, new_table_cache}; diff --git a/src/catalog/src/kvbackend/builder.rs b/src/catalog/src/kvbackend/builder.rs index 247a111124..de56f81c0f 100644 --- a/src/catalog/src/kvbackend/builder.rs +++ b/src/catalog/src/kvbackend/builder.rs @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
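The watch-file user provider refactor above moves the hand-rolled notify watcher thread into common_config's FileWatcherBuilder, so the provider only supplies a reload callback. As a rough illustration of the underlying pattern (shared Arc<Mutex<_>> state swapped in place on a successful reload, previous credentials kept on failure), here is a self-contained sketch; the polling loop and helper names are hypothetical stand-ins, since the real watcher is event-driven rather than poll-based:

// Sketch only: a poll-based reloader standing in for FileWatcherBuilder's callback.
use std::fs;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::{Duration, SystemTime};

fn load_users(path: &PathBuf) -> std::io::Result<Vec<String>> {
    // Stand-in for load_credential_from_file: one "user=password" entry per line.
    Ok(fs::read_to_string(path)?
        .lines()
        .map(|l| l.trim().to_string())
        .filter(|l| !l.is_empty())
        .collect())
}

fn spawn_reloader(path: PathBuf, users: Arc<Mutex<Vec<String>>>) {
    thread::spawn(move || {
        let mut last_seen: Option<SystemTime> = None;
        loop {
            if let Ok(modified) = fs::metadata(&path).and_then(|m| m.modified()) {
                if last_seen != Some(modified) {
                    last_seen = Some(modified);
                    match load_users(&path) {
                        // On success, swap the shared credentials in place;
                        // readers see either the old or the new snapshot.
                        Ok(new_users) => *users.lock().unwrap() = new_users,
                        // On failure, keep the previous credentials, like the
                        // "keep the old one" branch above.
                        Err(err) => eprintln!("reload failed, keeping old users: {err}"),
                    }
                }
            }
            thread::sleep(Duration::from_secs(1));
        }
    });
}

fn main() {
    let users = Arc::new(Mutex::new(Vec::new()));
    spawn_reloader(PathBuf::from("users.txt"), users.clone());
    println!("{} users loaded", users.lock().unwrap().len());
}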
+use std::collections::HashMap; use std::sync::Arc; use common_catalog::consts::DEFAULT_CATALOG_NAME; +use common_error::ext::BoxedError; use common_meta::cache::LayeredCacheRegistryRef; use common_meta::key::TableMetadataManager; use common_meta::key::flow::FlowMetadataManager; @@ -23,24 +25,34 @@ use common_procedure::ProcedureManagerRef; use moka::sync::Cache; use partition::manager::PartitionRuleManager; -#[cfg(feature = "enterprise")] -use crate::information_schema::InformationSchemaTableFactoryRef; -use crate::information_schema::{InformationExtensionRef, InformationSchemaProvider}; +use crate::information_schema::{ + InformationExtensionRef, InformationSchemaProvider, InformationSchemaTableFactoryRef, +}; use crate::kvbackend::KvBackendCatalogManager; use crate::kvbackend::manager::{CATALOG_CACHE_MAX_CAPACITY, SystemCatalog}; use crate::process_manager::ProcessManagerRef; use crate::system_schema::numbers_table_provider::NumbersTableProvider; use crate::system_schema::pg_catalog::PGCatalogProvider; +/// The configurator that customizes or enhances the [`KvBackendCatalogManagerBuilder`]. +#[async_trait::async_trait] +pub trait CatalogManagerConfigurator: Send + Sync { + async fn configure( + &self, + builder: KvBackendCatalogManagerBuilder, + ctx: C, + ) -> std::result::Result; +} + +pub type CatalogManagerConfiguratorRef = Arc>; + pub struct KvBackendCatalogManagerBuilder { information_extension: InformationExtensionRef, backend: KvBackendRef, cache_registry: LayeredCacheRegistryRef, procedure_manager: Option, process_manager: Option, - #[cfg(feature = "enterprise")] - extra_information_table_factories: - std::collections::HashMap, + extra_information_table_factories: HashMap, } impl KvBackendCatalogManagerBuilder { @@ -55,8 +67,7 @@ impl KvBackendCatalogManagerBuilder { cache_registry, procedure_manager: None, process_manager: None, - #[cfg(feature = "enterprise")] - extra_information_table_factories: std::collections::HashMap::new(), + extra_information_table_factories: HashMap::new(), } } @@ -71,10 +82,9 @@ impl KvBackendCatalogManagerBuilder { } /// Sets the extra information tables. 
- #[cfg(feature = "enterprise")] pub fn with_extra_information_table_factories( mut self, - factories: std::collections::HashMap, + factories: HashMap, ) -> Self { self.extra_information_table_factories = factories; self @@ -87,7 +97,6 @@ impl KvBackendCatalogManagerBuilder { cache_registry, procedure_manager, process_manager, - #[cfg(feature = "enterprise")] extra_information_table_factories, } = self; Arc::new_cyclic(|me| KvBackendCatalogManager { @@ -111,7 +120,6 @@ impl KvBackendCatalogManagerBuilder { process_manager.clone(), backend.clone(), ); - #[cfg(feature = "enterprise")] let provider = provider .with_extra_table_factories(extra_information_table_factories.clone()); Arc::new(provider) @@ -123,7 +131,6 @@ impl KvBackendCatalogManagerBuilder { numbers_table_provider: NumbersTableProvider, backend, process_manager, - #[cfg(feature = "enterprise")] extra_information_table_factories, }, cache_registry, diff --git a/src/catalog/src/kvbackend/manager.rs b/src/catalog/src/kvbackend/manager.rs index 29e0cc4ce8..7852142c6a 100644 --- a/src/catalog/src/kvbackend/manager.rs +++ b/src/catalog/src/kvbackend/manager.rs @@ -53,9 +53,9 @@ use crate::error::{ CacheNotFoundSnafu, GetTableCacheSnafu, InvalidTableInfoInCatalogSnafu, ListCatalogsSnafu, ListSchemasSnafu, ListTablesSnafu, Result, TableMetadataManagerSnafu, }; -#[cfg(feature = "enterprise")] -use crate::information_schema::InformationSchemaTableFactoryRef; -use crate::information_schema::{InformationExtensionRef, InformationSchemaProvider}; +use crate::information_schema::{ + InformationExtensionRef, InformationSchemaProvider, InformationSchemaTableFactoryRef, +}; use crate::kvbackend::TableCacheRef; use crate::process_manager::ProcessManagerRef; use crate::system_schema::SystemSchemaProvider; @@ -557,7 +557,6 @@ pub(super) struct SystemCatalog { pub(super) numbers_table_provider: NumbersTableProvider, pub(super) backend: KvBackendRef, pub(super) process_manager: Option, - #[cfg(feature = "enterprise")] pub(super) extra_information_table_factories: std::collections::HashMap, } @@ -628,7 +627,6 @@ impl SystemCatalog { self.process_manager.clone(), self.backend.clone(), ); - #[cfg(feature = "enterprise")] let provider = provider .with_extra_table_factories(self.extra_information_table_factories.clone()); Arc::new(provider) diff --git a/src/catalog/src/system_schema/information_schema.rs b/src/catalog/src/system_schema/information_schema.rs index 44609ade34..9715aa9402 100644 --- a/src/catalog/src/system_schema/information_schema.rs +++ b/src/catalog/src/system_schema/information_schema.rs @@ -22,7 +22,6 @@ mod procedure_info; pub mod process_list; pub mod region_peers; mod region_statistics; -mod runtime_metrics; pub mod schemata; mod ssts; mod table_constraints; @@ -65,7 +64,6 @@ use crate::system_schema::information_schema::information_memory_table::get_sche use crate::system_schema::information_schema::key_column_usage::InformationSchemaKeyColumnUsage; use crate::system_schema::information_schema::partitions::InformationSchemaPartitions; use crate::system_schema::information_schema::region_peers::InformationSchemaRegionPeers; -use crate::system_schema::information_schema::runtime_metrics::InformationSchemaMetrics; use crate::system_schema::information_schema::schemata::InformationSchemaSchemata; use crate::system_schema::information_schema::ssts::{ InformationSchemaSstsIndexMeta, InformationSchemaSstsManifest, InformationSchemaSstsStorage, @@ -119,7 +117,6 @@ macro_rules! 
setup_memory_table { }; } -#[cfg(feature = "enterprise")] pub struct MakeInformationTableRequest { pub catalog_name: String, pub catalog_manager: Weak, @@ -130,12 +127,10 @@ pub struct MakeInformationTableRequest { /// /// This trait allows for extensibility of the information schema by providing /// a way to dynamically create custom information schema tables. -#[cfg(feature = "enterprise")] pub trait InformationSchemaTableFactory { fn make_information_table(&self, req: MakeInformationTableRequest) -> SystemTableRef; } -#[cfg(feature = "enterprise")] pub type InformationSchemaTableFactoryRef = Arc; /// The `information_schema` tables info provider. @@ -145,9 +140,7 @@ pub struct InformationSchemaProvider { process_manager: Option, flow_metadata_manager: Arc, tables: HashMap, - #[allow(dead_code)] kv_backend: KvBackendRef, - #[cfg(feature = "enterprise")] extra_table_factories: HashMap, } @@ -168,7 +161,6 @@ impl SystemSchemaProviderInner for InformationSchemaProvider { } fn system_table(&self, name: &str) -> Option { - #[cfg(feature = "enterprise")] if let Some(factory) = self.extra_table_factories.get(name) { let req = MakeInformationTableRequest { catalog_name: self.catalog_name.clone(), @@ -216,7 +208,6 @@ impl SystemSchemaProviderInner for InformationSchemaProvider { self.catalog_name.clone(), self.catalog_manager.clone(), )) as _), - RUNTIME_METRICS => Some(Arc::new(InformationSchemaMetrics::new())), PARTITIONS => Some(Arc::new(InformationSchemaPartitions::new( self.catalog_name.clone(), self.catalog_manager.clone(), @@ -284,7 +275,6 @@ impl InformationSchemaProvider { process_manager, tables: HashMap::new(), kv_backend, - #[cfg(feature = "enterprise")] extra_table_factories: HashMap::new(), }; @@ -293,7 +283,6 @@ impl InformationSchemaProvider { provider } - #[cfg(feature = "enterprise")] pub(crate) fn with_extra_table_factories( mut self, factories: HashMap, @@ -311,10 +300,6 @@ impl InformationSchemaProvider { // authentication details, and other critical information. // Only put these tables under `greptime` catalog to prevent info leak. if self.catalog_name == DEFAULT_CATALOG_NAME { - tables.insert( - RUNTIME_METRICS.to_string(), - self.build_table(RUNTIME_METRICS).unwrap(), - ); tables.insert( BUILD_INFO.to_string(), self.build_table(BUILD_INFO).unwrap(), @@ -365,7 +350,6 @@ impl InformationSchemaProvider { if let Some(process_list) = self.build_table(PROCESS_LIST) { tables.insert(PROCESS_LIST.to_string(), process_list); } - #[cfg(feature = "enterprise")] for name in self.extra_table_factories.keys() { tables.insert(name.clone(), self.build_table(name).expect(name)); } @@ -444,7 +428,7 @@ pub trait InformationExtension { } /// The request to inspect the datanode. -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct DatanodeInspectRequest { /// Kind to fetch from datanode. 
pub kind: DatanodeInspectKind, diff --git a/src/catalog/src/system_schema/information_schema/partitions.rs b/src/catalog/src/system_schema/information_schema/partitions.rs index 68f4f83051..b9396fe554 100644 --- a/src/catalog/src/system_schema/information_schema/partitions.rs +++ b/src/catalog/src/system_schema/information_schema/partitions.rs @@ -211,6 +211,7 @@ struct InformationSchemaPartitionsBuilder { partition_names: StringVectorBuilder, partition_ordinal_positions: Int64VectorBuilder, partition_expressions: StringVectorBuilder, + partition_descriptions: StringVectorBuilder, create_times: TimestampSecondVectorBuilder, partition_ids: UInt64VectorBuilder, } @@ -231,6 +232,7 @@ impl InformationSchemaPartitionsBuilder { partition_names: StringVectorBuilder::with_capacity(INIT_CAPACITY), partition_ordinal_positions: Int64VectorBuilder::with_capacity(INIT_CAPACITY), partition_expressions: StringVectorBuilder::with_capacity(INIT_CAPACITY), + partition_descriptions: StringVectorBuilder::with_capacity(INIT_CAPACITY), create_times: TimestampSecondVectorBuilder::with_capacity(INIT_CAPACITY), partition_ids: UInt64VectorBuilder::with_capacity(INIT_CAPACITY), } @@ -319,6 +321,21 @@ impl InformationSchemaPartitionsBuilder { return; } + // Get partition column names (shared by all partitions) + // In MySQL, PARTITION_EXPRESSION is the partitioning function expression (e.g., column name) + let partition_columns: String = table_info + .meta + .partition_column_names() + .cloned() + .collect::>() + .join(", "); + + let partition_expr_str = if partition_columns.is_empty() { + None + } else { + Some(partition_columns) + }; + for (index, partition) in partitions.iter().enumerate() { let partition_name = format!("p{index}"); @@ -328,8 +345,12 @@ impl InformationSchemaPartitionsBuilder { self.partition_names.push(Some(&partition_name)); self.partition_ordinal_positions .push(Some((index + 1) as i64)); - let expression = partition.partition_expr.as_ref().map(|e| e.to_string()); - self.partition_expressions.push(expression.as_deref()); + // PARTITION_EXPRESSION: partition column names (same for all partitions) + self.partition_expressions + .push(partition_expr_str.as_deref()); + // PARTITION_DESCRIPTION: partition boundary expression (different for each partition) + let description = partition.partition_expr.as_ref().map(|e| e.to_string()); + self.partition_descriptions.push(description.as_deref()); self.create_times.push(Some(TimestampSecond::from( table_info.meta.created_on.timestamp(), ))); @@ -369,7 +390,7 @@ impl InformationSchemaPartitionsBuilder { null_string_vector.clone(), Arc::new(self.partition_expressions.finish()), null_string_vector.clone(), - null_string_vector.clone(), + Arc::new(self.partition_descriptions.finish()), // TODO(dennis): rows and index statistics info null_i64_vector.clone(), null_i64_vector.clone(), diff --git a/src/catalog/src/system_schema/information_schema/runtime_metrics.rs b/src/catalog/src/system_schema/information_schema/runtime_metrics.rs deleted file mode 100644 index 5ccb871321..0000000000 --- a/src/catalog/src/system_schema/information_schema/runtime_metrics.rs +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
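For the information_schema.partitions change above, the split between the two columns is: PARTITION_EXPRESSION carries the partition column names (identical for every partition of a table), while PARTITION_DESCRIPTION carries each partition's boundary expression. A small sketch of that derivation, using plain strings as a hypothetical simplification of the real table_info metadata and PartitionExpr values:

// Sketch: derive (PARTITION_NAME, PARTITION_EXPRESSION, PARTITION_DESCRIPTION) rows.
fn partition_rows(
    partition_columns: &[&str],
    partition_exprs: &[Option<&str>],
) -> Vec<(String, Option<String>, Option<String>)> {
    let expression = if partition_columns.is_empty() {
        None
    } else {
        Some(partition_columns.join(", "))
    };
    partition_exprs
        .iter()
        .enumerate()
        .map(|(i, expr)| {
            (
                format!("p{i}"),             // PARTITION_NAME
                expression.clone(),          // PARTITION_EXPRESSION (same for all)
                expr.map(|e| e.to_string()), // PARTITION_DESCRIPTION (per partition)
            )
        })
        .collect()
}

fn main() {
    let rows = partition_rows(
        &["device_id"],
        &[Some("device_id < 100"), Some("device_id >= 100")],
    );
    assert_eq!(rows[0].1.as_deref(), Some("device_id"));
    assert_eq!(rows[1].2.as_deref(), Some("device_id >= 100"));
    println!("{rows:?}");
}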
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow_schema::SchemaRef as ArrowSchemaRef; -use common_catalog::consts::INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID; -use common_error::ext::BoxedError; -use common_recordbatch::adapter::RecordBatchStreamAdapter; -use common_recordbatch::{RecordBatch, SendableRecordBatchStream}; -use common_time::util::current_time_millis; -use datafusion::execution::TaskContext; -use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter; -use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream; -use datatypes::prelude::{ConcreteDataType, MutableVector}; -use datatypes::scalars::ScalarVectorBuilder; -use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use datatypes::vectors::{ - ConstantVector, Float64VectorBuilder, StringVectorBuilder, TimestampMillisecondVector, - VectorRef, -}; -use itertools::Itertools; -use snafu::ResultExt; -use store_api::storage::{ScanRequest, TableId}; - -use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result}; -use crate::system_schema::information_schema::{InformationTable, RUNTIME_METRICS}; - -#[derive(Debug)] -pub(super) struct InformationSchemaMetrics { - schema: SchemaRef, -} - -const METRIC_NAME: &str = "metric_name"; -const METRIC_VALUE: &str = "value"; -const METRIC_LABELS: &str = "labels"; -const PEER_ADDR: &str = "peer_addr"; -const PEER_TYPE: &str = "peer_type"; -const TIMESTAMP: &str = "timestamp"; - -/// The `information_schema.runtime_metrics` virtual table. -/// It provides the GreptimeDB runtime metrics for the users by SQL. 
-impl InformationSchemaMetrics { - pub(super) fn new() -> Self { - Self { - schema: Self::schema(), - } - } - - fn schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - ColumnSchema::new(METRIC_NAME, ConcreteDataType::string_datatype(), false), - ColumnSchema::new(METRIC_VALUE, ConcreteDataType::float64_datatype(), false), - ColumnSchema::new(METRIC_LABELS, ConcreteDataType::string_datatype(), true), - ColumnSchema::new(PEER_ADDR, ConcreteDataType::string_datatype(), true), - ColumnSchema::new(PEER_TYPE, ConcreteDataType::string_datatype(), false), - ColumnSchema::new( - TIMESTAMP, - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ), - ])) - } - - fn builder(&self) -> InformationSchemaMetricsBuilder { - InformationSchemaMetricsBuilder::new(self.schema.clone()) - } -} - -impl InformationTable for InformationSchemaMetrics { - fn table_id(&self) -> TableId { - INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID - } - - fn table_name(&self) -> &'static str { - RUNTIME_METRICS - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn to_stream(&self, request: ScanRequest) -> Result { - let schema = self.schema.arrow_schema().clone(); - let mut builder = self.builder(); - let stream = Box::pin(DfRecordBatchStreamAdapter::new( - schema, - futures::stream::once(async move { - builder - .make_metrics(Some(request)) - .await - .map(|x| x.into_df_record_batch()) - .map_err(Into::into) - }), - )); - - Ok(Box::pin( - RecordBatchStreamAdapter::try_new(stream) - .map_err(BoxedError::new) - .context(InternalSnafu)?, - )) - } -} - -struct InformationSchemaMetricsBuilder { - schema: SchemaRef, - - metric_names: StringVectorBuilder, - metric_values: Float64VectorBuilder, - metric_labels: StringVectorBuilder, - peer_addrs: StringVectorBuilder, - peer_types: StringVectorBuilder, -} - -impl InformationSchemaMetricsBuilder { - fn new(schema: SchemaRef) -> Self { - Self { - schema, - metric_names: StringVectorBuilder::with_capacity(42), - metric_values: Float64VectorBuilder::with_capacity(42), - metric_labels: StringVectorBuilder::with_capacity(42), - peer_addrs: StringVectorBuilder::with_capacity(42), - peer_types: StringVectorBuilder::with_capacity(42), - } - } - - fn add_metric( - &mut self, - metric_name: &str, - labels: String, - metric_value: f64, - peer: Option<&str>, - peer_type: &str, - ) { - self.metric_names.push(Some(metric_name)); - self.metric_values.push(Some(metric_value)); - self.metric_labels.push(Some(&labels)); - self.peer_addrs.push(peer); - self.peer_types.push(Some(peer_type)); - } - - async fn make_metrics(&mut self, _request: Option) -> Result { - let metric_families = prometheus::gather(); - - let write_request = - common_telemetry::metric::convert_metric_to_write_request(metric_families, None, 0); - - for ts in write_request.timeseries { - //Safety: always has `__name__` label - let metric_name = ts - .labels - .iter() - .find_map(|label| { - if label.name == "__name__" { - Some(label.value.clone()) - } else { - None - } - }) - .unwrap(); - - self.add_metric( - &metric_name, - ts.labels - .into_iter() - .filter_map(|label| { - if label.name == "__name__" { - None - } else { - Some(format!("{}={}", label.name, label.value)) - } - }) - .join(", "), - // Safety: always has a sample - ts.samples[0].value, - // The peer column is always `None` for standalone - None, - "STANDALONE", - ); - } - - // FIXME(dennis): fetching other peers metrics - self.finish() - } - - fn finish(&mut self) -> Result { - let rows_num = self.metric_names.len(); - - let timestamps = 
Arc::new(ConstantVector::new( - Arc::new(TimestampMillisecondVector::from_slice([ - current_time_millis(), - ])), - rows_num, - )); - - let columns: Vec = vec![ - Arc::new(self.metric_names.finish()), - Arc::new(self.metric_values.finish()), - Arc::new(self.metric_labels.finish()), - Arc::new(self.peer_addrs.finish()), - Arc::new(self.peer_types.finish()), - timestamps, - ]; - - RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu) - } -} - -impl DfPartitionStream for InformationSchemaMetrics { - fn schema(&self) -> &ArrowSchemaRef { - self.schema.arrow_schema() - } - - fn execute(&self, _: Arc) -> DfSendableRecordBatchStream { - let schema = self.schema.arrow_schema().clone(); - let mut builder = self.builder(); - Box::pin(DfRecordBatchStreamAdapter::new( - schema, - futures::stream::once(async move { - builder - .make_metrics(None) - .await - .map(|x| x.into_df_record_batch()) - .map_err(Into::into) - }), - )) - } -} - -#[cfg(test)] -mod tests { - use common_recordbatch::RecordBatches; - - use super::*; - - #[tokio::test] - async fn test_make_metrics() { - let metrics = InformationSchemaMetrics::new(); - - let stream = metrics.to_stream(ScanRequest::default()).unwrap(); - - let batches = RecordBatches::try_collect(stream).await.unwrap(); - - let result_literal = batches.pretty_print().unwrap(); - - assert!(result_literal.contains(METRIC_NAME)); - assert!(result_literal.contains(METRIC_VALUE)); - assert!(result_literal.contains(METRIC_LABELS)); - assert!(result_literal.contains(PEER_ADDR)); - assert!(result_literal.contains(PEER_TYPE)); - assert!(result_literal.contains(TIMESTAMP)); - } -} diff --git a/src/catalog/src/system_schema/information_schema/table_names.rs b/src/catalog/src/system_schema/information_schema/table_names.rs index 23791425dc..2a3329fece 100644 --- a/src/catalog/src/system_schema/information_schema/table_names.rs +++ b/src/catalog/src/system_schema/information_schema/table_names.rs @@ -38,7 +38,6 @@ pub const TABLE_PRIVILEGES: &str = "table_privileges"; pub const TRIGGERS: &str = "triggers"; pub const GLOBAL_STATUS: &str = "global_status"; pub const SESSION_STATUS: &str = "session_status"; -pub const RUNTIME_METRICS: &str = "runtime_metrics"; pub const PARTITIONS: &str = "partitions"; pub const REGION_PEERS: &str = "region_peers"; pub const TABLE_CONSTRAINTS: &str = "table_constraints"; diff --git a/src/catalog/src/system_schema/information_schema/tables.rs b/src/catalog/src/system_schema/information_schema/tables.rs index 507dedc547..38a0cb1d61 100644 --- a/src/catalog/src/system_schema/information_schema/tables.rs +++ b/src/catalog/src/system_schema/information_schema/tables.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; use std::sync::{Arc, Weak}; use arrow_schema::SchemaRef as ArrowSchemaRef; @@ -255,14 +254,17 @@ impl InformationSchemaTablesBuilder { // TODO(dennis): `region_stats` API is not stable in distributed cluster because of network issue etc. // But we don't want the statements such as `show tables` fail, // so using `unwrap_or_else` here instead of `?` operator. 
- let region_stats = information_extension - .region_stats() - .await - .map_err(|e| { - error!(e; "Failed to call region_stats"); - e - }) - .unwrap_or_else(|_| vec![]); + let region_stats = { + let mut x = information_extension + .region_stats() + .await + .unwrap_or_else(|e| { + error!(e; "Failed to find region stats in information_schema, fallback to all empty"); + vec![] + }); + x.sort_unstable_by_key(|x| x.id); + x + }; for schema_name in catalog_manager.schema_names(&catalog_name, None).await? { let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None); @@ -273,16 +275,16 @@ impl InformationSchemaTablesBuilder { // TODO(dennis): make it working for metric engine let table_region_stats = if table_info.meta.engine == MITO_ENGINE || table_info.is_physical_table() { - let region_ids = table_info + table_info .meta .region_numbers .iter() .map(|n| RegionId::new(table_info.ident.table_id, *n)) - .collect::>(); - - region_stats - .iter() - .filter(|stat| region_ids.contains(&stat.id)) + .flat_map(|region_id| { + region_stats + .binary_search_by_key(®ion_id, |x| x.id) + .map(|i| ®ion_stats[i]) + }) .collect::>() } else { vec![] diff --git a/src/cli/src/data/export.rs b/src/cli/src/data/export.rs index 5ddc2a39bc..007f8aa67c 100644 --- a/src/cli/src/data/export.rs +++ b/src/cli/src/data/export.rs @@ -67,9 +67,17 @@ pub struct ExportCommand { #[clap(long, default_value_t = default_database())] database: String, - /// Parallelism of the export. - #[clap(long, short = 'j', default_value = "1")] - export_jobs: usize, + /// The number of databases exported in parallel. + /// For example, if there are 20 databases and `db_parallelism` is 4, + /// 4 databases will be exported concurrently. + #[clap(long, short = 'j', default_value = "1", alias = "export-jobs")] + db_parallelism: usize, + + /// The number of tables exported in parallel within a single database. + /// For example, if a database has 30 tables and `parallelism` is 8, + /// 8 tables will be exported concurrently. + #[clap(long, default_value = "4")] + table_parallelism: usize, /// Max retry times for each job. 
#[clap(long, default_value = "3")] @@ -210,10 +218,11 @@ impl ExportCommand { schema, database_client, output_dir: self.output_dir.clone(), - parallelism: self.export_jobs, + export_jobs: self.db_parallelism, target: self.target.clone(), start_time: self.start_time.clone(), end_time: self.end_time.clone(), + parallelism: self.table_parallelism, s3: self.s3, ddl_local_dir: self.ddl_local_dir.clone(), s3_bucket: self.s3_bucket.clone(), @@ -251,10 +260,11 @@ pub struct Export { schema: Option, database_client: DatabaseClient, output_dir: Option, - parallelism: usize, + export_jobs: usize, target: ExportTarget, start_time: Option, end_time: Option, + parallelism: usize, s3: bool, ddl_local_dir: Option, s3_bucket: Option, @@ -464,7 +474,7 @@ impl Export { async fn export_create_table(&self) -> Result<()> { let timer = Instant::now(); - let semaphore = Arc::new(Semaphore::new(self.parallelism)); + let semaphore = Arc::new(Semaphore::new(self.export_jobs)); let db_names = self.get_db_names().await?; let db_count = db_names.len(); let operator = Arc::new(self.build_prefer_fs_operator().await?); @@ -625,13 +635,13 @@ impl Export { async fn export_database_data(&self) -> Result<()> { let timer = Instant::now(); - let semaphore = Arc::new(Semaphore::new(self.parallelism)); + let semaphore = Arc::new(Semaphore::new(self.export_jobs)); let db_names = self.get_db_names().await?; let db_count = db_names.len(); let mut tasks = Vec::with_capacity(db_count); let operator = Arc::new(self.build_operator().await?); let fs_first_operator = Arc::new(self.build_prefer_fs_operator().await?); - let with_options = build_with_options(&self.start_time, &self.end_time); + let with_options = build_with_options(&self.start_time, &self.end_time, self.parallelism); for schema in db_names { let semaphore_moved = semaphore.clone(); @@ -888,7 +898,11 @@ impl Tool for Export { } /// Builds the WITH options string for SQL commands, assuming consistent syntax across S3 and local exports. -fn build_with_options(start_time: &Option, end_time: &Option) -> String { +fn build_with_options( + start_time: &Option, + end_time: &Option, + parallelism: usize, +) -> String { let mut options = vec!["format = 'parquet'".to_string()]; if let Some(start) = start_time { options.push(format!("start_time = '{}'", start)); @@ -896,5 +910,6 @@ fn build_with_options(start_time: &Option, end_time: &Option) -> if let Some(end) = end_time { options.push(format!("end_time = '{}'", end)); } + options.push(format!("parallelism = {}", parallelism)); options.join(", ") } diff --git a/src/cli/src/data/import.rs b/src/cli/src/data/import.rs index 908f3d4c9f..ffe8b62c7e 100644 --- a/src/cli/src/data/import.rs +++ b/src/cli/src/data/import.rs @@ -56,9 +56,11 @@ pub struct ImportCommand { #[clap(long, default_value_t = default_database())] database: String, - /// Parallelism of the import. - #[clap(long, short = 'j', default_value = "1")] - import_jobs: usize, + /// The number of databases imported in parallel. + /// For example, if there are 20 databases and `db_parallelism` is 4, + /// 4 databases will be imported concurrently. + #[clap(long, short = 'j', default_value = "1", alias = "import-jobs")] + db_parallelism: usize, /// Max retry times for each job. 
#[clap(long, default_value = "3")] @@ -109,7 +111,7 @@ impl ImportCommand { schema, database_client, input_dir: self.input_dir.clone(), - parallelism: self.import_jobs, + parallelism: self.db_parallelism, target: self.target.clone(), })) } diff --git a/src/client/src/client.rs b/src/client/src/client.rs index 611cce954d..39cb5c30aa 100644 --- a/src/client/src/client.rs +++ b/src/client/src/client.rs @@ -21,7 +21,7 @@ use api::v1::prometheus_gateway_client::PrometheusGatewayClient; use api::v1::region::region_client::RegionClient as PbRegionClient; use arrow_flight::flight_service_client::FlightServiceClient; use common_grpc::channel_manager::{ - ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config, + ChannelConfig, ChannelManager, ClientTlsOption, load_client_tls_config, }; use parking_lot::RwLock; use snafu::{OptionExt, ResultExt}; @@ -95,9 +95,9 @@ impl Client { U: AsRef, A: AsRef<[U]>, { - let channel_config = ChannelConfig::default().client_tls_config(client_tls); - let tls_config = load_tls_config(channel_config.client_tls.as_ref()) - .context(error::CreateTlsChannelSnafu)?; + let channel_config = ChannelConfig::default().client_tls_config(client_tls.clone()); + let tls_config = + load_client_tls_config(Some(client_tls)).context(error::CreateTlsChannelSnafu)?; let channel_manager = ChannelManager::with_config(channel_config, tls_config); Ok(Self::with_manager_and_urls(channel_manager, urls)) } diff --git a/src/client/src/database.rs b/src/client/src/database.rs index 0646c3e2a3..239f3fe3f9 100644 --- a/src/client/src/database.rs +++ b/src/client/src/database.rs @@ -435,10 +435,10 @@ impl Database { .context(ExternalSnafu)?; match flight_message { FlightMessage::RecordBatch(arrow_batch) => { - yield RecordBatch::try_from_df_record_batch( + yield Ok(RecordBatch::from_df_record_batch( schema_cloned.clone(), arrow_batch, - ) + )) } FlightMessage::Metrics(_) => {} FlightMessage::AffectedRows(_) | FlightMessage::Schema(_) => { diff --git a/src/client/src/region.rs b/src/client/src/region.rs index 6e5a286083..3e80b83cec 100644 --- a/src/client/src/region.rs +++ b/src/client/src/region.rs @@ -182,10 +182,8 @@ impl RegionRequester { match flight_message { FlightMessage::RecordBatch(record_batch) => { - let result_to_yield = RecordBatch::try_from_df_record_batch( - schema_cloned.clone(), - record_batch, - ); + let result_to_yield = + RecordBatch::from_df_record_batch(schema_cloned.clone(), record_batch); // get the next message from the stream. normally it should be a metrics message. if let Some(next_flight_message_result) = flight_message_stream.next().await @@ -219,7 +217,7 @@ impl RegionRequester { stream_ended = true; } - yield result_to_yield; + yield Ok(result_to_yield); } FlightMessage::Metrics(s) => { // just a branch in case of some metrics message comes after other things. 
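The export changes above split parallelism into two knobs: --db-parallelism (formerly --export-jobs, still accepted via the alias) bounds how many databases are exported concurrently through the semaphore, while --table-parallelism is forwarded into the generated WITH options as parallelism = N so each database's copy runs multiple tables at once. A quick sketch of the option-string composition, mirroring build_with_options (the surrounding CLI wiring is omitted, and the COPY statement in main is only illustrative of where the string ends up):

// Mirrors the build_with_options helper above: format, optional time range,
// plus the per-database table parallelism.
fn build_with_options(
    start_time: &Option<String>,
    end_time: &Option<String>,
    parallelism: usize,
) -> String {
    let mut options = vec!["format = 'parquet'".to_string()];
    if let Some(start) = start_time {
        options.push(format!("start_time = '{start}'"));
    }
    if let Some(end) = end_time {
        options.push(format!("end_time = '{end}'"));
    }
    options.push(format!("parallelism = {parallelism}"));
    options.join(", ")
}

fn main() {
    let with = build_with_options(&Some("2024-01-01 00:00:00".to_string()), &None, 8);
    // => format = 'parquet', start_time = '2024-01-01 00:00:00', parallelism = 8
    println!("COPY DATABASE my_db TO '/backup/my_db/' WITH ({with});");
}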
diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml index 7a957b509b..d279ddb7f0 100644 --- a/src/cmd/Cargo.toml +++ b/src/cmd/Cargo.toml @@ -16,7 +16,7 @@ default = [ "meta-srv/pg_kvbackend", "meta-srv/mysql_kvbackend", ] -enterprise = ["common-meta/enterprise", "frontend/enterprise", "meta-srv/enterprise", "catalog/enterprise"] +enterprise = ["common-meta/enterprise", "frontend/enterprise", "meta-srv/enterprise"] tokio-console = ["common-telemetry/tokio-console"] [lints] diff --git a/src/cmd/src/datanode/objbench.rs b/src/cmd/src/datanode/objbench.rs index dffb971072..0a3f27b77e 100644 --- a/src/cmd/src/datanode/objbench.rs +++ b/src/cmd/src/datanode/objbench.rs @@ -145,6 +145,17 @@ impl ObjbenchCommand { let region_meta = extract_region_metadata(&self.source, &parquet_meta)?; let num_rows = parquet_meta.file_metadata().num_rows() as u64; let num_row_groups = parquet_meta.num_row_groups() as u64; + let max_row_group_uncompressed_size: u64 = parquet_meta + .row_groups() + .iter() + .map(|rg| { + rg.columns() + .iter() + .map(|c| c.uncompressed_size() as u64) + .sum::() + }) + .max() + .unwrap_or(0); println!( "{} Metadata loaded - rows: {}, size: {} bytes", @@ -160,9 +171,11 @@ impl ObjbenchCommand { time_range: Default::default(), level: 0, file_size, + max_row_group_uncompressed_size, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows, num_row_groups, sequence: None, @@ -563,7 +576,7 @@ fn new_noop_file_purger() -> FilePurgerRef { #[derive(Debug)] struct Noop; impl FilePurger for Noop { - fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) {} + fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) {} } Arc::new(Noop) } diff --git a/src/cmd/src/error.rs b/src/cmd/src/error.rs index 0b77dec341..fbff2d42e0 100644 --- a/src/cmd/src/error.rs +++ b/src/cmd/src/error.rs @@ -99,13 +99,6 @@ pub enum Error { source: flow::Error, }, - #[snafu(display("Servers error"))] - Servers { - #[snafu(implicit)] - location: Location, - source: servers::error::Error, - }, - #[snafu(display("Failed to start frontend"))] StartFrontend { #[snafu(implicit)] @@ -336,7 +329,6 @@ impl ErrorExt for Error { Error::ShutdownFrontend { source, .. } => source.status_code(), Error::StartMetaServer { source, .. } => source.status_code(), Error::ShutdownMetaServer { source, .. } => source.status_code(), - Error::Servers { source, .. } => source.status_code(), Error::BuildMetaServer { source, .. } => source.status_code(), Error::UnsupportedSelectorType { source, .. } => source.status_code(), Error::BuildCli { source, .. } => source.status_code(), diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index 07f3279724..6cefdb0f79 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
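The new max_row_group_uncompressed_size field in the objbench change above is derived from the Parquet footer: for each row group, sum the uncompressed sizes of its column chunks, then take the maximum across row groups (0 when the file has none). A tiny sketch of that aggregation over simplified structs, which are hypothetical stand-ins for the parquet crate's row-group and column-chunk metadata:

// Hypothetical, simplified stand-ins for the Parquet footer metadata types.
struct ColumnChunk {
    uncompressed_size: u64,
}

struct RowGroup {
    columns: Vec<ColumnChunk>,
}

/// Largest uncompressed row group in the file, in bytes (0 if there are none).
fn max_row_group_uncompressed_size(row_groups: &[RowGroup]) -> u64 {
    row_groups
        .iter()
        .map(|rg| rg.columns.iter().map(|c| c.uncompressed_size).sum::<u64>())
        .max()
        .unwrap_or(0)
}

fn main() {
    let row_groups = vec![
        RowGroup {
            columns: vec![
                ColumnChunk { uncompressed_size: 4 << 20 },
                ColumnChunk { uncompressed_size: 1 << 20 },
            ],
        },
        RowGroup {
            columns: vec![ColumnChunk { uncompressed_size: 8 << 20 }],
        },
    ];
    assert_eq!(max_row_group_uncompressed_size(&row_groups), 8 << 20);
}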
+use std::fmt::Debug; use std::path::Path; use std::sync::Arc; use std::time::Duration; @@ -39,12 +40,14 @@ use flow::{ get_flow_auth_options, }; use meta_client::{MetaClientOptions, MetaClientType}; +use plugins::flownode::context::GrpcConfigureContext; +use servers::configurator::GrpcBuilderConfiguratorRef; use snafu::{OptionExt, ResultExt, ensure}; use tracing_appender::non_blocking::WorkerGuard; use crate::error::{ BuildCacheRegistrySnafu, InitMetadataSnafu, LoadLayeredConfigSnafu, MetaClientInitSnafu, - MissingConfigSnafu, Result, ShutdownFlownodeSnafu, StartFlownodeSnafu, + MissingConfigSnafu, OtherSnafu, Result, ShutdownFlownodeSnafu, StartFlownodeSnafu, }; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_heap_profile}; @@ -55,33 +58,14 @@ type FlownodeOptions = GreptimeOptions; pub struct Instance { flownode: FlownodeInstance, - - // The components of flownode, which make it easier to expand based - // on the components. - #[cfg(feature = "enterprise")] - components: Components, - // Keep the logging guard to prevent the worker from being dropped. _guard: Vec, } -#[cfg(feature = "enterprise")] -pub struct Components { - pub catalog_manager: catalog::CatalogManagerRef, - pub fe_client: Arc, - pub kv_backend: common_meta::kv_backend::KvBackendRef, -} - impl Instance { - pub fn new( - flownode: FlownodeInstance, - #[cfg(feature = "enterprise")] components: Components, - guard: Vec, - ) -> Self { + pub fn new(flownode: FlownodeInstance, guard: Vec) -> Self { Self { flownode, - #[cfg(feature = "enterprise")] - components, _guard: guard, } } @@ -94,11 +78,6 @@ impl Instance { pub fn flownode_mut(&mut self) -> &mut FlownodeInstance { &mut self.flownode } - - #[cfg(feature = "enterprise")] - pub fn components(&self) -> &Components { - &self.components - } } #[async_trait::async_trait] @@ -396,7 +375,7 @@ impl StartCommand { let frontend_client = Arc::new(frontend_client); let flownode_builder = FlownodeBuilder::new( opts.clone(), - plugins, + plugins.clone(), table_metadata_manager, catalog_manager.clone(), flow_metadata_manager, @@ -405,8 +384,29 @@ impl StartCommand { .with_heartbeat_task(heartbeat_task); let mut flownode = flownode_builder.build().await.context(StartFlownodeSnafu)?; + + let builder = + FlownodeServiceBuilder::grpc_server_builder(&opts, flownode.flownode_server()); + let builder = if let Some(configurator) = + plugins.get::>() + { + let context = GrpcConfigureContext { + kv_backend: cached_meta_backend.clone(), + fe_client: frontend_client.clone(), + flownode_id: member_id, + catalog_manager: catalog_manager.clone(), + }; + configurator + .configure(builder, context) + .await + .context(OtherSnafu)? 
+ } else { + builder + }; + let grpc_server = builder.build(); + let services = FlownodeServiceBuilder::new(&opts) - .with_default_grpc_server(flownode.flownode_server()) + .with_grpc_server(grpc_server) .enable_http_service() .build() .context(StartFlownodeSnafu)?; @@ -430,16 +430,6 @@ impl StartCommand { .set_frontend_invoker(invoker) .await; - #[cfg(feature = "enterprise")] - let components = Components { - catalog_manager: catalog_manager.clone(), - fe_client: frontend_client, - kv_backend: cached_meta_backend, - }; - - #[cfg(not(feature = "enterprise"))] - return Ok(Instance::new(flownode, guard)); - #[cfg(feature = "enterprise")] - Ok(Instance::new(flownode, components, guard)) + Ok(Instance::new(flownode, guard)) } } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 89992eba37..fa36a99ed4 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Debug; use std::path::Path; use std::sync::Arc; use std::time::Duration; @@ -19,7 +20,10 @@ use std::time::Duration; use async_trait::async_trait; use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; use catalog::information_extension::DistributedInformationExtension; -use catalog::kvbackend::{CachedKvBackendBuilder, KvBackendCatalogManagerBuilder, MetaKvBackend}; +use catalog::kvbackend::{ + CachedKvBackendBuilder, CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder, + MetaKvBackend, +}; use catalog::process_manager::ProcessManager; use clap::Parser; use client::client_manager::NodeClients; @@ -31,6 +35,7 @@ use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; use common_meta::heartbeat::handler::HandlerGroupExecutor; use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler; use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; +use common_meta::heartbeat::handler::suspend::SuspendHandler; use common_query::prelude::set_default_prefix; use common_stat::ResourceStatImpl; use common_telemetry::info; @@ -41,15 +46,17 @@ use frontend::frontend::Frontend; use frontend::heartbeat::HeartbeatTask; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; -use meta_client::{MetaClientOptions, MetaClientType}; +use meta_client::{MetaClientOptions, MetaClientRef, MetaClientType}; +use plugins::frontend::context::{ + CatalogManagerConfigureContext, DistributedCatalogManagerConfigureContext, +}; use servers::addrs; -use servers::export_metrics::ExportMetricsTask; use servers::grpc::GrpcOptions; -use servers::tls::{TlsMode, TlsOption}; +use servers::tls::{TlsMode, TlsOption, merge_tls_option}; use snafu::{OptionExt, ResultExt}; use tracing_appender::non_blocking::WorkerGuard; -use crate::error::{self, Result}; +use crate::error::{self, OtherSnafu, Result}; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, log_versions, maybe_activate_heap_profile}; @@ -177,6 +184,8 @@ pub struct StartCommand { #[clap(long)] tls_key_path: Option, #[clap(long)] + tls_watch: bool, + #[clap(long)] user_provider: Option, #[clap(long)] disable_dashboard: Option, @@ -230,6 +239,7 @@ impl StartCommand { self.tls_mode.clone(), self.tls_cert_path.clone(), self.tls_key_path.clone(), + self.tls_watch, ); if let Some(addr) = &self.http_addr { @@ -246,7 +256,7 @@ impl StartCommand { if let Some(addr) = &self.rpc_bind_addr 
{ opts.grpc.bind_addr.clone_from(addr); - opts.grpc.tls = tls_opts.clone(); + opts.grpc.tls = merge_tls_option(&opts.grpc.tls, tls_opts.clone()); } if let Some(addr) = &self.rpc_server_addr { @@ -281,13 +291,13 @@ impl StartCommand { if let Some(addr) = &self.mysql_addr { opts.mysql.enable = true; opts.mysql.addr.clone_from(addr); - opts.mysql.tls = tls_opts.clone(); + opts.mysql.tls = merge_tls_option(&opts.mysql.tls, tls_opts.clone()); } if let Some(addr) = &self.postgres_addr { opts.postgres.enable = true; opts.postgres.addr.clone_from(addr); - opts.postgres.tls = tls_opts; + opts.postgres.tls = merge_tls_option(&opts.postgres.tls, tls_opts.clone()); } if let Some(enable) = self.influxdb_enable { @@ -414,38 +424,30 @@ impl StartCommand { layered_cache_registry.clone(), ) .with_process_manager(process_manager.clone()); - #[cfg(feature = "enterprise")] - let builder = if let Some(factories) = plugins.get() { - builder.with_extra_information_table_factories(factories) + let builder = if let Some(configurator) = + plugins.get::>() + { + let ctx = DistributedCatalogManagerConfigureContext { + meta_client: meta_client.clone(), + }; + let ctx = CatalogManagerConfigureContext::Distributed(ctx); + + configurator + .configure(builder, ctx) + .await + .context(OtherSnafu)? } else { builder }; let catalog_manager = builder.build(); - let executor = HandlerGroupExecutor::new(vec![ - Arc::new(ParseMailboxMessageHandler), - Arc::new(InvalidateCacheHandler::new(layered_cache_registry.clone())), - ]); - - let mut resource_stat = ResourceStatImpl::default(); - resource_stat.start_collect_cpu_usage(); - - let heartbeat_task = HeartbeatTask::new( - &opts, - meta_client.clone(), - opts.heartbeat.clone(), - Arc::new(executor), - Arc::new(resource_stat), - ); - let heartbeat_task = Some(heartbeat_task); - let instance = FrontendBuilder::new( opts.clone(), cached_meta_backend.clone(), layered_cache_registry.clone(), catalog_manager, client, - meta_client, + meta_client.clone(), process_manager, ) .with_plugin(plugins.clone()) @@ -453,10 +455,10 @@ impl StartCommand { .try_build() .await .context(error::StartFrontendSnafu)?; - let instance = Arc::new(instance); - let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) - .context(error::ServersSnafu)?; + let heartbeat_task = Some(create_heartbeat_task(&opts, meta_client, &instance)); + + let instance = Arc::new(instance); let servers = Services::new(opts, instance.clone(), plugins) .build() @@ -466,13 +468,34 @@ impl StartCommand { instance, servers, heartbeat_task, - export_metrics_task, }; Ok(Instance::new(frontend, guard)) } } +pub fn create_heartbeat_task( + options: &frontend::frontend::FrontendOptions, + meta_client: MetaClientRef, + instance: &frontend::instance::Instance, +) -> HeartbeatTask { + let executor = Arc::new(HandlerGroupExecutor::new(vec![ + Arc::new(ParseMailboxMessageHandler), + Arc::new(SuspendHandler::new(instance.suspend_state())), + Arc::new(InvalidateCacheHandler::new( + instance.cache_invalidator().clone(), + )), + ])); + + let stat = { + let mut stat = ResourceStatImpl::default(); + stat.start_collect_cpu_usage(); + Arc::new(stat) + }; + + HeartbeatTask::new(options, meta_client, executor, stat) +} + #[cfg(test)] mod tests { use std::io::Write; diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index 4f71775e74..ee67267de3 100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations 
under the License. -use std::fmt; +use std::fmt::{self, Debug}; use std::path::Path; use std::time::Duration; @@ -23,7 +23,7 @@ use common_config::Configurable; use common_telemetry::info; use common_telemetry::logging::{DEFAULT_LOGGING_DIR, TracingOptions}; use common_version::{short_version, verbose_version}; -use meta_srv::bootstrap::MetasrvInstance; +use meta_srv::bootstrap::{MetasrvInstance, metasrv_builder}; use meta_srv::metasrv::BackendImpl; use snafu::ResultExt; use tracing_appender::non_blocking::WorkerGuard; @@ -177,7 +177,7 @@ pub struct StartCommand { backend: Option, } -impl fmt::Debug for StartCommand { +impl Debug for StartCommand { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("StartCommand") .field("rpc_bind_addr", &self.rpc_bind_addr) @@ -341,7 +341,7 @@ impl StartCommand { .await .context(StartMetaServerSnafu)?; - let builder = meta_srv::bootstrap::metasrv_builder(&opts, plugins, None) + let builder = metasrv_builder(&opts, plugins, None) .await .context(error::BuildMetaServerSnafu)?; let metasrv = builder.build().await.context(error::BuildMetaServerSnafu)?; diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index bf5aff7825..012680ac08 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Debug; use std::net::SocketAddr; use std::path::Path; use std::sync::Arc; @@ -20,7 +21,7 @@ use std::{fs, path}; use async_trait::async_trait; use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; use catalog::information_schema::InformationExtensionRef; -use catalog::kvbackend::KvBackendCatalogManagerBuilder; +use catalog::kvbackend::{CatalogManagerConfiguratorRef, KvBackendCatalogManagerBuilder}; use catalog::process_manager::ProcessManager; use clap::Parser; use common_base::Plugins; @@ -31,7 +32,7 @@ use common_meta::cache::LayeredCacheRegistryBuilder; use common_meta::ddl::flow_meta::FlowMetadataAllocator; use common_meta::ddl::table_meta::TableMetadataAllocator; use common_meta::ddl::{DdlContext, NoopRegionFailureDetectorControl}; -use common_meta::ddl_manager::DdlManager; +use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef}; use common_meta::key::flow::FlowMetadataManager; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; @@ -57,14 +58,17 @@ use frontend::instance::StandaloneDatanodeManager; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ}; -use servers::export_metrics::ExportMetricsTask; -use servers::tls::{TlsMode, TlsOption}; +use plugins::frontend::context::{ + CatalogManagerConfigureContext, StandaloneCatalogManagerConfigureContext, +}; +use plugins::standalone::context::DdlManagerConfigureContext; +use servers::tls::{TlsMode, TlsOption, merge_tls_option}; use snafu::ResultExt; use standalone::StandaloneInformationExtension; use standalone::options::StandaloneOptions; use tracing_appender::non_blocking::WorkerGuard; -use crate::error::{Result, StartFlownodeSnafu}; +use crate::error::{OtherSnafu, Result, StartFlownodeSnafu}; use crate::options::{GlobalOptions, GreptimeOptions}; use crate::{App, create_resource_limit_metrics, error, log_versions, maybe_activate_heap_profile}; @@ -117,34 +121,15 @@ pub struct Instance { flownode: FlownodeInstance, procedure_manager: ProcedureManagerRef, 
wal_options_allocator: WalOptionsAllocatorRef, - - // The components of standalone, which make it easier to expand based - // on the components. - #[cfg(feature = "enterprise")] - components: Components, - // Keep the logging guard to prevent the worker from being dropped. _guard: Vec, } -#[cfg(feature = "enterprise")] -pub struct Components { - pub plugins: Plugins, - pub kv_backend: KvBackendRef, - pub frontend_client: Arc, - pub catalog_manager: catalog::CatalogManagerRef, -} - impl Instance { /// Find the socket addr of a server by its `name`. pub fn server_addr(&self, name: &str) -> Option { self.frontend.server_handlers().addr(name) } - - #[cfg(feature = "enterprise")] - pub fn components(&self) -> &Components { - &self.components - } } #[async_trait] @@ -228,6 +213,8 @@ pub struct StartCommand { #[clap(long)] tls_key_path: Option, #[clap(long)] + tls_watch: bool, + #[clap(long)] user_provider: Option, #[clap(long, default_value = "GREPTIMEDB_STANDALONE")] pub env_prefix: String, @@ -277,6 +264,7 @@ impl StartCommand { self.tls_mode.clone(), self.tls_cert_path.clone(), self.tls_key_path.clone(), + self.tls_watch, ); if let Some(addr) = &self.http_addr { @@ -305,19 +293,20 @@ impl StartCommand { ), }.fail(); } - opts.grpc.bind_addr.clone_from(addr) + opts.grpc.bind_addr.clone_from(addr); + opts.grpc.tls = merge_tls_option(&opts.grpc.tls, tls_opts.clone()); } if let Some(addr) = &self.mysql_addr { opts.mysql.enable = true; opts.mysql.addr.clone_from(addr); - opts.mysql.tls = tls_opts.clone(); + opts.mysql.tls = merge_tls_option(&opts.mysql.tls, tls_opts.clone()); } if let Some(addr) = &self.postgres_addr { opts.postgres.enable = true; opts.postgres.addr.clone_from(addr); - opts.postgres.tls = tls_opts; + opts.postgres.tls = merge_tls_option(&opts.postgres.tls, tls_opts.clone()); } if self.influxdb_enable { @@ -413,6 +402,13 @@ impl StartCommand { plugins.insert::(information_extension.clone()); let process_manager = Arc::new(ProcessManager::new(opts.grpc.server_addr.clone(), None)); + + // for standalone not use grpc, but get a handler to frontend grpc client without + // actually make a connection + let (frontend_client, frontend_instance_handler) = + FrontendClient::from_empty_grpc_handler(opts.query.clone()); + let frontend_client = Arc::new(frontend_client); + let builder = KvBackendCatalogManagerBuilder::new( information_extension.clone(), kv_backend.clone(), @@ -420,9 +416,17 @@ impl StartCommand { ) .with_procedure_manager(procedure_manager.clone()) .with_process_manager(process_manager.clone()); - #[cfg(feature = "enterprise")] - let builder = if let Some(factories) = plugins.get() { - builder.with_extra_information_table_factories(factories) + let builder = if let Some(configurator) = + plugins.get::>() + { + let ctx = StandaloneCatalogManagerConfigureContext { + fe_client: frontend_client.clone(), + }; + let ctx = CatalogManagerConfigureContext::Standalone(ctx); + configurator + .configure(builder, ctx) + .await + .context(OtherSnafu)? 
} else { builder }; @@ -437,11 +441,6 @@ impl StartCommand { ..Default::default() }; - // for standalone not use grpc, but get a handler to frontend grpc client without - // actually make a connection - let (frontend_client, frontend_instance_handler) = - FrontendClient::from_empty_grpc_handler(opts.query.clone()); - let frontend_client = Arc::new(frontend_client); let flow_builder = FlownodeBuilder::new( flownode_options, plugins.clone(), @@ -512,11 +511,21 @@ impl StartCommand { let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager.clone(), true) .context(error::InitDdlManagerSnafu)?; - #[cfg(feature = "enterprise")] - let ddl_manager = { - let trigger_ddl_manager: Option = - plugins.get(); - ddl_manager.with_trigger_ddl_manager(trigger_ddl_manager) + + let ddl_manager = if let Some(configurator) = + plugins.get::>() + { + let ctx = DdlManagerConfigureContext { + kv_backend: kv_backend.clone(), + fe_client: frontend_client.clone(), + catalog_manager: catalog_manager.clone(), + }; + configurator + .configure(ddl_manager, ctx) + .await + .context(OtherSnafu)? + } else { + ddl_manager }; let procedure_executor = Arc::new(LocalProcedureExecutor::new( @@ -562,9 +571,6 @@ impl StartCommand { .context(StartFlownodeSnafu)?; flow_streaming_engine.set_frontend_invoker(invoker).await; - let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) - .context(error::ServersSnafu)?; - let servers = Services::new(opts, fe_instance.clone(), plugins.clone()) .build() .context(error::StartFrontendSnafu)?; @@ -573,15 +579,6 @@ impl StartCommand { instance: fe_instance, servers, heartbeat_task: None, - export_metrics_task, - }; - - #[cfg(feature = "enterprise")] - let components = Components { - plugins, - kv_backend, - frontend_client, - catalog_manager, }; Ok(Instance { @@ -590,8 +587,6 @@ impl StartCommand { flownode, procedure_manager, wal_options_allocator, - #[cfg(feature = "enterprise")] - components, _guard: guard, }) } @@ -769,6 +764,8 @@ mod tests { fn test_load_log_options_from_cli() { let cmd = StartCommand { user_provider: Some("static_user_provider:cmd:test=test".to_string()), + mysql_addr: Some("127.0.0.1:4002".to_string()), + postgres_addr: Some("127.0.0.1:4003".to_string()), ..Default::default() }; diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs index 222012bfd8..79b42dbfc1 100644 --- a/src/cmd/tests/load_config_test.rs +++ b/src/cmd/tests/load_config_test.rs @@ -31,7 +31,6 @@ use meta_srv::selector::SelectorType; use metric_engine::config::EngineConfig as MetricEngineConfig; use mito2::config::MitoConfig; use query::options::QueryOptions; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::http::HttpOptions; use servers::tls::{TlsMode, TlsOption}; @@ -53,7 +52,6 @@ fn test_load_datanode_example_config() { meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), - heartbeat_timeout: Duration::from_millis(500), ddl_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(1), tcp_nodelay: true, @@ -95,11 +93,6 @@ fn test_load_datanode_example_config() { tracing_sample_ratio: Some(Default::default()), ..Default::default() }, - export_metrics: ExportMetricsOption { - self_import: None, - remote_write: Some(Default::default()), - ..Default::default() - }, grpc: GrpcOptions::default() .with_bind_addr("127.0.0.1:3001") .with_server_addr("127.0.0.1:3001"), @@ -124,7 +117,6 @@ fn 
test_load_frontend_example_config() { meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), - heartbeat_timeout: Duration::from_millis(500), ddl_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(1), tcp_nodelay: true, @@ -146,11 +138,6 @@ fn test_load_frontend_example_config() { ..Default::default() }, }, - export_metrics: ExportMetricsOption { - self_import: None, - remote_write: Some(Default::default()), - ..Default::default() - }, grpc: GrpcOptions { bind_addr: "127.0.0.1:4001".to_string(), server_addr: "127.0.0.1:4001".to_string(), @@ -201,11 +188,6 @@ fn test_load_metasrv_example_config() { tcp_nodelay: true, }, }, - export_metrics: ExportMetricsOption { - self_import: None, - remote_write: Some(Default::default()), - ..Default::default() - }, backend_tls: Some(TlsOption { mode: TlsMode::Prefer, cert_path: String::new(), @@ -257,7 +239,6 @@ fn test_load_flownode_example_config() { meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), - heartbeat_timeout: Duration::from_millis(500), ddl_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(1), tcp_nodelay: true, @@ -317,11 +298,6 @@ fn test_load_standalone_example_config() { tracing_sample_ratio: Some(Default::default()), ..Default::default() }, - export_metrics: ExportMetricsOption { - self_import: Some(Default::default()), - remote_write: Some(Default::default()), - ..Default::default() - }, http: HttpOptions { cors_allowed_origins: vec!["https://example.com".to_string()], ..Default::default() diff --git a/src/common/base/src/plugins.rs b/src/common/base/src/plugins.rs index bbab003c69..aa1a9d1287 100644 --- a/src/common/base/src/plugins.rs +++ b/src/common/base/src/plugins.rs @@ -32,7 +32,12 @@ impl Plugins { pub fn insert(&self, value: T) { let last = self.write().insert(value); - assert!(last.is_none(), "each type of plugins must be one and only"); + if last.is_some() { + panic!( + "Plugin of type {} already exists", + std::any::type_name::() + ); + } } pub fn get(&self) -> Option { @@ -140,7 +145,7 @@ mod tests { } #[test] - #[should_panic(expected = "each type of plugins must be one and only")] + #[should_panic(expected = "Plugin of type i32 already exists")] fn test_plugin_uniqueness() { let plugins = Plugins::new(); plugins.insert(1i32); diff --git a/src/common/catalog/src/consts.rs b/src/common/catalog/src/consts.rs index 8a59a15cc6..1cd5db8a0c 100644 --- a/src/common/catalog/src/consts.rs +++ b/src/common/catalog/src/consts.rs @@ -86,8 +86,6 @@ pub const INFORMATION_SCHEMA_TRIGGERS_TABLE_ID: u32 = 24; pub const INFORMATION_SCHEMA_GLOBAL_STATUS_TABLE_ID: u32 = 25; /// id for information_schema.SESSION_STATUS pub const INFORMATION_SCHEMA_SESSION_STATUS_TABLE_ID: u32 = 26; -/// id for information_schema.RUNTIME_METRICS -pub const INFORMATION_SCHEMA_RUNTIME_METRICS_TABLE_ID: u32 = 27; /// id for information_schema.PARTITIONS pub const INFORMATION_SCHEMA_PARTITIONS_TABLE_ID: u32 = 28; /// id for information_schema.REGION_PEERS @@ -112,6 +110,8 @@ pub const INFORMATION_SCHEMA_SSTS_MANIFEST_TABLE_ID: u32 = 37; pub const INFORMATION_SCHEMA_SSTS_STORAGE_TABLE_ID: u32 = 38; /// id for information_schema.ssts_index_meta pub const INFORMATION_SCHEMA_SSTS_INDEX_META_TABLE_ID: u32 = 39; +/// id for information_schema.alerts +pub const INFORMATION_SCHEMA_ALERTS_TABLE_ID: u32 = 40; // ----- End of information_schema tables ----- diff --git a/src/common/config/Cargo.toml 
b/src/common/config/Cargo.toml index b45c03a6c3..2737f82a58 100644 --- a/src/common/config/Cargo.toml +++ b/src/common/config/Cargo.toml @@ -11,8 +11,10 @@ workspace = true common-base.workspace = true common-error.workspace = true common-macro.workspace = true +common-telemetry.workspace = true config.workspace = true humantime-serde.workspace = true +notify.workspace = true object-store.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/src/common/config/src/error.rs b/src/common/config/src/error.rs index fbce83fd00..82abd8a9b8 100644 --- a/src/common/config/src/error.rs +++ b/src/common/config/src/error.rs @@ -49,14 +49,41 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to watch file: {}", path))] + FileWatch { + path: String, + #[snafu(source)] + error: notify::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to canonicalize path: {}", path))] + CanonicalizePath { + path: String, + #[snafu(source)] + error: std::io::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Invalid path '{}': expected a file, not a directory", path))] + InvalidPath { + path: String, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - Error::TomlFormat { .. } | Error::LoadLayeredConfig { .. } => { - StatusCode::InvalidArguments - } + Error::TomlFormat { .. } + | Error::LoadLayeredConfig { .. } + | Error::FileWatch { .. } + | Error::InvalidPath { .. } + | Error::CanonicalizePath { .. } => StatusCode::InvalidArguments, Error::SerdeJson { .. } => StatusCode::Unexpected, } } diff --git a/src/common/config/src/file_watcher.rs b/src/common/config/src/file_watcher.rs new file mode 100644 index 0000000000..2507af024a --- /dev/null +++ b/src/common/config/src/file_watcher.rs @@ -0,0 +1,355 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Common file watching utilities for configuration hot-reloading. +//! +//! This module provides a generic file watcher that can be used to watch +//! files for changes and trigger callbacks when changes occur. +//! +//! The watcher monitors the parent directory of each file rather than the +//! file itself. This ensures that file deletions and recreations are properly +//! tracked, which is common with editors that use atomic saves or when +//! configuration files are replaced. + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; +use std::sync::mpsc::channel; + +use common_telemetry::{error, info, warn}; +use notify::{EventKind, RecursiveMode, Watcher}; +use snafu::ResultExt; + +use crate::error::{CanonicalizePathSnafu, FileWatchSnafu, InvalidPathSnafu, Result}; + +/// Configuration for the file watcher behavior. +#[derive(Debug, Clone, Default)] +pub struct FileWatcherConfig { + /// Whether to include Remove events in addition to Modify and Create. 
+ pub include_remove_events: bool, +} + +impl FileWatcherConfig { + pub fn new() -> Self { + Self::default() + } + + pub fn with_modify_and_create(mut self) -> Self { + self.include_remove_events = false; + self + } + + pub fn with_remove_events(mut self) -> Self { + self.include_remove_events = true; + self + } +} + +/// A builder for creating file watchers with flexible configuration. +/// +/// The watcher monitors the parent directory of each file to handle file +/// deletion and recreation properly. Events are filtered to only trigger +/// callbacks for the specific files being watched. +pub struct FileWatcherBuilder { + config: FileWatcherConfig, + /// Canonicalized paths of files to watch. + file_paths: Vec, +} + +impl FileWatcherBuilder { + /// Create a new builder with default configuration. + pub fn new() -> Self { + Self { + config: FileWatcherConfig::default(), + file_paths: Vec::new(), + } + } + + /// Set the watcher configuration. + pub fn config(mut self, config: FileWatcherConfig) -> Self { + self.config = config; + self + } + + /// Add a file path to watch. + /// + /// Returns an error if the path is a directory. + /// The path is canonicalized for reliable comparison with events. + pub fn watch_path>(mut self, path: P) -> Result { + let path = path.as_ref(); + snafu::ensure!( + path.is_file(), + InvalidPathSnafu { + path: path.display().to_string(), + } + ); + // Canonicalize the path for reliable comparison with event paths + let canonical = path.canonicalize().context(CanonicalizePathSnafu { + path: path.display().to_string(), + })?; + self.file_paths.push(canonical); + Ok(self) + } + + /// Add multiple file paths to watch. + /// + /// Returns an error if any path is a directory. + pub fn watch_paths, I: IntoIterator>( + mut self, + paths: I, + ) -> Result { + for path in paths { + self = self.watch_path(path)?; + } + Ok(self) + } + + /// Build and spawn the file watcher with the given callback. + /// + /// The callback is invoked when relevant file events are detected for + /// the watched files. The watcher monitors the parent directories to + /// handle file deletion and recreation properly. + /// + /// The spawned watcher thread runs for the lifetime of the process. 
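// Illustrative sketch (not part of this patch): minimal usage of the builder documented
// above, assuming the `common_config::file_watcher` module path introduced by this file.
// The certificate path and the reload closure are hypothetical.
use common_config::error::Result;
use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig};

fn watch_tls_cert(cert_path: &str) -> Result<()> {
    FileWatcherBuilder::new()
        .watch_path(cert_path)?
        // Include Remove events so replace-by-rename saves are also picked up.
        .config(FileWatcherConfig::new().with_remove_events())
        .spawn(|| {
            // A real caller would rebuild its TLS configuration here.
            println!("watched file changed, reloading");
        })
}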
+ pub fn spawn(self, callback: F) -> Result<()> + where + F: Fn() + Send + 'static, + { + let (tx, rx) = channel::>(); + let mut watcher = + notify::recommended_watcher(tx).context(FileWatchSnafu { path: "" })?; + + // Collect unique parent directories to watch + let mut watched_dirs: HashSet = HashSet::new(); + for file_path in &self.file_paths { + if let Some(parent) = file_path.parent() + && watched_dirs.insert(parent.to_path_buf()) + { + watcher + .watch(parent, RecursiveMode::NonRecursive) + .context(FileWatchSnafu { + path: parent.display().to_string(), + })?; + } + } + + let config = self.config; + let watched_files: HashSet = self.file_paths.iter().cloned().collect(); + + info!( + "Spawning file watcher for paths: {:?} (watching parent directories)", + self.file_paths + .iter() + .map(|p| p.display().to_string()) + .collect::>() + ); + + std::thread::spawn(move || { + // Keep watcher alive in the thread + let _watcher = watcher; + + while let Ok(res) = rx.recv() { + match res { + Ok(event) => { + if !is_relevant_event(&event.kind, &config) { + continue; + } + + // Check if any of the event paths match our watched files + let is_watched_file = event.paths.iter().any(|event_path| { + // Try to canonicalize the event path for comparison + // If the file was deleted, canonicalize will fail, so we also + // compare the raw path + if let Ok(canonical) = event_path.canonicalize() + && watched_files.contains(&canonical) + { + return true; + } + // For deleted files, compare using the raw path + watched_files.contains(event_path) + }); + + if !is_watched_file { + continue; + } + + info!(?event.kind, ?event.paths, "Detected file change"); + callback(); + } + Err(err) => { + warn!("File watcher error: {}", err); + } + } + } + + error!("File watcher channel closed unexpectedly"); + }); + + Ok(()) + } +} + +impl Default for FileWatcherBuilder { + fn default() -> Self { + Self::new() + } +} + +/// Check if an event kind is relevant based on the configuration. 
+fn is_relevant_event(kind: &EventKind, config: &FileWatcherConfig) -> bool { + match kind { + EventKind::Modify(_) | EventKind::Create(_) => true, + EventKind::Remove(_) => config.include_remove_events, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; + + use common_test_util::temp_dir::create_temp_dir; + + use super::*; + + #[test] + fn test_file_watcher_detects_changes() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir("test_file_watcher"); + let file_path = dir.path().join("test_file.txt"); + + // Create initial file + std::fs::write(&file_path, "initial content").unwrap(); + + let counter = Arc::new(AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + FileWatcherBuilder::new() + .watch_path(&file_path) + .unwrap() + .config(FileWatcherConfig::new()) + .spawn(move || { + counter_clone.fetch_add(1, Ordering::SeqCst); + }) + .unwrap(); + + // Give watcher time to start + std::thread::sleep(Duration::from_millis(100)); + + // Modify the file + std::fs::write(&file_path, "modified content").unwrap(); + + // Wait for the event to be processed + std::thread::sleep(Duration::from_millis(500)); + + assert!( + counter.load(Ordering::SeqCst) >= 1, + "Watcher should have detected at least one change" + ); + } + + #[test] + fn test_file_watcher_detects_delete_and_recreate() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir("test_file_watcher_recreate"); + let file_path = dir.path().join("test_file.txt"); + + // Create initial file + std::fs::write(&file_path, "initial content").unwrap(); + + let counter = Arc::new(AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + FileWatcherBuilder::new() + .watch_path(&file_path) + .unwrap() + .config(FileWatcherConfig::new()) + .spawn(move || { + counter_clone.fetch_add(1, Ordering::SeqCst); + }) + .unwrap(); + + // Give watcher time to start + std::thread::sleep(Duration::from_millis(100)); + + // Delete the file + std::fs::remove_file(&file_path).unwrap(); + std::thread::sleep(Duration::from_millis(100)); + + // Recreate the file - this should still be detected because we watch the directory + std::fs::write(&file_path, "recreated content").unwrap(); + + // Wait for the event to be processed + std::thread::sleep(Duration::from_millis(500)); + + assert!( + counter.load(Ordering::SeqCst) >= 1, + "Watcher should have detected file recreation" + ); + } + + #[test] + fn test_file_watcher_ignores_other_files() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir("test_file_watcher_other"); + let watched_file = dir.path().join("watched.txt"); + let other_file = dir.path().join("other.txt"); + + // Create both files + std::fs::write(&watched_file, "watched content").unwrap(); + std::fs::write(&other_file, "other content").unwrap(); + + let counter = Arc::new(AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + FileWatcherBuilder::new() + .watch_path(&watched_file) + .unwrap() + .config(FileWatcherConfig::new()) + .spawn(move || { + counter_clone.fetch_add(1, Ordering::SeqCst); + }) + .unwrap(); + + // Give watcher time to start + std::thread::sleep(Duration::from_millis(100)); + + // Modify the other file - should NOT trigger callback + std::fs::write(&other_file, "modified other content").unwrap(); + + // Wait for potential event + std::thread::sleep(Duration::from_millis(500)); + + assert_eq!( + counter.load(Ordering::SeqCst), + 0, + "Watcher 
should not have detected changes to other files" + ); + + // Now modify the watched file - SHOULD trigger callback + std::fs::write(&watched_file, "modified watched content").unwrap(); + + // Wait for the event to be processed + std::thread::sleep(Duration::from_millis(500)); + + assert!( + counter.load(Ordering::SeqCst) >= 1, + "Watcher should have detected change to watched file" + ); + } +} diff --git a/src/common/config/src/lib.rs b/src/common/config/src/lib.rs index cc25ebce16..eea3b1351d 100644 --- a/src/common/config/src/lib.rs +++ b/src/common/config/src/lib.rs @@ -14,6 +14,7 @@ pub mod config; pub mod error; +pub mod file_watcher; use std::time::Duration; diff --git a/src/common/datasource/src/buffered_writer.rs b/src/common/datasource/src/buffered_writer.rs index e1571b0187..953715b223 100644 --- a/src/common/datasource/src/buffered_writer.rs +++ b/src/common/datasource/src/buffered_writer.rs @@ -12,28 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::future::Future; - use arrow::record_batch::RecordBatch; use async_trait::async_trait; use datafusion::parquet::format::FileMetaData; -use snafu::{OptionExt, ResultExt}; -use tokio::io::{AsyncWrite, AsyncWriteExt}; -use crate::error::{self, Result}; -use crate::share_buffer::SharedBuffer; - -pub struct LazyBufferedWriter { - path: String, - writer_factory: F, - writer: Option, - /// None stands for [`LazyBufferedWriter`] closed. - encoder: Option, - buffer: SharedBuffer, - rows_written: usize, - bytes_written: u64, - threshold: usize, -} +use crate::error::Result; pub trait DfRecordBatchEncoder { fn write(&mut self, batch: &RecordBatch) -> Result<()>; @@ -43,126 +26,3 @@ pub trait DfRecordBatchEncoder { pub trait ArrowWriterCloser { async fn close(mut self) -> Result; } - -impl< - T: AsyncWrite + Send + Unpin, - U: DfRecordBatchEncoder + ArrowWriterCloser, - F: Fn(String) -> Fut, - Fut: Future>, -> LazyBufferedWriter -{ - /// Closes `LazyBufferedWriter` and optionally flushes all data to underlying storage - /// if any row's been written. - pub async fn close_with_arrow_writer(mut self) -> Result<(FileMetaData, u64)> { - let encoder = self - .encoder - .take() - .context(error::BufferedWriterClosedSnafu)?; - let metadata = encoder.close().await?; - - // It's important to shut down! flushes all pending writes - self.close_inner_writer().await?; - Ok((metadata, self.bytes_written)) - } -} - -impl< - T: AsyncWrite + Send + Unpin, - U: DfRecordBatchEncoder, - F: Fn(String) -> Fut, - Fut: Future>, -> LazyBufferedWriter -{ - /// Closes the writer and flushes the buffer data. - pub async fn close_inner_writer(&mut self) -> Result<()> { - // Use `rows_written` to keep a track of if any rows have been written. - // If no row's been written, then we can simply close the underlying - // writer without flush so that no file will be actually created. 
- if self.rows_written != 0 { - self.bytes_written += self.try_flush(true).await?; - } - - if let Some(writer) = &mut self.writer { - writer.shutdown().await.context(error::AsyncWriteSnafu)?; - } - Ok(()) - } - - pub fn new( - threshold: usize, - buffer: SharedBuffer, - encoder: U, - path: impl AsRef, - writer_factory: F, - ) -> Self { - Self { - path: path.as_ref().to_string(), - threshold, - encoder: Some(encoder), - buffer, - rows_written: 0, - bytes_written: 0, - writer_factory, - writer: None, - } - } - - pub async fn write(&mut self, batch: &RecordBatch) -> Result<()> { - let encoder = self - .encoder - .as_mut() - .context(error::BufferedWriterClosedSnafu)?; - encoder.write(batch)?; - self.rows_written += batch.num_rows(); - self.bytes_written += self.try_flush(false).await?; - Ok(()) - } - - async fn try_flush(&mut self, all: bool) -> Result { - let mut bytes_written: u64 = 0; - - // Once buffered data size reaches threshold, split the data in chunks (typically 4MB) - // and write to underlying storage. - while self.buffer.buffer.lock().unwrap().len() >= self.threshold { - let chunk = { - let mut buffer = self.buffer.buffer.lock().unwrap(); - buffer.split_to(self.threshold) - }; - let size = chunk.len(); - - self.maybe_init_writer() - .await? - .write_all(&chunk) - .await - .context(error::AsyncWriteSnafu)?; - - bytes_written += size as u64; - } - - if all { - bytes_written += self.try_flush_all().await?; - } - Ok(bytes_written) - } - - /// Only initiates underlying file writer when rows have been written. - async fn maybe_init_writer(&mut self) -> Result<&mut T> { - if let Some(ref mut writer) = self.writer { - Ok(writer) - } else { - let writer = (self.writer_factory)(self.path.clone()).await?; - Ok(self.writer.insert(writer)) - } - } - - async fn try_flush_all(&mut self) -> Result { - let remain = self.buffer.buffer.lock().unwrap().split(); - let size = remain.len(); - self.maybe_init_writer() - .await? - .write_all(&remain) - .await - .context(error::AsyncWriteSnafu)?; - Ok(size as u64) - } -} diff --git a/src/common/datasource/src/compressed_writer.rs b/src/common/datasource/src/compressed_writer.rs new file mode 100644 index 0000000000..afd2544f4c --- /dev/null +++ b/src/common/datasource/src/compressed_writer.rs @@ -0,0 +1,202 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use async_compression::tokio::write::{BzEncoder, GzipEncoder, XzEncoder, ZstdEncoder}; +use snafu::ResultExt; +use tokio::io::{AsyncWrite, AsyncWriteExt}; + +use crate::compression::CompressionType; +use crate::error::{self, Result}; + +/// A compressed writer that wraps an underlying async writer with compression. +/// +/// This writer supports multiple compression formats including GZIP, BZIP2, XZ, and ZSTD. +/// It provides transparent compression for any async writer implementation. 
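// Illustrative sketch (not part of this patch): wrapping an in-memory writer with gzip
// compression through the `IntoCompressedWriter` extension defined below. It assumes the
// surrounding module's imports (`CompressionType`, the extension trait); names are made up.
async fn gzip_write_sketch() {
    use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex};

    let (writer, mut reader) = duplex(1024);
    let mut compressed = writer.into_compressed_writer(CompressionType::Gzip);
    compressed.write_all(b"hello gzip").await.unwrap();
    // `shutdown` flushes the encoder trailer before closing the underlying writer.
    compressed.shutdown().await.unwrap();

    let mut bytes = Vec::new();
    reader.read_to_end(&mut bytes).await.unwrap();
    // Gzip output starts with the 0x1f 0x8b magic bytes.
    assert_eq!(bytes[0], 0x1f);
    assert_eq!(bytes[1], 0x8b);
}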
+pub struct CompressedWriter { + inner: Box, + compression_type: CompressionType, +} + +impl CompressedWriter { + /// Creates a new compressed writer with the specified compression type. + /// + /// # Arguments + /// + /// * `writer` - The underlying writer to wrap with compression + /// * `compression_type` - The type of compression to apply + pub fn new( + writer: impl AsyncWrite + Unpin + Send + 'static, + compression_type: CompressionType, + ) -> Self { + let inner: Box = match compression_type { + CompressionType::Gzip => Box::new(GzipEncoder::new(writer)), + CompressionType::Bzip2 => Box::new(BzEncoder::new(writer)), + CompressionType::Xz => Box::new(XzEncoder::new(writer)), + CompressionType::Zstd => Box::new(ZstdEncoder::new(writer)), + CompressionType::Uncompressed => Box::new(writer), + }; + + Self { + inner, + compression_type, + } + } + + /// Returns the compression type used by this writer. + pub fn compression_type(&self) -> CompressionType { + self.compression_type + } + + /// Flush the writer and shutdown compression + pub async fn shutdown(mut self) -> Result<()> { + self.inner + .shutdown() + .await + .context(error::AsyncWriteSnafu)?; + Ok(()) + } +} + +impl AsyncWrite for CompressedWriter { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.inner).poll_write(cx, buf) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_shutdown(cx) + } +} + +/// A trait for converting async writers into compressed writers. +/// +/// This trait is automatically implemented for all types that implement [`AsyncWrite`]. +pub trait IntoCompressedWriter { + /// Converts this writer into a [`CompressedWriter`] with the specified compression type. 
+ /// + /// # Arguments + /// + /// * `self` - The underlying writer to wrap with compression + /// * `compression_type` - The type of compression to apply + fn into_compressed_writer(self, compression_type: CompressionType) -> CompressedWriter + where + Self: AsyncWrite + Unpin + Send + 'static + Sized, + { + CompressedWriter::new(self, compression_type) + } +} + +impl IntoCompressedWriter for W {} + +#[cfg(test)] +mod tests { + use tokio::io::{AsyncReadExt, AsyncWriteExt, duplex}; + + use super::*; + + #[tokio::test] + async fn test_compressed_writer_gzip() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Gzip); + let original = b"test data for gzip compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_bzip2() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Bzip2); + let original = b"test data for bzip2 compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_xz() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Xz); + let original = b"test data for xz compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_zstd() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Zstd); + let original = b"test data for zstd compression"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // The compressed data should be different from the original + assert_ne!(buffer, original); + assert!(!buffer.is_empty()); + } + + #[tokio::test] + async fn test_compressed_writer_uncompressed() { + let (duplex_writer, mut duplex_reader) = duplex(1024); + let mut writer = duplex_writer.into_compressed_writer(CompressionType::Uncompressed); + let original = b"test data for uncompressed"; + + writer.write_all(original).await.unwrap(); + writer.shutdown().await.unwrap(); + + let mut buffer = Vec::new(); + duplex_reader.read_to_end(&mut buffer).await.unwrap(); + + // Uncompressed data should be the same as the original + assert_eq!(buffer, original); + } +} diff --git a/src/common/datasource/src/error.rs b/src/common/datasource/src/error.rs index cfaa5a19c0..a8aa08e55c 100644 --- a/src/common/datasource/src/error.rs +++ b/src/common/datasource/src/error.rs @@ -194,12 +194,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Buffered 
writer closed"))] - BufferedWriterClosed { - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to write parquet file, path: {}", path))] WriteParquet { path: String, @@ -208,6 +202,14 @@ pub enum Error { #[snafu(source)] error: parquet::errors::ParquetError, }, + + #[snafu(display("Failed to build file stream"))] + BuildFileStream { + #[snafu(implicit)] + location: Location, + #[snafu(source)] + error: datafusion::error::DataFusionError, + }, } pub type Result = std::result::Result; @@ -239,7 +241,7 @@ impl ErrorExt for Error { | ReadRecordBatch { .. } | WriteRecordBatch { .. } | EncodeRecordBatch { .. } - | BufferedWriterClosed { .. } + | BuildFileStream { .. } | OrcReader { .. } => StatusCode::Unexpected, } } diff --git a/src/common/datasource/src/file_format.rs b/src/common/datasource/src/file_format.rs index 7c4e8d6c88..614be170e8 100644 --- a/src/common/datasource/src/file_format.rs +++ b/src/common/datasource/src/file_format.rs @@ -30,12 +30,22 @@ use arrow::record_batch::RecordBatch; use arrow_schema::{ArrowError, Schema as ArrowSchema}; use async_trait::async_trait; use bytes::{Buf, Bytes}; -use datafusion::datasource::physical_plan::FileOpenFuture; +use common_recordbatch::DfSendableRecordBatchStream; +use datafusion::datasource::file_format::file_compression_type::FileCompressionType as DfCompressionType; +use datafusion::datasource::listing::PartitionedFile; +use datafusion::datasource::object_store::ObjectStoreUrl; +use datafusion::datasource::physical_plan::{ + FileGroup, FileOpenFuture, FileScanConfigBuilder, FileSource, FileStream, +}; use datafusion::error::{DataFusionError, Result as DataFusionResult}; use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datatypes::arrow::datatypes::SchemaRef; use futures::{StreamExt, TryStreamExt}; use object_store::ObjectStore; +use object_store_opendal::OpendalStore; use snafu::ResultExt; +use tokio::io::AsyncWriteExt; use tokio_util::compat::FuturesAsyncWriteCompatExt; use self::csv::CsvFormat; @@ -43,7 +53,8 @@ use self::json::JsonFormat; use self::orc::OrcFormat; use self::parquet::ParquetFormat; use crate::DEFAULT_WRITE_BUFFER_SIZE; -use crate::buffered_writer::{DfRecordBatchEncoder, LazyBufferedWriter}; +use crate::buffered_writer::DfRecordBatchEncoder; +use crate::compressed_writer::{CompressedWriter, IntoCompressedWriter}; use crate::compression::CompressionType; use crate::error::{self, Result}; use crate::share_buffer::SharedBuffer; @@ -195,33 +206,128 @@ pub async fn infer_schemas( ArrowSchema::try_merge(schemas).context(error::MergeSchemaSnafu) } -pub async fn stream_to_file T>( +/// Writes data to a compressed writer if the data is not empty. +/// +/// Does nothing if `data` is empty; otherwise writes all data and returns any error. +async fn write_to_compressed_writer( + compressed_writer: &mut CompressedWriter, + data: &[u8], +) -> Result<()> { + if !data.is_empty() { + compressed_writer + .write_all(data) + .await + .context(error::AsyncWriteSnafu)?; + } + Ok(()) +} + +/// Streams [SendableRecordBatchStream] to a file with optional compression support. +/// Data is buffered and flushed according to the given `threshold`. +/// Ensures that writer resources are cleanly released and that an empty file is not +/// created if no rows are written. +/// +/// Returns the total number of rows successfully written. 
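// Illustrative caller sketch (not part of this patch): exporting a record batch stream as
// gzip-compressed NDJSON through this helper, assuming the surrounding module's imports.
// The destination path, threshold, and concurrency values are hypothetical; the encoder
// factory mirrors the line-delimited JSON writer used by `stream_to_json`.
async fn export_ndjson_gz_sketch(
    stream: SendableRecordBatchStream,
    store: ObjectStore,
) -> Result<usize> {
    stream_to_file(
        stream,
        store,
        "out/data.json.gz", // hypothetical destination
        8 * 1024 * 1024,    // flush whenever the shared buffer reaches ~8 MiB
        4,                  // OpenDAL writer concurrency
        CompressionType::Gzip,
        arrow::json::LineDelimitedWriter::new,
    )
    .await
}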
+pub async fn stream_to_file( mut stream: SendableRecordBatchStream, store: ObjectStore, path: &str, threshold: usize, concurrency: usize, - encoder_factory: U, -) -> Result { + compression_type: CompressionType, + encoder_factory: impl Fn(SharedBuffer) -> E, +) -> Result +where + E: DfRecordBatchEncoder, +{ + // Create the file writer with OpenDAL's built-in buffering + let writer = store + .writer_with(path) + .concurrent(concurrency) + .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize) + .await + .with_context(|_| error::WriteObjectSnafu { path })? + .into_futures_async_write() + .compat_write(); + + // Apply compression if needed + let mut compressed_writer = writer.into_compressed_writer(compression_type); + + // Create a buffer for the encoder let buffer = SharedBuffer::with_capacity(threshold); - let encoder = encoder_factory(buffer.clone()); - let mut writer = LazyBufferedWriter::new(threshold, buffer, encoder, path, |path| async { - store - .writer_with(&path) - .concurrent(concurrency) - .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize) - .await - .map(|v| v.into_futures_async_write().compat_write()) - .context(error::WriteObjectSnafu { path }) - }); + let mut encoder = encoder_factory(buffer.clone()); let mut rows = 0; + // Process each record batch while let Some(batch) = stream.next().await { let batch = batch.context(error::ReadRecordBatchSnafu)?; - writer.write(&batch).await?; + + // Write batch using the encoder + encoder.write(&batch)?; rows += batch.num_rows(); + + loop { + let chunk = { + let mut buffer_guard = buffer.buffer.lock().unwrap(); + if buffer_guard.len() < threshold { + break; + } + buffer_guard.split_to(threshold) + }; + write_to_compressed_writer(&mut compressed_writer, &chunk).await?; + } } - writer.close_inner_writer().await?; + + // If no row's been written, just simply close the underlying writer + // without flush so that no file will be actually created. + if rows != 0 { + // Final flush of any remaining data + let final_data = { + let mut buffer_guard = buffer.buffer.lock().unwrap(); + buffer_guard.split() + }; + write_to_compressed_writer(&mut compressed_writer, &final_data).await?; + } + + // Shutdown compression and close writer + compressed_writer.shutdown().await?; + Ok(rows) } + +/// Creates a [FileStream] for reading data from a file with optional column projection +/// and compression support. +/// +/// Returns [SendableRecordBatchStream]. 
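// Illustrative caller sketch (not part of this patch): reading a gzip-compressed CSV file
// back as a stream, mirroring the round-trip test in `file_format::csv`. The file name is
// hypothetical; the schema and store are assumed to be prepared by the caller.
async fn read_csv_gz_sketch(
    store: &ObjectStore,
    schema: SchemaRef,
) -> Result<DfSendableRecordBatchStream> {
    use datafusion::datasource::physical_plan::CsvSource;

    let source = CsvSource::new(true, b',', b'"')
        .with_schema(schema.clone())
        .with_batch_size(8192);
    file_to_stream(
        store,
        "out/data.csv.gz", // hypothetical file
        schema,
        source,
        None, // no projection: keep every column
        CompressionType::Gzip,
    )
    .await
}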
+pub async fn file_to_stream( + store: &ObjectStore, + filename: &str, + file_schema: SchemaRef, + file_source: Arc, + projection: Option>, + compression_type: CompressionType, +) -> Result { + let df_compression: DfCompressionType = compression_type.into(); + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + file_schema, + file_source.clone(), + ) + .with_file_group(FileGroup::new(vec![PartitionedFile::new( + filename.to_string(), + 0, + )])) + .with_projection(projection) + .with_file_compression_type(df_compression) + .build(); + + let store = Arc::new(OpendalStore::new(store.clone())); + let file_opener = file_source + .with_projection(&config) + .create_file_opener(store, &config, 0); + let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new()) + .context(error::BuildFileStreamSnafu)?; + + Ok(Box::pin(stream)) +} diff --git a/src/common/datasource/src/file_format/csv.rs b/src/common/datasource/src/file_format/csv.rs index efffce8d12..392f3ff49d 100644 --- a/src/common/datasource/src/file_format/csv.rs +++ b/src/common/datasource/src/file_format/csv.rs @@ -157,19 +157,27 @@ pub async fn stream_to_csv( concurrency: usize, format: &CsvFormat, ) -> Result { - stream_to_file(stream, store, path, threshold, concurrency, |buffer| { - let mut builder = WriterBuilder::new(); - if let Some(timestamp_format) = &format.timestamp_format { - builder = builder.with_timestamp_format(timestamp_format.to_owned()) - } - if let Some(date_format) = &format.date_format { - builder = builder.with_date_format(date_format.to_owned()) - } - if let Some(time_format) = &format.time_format { - builder = builder.with_time_format(time_format.to_owned()) - } - builder.build(buffer) - }) + stream_to_file( + stream, + store, + path, + threshold, + concurrency, + format.compression_type, + |buffer| { + let mut builder = WriterBuilder::new(); + if let Some(timestamp_format) = &format.timestamp_format { + builder = builder.with_timestamp_format(timestamp_format.to_owned()) + } + if let Some(date_format) = &format.date_format { + builder = builder.with_date_format(date_format.to_owned()) + } + if let Some(time_format) = &format.time_format { + builder = builder.with_time_format(time_format.to_owned()) + } + builder.build(buffer) + }, + ) .await } @@ -181,13 +189,21 @@ impl DfRecordBatchEncoder for csv::Writer { #[cfg(test)] mod tests { + use std::sync::Arc; + use common_recordbatch::adapter::DfRecordBatchStreamAdapter; + use common_recordbatch::{RecordBatch, RecordBatches}; use common_test_util::find_workspace_path; + use datafusion::datasource::physical_plan::{CsvSource, FileSource}; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; + use datatypes::vectors::{Float64Vector, StringVector, UInt32Vector, VectorRef}; + use futures::TryStreamExt; use super::*; use crate::file_format::{ FORMAT_COMPRESSION_TYPE, FORMAT_DELIMITER, FORMAT_HAS_HEADER, - FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, + FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream, }; use crate::test_util::{format_schema, test_store}; @@ -297,4 +313,166 @@ mod tests { } ); } + + #[tokio::test] + async fn test_compressed_csv() { + // Create test data + let column_schemas = vec![ + ColumnSchema::new("id", ConcreteDataType::uint32_datatype(), false), + ColumnSchema::new("name", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("value", ConcreteDataType::float64_datatype(), false), + ]; + let schema = Arc::new(Schema::new(column_schemas)); + + 
// Create multiple record batches with different data + let batch1_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3])), + Arc::new(StringVector::from(vec!["Alice", "Bob", "Charlie"])), + Arc::new(Float64Vector::from_slice(vec![10.5, 20.3, 30.7])), + ]; + let batch1 = RecordBatch::new(schema.clone(), batch1_columns).unwrap(); + + let batch2_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![4, 5, 6])), + Arc::new(StringVector::from(vec!["David", "Eva", "Frank"])), + Arc::new(Float64Vector::from_slice(vec![40.1, 50.2, 60.3])), + ]; + let batch2 = RecordBatch::new(schema.clone(), batch2_columns).unwrap(); + + let batch3_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![7, 8, 9])), + Arc::new(StringVector::from(vec!["Grace", "Henry", "Ivy"])), + Arc::new(Float64Vector::from_slice(vec![70.4, 80.5, 90.6])), + ]; + let batch3 = RecordBatch::new(schema.clone(), batch3_columns).unwrap(); + + // Combine all batches into a RecordBatches collection + let recordbatches = RecordBatches::try_new(schema, vec![batch1, batch2, batch3]).unwrap(); + + // Test with different compression types + let compression_types = vec![ + CompressionType::Gzip, + CompressionType::Bzip2, + CompressionType::Xz, + CompressionType::Zstd, + ]; + + // Create a temporary file path + let temp_dir = common_test_util::temp_dir::create_temp_dir("test_compressed_csv"); + for compression_type in compression_types { + let format = CsvFormat { + compression_type, + ..CsvFormat::default() + }; + + // Use correct format without Debug formatter + let compressed_file_name = + format!("test_compressed_csv.{}", compression_type.file_extension()); + let compressed_file_path = temp_dir.path().join(&compressed_file_name); + let compressed_file_path_str = compressed_file_path.to_str().unwrap(); + + // Create a simple file store for testing + let store = test_store("/"); + + // Export CSV with compression + let rows = stream_to_csv( + Box::pin(DfRecordBatchStreamAdapter::new(recordbatches.as_stream())), + store, + compressed_file_path_str, + 1024, + 1, + &format, + ) + .await + .unwrap(); + + assert_eq!(rows, 9); + + // Verify compressed file was created and has content + assert!(compressed_file_path.exists()); + let file_size = std::fs::metadata(&compressed_file_path).unwrap().len(); + assert!(file_size > 0); + + // Verify the file is actually compressed + let file_content = std::fs::read(&compressed_file_path).unwrap(); + // Compressed files should not start with CSV header + // They should have compression magic bytes + match compression_type { + CompressionType::Gzip => { + // Gzip magic bytes: 0x1f 0x8b + assert_eq!(file_content[0], 0x1f, "Gzip file should start with 0x1f"); + assert_eq!( + file_content[1], 0x8b, + "Gzip file should have 0x8b as second byte" + ); + } + CompressionType::Bzip2 => { + // Bzip2 magic bytes: 'BZ' + assert_eq!(file_content[0], b'B', "Bzip2 file should start with 'B'"); + assert_eq!( + file_content[1], b'Z', + "Bzip2 file should have 'Z' as second byte" + ); + } + CompressionType::Xz => { + // XZ magic bytes: 0xFD '7zXZ' + assert_eq!(file_content[0], 0xFD, "XZ file should start with 0xFD"); + } + CompressionType::Zstd => { + // Zstd magic bytes: 0x28 0xB5 0x2F 0xFD + assert_eq!(file_content[0], 0x28, "Zstd file should start with 0x28"); + assert_eq!( + file_content[1], 0xB5, + "Zstd file should have 0xB5 as second byte" + ); + } + _ => {} + } + + // Verify the compressed file can be decompressed and content matches original data + let store = test_store("/"); + let schema = 
Arc::new( + CsvFormat { + compression_type, + ..Default::default() + } + .infer_schema(&store, compressed_file_path_str) + .await + .unwrap(), + ); + let csv_source = CsvSource::new(true, b',', b'"') + .with_schema(schema.clone()) + .with_batch_size(8192); + + let stream = file_to_stream( + &store, + compressed_file_path_str, + schema.clone(), + csv_source.clone(), + None, + compression_type, + ) + .await + .unwrap(); + + let batches = stream.try_collect::>().await.unwrap(); + let pretty_print = arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string(); + let expected = r#"+----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 1 | Alice | 10.5 | +| 2 | Bob | 20.3 | +| 3 | Charlie | 30.7 | +| 4 | David | 40.1 | +| 5 | Eva | 50.2 | +| 6 | Frank | 60.3 | +| 7 | Grace | 70.4 | +| 8 | Henry | 80.5 | +| 9 | Ivy | 90.6 | ++----+---------+-------+"#; + assert_eq!(expected, pretty_print); + } + } } diff --git a/src/common/datasource/src/file_format/json.rs b/src/common/datasource/src/file_format/json.rs index c234eec846..cafcd71372 100644 --- a/src/common/datasource/src/file_format/json.rs +++ b/src/common/datasource/src/file_format/json.rs @@ -115,10 +115,17 @@ pub async fn stream_to_json( path: &str, threshold: usize, concurrency: usize, + format: &JsonFormat, ) -> Result { - stream_to_file(stream, store, path, threshold, concurrency, |buffer| { - json::LineDelimitedWriter::new(buffer) - }) + stream_to_file( + stream, + store, + path, + threshold, + concurrency, + format.compression_type, + json::LineDelimitedWriter::new, + ) .await } @@ -130,10 +137,21 @@ impl DfRecordBatchEncoder for json::Writer { #[cfg(test)] mod tests { + use std::sync::Arc; + + use common_recordbatch::adapter::DfRecordBatchStreamAdapter; + use common_recordbatch::{RecordBatch, RecordBatches}; use common_test_util::find_workspace_path; + use datafusion::datasource::physical_plan::{FileSource, JsonSource}; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ColumnSchema, Schema}; + use datatypes::vectors::{Float64Vector, StringVector, UInt32Vector, VectorRef}; + use futures::TryStreamExt; use super::*; - use crate::file_format::{FORMAT_COMPRESSION_TYPE, FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat}; + use crate::file_format::{ + FORMAT_COMPRESSION_TYPE, FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream, + }; use crate::test_util::{format_schema, test_store}; fn test_data_root() -> String { @@ -203,4 +221,165 @@ mod tests { } ); } + + #[tokio::test] + async fn test_compressed_json() { + // Create test data + let column_schemas = vec![ + ColumnSchema::new("id", ConcreteDataType::uint32_datatype(), false), + ColumnSchema::new("name", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("value", ConcreteDataType::float64_datatype(), false), + ]; + let schema = Arc::new(Schema::new(column_schemas)); + + // Create multiple record batches with different data + let batch1_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![1, 2, 3])), + Arc::new(StringVector::from(vec!["Alice", "Bob", "Charlie"])), + Arc::new(Float64Vector::from_slice(vec![10.5, 20.3, 30.7])), + ]; + let batch1 = RecordBatch::new(schema.clone(), batch1_columns).unwrap(); + + let batch2_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![4, 5, 6])), + Arc::new(StringVector::from(vec!["David", "Eva", "Frank"])), + Arc::new(Float64Vector::from_slice(vec![40.1, 50.2, 60.3])), + ]; + let batch2 = RecordBatch::new(schema.clone(), batch2_columns).unwrap(); + + let 
batch3_columns: Vec = vec![ + Arc::new(UInt32Vector::from_slice(vec![7, 8, 9])), + Arc::new(StringVector::from(vec!["Grace", "Henry", "Ivy"])), + Arc::new(Float64Vector::from_slice(vec![70.4, 80.5, 90.6])), + ]; + let batch3 = RecordBatch::new(schema.clone(), batch3_columns).unwrap(); + + // Combine all batches into a RecordBatches collection + let recordbatches = RecordBatches::try_new(schema, vec![batch1, batch2, batch3]).unwrap(); + + // Test with different compression types + let compression_types = vec![ + CompressionType::Gzip, + CompressionType::Bzip2, + CompressionType::Xz, + CompressionType::Zstd, + ]; + + // Create a temporary file path + let temp_dir = common_test_util::temp_dir::create_temp_dir("test_compressed_json"); + for compression_type in compression_types { + let format = JsonFormat { + compression_type, + ..JsonFormat::default() + }; + + let compressed_file_name = + format!("test_compressed_json.{}", compression_type.file_extension()); + let compressed_file_path = temp_dir.path().join(&compressed_file_name); + let compressed_file_path_str = compressed_file_path.to_str().unwrap(); + + // Create a simple file store for testing + let store = test_store("/"); + + // Export JSON with compression + let rows = stream_to_json( + Box::pin(DfRecordBatchStreamAdapter::new(recordbatches.as_stream())), + store, + compressed_file_path_str, + 1024, + 1, + &format, + ) + .await + .unwrap(); + + assert_eq!(rows, 9); + + // Verify compressed file was created and has content + assert!(compressed_file_path.exists()); + let file_size = std::fs::metadata(&compressed_file_path).unwrap().len(); + assert!(file_size > 0); + + // Verify the file is actually compressed + let file_content = std::fs::read(&compressed_file_path).unwrap(); + // Compressed files should not start with '{' (JSON character) + // They should have compression magic bytes + match compression_type { + CompressionType::Gzip => { + // Gzip magic bytes: 0x1f 0x8b + assert_eq!(file_content[0], 0x1f, "Gzip file should start with 0x1f"); + assert_eq!( + file_content[1], 0x8b, + "Gzip file should have 0x8b as second byte" + ); + } + CompressionType::Bzip2 => { + // Bzip2 magic bytes: 'BZ' + assert_eq!(file_content[0], b'B', "Bzip2 file should start with 'B'"); + assert_eq!( + file_content[1], b'Z', + "Bzip2 file should have 'Z' as second byte" + ); + } + CompressionType::Xz => { + // XZ magic bytes: 0xFD '7zXZ' + assert_eq!(file_content[0], 0xFD, "XZ file should start with 0xFD"); + } + CompressionType::Zstd => { + // Zstd magic bytes: 0x28 0xB5 0x2F 0xFD + assert_eq!(file_content[0], 0x28, "Zstd file should start with 0x28"); + assert_eq!( + file_content[1], 0xB5, + "Zstd file should have 0xB5 as second byte" + ); + } + _ => {} + } + + // Verify the compressed file can be decompressed and content matches original data + let store = test_store("/"); + let schema = Arc::new( + JsonFormat { + compression_type, + ..Default::default() + } + .infer_schema(&store, compressed_file_path_str) + .await + .unwrap(), + ); + let json_source = JsonSource::new() + .with_schema(schema.clone()) + .with_batch_size(8192); + + let stream = file_to_stream( + &store, + compressed_file_path_str, + schema.clone(), + json_source.clone(), + None, + compression_type, + ) + .await + .unwrap(); + + let batches = stream.try_collect::>().await.unwrap(); + let pretty_print = arrow::util::pretty::pretty_format_batches(&batches) + .unwrap() + .to_string(); + let expected = r#"+----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 1 | Alice | 
10.5 | +| 2 | Bob | 20.3 | +| 3 | Charlie | 30.7 | +| 4 | David | 40.1 | +| 5 | Eva | 50.2 | +| 6 | Frank | 60.3 | +| 7 | Grace | 70.4 | +| 8 | Henry | 80.5 | +| 9 | Ivy | 90.6 | ++----+---------+-------+"#; + assert_eq!(expected, pretty_print); + } + } } diff --git a/src/common/datasource/src/lib.rs b/src/common/datasource/src/lib.rs index 72e94c7f36..91663ce22c 100644 --- a/src/common/datasource/src/lib.rs +++ b/src/common/datasource/src/lib.rs @@ -16,6 +16,7 @@ #![feature(type_alias_impl_trait)] pub mod buffered_writer; +pub mod compressed_writer; pub mod compression; pub mod error; pub mod file_format; diff --git a/src/common/datasource/src/test_util.rs b/src/common/datasource/src/test_util.rs index f3f813be34..244df3b7a5 100644 --- a/src/common/datasource/src/test_util.rs +++ b/src/common/datasource/src/test_util.rs @@ -28,7 +28,7 @@ use object_store::ObjectStore; use object_store::services::Fs; use crate::file_format::csv::{CsvFormat, stream_to_csv}; -use crate::file_format::json::stream_to_json; +use crate::file_format::json::{JsonFormat, stream_to_json}; use crate::test_util; pub const TEST_BATCH_SIZE: usize = 100; @@ -122,13 +122,16 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi let output_path = format!("{}/{}", dir.path().display(), "output"); + let json_format = JsonFormat::default(); + assert!( stream_to_json( Box::pin(stream), tmp_store.clone(), &output_path, threshold(size), - 8 + 8, + &json_format, ) .await .is_ok() diff --git a/src/common/error/src/lib.rs b/src/common/error/src/lib.rs index 18e6a0c9ae..9b6facda2c 100644 --- a/src/common/error/src/lib.rs +++ b/src/common/error/src/lib.rs @@ -21,6 +21,8 @@ pub mod status_code; use http::{HeaderMap, HeaderValue}; pub use snafu; +use crate::status_code::StatusCode; + // HACK - these headers are here for shared in gRPC services. For common HTTP headers, // please define in `src/servers/src/http/header.rs`. pub const GREPTIME_DB_HEADER_ERROR_CODE: &str = "x-greptime-err-code"; @@ -46,6 +48,29 @@ pub fn from_err_code_msg_to_header(code: u32, msg: &str) -> HeaderMap { header } +/// Extract [StatusCode] and error message from [HeaderMap], if any. +/// +/// Note that if the [StatusCode] is illegal, for example, a random number that is not pre-defined +/// as a [StatusCode], the result is still `None`. +pub fn from_header_to_err_code_msg(headers: &HeaderMap) -> Option<(StatusCode, &str)> { + let code = headers + .get(GREPTIME_DB_HEADER_ERROR_CODE) + .and_then(|value| { + value + .to_str() + .ok() + .and_then(|x| x.parse::().ok()) + .and_then(StatusCode::from_u32) + }); + let msg = headers + .get(GREPTIME_DB_HEADER_ERROR_MSG) + .and_then(|x| x.to_str().ok()); + match (code, msg) { + (Some(code), Some(msg)) => Some((code, msg)), + _ => None, + } +} + /// Returns the external root cause of the source error (exclude the current error). pub fn root_source(err: &dyn std::error::Error) -> Option<&dyn std::error::Error> { // There are some divergence about the behavior of the `sources()` API diff --git a/src/common/error/src/status_code.rs b/src/common/error/src/status_code.rs index 08f33af609..4dc5a0398e 100644 --- a/src/common/error/src/status_code.rs +++ b/src/common/error/src/status_code.rs @@ -42,6 +42,8 @@ pub enum StatusCode { External = 1007, /// The request is deadline exceeded (typically server-side). DeadlineExceeded = 1008, + /// Service got suspended for various reason. For example, resources exceed limit. 
+ Suspended = 1009, // ====== End of common status code ================ // ====== Begin of SQL related status code ========= @@ -175,7 +177,8 @@ impl StatusCode { | StatusCode::AccessDenied | StatusCode::PermissionDenied | StatusCode::RequestOutdated - | StatusCode::External => false, + | StatusCode::External + | StatusCode::Suspended => false, } } @@ -223,7 +226,8 @@ impl StatusCode { | StatusCode::InvalidAuthHeader | StatusCode::AccessDenied | StatusCode::PermissionDenied - | StatusCode::RequestOutdated => false, + | StatusCode::RequestOutdated + | StatusCode::Suspended => false, } } @@ -347,7 +351,8 @@ pub fn status_to_tonic_code(status_code: StatusCode) -> Code { | StatusCode::RegionNotReady => Code::Unavailable, StatusCode::RuntimeResourcesExhausted | StatusCode::RateLimited - | StatusCode::RegionBusy => Code::ResourceExhausted, + | StatusCode::RegionBusy + | StatusCode::Suspended => Code::ResourceExhausted, StatusCode::UnsupportedPasswordType | StatusCode::UserPasswordMismatch | StatusCode::AuthHeaderNotFound diff --git a/src/common/event-recorder/src/recorder.rs b/src/common/event-recorder/src/recorder.rs index ddf0bcdae0..ace7702991 100644 --- a/src/common/event-recorder/src/recorder.rs +++ b/src/common/event-recorder/src/recorder.rs @@ -97,9 +97,9 @@ pub trait Event: Send + Sync + Debug { vec![] } - /// Add the extra row to the event with the default row. - fn extra_row(&self) -> Result { - Ok(Row { values: vec![] }) + /// Add the extra rows to the event with the default row. + fn extra_rows(&self) -> Result> { + Ok(vec![Row { values: vec![] }]) } /// Returns the event as any type. @@ -159,15 +159,17 @@ pub fn build_row_inserts_request(events: &[&Box]) -> Result = Vec::with_capacity(events.len()); for event in events { - let extra_row = event.extra_row()?; - let mut values = Vec::with_capacity(3 + extra_row.values.len()); - values.extend([ - ValueData::StringValue(event.event_type().to_string()).into(), - ValueData::BinaryValue(event.json_payload()?.into_bytes()).into(), - ValueData::TimestampNanosecondValue(event.timestamp().value()).into(), - ]); - values.extend(extra_row.values); - rows.push(Row { values }); + let extra_rows = event.extra_rows()?; + for extra_row in extra_rows { + let mut values = Vec::with_capacity(3 + extra_row.values.len()); + values.extend([ + ValueData::StringValue(event.event_type().to_string()).into(), + ValueData::BinaryValue(event.json_payload()?.into_bytes()).into(), + ValueData::TimestampNanosecondValue(event.timestamp().value()).into(), + ]); + values.extend(extra_row.values); + rows.push(Row { values }); + } } Ok(RowInsertRequests { diff --git a/src/common/frontend/src/slow_query_event.rs b/src/common/frontend/src/slow_query_event.rs index 0e65443acb..32ca457da4 100644 --- a/src/common/frontend/src/slow_query_event.rs +++ b/src/common/frontend/src/slow_query_event.rs @@ -107,8 +107,8 @@ impl Event for SlowQueryEvent { ] } - fn extra_row(&self) -> Result { - Ok(Row { + fn extra_rows(&self) -> Result> { + Ok(vec![Row { values: vec![ ValueData::U64Value(self.cost).into(), ValueData::U64Value(self.threshold).into(), @@ -119,7 +119,7 @@ impl Event for SlowQueryEvent { ValueData::TimestampMillisecondValue(self.promql_start.unwrap_or(0)).into(), ValueData::TimestampMillisecondValue(self.promql_end.unwrap_or(0)).into(), ], - }) + }]) } fn json_payload(&self) -> Result { diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index 1d272f5d04..557fbac6e0 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml 
@@ -39,7 +39,7 @@ datafusion-functions-aggregate-common.workspace = true datafusion-pg-catalog.workspace = true datafusion-physical-expr.workspace = true datatypes.workspace = true -derive_more = { version = "1", default-features = false, features = ["display"] } +derive_more.workspace = true geo = { version = "0.29", optional = true } geo-types = { version = "0.7", optional = true } geohash = { version = "0.13", optional = true } @@ -47,6 +47,7 @@ h3o = { version = "0.6", optional = true } hyperloglogplus = "0.4" jsonb.workspace = true memchr = "2.7" +mito-codec.workspace = true nalgebra.workspace = true num = "0.4" num-traits = "0.2" diff --git a/src/common/function/src/admin.rs b/src/common/function/src/admin.rs index 11270c3282..e7fd186b86 100644 --- a/src/common/function/src/admin.rs +++ b/src/common/function/src/admin.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod build_index_table; mod flush_compact_region; mod flush_compact_table; mod migrate_region; @@ -26,6 +27,7 @@ use reconcile_catalog::ReconcileCatalogFunction; use reconcile_database::ReconcileDatabaseFunction; use reconcile_table::ReconcileTableFunction; +use crate::admin::build_index_table::BuildIndexFunction; use crate::flush_flow::FlushFlowFunction; use crate::function_registry::FunctionRegistry; @@ -40,6 +42,7 @@ impl AdminFunction { registry.register(CompactRegionFunction::factory()); registry.register(FlushTableFunction::factory()); registry.register(CompactTableFunction::factory()); + registry.register(BuildIndexFunction::factory()); registry.register(FlushFlowFunction::factory()); registry.register(ReconcileCatalogFunction::factory()); registry.register(ReconcileDatabaseFunction::factory()); diff --git a/src/common/function/src/admin/build_index_table.rs b/src/common/function/src/admin/build_index_table.rs new file mode 100644 index 0000000000..155f198c79 --- /dev/null +++ b/src/common/function/src/admin/build_index_table.rs @@ -0,0 +1,80 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
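+
+//! Admin function that triggers an index build for a table.
+//!
+//! It is registered as `build_index` and takes a single table name argument, so,
+//! assuming it is invoked like the other admin functions in this crate, a usage
+//! sketch would be `ADMIN build_index('my_table')` (the table name is a placeholder).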
+ +use arrow::datatypes::DataType as ArrowDataType; +use common_error::ext::BoxedError; +use common_macro::admin_fn; +use common_query::error::{ + InvalidFuncArgsSnafu, MissingTableMutationHandlerSnafu, Result, TableMutationSnafu, + UnsupportedInputDataTypeSnafu, +}; +use datafusion_expr::{Signature, Volatility}; +use datatypes::prelude::*; +use session::context::QueryContextRef; +use session::table_name::table_name_to_full_name; +use snafu::{ResultExt, ensure}; +use table::requests::BuildIndexTableRequest; + +use crate::handlers::TableMutationHandlerRef; + +#[admin_fn( + name = BuildIndexFunction, + display_name = build_index, + sig_fn = build_index_signature, + ret = uint64 +)] +pub(crate) async fn build_index( + table_mutation_handler: &TableMutationHandlerRef, + query_ctx: &QueryContextRef, + params: &[ValueRef<'_>], +) -> Result { + ensure!( + params.len() == 1, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect 1, have: {}", + params.len() + ), + } + ); + + let ValueRef::String(table_name) = params[0] else { + return UnsupportedInputDataTypeSnafu { + function: "build_index", + datatypes: params.iter().map(|v| v.data_type()).collect::>(), + } + .fail(); + }; + + let (catalog_name, schema_name, table_name) = table_name_to_full_name(table_name, query_ctx) + .map_err(BoxedError::new) + .context(TableMutationSnafu)?; + + let affected_rows = table_mutation_handler + .build_index( + BuildIndexTableRequest { + catalog_name, + schema_name, + table_name, + }, + query_ctx.clone(), + ) + .await?; + + Ok(Value::from(affected_rows as u64)) +} + +fn build_index_signature() -> Signature { + Signature::uniform(1, vec![ArrowDataType::Utf8], Volatility::Immutable) +} diff --git a/src/common/function/src/function_registry.rs b/src/common/function/src/function_registry.rs index e51dcf4cb8..6208c9569c 100644 --- a/src/common/function/src/function_registry.rs +++ b/src/common/function/src/function_registry.rs @@ -34,6 +34,7 @@ use crate::scalars::json::JsonFunction; use crate::scalars::matches::MatchesFunction; use crate::scalars::matches_term::MatchesTermFunction; use crate::scalars::math::MathFunction; +use crate::scalars::primary_key::DecodePrimaryKeyFunction; use crate::scalars::string::register_string_functions; use crate::scalars::timestamp::TimestampFunction; use crate::scalars::uddsketch_calc::UddSketchCalcFunction; @@ -143,6 +144,7 @@ pub static FUNCTION_REGISTRY: LazyLock> = LazyLock::new(|| ExpressionFunction::register(&function_registry); UddSketchCalcFunction::register(&function_registry); HllCalcFunction::register(&function_registry); + DecodePrimaryKeyFunction::register(&function_registry); // Full text search function MatchesFunction::register(&function_registry); diff --git a/src/common/function/src/handlers.rs b/src/common/function/src/handlers.rs index e7ab67e312..0e6060e90c 100644 --- a/src/common/function/src/handlers.rs +++ b/src/common/function/src/handlers.rs @@ -25,7 +25,9 @@ use common_query::Output; use common_query::error::Result; use session::context::QueryContextRef; use store_api::storage::RegionId; -use table::requests::{CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest}; +use table::requests::{ + BuildIndexTableRequest, CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest, +}; /// A trait for handling table mutations in `QueryEngine`. #[async_trait] @@ -47,6 +49,13 @@ pub trait TableMutationHandler: Send + Sync { ctx: QueryContextRef, ) -> Result; + /// Trigger an index build task for the table. 
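+    /// (Surfaced through the `build_index` admin function added in this change;
+    /// see `src/common/function/src/admin/build_index_table.rs` above.)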
+ async fn build_index( + &self, + request: BuildIndexTableRequest, + ctx: QueryContextRef, + ) -> Result; + /// Trigger a flush task for a table region. async fn flush_region(&self, region_id: RegionId, ctx: QueryContextRef) -> Result; diff --git a/src/common/function/src/scalars.rs b/src/common/function/src/scalars.rs index 9a8c9cc3a0..6cf138b69a 100644 --- a/src/common/function/src/scalars.rs +++ b/src/common/function/src/scalars.rs @@ -20,6 +20,7 @@ pub mod json; pub mod matches; pub mod matches_term; pub mod math; +pub mod primary_key; pub(crate) mod string; pub mod vector; diff --git a/src/common/function/src/scalars/expression.rs b/src/common/function/src/scalars/expression.rs index 75920801db..63ed40fc8f 100644 --- a/src/common/function/src/scalars/expression.rs +++ b/src/common/function/src/scalars/expression.rs @@ -14,6 +14,7 @@ mod binary; mod ctx; +mod if_func; mod is_null; mod unary; @@ -22,6 +23,7 @@ pub use ctx::EvalContext; pub use unary::scalar_unary_op; use crate::function_registry::FunctionRegistry; +use crate::scalars::expression::if_func::IfFunction; use crate::scalars::expression::is_null::IsNullFunction; pub(crate) struct ExpressionFunction; @@ -29,5 +31,6 @@ pub(crate) struct ExpressionFunction; impl ExpressionFunction { pub fn register(registry: &FunctionRegistry) { registry.register_scalar(IsNullFunction::default()); + registry.register_scalar(IfFunction::default()); } } diff --git a/src/common/function/src/scalars/expression/if_func.rs b/src/common/function/src/scalars/expression/if_func.rs new file mode 100644 index 0000000000..92108cd307 --- /dev/null +++ b/src/common/function/src/scalars/expression/if_func.rs @@ -0,0 +1,404 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; +use std::fmt::Display; + +use arrow::array::ArrowNativeTypeOp; +use arrow::datatypes::ArrowPrimitiveType; +use datafusion::arrow::array::{Array, ArrayRef, AsArray, BooleanArray, PrimitiveArray}; +use datafusion::arrow::compute::kernels::zip::zip; +use datafusion::arrow::datatypes::DataType; +use datafusion_common::DataFusionError; +use datafusion_expr::type_coercion::binary::comparison_coercion; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; + +use crate::function::Function; + +const NAME: &str = "if"; + +/// MySQL-compatible IF function: IF(condition, true_value, false_value) +/// +/// Returns true_value if condition is TRUE (not NULL and not 0), +/// otherwise returns false_value. 
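+///
+/// For example (illustrative, mirroring the unit tests below): `IF(1, 'yes', 'no')`
+/// evaluates to `'yes'`, while `IF(0, 'yes', 'no')` and `IF(NULL, 'yes', 'no')`
+/// evaluate to `'no'`.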
+/// +/// MySQL truthy rules: +/// - NULL -> false +/// - 0 (numeric zero) -> false +/// - Any non-zero numeric -> true +/// - Boolean true/false -> use directly +#[derive(Clone, Debug)] +pub struct IfFunction { + signature: Signature, +} + +impl Default for IfFunction { + fn default() -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +impl Display for IfFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", NAME.to_ascii_uppercase()) + } +} + +impl Function for IfFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, input_types: &[DataType]) -> datafusion_common::Result { + // Return the common type of true_value and false_value (args[1] and args[2]) + if input_types.len() < 3 { + return Err(DataFusionError::Plan(format!( + "{} requires 3 arguments, got {}", + NAME, + input_types.len() + ))); + } + let true_type = &input_types[1]; + let false_type = &input_types[2]; + + // Use comparison_coercion to find common type + comparison_coercion(true_type, false_type).ok_or_else(|| { + DataFusionError::Plan(format!( + "Cannot find common type for IF function between {:?} and {:?}", + true_type, false_type + )) + }) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + if args.args.len() != 3 { + return Err(DataFusionError::Plan(format!( + "{} requires exactly 3 arguments, got {}", + NAME, + args.args.len() + ))); + } + + let condition = &args.args[0]; + let true_value = &args.args[1]; + let false_value = &args.args[2]; + + // Convert condition to boolean array using MySQL truthy rules + let bool_array = to_boolean_array(condition, args.number_rows)?; + + // Convert true and false values to arrays + let true_array = true_value.to_array(args.number_rows)?; + let false_array = false_value.to_array(args.number_rows)?; + + // Use zip to select values based on condition + // zip expects &dyn Datum, and ArrayRef (Arc) implements Datum + let result = zip(&bool_array, &true_array, &false_array)?; + Ok(ColumnarValue::Array(result)) + } +} + +/// Convert a ColumnarValue to a BooleanArray using MySQL truthy rules: +/// - NULL -> false +/// - 0 (any numeric zero) -> false +/// - Non-zero numeric -> true +/// - Boolean -> use directly +fn to_boolean_array( + value: &ColumnarValue, + num_rows: usize, +) -> datafusion_common::Result { + let array = value.to_array(num_rows)?; + array_to_bool(array) +} + +/// Convert an integer PrimitiveArray to BooleanArray using MySQL truthy rules: +/// NULL -> false, 0 -> false, non-zero -> true +fn int_array_to_bool(array: &PrimitiveArray) -> BooleanArray +where + T: ArrowPrimitiveType, + T::Native: ArrowNativeTypeOp, +{ + BooleanArray::from_iter( + array + .iter() + .map(|opt| Some(opt.is_some_and(|v| !v.is_zero()))), + ) +} + +/// Convert a float PrimitiveArray to BooleanArray using MySQL truthy rules: +/// NULL -> false, 0 (including -0.0) -> false, NaN -> true, other non-zero -> true +fn float_array_to_bool(array: &PrimitiveArray) -> BooleanArray +where + T: ArrowPrimitiveType, + T::Native: ArrowNativeTypeOp + num_traits::Float, +{ + use num_traits::Float; + BooleanArray::from_iter( + array + .iter() + .map(|opt| Some(opt.is_some_and(|v| v.is_nan() || !v.is_zero()))), + ) +} + +/// Convert an Array to BooleanArray using MySQL truthy rules +fn array_to_bool(array: ArrayRef) -> datafusion_common::Result { + use arrow::datatypes::*; + + match array.data_type() { + 
DataType::Boolean => { + let bool_array = array.as_boolean(); + Ok(BooleanArray::from_iter( + bool_array.iter().map(|opt| Some(opt.unwrap_or(false))), + )) + } + DataType::Int8 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::Int16 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::Int32 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::Int64 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt8 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt16 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt32 => Ok(int_array_to_bool(array.as_primitive::())), + DataType::UInt64 => Ok(int_array_to_bool(array.as_primitive::())), + // Float16 needs special handling since half::f16 doesn't implement num_traits::Float + DataType::Float16 => { + let typed_array = array.as_primitive::(); + Ok(BooleanArray::from_iter(typed_array.iter().map(|opt| { + Some(opt.is_some_and(|v| { + let f = v.to_f32(); + f.is_nan() || !f.is_zero() + })) + }))) + } + DataType::Float32 => Ok(float_array_to_bool(array.as_primitive::())), + DataType::Float64 => Ok(float_array_to_bool(array.as_primitive::())), + // Null type is always false. + // Note: NullArray::is_null() returns false (physical null), so we must handle it explicitly. + // See: https://github.com/apache/arrow-rs/issues/4840 + DataType::Null => Ok(BooleanArray::from(vec![false; array.len()])), + // For other types, treat non-null as true + _ => { + let len = array.len(); + Ok(BooleanArray::from_iter( + (0..len).map(|i| Some(!array.is_null(i))), + )) + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::Field; + use datafusion_common::ScalarValue; + use datafusion_common::arrow::array::{AsArray, Int32Array, StringArray}; + + use super::*; + + #[test] + fn test_if_function_basic() { + let if_func = IfFunction::default(); + assert_eq!("if", if_func.name()); + + // Test IF(true, 'yes', 'no') -> 'yes' + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Boolean(Some(true))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "yes"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_false() { + let if_func = IfFunction::default(); + + // Test IF(false, 'yes', 'no') -> 'no' + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_null_is_false() { + let if_func = IfFunction::default(); + + // Test IF(NULL, 'yes', 'no') -> 'no' (NULL is treated as false) + // Using Boolean(None) - typed null + let result = 
if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Boolean(None)), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + + // Test IF(NULL, 'yes', 'no') -> 'no' using ScalarValue::Null (untyped null from SQL NULL literal) + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Null), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_numeric_truthy() { + let if_func = IfFunction::default(); + + // Test IF(1, 'yes', 'no') -> 'yes' (non-zero is true) + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "yes"); + } else { + panic!("Expected Array result"); + } + + // Test IF(0, 'yes', 'no') -> 'no' (zero is false) + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Int32(Some(0))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("yes".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("no".to_string()))), + ], + arg_fields: vec![], + number_rows: 1, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = arr.as_string::(); + assert_eq!(str_arr.value(0), "no"); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_if_function_with_arrays() { + let if_func = IfFunction::default(); + + // Test with array condition + let condition = Int32Array::from(vec![Some(1), Some(0), None, Some(5)]); + let true_val = StringArray::from(vec!["yes", "yes", "yes", "yes"]); + let false_val = StringArray::from(vec!["no", "no", "no", "no"]); + + let result = if_func + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(condition)), + ColumnarValue::Array(Arc::new(true_val)), + ColumnarValue::Array(Arc::new(false_val)), + ], + arg_fields: vec![], + number_rows: 4, + return_field: Arc::new(Field::new("", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + }) + .unwrap(); + + if let ColumnarValue::Array(arr) = result { + let str_arr = 
arr.as_string::(); + assert_eq!(str_arr.value(0), "yes"); // 1 is true + assert_eq!(str_arr.value(1), "no"); // 0 is false + assert_eq!(str_arr.value(2), "no"); // NULL is false + assert_eq!(str_arr.value(3), "yes"); // 5 is true + } else { + panic!("Expected Array result"); + } + } +} diff --git a/src/common/function/src/scalars/geo/relation.rs b/src/common/function/src/scalars/geo/relation.rs index 4567e56bb5..ccbbe53000 100644 --- a/src/common/function/src/scalars/geo/relation.rs +++ b/src/common/function/src/scalars/geo/relation.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::Display; use std::sync::Arc; use datafusion_common::arrow::array::{Array, AsArray, BooleanBuilder}; diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index 9b022d71da..f84937fa0f 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -19,7 +19,7 @@ mod json_path_match; mod json_to_string; mod parse_json; -use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString}; +use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetObject, JsonGetString}; use json_is::{ JsonIsArray, JsonIsBool, JsonIsFloat, JsonIsInt, JsonIsNull, JsonIsObject, JsonIsString, }; @@ -39,6 +39,7 @@ impl JsonFunction { registry.register_scalar(JsonGetFloat::default()); registry.register_scalar(JsonGetString::default()); registry.register_scalar(JsonGetBool::default()); + registry.register_scalar(JsonGetObject::default()); registry.register_scalar(JsonIsNull::default()); registry.register_scalar(JsonIsInt::default()); diff --git a/src/common/function/src/scalars/json/json_get.rs b/src/common/function/src/scalars/json/json_get.rs index 51dd2fc9b7..92ea9cf990 100644 --- a/src/common/function/src/scalars/json/json_get.rs +++ b/src/common/function/src/scalars/json/json_get.rs @@ -16,10 +16,13 @@ use std::fmt::{self, Display}; use std::sync::Arc; use arrow::compute; +use datafusion_common::DataFusionError; use datafusion_common::arrow::array::{ - Array, AsArray, BooleanBuilder, Float64Builder, Int64Builder, StringViewBuilder, + Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder, + StringViewBuilder, }; use datafusion_common::arrow::datatypes::DataType; +use datafusion_expr::type_coercion::aggregates::STRINGS; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature}; use crate::function::{Function, extract_args}; @@ -212,13 +215,92 @@ impl Display for JsonGetString { } } +/// Get the object from JSON value by path. 
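+///
+/// Only JSON objects are returned; a scalar at the path or a missing path yields NULL
+/// (see the unit test below). An illustrative call:
+/// `json_get_object(parse_json('{"a": {"b": 1}}'), '$.a')` returns the object `{"b": 1}`.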
+pub(super) struct JsonGetObject { + signature: Signature, +} + +impl JsonGetObject { + const NAME: &'static str = "json_get_object"; +} + +impl Default for JsonGetObject { + fn default() -> Self { + Self { + signature: helper::one_of_sigs2( + vec![ + DataType::Binary, + DataType::LargeBinary, + DataType::BinaryView, + ], + STRINGS.to_vec(), + ), + } + } +} + +impl Function for JsonGetObject { + fn name(&self) -> &str { + Self::NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::BinaryView) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let [arg0, arg1] = extract_args(self.name(), &args)?; + let arg0 = compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); + let arg1 = compute::cast(&arg1, &DataType::Utf8View)?; + let paths = arg1.as_string_view(); + + let len = jsons.len(); + let mut builder = BinaryViewBuilder::with_capacity(len); + + for i in 0..len { + let json = jsons.is_valid(i).then(|| jsons.value(i)); + let path = paths.is_valid(i).then(|| paths.value(i)); + let result = if let (Some(json), Some(path)) = (json, path) { + let result = jsonb::jsonpath::parse_json_path(path.as_bytes()).and_then(|path| { + let mut data = Vec::new(); + let mut offset = Vec::new(); + jsonb::get_by_path(json, path, &mut data, &mut offset) + .map(|()| jsonb::is_object(&data).then_some(data)) + }); + result.map_err(|e| DataFusionError::Execution(e.to_string()))? + } else { + None + }; + builder.append_option(result); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +impl Display for JsonGetObject { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", Self::NAME.to_ascii_uppercase()) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; use arrow_schema::Field; - use datafusion_common::arrow::array::{BinaryArray, StringArray}; + use datafusion_common::ScalarValue; + use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray}; use datafusion_common::arrow::datatypes::{Float64Type, Int64Type}; + use datatypes::types::parse_string_to_jsonb; use super::*; @@ -425,4 +507,49 @@ mod tests { assert_eq!(*gt, result); } } + + #[test] + fn test_json_get_object() -> datafusion_common::Result<()> { + let udf = JsonGetObject::default(); + assert_eq!("json_get_object", udf.name()); + assert_eq!( + DataType::BinaryView, + udf.return_type(&[DataType::BinaryView, DataType::Utf8View])? 
+ ); + + let json_value = parse_string_to_jsonb(r#"{"a": {"b": {"c": {"d": 1}}}}"#).unwrap(); + let paths = vec!["$", "$.a", "$.a.b", "$.a.b.c", "$.a.b.c.d", "$.e", "$.a.e"]; + let number_rows = paths.len(); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Scalar(ScalarValue::Binary(Some(json_value))), + ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))), + ], + arg_fields: vec![], + number_rows, + return_field: Arc::new(Field::new("x", DataType::Binary, false)), + config_options: Arc::new(Default::default()), + }; + let result = udf + .invoke_with_args(args) + .and_then(|x| x.to_array(number_rows))?; + let result = result.as_binary_view(); + + let expected = &BinaryViewArray::from_iter( + vec![ + Some(r#"{"a": {"b": {"c": {"d": 1}}}}"#), + Some(r#"{"b": {"c": {"d": 1}}}"#), + Some(r#"{"c": {"d": 1}}"#), + Some(r#"{"d": 1}"#), + None, + None, + None, + ] + .into_iter() + .map(|x| x.and_then(|s| parse_string_to_jsonb(s).ok())), + ); + assert_eq!(result, expected); + Ok(()) + } } diff --git a/src/common/function/src/scalars/json/json_to_string.rs b/src/common/function/src/scalars/json/json_to_string.rs index ae134b75dd..6c0cc260b2 100644 --- a/src/common/function/src/scalars/json/json_to_string.rs +++ b/src/common/function/src/scalars/json/json_to_string.rs @@ -32,7 +32,15 @@ impl Default for JsonToStringFunction { fn default() -> Self { Self { // TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type. - signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable), + signature: Signature::uniform( + 1, + vec![ + DataType::Binary, + DataType::LargeBinary, + DataType::BinaryView, + ], + Volatility::Immutable, + ), } } } @@ -57,7 +65,8 @@ impl Function for JsonToStringFunction { args: ScalarFunctionArgs, ) -> datafusion_common::Result { let [arg0] = extract_args(self.name(), &args)?; - let jsons = arg0.as_binary::(); + let arg0 = arrow::compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); let size = jsons.len(); let mut builder = StringViewBuilder::with_capacity(size); diff --git a/src/common/function/src/scalars/primary_key.rs b/src/common/function/src/scalars/primary_key.rs new file mode 100644 index 0000000000..680c663bc5 --- /dev/null +++ b/src/common/function/src/scalars/primary_key.rs @@ -0,0 +1,521 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
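+
+//! Scalar function `decode_primary_key(pk, encoding, region_metadata_json)` that decodes
+//! an encoded primary key column into a list of human-readable `"name : value"` strings.
+//! The `encoding` argument is `'dense'` or `'sparse'`, and `region_metadata_json` is the
+//! region metadata serialized as JSON. A usage sketch with placeholder arguments:
+//! `decode_primary_key(__primary_key, 'dense', '<region metadata JSON>')`.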
+ +use std::collections::HashMap; +use std::fmt::{self, Display}; +use std::sync::Arc; + +use datafusion_common::arrow::array::{ + Array, ArrayRef, BinaryArray, BinaryViewArray, DictionaryArray, ListBuilder, StringBuilder, +}; +use datafusion_common::arrow::datatypes::{DataType, Field}; +use datafusion_common::{DataFusionError, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datatypes::arrow::datatypes::UInt32Type; +use datatypes::value::Value; +use mito_codec::row_converter::{ + CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec_with_fields, +}; +use store_api::codec::PrimaryKeyEncoding; +use store_api::metadata::RegionMetadata; +use store_api::storage::ColumnId; +use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId}; + +use crate::function::{Function, extract_args}; +use crate::function_registry::FunctionRegistry; + +type NameValuePair = (String, Option); + +#[derive(Clone, Debug)] +pub(crate) struct DecodePrimaryKeyFunction { + signature: Signature, +} + +const NAME: &str = "decode_primary_key"; +const NULL_VALUE_LITERAL: &str = "null"; + +impl Default for DecodePrimaryKeyFunction { + fn default() -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +impl DecodePrimaryKeyFunction { + pub fn register(registry: &FunctionRegistry) { + registry.register_scalar(Self::default()); + } + + fn return_data_type() -> DataType { + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) + } +} + +impl Function for DecodePrimaryKeyFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(Self::return_data_type()) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let [encoded, _, _] = extract_args(self.name(), &args)?; + let number_rows = args.number_rows; + + let encoding = parse_encoding(&args.args[1])?; + let metadata = parse_region_metadata(&args.args[2])?; + let codec = build_codec(&metadata, encoding); + let name_lookup: HashMap<_, _> = metadata + .column_metadatas + .iter() + .map(|c| (c.column_id, c.column_schema.name.clone())) + .collect(); + + let decoded_rows = decode_primary_keys(encoded, number_rows, codec.as_ref(), &name_lookup)?; + let array = build_list_array(&decoded_rows)?; + + Ok(ColumnarValue::Array(array)) + } +} + +impl Display for DecodePrimaryKeyFunction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "DECODE_PRIMARY_KEY") + } +} + +fn parse_encoding(arg: &ColumnarValue) -> datafusion_common::Result { + let encoding = match arg { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) + | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(v))) => v.as_str(), + ColumnarValue::Scalar(value) => { + return Err(DataFusionError::Execution(format!( + "encoding must be a string literal, got {value:?}" + ))); + } + ColumnarValue::Array(_) => { + return Err(DataFusionError::Execution( + "encoding must be a scalar string".to_string(), + )); + } + }; + + match encoding.to_ascii_lowercase().as_str() { + "dense" => Ok(PrimaryKeyEncoding::Dense), + "sparse" => Ok(PrimaryKeyEncoding::Sparse), + _ => Err(DataFusionError::Execution(format!( + "unsupported primary key encoding: {encoding}" + ))), + } +} + +fn build_codec( + metadata: &RegionMetadata, + encoding: PrimaryKeyEncoding, +) -> Arc { + let fields = metadata.primary_key_columns().map(|c| { + ( + 
c.column_id, + SortField::new(c.column_schema.data_type.clone()), + ) + }); + build_primary_key_codec_with_fields(encoding, fields) +} + +fn parse_region_metadata(arg: &ColumnarValue) -> datafusion_common::Result { + let json = match arg { + ColumnarValue::Scalar(ScalarValue::Utf8(Some(v))) + | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(v))) => v.as_str(), + ColumnarValue::Scalar(value) => { + return Err(DataFusionError::Execution(format!( + "region metadata must be a string literal, got {value:?}" + ))); + } + ColumnarValue::Array(_) => { + return Err(DataFusionError::Execution( + "region metadata must be a scalar string".to_string(), + )); + } + }; + + RegionMetadata::from_json(json) + .map_err(|e| DataFusionError::Execution(format!("failed to parse region metadata: {e:?}"))) +} + +fn decode_primary_keys( + encoded: ArrayRef, + number_rows: usize, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + if let Some(dict) = encoded + .as_any() + .downcast_ref::>() + { + decode_dictionary(dict, number_rows, codec, name_lookup) + } else if let Some(array) = encoded.as_any().downcast_ref::() { + decode_binary_array(array, codec, name_lookup) + } else if let Some(array) = encoded.as_any().downcast_ref::() { + decode_binary_view_array(array, codec, name_lookup) + } else { + Err(DataFusionError::Execution(format!( + "column {PRIMARY_KEY_COLUMN_NAME} must be binary or dictionary(binary) array" + ))) + } +} + +fn decode_dictionary( + dict: &DictionaryArray, + number_rows: usize, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + let values = dict + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Execution("primary key dictionary values are not binary".to_string()) + })?; + + let mut decoded_values = Vec::with_capacity(values.len()); + for i in 0..values.len() { + let pk = values.value(i); + let pairs = decode_one(pk, codec, name_lookup)?; + decoded_values.push(pairs); + } + + let mut rows = Vec::with_capacity(number_rows); + let keys = dict.keys(); + for i in 0..number_rows { + let dict_index = keys.value(i) as usize; + rows.push(decoded_values[dict_index].clone()); + } + + Ok(rows) +} + +fn decode_binary_array( + array: &BinaryArray, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + (0..array.len()) + .map(|i| decode_one(array.value(i), codec, name_lookup)) + .collect() +} + +fn decode_binary_view_array( + array: &BinaryViewArray, + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result>> { + (0..array.len()) + .map(|i| decode_one(array.value(i), codec, name_lookup)) + .collect() +} + +fn decode_one( + pk: &[u8], + codec: &dyn PrimaryKeyCodec, + name_lookup: &HashMap, +) -> datafusion_common::Result> { + let decoded = codec + .decode(pk) + .map_err(|e| DataFusionError::Execution(format!("failed to decode primary key: {e}")))?; + + Ok(match decoded { + CompositeValues::Dense(values) => values + .into_iter() + .map(|(column_id, value)| (column_name(column_id, name_lookup), value_to_string(value))) + .collect(), + CompositeValues::Sparse(values) => { + let mut values: Vec<_> = values + .iter() + .map(|(column_id, value)| { + ( + *column_id, + column_name(*column_id, name_lookup), + value_to_string(value.clone()), + ) + }) + .collect(); + values.sort_by_key(|(column_id, _, _)| { + (ReservedColumnId::is_reserved(*column_id), *column_id) + }); + values + .into_iter() + .map(|(_, name, value)| (name, value)) + 
.collect() + } + }) +} + +fn column_name(column_id: ColumnId, name_lookup: &HashMap) -> String { + if let Some(name) = name_lookup.get(&column_id) { + return name.clone(); + } + + if column_id == ReservedColumnId::table_id() { + return "__table_id".to_string(); + } + if column_id == ReservedColumnId::tsid() { + return "__tsid".to_string(); + } + + column_id.to_string() +} + +fn value_to_string(value: Value) -> Option { + match value { + Value::Null => None, + _ => Some(value.to_string()), + } +} + +fn build_list_array(rows: &[Vec]) -> datafusion_common::Result { + let mut builder = ListBuilder::new(StringBuilder::new()); + + for row in rows { + for (key, value) in row { + let value = value.as_deref().unwrap_or(NULL_VALUE_LITERAL); + builder.values().append_value(format!("{key} : {value}")); + } + builder.append(true); + } + + Ok(Arc::new(builder.finish())) +} + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datafusion_common::ScalarValue; + use datatypes::arrow::array::builder::BinaryDictionaryBuilder; + use datatypes::arrow::array::{BinaryArray, ListArray, StringArray}; + use datatypes::arrow::datatypes::UInt32Type; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + use datatypes::value::Value; + use mito_codec::row_converter::{ + DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortField, SparsePrimaryKeyCodec, + }; + use store_api::codec::PrimaryKeyEncoding; + use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; + use store_api::storage::consts::ReservedColumnId; + use store_api::storage::{ColumnId, RegionId}; + + use super::*; + + fn pk_field() -> Arc { + Arc::new(Field::new_dictionary( + PRIMARY_KEY_COLUMN_NAME, + DataType::UInt32, + DataType::Binary, + false, + )) + } + + fn region_metadata_json( + columns: &[(ColumnId, &str, ConcreteDataType)], + encoding: PrimaryKeyEncoding, + ) -> String { + let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1)); + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 100, + }); + builder.primary_key_encoding(encoding); + for (id, name, ty) in columns { + builder.push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new((*name).to_string(), ty.clone(), true), + semantic_type: SemanticType::Tag, + column_id: *id, + }); + } + builder.primary_key(columns.iter().map(|(id, _, _)| *id).collect()); + + builder.build().unwrap().to_json().unwrap() + } + + fn list_row(list: &ListArray, row_idx: usize) -> Vec { + let values = list.value(row_idx); + let values = values.as_any().downcast_ref::().unwrap(); + (0..values.len()) + .map(|i| values.value(i).to_string()) + .collect() + } + + #[test] + fn test_decode_dense_primary_key() { + let columns = vec![ + (0, "host", ConcreteDataType::string_datatype()), + (1, "core", ConcreteDataType::int64_datatype()), + ]; + let metadata_json = region_metadata_json(&columns, PrimaryKeyEncoding::Dense); + let codec = DensePrimaryKeyCodec::with_fields( + columns + .iter() + .map(|(id, _, ty)| (*id, SortField::new(ty.clone()))) + .collect(), + ); + + let rows = vec![ + vec![Value::from("a"), Value::from(1_i64)], + vec![Value::from("b"), Value::from(2_i64)], + vec![Value::from("a"), Value::from(1_i64)], + ]; + + let mut builder = BinaryDictionaryBuilder::::new(); + for row in &rows { + let encoded = codec.encode(row.iter().map(|v| v.as_value_ref())).unwrap(); + 
builder.append(encoded.as_slice()).unwrap(); + } + let dict_array: ArrayRef = Arc::new(builder.finish()); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(dict_array), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("dense".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(metadata_json))), + ], + arg_fields: vec![ + pk_field(), + Arc::new(Field::new("encoding", DataType::Utf8, false)), + Arc::new(Field::new("region_metadata", DataType::Utf8, false)), + ], + number_rows: 3, + return_field: Arc::new(Field::new( + "decoded", + DecodePrimaryKeyFunction::return_data_type(), + false, + )), + config_options: Default::default(), + }; + + let func = DecodePrimaryKeyFunction::default(); + let result = func + .invoke_with_args(args) + .and_then(|v| v.to_array(3)) + .unwrap(); + let list = result.as_any().downcast_ref::().unwrap(); + + let expected = [ + vec!["host : a".to_string(), "core : 1".to_string()], + vec!["host : b".to_string(), "core : 2".to_string()], + vec!["host : a".to_string(), "core : 1".to_string()], + ]; + + for (row_idx, expected_row) in expected.iter().enumerate() { + assert_eq!(*expected_row, list_row(list, row_idx)); + } + } + + #[test] + fn test_decode_sparse_primary_key() { + let columns = vec![ + (10, "k0", ConcreteDataType::string_datatype()), + (11, "k1", ConcreteDataType::string_datatype()), + ]; + let metadata_json = region_metadata_json(&columns, PrimaryKeyEncoding::Sparse); + let codec = SparsePrimaryKeyCodec::schemaless(); + + let rows = vec![ + vec![ + (ReservedColumnId::table_id(), Value::UInt32(1)), + (ReservedColumnId::tsid(), Value::UInt64(100)), + (10, Value::from("a")), + (11, Value::from("b")), + ], + vec![ + (ReservedColumnId::table_id(), Value::UInt32(1)), + (ReservedColumnId::tsid(), Value::UInt64(200)), + (10, Value::from("c")), + (11, Value::from("d")), + ], + ]; + + let mut encoded_values = Vec::with_capacity(rows.len()); + for row in &rows { + let mut buf = Vec::new(); + codec.encode_values(row, &mut buf).unwrap(); + encoded_values.push(buf); + } + + let pk_array: ArrayRef = Arc::new(BinaryArray::from_iter_values( + encoded_values.iter().cloned(), + )); + + let args = ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(pk_array), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("sparse".to_string()))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some(metadata_json))), + ], + arg_fields: vec![ + pk_field(), + Arc::new(Field::new("encoding", DataType::Utf8, false)), + Arc::new(Field::new("region_metadata", DataType::Utf8, false)), + ], + number_rows: rows.len(), + return_field: Arc::new(Field::new( + "decoded", + DecodePrimaryKeyFunction::return_data_type(), + false, + )), + config_options: Default::default(), + }; + + let func = DecodePrimaryKeyFunction::default(); + let result = func + .invoke_with_args(args) + .and_then(|v| v.to_array(rows.len())) + .unwrap(); + let list = result.as_any().downcast_ref::().unwrap(); + + let expected = [ + vec![ + "k0 : a".to_string(), + "k1 : b".to_string(), + "__tsid : 100".to_string(), + "__table_id : 1".to_string(), + ], + vec![ + "k0 : c".to_string(), + "k1 : d".to_string(), + "__tsid : 200".to_string(), + "__table_id : 1".to_string(), + ], + ]; + + for (row_idx, expected_row) in expected.iter().enumerate() { + assert_eq!(*expected_row, list_row(list, row_idx)); + } + } +} diff --git a/src/common/function/src/state.rs b/src/common/function/src/state.rs index f90479b923..d1a3d341b4 100644 --- a/src/common/function/src/state.rs +++ b/src/common/function/src/state.rs @@ -44,7 +44,8 
@@ impl FunctionState { use session::context::QueryContextRef; use store_api::storage::RegionId; use table::requests::{ - CompactTableRequest, DeleteRequest, FlushTableRequest, InsertRequest, + BuildIndexTableRequest, CompactTableRequest, DeleteRequest, FlushTableRequest, + InsertRequest, }; use crate::handlers::{FlowServiceHandler, ProcedureServiceHandler, TableMutationHandler}; @@ -120,6 +121,14 @@ impl FunctionState { Ok(ROWS) } + async fn build_index( + &self, + _request: BuildIndexTableRequest, + _ctx: QueryContextRef, + ) -> Result { + Ok(ROWS) + } + async fn flush_region( &self, _region_id: RegionId, diff --git a/src/common/function/src/system/pg_catalog.rs b/src/common/function/src/system/pg_catalog.rs index 07e7d2abaf..96bcc3fe9d 100644 --- a/src/common/function/src/system/pg_catalog.rs +++ b/src/common/function/src/system/pg_catalog.rs @@ -12,21 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod version; - use std::sync::Arc; use common_catalog::consts::{ DEFAULT_PRIVATE_SCHEMA_NAME, INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME, }; -use datafusion::arrow::array::{ArrayRef, StringArray, as_boolean_array}; +use datafusion::arrow::array::{ArrayRef, StringArray, StringBuilder, as_boolean_array}; use datafusion::catalog::TableFunction; use datafusion::common::ScalarValue; use datafusion::common::utils::SingleRowListArrayBuilder; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility}; use datafusion_pg_catalog::pg_catalog::{self, PgCatalogStaticTables}; use datatypes::arrow::datatypes::{DataType, Field}; -use version::PGVersionFunction; +use derive_more::derive::Display; use crate::function::{Function, find_function_context}; use crate::function_registry::FunctionRegistry; @@ -36,11 +34,15 @@ const CURRENT_SCHEMA_FUNCTION_NAME: &str = "current_schema"; const CURRENT_SCHEMAS_FUNCTION_NAME: &str = "current_schemas"; const SESSION_USER_FUNCTION_NAME: &str = "session_user"; const CURRENT_DATABASE_FUNCTION_NAME: &str = "current_database"; +const OBJ_DESCRIPTION_FUNCTION_NAME: &str = "obj_description"; +const COL_DESCRIPTION_FUNCTION_NAME: &str = "col_description"; +const SHOBJ_DESCRIPTION_FUNCTION_NAME: &str = "shobj_description"; +const PG_MY_TEMP_SCHEMA_FUNCTION_NAME: &str = "pg_my_temp_schema"; define_nullary_udf!(CurrentSchemaFunction); -define_nullary_udf!(CurrentSchemasFunction); define_nullary_udf!(SessionUserFunction); define_nullary_udf!(CurrentDatabaseFunction); +define_nullary_udf!(PgMyTempSchemaFunction); impl Function for CurrentDatabaseFunction { fn name(&self) -> &str { @@ -118,6 +120,23 @@ impl Function for SessionUserFunction { } } +#[derive(Display, Debug)] +#[display("{}", self.name())] +pub(super) struct CurrentSchemasFunction { + signature: Signature, +} + +impl CurrentSchemasFunction { + pub fn new() -> Self { + Self { + signature: Signature::new( + TypeSignature::Exact(vec![DataType::Boolean]), + Volatility::Stable, + ), + } + } +} + impl Function for CurrentSchemasFunction { fn name(&self) -> &str { CURRENT_SCHEMAS_FUNCTION_NAME @@ -125,9 +144,9 @@ impl Function for CurrentSchemasFunction { fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { Ok(DataType::List(Arc::new(Field::new( - "x", - DataType::Utf8View, - false, + "item", + DataType::Utf8, + true, )))) } @@ -159,6 +178,175 @@ impl Function for CurrentSchemasFunction { } } +/// PostgreSQL obj_description - 
returns NULL for compatibility +#[derive(Display, Debug, Clone)] +#[display("{}", self.name())] +pub(super) struct ObjDescriptionFunction { + signature: Signature, +} + +impl ObjDescriptionFunction { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Int64]), + TypeSignature::Exact(vec![DataType::UInt32]), + ], + Volatility::Stable, + ), + } + } +} + +impl Function for ObjDescriptionFunction { + fn name(&self) -> &str { + OBJ_DESCRIPTION_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let num_rows = args.number_rows; + let mut builder = StringBuilder::with_capacity(num_rows, 0); + for _ in 0..num_rows { + builder.append_null(); + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +/// PostgreSQL col_description - returns NULL for compatibility +#[derive(Display, Debug, Clone)] +#[display("{}", self.name())] +pub(super) struct ColDescriptionFunction { + signature: Signature, +} + +impl ColDescriptionFunction { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64, DataType::Int32]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Int32]), + TypeSignature::Exact(vec![DataType::Int64, DataType::Int64]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Int64]), + ], + Volatility::Stable, + ), + } + } +} + +impl Function for ColDescriptionFunction { + fn name(&self) -> &str { + COL_DESCRIPTION_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let num_rows = args.number_rows; + let mut builder = StringBuilder::with_capacity(num_rows, 0); + for _ in 0..num_rows { + builder.append_null(); + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +/// PostgreSQL shobj_description - returns NULL for compatibility +#[derive(Display, Debug, Clone)] +#[display("{}", self.name())] +pub(super) struct ShobjDescriptionFunction { + signature: Signature, +} + +impl ShobjDescriptionFunction { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::UInt64, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::Int32, DataType::Utf8]), + TypeSignature::Exact(vec![DataType::UInt32, DataType::Utf8]), + ], + Volatility::Stable, + ), + } + } +} + +impl Function for ShobjDescriptionFunction { + fn name(&self) -> &str { + SHOBJ_DESCRIPTION_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Utf8) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let num_rows = args.number_rows; + let mut builder = StringBuilder::with_capacity(num_rows, 0); + for _ in 0..num_rows { + builder.append_null(); + } + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +/// PostgreSQL pg_my_temp_schema - returns 0 
(no temp schema) for compatibility +impl Function for PgMyTempSchemaFunction { + fn name(&self) -> &str { + PG_MY_TEMP_SCHEMA_FUNCTION_NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::UInt32) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + _args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + Ok(ColumnarValue::Scalar(ScalarValue::UInt32(Some(0)))) + } +} + pub(super) struct PGCatalogFunction; impl PGCatalogFunction { @@ -166,9 +354,8 @@ impl PGCatalogFunction { let static_tables = Arc::new(PgCatalogStaticTables::try_new().expect("load postgres static tables")); - registry.register_scalar(PGVersionFunction::default()); registry.register_scalar(CurrentSchemaFunction::default()); - registry.register_scalar(CurrentSchemasFunction::default()); + registry.register_scalar(CurrentSchemasFunction::new()); registry.register_scalar(SessionUserFunction::default()); registry.register_scalar(CurrentDatabaseFunction::default()); registry.register(pg_catalog::format_type::create_format_type_udf()); @@ -199,5 +386,100 @@ impl PGCatalogFunction { registry.register(pg_catalog::create_pg_total_relation_size_udf()); registry.register(pg_catalog::create_pg_stat_get_numscans()); registry.register(pg_catalog::create_pg_get_constraintdef()); + registry.register(pg_catalog::create_pg_get_partition_ancestors_udf()); + registry.register(pg_catalog::quote_ident_udf::create_quote_ident_udf()); + registry.register(pg_catalog::quote_ident_udf::create_parse_ident_udf()); + registry.register_scalar(ObjDescriptionFunction::new()); + registry.register_scalar(ColDescriptionFunction::new()); + registry.register_scalar(ShobjDescriptionFunction::new()); + registry.register_scalar(PgMyTempSchemaFunction::default()); + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::Field; + use datafusion::arrow::array::Array; + use datafusion_common::ScalarValue; + use datafusion_expr::ColumnarValue; + + use super::*; + + fn create_test_args(args: Vec, number_rows: usize) -> ScalarFunctionArgs { + ScalarFunctionArgs { + args, + arg_fields: vec![], + number_rows, + return_field: Arc::new(Field::new("result", DataType::Utf8, true)), + config_options: Arc::new(Default::default()), + } + } + + #[test] + fn test_obj_description_function() { + let func = ObjDescriptionFunction::new(); + assert_eq!("obj_description", func.name()); + assert_eq!(DataType::Utf8, func.return_type(&[]).unwrap()); + + let args = create_test_args( + vec![ + ColumnarValue::Scalar(ScalarValue::Int64(Some(1234))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("pg_class".to_string()))), + ], + 1, + ); + let result = func.invoke_with_args(args).unwrap(); + if let ColumnarValue::Array(arr) = result { + assert_eq!(1, arr.len()); + assert!(arr.is_null(0)); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_col_description_function() { + let func = ColDescriptionFunction::new(); + assert_eq!("col_description", func.name()); + assert_eq!(DataType::Utf8, func.return_type(&[]).unwrap()); + + let args = create_test_args( + vec![ + ColumnarValue::Scalar(ScalarValue::Int64(Some(1234))), + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ], + 1, + ); + let result = func.invoke_with_args(args).unwrap(); + if let ColumnarValue::Array(arr) = result { + assert_eq!(1, arr.len()); + assert!(arr.is_null(0)); + } else { + panic!("Expected Array result"); + } + } + + #[test] + fn test_shobj_description_function() { + let func = 
ShobjDescriptionFunction::new(); + assert_eq!("shobj_description", func.name()); + assert_eq!(DataType::Utf8, func.return_type(&[]).unwrap()); + + let args = create_test_args( + vec![ + ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), + ColumnarValue::Scalar(ScalarValue::Utf8(Some("pg_database".to_string()))), + ], + 1, + ); + let result = func.invoke_with_args(args).unwrap(); + if let ColumnarValue::Array(arr) = result { + assert_eq!(1, arr.len()); + assert!(arr.is_null(0)); + } else { + panic!("Expected Array result"); + } } } diff --git a/src/common/function/src/system/pg_catalog/version.rs b/src/common/function/src/system/pg_catalog/version.rs deleted file mode 100644 index 9acdd39472..0000000000 --- a/src/common/function/src/system/pg_catalog/version.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::fmt; - -use datafusion::arrow::datatypes::DataType; -use datafusion_common::ScalarValue; -use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility}; - -use crate::function::Function; - -#[derive(Clone, Debug)] -pub(crate) struct PGVersionFunction { - signature: Signature, -} - -impl Default for PGVersionFunction { - fn default() -> Self { - Self { - signature: Signature::exact(vec![], Volatility::Immutable), - } - } -} - -impl fmt::Display for PGVersionFunction { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "pg_catalog.VERSION") - } -} - -impl Function for PGVersionFunction { - fn name(&self) -> &str { - "pg_catalog.version" - } - - fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { - Ok(DataType::Utf8View) - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn invoke_with_args(&self, _: ScalarFunctionArgs) -> datafusion_common::Result { - Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(Some(format!( - "PostgreSQL 16.3 GreptimeDB {}", - common_version::version() - ))))) - } -} diff --git a/src/common/function/src/system/version.rs b/src/common/function/src/system/version.rs index 369ad14080..1c148bd7ab 100644 --- a/src/common/function/src/system/version.rs +++ b/src/common/function/src/system/version.rs @@ -50,7 +50,7 @@ impl Function for VersionFunction { ) } Channel::Postgres => { - format!("16.3-greptimedb-{}", common_version::version()) + format!("PostgreSQL 16.3 GreptimeDB {}", common_version::version()) } _ => common_version::version().to_string(), }; diff --git a/src/common/grpc/Cargo.toml b/src/common/grpc/Cargo.toml index 1684d0b297..e57b9124fa 100644 --- a/src/common/grpc/Cargo.toml +++ b/src/common/grpc/Cargo.toml @@ -12,6 +12,7 @@ api.workspace = true arrow-flight.workspace = true bytes.workspace = true common-base.workspace = true +common-config.workspace = true common-error.workspace = true common-macro.workspace = true common-recordbatch.workspace = true @@ -37,6 +38,7 @@ vec1 = "1.12" criterion = "0.4" hyper-util = { workspace = true, features = ["tokio"] } rand.workspace = true +tempfile.workspace = true 
[[bench]] name = "bench_main" diff --git a/src/common/grpc/src/channel_manager.rs b/src/common/grpc/src/channel_manager.rs index 667b73f5f3..a60604da94 100644 --- a/src/common/grpc/src/channel_manager.rs +++ b/src/common/grpc/src/channel_manager.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::path::Path; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}; use std::time::Duration; @@ -30,6 +31,7 @@ use tonic::transport::{ use tower::Service; use crate::error::{CreateChannelSnafu, InvalidConfigFilePathSnafu, Result}; +use crate::reloadable_tls::{ReloadableTlsConfig, TlsConfigLoader, maybe_watch_tls_config}; const RECYCLE_CHANNEL_INTERVAL_SECS: u64 = 60; pub const DEFAULT_GRPC_REQUEST_TIMEOUT_SECS: u64 = 10; @@ -50,7 +52,7 @@ pub struct ChannelManager { struct Inner { id: u64, config: ChannelConfig, - client_tls_config: Option, + reloadable_client_tls_config: Option>, pool: Arc, channel_recycle_started: AtomicBool, cancel: CancellationToken, @@ -78,7 +80,7 @@ impl Inner { Self { id, config, - client_tls_config: None, + reloadable_client_tls_config: None, pool, channel_recycle_started: AtomicBool::new(false), cancel, @@ -91,13 +93,17 @@ impl ChannelManager { Default::default() } - /// unified with config function that support tls config - /// use [`load_tls_config`] to load tls config from file system - pub fn with_config(config: ChannelConfig, tls_config: Option) -> Self { + /// Create a ChannelManager with configuration and optional TLS config + /// + /// Use [`load_client_tls_config`] to create TLS configuration from `ClientTlsOption`. + /// The TLS config supports both static (watch disabled) and dynamic reloading (watch enabled). + /// If you want to use dynamic reloading, please **manually** invoke [`maybe_watch_client_tls_config`] after this method. + pub fn with_config( + config: ChannelConfig, + reloadable_tls_config: Option>, + ) -> Self { let mut inner = Inner::with_config(config.clone()); - if let Some(tls_config) = tls_config { - inner.client_tls_config = Some(tls_config); - } + inner.reloadable_client_tls_config = reloadable_tls_config; Self { inner: Arc::new(inner), } @@ -172,8 +178,21 @@ impl ChannelManager { self.pool().retain_channel(f); } + /// Clear all channels to force reconnection. + /// This should be called when TLS configuration changes to ensure new connections use updated certificates. 
+ pub fn clear_all_channels(&self) { + self.pool().retain_channel(|_, _| false); + } + fn build_endpoint(&self, addr: &str) -> Result { - let http_prefix = if self.inner.client_tls_config.is_some() { + // Get the latest TLS config from reloadable config (which handles both static and dynamic cases) + let tls_config = self + .inner + .reloadable_client_tls_config + .as_ref() + .and_then(|c| c.get_config()); + + let http_prefix = if tls_config.is_some() { "https" } else { "http" @@ -212,9 +231,9 @@ impl ChannelManager { if let Some(enabled) = self.config().http2_adaptive_window { endpoint = endpoint.http2_adaptive_window(enabled); } - if let Some(tls_config) = &self.inner.client_tls_config { + if let Some(tls_config) = tls_config { endpoint = endpoint - .tls_config(tls_config.clone()) + .tls_config(tls_config) .context(CreateChannelSnafu { addr })?; } @@ -248,7 +267,7 @@ impl ChannelManager { } } -pub fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result> { +fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result> { let path_config = match tls_option { Some(path_config) if path_config.enabled => path_config, _ => return Ok(None), @@ -276,13 +295,69 @@ pub fn load_tls_config(tls_option: Option<&ClientTlsOption>) -> Result for ClientTlsOption { + type Error = crate::error::Error; + + fn load(&self) -> Result> { + load_tls_config(Some(self)) + } + + fn watch_paths(&self) -> Vec<&Path> { + let mut paths = Vec::new(); + if let Some(cert_path) = &self.client_cert_path { + paths.push(Path::new(cert_path.as_str())); + } + if let Some(key_path) = &self.client_key_path { + paths.push(Path::new(key_path.as_str())); + } + if let Some(ca_path) = &self.server_ca_cert_path { + paths.push(Path::new(ca_path.as_str())); + } + paths + } + + fn watch_enabled(&self) -> bool { + self.enabled && self.watch + } +} + +/// Type alias for client-side reloadable TLS config +pub type ReloadableClientTlsConfig = ReloadableTlsConfig; + +/// Load client TLS configuration from `ClientTlsOption` and return a `ReloadableClientTlsConfig`. +/// This is the primary way to create TLS configuration for the ChannelManager. +pub fn load_client_tls_config( + tls_option: Option, +) -> Result>> { + match tls_option { + Some(option) if option.enabled => { + let reloadable = ReloadableClientTlsConfig::try_new(option)?; + Ok(Some(Arc::new(reloadable))) + } + _ => Ok(None), + } +} + +pub fn maybe_watch_client_tls_config( + client_tls_config: Arc, + channel_manager: ChannelManager, +) -> Result<()> { + maybe_watch_tls_config(client_tls_config, move || { + // Clear all existing channels to force reconnection with new certificates + channel_manager.clear_all_channels(); + info!("Cleared all existing channels to use new TLS certificates."); + }) +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct ClientTlsOption { /// Whether to enable TLS for client. 
pub enabled: bool, pub server_ca_cert_path: Option, pub client_cert_path: Option, pub client_key_path: Option, + #[serde(default)] + pub watch: bool, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -602,6 +677,7 @@ mod tests { server_ca_cert_path: Some("some_server_path".to_string()), client_cert_path: Some("some_cert_path".to_string()), client_key_path: Some("some_key_path".to_string()), + watch: false, }); assert_eq!( @@ -623,6 +699,7 @@ mod tests { server_ca_cert_path: Some("some_server_path".to_string()), client_cert_path: Some("some_cert_path".to_string()), client_key_path: Some("some_key_path".to_string()), + watch: false, }), max_recv_message_size: DEFAULT_MAX_GRPC_RECV_MESSAGE_SIZE, max_send_message_size: DEFAULT_MAX_GRPC_SEND_MESSAGE_SIZE, diff --git a/src/common/grpc/src/error.rs b/src/common/grpc/src/error.rs index 147ff70c07..1d987514df 100644 --- a/src/common/grpc/src/error.rs +++ b/src/common/grpc/src/error.rs @@ -38,6 +38,14 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to watch config file"))] + FileWatch { + #[snafu(source)] + source: common_config::error::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Write type mismatch, column name: {}, expected: {}, actual: {}", column_name, @@ -108,6 +116,7 @@ impl ErrorExt for Error { match self { Error::InvalidTlsConfig { .. } | Error::InvalidConfigFilePath { .. } + | Error::FileWatch { .. } | Error::TypeMismatch { .. } | Error::InvalidFlightData { .. } | Error::NotSupported { .. } => StatusCode::InvalidArguments, diff --git a/src/common/grpc/src/flight/do_put.rs b/src/common/grpc/src/flight/do_put.rs index 15011fc74b..7997b7ba79 100644 --- a/src/common/grpc/src/flight/do_put.rs +++ b/src/common/grpc/src/flight/do_put.rs @@ -46,13 +46,16 @@ pub struct DoPutResponse { request_id: i64, /// The successfully ingested rows number. affected_rows: AffectedRows, + /// The elapsed time in seconds for handling the bulk insert. + elapsed_secs: f64, } impl DoPutResponse { - pub fn new(request_id: i64, affected_rows: AffectedRows) -> Self { + pub fn new(request_id: i64, affected_rows: AffectedRows, elapsed_secs: f64) -> Self { Self { request_id, affected_rows, + elapsed_secs, } } @@ -63,6 +66,10 @@ impl DoPutResponse { pub fn affected_rows(&self) -> AffectedRows { self.affected_rows } + + pub fn elapsed_secs(&self) -> f64 { + self.elapsed_secs + } } impl TryFrom for DoPutResponse { @@ -86,8 +93,11 @@ mod tests { #[test] fn test_serde_do_put_response() { - let x = DoPutResponse::new(42, 88); + let x = DoPutResponse::new(42, 88, 0.123); let serialized = serde_json::to_string(&x).unwrap(); - assert_eq!(serialized, r#"{"request_id":42,"affected_rows":88}"#); + assert_eq!( + serialized, + r#"{"request_id":42,"affected_rows":88,"elapsed_secs":0.123}"# + ); } } diff --git a/src/common/grpc/src/lib.rs b/src/common/grpc/src/lib.rs index 287644b529..8527dd079b 100644 --- a/src/common/grpc/src/lib.rs +++ b/src/common/grpc/src/lib.rs @@ -16,6 +16,7 @@ pub mod channel_manager; pub mod error; pub mod flight; pub mod precision; +pub mod reloadable_tls; pub mod select; pub use arrow_flight::FlightData; diff --git a/src/common/grpc/src/reloadable_tls.rs b/src/common/grpc/src/reloadable_tls.rs new file mode 100644 index 0000000000..1f4f07590e --- /dev/null +++ b/src/common/grpc/src/reloadable_tls.rs @@ -0,0 +1,145 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::path::Path; +use std::result::Result as StdResult; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, RwLock}; + +use common_config::file_watcher::{FileWatcherBuilder, FileWatcherConfig}; +use common_telemetry::{error, info}; +use snafu::ResultExt; + +use crate::error::{FileWatchSnafu, Result}; + +/// A trait for loading TLS configuration from an option type +pub trait TlsConfigLoader { + type Error; + + /// Load the TLS configuration + fn load(&self) -> StdResult, Self::Error>; + + /// Get paths to certificate files for watching + fn watch_paths(&self) -> Vec<&Path>; + + /// Check if watching is enabled + fn watch_enabled(&self) -> bool; +} + +/// A mutable container for TLS config +/// +/// This struct allows dynamic reloading of certificates and keys. +/// It's generic over the config type (e.g., ServerConfig, ClientTlsConfig) +/// and the option type (e.g., TlsOption, ClientTlsOption). +#[derive(Debug)] +pub struct ReloadableTlsConfig +where + O: TlsConfigLoader, +{ + tls_option: O, + config: RwLock>, + version: AtomicUsize, +} + +impl ReloadableTlsConfig +where + O: TlsConfigLoader, +{ + /// Create config by loading configuration from the option type + pub fn try_new(tls_option: O) -> StdResult { + let config = tls_option.load()?; + Ok(Self { + tls_option, + config: RwLock::new(config), + version: AtomicUsize::new(0), + }) + } + + /// Reread certificates and keys from file system. + pub fn reload(&self) -> StdResult<(), O::Error> { + let config = self.tls_option.load()?; + *self.config.write().unwrap() = config; + self.version.fetch_add(1, Ordering::Relaxed); + Ok(()) + } + + /// Get the config held by this container + pub fn get_config(&self) -> Option + where + T: Clone, + { + self.config.read().unwrap().clone() + } + + /// Get associated option + pub fn get_tls_option(&self) -> &O { + &self.tls_option + } + + /// Get version of current config + /// + /// this version will auto increase when config get reloaded. + pub fn get_version(&self) -> usize { + self.version.load(Ordering::Relaxed) + } +} + +/// Watch TLS configuration files for changes and reload automatically +/// +/// This is a generic function that works with any ReloadableTlsConfig. +/// When changes are detected, it calls the provided callback after reloading. +/// +/// T: the original TLS config +/// O: the compiled TLS option +/// F: the hook function to be called after reloading +/// E: the error type for the loading operation +pub fn maybe_watch_tls_config( + tls_config: Arc>, + on_reload: F, +) -> Result<()> +where + T: Send + Sync + 'static, + O: TlsConfigLoader + Send + Sync + 'static, + E: std::error::Error + Send + Sync + 'static, + F: Fn() + Send + 'static, +{ + if !tls_config.get_tls_option().watch_enabled() { + return Ok(()); + } + + let watch_paths: Vec<_> = tls_config + .get_tls_option() + .watch_paths() + .iter() + .map(|p| p.to_path_buf()) + .collect(); + + let tls_config_for_watcher = tls_config.clone(); + + FileWatcherBuilder::new() + .watch_paths(&watch_paths) + .context(FileWatchSnafu)? 
+ .config(FileWatcherConfig::new()) + .spawn(move || { + if let Err(err) = tls_config_for_watcher.reload() { + error!("Failed to reload TLS config: {}", err); + } else { + info!("Reloaded TLS cert/key file successfully."); + on_reload(); + } + }) + .context(FileWatchSnafu)?; + + Ok(()) +} diff --git a/src/common/grpc/tests/mod.rs b/src/common/grpc/tests/mod.rs index a437d21cd9..93188e35fc 100644 --- a/src/common/grpc/tests/mod.rs +++ b/src/common/grpc/tests/mod.rs @@ -13,14 +13,15 @@ // limitations under the License. use common_grpc::channel_manager::{ - ChannelConfig, ChannelManager, ClientTlsOption, load_tls_config, + ChannelConfig, ChannelManager, ClientTlsOption, load_client_tls_config, + maybe_watch_client_tls_config, }; #[tokio::test] async fn test_mtls_config() { // test no config let config = ChannelConfig::new(); - let re = load_tls_config(config.client_tls.as_ref()); + let re = load_client_tls_config(config.client_tls.clone()); assert!(re.is_ok()); assert!(re.unwrap().is_none()); @@ -30,9 +31,10 @@ async fn test_mtls_config() { server_ca_cert_path: Some("tests/tls/wrong_ca.pem".to_string()), client_cert_path: Some("tests/tls/wrong_client.pem".to_string()), client_key_path: Some("tests/tls/wrong_client.key".to_string()), + watch: false, }); - let re = load_tls_config(config.client_tls.as_ref()); + let re = load_client_tls_config(config.client_tls.clone()); assert!(re.is_err()); // test corrupted file content @@ -41,9 +43,10 @@ async fn test_mtls_config() { server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), client_cert_path: Some("tests/tls/client.pem".to_string()), client_key_path: Some("tests/tls/corrupted".to_string()), + watch: false, }); - let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); + let tls_config = load_client_tls_config(config.client_tls.clone()).unwrap(); let re = ChannelManager::with_config(config, tls_config); let re = re.get("127.0.0.1:0"); @@ -55,10 +58,112 @@ async fn test_mtls_config() { server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), client_cert_path: Some("tests/tls/client.pem".to_string()), client_key_path: Some("tests/tls/client.key".to_string()), + watch: false, }); - let tls_config = load_tls_config(config.client_tls.as_ref()).unwrap(); + let tls_config = load_client_tls_config(config.client_tls.clone()).unwrap(); let re = ChannelManager::with_config(config, tls_config); let re = re.get("127.0.0.1:0"); let _ = re.unwrap(); } + +#[tokio::test] +async fn test_reloadable_client_tls_config() { + common_telemetry::init_default_ut_logging(); + + let dir = tempfile::tempdir().unwrap(); + let cert_path = dir.path().join("client.pem"); + let key_path = dir.path().join("client.key"); + + std::fs::copy("tests/tls/client.pem", &cert_path).expect("failed to copy cert to tmpdir"); + std::fs::copy("tests/tls/client.key", &key_path).expect("failed to copy key to tmpdir"); + + assert!(std::fs::exists(&cert_path).unwrap()); + assert!(std::fs::exists(&key_path).unwrap()); + + let client_tls_option = ClientTlsOption { + enabled: true, + server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), + client_cert_path: Some( + cert_path + .clone() + .into_os_string() + .into_string() + .expect("failed to convert path to string"), + ), + client_key_path: Some( + key_path + .clone() + .into_os_string() + .into_string() + .expect("failed to convert path to string"), + ), + watch: true, + }; + + let reloadable_config = load_client_tls_config(Some(client_tls_option)) + .expect("failed to load tls config") + .expect("tls config should be 
present"); + + let config = ChannelConfig::new(); + let manager = ChannelManager::with_config(config, Some(reloadable_config.clone())); + + maybe_watch_client_tls_config(reloadable_config.clone(), manager.clone()) + .expect("failed to watch client config"); + + assert_eq!(0, reloadable_config.get_version()); + assert!(reloadable_config.get_config().is_some()); + + // Create a channel to verify it gets cleared on reload + let _ = manager.get("127.0.0.1:0").expect("failed to get channel"); + + // Simulate file change by copying a different key file + let tmp_file = key_path.with_extension("tmp"); + std::fs::copy("tests/tls/server.key", &tmp_file).expect("Failed to copy temp key file"); + std::fs::rename(&tmp_file, &key_path).expect("Failed to rename temp key file"); + + const MAX_RETRIES: usize = 30; + let mut retries = 0; + let mut version_updated = false; + + while retries < MAX_RETRIES { + if reloadable_config.get_version() > 0 { + version_updated = true; + break; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retries += 1; + } + + assert!(version_updated, "TLS config did not reload in time"); + assert!(reloadable_config.get_version() > 0); + assert!(reloadable_config.get_config().is_some()); +} + +#[tokio::test] +async fn test_channel_manager_with_reloadable_tls() { + common_telemetry::init_default_ut_logging(); + + let client_tls_option = ClientTlsOption { + enabled: true, + server_ca_cert_path: Some("tests/tls/ca.pem".to_string()), + client_cert_path: Some("tests/tls/client.pem".to_string()), + client_key_path: Some("tests/tls/client.key".to_string()), + watch: false, + }; + + let reloadable_config = load_client_tls_config(Some(client_tls_option)) + .expect("failed to load tls config") + .expect("tls config should be present"); + + let config = ChannelConfig::new(); + let manager = ChannelManager::with_config(config, Some(reloadable_config.clone())); + + // Test that we can get a channel + let channel = manager.get("127.0.0.1:0"); + assert!(channel.is_ok()); + + // Test that config is properly set + assert_eq!(0, reloadable_config.get_version()); + assert!(reloadable_config.get_config().is_some()); +} diff --git a/src/common/macro/src/row/schema.rs b/src/common/macro/src/row/schema.rs index 67848a36a0..82296655f9 100644 --- a/src/common/macro/src/row/schema.rs +++ b/src/common/macro/src/row/schema.rs @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use greptime_proto::v1::ColumnDataTypeExtension; use greptime_proto::v1::column_data_type_extension::TypeExt; -use proc_macro2::TokenStream as TokenStream2; +use proc_macro2::{Span, TokenStream as TokenStream2}; use quote::quote; use syn::spanned::Spanned; use syn::{DeriveInput, Result}; @@ -69,57 +70,7 @@ fn impl_schema_method(fields: &[ParsedField<'_>]) -> Result { let semantic_type_val = convert_semantic_type_to_proto_semantic_type(column_attribute.semantic_type) as i32; let semantic_type = syn::LitInt::new(&semantic_type_val.to_string(), ident.span()); let extension = match extension { - Some(ext) => { - match ext.type_ext { - Some(TypeExt::DecimalType(ext)) => { - let precision = syn::LitInt::new(&ext.precision.to_string(), ident.span()); - let scale = syn::LitInt::new(&ext.scale.to_string(), ident.span()); - quote! 
{ - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::DecimalType(DecimalTypeExtension { precision: #precision, scale: #scale })) }) - } - } - Some(TypeExt::JsonType(ext)) => { - let json_type = syn::LitInt::new(&ext.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonType(#json_type)) }) - } - } - Some(TypeExt::VectorType(ext)) => { - let dim = syn::LitInt::new(&ext.dim.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: #dim })) }) - } - } - // TODO(sunng87): revisit all these implementations - Some(TypeExt::ListType(ext)) => { - let item_type = syn::Ident::new(&ext.datatype.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::ListType(ListTypeExtension { item_type: #item_type })) }) - } - } - Some(TypeExt::StructType(ext)) => { - let fields = ext.fields.iter().map(|field| { - let field_name = syn::Ident::new(&field.name.clone(), ident.span()); - let field_type = syn::Ident::new(&field.datatype.to_string(), ident.span()); - quote! { - StructField { name: #field_name, type_: #field_type } - } - }).collect::>(); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::StructType(StructTypeExtension { fields: [#(#fields),*] })) }) - } - } - Some(TypeExt::JsonNativeType(ext)) => { - let inner = syn::Ident::new(&ext.datatype.to_string(), ident.span()); - quote! { - Some(ColumnDataTypeExtension { type_ext: Some(TypeExt::JsonNativeType(JsonNativeTypeExtension { datatype: #inner })) }) - } - } - None => { - quote! { None } - } - } - } + Some(ext) => column_data_type_extension_to_tokens(&ext, ident.span()), None => quote! { None }, }; @@ -141,3 +92,125 @@ fn impl_schema_method(fields: &[ParsedField<'_>]) -> Result { } }) } + +fn column_data_type_extension_to_tokens( + extension: &ColumnDataTypeExtension, + span: Span, +) -> TokenStream2 { + match extension.type_ext.as_ref() { + Some(TypeExt::DecimalType(ext)) => { + let precision = syn::LitInt::new(&ext.precision.to_string(), span); + let scale = syn::LitInt::new(&ext.scale.to_string(), span); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DecimalType(DecimalTypeExtension { + precision: #precision, + scale: #scale, + })), + }) + } + } + Some(TypeExt::JsonType(ext)) => { + let json_type = syn::LitInt::new(&ext.to_string(), span); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonType(#json_type)), + }) + } + } + Some(TypeExt::VectorType(ext)) => { + let dim = syn::LitInt::new(&ext.dim.to_string(), span); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::VectorType(VectorTypeExtension { dim: #dim })), + }) + } + } + Some(TypeExt::ListType(ext)) => { + let datatype = syn::LitInt::new(&ext.datatype.to_string(), span); + let datatype_extension = ext + .datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! 
{ + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::ListType(Box::new(ListTypeExtension { + datatype: #datatype, + datatype_extension: #datatype_extension, + }))), + }) + } + } + Some(TypeExt::StructType(ext)) => { + let fields = ext.fields.iter().map(|field| { + let field_name = &field.name; + let datatype = syn::LitInt::new(&field.datatype.to_string(), span); + let datatype_extension = field + .datatype_extension + .as_ref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + greptime_proto::v1::StructField { + name: #field_name.to_string(), + datatype: #datatype, + datatype_extension: #datatype_extension, + } + } + }); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::StructType(StructTypeExtension { + fields: vec![#(#fields),*], + })), + }) + } + } + Some(TypeExt::JsonNativeType(ext)) => { + let inner = syn::LitInt::new(&ext.datatype.to_string(), span); + let datatype_extension = ext + .datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonNativeType(Box::new( + JsonNativeTypeExtension { + datatype: #inner, + datatype_extension: #datatype_extension, + }, + ))), + }) + } + } + Some(TypeExt::DictionaryType(ext)) => { + let key_datatype = syn::LitInt::new(&ext.key_datatype.to_string(), span); + let value_datatype = syn::LitInt::new(&ext.value_datatype.to_string(), span); + let key_datatype_extension = ext + .key_datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + let value_datatype_extension = ext + .value_datatype_extension + .as_deref() + .map(|ext| column_data_type_extension_to_tokens(ext, span)) + .unwrap_or_else(|| quote! { None }); + quote! { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::DictionaryType(Box::new( + DictionaryTypeExtension { + key_datatype: #key_datatype, + key_datatype_extension: #key_datatype_extension, + value_datatype: #value_datatype, + value_datatype_extension: #value_datatype_extension, + }, + ))), + }) + } + } + None => quote! 
{ None }, + } +} diff --git a/src/common/macro/src/row/utils.rs b/src/common/macro/src/row/utils.rs index 40f990a40a..1768b2747a 100644 --- a/src/common/macro/src/row/utils.rs +++ b/src/common/macro/src/row/utils.rs @@ -309,5 +309,8 @@ pub(crate) fn convert_column_data_type_to_value_data_ident( ColumnDataType::Vector => format_ident!("VectorValue"), ColumnDataType::List => format_ident!("ListValue"), ColumnDataType::Struct => format_ident!("StructValue"), + ColumnDataType::Dictionary => { + panic!("Dictionary data type is not supported in row macros yet") + } } } diff --git a/src/common/memory-manager/Cargo.toml b/src/common/memory-manager/Cargo.toml new file mode 100644 index 0000000000..a6be50f774 --- /dev/null +++ b/src/common/memory-manager/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "common-memory-manager" +version.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +common-error = { workspace = true } +common-macro = { workspace = true } +common-telemetry = { workspace = true } +humantime = { workspace = true } +serde = { workspace = true } +snafu = { workspace = true } +tokio = { workspace = true, features = ["sync"] } + +[dev-dependencies] +tokio = { workspace = true, features = ["rt", "macros"] } diff --git a/src/common/memory-manager/src/error.rs b/src/common/memory-manager/src/error.rs new file mode 100644 index 0000000000..5ff7d74ad6 --- /dev/null +++ b/src/common/memory-manager/src/error.rs @@ -0,0 +1,53 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use snafu::Snafu; + +pub type Result = std::result::Result; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display( + "Memory limit exceeded: requested {requested_bytes} bytes, limit {limit_bytes} bytes" + ))] + MemoryLimitExceeded { + requested_bytes: u64, + limit_bytes: u64, + }, + + #[snafu(display("Memory semaphore unexpectedly closed"))] + MemorySemaphoreClosed, +} + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + use Error::*; + + match self { + MemoryLimitExceeded { .. } => StatusCode::RuntimeResourcesExhausted, + MemorySemaphoreClosed => StatusCode::Unexpected, + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/common/memory-manager/src/guard.rs b/src/common/memory-manager/src/guard.rs new file mode 100644 index 0000000000..e72e16ab5b --- /dev/null +++ b/src/common/memory-manager/src/guard.rs @@ -0,0 +1,138 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{fmt, mem}; + +use common_telemetry::debug; +use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; + +use crate::manager::{MemoryMetrics, MemoryQuota, bytes_to_permits, permits_to_bytes}; + +/// Guard representing a slice of reserved memory. +pub struct MemoryGuard { + pub(crate) state: GuardState, +} + +pub(crate) enum GuardState { + Unlimited, + Limited { + permit: OwnedSemaphorePermit, + quota: MemoryQuota, + }, +} + +impl MemoryGuard { + pub(crate) fn unlimited() -> Self { + Self { + state: GuardState::Unlimited, + } + } + + pub(crate) fn limited(permit: OwnedSemaphorePermit, quota: MemoryQuota) -> Self { + Self { + state: GuardState::Limited { permit, quota }, + } + } + + /// Returns granted quota in bytes. + pub fn granted_bytes(&self) -> u64 { + match &self.state { + GuardState::Unlimited => 0, + GuardState::Limited { permit, .. } => permits_to_bytes(permit.num_permits() as u32), + } + } + + /// Tries to allocate additional memory during task execution. + /// + /// On success, merges the new memory into this guard and returns true. + /// On failure, returns false and leaves this guard unchanged. + pub fn request_additional(&mut self, bytes: u64) -> bool { + match &mut self.state { + GuardState::Unlimited => true, + GuardState::Limited { permit, quota } => { + if bytes == 0 { + return true; + } + + let additional_permits = bytes_to_permits(bytes); + + match quota + .semaphore + .clone() + .try_acquire_many_owned(additional_permits) + { + Ok(additional_permit) => { + permit.merge(additional_permit); + quota.update_in_use_metric(); + debug!("Allocated additional {} bytes", bytes); + true + } + Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { + quota.metrics.inc_rejected("request_additional"); + false + } + } + } + } + } + + /// Releases a portion of granted memory back to the pool early, + /// before the guard is dropped. + /// + /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted. 
+ pub fn early_release_partial(&mut self, bytes: u64) -> bool { + match &mut self.state { + GuardState::Unlimited => true, + GuardState::Limited { permit, quota } => { + if bytes == 0 { + return true; + } + + let release_permits = bytes_to_permits(bytes); + + match permit.split(release_permits as usize) { + Some(released_permit) => { + let released_bytes = permits_to_bytes(released_permit.num_permits() as u32); + drop(released_permit); + quota.update_in_use_metric(); + debug!("Early released {} bytes from memory guard", released_bytes); + true + } + None => false, + } + } + } + } +} + +impl Drop for MemoryGuard { + fn drop(&mut self) { + if let GuardState::Limited { permit, quota } = + mem::replace(&mut self.state, GuardState::Unlimited) + { + let bytes = permits_to_bytes(permit.num_permits() as u32); + drop(permit); + quota.update_in_use_metric(); + debug!("Released memory: {} bytes", bytes); + } + } +} + +impl fmt::Debug for MemoryGuard { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryGuard") + .field("granted_bytes", &self.granted_bytes()) + .finish() + } +} diff --git a/src/common/memory-manager/src/lib.rs b/src/common/memory-manager/src/lib.rs new file mode 100644 index 0000000000..61d52f6366 --- /dev/null +++ b/src/common/memory-manager/src/lib.rs @@ -0,0 +1,47 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Generic memory management for resource-constrained operations. +//! +//! This crate provides a reusable memory quota system based on semaphores, +//! allowing different subsystems (compaction, flush, index build, etc.) to +//! share the same allocation logic while using their own metrics. + +mod error; +mod guard; +mod manager; +mod policy; + +#[cfg(test)] +mod tests; + +pub use error::{Error, Result}; +pub use guard::MemoryGuard; +pub use manager::{MemoryManager, MemoryMetrics, PERMIT_GRANULARITY_BYTES}; +pub use policy::{DEFAULT_MEMORY_WAIT_TIMEOUT, OnExhaustedPolicy}; + +/// No-op metrics implementation for testing. +#[derive(Clone, Copy, Debug, Default)] +pub struct NoOpMetrics; + +impl MemoryMetrics for NoOpMetrics { + #[inline(always)] + fn set_limit(&self, _: i64) {} + + #[inline(always)] + fn set_in_use(&self, _: i64) {} + + #[inline(always)] + fn inc_rejected(&self, _: &str) {} +} diff --git a/src/common/memory-manager/src/manager.rs b/src/common/memory-manager/src/manager.rs new file mode 100644 index 0000000000..8cc7b937e4 --- /dev/null +++ b/src/common/memory-manager/src/manager.rs @@ -0,0 +1,173 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use snafu::ensure; +use tokio::sync::{Semaphore, TryAcquireError}; + +use crate::error::{MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result}; +use crate::guard::MemoryGuard; + +/// Minimum bytes controlled by one semaphore permit. +pub const PERMIT_GRANULARITY_BYTES: u64 = 1 << 20; // 1 MB + +/// Trait for recording memory usage metrics. +pub trait MemoryMetrics: Clone + Send + Sync + 'static { + fn set_limit(&self, bytes: i64); + fn set_in_use(&self, bytes: i64); + fn inc_rejected(&self, reason: &str); +} + +/// Generic memory manager for quota-controlled operations. +#[derive(Clone)] +pub struct MemoryManager { + quota: Option>, +} + +#[derive(Clone)] +pub(crate) struct MemoryQuota { + pub(crate) semaphore: Arc, + pub(crate) limit_permits: u32, + pub(crate) metrics: M, +} + +impl MemoryManager { + /// Creates a new memory manager with the given limit in bytes. + /// `limit_bytes = 0` disables the limit. + pub fn new(limit_bytes: u64, metrics: M) -> Self { + if limit_bytes == 0 { + metrics.set_limit(0); + return Self { quota: None }; + } + + let limit_permits = bytes_to_permits(limit_bytes); + let limit_aligned_bytes = permits_to_bytes(limit_permits); + metrics.set_limit(limit_aligned_bytes as i64); + + Self { + quota: Some(MemoryQuota { + semaphore: Arc::new(Semaphore::new(limit_permits as usize)), + limit_permits, + metrics, + }), + } + } + + /// Returns the configured limit in bytes (0 if unlimited). + pub fn limit_bytes(&self) -> u64 { + self.quota + .as_ref() + .map(|quota| permits_to_bytes(quota.limit_permits)) + .unwrap_or(0) + } + + /// Returns currently used bytes. + pub fn used_bytes(&self) -> u64 { + self.quota + .as_ref() + .map(|quota| permits_to_bytes(quota.used_permits())) + .unwrap_or(0) + } + + /// Returns available bytes. + pub fn available_bytes(&self) -> u64 { + self.quota + .as_ref() + .map(|quota| permits_to_bytes(quota.available_permits_clamped())) + .unwrap_or(0) + } + + /// Acquires memory, waiting if necessary until enough is available. + /// + /// # Errors + /// - Returns error if requested bytes exceed the total limit + /// - Returns error if the semaphore is unexpectedly closed + pub async fn acquire(&self, bytes: u64) -> Result> { + match &self.quota { + None => Ok(MemoryGuard::unlimited()), + Some(quota) => { + let permits = bytes_to_permits(bytes); + + ensure!( + permits <= quota.limit_permits, + MemoryLimitExceededSnafu { + requested_bytes: bytes, + limit_bytes: permits_to_bytes(quota.limit_permits), + } + ); + + let permit = quota + .semaphore + .clone() + .acquire_many_owned(permits) + .await + .map_err(|_| MemorySemaphoreClosedSnafu.build())?; + quota.update_in_use_metric(); + Ok(MemoryGuard::limited(permit, quota.clone())) + } + } + } + + /// Tries to acquire memory. Returns Some(guard) on success, None if insufficient. 
+ pub fn try_acquire(&self, bytes: u64) -> Option> { + match &self.quota { + None => Some(MemoryGuard::unlimited()), + Some(quota) => { + let permits = bytes_to_permits(bytes); + + match quota.semaphore.clone().try_acquire_many_owned(permits) { + Ok(permit) => { + quota.update_in_use_metric(); + Some(MemoryGuard::limited(permit, quota.clone())) + } + Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { + quota.metrics.inc_rejected("try_acquire"); + None + } + } + } + } + } +} + +impl MemoryQuota { + pub(crate) fn used_permits(&self) -> u32 { + self.limit_permits + .saturating_sub(self.available_permits_clamped()) + } + + pub(crate) fn available_permits_clamped(&self) -> u32 { + self.semaphore + .available_permits() + .min(self.limit_permits as usize) as u32 + } + + pub(crate) fn update_in_use_metric(&self) { + let bytes = permits_to_bytes(self.used_permits()); + self.metrics.set_in_use(bytes as i64); + } +} + +pub(crate) fn bytes_to_permits(bytes: u64) -> u32 { + bytes + .saturating_add(PERMIT_GRANULARITY_BYTES - 1) + .saturating_div(PERMIT_GRANULARITY_BYTES) + .min(Semaphore::MAX_PERMITS as u64) + .min(u32::MAX as u64) as u32 +} + +pub(crate) fn permits_to_bytes(permits: u32) -> u64 { + (permits as u64).saturating_mul(PERMIT_GRANULARITY_BYTES) +} diff --git a/src/common/memory-manager/src/policy.rs b/src/common/memory-manager/src/policy.rs new file mode 100644 index 0000000000..3f19568b8f --- /dev/null +++ b/src/common/memory-manager/src/policy.rs @@ -0,0 +1,83 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use humantime::{format_duration, parse_duration}; +use serde::{Deserialize, Serialize}; + +/// Default wait timeout for memory acquisition. +pub const DEFAULT_MEMORY_WAIT_TIMEOUT: Duration = Duration::from_secs(10); + +/// Defines how to react when memory cannot be acquired immediately. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum OnExhaustedPolicy { + /// Wait until enough memory is released, bounded by timeout. + Wait { timeout: Duration }, + + /// Fail immediately if memory is not available. + Fail, +} + +impl Default for OnExhaustedPolicy { + fn default() -> Self { + OnExhaustedPolicy::Wait { + timeout: DEFAULT_MEMORY_WAIT_TIMEOUT, + } + } +} + +impl Serialize for OnExhaustedPolicy { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let text = match self { + OnExhaustedPolicy::Fail => "fail".to_string(), + OnExhaustedPolicy::Wait { timeout } if *timeout == DEFAULT_MEMORY_WAIT_TIMEOUT => { + "wait".to_string() + } + OnExhaustedPolicy::Wait { timeout } => format!("wait({})", format_duration(*timeout)), + }; + serializer.serialize_str(&text) + } +} + +impl<'de> Deserialize<'de> for OnExhaustedPolicy { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let raw = String::deserialize(deserializer)?; + let lower = raw.to_ascii_lowercase(); + + // Accept both "skip" (legacy) and "fail". 
+ if lower == "skip" || lower == "fail" { + return Ok(OnExhaustedPolicy::Fail); + } + if lower == "wait" { + return Ok(OnExhaustedPolicy::default()); + } + if lower.starts_with("wait(") && lower.ends_with(')') { + let inner = &raw[5..raw.len() - 1]; + let timeout = parse_duration(inner).map_err(serde::de::Error::custom)?; + return Ok(OnExhaustedPolicy::Wait { timeout }); + } + + Err(serde::de::Error::custom(format!( + "invalid memory policy: {}, expected wait, wait(), fail", + raw + ))) + } +} diff --git a/src/common/memory-manager/src/tests.rs b/src/common/memory-manager/src/tests.rs new file mode 100644 index 0000000000..3a928f9c7c --- /dev/null +++ b/src/common/memory-manager/src/tests.rs @@ -0,0 +1,247 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use tokio::time::{Duration, sleep}; + +use crate::{MemoryManager, NoOpMetrics, PERMIT_GRANULARITY_BYTES}; + +#[test] +fn test_try_acquire_unlimited() { + let manager = MemoryManager::new(0, NoOpMetrics); + let guard = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap(); + assert_eq!(manager.limit_bytes(), 0); + assert_eq!(guard.granted_bytes(), 0); +} + +#[test] +fn test_try_acquire_limited_success_and_release() { + let bytes = 2 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(bytes, NoOpMetrics); + { + let guard = manager.try_acquire(PERMIT_GRANULARITY_BYTES).unwrap(); + assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), PERMIT_GRANULARITY_BYTES); + drop(guard); + } + assert_eq!(manager.used_bytes(), 0); +} + +#[test] +fn test_try_acquire_exceeds_limit() { + let limit = PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(limit, NoOpMetrics); + let result = manager.try_acquire(limit + PERMIT_GRANULARITY_BYTES); + assert!(result.is_none()); +} + +#[tokio::test(flavor = "current_thread")] +async fn test_acquire_blocks_and_unblocks() { + let bytes = 2 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(bytes, NoOpMetrics); + let guard = manager.try_acquire(bytes).unwrap(); + + // Spawn a task that will block on acquire() + let waiter = { + let manager = manager.clone(); + tokio::spawn(async move { + // This will block until memory is available + let _guard = manager.acquire(bytes).await.unwrap(); + }) + }; + + sleep(Duration::from_millis(10)).await; + // Release memory - this should unblock the waiter + drop(guard); + + // Waiter should complete now + waiter.await.unwrap(); +} + +#[test] +fn test_request_additional_success() { + let limit = 10 * PERMIT_GRANULARITY_BYTES; // 10MB limit + let manager = MemoryManager::new(limit, NoOpMetrics); + + // Acquire base quota (5MB) + let base = 5 * PERMIT_GRANULARITY_BYTES; + let mut guard = manager.try_acquire(base).unwrap(); + assert_eq!(guard.granted_bytes(), base); + assert_eq!(manager.used_bytes(), base); + + // Request additional memory (3MB) - should succeed and merge + assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); + 
assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); +} + +#[test] +fn test_request_additional_exceeds_limit() { + let limit = 10 * PERMIT_GRANULARITY_BYTES; // 10MB limit + let manager = MemoryManager::new(limit, NoOpMetrics); + + // Acquire base quota (5MB) + let base = 5 * PERMIT_GRANULARITY_BYTES; + let mut guard = manager.try_acquire(base).unwrap(); + + // Request additional memory (3MB) - should succeed + assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); + + // Request more (3MB) - should fail (would exceed 10MB limit) + let result = guard.request_additional(3 * PERMIT_GRANULARITY_BYTES); + assert!(!result); + + // Still at 8MB + assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); + assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES); +} + +#[test] +fn test_request_additional_auto_release_on_guard_drop() { + let limit = 10 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(limit, NoOpMetrics); + + { + let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); + + // Request additional - memory is merged into guard + assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); + + // When guard drops, all memory (base + additional) is released together + } + + // After scope, all memory should be released + assert_eq!(manager.used_bytes(), 0); +} + +#[test] +fn test_request_additional_unlimited() { + let manager = MemoryManager::new(0, NoOpMetrics); // Unlimited + let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); + + // Should always succeed with unlimited manager + assert!(guard.request_additional(100 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 0); + assert_eq!(manager.used_bytes(), 0); +} + +#[test] +fn test_request_additional_zero_bytes() { + let limit = 10 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(limit, NoOpMetrics); + + let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); + + // Request 0 bytes should succeed without affecting anything + assert!(guard.request_additional(0)); + assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES); +} + +#[test] +fn test_early_release_partial_success() { + let limit = 10 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(limit, NoOpMetrics); + + let mut guard = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap(); + assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); + + // Release half + assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 4 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 4 * PERMIT_GRANULARITY_BYTES); + + // Released memory should be available to others + let _guard2 = manager.try_acquire(4 * PERMIT_GRANULARITY_BYTES).unwrap(); + assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); +} + +#[test] +fn test_early_release_partial_exceeds_granted() { + let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics); + let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); + + // Try to release more than granted - should fail + assert!(!guard.early_release_partial(10 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 5 * 
PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES); +} + +#[test] +fn test_early_release_partial_unlimited() { + let manager = MemoryManager::new(0, NoOpMetrics); + let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap(); + + // Unlimited guard - release should succeed (no-op) + assert!(guard.early_release_partial(50 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 0); +} + +#[test] +fn test_request_and_early_release_symmetry() { + let limit = 20 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(limit, NoOpMetrics); + + let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); + + // Request additional + assert!(guard.request_additional(5 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES); + + // Early release some + assert!(guard.early_release_partial(3 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES); + + // Request again + assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 9 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES); + + // Early release again + assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES)); + assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES); + assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES); + + drop(guard); + assert_eq!(manager.used_bytes(), 0); +} + +#[test] +fn test_small_allocation_rounds_up() { + // Test that allocations smaller than PERMIT_GRANULARITY_BYTES + // round up to 1 permit and can use request_additional() + let limit = 10 * PERMIT_GRANULARITY_BYTES; + let manager = MemoryManager::new(limit, NoOpMetrics); + + let mut guard = manager.try_acquire(512 * 1024).unwrap(); // 512KB + assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // Rounds up to 1MB + assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more + assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES); +} + +#[test] +fn test_acquire_zero_bytes_lazy_allocation() { + // Test that acquire(0) returns 0 permits but can request_additional() later + let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics); + + let mut guard = manager.try_acquire(0).unwrap(); + assert_eq!(guard.granted_bytes(), 0); // No permits consumed + assert_eq!(manager.used_bytes(), 0); + + assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation + assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES); +} diff --git a/src/common/meta/src/cluster.rs b/src/common/meta/src/cluster.rs index 74485513e9..78af133e8f 100644 --- a/src/common/meta/src/cluster.rs +++ b/src/common/meta/src/cluster.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::{Display, Formatter}; use std::hash::{DefaultHasher, Hash, Hasher}; use std::str::FromStr; @@ -60,7 +61,7 @@ pub trait ClusterInfo { } /// The key of [NodeInfo] in the storage. The format is `__meta_cluster_node_info-0-{role}-{node_id}`. 
-#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)] pub struct NodeInfoKey { /// The role of the node. It can be `[Role::Datanode]` or `[Role::Frontend]`. pub role: Role, @@ -135,7 +136,7 @@ pub struct NodeInfo { pub hostname: String, } -#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, PartialOrd, Ord)] pub enum Role { Datanode, Frontend, @@ -241,6 +242,12 @@ impl From<&NodeInfoKey> for Vec { } } +impl Display for NodeInfoKey { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}-{}", self.role, self.node_id) + } +} + impl FromStr for NodeInfo { type Err = Error; diff --git a/src/common/meta/src/datanode.rs b/src/common/meta/src/datanode.rs index ffa85b4a7e..8b521d8e43 100644 --- a/src/common/meta/src/datanode.rs +++ b/src/common/meta/src/datanode.rs @@ -132,6 +132,8 @@ pub enum RegionManifestInfo { Mito { manifest_version: u64, flushed_entry_id: u64, + /// Number of files removed in the manifest's `removed_files` field. + file_removed_cnt: u64, }, Metric { data_manifest_version: u64, @@ -271,9 +273,11 @@ impl From for RegionManifestInfo { store_api::region_engine::RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt, } => RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt, }, store_api::region_engine::RegionManifestInfo::Metric { data_manifest_version, diff --git a/src/common/meta/src/ddl.rs b/src/common/meta/src/ddl.rs index e12331d4a2..7af538f785 100644 --- a/src/common/meta/src/ddl.rs +++ b/src/common/meta/src/ddl.rs @@ -31,6 +31,7 @@ use crate::region_registry::LeaderRegionRegistryRef; pub mod alter_database; pub mod alter_logical_tables; pub mod alter_table; +pub mod comment_on; pub mod create_database; pub mod create_flow; pub mod create_logical_tables; diff --git a/src/common/meta/src/ddl/alter_database.rs b/src/common/meta/src/ddl/alter_database.rs index 6e199cb92a..736459e533 100644 --- a/src/common/meta/src/ddl/alter_database.rs +++ b/src/common/meta/src/ddl/alter_database.rs @@ -47,6 +47,9 @@ fn build_new_schema_value( SetDatabaseOption::Ttl(ttl) => { value.ttl = Some(*ttl); } + SetDatabaseOption::Other(key, val) => { + value.extra_options.insert(key.clone(), val.clone()); + } } } } @@ -54,6 +57,9 @@ fn build_new_schema_value( for key in keys.0.iter() { match key { UnsetDatabaseOption::Ttl => value.ttl = None, + UnsetDatabaseOption::Other(key) => { + value.extra_options.remove(key); + } } } } @@ -234,4 +240,41 @@ mod tests { build_new_schema_value(current_schema_value, &unset_ttl_alter_kind).unwrap(); assert_eq!(new_schema_value.ttl, None); } + + #[test] + fn test_build_new_schema_value_with_compaction_options() { + let set_compaction = AlterDatabaseKind::SetDatabaseOptions(SetDatabaseOptions(vec![ + SetDatabaseOption::Other("compaction.type".to_string(), "twcs".to_string()), + SetDatabaseOption::Other("compaction.twcs.time_window".to_string(), "1d".to_string()), + ])); + + let current_schema_value = SchemaNameValue::default(); + let new_schema_value = + build_new_schema_value(current_schema_value.clone(), &set_compaction).unwrap(); + + assert_eq!( + new_schema_value.extra_options.get("compaction.type"), + Some(&"twcs".to_string()) + ); + assert_eq!( + new_schema_value + .extra_options + .get("compaction.twcs.time_window"), + Some(&"1d".to_string()) + ); + + let unset_compaction 
= AlterDatabaseKind::UnsetDatabaseOptions(UnsetDatabaseOptions(vec![ + UnsetDatabaseOption::Other("compaction.type".to_string()), + ])); + + let new_schema_value = build_new_schema_value(new_schema_value, &unset_compaction).unwrap(); + + assert_eq!(new_schema_value.extra_options.get("compaction.type"), None); + assert_eq!( + new_schema_value + .extra_options + .get("compaction.twcs.time_window"), + Some(&"1d".to_string()) + ); + } } diff --git a/src/common/meta/src/ddl/alter_table/executor.rs b/src/common/meta/src/ddl/alter_table/executor.rs index a5e843dd08..5e44023f35 100644 --- a/src/common/meta/src/ddl/alter_table/executor.rs +++ b/src/common/meta/src/ddl/alter_table/executor.rs @@ -301,8 +301,8 @@ fn build_new_table_info( | AlterKind::UnsetTableOptions { .. } | AlterKind::SetIndexes { .. } | AlterKind::UnsetIndexes { .. } - | AlterKind::DropDefaults { .. } => {} - AlterKind::SetDefaults { .. } => {} + | AlterKind::DropDefaults { .. } + | AlterKind::SetDefaults { .. } => {} } info!( diff --git a/src/common/meta/src/ddl/comment_on.rs b/src/common/meta/src/ddl/comment_on.rs new file mode 100644 index 0000000000..37b614ba5e --- /dev/null +++ b/src/common/meta/src/ddl/comment_on.rs @@ -0,0 +1,509 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
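An illustrative sketch (not part of this diff) summarizing the flow of the new COMMENT ON procedure defined below; the variant names mirror the CommentOnState transitions referenced later in this file, while the enum name and comments here are hypothetical annotations, not the actual definition.

// Hypothetical summary of the procedure's steps; the real enum is CommentOnState.
enum CommentOnFlowSketch {
    Prepare,         // resolve the target table/column/flow and short-circuit with
                     // Status::done() when the comment is unchanged
    UpdateMetadata,  // rewrite TableInfo (table/column comments) or FlowInfo (flow
                     // comments) in the kv backend
    InvalidateCache, // invalidate cached metadata for the touched object
}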
+ +use async_trait::async_trait; +use chrono::Utc; +use common_catalog::format_full_table_name; +use common_procedure::error::{FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu}; +use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status}; +use common_telemetry::tracing::info; +use datatypes::schema::COMMENT_KEY as COLUMN_COMMENT_KEY; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, ensure}; +use store_api::storage::TableId; +use strum::AsRefStr; +use table::metadata::RawTableInfo; +use table::requests::COMMENT_KEY as TABLE_COMMENT_KEY; +use table::table_name::TableName; + +use crate::cache_invalidator::Context; +use crate::ddl::DdlContext; +use crate::ddl::utils::map_to_procedure_error; +use crate::error::{ColumnNotFoundSnafu, FlowNotFoundSnafu, Result, TableNotFoundSnafu}; +use crate::instruction::CacheIdent; +use crate::key::flow::flow_info::{FlowInfoKey, FlowInfoValue}; +use crate::key::table_info::{TableInfoKey, TableInfoValue}; +use crate::key::table_name::TableNameKey; +use crate::key::{DeserializedValueWithBytes, FlowId, MetadataKey, MetadataValue}; +use crate::lock_key::{CatalogLock, FlowNameLock, SchemaLock, TableNameLock}; +use crate::rpc::ddl::{CommentObjectType, CommentOnTask}; +use crate::rpc::store::PutRequest; + +pub struct CommentOnProcedure { + pub context: DdlContext, + pub data: CommentOnData, +} + +impl CommentOnProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::CommentOn"; + + pub fn new(task: CommentOnTask, context: DdlContext) -> Self { + Self { + context, + data: CommentOnData::new(task), + } + } + + pub fn from_json(json: &str, context: DdlContext) -> ProcedureResult { + let data = serde_json::from_str(json).context(FromJsonSnafu)?; + + Ok(Self { context, data }) + } + + pub async fn on_prepare(&mut self) -> Result { + match self.data.object_type { + CommentObjectType::Table | CommentObjectType::Column => { + self.prepare_table_or_column().await?; + } + CommentObjectType::Flow => { + self.prepare_flow().await?; + } + } + + // Fast path: if comment is unchanged, skip update + if self.data.is_unchanged { + let object_desc = match self.data.object_type { + CommentObjectType::Table => format!( + "table {}", + format_full_table_name( + &self.data.catalog_name, + &self.data.schema_name, + &self.data.object_name, + ) + ), + CommentObjectType::Column => format!( + "column {}.{}", + format_full_table_name( + &self.data.catalog_name, + &self.data.schema_name, + &self.data.object_name, + ), + self.data.column_name.as_ref().unwrap() + ), + CommentObjectType::Flow => { + format!("flow {}.{}", self.data.catalog_name, self.data.object_name) + } + }; + info!("Comment unchanged for {}, skipping update", object_desc); + return Ok(Status::done()); + } + + self.data.state = CommentOnState::UpdateMetadata; + Ok(Status::executing(true)) + } + + async fn prepare_table_or_column(&mut self) -> Result<()> { + let table_name_key = TableNameKey::new( + &self.data.catalog_name, + &self.data.schema_name, + &self.data.object_name, + ); + + let table_id = self + .context + .table_metadata_manager + .table_name_manager() + .get(table_name_key) + .await? + .with_context(|| TableNotFoundSnafu { + table_name: format_full_table_name( + &self.data.catalog_name, + &self.data.schema_name, + &self.data.object_name, + ), + })? + .table_id(); + + let table_info = self + .context + .table_metadata_manager + .table_info_manager() + .get(table_id) + .await? 
+ .with_context(|| TableNotFoundSnafu { + table_name: format_full_table_name( + &self.data.catalog_name, + &self.data.schema_name, + &self.data.object_name, + ), + })?; + + // For column comments, validate the column exists + if self.data.object_type == CommentObjectType::Column { + let column_name = self.data.column_name.as_ref().unwrap(); + let column_exists = table_info + .table_info + .meta + .schema + .column_schemas + .iter() + .any(|col| &col.name == column_name); + + ensure!( + column_exists, + ColumnNotFoundSnafu { + column_name, + column_id: 0u32, // column_id is not known here + } + ); + } + + self.data.table_id = Some(table_id); + + // Check if comment is unchanged for early exit optimization + match self.data.object_type { + CommentObjectType::Table => { + let current_comment = &table_info.table_info.desc; + if &self.data.comment == current_comment { + self.data.is_unchanged = true; + } + } + CommentObjectType::Column => { + let column_name = self.data.column_name.as_ref().unwrap(); + let column_schema = table_info + .table_info + .meta + .schema + .column_schemas + .iter() + .find(|col| &col.name == column_name) + .unwrap(); // Safe: validated above + + let current_comment = column_schema.metadata().get(COLUMN_COMMENT_KEY); + if self.data.comment.as_deref() == current_comment.map(String::as_str) { + self.data.is_unchanged = true; + } + } + CommentObjectType::Flow => { + // this branch is handled in `prepare_flow` + } + } + + self.data.table_info = Some(table_info); + + Ok(()) + } + + async fn prepare_flow(&mut self) -> Result<()> { + let flow_name_value = self + .context + .flow_metadata_manager + .flow_name_manager() + .get(&self.data.catalog_name, &self.data.object_name) + .await? + .with_context(|| FlowNotFoundSnafu { + flow_name: &self.data.object_name, + })?; + + let flow_id = flow_name_value.flow_id(); + let flow_info = self + .context + .flow_metadata_manager + .flow_info_manager() + .get_raw(flow_id) + .await? 
+ .with_context(|| FlowNotFoundSnafu { + flow_name: &self.data.object_name, + })?; + + self.data.flow_id = Some(flow_id); + + // Check if comment is unchanged for early exit optimization + let current_comment = &flow_info.get_inner_ref().comment; + let new_comment = self.data.comment.as_deref().unwrap_or(""); + if new_comment == current_comment.as_str() { + self.data.is_unchanged = true; + } + + self.data.flow_info = Some(flow_info); + + Ok(()) + } + + pub async fn on_update_metadata(&mut self) -> Result { + match self.data.object_type { + CommentObjectType::Table => { + self.update_table_comment().await?; + } + CommentObjectType::Column => { + self.update_column_comment().await?; + } + CommentObjectType::Flow => { + self.update_flow_comment().await?; + } + } + + self.data.state = CommentOnState::InvalidateCache; + Ok(Status::executing(true)) + } + + async fn update_table_comment(&mut self) -> Result<()> { + let table_info_value = self.data.table_info.as_ref().unwrap(); + let mut new_table_info = table_info_value.table_info.clone(); + + new_table_info.desc = self.data.comment.clone(); + + // Sync comment to table options + sync_table_comment_option( + &mut new_table_info.meta.options, + new_table_info.desc.as_deref(), + ); + + self.update_table_info(table_info_value, new_table_info) + .await?; + + info!( + "Updated comment for table {}.{}.{}", + self.data.catalog_name, self.data.schema_name, self.data.object_name + ); + + Ok(()) + } + + async fn update_column_comment(&mut self) -> Result<()> { + let table_info_value = self.data.table_info.as_ref().unwrap(); + let mut new_table_info = table_info_value.table_info.clone(); + + let column_name = self.data.column_name.as_ref().unwrap(); + let column_schema = new_table_info + .meta + .schema + .column_schemas + .iter_mut() + .find(|col| &col.name == column_name) + .unwrap(); // Safe: validated in prepare + + update_column_comment_metadata(column_schema, self.data.comment.clone()); + + self.update_table_info(table_info_value, new_table_info) + .await?; + + info!( + "Updated comment for column {}.{}.{}.{}", + self.data.catalog_name, self.data.schema_name, self.data.object_name, column_name + ); + + Ok(()) + } + + async fn update_flow_comment(&mut self) -> Result<()> { + let flow_id = self.data.flow_id.unwrap(); + let flow_info_value = self.data.flow_info.as_ref().unwrap(); + + let mut new_flow_info = flow_info_value.get_inner_ref().clone(); + new_flow_info.comment = self.data.comment.clone().unwrap_or_default(); + new_flow_info.updated_time = Utc::now(); + + let raw_value = new_flow_info.try_as_raw_value()?; + + self.context + .table_metadata_manager + .kv_backend() + .put( + PutRequest::new() + .with_key(FlowInfoKey::new(flow_id).to_bytes()) + .with_value(raw_value), + ) + .await?; + + info!( + "Updated comment for flow {}.{}", + self.data.catalog_name, self.data.object_name + ); + + Ok(()) + } + + async fn update_table_info( + &self, + current_table_info: &DeserializedValueWithBytes, + new_table_info: RawTableInfo, + ) -> Result<()> { + let table_id = current_table_info.table_info.ident.table_id; + let new_table_info_value = current_table_info.update(new_table_info); + let raw_value = new_table_info_value.try_as_raw_value()?; + + self.context + .table_metadata_manager + .kv_backend() + .put( + PutRequest::new() + .with_key(TableInfoKey::new(table_id).to_bytes()) + .with_value(raw_value), + ) + .await?; + + Ok(()) + } + + pub async fn on_invalidate_cache(&mut self) -> Result { + let cache_invalidator = &self.context.cache_invalidator; + + match 
self.data.object_type { + CommentObjectType::Table | CommentObjectType::Column => { + let table_id = self.data.table_id.unwrap(); + let table_name = TableName::new( + self.data.catalog_name.clone(), + self.data.schema_name.clone(), + self.data.object_name.clone(), + ); + + let cache_ident = vec![ + CacheIdent::TableId(table_id), + CacheIdent::TableName(table_name), + ]; + + cache_invalidator + .invalidate(&Context::default(), &cache_ident) + .await?; + } + CommentObjectType::Flow => { + let flow_id = self.data.flow_id.unwrap(); + let cache_ident = vec![CacheIdent::FlowId(flow_id)]; + + cache_invalidator + .invalidate(&Context::default(), &cache_ident) + .await?; + } + } + + Ok(Status::done()) + } +} + +#[async_trait] +impl Procedure for CommentOnProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + match self.data.state { + CommentOnState::Prepare => self.on_prepare().await, + CommentOnState::UpdateMetadata => self.on_update_metadata().await, + CommentOnState::InvalidateCache => self.on_invalidate_cache().await, + } + .map_err(map_to_procedure_error) + } + + fn dump(&self) -> ProcedureResult { + serde_json::to_string(&self.data).context(ToJsonSnafu) + } + + fn lock_key(&self) -> LockKey { + let catalog = &self.data.catalog_name; + let schema = &self.data.schema_name; + + let lock_key = match self.data.object_type { + CommentObjectType::Table | CommentObjectType::Column => { + vec![ + CatalogLock::Read(catalog).into(), + SchemaLock::read(catalog, schema).into(), + TableNameLock::new(catalog, schema, &self.data.object_name).into(), + ] + } + CommentObjectType::Flow => { + vec![ + CatalogLock::Read(catalog).into(), + FlowNameLock::new(catalog, &self.data.object_name).into(), + ] + } + }; + + LockKey::new(lock_key) + } +} + +#[derive(Debug, Serialize, Deserialize, AsRefStr)] +enum CommentOnState { + Prepare, + UpdateMetadata, + InvalidateCache, +} + +/// The data of comment on procedure. 
+#[derive(Debug, Serialize, Deserialize)] +pub struct CommentOnData { + state: CommentOnState, + catalog_name: String, + schema_name: String, + object_type: CommentObjectType, + object_name: String, + /// Column name (only for Column comments) + column_name: Option, + comment: Option, + /// Cached table ID (for Table/Column) + #[serde(skip_serializing_if = "Option::is_none")] + table_id: Option, + /// Cached table info (for Table/Column) + #[serde(skip)] + table_info: Option>, + /// Cached flow ID (for Flow) + #[serde(skip_serializing_if = "Option::is_none")] + flow_id: Option, + /// Cached flow info (for Flow) + #[serde(skip)] + flow_info: Option>, + /// Whether the comment is unchanged (optimization for early exit) + #[serde(skip)] + is_unchanged: bool, +} + +impl CommentOnData { + pub fn new(task: CommentOnTask) -> Self { + Self { + state: CommentOnState::Prepare, + catalog_name: task.catalog_name, + schema_name: task.schema_name, + object_type: task.object_type, + object_name: task.object_name, + column_name: task.column_name, + comment: task.comment, + table_id: None, + table_info: None, + flow_id: None, + flow_info: None, + is_unchanged: false, + } + } +} + +fn update_column_comment_metadata( + column_schema: &mut datatypes::schema::ColumnSchema, + comment: Option, +) { + match comment { + Some(value) => { + column_schema + .mut_metadata() + .insert(COLUMN_COMMENT_KEY.to_string(), value); + } + None => { + column_schema.mut_metadata().remove(COLUMN_COMMENT_KEY); + } + } +} + +fn sync_table_comment_option(options: &mut table::requests::TableOptions, comment: Option<&str>) { + match comment { + Some(value) => { + options + .extra_options + .insert(TABLE_COMMENT_KEY.to_string(), value.to_string()); + } + None => { + options.extra_options.remove(TABLE_COMMENT_KEY); + } + } +} diff --git a/src/common/meta/src/ddl/tests/alter_table.rs b/src/common/meta/src/ddl/tests/alter_table.rs index e16a85b403..a9ba4a0aa8 100644 --- a/src/common/meta/src/ddl/tests/alter_table.rs +++ b/src/common/meta/src/ddl/tests/alter_table.rs @@ -182,7 +182,7 @@ fn alter_request_handler(_peer: Peer, request: RegionRequest) -> Result: Send + Sync { + /// Configures the given [`DdlManager`] using the provided [`DdlManagerConfigureContext`]. + async fn configure( + &self, + ddl_manager: DdlManager, + ctx: C, + ) -> std::result::Result; +} + +pub type DdlManagerConfiguratorRef = Arc>; + pub type DdlManagerRef = Arc; pub type BoxedProcedureLoaderFactory = dyn Fn(DdlContext) -> BoxedProcedureLoader; @@ -148,11 +163,8 @@ impl DdlManager { } #[cfg(feature = "enterprise")] - pub fn with_trigger_ddl_manager( - mut self, - trigger_ddl_manager: Option, - ) -> Self { - self.trigger_ddl_manager = trigger_ddl_manager; + pub fn with_trigger_ddl_manager(mut self, trigger_ddl_manager: TriggerDdlManagerRef) -> Self { + self.trigger_ddl_manager = Some(trigger_ddl_manager); self } @@ -181,7 +193,8 @@ impl DdlManager { TruncateTableProcedure, CreateDatabaseProcedure, DropDatabaseProcedure, - DropViewProcedure + DropViewProcedure, + CommentOnProcedure ); for (type_name, loader_factory) in loaders { @@ -397,6 +410,19 @@ impl DdlManager { self.submit_procedure(procedure_with_id).await } + /// Submits and executes a comment on task. 
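+ ///
+ /// A rough usage sketch; the running `ddl_manager`, the target table, and the literal
+ /// values below are illustrative assumptions:
+ /// ```ignore
+ /// let task = CommentOnTask {
+ ///     catalog_name: "greptime".to_string(),
+ ///     schema_name: "public".to_string(),
+ ///     object_type: CommentObjectType::Table,
+ ///     object_name: "monitor".to_string(),
+ ///     column_name: None,
+ ///     object_id: None,
+ ///     comment: Some("CPU and memory metrics".to_string()),
+ /// };
+ /// let (procedure_id, _output) = ddl_manager.submit_comment_on_task(task).await?;
+ /// ```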
+ #[tracing::instrument(skip_all)] + pub async fn submit_comment_on_task( + &self, + comment_on_task: CommentOnTask, + ) -> Result<(ProcedureId, Option)> { + let context = self.create_context(); + let procedure = CommentOnProcedure::new(comment_on_task, context); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + + self.submit_procedure(procedure_with_id).await + } + async fn submit_procedure( &self, procedure_with_id: ProcedureWithId, @@ -465,6 +491,7 @@ impl DdlManager { handle_create_view_task(self, create_view_task).await } DropView(drop_view_task) => handle_drop_view_task(self, drop_view_task).await, + CommentOn(comment_on_task) => handle_comment_on_task(self, comment_on_task).await, #[cfg(feature = "enterprise")] CreateTrigger(create_trigger_task) => { handle_create_trigger_task( @@ -896,6 +923,26 @@ async fn handle_create_view_task( }) } +async fn handle_comment_on_task( + ddl_manager: &DdlManager, + comment_on_task: CommentOnTask, +) -> Result { + let (id, _) = ddl_manager + .submit_comment_on_task(comment_on_task.clone()) + .await?; + + let procedure_id = id.to_string(); + info!( + "Comment on {}.{}.{} is updated via procedure_id {id:?}", + comment_on_task.catalog_name, comment_on_task.schema_name, comment_on_task.object_name + ); + + Ok(SubmitDdlTaskResponse { + key: procedure_id.into(), + ..Default::default() + }) +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/src/common/meta/src/distributed_time_constants.rs b/src/common/meta/src/distributed_time_constants.rs index d18b377c28..688e7a424a 100644 --- a/src/common/meta/src/distributed_time_constants.rs +++ b/src/common/meta/src/distributed_time_constants.rs @@ -14,6 +14,8 @@ use std::time::Duration; +use etcd_client::ConnectOptions; + /// Heartbeat interval time (is the basic unit of various time). pub const HEARTBEAT_INTERVAL_MILLIS: u64 = 3000; @@ -41,6 +43,23 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30; /// In a lease, there are two opportunities for renewal. pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2; +/// The timeout of the heartbeat request. +pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1); + +/// The keep-alive interval of the heartbeat channel. +pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration = Duration::from_secs(15); + +/// The keep-alive timeout of the heartbeat channel. +pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration = Duration::from_secs(5); + +/// The default options for the etcd client. +pub fn default_etcd_client_options() -> ConnectOptions { + ConnectOptions::new() + .with_keep_alive_while_idle(true) + .with_keep_alive(Duration::from_secs(15), Duration::from_secs(5)) + .with_connect_timeout(Duration::from_secs(10)) +} + /// The default mailbox round-trip timeout. pub const MAILBOX_RTT_SECS: u64 = 1; diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs index 4bac6450a2..5a79f806a8 100644 --- a/src/common/meta/src/error.rs +++ b/src/common/meta/src/error.rs @@ -272,13 +272,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to send message: {err_msg}"))] - SendMessage { - err_msg: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to serde json"))] SerdeJson { #[snafu(source)] @@ -1118,7 +1111,7 @@ impl ErrorExt for Error { | DeserializeFlexbuffers { .. } | ConvertTimeRanges { .. } => StatusCode::Unexpected, - SendMessage { .. } | GetKvCache { .. } | CacheNotGet { .. 
} => StatusCode::Internal,
+            GetKvCache { .. } | CacheNotGet { .. } => StatusCode::Internal,

             SchemaAlreadyExists { .. } => StatusCode::DatabaseAlreadyExists,

diff --git a/src/common/meta/src/heartbeat/handler.rs b/src/common/meta/src/heartbeat/handler.rs
index afa71f0edf..ecc735083b 100644
--- a/src/common/meta/src/heartbeat/handler.rs
+++ b/src/common/meta/src/heartbeat/handler.rs
@@ -23,6 +23,7 @@ use crate::heartbeat::mailbox::{IncomingMessage, MailboxRef};

 pub mod invalidate_table_cache;
 pub mod parse_mailbox_message;
+pub mod suspend;

 #[cfg(test)]
 mod tests;
diff --git a/src/common/meta/src/heartbeat/handler/suspend.rs b/src/common/meta/src/heartbeat/handler/suspend.rs
new file mode 100644
index 0000000000..06cf4d06e4
--- /dev/null
+++ b/src/common/meta/src/heartbeat/handler/suspend.rs
@@ -0,0 +1,69 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use async_trait::async_trait;
+use common_telemetry::{info, warn};
+
+use crate::error::Result;
+use crate::heartbeat::handler::{
+    HandleControl, HeartbeatResponseHandler, HeartbeatResponseHandlerContext,
+};
+use crate::instruction::Instruction;
+
+/// A heartbeat response handler that handles the special "suspend" instruction.
+/// It simply sets, or clears (if previously set), the inner suspend atomic state.
+pub struct SuspendHandler {
+    suspend: Arc<AtomicBool>,
+}
+
+impl SuspendHandler {
+    pub fn new(suspend: Arc<AtomicBool>) -> Self {
+        Self { suspend }
+    }
+}
+
+#[async_trait]
+impl HeartbeatResponseHandler for SuspendHandler {
+    fn is_acceptable(&self, context: &HeartbeatResponseHandlerContext) -> bool {
+        matches!(
+            context.incoming_message,
+            Some((_, Instruction::Suspend)) | None
+        )
+    }
+
+    async fn handle(&self, context: &mut HeartbeatResponseHandlerContext) -> Result<HandleControl> {
+        let flip_state = |expect: bool| {
+            self.suspend
+                .compare_exchange(expect, !expect, Ordering::Relaxed, Ordering::Relaxed)
+                .is_ok()
+        };
+
+        if let Some((_, Instruction::Suspend)) = context.incoming_message.take() {
+            if flip_state(false) {
+                warn!("Suspend instruction received from meta, entering suspension state");
+            }
+        } else {
+            // Suspended components should always try to leave this state on their own; requiring
+            // an explicit "un-suspend" instruction to resume them would be error-prone. So if the
+            // "suspend" instruction is not present in the heartbeat, simply clear the state.
+ if flip_state(true) { + info!("clear suspend state"); + } + } + Ok(HandleControl::Continue) + } +} diff --git a/src/common/meta/src/heartbeat/mailbox.rs b/src/common/meta/src/heartbeat/mailbox.rs index 538a81b72c..5ee45436a0 100644 --- a/src/common/meta/src/heartbeat/mailbox.rs +++ b/src/common/meta/src/heartbeat/mailbox.rs @@ -15,8 +15,8 @@ use std::sync::Arc; use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::error::SendError; -use crate::error::{self, Result}; use crate::instruction::{Instruction, InstructionReply}; pub type IncomingMessage = (MessageMeta, Instruction); @@ -51,13 +51,8 @@ impl HeartbeatMailbox { Self { sender } } - pub async fn send(&self, message: OutgoingMessage) -> Result<()> { - self.sender.send(message).await.map_err(|e| { - error::SendMessageSnafu { - err_msg: e.to_string(), - } - .build() - }) + pub async fn send(&self, message: OutgoingMessage) -> Result<(), SendError> { + self.sender.send(message).await } } diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs index b9e34d9230..230c076673 100644 --- a/src/common/meta/src/instruction.rs +++ b/src/common/meta/src/instruction.rs @@ -339,6 +339,16 @@ pub struct FlushRegions { pub error_strategy: FlushErrorStrategy, } +impl Display for FlushRegions { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "FlushRegions(region_ids={:?}, strategy={:?}, error_strategy={:?})", + self.region_ids, self.strategy, self.error_strategy + ) + } +} + impl FlushRegions { /// Create synchronous single-region flush pub fn sync_single(region_id: RegionId) -> Self { @@ -420,20 +430,25 @@ where /// Instruction to get file references for specified regions. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct GetFileRefs { - /// List of region IDs to get file references for. - pub region_ids: Vec, + /// List of region IDs to get file references from active FileHandles (in-memory). + pub query_regions: Vec, + /// Mapping from the source region ID (where to read the manifest) to + /// the target region IDs (whose file references to look for). + /// Key: The region ID of the manifest. + /// Value: The list of region IDs to find references for in that manifest. + pub related_regions: HashMap>, } impl Display for GetFileRefs { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "GetFileRefs(region_ids={:?})", self.region_ids) + write!(f, "GetFileRefs(region_ids={:?})", self.query_regions) } } /// Instruction to trigger garbage collection for a region. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct GcRegions { - /// The region ID to perform GC on. + /// The region ID to perform GC on, only regions that are currently on the given datanode can be garbage collected, regions not on the datanode will report errors. pub regions: Vec, /// The file references manifest containing temporary file references. pub file_refs_manifest: FileRefsManifest, @@ -524,6 +539,8 @@ pub enum Instruction { GetFileRefs(GetFileRefs), /// Triggers garbage collection for a region. 
GcRegions(GcRegions), + /// Temporary suspend serving reads or writes + Suspend, } impl Instruction { diff --git a/src/common/meta/src/key/datanode_table.rs b/src/common/meta/src/key/datanode_table.rs index 68105a478a..8aca2fcaf7 100644 --- a/src/common/meta/src/key/datanode_table.rs +++ b/src/common/meta/src/key/datanode_table.rs @@ -164,6 +164,25 @@ impl DatanodeTableManager { .transpose() } + pub async fn batch_get( + &self, + keys: &[DatanodeTableKey], + ) -> Result> { + let req = BatchGetRequest::default().with_keys(keys.iter().map(|k| k.to_bytes()).collect()); + let resp = self.kv_backend.batch_get(req).await?; + let values = resp + .kvs + .into_iter() + .map(|kv| { + Ok(( + DatanodeTableKey::from_bytes(&kv.key)?, + DatanodeTableValue::try_from_raw_value(&kv.value)?, + )) + }) + .collect::>>()?; + Ok(values) + } + pub fn tables( &self, datanode_id: DatanodeId, diff --git a/src/common/meta/src/key/table_info.rs b/src/common/meta/src/key/table_info.rs index c93961f643..637da1a32d 100644 --- a/src/common/meta/src/key/table_info.rs +++ b/src/common/meta/src/key/table_info.rs @@ -94,7 +94,7 @@ impl TableInfoValue { } } - pub(crate) fn update(&self, new_table_info: RawTableInfo) -> Self { + pub fn update(&self, new_table_info: RawTableInfo) -> Self { Self { table_info: new_table_info, version: self.version + 1, diff --git a/src/common/meta/src/key/table_route.rs b/src/common/meta/src/key/table_route.rs index 5f6782f002..fe1f11bf15 100644 --- a/src/common/meta/src/key/table_route.rs +++ b/src/common/meta/src/key/table_route.rs @@ -661,13 +661,32 @@ impl TableRouteStorage { /// Returns batch of [`TableRouteValue`] that respects the order of `table_ids`. pub async fn batch_get(&self, table_ids: &[TableId]) -> Result>> { - let mut table_routes = self.batch_get_inner(table_ids).await?; - self.remap_routes_addresses(&mut table_routes).await?; + let raw_table_routes = self.batch_get_inner(table_ids).await?; - Ok(table_routes) + Ok(raw_table_routes + .into_iter() + .map(|v| v.map(|x| x.inner)) + .collect()) } - async fn batch_get_inner(&self, table_ids: &[TableId]) -> Result>> { + /// Returns batch of [`TableRouteValue`] wrapped with [`DeserializedValueWithBytes`]. + /// + /// The return value is a vector of [`Option>`]. + /// Note: This method remaps the addresses of the table routes, but does not update their raw byte representations. 
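+ ///
+ /// A small usage sketch (the storage handle and table ids are assumed):
+ /// ```ignore
+ /// let routes = table_route_storage
+ ///     .batch_get_with_raw_bytes(&[table_id_a, table_id_b])
+ ///     .await?;
+ /// for route in routes.into_iter().flatten() {
+ ///     // `route.inner` carries the remapped routes; the wrapper still holds the original bytes.
+ ///     let _ = &route.inner;
+ /// }
+ /// ```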
+ pub async fn batch_get_with_raw_bytes( + &self, + table_ids: &[TableId], + ) -> Result>>> { + let mut raw_table_routes = self.batch_get_inner(table_ids).await?; + self.remap_routes_addresses(&mut raw_table_routes).await?; + + Ok(raw_table_routes) + } + + async fn batch_get_inner( + &self, + table_ids: &[TableId], + ) -> Result>>> { let keys = table_ids .iter() .map(|id| TableRouteKey::new(*id).to_bytes()) @@ -685,7 +704,7 @@ impl TableRouteStorage { keys.into_iter() .map(|key| { if let Some(value) = kvs.get(&key) { - Ok(Some(TableRouteValue::try_from_raw_value(value)?)) + Ok(Some(DeserializedValueWithBytes::from_inner_slice(value)?)) } else { Ok(None) } @@ -695,14 +714,14 @@ impl TableRouteStorage { async fn remap_routes_addresses( &self, - table_routes: &mut [Option], + table_routes: &mut [Option>], ) -> Result<()> { let keys = table_routes .iter() .flat_map(|table_route| { table_route .as_ref() - .map(extract_address_keys) + .map(|x| extract_address_keys(&x.inner)) .unwrap_or_default() }) .collect::>() diff --git a/src/common/meta/src/key/topic_region.rs b/src/common/meta/src/key/topic_region.rs index 844a46735f..c34229cf9e 100644 --- a/src/common/meta/src/key/topic_region.rs +++ b/src/common/meta/src/key/topic_region.rs @@ -33,7 +33,7 @@ use crate::rpc::store::{ // The TopicRegionKey is a key for the topic-region mapping in the kvbackend. // The layout of the key is `__topic_region/{topic_name}/{region_id}`. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TopicRegionKey<'a> { pub region_id: RegionId, pub topic: &'a str, diff --git a/src/common/meta/src/kv_backend.rs b/src/common/meta/src/kv_backend.rs index cdd7102e11..7f747508d4 100644 --- a/src/common/meta/src/kv_backend.rs +++ b/src/common/meta/src/kv_backend.rs @@ -34,6 +34,8 @@ pub mod memory; #[cfg(any(feature = "mysql_kvbackend", feature = "pg_kvbackend"))] pub mod rds; pub mod test; +#[cfg(any(test, feature = "testing"))] +pub mod test_util; pub mod txn; pub mod util; pub type KvBackendRef = Arc + Send + Sync>; diff --git a/src/common/meta/src/kv_backend/test_util.rs b/src/common/meta/src/kv_backend/test_util.rs new file mode 100644 index 0000000000..ce502c3332 --- /dev/null +++ b/src/common/meta/src/kv_backend/test_util.rs @@ -0,0 +1,125 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; +use std::sync::Arc; + +use derive_builder::Builder; + +use crate::error::Result; +use crate::kv_backend::txn::{Txn, TxnResponse}; +use crate::kv_backend::{ + BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest, + BatchPutResponse, DeleteRangeRequest, DeleteRangeResponse, KvBackend, PutRequest, PutResponse, + RangeRequest, RangeResponse, TxnService, +}; + +pub type MockFn = Arc Result + Send + Sync>; + +/// A mock kv backend for testing. 
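+///
+/// A minimal construction sketch; only `range` is mocked here and the response value is a
+/// placeholder:
+/// ```ignore
+/// let backend = MockKvBackendBuilder::default()
+///     .range_fn(Arc::new(|_req: RangeRequest| Ok(RangeResponse::default())))
+///     .build()
+///     .unwrap();
+/// // Any method without a mock closure falls through to `unimplemented!()`.
+/// ```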
+#[derive(Builder)] +pub struct MockKvBackend { + #[builder(setter(strip_option), default)] + pub range_fn: Option>, + #[builder(setter(strip_option), default)] + pub put_fn: Option>, + #[builder(setter(strip_option), default)] + pub batch_put_fn: Option>, + #[builder(setter(strip_option), default)] + pub batch_get_fn: Option>, + #[builder(setter(strip_option), default)] + pub delete_range_fn: Option>, + #[builder(setter(strip_option), default)] + pub batch_delete_fn: Option>, + #[builder(setter(strip_option), default)] + pub txn: Option>, + #[builder(setter(strip_option), default)] + pub max_txn_ops: Option, +} + +#[async_trait::async_trait] +impl TxnService for MockKvBackend { + type Error = crate::error::Error; + + async fn txn(&self, txn: Txn) -> Result { + if let Some(f) = &self.txn { + f(txn) + } else { + unimplemented!() + } + } + + fn max_txn_ops(&self) -> usize { + self.max_txn_ops.unwrap() + } +} + +#[async_trait::async_trait] +impl KvBackend for MockKvBackend { + fn name(&self) -> &str { + "mock_kv_backend" + } + + fn as_any(&self) -> &dyn Any { + self + } + + async fn range(&self, req: RangeRequest) -> Result { + if let Some(f) = &self.range_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn put(&self, req: PutRequest) -> Result { + if let Some(f) = &self.put_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn batch_put(&self, req: BatchPutRequest) -> Result { + if let Some(f) = &self.batch_put_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn batch_get(&self, req: BatchGetRequest) -> Result { + if let Some(f) = &self.batch_get_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn delete_range(&self, req: DeleteRangeRequest) -> Result { + if let Some(f) = &self.delete_range_fn { + f(req) + } else { + unimplemented!() + } + } + + async fn batch_delete(&self, req: BatchDeleteRequest) -> Result { + if let Some(f) = &self.batch_delete_fn { + f(req) + } else { + unimplemented!() + } + } +} diff --git a/src/common/meta/src/region_registry.rs b/src/common/meta/src/region_registry.rs index 1f672d563d..f1741b281b 100644 --- a/src/common/meta/src/region_registry.rs +++ b/src/common/meta/src/region_registry.rs @@ -67,6 +67,7 @@ impl LeaderRegionManifestInfo { RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt: _, } => LeaderRegionManifestInfo::Mito { manifest_version, flushed_entry_id, diff --git a/src/common/meta/src/rpc/ddl.rs b/src/common/meta/src/rpc/ddl.rs index b9a871775f..ba36909c7c 100644 --- a/src/common/meta/src/rpc/ddl.rs +++ b/src/common/meta/src/rpc/ddl.rs @@ -23,19 +23,20 @@ use api::v1::alter_database_expr::Kind as PbAlterDatabaseKind; use api::v1::meta::ddl_task_request::Task; use api::v1::meta::{ AlterDatabaseTask as PbAlterDatabaseTask, AlterTableTask as PbAlterTableTask, - AlterTableTasks as PbAlterTableTasks, CreateDatabaseTask as PbCreateDatabaseTask, - CreateFlowTask as PbCreateFlowTask, CreateTableTask as PbCreateTableTask, - CreateTableTasks as PbCreateTableTasks, CreateViewTask as PbCreateViewTask, - DdlTaskRequest as PbDdlTaskRequest, DdlTaskResponse as PbDdlTaskResponse, - DropDatabaseTask as PbDropDatabaseTask, DropFlowTask as PbDropFlowTask, - DropTableTask as PbDropTableTask, DropTableTasks as PbDropTableTasks, - DropViewTask as PbDropViewTask, Partition, ProcedureId, + AlterTableTasks as PbAlterTableTasks, CommentOnTask as PbCommentOnTask, + CreateDatabaseTask as PbCreateDatabaseTask, CreateFlowTask as PbCreateFlowTask, + CreateTableTask as PbCreateTableTask, CreateTableTasks 
as PbCreateTableTasks, + CreateViewTask as PbCreateViewTask, DdlTaskRequest as PbDdlTaskRequest, + DdlTaskResponse as PbDdlTaskResponse, DropDatabaseTask as PbDropDatabaseTask, + DropFlowTask as PbDropFlowTask, DropTableTask as PbDropTableTask, + DropTableTasks as PbDropTableTasks, DropViewTask as PbDropViewTask, Partition, ProcedureId, TruncateTableTask as PbTruncateTableTask, }; use api::v1::{ - AlterDatabaseExpr, AlterTableExpr, CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr, - CreateViewExpr, DropDatabaseExpr, DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval, - ExpireAfter, Option as PbOption, QueryContext as PbQueryContext, TruncateTableExpr, + AlterDatabaseExpr, AlterTableExpr, CommentObjectType as PbCommentObjectType, CommentOnExpr, + CreateDatabaseExpr, CreateFlowExpr, CreateTableExpr, CreateViewExpr, DropDatabaseExpr, + DropFlowExpr, DropTableExpr, DropViewExpr, EvalInterval, ExpireAfter, Option as PbOption, + QueryContext as PbQueryContext, TruncateTableExpr, }; use base64::Engine as _; use base64::engine::general_purpose; @@ -47,6 +48,7 @@ use serde_with::{DefaultOnNull, serde_as}; use session::context::{QueryContextBuilder, QueryContextRef}; use snafu::{OptionExt, ResultExt}; use table::metadata::{RawTableInfo, TableId}; +use table::requests::validate_database_option; use table::table_name::TableName; use table::table_reference::TableReference; @@ -77,6 +79,7 @@ pub enum DdlTask { DropView(DropViewTask), #[cfg(feature = "enterprise")] CreateTrigger(trigger::CreateTriggerTask), + CommentOn(CommentOnTask), } impl DdlTask { @@ -199,6 +202,11 @@ impl DdlTask { view_info, }) } + + /// Creates a [`DdlTask`] to comment on a table, column, or flow. + pub fn new_comment_on(task: CommentOnTask) -> Self { + DdlTask::CommentOn(task) + } } impl TryFrom for DdlTask { @@ -277,6 +285,7 @@ impl TryFrom for DdlTask { .fail() } } + Task::CommentOnTask(comment_on) => Ok(DdlTask::CommentOn(comment_on.try_into()?)), } } } @@ -331,6 +340,7 @@ impl TryFrom for PbDdlTaskRequest { DdlTask::CreateTrigger(task) => Task::CreateTriggerTask(task.try_into()?), #[cfg(feature = "enterprise")] DdlTask::DropTrigger(task) => Task::DropTriggerTask(task.into()), + DdlTask::CommentOn(task) => Task::CommentOnTask(task.into()), }; Ok(Self { @@ -1059,14 +1069,21 @@ impl TryFrom for SetDatabaseOption { type Error = error::Error; fn try_from(PbOption { key, value }: PbOption) -> Result { - match key.to_ascii_lowercase().as_str() { + let key_lower = key.to_ascii_lowercase(); + match key_lower.as_str() { TTL_KEY => { let ttl = DatabaseTimeToLive::from_humantime_or_str(&value) .map_err(|_| InvalidSetDatabaseOptionSnafu { key, value }.build())?; Ok(SetDatabaseOption::Ttl(ttl)) } - _ => InvalidSetDatabaseOptionSnafu { key, value }.fail(), + _ => { + if validate_database_option(&key_lower) { + Ok(SetDatabaseOption::Other(key_lower, value)) + } else { + InvalidSetDatabaseOptionSnafu { key, value }.fail() + } + } } } } @@ -1074,20 +1091,29 @@ impl TryFrom for SetDatabaseOption { #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub enum SetDatabaseOption { Ttl(DatabaseTimeToLive), + Other(String, String), } #[derive(Debug, PartialEq, Clone, Serialize, Deserialize)] pub enum UnsetDatabaseOption { Ttl, + Other(String), } impl TryFrom<&str> for UnsetDatabaseOption { type Error = error::Error; fn try_from(key: &str) -> Result { - match key.to_ascii_lowercase().as_str() { + let key_lower = key.to_ascii_lowercase(); + match key_lower.as_str() { TTL_KEY => Ok(UnsetDatabaseOption::Ttl), - _ => 
InvalidUnsetDatabaseOptionSnafu { key }.fail(), + _ => { + if validate_database_option(&key_lower) { + Ok(UnsetDatabaseOption::Other(key_lower)) + } else { + InvalidUnsetDatabaseOptionSnafu { key }.fail() + } + } } } } @@ -1260,6 +1286,119 @@ impl From for PbDropFlowTask { } } +/// Represents the ID of the object being commented on (Table or Flow). +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum CommentObjectId { + Table(TableId), + Flow(FlowId), +} + +/// Comment on table, column, or flow +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct CommentOnTask { + pub catalog_name: String, + pub schema_name: String, + pub object_type: CommentObjectType, + pub object_name: String, + /// Column name (only for Column comments) + pub column_name: Option, + /// Object ID (Table or Flow) for validation and cache invalidation + pub object_id: Option, + pub comment: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum CommentObjectType { + Table, + Column, + Flow, +} + +impl CommentOnTask { + pub fn table_ref(&self) -> TableReference<'_> { + TableReference { + catalog: &self.catalog_name, + schema: &self.schema_name, + table: &self.object_name, + } + } +} + +// Proto conversions for CommentObjectType +impl From for PbCommentObjectType { + fn from(object_type: CommentObjectType) -> Self { + match object_type { + CommentObjectType::Table => PbCommentObjectType::Table, + CommentObjectType::Column => PbCommentObjectType::Column, + CommentObjectType::Flow => PbCommentObjectType::Flow, + } + } +} + +impl TryFrom for CommentObjectType { + type Error = error::Error; + + fn try_from(value: i32) -> Result { + match value { + 0 => Ok(CommentObjectType::Table), + 1 => Ok(CommentObjectType::Column), + 2 => Ok(CommentObjectType::Flow), + _ => error::InvalidProtoMsgSnafu { + err_msg: format!( + "Invalid CommentObjectType value: {}. 
Valid values are: 0 (Table), 1 (Column), 2 (Flow)", + value + ), + } + .fail(), + } + } +} + +// Proto conversions for CommentOnTask +impl TryFrom for CommentOnTask { + type Error = error::Error; + + fn try_from(pb: PbCommentOnTask) -> Result { + let comment_on = pb.comment_on.context(error::InvalidProtoMsgSnafu { + err_msg: "expected comment_on", + })?; + + Ok(CommentOnTask { + catalog_name: comment_on.catalog_name, + schema_name: comment_on.schema_name, + object_type: comment_on.object_type.try_into()?, + object_name: comment_on.object_name, + column_name: if comment_on.column_name.is_empty() { + None + } else { + Some(comment_on.column_name) + }, + comment: if comment_on.comment.is_empty() { + None + } else { + Some(comment_on.comment) + }, + object_id: None, + }) + } +} + +impl From for PbCommentOnTask { + fn from(task: CommentOnTask) -> Self { + let pb_object_type: PbCommentObjectType = task.object_type.into(); + PbCommentOnTask { + comment_on: Some(CommentOnExpr { + catalog_name: task.catalog_name, + schema_name: task.schema_name, + object_type: pb_object_type as i32, + object_name: task.object_name, + column_name: task.column_name.unwrap_or_default(), + comment: task.comment.unwrap_or_default(), + }), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct QueryContext { pub(crate) current_catalog: String, diff --git a/src/common/meta/src/wal_options_allocator/topic_creator.rs b/src/common/meta/src/wal_options_allocator/topic_creator.rs index 8ba5b9ec59..0c3caf215a 100644 --- a/src/common/meta/src/wal_options_allocator/topic_creator.rs +++ b/src/common/meta/src/wal_options_allocator/topic_creator.rs @@ -14,7 +14,7 @@ use common_telemetry::{debug, error, info}; use common_wal::config::kafka::common::{ - DEFAULT_BACKOFF_CONFIG, KafkaConnectionConfig, KafkaTopicConfig, + DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT, KafkaConnectionConfig, KafkaTopicConfig, }; use rskafka::client::error::Error as RsKafkaError; use rskafka::client::error::ProtocolError::TopicAlreadyExists; @@ -205,11 +205,13 @@ impl KafkaTopicCreator { self.partition_client(topic).await.unwrap() } } + /// Builds a kafka [Client](rskafka::client::Client). pub async fn build_kafka_client(connection: &KafkaConnectionConfig) -> Result { // Builds an kafka controller client for creating topics. let mut builder = ClientBuilder::new(connection.broker_endpoints.clone()) - .backoff_config(DEFAULT_BACKOFF_CONFIG); + .backoff_config(DEFAULT_BACKOFF_CONFIG) + .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT)); if let Some(sasl) = &connection.sasl { builder = builder.sasl_config(sasl.config.clone().into_sasl_config()); }; diff --git a/src/common/procedure/src/event.rs b/src/common/procedure/src/event.rs index bc76de7842..d659236369 100644 --- a/src/common/procedure/src/event.rs +++ b/src/common/procedure/src/event.rs @@ -92,25 +92,96 @@ impl Event for ProcedureEvent { schema } - fn extra_row(&self) -> Result { - let error_str = match &self.state { - ProcedureState::Failed { error } => format!("{:?}", error), - ProcedureState::PrepareRollback { error } => format!("{:?}", error), - ProcedureState::RollingBack { error } => format!("{:?}", error), - ProcedureState::Retrying { error } => format!("{:?}", error), - ProcedureState::Poisoned { error, .. 
} => format!("{:?}", error), - _ => "".to_string(), - }; - let mut row = vec![ - ValueData::StringValue(self.procedure_id.to_string()).into(), - ValueData::StringValue(self.state.as_str_name().to_string()).into(), - ValueData::StringValue(error_str).into(), - ]; - row.append(&mut self.internal_event.extra_row()?.values); - Ok(Row { values: row }) + fn extra_rows(&self) -> Result> { + let mut internal_event_extra_rows = self.internal_event.extra_rows()?; + let mut rows = Vec::with_capacity(internal_event_extra_rows.len()); + for internal_event_extra_row in internal_event_extra_rows.iter_mut() { + let error_str = match &self.state { + ProcedureState::Failed { error } => format!("{:?}", error), + ProcedureState::PrepareRollback { error } => format!("{:?}", error), + ProcedureState::RollingBack { error } => format!("{:?}", error), + ProcedureState::Retrying { error } => format!("{:?}", error), + ProcedureState::Poisoned { error, .. } => format!("{:?}", error), + _ => "".to_string(), + }; + let mut values = Vec::with_capacity(3 + internal_event_extra_row.values.len()); + values.extend([ + ValueData::StringValue(self.procedure_id.to_string()).into(), + ValueData::StringValue(self.state.as_str_name().to_string()).into(), + ValueData::StringValue(error_str).into(), + ]); + values.append(&mut internal_event_extra_row.values); + rows.push(Row { values }); + } + + Ok(rows) } fn as_any(&self) -> &dyn Any { self } } + +#[cfg(test)] +mod tests { + use api::v1::value::ValueData; + use api::v1::{ColumnDataType, ColumnSchema, Row, SemanticType}; + use common_event_recorder::Event; + + use crate::{ProcedureEvent, ProcedureId, ProcedureState}; + + #[derive(Debug)] + struct TestEvent; + + impl Event for TestEvent { + fn event_type(&self) -> &str { + "test_event" + } + + fn extra_schema(&self) -> Vec { + vec![ColumnSchema { + column_name: "test_event_column".to_string(), + datatype: ColumnDataType::String.into(), + semantic_type: SemanticType::Field.into(), + ..Default::default() + }] + } + + fn extra_rows(&self) -> common_event_recorder::error::Result> { + Ok(vec![ + Row { + values: vec![ValueData::StringValue("test_event1".to_string()).into()], + }, + Row { + values: vec![ValueData::StringValue("test_event2".to_string()).into()], + }, + ]) + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + } + + #[test] + fn test_procedure_event_extra_rows() { + let procedure_event = ProcedureEvent::new( + ProcedureId::random(), + Box::new(TestEvent {}), + ProcedureState::Running, + ); + + let procedure_event_extra_rows = procedure_event.extra_rows().unwrap(); + assert_eq!(procedure_event_extra_rows.len(), 2); + assert_eq!(procedure_event_extra_rows[0].values.len(), 4); + assert_eq!( + procedure_event_extra_rows[0].values[3], + ValueData::StringValue("test_event1".to_string()).into() + ); + assert_eq!(procedure_event_extra_rows[1].values.len(), 4); + assert_eq!( + procedure_event_extra_rows[1].values[3], + ValueData::StringValue("test_event2".to_string()).into() + ); + } +} diff --git a/src/common/procedure/src/local/runner.rs b/src/common/procedure/src/local/runner.rs index 9135bee9b4..f1c70c78c3 100644 --- a/src/common/procedure/src/local/runner.rs +++ b/src/common/procedure/src/local/runner.rs @@ -331,8 +331,29 @@ impl Runner { } match status { - Status::Executing { .. } => {} + Status::Executing { .. 
} => { + let prev_state = self.meta.state(); + if !matches!(prev_state, ProcedureState::Running) { + info!( + "Set Procedure {}-{} state to running, prev_state: {:?}", + self.procedure.type_name(), + self.meta.id, + prev_state + ); + self.meta.set_state(ProcedureState::Running); + } + } Status::Suspended { subprocedures, .. } => { + let prev_state = self.meta.state(); + if !matches!(prev_state, ProcedureState::Running) { + info!( + "Set Procedure {}-{} state to running, prev_state: {:?}", + self.procedure.type_name(), + self.meta.id, + prev_state + ); + self.meta.set_state(ProcedureState::Running); + } self.on_suspended(subprocedures).await; } Status::Done { output } => { @@ -393,8 +414,12 @@ impl Runner { return; } - self.meta - .set_state(ProcedureState::prepare_rollback(Arc::new(e))); + if self.procedure.rollback_supported() { + self.meta + .set_state(ProcedureState::prepare_rollback(Arc::new(e))); + } else { + self.meta.set_state(ProcedureState::failed(Arc::new(e))); + } } } } @@ -1080,20 +1105,10 @@ mod tests { let mut runner = new_runner(meta.clone(), Box::new(fail), procedure_store.clone()); runner.manager_ctx.start(); - runner.execute_once(&ctx).await; - let state = runner.meta.state(); - assert!(state.is_prepare_rollback(), "{state:?}"); - runner.execute_once(&ctx).await; let state = runner.meta.state(); assert!(state.is_failed(), "{state:?}"); - check_files( - &object_store, - &procedure_store, - ctx.procedure_id, - &["0000000000.rollback"], - ) - .await; + check_files(&object_store, &procedure_store, ctx.procedure_id, &[]).await; } #[tokio::test] @@ -1146,6 +1161,8 @@ mod tests { async move { if times == 1 { Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) + } else if times == 2 { + Ok(Status::executing(false)) } else { Ok(Status::done()) } @@ -1172,6 +1189,10 @@ mod tests { let state = runner.meta.state(); assert!(state.is_retrying(), "{state:?}"); + runner.execute_once(&ctx).await; + let state = runner.meta.state(); + assert!(state.is_running(), "{state:?}"); + runner.execute_once(&ctx).await; let state = runner.meta.state(); assert!(state.is_done(), "{state:?}"); @@ -1185,6 +1206,86 @@ mod tests { .await; } + #[tokio::test(flavor = "multi_thread")] + async fn test_execute_on_retry_later_error_with_child() { + common_telemetry::init_default_ut_logging(); + let mut times = 0; + let child_id = ProcedureId::random(); + + let exec_fn = move |_| { + times += 1; + async move { + debug!("times: {}", times); + if times == 1 { + Err(Error::retry_later(MockError::new(StatusCode::Unexpected))) + } else if times == 2 { + let exec_fn = |_| { + async { Err(Error::external(MockError::new(StatusCode::Unexpected))) } + .boxed() + }; + let fail = ProcedureAdapter { + data: "fail".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"), + poison_keys: PoisonKeys::default(), + exec_fn, + rollback_fn: None, + }; + + Ok(Status::Suspended { + subprocedures: vec![ProcedureWithId { + id: child_id, + procedure: Box::new(fail), + }], + persist: true, + }) + } else { + Ok(Status::done()) + } + } + .boxed() + }; + + let retry_later = ProcedureAdapter { + data: "retry_later".to_string(), + lock_key: LockKey::single_exclusive("catalog.schema.table"), + poison_keys: PoisonKeys::default(), + exec_fn, + rollback_fn: None, + }; + + let dir = create_temp_dir("retry_later"); + let meta = retry_later.new_meta(ROOT_ID); + let ctx = context_without_provider(meta.id); + let object_store = test_util::new_object_store(&dir); + let procedure_store = 
Arc::new(ProcedureStore::from_object_store(object_store.clone())); + let mut runner = new_runner(meta.clone(), Box::new(retry_later), procedure_store.clone()); + runner.manager_ctx.start(); + debug!("execute_once 1"); + runner.execute_once(&ctx).await; + let state = runner.meta.state(); + assert!(state.is_retrying(), "{state:?}"); + + let moved_meta = meta.clone(); + tokio::spawn(async move { + moved_meta.child_notify.notify_one(); + }); + runner.execute_once(&ctx).await; + let state = runner.meta.state(); + assert!(state.is_running(), "{state:?}"); + + runner.execute_once(&ctx).await; + let state = runner.meta.state(); + assert!(state.is_done(), "{state:?}"); + assert!(meta.state().is_done()); + check_files( + &object_store, + &procedure_store, + ctx.procedure_id, + &["0000000000.step", "0000000001.commit"], + ) + .await; + } + #[tokio::test] async fn test_execute_exceed_max_retry_later() { let exec_fn = @@ -1304,7 +1405,7 @@ mod tests { async fn test_child_error() { let mut times = 0; let child_id = ProcedureId::random(); - + common_telemetry::init_default_ut_logging(); let exec_fn = move |ctx: Context| { times += 1; async move { @@ -1529,7 +1630,7 @@ mod tests { runner.execute_once(&ctx).await; let state = runner.meta.state(); - assert!(state.is_prepare_rollback(), "{state:?}"); + assert!(state.is_failed(), "{state:?}"); let procedure_id = runner .manager_ctx @@ -1596,11 +1697,6 @@ mod tests { let state = runner.meta.state(); assert!(state.is_running(), "{state:?}"); - runner.execute_once(&ctx).await; - let state = runner.meta.state(); - assert!(state.is_prepare_rollback(), "{state:?}"); - assert!(meta.state().is_prepare_rollback()); - runner.execute_once(&ctx).await; let state = runner.meta.state(); assert!(state.is_failed(), "{state:?}"); diff --git a/src/common/query/src/error.rs b/src/common/query/src/error.rs index 618795bb4a..e70b9f4833 100644 --- a/src/common/query/src/error.rs +++ b/src/common/query/src/error.rs @@ -52,9 +52,6 @@ pub enum Error { data_type: ArrowDatatype, }, - #[snafu(display("Failed to downcast vector: {}", err_msg))] - DowncastVector { err_msg: String }, - #[snafu(display("Invalid input type: {}", err_msg))] InvalidInputType { #[snafu(implicit)] @@ -209,8 +206,7 @@ pub type Result = std::result::Result; impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - Error::DowncastVector { .. } - | Error::InvalidInputState { .. } + Error::InvalidInputState { .. } | Error::ToScalarValue { .. } | Error::GetScalarVector { .. } | Error::ArrowCompute { .. } diff --git a/src/common/query/src/lib.rs b/src/common/query/src/lib.rs index 77ecec8eed..91a417d356 100644 --- a/src/common/query/src/lib.rs +++ b/src/common/query/src/lib.rs @@ -46,6 +46,22 @@ pub enum OutputData { Stream(SendableRecordBatchStream), } +impl OutputData { + /// Consume the data to pretty printed string. 
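+ ///
+ /// A small usage sketch (an `OutputData` value in an async context is assumed):
+ /// ```ignore
+ /// let text = output_data.pretty_print().await;
+ /// println!("{text}");
+ /// ```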
+ pub async fn pretty_print(self) -> String { + match self { + OutputData::AffectedRows(x) => { + format!("Affected Rows: {x}") + } + OutputData::RecordBatches(x) => x.pretty_print().unwrap_or_else(|e| e.to_string()), + OutputData::Stream(x) => common_recordbatch::util::collect_batches(x) + .await + .and_then(|x| x.pretty_print()) + .unwrap_or_else(|e| e.to_string()), + } + } +} + /// OutputMeta stores meta information produced/generated during the execution #[derive(Debug, Default)] pub struct OutputMeta { diff --git a/src/common/recordbatch/src/adapter.rs b/src/common/recordbatch/src/adapter.rs index fdec79fdef..7e504559b6 100644 --- a/src/common/recordbatch/src/adapter.rs +++ b/src/common/recordbatch/src/adapter.rs @@ -314,10 +314,10 @@ impl Stream for RecordBatchStreamAdapter { metric_collector.record_batch_metrics, ); } - Poll::Ready(Some(RecordBatch::try_from_df_record_batch( + Poll::Ready(Some(Ok(RecordBatch::from_df_record_batch( self.schema(), df_record_batch, - ))) + )))) } Poll::Ready(None) => { if let Metrics::Unresolved(df_plan) | Metrics::PartialResolved(df_plan, _) = diff --git a/src/common/recordbatch/src/error.rs b/src/common/recordbatch/src/error.rs index 2584b41b25..6d794463a0 100644 --- a/src/common/recordbatch/src/error.rs +++ b/src/common/recordbatch/src/error.rs @@ -133,18 +133,6 @@ pub enum Error { source: datatypes::error::Error, }, - #[snafu(display( - "Failed to downcast vector of type '{:?}' to type '{:?}'", - from_type, - to_type - ))] - DowncastVector { - from_type: ConcreteDataType, - to_type: ConcreteDataType, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Error occurs when performing arrow computation"))] ArrowCompute { #[snafu(source)] @@ -200,6 +188,13 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Failed to align JSON array, reason: {reason}"))] + AlignJsonArray { + reason: String, + #[snafu(implicit)] + location: Location, + }, } impl ErrorExt for Error { @@ -215,9 +210,8 @@ impl ErrorExt for Error { | Error::ToArrowScalar { .. } | Error::ProjectArrowRecordBatch { .. } | Error::PhysicalExpr { .. } - | Error::RecordBatchSliceIndexOverflow { .. } => StatusCode::Internal, - - Error::DowncastVector { .. } => StatusCode::Unexpected, + | Error::RecordBatchSliceIndexOverflow { .. } + | Error::AlignJsonArray { .. } => StatusCode::Internal, Error::PollStream { .. 
} => StatusCode::EngineExecuteQuery, diff --git a/src/common/recordbatch/src/lib.rs b/src/common/recordbatch/src/lib.rs index 1f5f28e87f..c1253cfa1c 100644 --- a/src/common/recordbatch/src/lib.rs +++ b/src/common/recordbatch/src/lib.rs @@ -18,7 +18,7 @@ pub mod adapter; pub mod cursor; pub mod error; pub mod filter; -mod recordbatch; +pub mod recordbatch; pub mod util; use std::fmt; @@ -30,19 +30,20 @@ use adapter::RecordBatchMetrics; use arc_swap::ArcSwapOption; use common_base::readable_size::ReadableSize; pub use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream; +use datatypes::arrow::array::{ArrayRef, AsArray, StringBuilder}; use datatypes::arrow::compute::SortOptions; pub use datatypes::arrow::record_batch::RecordBatch as DfRecordBatch; use datatypes::arrow::util::pretty; use datatypes::prelude::{ConcreteDataType, VectorRef}; -use datatypes::scalars::{ScalarVector, ScalarVectorBuilder}; use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; use datatypes::types::{JsonFormat, jsonb_to_string}; -use datatypes::vectors::{BinaryVector, StringVectorBuilder}; use error::Result; use futures::task::{Context, Poll}; use futures::{Stream, TryStreamExt}; pub use recordbatch::RecordBatch; -use snafu::{OptionExt, ResultExt, ensure}; +use snafu::{ResultExt, ensure}; + +use crate::error::NewDfRecordBatchSnafu; pub trait RecordBatchStream: Stream> { fn name(&self) -> &str { @@ -92,20 +93,14 @@ pub fn map_json_type_to_string( mapped_schema: &SchemaRef, ) -> Result { let mut vectors = Vec::with_capacity(original_schema.column_schemas().len()); - for (vector, schema) in batch.columns.iter().zip(original_schema.column_schemas()) { + for (vector, schema) in batch.columns().iter().zip(original_schema.column_schemas()) { if let ConcreteDataType::Json(j) = &schema.data_type { if matches!(&j.format, JsonFormat::Jsonb) { - let mut string_vector_builder = StringVectorBuilder::with_capacity(vector.len()); - let binary_vector = vector - .as_any() - .downcast_ref::() - .with_context(|| error::DowncastVectorSnafu { - from_type: schema.data_type.clone(), - to_type: ConcreteDataType::binary_datatype(), - })?; - for value in binary_vector.iter_data() { + let mut string_vector_builder = StringBuilder::new(); + let binary_vector = vector.as_binary::(); + for value in binary_vector.iter() { let Some(value) = value else { - string_vector_builder.push(None); + string_vector_builder.append_null(); continue; }; let string_value = @@ -113,11 +108,11 @@ pub fn map_json_type_to_string( from_type: schema.data_type.clone(), to_type: ConcreteDataType::string_datatype(), })?; - string_vector_builder.push(Some(string_value.as_str())); + string_vector_builder.append_value(string_value); } let string_vector = string_vector_builder.finish(); - vectors.push(Arc::new(string_vector) as VectorRef); + vectors.push(Arc::new(string_vector) as ArrayRef); } else { vectors.push(vector.clone()); } @@ -126,7 +121,15 @@ pub fn map_json_type_to_string( } } - RecordBatch::new(mapped_schema.clone(), vectors) + let record_batch = datatypes::arrow::record_batch::RecordBatch::try_new( + mapped_schema.arrow_schema().clone(), + vectors, + ) + .context(NewDfRecordBatchSnafu)?; + Ok(RecordBatch::from_df_record_batch( + mapped_schema.clone(), + record_batch, + )) } /// Maps the json type to string in the schema. 
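The rewritten `map_json_type_to_string` above builds the string column directly with Arrow's `AsArray`/`StringBuilder` APIs instead of going through GreptimeDB vector builders. A minimal sketch of that conversion pattern, assuming a placeholder `decode` closure in place of the real `jsonb_to_string` call:

    use std::sync::Arc;

    use datatypes::arrow::array::{ArrayRef, AsArray, StringBuilder};

    /// Convert a binary (JSONB) column into a string column, preserving nulls.
    /// `decode` stands in for the actual jsonb-to-string conversion.
    fn jsonb_column_to_string(column: &ArrayRef, decode: impl Fn(&[u8]) -> String) -> ArrayRef {
        let binary = column.as_binary::<i32>();
        let mut builder = StringBuilder::new();
        for value in binary.iter() {
            match value {
                Some(bytes) => builder.append_value(decode(bytes)),
                None => builder.append_null(),
            }
        }
        Arc::new(builder.finish()) as ArrayRef
    }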
@@ -755,11 +758,7 @@ impl Stream for MemoryTrackedStream { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.inner).poll_next(cx) { Poll::Ready(Some(Ok(batch))) => { - let additional = batch - .columns() - .iter() - .map(|c| c.memory_size()) - .sum::(); + let additional = batch.buffer_memory_size(); if let Err(e) = self.permit.track(additional, self.total_tracked) { return Poll::Ready(Some(Err(e))); diff --git a/src/common/recordbatch/src/recordbatch.rs b/src/common/recordbatch/src/recordbatch.rs index 727950495a..a9dd663c2c 100644 --- a/src/common/recordbatch/src/recordbatch.rs +++ b/src/common/recordbatch/src/recordbatch.rs @@ -20,7 +20,8 @@ use datafusion::arrow::util::pretty::pretty_format_batches; use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::compute; use datafusion_common::arrow::datatypes::{DataType as ArrowDataType, SchemaRef as ArrowSchemaRef}; -use datatypes::arrow::array::RecordBatchOptions; +use datatypes::arrow::array::{Array, AsArray, RecordBatchOptions, StructArray, new_null_array}; +use datatypes::extension::json::is_json_extension_type; use datatypes::prelude::DataType; use datatypes::schema::SchemaRef; use datatypes::vectors::{Helper, VectorRef}; @@ -30,15 +31,14 @@ use snafu::{OptionExt, ResultExt, ensure}; use crate::DfRecordBatch; use crate::error::{ - self, ArrowComputeSnafu, CastVectorSnafu, ColumnNotExistsSnafu, DataTypesSnafu, - ProjectArrowRecordBatchSnafu, Result, + self, AlignJsonArraySnafu, ArrowComputeSnafu, ColumnNotExistsSnafu, DataTypesSnafu, + NewDfRecordBatchSnafu, ProjectArrowRecordBatchSnafu, Result, }; /// A two-dimensional batch of column-oriented data with a defined schema. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { pub schema: SchemaRef, - pub columns: Vec, df_record_batch: DfRecordBatch, } @@ -60,12 +60,13 @@ impl RecordBatch { // TODO(LFC): Remove the casting here once `Batch` is no longer used. let arrow_arrays = Self::cast_view_arrays(schema.arrow_schema(), arrow_arrays)?; + let arrow_arrays = maybe_align_json_array_with_schema(schema.arrow_schema(), arrow_arrays)?; + let df_record_batch = DfRecordBatch::try_new(schema.arrow_schema().clone(), arrow_arrays) .context(error::NewDfRecordBatchSnafu)?; Ok(RecordBatch { schema, - columns, df_record_batch, }) } @@ -91,14 +92,8 @@ impl RecordBatch { /// Create an empty [`RecordBatch`] from `schema`. pub fn new_empty(schema: SchemaRef) -> RecordBatch { let df_record_batch = DfRecordBatch::new_empty(schema.arrow_schema().clone()); - let columns = schema - .column_schemas() - .iter() - .map(|col| col.data_type.create_mutable_vector(0).to_vector()) - .collect(); RecordBatch { schema, - columns, df_record_batch, } } @@ -113,17 +108,12 @@ impl RecordBatch { .context(error::NewDfRecordBatchSnafu)?; Ok(RecordBatch { schema, - columns: vec![], df_record_batch, }) } pub fn try_project(&self, indices: &[usize]) -> Result { let schema = Arc::new(self.schema.try_project(indices).context(DataTypesSnafu)?); - let mut columns = Vec::with_capacity(indices.len()); - for index in indices { - columns.push(self.columns[*index].clone()); - } let df_record_batch = self.df_record_batch.project(indices).with_context(|_| { ProjectArrowRecordBatchSnafu { schema: self.schema.clone(), @@ -133,7 +123,6 @@ impl RecordBatch { Ok(Self { schema, - columns, df_record_batch, }) } @@ -141,21 +130,11 @@ impl RecordBatch { /// Create a new [`RecordBatch`] from `schema` and `df_record_batch`. /// /// This method doesn't check the schema. 
- pub fn try_from_df_record_batch( - schema: SchemaRef, - df_record_batch: DfRecordBatch, - ) -> Result { - let columns = df_record_batch - .columns() - .iter() - .map(|c| Helper::try_into_vector(c.clone()).context(error::DataTypesSnafu)) - .collect::>>()?; - - Ok(RecordBatch { + pub fn from_df_record_batch(schema: SchemaRef, df_record_batch: DfRecordBatch) -> RecordBatch { + RecordBatch { schema, - columns, df_record_batch, - }) + } } #[inline] @@ -169,23 +148,22 @@ impl RecordBatch { } #[inline] - pub fn columns(&self) -> &[VectorRef] { - &self.columns + pub fn columns(&self) -> &[ArrayRef] { + self.df_record_batch.columns() } #[inline] - pub fn column(&self, idx: usize) -> &VectorRef { - &self.columns[idx] + pub fn column(&self, idx: usize) -> &ArrayRef { + self.df_record_batch.column(idx) } - pub fn column_by_name(&self, name: &str) -> Option<&VectorRef> { - let idx = self.schema.column_index_by_name(name)?; - Some(&self.columns[idx]) + pub fn column_by_name(&self, name: &str) -> Option<&ArrayRef> { + self.df_record_batch.column_by_name(name) } #[inline] pub fn num_columns(&self) -> usize { - self.columns.len() + self.df_record_batch.num_columns() } #[inline] @@ -201,9 +179,14 @@ impl RecordBatch { let mut vectors = HashMap::with_capacity(self.num_columns()); // column schemas in recordbatch must match its vectors, otherwise it's corrupted - for (vector_schema, vector) in self.schema.column_schemas().iter().zip(self.columns.iter()) + for (field, array) in self + .df_record_batch + .schema() + .fields() + .iter() + .zip(self.df_record_batch.columns().iter()) { - let column_name = &vector_schema.name; + let column_name = field.name(); let column_schema = table_schema .column_schema_by_name(column_name) @@ -211,15 +194,12 @@ impl RecordBatch { table_name, column_name, })?; - let vector = if vector_schema.data_type != column_schema.data_type { - vector - .cast(&column_schema.data_type) - .with_context(|_| CastVectorSnafu { - from_type: vector.data_type(), - to_type: column_schema.data_type.clone(), - })? + let vector = if field.data_type() != &column_schema.data_type.as_arrow_type() { + let array = compute::cast(array, &column_schema.data_type.as_arrow_type()) + .context(ArrowComputeSnafu)?; + Helper::try_into_vector(array).context(DataTypesSnafu)? } else { - vector.clone() + Helper::try_into_vector(array).context(DataTypesSnafu)? }; let _ = vectors.insert(column_name.clone(), vector); @@ -244,8 +224,69 @@ impl RecordBatch { visit_index: offset + len } ); - let columns = self.columns.iter().map(|vector| vector.slice(offset, len)); - RecordBatch::new(self.schema.clone(), columns) + let sliced = self.df_record_batch.slice(offset, len); + Ok(RecordBatch::from_df_record_batch( + self.schema.clone(), + sliced, + )) + } + + /// Returns the total number of bytes of memory pointed to by the arrays in this `RecordBatch`. + /// + /// The buffers store bytes in the Arrow memory format, and include the data as well as the validity map. + /// Note that this does not always correspond to the exact memory usage of an array, + /// since multiple arrays can share the same buffers or slices thereof. + pub fn buffer_memory_size(&self) -> usize { + self.df_record_batch + .columns() + .iter() + .map(|array| array.get_buffer_memory_size()) + .sum() + } + + /// Iterate the values as strings in the column at index `i`. + /// + /// Note that if the underlying array is not a valid GreptimeDB vector, an empty iterator is + /// returned. + /// + /// # Panics + /// if index `i` is out of bound. 
+ pub fn iter_column_as_string(&self, i: usize) -> Box> + '_> { + macro_rules! iter { + ($column: ident) => { + Box::new( + (0..$column.len()) + .map(|i| $column.is_valid(i).then(|| $column.value(i).to_string())), + ) + }; + } + + let column = self.df_record_batch.column(i); + match column.data_type() { + ArrowDataType::Utf8 => { + let column = column.as_string::(); + let iter = iter!(column); + iter as _ + } + ArrowDataType::LargeUtf8 => { + let column = column.as_string::(); + iter!(column) + } + ArrowDataType::Utf8View => { + let column = column.as_string_view(); + iter!(column) + } + _ => { + if let Ok(column) = Helper::try_into_vector(column) { + Box::new( + (0..column.len()) + .map(move |i| (!column.is_null(i)).then(|| column.get(i).to_string())), + ) + } else { + Box::new(std::iter::empty()) + } + } + } } } @@ -259,8 +300,9 @@ impl Serialize for RecordBatch { let mut s = serializer.serialize_struct("record", 2)?; s.serialize_field("schema", &**self.schema.arrow_schema())?; - let vec = self - .columns + let columns = self.df_record_batch.columns(); + let columns = Helper::try_into_vectors(columns).map_err(Error::custom)?; + let vec = columns .iter() .map(|c| c.serialize_to_json()) .collect::, _>>() @@ -278,35 +320,121 @@ pub fn merge_record_batches(schema: SchemaRef, batches: &[RecordBatch]) -> Resul return Ok(RecordBatch::new_empty(schema)); } - let n_rows = batches.iter().map(|b| b.num_rows()).sum(); - let n_columns = schema.num_columns(); - // Collect arrays from each batch - let mut merged_columns = Vec::with_capacity(n_columns); - - for col_idx in 0..n_columns { - let mut acc = schema.column_schemas()[col_idx] - .data_type - .create_mutable_vector(n_rows); - - for batch in batches { - let column = batch.column(col_idx); - acc.extend_slice_of(column.as_ref(), 0, column.len()) - .context(error::DataTypesSnafu)?; - } - - merged_columns.push(acc.to_vector()); - } + let record_batch = compute::concat_batches( + schema.arrow_schema(), + batches.iter().map(|x| x.df_record_batch()), + ) + .context(ArrowComputeSnafu)?; // Create a new RecordBatch with merged columns - RecordBatch::new(schema, merged_columns) + Ok(RecordBatch::from_df_record_batch(schema, record_batch)) +} + +/// Align a json array `json_array` to the json type `schema_type`. The `schema_type` is often the +/// "largest" json type after some insertions in the table schema, while the json array previously +/// written in the SST could be lagged behind it. So it's important to "amend" the json array's +/// missing fields with null arrays, to align the array's data type with the provided one. +/// +/// # Panics +/// +/// - The json array is not an Arrow [StructArray], or the provided data type `schema_type` is not +/// of Struct type. Both of which shouldn't happen unless we switch our implementation of how +/// json array is physically stored. +pub fn align_json_array(json_array: &ArrayRef, schema_type: &ArrowDataType) -> Result { + let json_type = json_array.data_type(); + if json_type == schema_type { + return Ok(json_array.clone()); + } + + let json_array = json_array.as_struct(); + let array_fields = json_array.fields(); + let array_columns = json_array.columns(); + let ArrowDataType::Struct(schema_fields) = schema_type else { + unreachable!() + }; + let mut aligned = Vec::with_capacity(schema_fields.len()); + + // Compare the fields in the json array and the to-be-aligned schema, amending with null arrays + // on the way. 
It's very important to note that fields in the json array and in the json type + // are both SORTED. + + let mut i = 0; // point to the schema fields + let mut j = 0; // point to the array fields + while i < schema_fields.len() && j < array_fields.len() { + let schema_field = &schema_fields[i]; + let array_field = &array_fields[j]; + if schema_field.name() == array_field.name() { + if matches!(schema_field.data_type(), ArrowDataType::Struct(_)) { + // A `StructArray`s in a json array must be another json array. (Like a nested json + // object in a json value.) + aligned.push(align_json_array( + &array_columns[j], + schema_field.data_type(), + )?); + } else { + aligned.push(array_columns[j].clone()); + } + j += 1; + } else { + aligned.push(new_null_array(schema_field.data_type(), json_array.len())); + } + i += 1; + } + if i < schema_fields.len() { + for field in &schema_fields[i..] { + aligned.push(new_null_array(field.data_type(), json_array.len())); + } + } + ensure!( + j == array_fields.len(), + AlignJsonArraySnafu { + reason: format!( + "this json array has more fields {:?}", + array_fields[j..] + .iter() + .map(|x| x.name()) + .collect::>(), + ) + } + ); + + let json_array = + StructArray::try_new(schema_fields.clone(), aligned, json_array.nulls().cloned()) + .context(NewDfRecordBatchSnafu)?; + Ok(Arc::new(json_array)) +} + +fn maybe_align_json_array_with_schema( + schema: &ArrowSchemaRef, + arrays: Vec, +) -> Result> { + if schema.fields().iter().all(|f| !is_json_extension_type(f)) { + return Ok(arrays); + } + + let mut aligned = Vec::with_capacity(arrays.len()); + for (field, array) in schema.fields().iter().zip(arrays.into_iter()) { + if !is_json_extension_type(field) { + aligned.push(array); + continue; + } + + let json_array = align_json_array(&array, field.data_type())?; + aligned.push(json_array); + } + Ok(aligned) } #[cfg(test)] mod tests { use std::sync::Arc; - use datatypes::arrow::array::{AsArray, UInt32Array}; - use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, UInt32Type}; + use datatypes::arrow::array::{ + AsArray, BooleanArray, Float64Array, Int64Array, ListArray, UInt32Array, + }; + use datatypes::arrow::datatypes::{ + DataType, Field, Fields, Int64Type, Schema as ArrowSchema, UInt32Type, + }; use datatypes::arrow_array::StringArray; use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; @@ -314,6 +442,165 @@ mod tests { use super::*; + #[test] + fn test_align_json_array() -> Result<()> { + struct TestCase { + json_array: ArrayRef, + schema_type: DataType, + expected: std::result::Result, + } + + impl TestCase { + fn new( + json_array: StructArray, + schema_type: Fields, + expected: std::result::Result, String>, + ) -> Self { + Self { + json_array: Arc::new(json_array), + schema_type: DataType::Struct(schema_type.clone()), + expected: expected + .map(|x| Arc::new(StructArray::new(schema_type, x, None)) as ArrayRef), + } + } + + fn test(self) -> Result<()> { + let result = align_json_array(&self.json_array, &self.schema_type); + match (result, self.expected) { + (Ok(json_array), Ok(expected)) => assert_eq!(&json_array, &expected), + (Ok(json_array), Err(e)) => { + panic!("expecting error {e} but actually get: {json_array:?}") + } + (Err(e), Err(expected)) => assert_eq!(e.to_string(), expected), + (Err(e), Ok(_)) => return Err(e), + } + Ok(()) + } + } + + // Test empty json array can be aligned with a complex json type. 
+ TestCase::new( + StructArray::new_empty_fields(2, None), + Fields::from(vec![ + Field::new("int", DataType::Int64, true), + Field::new_struct( + "nested", + vec![Field::new("bool", DataType::Boolean, true)], + true, + ), + Field::new("string", DataType::Utf8, true), + ]), + Ok(vec![ + Arc::new(Int64Array::new_null(2)) as ArrayRef, + Arc::new(StructArray::new_null( + Fields::from(vec![Arc::new(Field::new("bool", DataType::Boolean, true))]), + 2, + )), + Arc::new(StringArray::new_null(2)), + ]), + ) + .test()?; + + // Test simple json array alignment. + TestCase::new( + StructArray::from(vec![( + Arc::new(Field::new("float", DataType::Float64, true)), + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef, + )]), + Fields::from(vec![ + Field::new("float", DataType::Float64, true), + Field::new("string", DataType::Utf8, true), + ]), + Ok(vec![ + Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0])) as ArrayRef, + Arc::new(StringArray::new_null(3)), + ]), + ) + .test()?; + + // Test complex json array alignment. + TestCase::new( + StructArray::from(vec![ + ( + Arc::new(Field::new_list( + "list", + Field::new_list_field(DataType::Int64, true), + true, + )), + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1)]), + None, + Some(vec![Some(2), Some(3)]), + ])) as ArrayRef, + ), + ( + Arc::new(Field::new_struct( + "nested", + vec![Field::new("int", DataType::Int64, true)], + true, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("int", DataType::Int64, true)), + Arc::new(Int64Array::from(vec![-1, -2, -3])) as ArrayRef, + )])), + ), + ( + Arc::new(Field::new("string", DataType::Utf8, true)), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ), + ]), + Fields::from(vec![ + Field::new("bool", DataType::Boolean, true), + Field::new_list("list", Field::new_list_field(DataType::Int64, true), true), + Field::new_struct( + "nested", + vec![ + Field::new("float", DataType::Float64, true), + Field::new("int", DataType::Int64, true), + ], + true, + ), + Field::new("string", DataType::Utf8, true), + ]), + Ok(vec![ + Arc::new(BooleanArray::new_null(3)) as ArrayRef, + Arc::new(ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1)]), + None, + Some(vec![Some(2), Some(3)]), + ])), + Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("float", DataType::Float64, true)), + Arc::new(Float64Array::new_null(3)) as ArrayRef, + ), + ( + Arc::new(Field::new("int", DataType::Int64, true)), + Arc::new(Int64Array::from(vec![-1, -2, -3])), + ), + ])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ]), + ) + .test()?; + + // Test align failed. 
+ TestCase::new( + StructArray::try_from(vec![ + ("i", Arc::new(Int64Array::from(vec![1])) as ArrayRef), + ("j", Arc::new(Int64Array::from(vec![2])) as ArrayRef), + ]) + .unwrap(), + Fields::from(vec![Field::new("i", DataType::Int64, true)]), + Err( + r#"Failed to align JSON array, reason: this json array has more fields ["j"]"# + .to_string(), + ), + ) + .test()?; + Ok(()) + } + #[test] fn test_record_batch() { let arrow_schema = Arc::new(ArrowSchema::new(vec![ @@ -326,21 +613,21 @@ mod tests { let c2 = Arc::new(UInt32Vector::from_slice([4, 5, 6])); let columns: Vec = vec![c1, c2]; + let expected = vec![ + Arc::new(UInt32Array::from_iter_values([1, 2, 3])) as ArrayRef, + Arc::new(UInt32Array::from_iter_values([4, 5, 6])), + ]; + let batch = RecordBatch::new(schema.clone(), columns.clone()).unwrap(); assert_eq!(3, batch.num_rows()); - assert_eq!(&columns, batch.columns()); - for (i, expect) in columns.iter().enumerate().take(batch.num_columns()) { - let column = batch.column(i); - assert_eq!(expect, column); - } + assert_eq!(expected, batch.df_record_batch().columns()); assert_eq!(schema, batch.schema); - assert_eq!(columns[0], *batch.column_by_name("c1").unwrap()); - assert_eq!(columns[1], *batch.column_by_name("c2").unwrap()); + assert_eq!(&expected[0], batch.column_by_name("c1").unwrap()); + assert_eq!(&expected[1], batch.column_by_name("c2").unwrap()); assert!(batch.column_by_name("c3").is_none()); - let converted = - RecordBatch::try_from_df_record_batch(schema, batch.df_record_batch().clone()).unwrap(); + let converted = RecordBatch::from_df_record_batch(schema, batch.df_record_batch().clone()); assert_eq!(batch, converted); assert_eq!(*batch.df_record_batch(), converted.into_df_record_batch()); } @@ -385,12 +672,12 @@ mod tests { let recordbatch = recordbatch.slice(1, 2).expect("recordbatch slice"); let expected = &UInt32Array::from_iter_values([2u32, 3]); - let array = recordbatch.column(0).to_arrow_array(); + let array = recordbatch.column(0); let actual = array.as_primitive::(); assert_eq!(expected, actual); let expected = &StringArray::from(vec!["hello", "greptime"]); - let array = recordbatch.column(1).to_arrow_array(); + let array = recordbatch.column(1); let actual = array.as_string::(); assert_eq!(expected, actual); diff --git a/src/common/sql/src/convert.rs b/src/common/sql/src/convert.rs index 0ff2e44061..edb793baf6 100644 --- a/src/common/sql/src/convert.rs +++ b/src/common/sql/src/convert.rs @@ -211,8 +211,7 @@ pub fn sql_value_to_value( | Value::Duration(_) | Value::IntervalYearMonth(_) | Value::IntervalDayTime(_) - | Value::IntervalMonthDayNano(_) - | Value::Json(_) => match unary_op { + | Value::IntervalMonthDayNano(_) => match unary_op { UnaryOperator::Plus => {} UnaryOperator::Minus => { value = value @@ -222,19 +221,25 @@ pub fn sql_value_to_value( _ => return InvalidUnaryOpSnafu { unary_op, value }.fail(), }, - Value::String(_) | Value::Binary(_) | Value::List(_) | Value::Struct(_) => { + Value::String(_) + | Value::Binary(_) + | Value::List(_) + | Value::Struct(_) + | Value::Json(_) => { return InvalidUnaryOpSnafu { unary_op, value }.fail(); } } } - if value.data_type() != *data_type { + let value_datatype = value.data_type(); + // The datatype of json value is determined by its actual data, so we can't simply "cast" it here. 
+ if value_datatype.is_json() || value_datatype == *data_type { + Ok(value) + } else { datatypes::types::cast(value, data_type).with_context(|_| InvalidCastSnafu { sql_value: sql_val.clone(), datatype: data_type, }) - } else { - Ok(value) } } diff --git a/src/common/sql/src/default_constraint.rs b/src/common/sql/src/default_constraint.rs index 0366f9aec3..e2a57337a5 100644 --- a/src/common/sql/src/default_constraint.rs +++ b/src/common/sql/src/default_constraint.rs @@ -16,6 +16,7 @@ use common_time::timezone::Timezone; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnDefaultConstraint; use datatypes::schema::constraint::{CURRENT_TIMESTAMP, CURRENT_TIMESTAMP_FN}; +use snafu::ensure; use sqlparser::ast::ValueWithSpan; pub use sqlparser::ast::{ BinaryOperator, ColumnDef, ColumnOption, ColumnOptionDef, DataType, Expr, Function, @@ -37,6 +38,14 @@ pub fn parse_column_default_constraint( .iter() .find(|o| matches!(o.option, ColumnOption::Default(_))) { + ensure!( + !data_type.is_json(), + UnsupportedDefaultValueSnafu { + column_name, + reason: "json column cannot have a default value", + } + ); + let default_constraint = match &opt.option { ColumnOption::Default(Expr::Value(v)) => ColumnDefaultConstraint::Value( sql_value_to_value(column_name, data_type, &v.value, timezone, None, false)?, @@ -82,7 +91,7 @@ pub fn parse_column_default_constraint( } else { return UnsupportedDefaultValueSnafu { column_name, - expr: *expr.clone(), + reason: format!("expr '{expr}' not supported"), } .fail(); } @@ -90,14 +99,14 @@ pub fn parse_column_default_constraint( ColumnOption::Default(others) => { return UnsupportedDefaultValueSnafu { column_name, - expr: others.clone(), + reason: format!("expr '{others}' not supported"), } .fail(); } _ => { return UnsupportedDefaultValueSnafu { column_name, - expr: Expr::Value(SqlValue::Null.into()), + reason: format!("option '{}' not supported", opt.option), } .fail(); } diff --git a/src/common/sql/src/error.rs b/src/common/sql/src/error.rs index b777b54103..ed23df0cc1 100644 --- a/src/common/sql/src/error.rs +++ b/src/common/sql/src/error.rs @@ -55,13 +55,11 @@ pub enum Error { }, #[snafu(display( - "Unsupported expr in default constraint: {} for column: {}", - expr, - column_name + "Unsupported default constraint for column: '{column_name}', reason: {reason}" ))] UnsupportedDefaultValue { column_name: String, - expr: Expr, + reason: String, #[snafu(implicit)] location: Location, }, diff --git a/src/common/stat/src/resource.rs b/src/common/stat/src/resource.rs index babfa54a19..7894ccfda3 100644 --- a/src/common/stat/src/resource.rs +++ b/src/common/stat/src/resource.rs @@ -58,10 +58,14 @@ pub fn get_total_memory_bytes() -> i64 { } } -/// Get the total CPU cores. The result will be rounded to the nearest integer. -/// For example, if the total CPU is 1.5 cores(1500 millicores), the result will be 2. +/// Get the total CPU cores. The result will be rounded up to the next integer (ceiling). +/// For example, if the total CPU is 1.1 cores (1100 millicores) or 1.5 cores (1500 millicores), the result will be 2. pub fn get_total_cpu_cores() -> usize { - ((get_total_cpu_millicores() as f64) / 1000.0).round() as usize + cpu_cores(get_total_cpu_millicores()) +} + +fn cpu_cores(cpu_millicores: i64) -> usize { + ((cpu_millicores as f64) / 1_000.0).ceil() as usize } /// Get the total memory in readable size. 
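On the `get_total_cpu_cores` change just above: rounding up instead of to the nearest integer means a fractional CPU quota such as 1.1 cores is never reported as fewer whole cores than it spans. A tiny self-contained illustration of the new behavior, mirroring the helper added in this patch:

fn cpu_cores(cpu_millicores: i64) -> usize {
    // Ceiling via f64: 1100 millicores -> 2 cores, 900 millicores -> 1 core.
    ((cpu_millicores as f64) / 1_000.0).ceil() as usize
}

fn main() {
    // With the old `.round()`, a 1.1-core limit would have reported only 1 core.
    assert_eq!(cpu_cores(1_100), 2);
    assert_eq!(cpu_cores(900), 1);
    assert_eq!(cpu_cores(4_000), 4);
}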
@@ -178,6 +182,13 @@ mod tests { #[test] fn test_get_total_cpu_cores() { assert!(get_total_cpu_cores() > 0); + assert_eq!(cpu_cores(1), 1); + assert_eq!(cpu_cores(100), 1); + assert_eq!(cpu_cores(500), 1); + assert_eq!(cpu_cores(1000), 1); + assert_eq!(cpu_cores(1100), 2); + assert_eq!(cpu_cores(1900), 2); + assert_eq!(cpu_cores(10_000), 10); } #[test] diff --git a/src/common/telemetry/Cargo.toml b/src/common/telemetry/Cargo.toml index d0bc6876bc..92c3304d53 100644 --- a/src/common/telemetry/Cargo.toml +++ b/src/common/telemetry/Cargo.toml @@ -35,5 +35,5 @@ tokio.workspace = true tracing = "0.1" tracing-appender.workspace = true tracing-log = "0.2" -tracing-opentelemetry = "0.31.0" +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true diff --git a/src/common/telemetry/src/lib.rs b/src/common/telemetry/src/lib.rs index ba46bfa0d9..cd60d61645 100644 --- a/src/common/telemetry/src/lib.rs +++ b/src/common/telemetry/src/lib.rs @@ -21,7 +21,10 @@ mod panic_hook; pub mod tracing_context; mod tracing_sampler; -pub use logging::{RELOAD_HANDLE, init_default_ut_logging, init_global_logging}; +pub use logging::{ + LOG_RELOAD_HANDLE, TRACE_RELOAD_HANDLE, get_or_init_tracer, init_default_ut_logging, + init_global_logging, +}; pub use metric::dump_metrics; pub use panic_hook::set_panic_hook; pub use {common_error, tracing, tracing_subscriber}; diff --git a/src/common/telemetry/src/logging.rs b/src/common/telemetry/src/logging.rs index d2b8a64b39..1b371c1d78 100644 --- a/src/common/telemetry/src/logging.rs +++ b/src/common/telemetry/src/logging.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::env; use std::io::IsTerminal; -use std::sync::{Arc, Mutex, Once}; +use std::sync::{Arc, Mutex, Once, RwLock}; use std::time::Duration; use common_base::serde::empty_string_as_default; @@ -25,15 +25,17 @@ use opentelemetry::trace::TracerProvider; use opentelemetry::{KeyValue, global}; use opentelemetry_otlp::{Protocol, SpanExporter, WithExportConfig, WithHttpConfig}; use opentelemetry_sdk::propagation::TraceContextPropagator; -use opentelemetry_sdk::trace::Sampler; +use opentelemetry_sdk::trace::{Sampler, Tracer}; use opentelemetry_semantic_conventions::resource; use serde::{Deserialize, Serialize}; +use tracing::callsite; +use tracing::metadata::LevelFilter; use tracing_appender::non_blocking::WorkerGuard; use tracing_appender::rolling::{RollingFileAppender, Rotation}; use tracing_log::LogTracer; use tracing_subscriber::filter::{FilterFn, Targets}; use tracing_subscriber::fmt::Layer; -use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::layer::{Layered, SubscriberExt}; use tracing_subscriber::prelude::*; use tracing_subscriber::{EnvFilter, Registry, filter}; @@ -48,10 +50,192 @@ pub const DEFAULT_OTLP_HTTP_ENDPOINT: &str = "http://localhost:4318/v1/traces"; /// The default logs directory. 
pub const DEFAULT_LOGGING_DIR: &str = "logs"; -// Handle for reloading log level -pub static RELOAD_HANDLE: OnceCell> = +/// Handle for reloading log level +pub static LOG_RELOAD_HANDLE: OnceCell> = OnceCell::new(); +type DynSubscriber = Layered, Registry>; +type OtelTraceLayer = tracing_opentelemetry::OpenTelemetryLayer; + +#[derive(Clone)] +pub struct TraceReloadHandle { + inner: Arc>>, +} + +impl TraceReloadHandle { + fn new(inner: Arc>>) -> Self { + Self { inner } + } + + pub fn reload(&self, new_layer: Option) { + let mut guard = self.inner.write().unwrap(); + *guard = new_layer; + drop(guard); + + callsite::rebuild_interest_cache(); + } +} + +/// A tracing layer that can be dynamically reloaded. +/// +/// Mostly copied from [`tracing_subscriber::reload::Layer`]. +struct TraceLayer { + inner: Arc>>, +} + +impl TraceLayer { + fn new(initial: Option) -> (Self, TraceReloadHandle) { + let inner = Arc::new(RwLock::new(initial)); + ( + Self { + inner: inner.clone(), + }, + TraceReloadHandle::new(inner), + ) + } + + fn with_layer(&self, f: impl FnOnce(&OtelTraceLayer) -> R) -> Option { + self.inner + .read() + .ok() + .and_then(|guard| guard.as_ref().map(f)) + } + + fn with_layer_mut(&self, f: impl FnOnce(&mut OtelTraceLayer) -> R) -> Option { + self.inner + .write() + .ok() + .and_then(|mut guard| guard.as_mut().map(f)) + } +} + +impl tracing_subscriber::Layer for TraceLayer { + fn on_register_dispatch(&self, subscriber: &tracing::Dispatch) { + let _ = self.with_layer(|layer| layer.on_register_dispatch(subscriber)); + } + + fn on_layer(&mut self, subscriber: &mut DynSubscriber) { + let _ = self.with_layer_mut(|layer| layer.on_layer(subscriber)); + } + + fn register_callsite( + &self, + metadata: &'static tracing::Metadata<'static>, + ) -> tracing::subscriber::Interest { + self.with_layer(|layer| layer.register_callsite(metadata)) + .unwrap_or_else(tracing::subscriber::Interest::always) + } + + fn enabled( + &self, + metadata: &tracing::Metadata<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) -> bool { + self.with_layer(|layer| layer.enabled(metadata, ctx)) + .unwrap_or(true) + } + + fn on_new_span( + &self, + attrs: &tracing::span::Attributes<'_>, + id: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_new_span(attrs, id, ctx)); + } + + fn max_level_hint(&self) -> Option { + self.with_layer(|layer| layer.max_level_hint()).flatten() + } + + fn on_record( + &self, + span: &tracing::span::Id, + values: &tracing::span::Record<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_record(span, values, ctx)); + } + + fn on_follows_from( + &self, + span: &tracing::span::Id, + follows: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_follows_from(span, follows, ctx)); + } + + fn event_enabled( + &self, + event: &tracing::Event<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) -> bool { + self.with_layer(|layer| layer.event_enabled(event, ctx)) + .unwrap_or(true) + } + + fn on_event( + &self, + event: &tracing::Event<'_>, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_event(event, ctx)); + } + + fn on_enter( + &self, + id: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_enter(id, ctx)); + } + 
+ fn on_exit( + &self, + id: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_exit(id, ctx)); + } + + fn on_close( + &self, + id: tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_close(id, ctx)); + } + + fn on_id_change( + &self, + old: &tracing::span::Id, + new: &tracing::span::Id, + ctx: tracing_subscriber::layer::Context<'_, DynSubscriber>, + ) { + let _ = self.with_layer(|layer| layer.on_id_change(old, new, ctx)); + } + + unsafe fn downcast_raw(&self, id: std::any::TypeId) -> Option<*const ()> { + self.inner.read().ok().and_then(|guard| { + guard + .as_ref() + .and_then(|layer| unsafe { layer.downcast_raw(id) }) + }) + } +} + +/// Handle for reloading trace level +pub static TRACE_RELOAD_HANDLE: OnceCell = OnceCell::new(); + +static TRACER: OnceCell> = OnceCell::new(); + +#[derive(Debug)] +enum TraceState { + Ready(Tracer), + Deferred(TraceContext), +} + /// The logging options that used to initialize the logger. #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(default)] @@ -167,6 +351,13 @@ impl PartialEq for LoggingOptions { impl Eq for LoggingOptions {} +#[derive(Clone, Debug)] +struct TraceContext { + app_name: String, + node_id: String, + logging_opts: LoggingOptions, +} + impl Default for LoggingOptions { fn default() -> Self { Self { @@ -242,6 +433,7 @@ pub fn init_global_logging( ) -> Vec { static START: Once = Once::new(); let mut guards = vec![]; + let node_id = node_id.unwrap_or_else(|| "none".to_string()); START.call_once(|| { // Enable log compatible layer to convert log record to tracing span. @@ -357,10 +549,37 @@ pub fn init_global_logging( let (dyn_filter, reload_handle) = tracing_subscriber::reload::Layer::new(filter.clone()); - RELOAD_HANDLE + LOG_RELOAD_HANDLE .set(reload_handle) .expect("reload handle already set, maybe init_global_logging get called twice?"); + let mut initial_tracer = None; + let trace_state = if opts.enable_otlp_tracing { + let tracer = create_tracer(app_name, &node_id, opts); + initial_tracer = Some(tracer.clone()); + TraceState::Ready(tracer) + } else { + TraceState::Deferred(TraceContext { + app_name: app_name.to_string(), + node_id: node_id.clone(), + logging_opts: opts.clone(), + }) + }; + + TRACER + .set(Mutex::new(trace_state)) + .expect("trace state already initialized"); + + let initial_trace_layer = initial_tracer + .as_ref() + .map(|tracer| tracing_opentelemetry::layer().with_tracer(tracer.clone())); + + let (dyn_trace_layer, trace_reload_handle) = TraceLayer::new(initial_trace_layer); + + TRACE_RELOAD_HANDLE + .set(trace_reload_handle) + .unwrap_or_else(|_| panic!("failed to set trace reload handle")); + // Must enable 'tokio_unstable' cfg to use this feature. 
// For example: `RUSTFLAGS="--cfg tokio_unstable" cargo run -F common-telemetry/console -- standalone start` #[cfg(feature = "tokio-console")] @@ -383,6 +602,7 @@ pub fn init_global_logging( Registry::default() .with(dyn_filter) + .with(dyn_trace_layer) .with(tokio_console_layer) .with(stdout_logging_layer) .with(file_logging_layer) @@ -396,53 +616,61 @@ pub fn init_global_logging( #[cfg(not(feature = "tokio-console"))] let subscriber = Registry::default() .with(dyn_filter) + .with(dyn_trace_layer) .with(stdout_logging_layer) .with(file_logging_layer) .with(err_file_logging_layer) .with(slow_query_logging_layer); - if opts.enable_otlp_tracing { - global::set_text_map_propagator(TraceContextPropagator::new()); + global::set_text_map_propagator(TraceContextPropagator::new()); - let sampler = opts - .tracing_sample_ratio - .as_ref() - .map(create_sampler) - .map(Sampler::ParentBased) - .unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn))); - - let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder() - .with_batch_exporter(build_otlp_exporter(opts)) - .with_sampler(sampler) - .with_resource( - opentelemetry_sdk::Resource::builder_empty() - .with_attributes([ - KeyValue::new(resource::SERVICE_NAME, app_name.to_string()), - KeyValue::new( - resource::SERVICE_INSTANCE_ID, - node_id.unwrap_or("none".to_string()), - ), - KeyValue::new(resource::SERVICE_VERSION, common_version::version()), - KeyValue::new(resource::PROCESS_PID, std::process::id().to_string()), - ]) - .build(), - ) - .build(); - let tracer = provider.tracer("greptimedb"); - - tracing::subscriber::set_global_default( - subscriber.with(tracing_opentelemetry::layer().with_tracer(tracer)), - ) + tracing::subscriber::set_global_default(subscriber) .expect("error setting global tracing subscriber"); - } else { - tracing::subscriber::set_global_default(subscriber) - .expect("error setting global tracing subscriber"); - } }); guards } +fn create_tracer(app_name: &str, node_id: &str, opts: &LoggingOptions) -> Tracer { + let sampler = opts + .tracing_sample_ratio + .as_ref() + .map(create_sampler) + .map(Sampler::ParentBased) + .unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn))); + + let resource = opentelemetry_sdk::Resource::builder_empty() + .with_attributes([ + KeyValue::new(resource::SERVICE_NAME, app_name.to_string()), + KeyValue::new(resource::SERVICE_INSTANCE_ID, node_id.to_string()), + KeyValue::new(resource::SERVICE_VERSION, common_version::version()), + KeyValue::new(resource::PROCESS_PID, std::process::id().to_string()), + ]) + .build(); + + opentelemetry_sdk::trace::SdkTracerProvider::builder() + .with_batch_exporter(build_otlp_exporter(opts)) + .with_sampler(sampler) + .with_resource(resource) + .build() + .tracer("greptimedb") +} + +/// Ensure that the OTLP tracer has been constructed, building it lazily if needed. 
+pub fn get_or_init_tracer() -> Result { + let state = TRACER.get().ok_or("trace state is not initialized")?; + let mut guard = state.lock().expect("trace state lock poisoned"); + + match &mut *guard { + TraceState::Ready(tracer) => Ok(tracer.clone()), + TraceState::Deferred(context) => { + let tracer = create_tracer(&context.app_name, &context.node_id, &context.logging_opts); + *guard = TraceState::Ready(tracer.clone()); + Ok(tracer) + } + } +} + fn build_otlp_exporter(opts: &LoggingOptions) -> SpanExporter { let protocol = opts .otlp_export_protocol diff --git a/src/common/wal/src/config/kafka/common.rs b/src/common/wal/src/config/kafka/common.rs index 1b9bcc77be..f58ba640c8 100644 --- a/src/common/wal/src/config/kafka/common.rs +++ b/src/common/wal/src/config/kafka/common.rs @@ -36,6 +36,9 @@ pub const DEFAULT_BACKOFF_CONFIG: BackoffConfig = BackoffConfig { deadline: Some(Duration::from_secs(3)), }; +/// The default connect timeout for kafka client. +pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_secs(10); + /// Default interval for auto WAL pruning. pub const DEFAULT_AUTO_PRUNE_INTERVAL: Duration = Duration::from_mins(30); /// Default limit for concurrent auto pruning tasks. diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs index e40a52bd6b..19b4647b8e 100644 --- a/src/datanode/src/config.rs +++ b/src/datanode/src/config.rs @@ -28,7 +28,6 @@ use mito2::config::MitoConfig; pub(crate) use object_store::config::ObjectStoreConfig; use query::options::QueryOptions; use serde::{Deserialize, Serialize}; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::heartbeat_options::HeartbeatOptions; use servers::http::HttpOptions; @@ -82,7 +81,6 @@ pub struct DatanodeOptions { pub region_engine: Vec, pub logging: LoggingOptions, pub enable_telemetry: bool, - pub export_metrics: ExportMetricsOption, pub tracing: TracingOptions, pub query: QueryOptions, pub memory: MemoryOptions, @@ -138,7 +136,6 @@ impl Default for DatanodeOptions { logging: LoggingOptions::default(), heartbeat: HeartbeatOptions::datanode_default(), enable_telemetry: true, - export_metrics: ExportMetricsOption::default(), tracing: TracingOptions::default(), query: QueryOptions::default(), memory: MemoryOptions::default(), diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs index 6b370c7eb6..e202ce9f2c 100644 --- a/src/datanode/src/datanode.rs +++ b/src/datanode/src/datanode.rs @@ -22,6 +22,7 @@ use common_base::Plugins; use common_error::ext::BoxedError; use common_greptimedb_telemetry::GreptimeDBTelemetryTask; use common_meta::cache::{LayeredCacheRegistry, SchemaCacheRef, TableSchemaCacheRef}; +use common_meta::cache_invalidator::CacheInvalidatorRef; use common_meta::datanode::TopicStatsReporter; use common_meta::key::runtime_switch::RuntimeSwitchManager; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; @@ -48,7 +49,6 @@ use object_store::manager::{ObjectStoreManager, ObjectStoreManagerRef}; use object_store::util::normalize_dir; use query::QueryEngineFactory; use query::dummy_catalog::{DummyCatalogManager, TableProviderFactoryRef}; -use servers::export_metrics::ExportMetricsTask; use servers::server::ServerHandlers; use snafu::{OptionExt, ResultExt, ensure}; use store_api::path_utils::WAL_DIR; @@ -84,7 +84,6 @@ pub struct Datanode { greptimedb_telemetry_task: Arc, leases_notifier: Option>, plugins: Plugins, - export_metrics_task: Option, } impl Datanode { @@ -96,10 +95,6 @@ impl Datanode { self.start_telemetry(); 
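Taken together, `get_or_init_tracer` and `TRACE_RELOAD_HANDLE` above are what allow OTLP tracing to be switched on after the global subscriber is already installed. A rough sketch of how a caller might wire them up; the hook itself is hypothetical, and the exact return type of `get_or_init_tracer` is assumed from the snippet above:

use common_telemetry::{TRACE_RELOAD_HANDLE, get_or_init_tracer};

// Hypothetical admin hook: enable OTLP tracing at runtime without restarting the process.
fn enable_tracing_at_runtime() {
    // Lazily builds the tracer from the deferred context captured at startup.
    if let Ok(tracer) = get_or_init_tracer()
        && let Some(handle) = TRACE_RELOAD_HANDLE.get()
    {
        // Swap a fresh OpenTelemetry layer into the reloadable slot; new spans pick it up.
        handle.reload(Some(tracing_opentelemetry::layer().with_tracer(tracer)));
    }
}

Passing `None` to `reload` drops the layer again, which is why the `TraceLayer` wrapper delegates every callback through the `RwLock` and rebuilds the callsite interest cache on each swap.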
- if let Some(t) = self.export_metrics_task.as_ref() { - t.start(None).context(StartServerSnafu)? - } - self.services.start_all().await.context(StartServerSnafu) } @@ -287,21 +282,11 @@ impl DatanodeBuilder { open_all_regions.await?; } - let mut resource_stat = ResourceStatImpl::default(); - resource_stat.start_collect_cpu_usage(); - let heartbeat_task = if let Some(meta_client) = meta_client { - Some( - HeartbeatTask::try_new( - &self.opts, - region_server.clone(), - meta_client, - cache_registry, - self.plugins.clone(), - Arc::new(resource_stat), - ) - .await?, - ) + let task = self + .create_heartbeat_task(®ion_server, meta_client, cache_registry) + .await?; + Some(task) } else { None }; @@ -319,10 +304,6 @@ impl DatanodeBuilder { None }; - let export_metrics_task = - ExportMetricsTask::try_new(&self.opts.export_metrics, Some(&self.plugins)) - .context(StartServerSnafu)?; - Ok(Datanode { services: ServerHandlers::default(), heartbeat_task, @@ -331,10 +312,32 @@ impl DatanodeBuilder { region_event_receiver, leases_notifier, plugins: self.plugins.clone(), - export_metrics_task, }) } + async fn create_heartbeat_task( + &self, + region_server: &RegionServer, + meta_client: MetaClientRef, + cache_invalidator: CacheInvalidatorRef, + ) -> Result { + let stat = { + let mut stat = ResourceStatImpl::default(); + stat.start_collect_cpu_usage(); + Arc::new(stat) + }; + + HeartbeatTask::try_new( + &self.opts, + region_server.clone(), + meta_client, + cache_invalidator, + self.plugins.clone(), + stat, + ) + .await + } + /// Builds [ObjectStoreManager] from [StorageConfig]. pub async fn build_object_store_manager(cfg: &StorageConfig) -> Result { let object_store = store::new_object_store(cfg.store.clone(), &cfg.data_home).await?; diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index 74bddbaede..a8fe3fd969 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -410,14 +410,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to build cache store"))] - BuildCacheStore { - #[snafu(source)] - error: object_store::Error, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Not yet implemented: {what}"))] NotYetImplemented { what: String }, } @@ -493,7 +485,6 @@ impl ErrorExt for Error { SerializeJson { .. } => StatusCode::Internal, ObjectStore { source, .. } => source.status_code(), - BuildCacheStore { .. 
} => StatusCode::StorageUnavailable, } } diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 33ba648830..2b07adf06e 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -25,6 +25,7 @@ use common_meta::datanode::REGION_STATISTIC_KEY; use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS; use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler; use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; +use common_meta::heartbeat::handler::suspend::SuspendHandler; use common_meta::heartbeat::handler::{ HandlerGroupExecutor, HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef, }; @@ -91,6 +92,7 @@ impl HeartbeatTask { let resp_handler_executor = Arc::new(HandlerGroupExecutor::new(vec![ region_alive_keeper.clone(), Arc::new(ParseMailboxMessageHandler), + Arc::new(SuspendHandler::new(region_server.suspend_state())), Arc::new( RegionHeartbeatResponseHandler::new(region_server.clone()) .with_open_region_parallelism(opts.init_regions_parallelism), diff --git a/src/datanode/src/heartbeat/handler.rs b/src/datanode/src/heartbeat/handler.rs index 8954513653..9accd138fd 100644 --- a/src/datanode/src/heartbeat/handler.rs +++ b/src/datanode/src/heartbeat/handler.rs @@ -99,26 +99,30 @@ impl RegionHeartbeatResponseHandler { self } - fn build_handler(&self, instruction: &Instruction) -> MetaResult> { + fn build_handler( + &self, + instruction: &Instruction, + ) -> MetaResult>> { match instruction { - Instruction::CloseRegions(_) => Ok(Box::new(CloseRegionsHandler.into())), - Instruction::OpenRegions(_) => Ok(Box::new( + Instruction::CloseRegions(_) => Ok(Some(Box::new(CloseRegionsHandler.into()))), + Instruction::OpenRegions(_) => Ok(Some(Box::new( OpenRegionsHandler { open_region_parallelism: self.open_region_parallelism, } .into(), - )), - Instruction::FlushRegions(_) => Ok(Box::new(FlushRegionsHandler.into())), - Instruction::DowngradeRegions(_) => Ok(Box::new(DowngradeRegionsHandler.into())), - Instruction::UpgradeRegions(_) => Ok(Box::new( + ))), + Instruction::FlushRegions(_) => Ok(Some(Box::new(FlushRegionsHandler.into()))), + Instruction::DowngradeRegions(_) => Ok(Some(Box::new(DowngradeRegionsHandler.into()))), + Instruction::UpgradeRegions(_) => Ok(Some(Box::new( UpgradeRegionsHandler { upgrade_region_parallelism: self.open_region_parallelism, } .into(), - )), - Instruction::GetFileRefs(_) => Ok(Box::new(GetFileRefsHandler.into())), - Instruction::GcRegions(_) => Ok(Box::new(GcRegionsHandler.into())), + ))), + Instruction::GetFileRefs(_) => Ok(Some(Box::new(GetFileRefsHandler.into()))), + Instruction::GcRegions(_) => Ok(Some(Box::new(GcRegionsHandler.into()))), Instruction::InvalidateCaches(_) => InvalidHeartbeatResponseSnafu.fail(), + Instruction::Suspend => Ok(None), } } } @@ -216,30 +220,24 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler { .context(InvalidHeartbeatResponseSnafu)?; let mailbox = ctx.mailbox.clone(); - let region_server = self.region_server.clone(); - let downgrade_tasks = self.downgrade_tasks.clone(); - let flush_tasks = self.flush_tasks.clone(); - let gc_tasks = self.gc_tasks.clone(); - let handler = self.build_handler(&instruction)?; - let _handle = common_runtime::spawn_global(async move { - let reply = handler - .handle( - &HandlerContext { - region_server, - downgrade_tasks, - flush_tasks, - gc_tasks, - }, - instruction, - ) - .await; - - if let Some(reply) = reply - && let Err(e) = mailbox.send((meta, 
reply)).await - { - error!(e; "Failed to send reply to mailbox"); - } - }); + if let Some(handler) = self.build_handler(&instruction)? { + let context = HandlerContext { + region_server: self.region_server.clone(), + downgrade_tasks: self.downgrade_tasks.clone(), + flush_tasks: self.flush_tasks.clone(), + gc_tasks: self.gc_tasks.clone(), + }; + let _handle = common_runtime::spawn_global(async move { + let reply = handler.handle(&context, instruction).await; + if let Some(reply) = reply + && let Err(e) = mailbox.send((meta, reply)).await + { + let error = e.to_string(); + let (meta, reply) = e.0; + error!("Failed to send reply {reply} to {meta:?}: {error}"); + } + }); + } Ok(HandleControl::Continue) } diff --git a/src/datanode/src/heartbeat/handler/file_ref.rs b/src/datanode/src/heartbeat/handler/file_ref.rs index ccad7922b5..4d2ac325a8 100644 --- a/src/datanode/src/heartbeat/handler/file_ref.rs +++ b/src/datanode/src/heartbeat/handler/file_ref.rs @@ -39,9 +39,8 @@ impl InstructionHandler for GetFileRefsHandler { error: Some("MitoEngine not found".to_string()), })); }; - match mito_engine - .get_snapshot_of_unmanifested_refs(get_file_refs.region_ids) + .get_snapshot_of_file_refs(get_file_refs.query_regions, get_file_refs.related_regions) .await { Ok(all_file_refs) => { diff --git a/src/datanode/src/heartbeat/handler/flush_region.rs b/src/datanode/src/heartbeat/handler/flush_region.rs index 721673432e..a86d672eca 100644 --- a/src/datanode/src/heartbeat/handler/flush_region.rs +++ b/src/datanode/src/heartbeat/handler/flush_region.rs @@ -320,4 +320,15 @@ mod tests { assert!(flush_reply.results[0].1.is_ok()); assert!(flush_reply.results[1].1.is_err()); } + + #[test] + fn test_flush_regions_display() { + let region_id = RegionId::new(1024, 1); + let flush_regions = FlushRegions::sync_single(region_id); + let display = format!("{}", flush_regions); + assert_eq!( + display, + "FlushRegions(region_ids=[4398046511105(1024, 1)], strategy=Sync, error_strategy=FailFast)" + ); + } } diff --git a/src/datanode/src/heartbeat/handler/gc_worker.rs b/src/datanode/src/heartbeat/handler/gc_worker.rs index 75b0005e93..9329dcb0c6 100644 --- a/src/datanode/src/heartbeat/handler/gc_worker.rs +++ b/src/datanode/src/heartbeat/handler/gc_worker.rs @@ -15,7 +15,7 @@ use common_meta::instruction::{GcRegions, GcRegionsReply, InstructionReply}; use common_telemetry::{debug, warn}; use mito2::gc::LocalGcWorker; -use snafu::{OptionExt, ResultExt}; +use snafu::{OptionExt, ResultExt, ensure}; use store_api::storage::{FileRefsManifest, RegionId}; use crate::error::{GcMitoEngineSnafu, InvalidGcArgsSnafu, Result, UnexpectedSnafu}; @@ -35,20 +35,6 @@ impl InstructionHandler for GcRegionsHandler { let region_ids = gc_regions.regions.clone(); debug!("Received gc regions instruction: {:?}", region_ids); - let is_same_table = region_ids.windows(2).all(|w| { - let t1 = w[0].table_id(); - let t2 = w[1].table_id(); - t1 == t2 - }); - if !is_same_table { - return Some(InstructionReply::GcRegions(GcRegionsReply { - result: Err(format!( - "Regions to GC should belong to the same table, found: {:?}", - region_ids - )), - })); - } - let (region_id, gc_worker) = match self .create_gc_worker( ctx, @@ -103,6 +89,8 @@ impl InstructionHandler for GcRegionsHandler { } impl GcRegionsHandler { + /// Create a GC worker for the given region IDs. + /// Return the first region ID(after sort by given region id) and the GC worker. 
async fn create_gc_worker( &self, ctx: &HandlerContext, @@ -112,22 +100,37 @@ impl GcRegionsHandler { ) -> Result<(RegionId, LocalGcWorker)> { // always use the smallest region id on datanode as the target region id region_ids.sort_by_key(|r| r.region_number()); + let mito_engine = ctx .region_server .mito_engine() .with_context(|| UnexpectedSnafu { violated: "MitoEngine not found".to_string(), })?; - let region_id = *region_ids.first().with_context(|| UnexpectedSnafu { - violated: "No region ids provided".to_string(), + + let region_id = *region_ids.first().with_context(|| InvalidGcArgsSnafu { + msg: "No region ids provided".to_string(), })?; - let mito_config = mito_engine.mito_config(); + // also need to ensure all regions are on this datanode + ensure!( + region_ids + .iter() + .all(|rid| mito_engine.find_region(*rid).is_some()), + InvalidGcArgsSnafu { + msg: format!( + "Some regions are not on current datanode:{:?}", + region_ids + .iter() + .filter(|rid| mito_engine.find_region(**rid).is_none()) + .collect::>() + ), + } + ); // Find the access layer from one of the regions that exists on this datanode - let access_layer = region_ids - .iter() - .find_map(|rid| mito_engine.find_region(*rid)) + let access_layer = mito_engine + .find_region(region_id) .with_context(|| InvalidGcArgsSnafu { msg: format!( "None of the regions is on current datanode:{:?}", @@ -136,14 +139,22 @@ impl GcRegionsHandler { })? .access_layer(); + // if region happen to be dropped before this but after gc scheduler send gc instr, + // need to deal with it properly(it is ok for region to be dropped after GC worker started) + // region not found here can only be drop table/database case, since region migration is prevented by lock in gc procedure + // TODO(discord9): add integration test for this drop case + let mito_regions = region_ids + .iter() + .filter_map(|rid| mito_engine.find_region(*rid).map(|r| (*rid, r))) + .collect(); + let cache_manager = mito_engine.cache_manager(); let gc_worker = LocalGcWorker::try_new( access_layer.clone(), Some(cache_manager), - region_ids.into_iter().collect(), - Default::default(), - mito_config.clone().into(), + mito_regions, + mito_engine.mito_config().gc.clone(), file_ref_manifest.clone(), &mito_engine.gc_limiter(), full_file_listing, diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index ff80c8b10a..e95543d474 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -17,6 +17,7 @@ mod catalog; use std::collections::HashMap; use std::fmt::Debug; use std::ops::Deref; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -52,7 +53,9 @@ pub use query::dummy_catalog::{ DummyCatalogList, DummyTableProviderFactory, TableProviderFactoryRef, }; use serde_json; -use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult}; +use servers::error::{ + self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult, SuspendedSnafu, +}; use servers::grpc::FlightCompression; use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream}; use servers::grpc::region_server::RegionServerHandler; @@ -89,6 +92,7 @@ use crate::region_server::catalog::{NameAwareCatalogList, NameAwareDataSourceInj pub struct RegionServer { inner: Arc, flight_compression: FlightCompression, + suspend: Arc, } pub struct RegionStat { @@ -136,6 +140,7 @@ impl RegionServer { ), )), flight_compression, + suspend: Arc::new(AtomicBool::new(false)), } } @@ 
-518,7 +523,7 @@ impl RegionServer { let manifest_info = match manifest_info { ManifestInfo::MitoManifestInfo(info) => { - RegionManifestInfo::mito(info.data_manifest_version, 0) + RegionManifestInfo::mito(info.data_manifest_version, 0, 0) } ManifestInfo::MetricManifestInfo(info) => RegionManifestInfo::metric( info.data_manifest_version, @@ -595,6 +600,14 @@ impl RegionServer { .handle_sync_region(engine_with_status.engine(), region_id, manifest_info) .await } + + fn is_suspended(&self) -> bool { + self.suspend.load(Ordering::Relaxed) + } + + pub(crate) fn suspend_state(&self) -> Arc { + self.suspend.clone() + } } #[async_trait] @@ -644,6 +657,8 @@ impl FlightCraft for RegionServer { &self, request: Request, ) -> TonicResult>> { + ensure!(!self.is_suspended(), SuspendedSnafu); + let ticket = request.into_inner().ticket; let request = api::v1::region::QueryRequest::decode(ticket.as_ref()) .context(servers_error::InvalidFlightTicketSnafu)?; @@ -1200,7 +1215,8 @@ impl RegionServerInner { | RegionRequest::Flush(_) | RegionRequest::Compact(_) | RegionRequest::Truncate(_) - | RegionRequest::BuildIndex(_) => RegionChange::None, + | RegionRequest::BuildIndex(_) + | RegionRequest::EnterStaging(_) => RegionChange::None, RegionRequest::Catchup(_) => RegionChange::Catchup, }; @@ -1260,7 +1276,6 @@ impl RegionServerInner { .with_context(|_| HandleRegionRequestSnafu { region_id })? .new_opened_logical_region_ids() else { - warn!("No new opened logical regions"); return Ok(()); }; diff --git a/src/datanode/src/store.rs b/src/datanode/src/store.rs index 6dc6f280c6..78db3d6103 100644 --- a/src/datanode/src/store.rs +++ b/src/datanode/src/store.rs @@ -14,15 +14,10 @@ //! object storage utilities -use std::sync::Arc; - -use common_telemetry::info; -use object_store::config::ObjectStorageCacheConfig; +use common_telemetry::{info, warn}; use object_store::factory::new_raw_object_store; -use object_store::layers::LruCacheLayer; -use object_store::services::Fs; use object_store::util::{clean_temp_dir, join_dir, with_instrument_layers, with_retry_layers}; -use object_store::{ATOMIC_WRITE_DIR, Access, ObjectStore, ObjectStoreBuilder}; +use object_store::{ATOMIC_WRITE_DIR, ObjectStore}; use snafu::prelude::*; use crate::config::ObjectStoreConfig; @@ -47,23 +42,58 @@ pub(crate) async fn new_object_store_without_cache( Ok(object_store) } +/// Cleans up old LRU read cache directories that were removed. 
+fn clean_old_read_cache(store: &ObjectStoreConfig, data_home: &str) { + if !store.is_object_storage() { + return; + } + + let Some(cache_config) = store.cache_config() else { + return; + }; + + // Only cleans if read cache was enabled + if !cache_config.enable_read_cache { + return; + } + + let cache_base_dir = if cache_config.cache_path.is_empty() { + data_home + } else { + &cache_config.cache_path + }; + + // Cleans up the old read cache directory + let old_read_cache_dir = join_dir(cache_base_dir, "cache/object/read"); + info!( + "Cleaning up old read cache directory: {}", + old_read_cache_dir + ); + if let Err(e) = clean_temp_dir(&old_read_cache_dir) { + warn!(e; "Failed to clean old read cache directory {}", old_read_cache_dir); + } + + // Cleans up the atomic temp dir used by the cache layer + let cache_atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR); + info!( + "Cleaning up old cache atomic temp directory: {}", + cache_atomic_temp_dir + ); + if let Err(e) = clean_temp_dir(&cache_atomic_temp_dir) { + warn!(e; "Failed to clean old cache atomic temp directory {}", cache_atomic_temp_dir); + } +} + pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Result { + // Cleans up old LRU read cache directories. + // TODO: Remove this line after the 1.0 release. + clean_old_read_cache(&store, data_home); + let object_store = new_raw_object_store(&store, data_home) .await .context(error::ObjectStoreSnafu)?; - // Enable retry layer and cache layer for non-fs object storages + // Enables retry layer for non-fs object storages let object_store = if store.is_object_storage() { - let object_store = { - // It's safe to unwrap here because we already checked above. - let cache_config = store.cache_config().unwrap(); - if let Some(cache_layer) = build_cache_layer(cache_config, data_home).await? { - // Adds cache layer - object_store.layer(cache_layer) - } else { - object_store - } - }; - // Adds retry layer with_retry_layers(object_store) } else { @@ -73,40 +103,3 @@ pub async fn new_object_store(store: ObjectStoreConfig, data_home: &str) -> Resu let object_store = with_instrument_layers(object_store, true); Ok(object_store) } - -async fn build_cache_layer( - cache_config: &ObjectStorageCacheConfig, - data_home: &str, -) -> Result>> { - // No need to build cache layer if read cache is disabled. 
- if !cache_config.enable_read_cache { - return Ok(None); - } - let cache_base_dir = if cache_config.cache_path.is_empty() { - data_home - } else { - &cache_config.cache_path - }; - let atomic_temp_dir = join_dir(cache_base_dir, ATOMIC_WRITE_DIR); - clean_temp_dir(&atomic_temp_dir).context(error::ObjectStoreSnafu)?; - - let cache_store = Fs::default() - .root(cache_base_dir) - .atomic_write_dir(&atomic_temp_dir) - .build() - .context(error::BuildCacheStoreSnafu)?; - - let cache_layer = LruCacheLayer::new( - Arc::new(cache_store), - cache_config.cache_capacity.0 as usize, - ) - .context(error::BuildCacheStoreSnafu)?; - cache_layer.recover_cache(false).await; - - info!( - "Enabled local object storage cache, path: {}, capacity: {}.", - cache_config.cache_path, cache_config.cache_capacity - ); - - Ok(Some(cache_layer)) -} diff --git a/src/datanode/src/tests.rs b/src/datanode/src/tests.rs index 5f7db4d928..3fe4954aea 100644 --- a/src/datanode/src/tests.rs +++ b/src/datanode/src/tests.rs @@ -24,8 +24,8 @@ use common_query::Output; use common_runtime::Runtime; use common_runtime::runtime::{BuilderBuild, RuntimeTrait}; use datafusion::catalog::TableFunction; +use datafusion::dataframe::DataFrame; use datafusion_expr::{AggregateUDF, LogicalPlan}; -use query::dataframe::DataFrame; use query::planner::LogicalPlanner; use query::query_engine::{DescribeResult, QueryEngineState}; use query::{QueryEngine, QueryEngineContext}; @@ -33,7 +33,8 @@ use servers::grpc::FlightCompression; use session::context::QueryContextRef; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ - RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, + CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole, + RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, }; use store_api::region_request::{AffectedRows, RegionRequest}; @@ -291,6 +292,21 @@ impl RegionEngine for MockRegionEngine { unimplemented!() } + async fn remap_manifests( + &self, + _request: RemapManifestsRequest, + ) -> Result { + unimplemented!() + } + + async fn copy_region_from( + &self, + _region_id: RegionId, + _request: CopyRegionFromRequest, + ) -> Result { + unimplemented!() + } + fn as_any(&self) -> &dyn Any { self } diff --git a/src/datatypes/src/arrow_array.rs b/src/datatypes/src/arrow_array.rs index 97aa299fad..ac5e6444af 100644 --- a/src/datatypes/src/arrow_array.rs +++ b/src/datatypes/src/arrow_array.rs @@ -12,9 +12,117 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use arrow::array::{ArrayRef, AsArray}; +use arrow::datatypes::{ + DataType, DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType, + DurationSecondType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, + Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, +}; +use common_time::time::Time; +use common_time::{Duration, Timestamp}; + pub type BinaryArray = arrow::array::BinaryArray; pub type MutableBinaryArray = arrow::array::BinaryBuilder; pub type StringArray = arrow::array::StringArray; pub type MutableStringArray = arrow::array::StringBuilder; pub type LargeStringArray = arrow::array::LargeStringArray; pub type MutableLargeStringArray = arrow::array::LargeStringBuilder; + +/// Get the [Timestamp] value at index `i` of the timestamp array. +/// +/// Note: This method does not check for nulls and the value is arbitrary +/// if [`is_null`](arrow::array::Array::is_null) returns true for the index. +/// +/// # Panics +/// 1. if index `i` is out of bounds; +/// 2. or the array is not timestamp type. +pub fn timestamp_array_value(array: &ArrayRef, i: usize) -> Timestamp { + let DataType::Timestamp(time_unit, _) = &array.data_type() else { + unreachable!() + }; + let v = match time_unit { + TimeUnit::Second => { + let array = array.as_primitive::(); + array.value(i) + } + TimeUnit::Millisecond => { + let array = array.as_primitive::(); + array.value(i) + } + TimeUnit::Microsecond => { + let array = array.as_primitive::(); + array.value(i) + } + TimeUnit::Nanosecond => { + let array = array.as_primitive::(); + array.value(i) + } + }; + Timestamp::new(v, time_unit.into()) +} + +/// Get the [Time] value at index `i` of the time array. +/// +/// Note: This method does not check for nulls and the value is arbitrary +/// if [`is_null`](arrow::array::Array::is_null) returns true for the index. +/// +/// # Panics +/// 1. if index `i` is out of bounds; +/// 2. or the array is not `Time32` or `Time64` type. +pub fn time_array_value(array: &ArrayRef, i: usize) -> Time { + match array.data_type() { + DataType::Time32(time_unit) | DataType::Time64(time_unit) => match time_unit { + TimeUnit::Second => { + let array = array.as_primitive::(); + Time::new_second(array.value(i) as i64) + } + TimeUnit::Millisecond => { + let array = array.as_primitive::(); + Time::new_millisecond(array.value(i) as i64) + } + TimeUnit::Microsecond => { + let array = array.as_primitive::(); + Time::new_microsecond(array.value(i)) + } + TimeUnit::Nanosecond => { + let array = array.as_primitive::(); + Time::new_nanosecond(array.value(i)) + } + }, + _ => unreachable!(), + } +} + +/// Get the [Duration] value at index `i` of the duration array. +/// +/// Note: This method does not check for nulls and the value is arbitrary +/// if [`is_null`](arrow::array::Array::is_null) returns true for the index. +/// +/// # Panics +/// 1. if index `i` is out of bounds; +/// 2. or the array is not duration type. 
+pub fn duration_array_value(array: &ArrayRef, i: usize) -> Duration {
+    let DataType::Duration(time_unit) = array.data_type() else {
+        unreachable!();
+    };
+    let v = match time_unit {
+        TimeUnit::Second => {
+            let array = array.as_primitive::<DurationSecondType>();
+            array.value(i)
+        }
+        TimeUnit::Millisecond => {
+            let array = array.as_primitive::<DurationMillisecondType>();
+            array.value(i)
+        }
+        TimeUnit::Microsecond => {
+            let array = array.as_primitive::<DurationMicrosecondType>();
+            array.value(i)
+        }
+        TimeUnit::Nanosecond => {
+            let array = array.as_primitive::<DurationNanosecondType>();
+            array.value(i)
+        }
+    };
+    Duration::new(v, time_unit.into())
+}
diff --git a/src/datatypes/src/data_type.rs b/src/datatypes/src/data_type.rs
index 4f5e8ab531..25fd095a9f 100644
--- a/src/datatypes/src/data_type.rs
+++ b/src/datatypes/src/data_type.rs
@@ -15,7 +15,6 @@
 use std::fmt;
 use std::sync::Arc;
 
-use arrow::compute::cast as arrow_array_cast;
 use arrow::datatypes::{
     DataType as ArrowDataType, IntervalUnit as ArrowIntervalUnit, TimeUnit as ArrowTimeUnit,
 };
@@ -33,8 +32,8 @@ use crate::types::{
     BinaryType, BooleanType, DateType, Decimal128Type, DictionaryType, DurationMicrosecondType,
     DurationMillisecondType, DurationNanosecondType, DurationSecondType, DurationType, Float32Type,
     Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTimeType,
-    IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonFormat, JsonType, ListType,
-    NullType, StringType, StructType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
+    IntervalMonthDayNanoType, IntervalType, IntervalYearMonthType, JsonType, ListType, NullType,
+    StringType, StructType, TimeMillisecondType, TimeType, TimestampMicrosecondType,
     TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, TimestampType,
     UInt8Type, UInt16Type, UInt32Type, UInt64Type, VectorType,
 };
@@ -368,8 +367,10 @@ impl ConcreteDataType {
 
     /// Checks if the data type can cast to another data type.
     pub fn can_arrow_type_cast_to(&self, to_type: &ConcreteDataType) -> bool {
-        let array = arrow_array::new_empty_array(&self.as_arrow_type());
-        arrow_array_cast(array.as_ref(), &to_type.as_arrow_type()).is_ok()
+        match (self, to_type) {
+            (ConcreteDataType::Json(this), ConcreteDataType::Json(that)) => that.is_include(this),
+            _ => arrow::compute::can_cast_types(&self.as_arrow_type(), &to_type.as_arrow_type()),
+        }
     }
 
     /// Try to cast data type as a [`DurationType`].
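A usage sketch, not part of the patch, for the array accessors added in src/datatypes/src/arrow_array.rs above. The `datatypes::arrow_array` import path and the `Timestamp::new_millisecond` constructor are assumptions about the existing crates.

use std::sync::Arc;

use arrow::array::{ArrayRef, TimestampMillisecondArray};
use common_time::Timestamp;
use datatypes::arrow_array::timestamp_array_value;

fn read_back_timestamp() {
    // Build a millisecond-precision timestamp array and read back index 1.
    let array: ArrayRef = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000, 3000]));
    let ts = timestamp_array_value(&array, 1);
    assert_eq!(ts, Timestamp::new_millisecond(2000));
}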
@@ -677,7 +678,7 @@ impl ConcreteDataType { } pub fn json_native_datatype(inner_type: ConcreteDataType) -> ConcreteDataType { - ConcreteDataType::Json(JsonType::new(JsonFormat::Native(Box::new(inner_type)))) + ConcreteDataType::Json(JsonType::new_native((&inner_type).into())) } } diff --git a/src/datatypes/src/error.rs b/src/datatypes/src/error.rs index fe3ee99b63..65aca699ec 100644 --- a/src/datatypes/src/error.rs +++ b/src/datatypes/src/error.rs @@ -189,7 +189,7 @@ pub enum Error { location: Location, }, - #[snafu(display("Invalid JSON text: {}", value))] + #[snafu(display("Invalid JSON: {}", value))] InvalidJson { value: String, #[snafu(implicit)] diff --git a/src/datatypes/src/extension/json.rs b/src/datatypes/src/extension/json.rs index bd3bd94712..abc75bb35b 100644 --- a/src/datatypes/src/extension/json.rs +++ b/src/datatypes/src/extension/json.rs @@ -15,7 +15,7 @@ use std::sync::Arc; use arrow_schema::extension::ExtensionType; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::{ArrowError, DataType, FieldRef}; use serde::{Deserialize, Serialize}; use crate::json::JsonStructureSettings; @@ -102,3 +102,8 @@ impl ExtensionType for JsonExtensionType { Ok(json) } } + +/// Check if this field is to be treated as json extension type. +pub fn is_json_extension_type(field: &FieldRef) -> bool { + field.extension_type_name() == Some(JsonExtensionType::NAME) +} diff --git a/src/datatypes/src/json.rs b/src/datatypes/src/json.rs index 64952bb39a..b12c63c479 100644 --- a/src/datatypes/src/json.rs +++ b/src/datatypes/src/json.rs @@ -19,18 +19,19 @@ //! The struct will carry all the fields of the Json object. We will not flatten any json object in this implementation. //! -use std::collections::HashSet; +pub mod value; + +use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; -use common_base::bytes::StringBytes; -use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value as Json}; use snafu::{ResultExt, ensure}; -use crate::data_type::{ConcreteDataType, DataType}; use crate::error::{self, Error}; -use crate::types::{ListType, StructField, StructType}; +use crate::json::value::{JsonValue, JsonVariant}; +use crate::types::json_type::{JsonNativeType, JsonNumberType, JsonObjectType}; +use crate::types::{StructField, StructType}; use crate::value::{ListValue, StructValue, Value}; /// The configuration of JSON encoding @@ -102,7 +103,7 @@ impl JsonStructureSettings { pub fn encode_with_type( &self, json: Json, - data_type: Option<&ConcreteDataType>, + data_type: Option<&JsonNativeType>, ) -> Result { let context = JsonContext { key_path: String::new(), @@ -146,70 +147,65 @@ impl<'a> JsonContext<'a> { /// Main encoding function with key path tracking pub fn encode_json_with_context<'a>( json: Json, - data_type: Option<&ConcreteDataType>, + data_type: Option<&JsonNativeType>, context: &JsonContext<'a>, -) -> Result { +) -> Result { // Check if the entire encoding should be unstructured if matches!(context.settings, JsonStructureSettings::UnstructuredRaw) { let json_string = json.to_string(); - let struct_value = StructValue::try_new( - vec![Value::String(json_string.into())], - StructType::new(Arc::new(vec![StructField::new( - JsonStructureSettings::RAW_FIELD.to_string(), - ConcreteDataType::string_datatype(), - true, - )])), - )?; - return Ok(Value::Struct(struct_value)); + return Ok([(JsonStructureSettings::RAW_FIELD, json_string)].into()); } // Check if current key should be treated as unstructured if context.is_unstructured_key() { - return 
Ok(Value::String(json.to_string().into())); + return Ok(json.to_string().into()); } match json { Json::Object(json_object) => { - ensure!( - matches!(data_type, Some(ConcreteDataType::Struct(_)) | None), - error::InvalidJsonSnafu { - value: "JSON object can only be encoded to Struct type".to_string(), + let object_type = match data_type.as_ref() { + Some(JsonNativeType::Object(x)) => Some(x), + None => None, + _ => { + return error::InvalidJsonSnafu { + value: "JSON object value must be encoded with object type", + } + .fail(); } - ); - - let data_type = data_type.and_then(|x| x.as_struct()); - let struct_value = encode_json_object_with_context(json_object, data_type, context)?; - Ok(Value::Struct(struct_value)) + }; + encode_json_object_with_context(json_object, object_type, context) } Json::Array(json_array) => { - let item_type = if let Some(ConcreteDataType::List(list_type)) = data_type { - Some(list_type.item_type()) - } else { - None + let item_type = match data_type.as_ref() { + Some(JsonNativeType::Array(x)) => Some(x.as_ref()), + None => None, + _ => { + return error::InvalidJsonSnafu { + value: "JSON array value must be encoded with array type", + } + .fail(); + } }; - let list_value = encode_json_array_with_context(json_array, item_type, context)?; - Ok(Value::List(list_value)) + encode_json_array_with_context(json_array, item_type, context) } _ => { // For non-collection types, verify type compatibility if let Some(expected_type) = data_type { - let (value, actual_type) = - encode_json_value_with_context(json, Some(expected_type), context)?; - if &actual_type == expected_type { + let value = encode_json_value_with_context(json, Some(expected_type), context)?; + let actual_type = value.json_type().native_type(); + if actual_type == expected_type { Ok(value) } else { Err(error::InvalidJsonSnafu { value: format!( "JSON value type {} does not match expected type {}", - actual_type.name(), - expected_type.name() + actual_type, expected_type ), } .build()) } } else { - let (value, _) = encode_json_value_with_context(json, None, context)?; - Ok(value) + encode_json_value_with_context(json, None, context) } } } @@ -217,31 +213,21 @@ pub fn encode_json_with_context<'a>( fn encode_json_object_with_context<'a>( mut json_object: Map, - fields: Option<&StructType>, + fields: Option<&JsonObjectType>, context: &JsonContext<'a>, -) -> Result { - let total_json_keys = json_object.len(); - let mut items = Vec::with_capacity(total_json_keys); - let mut struct_fields = Vec::with_capacity(total_json_keys); +) -> Result { + let mut object = BTreeMap::new(); // First, process fields from the provided schema in their original order if let Some(fields) = fields { - for field in fields.fields().iter() { - let field_name = field.name(); - + for (field_name, field_type) in fields { if let Some(value) = json_object.remove(field_name) { let field_context = context.with_key(field_name); - let (value, data_type) = - encode_json_value_with_context(value, Some(field.data_type()), &field_context)?; - items.push(value); - struct_fields.push(StructField::new( - field_name.to_string(), - data_type, - true, // JSON fields are always nullable - )); + let value = + encode_json_value_with_context(value, Some(field_type), &field_context)?; + object.insert(field_name.clone(), value.into_variant()); } else { // Field exists in schema but not in JSON - add null value - items.push(Value::Null); - struct_fields.push(field.clone()); + object.insert(field_name.clone(), ().into()); } } } @@ -250,139 +236,111 @@ fn 
encode_json_object_with_context<'a>( for (key, value) in json_object { let field_context = context.with_key(&key); - let (value, data_type) = encode_json_value_with_context(value, None, &field_context)?; - items.push(value); + let value = encode_json_value_with_context(value, None, &field_context)?; - struct_fields.push(StructField::new( - key.clone(), - data_type, - true, // JSON fields are always nullable - )); + object.insert(key, value.into_variant()); } - let struct_type = StructType::new(Arc::new(struct_fields)); - StructValue::try_new(items, struct_type) + Ok(JsonValue::new(JsonVariant::Object(object))) } fn encode_json_array_with_context<'a>( json_array: Vec, - item_type: Option<&ConcreteDataType>, + item_type: Option<&JsonNativeType>, context: &JsonContext<'a>, -) -> Result { +) -> Result { let json_array_len = json_array.len(); let mut items = Vec::with_capacity(json_array_len); - let mut element_type = None; + let mut element_type = item_type.cloned(); for (index, value) in json_array.into_iter().enumerate() { let array_context = context.with_key(&index.to_string()); - let (item_value, item_type) = - encode_json_value_with_context(value, item_type, &array_context)?; - items.push(item_value); + let item_value = + encode_json_value_with_context(value, element_type.as_ref(), &array_context)?; + let item_type = item_value.json_type().native_type().clone(); + items.push(item_value.into_variant()); // Determine the common type for the list if let Some(current_type) = &element_type { - // For now, we'll use the first non-null type we encounter - // In a more sophisticated implementation, we might want to find a common supertype - if *current_type == ConcreteDataType::null_datatype() - && item_type != ConcreteDataType::null_datatype() - { - element_type = Some(item_type); - } + // It's valid for json array to have different types of items, for example, + // ["a string", 1]. However, the `JsonValue` will be converted to Arrow list array, + // which requires all items have exactly same type. So we forbid the different types + // case here. Besides, it's not common for items in a json array to differ. So I think + // we are good here. 
+ ensure!( + item_type == *current_type, + error::InvalidJsonSnafu { + value: "all items in json array must have the same type" + } + ); } else { element_type = Some(item_type); } } - // Use provided item_type if available, otherwise determine from elements - let element_type = if let Some(item_type) = item_type { - item_type.clone() - } else { - element_type.unwrap_or_else(ConcreteDataType::string_datatype) - }; - - Ok(ListValue::new(items, Arc::new(element_type))) + Ok(JsonValue::new(JsonVariant::Array(items))) } /// Helper function to encode a JSON value to a Value and determine its ConcreteDataType with context fn encode_json_value_with_context<'a>( json: Json, - expected_type: Option<&ConcreteDataType>, + expected_type: Option<&JsonNativeType>, context: &JsonContext<'a>, -) -> Result<(Value, ConcreteDataType), Error> { +) -> Result { // Check if current key should be treated as unstructured if context.is_unstructured_key() { - return Ok(( - Value::String(json.to_string().into()), - ConcreteDataType::string_datatype(), - )); + return Ok(json.to_string().into()); } match json { - Json::Null => Ok((Value::Null, ConcreteDataType::null_datatype())), - Json::Bool(b) => Ok((Value::Boolean(b), ConcreteDataType::boolean_datatype())), + Json::Null => Ok(JsonValue::null()), + Json::Bool(b) => Ok(b.into()), Json::Number(n) => { if let Some(i) = n.as_i64() { // Use int64 for all integer numbers when possible if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(i, expected) { - return Ok((value, expected.clone())); + return Ok(value); } - Ok((Value::Int64(i), ConcreteDataType::int64_datatype())) + Ok(i.into()) } else if let Some(u) = n.as_u64() { // Use int64 for unsigned integers that fit, otherwise use u64 if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(u, expected) { - return Ok((value, expected.clone())); + return Ok(value); } if u <= i64::MAX as u64 { - Ok((Value::Int64(u as i64), ConcreteDataType::int64_datatype())) + Ok((u as i64).into()) } else { - Ok((Value::UInt64(u), ConcreteDataType::uint64_datatype())) + Ok(u.into()) } } else if let Some(f) = n.as_f64() { // Try to use the expected type if provided if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(f, expected) { - return Ok((value, expected.clone())); + return Ok(value); } // Default to f64 for floating point numbers - Ok(( - Value::Float64(OrderedFloat(f)), - ConcreteDataType::float64_datatype(), - )) + Ok(f.into()) } else { // Fallback to string representation - Ok(( - Value::String(StringBytes::from(n.to_string())), - ConcreteDataType::string_datatype(), - )) + Ok(n.to_string().into()) } } Json::String(s) => { if let Some(expected) = expected_type && let Ok(value) = try_convert_to_expected_type(s.as_str(), expected) { - return Ok((value, expected.clone())); + return Ok(value); } - Ok(( - Value::String(StringBytes::from(s.clone())), - ConcreteDataType::string_datatype(), - )) - } - Json::Array(arr) => { - let list_value = encode_json_array_with_context(arr, expected_type, context)?; - let datatype = ConcreteDataType::List(ListType::new(list_value.datatype())); - Ok((Value::List(list_value), datatype)) - } - Json::Object(obj) => { - let struct_value = encode_json_object_with_context(obj, None, context)?; - let data_type = ConcreteDataType::Struct(struct_value.struct_type().clone()); - Ok((Value::Struct(struct_value), data_type)) + Ok(s.into()) } + Json::Array(arr) => encode_json_array_with_context(arr, expected_type, context), + 
Json::Object(obj) => encode_json_object_with_context(obj, None, context), } } @@ -402,7 +360,6 @@ pub fn decode_value_with_context<'a>( } match value { - Value::Json(inner) => decode_value_with_context(*inner, context), Value::Struct(struct_value) => decode_struct_with_context(struct_value, context), Value::List(list_value) => decode_list_with_context(list_value, context), _ => decode_primitive_value(value), @@ -569,11 +526,13 @@ fn decode_struct_with_settings<'a>( key_path: field_context.key_path.clone(), settings: &JsonStructureSettings::Structured(None), }; - let (decoded_value, data_type) = encode_json_value_with_context( + let decoded_value = encode_json_value_with_context( json_value, None, // Don't force a specific type, let it be inferred from JSON &structured_context, - )?; + )? + .into_value(); + let data_type = decoded_value.data_type(); items.push(decoded_value); struct_fields.push(StructField::new( @@ -651,8 +610,9 @@ fn decode_unstructured_raw_struct(struct_value: StructValue) -> Result Result( value: T, - expected_type: &ConcreteDataType, -) -> Result + expected_type: &JsonNativeType, +) -> Result where - T: Into, + T: Into, { let value = value.into(); - expected_type.try_cast(value.clone()).ok_or_else(|| { + let cast_error = || { error::CastTypeSnafu { - msg: format!( - "Cannot cast from {} to {}", - value.data_type().name(), - expected_type.name() - ), + msg: format!("Cannot cast value {value} to {expected_type}"), } - .build() - }) + .fail() + }; + let actual_type = value.json_type().native_type(); + match (actual_type, expected_type) { + (x, y) if x == y => Ok(value), + (JsonNativeType::Number(x), JsonNativeType::Number(y)) => match (x, y) { + (JsonNumberType::U64, JsonNumberType::I64) => { + if let Some(i) = value.as_i64() { + Ok(i.into()) + } else { + cast_error() + } + } + (JsonNumberType::I64, JsonNumberType::U64) => { + if let Some(i) = value.as_u64() { + Ok(i.into()) + } else { + cast_error() + } + } + (_, JsonNumberType::F64) => { + if let Some(f) = value.as_f64() { + Ok(f.into()) + } else { + cast_error() + } + } + _ => cast_error(), + }, + (_, JsonNativeType::String) => Ok(value.to_string().into()), + _ => cast_error(), + } } #[cfg(test)] @@ -702,6 +688,7 @@ mod tests { use serde_json::json; use super::*; + use crate::data_type::ConcreteDataType; use crate::types::ListType; #[test] @@ -898,15 +885,15 @@ mod tests { let json = Json::from(42); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json.clone(), Some(&ConcreteDataType::int8_datatype())) + .encode_with_type(json.clone(), Some(&JsonNativeType::u64())) .unwrap() .into_json_inner() .unwrap(); - assert_eq!(result, Value::Int8(42)); + assert_eq!(result, Value::UInt64(42)); // Test with expected string type let result = settings - .encode_with_type(json, Some(&ConcreteDataType::string_datatype())) + .encode_with_type(json, Some(&JsonNativeType::String)) .unwrap() .into_json_inner() .unwrap(); @@ -917,23 +904,11 @@ mod tests { fn test_encode_json_array_mixed_types() { let json = json!([1, "hello", true, 3.15]); let settings = JsonStructureSettings::Structured(None); - let result = settings - .encode_with_type(json, None) - .unwrap() - .into_json_inner() - .unwrap(); - - if let Value::List(list_value) = result { - assert_eq!(list_value.items().len(), 4); - // The first non-null type should determine the list type - // In this case, it should be string since we can't find a common numeric type - assert_eq!( - list_value.datatype(), - 
Arc::new(ConcreteDataType::int64_datatype()) - ); - } else { - panic!("Expected List value"); - } + let result = settings.encode_with_type(json, None); + assert_eq!( + result.unwrap_err().to_string(), + "Invalid JSON: all items in json array must have the same type" + ); } #[test] @@ -951,7 +926,7 @@ mod tests { // Empty arrays default to string type assert_eq!( list_value.datatype(), - Arc::new(ConcreteDataType::string_datatype()) + Arc::new(ConcreteDataType::null_datatype()) ); } else { panic!("Expected List value"); @@ -987,16 +962,10 @@ mod tests { }); // Define expected struct type - let fields = vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - ]; - let struct_type = StructType::new(Arc::new(fields)); - let concrete_type = ConcreteDataType::Struct(struct_type); + let concrete_type = JsonNativeType::Object(JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ])); let settings = JsonStructureSettings::Structured(None); let result = settings @@ -1008,15 +977,15 @@ mod tests { if let Value::Struct(struct_value) = result { assert_eq!(struct_value.items().len(), 2); let struct_fields = struct_value.struct_type().fields(); - assert_eq!(struct_fields[0].name(), "name"); + assert_eq!(struct_fields[0].name(), "age"); assert_eq!( struct_fields[0].data_type(), - &ConcreteDataType::string_datatype() + &ConcreteDataType::int64_datatype() ); - assert_eq!(struct_fields[1].name(), "age"); + assert_eq!(struct_fields[1].name(), "name"); assert_eq!( struct_fields[1].data_type(), - &ConcreteDataType::int64_datatype() + &ConcreteDataType::string_datatype() ); } else { panic!("Expected Struct value"); @@ -1032,34 +1001,24 @@ mod tests { }); // Define schema with specific field order - let fields = vec![ - StructField::new( - "a_field".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new( - "m_field".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new( - "z_field".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - ]; - let struct_type = StructType::new(Arc::new(fields)); + let json_type = JsonObjectType::from([ + ("a_field".to_string(), JsonNativeType::String), + ("m_field".to_string(), JsonNativeType::String), + ("z_field".to_string(), JsonNativeType::String), + ]); - let result = encode_json_object_with_context( + let Value::Struct(result) = encode_json_object_with_context( json.as_object().unwrap().clone(), - Some(&struct_type), + Some(&json_type), &JsonContext { key_path: String::new(), settings: &JsonStructureSettings::Structured(None), }, ) - .unwrap(); + .map(|x| x.into_value()) + .unwrap() else { + unreachable!() + }; // Verify field order is preserved from schema let struct_fields = result.struct_type().fields(); @@ -1083,37 +1042,35 @@ mod tests { }); // Define schema with only name and age - let fields = vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - ]; - let struct_type = StructType::new(Arc::new(fields)); + let json_type = JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ]); - let result = encode_json_object_with_context( + let Value::Struct(result) = encode_json_object_with_context( json.as_object().unwrap().clone(), - 
Some(&struct_type), + Some(&json_type), &JsonContext { key_path: String::new(), settings: &JsonStructureSettings::Structured(None), }, ) - .unwrap(); + .map(|x| x.into_value()) + .unwrap() else { + unreachable!() + }; - // Verify schema fields come first in order + // verify fields are sorted in json value let struct_fields = result.struct_type().fields(); - assert_eq!(struct_fields[0].name(), "name"); + assert_eq!(struct_fields[0].name(), "active"); assert_eq!(struct_fields[1].name(), "age"); - assert_eq!(struct_fields[2].name(), "active"); + assert_eq!(struct_fields[2].name(), "name"); // Verify values are correct let items = result.items(); - assert_eq!(items[0], Value::String("Alice".into())); + assert_eq!(items[0], Value::Boolean(true)); assert_eq!(items[1], Value::Int64(25)); - assert_eq!(items[2], Value::Boolean(true)); + assert_eq!(items[2], Value::String("Alice".into())); } #[test] @@ -1124,35 +1081,33 @@ mod tests { }); // Define schema with name and age - let fields = vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - ]; - let struct_type = StructType::new(Arc::new(fields)); + let json_type = JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ]); - let result = encode_json_object_with_context( + let Value::Struct(result) = encode_json_object_with_context( json.as_object().unwrap().clone(), - Some(&struct_type), + Some(&json_type), &JsonContext { key_path: String::new(), settings: &JsonStructureSettings::Structured(None), }, ) - .unwrap(); + .map(|x| x.into_value()) + .unwrap() else { + unreachable!() + }; // Verify both schema fields are present let struct_fields = result.struct_type().fields(); - assert_eq!(struct_fields[0].name(), "name"); - assert_eq!(struct_fields[1].name(), "age"); + assert_eq!(struct_fields[0].name(), "age"); + assert_eq!(struct_fields[1].name(), "name"); // Verify values - name has value, age is null let items = result.items(); - assert_eq!(items[0], Value::String("Bob".into())); - assert_eq!(items[1], Value::Null); + assert_eq!(items[0], Value::Null); + assert_eq!(items[1], Value::String("Bob".into())); } #[test] @@ -1175,21 +1130,22 @@ mod tests { #[test] fn test_encode_json_array_with_item_type() { let json = json!([1, 2, 3]); - let item_type = Arc::new(ConcreteDataType::int8_datatype()); - let list_type = ListType::new(item_type.clone()); - let concrete_type = ConcreteDataType::List(list_type); + let item_type = Arc::new(ConcreteDataType::uint64_datatype()); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json, Some(&concrete_type)) + .encode_with_type( + json, + Some(&JsonNativeType::Array(Box::new(JsonNativeType::u64()))), + ) .unwrap() .into_json_inner() .unwrap(); if let Value::List(list_value) = result { assert_eq!(list_value.items().len(), 3); - assert_eq!(list_value.items()[0], Value::Int8(1)); - assert_eq!(list_value.items()[1], Value::Int8(2)); - assert_eq!(list_value.items()[2], Value::Int8(3)); + assert_eq!(list_value.items()[0], Value::UInt64(1)); + assert_eq!(list_value.items()[1], Value::UInt64(2)); + assert_eq!(list_value.items()[2], Value::UInt64(3)); assert_eq!(list_value.datatype(), item_type); } else { panic!("Expected List value"); @@ -1199,12 +1155,13 @@ mod tests { #[test] fn test_encode_json_array_empty_with_item_type() { let json = json!([]); - let item_type = 
Arc::new(ConcreteDataType::string_datatype()); - let list_type = ListType::new(item_type.clone()); - let concrete_type = ConcreteDataType::List(list_type); + let item_type = Arc::new(ConcreteDataType::null_datatype()); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json, Some(&concrete_type)) + .encode_with_type( + json, + Some(&JsonNativeType::Array(Box::new(JsonNativeType::Null))), + ) .unwrap() .into_json_inner() .unwrap(); @@ -1219,6 +1176,7 @@ mod tests { #[cfg(test)] mod decode_tests { + use ordered_float::OrderedFloat; use serde_json::json; use super::*; @@ -1473,7 +1431,7 @@ mod tests { // Test encoding JSON number with expected int64 type let json = Json::from(42); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::int64_datatype())) + .encode_with_type(json, Some(&JsonNativeType::i64())) .unwrap() .into_json_inner() .unwrap(); @@ -1482,7 +1440,7 @@ mod tests { // Test encoding JSON string with expected string type let json = Json::String("hello".to_string()); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::string_datatype())) + .encode_with_type(json, Some(&JsonNativeType::String)) .unwrap() .into_json_inner() .unwrap(); @@ -1491,7 +1449,7 @@ mod tests { // Test encoding JSON boolean with expected boolean type let json = Json::Bool(true); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::boolean_datatype())) + .encode_with_type(json, Some(&JsonNativeType::Bool)) .unwrap() .into_json_inner() .unwrap(); @@ -1503,12 +1461,12 @@ mod tests { // Test encoding JSON number with mismatched string type let json = Json::from(42); let settings = JsonStructureSettings::Structured(None); - let result = settings.encode_with_type(json, Some(&ConcreteDataType::string_datatype())); + let result = settings.encode_with_type(json, Some(&JsonNativeType::String)); assert!(result.is_ok()); // Should succeed due to type conversion // Test encoding JSON object with mismatched non-struct type let json = json!({"name": "test"}); - let result = settings.encode_with_type(json, Some(&ConcreteDataType::int64_datatype())); + let result = settings.encode_with_type(json, Some(&JsonNativeType::i64())); assert!(result.is_err()); // Should fail - object can't be converted to int64 } @@ -1516,12 +1474,13 @@ mod tests { fn test_encode_json_array_with_list_type() { let json = json!([1, 2, 3]); let item_type = Arc::new(ConcreteDataType::int64_datatype()); - let list_type = ListType::new(item_type.clone()); - let concrete_type = ConcreteDataType::List(list_type); let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json, Some(&concrete_type)) + .encode_with_type( + json, + Some(&JsonNativeType::Array(Box::new(JsonNativeType::i64()))), + ) .unwrap() .into_json_inner() .unwrap(); @@ -1543,7 +1502,7 @@ mod tests { let json = Json::Null; let settings = JsonStructureSettings::Structured(None); let result = settings - .encode_with_type(json.clone(), Some(&ConcreteDataType::null_datatype())) + .encode_with_type(json.clone(), Some(&JsonNativeType::Null)) .unwrap() .into_json_inner() .unwrap(); @@ -1552,7 +1511,7 @@ mod tests { // Test float with float64 type let json = Json::from(3.15); let result = settings - .encode_with_type(json, Some(&ConcreteDataType::float64_datatype())) + .encode_with_type(json, Some(&JsonNativeType::f64())) .unwrap() .into_json_inner() .unwrap(); @@ -1644,20 +1603,11 @@ mod tests { } // Test with encode_with_type (with type) - let struct_type = 
StructType::new(Arc::new(vec![ - StructField::new( - "name".to_string(), - ConcreteDataType::string_datatype(), - true, - ), - StructField::new("age".to_string(), ConcreteDataType::int64_datatype(), true), - StructField::new( - "active".to_string(), - ConcreteDataType::boolean_datatype(), - true, - ), + let concrete_type = JsonNativeType::Object(JsonObjectType::from([ + ("name".to_string(), JsonNativeType::String), + ("age".to_string(), JsonNativeType::i64()), + ("active".to_string(), JsonNativeType::Bool), ])); - let concrete_type = ConcreteDataType::Struct(struct_type); let result2 = settings .encode_with_type(json, Some(&concrete_type)) @@ -2153,20 +2103,11 @@ mod tests { )])), ); - let decoded_struct = settings.decode_struct(array_struct).unwrap(); - let fields = decoded_struct.struct_type().fields(); - let decoded_fields: Vec<&str> = fields.iter().map(|f| f.name()).collect(); - assert!(decoded_fields.contains(&"value")); - - if let Value::List(list_value) = &decoded_struct.items()[0] { - assert_eq!(list_value.items().len(), 4); - assert_eq!(list_value.items()[0], Value::Int64(1)); - assert_eq!(list_value.items()[1], Value::String("hello".into())); - assert_eq!(list_value.items()[2], Value::Boolean(true)); - assert_eq!(list_value.items()[3], Value::Float64(OrderedFloat(3.15))); - } else { - panic!("Expected array to be decoded as ListValue"); - } + let decoded_struct = settings.decode_struct(array_struct); + assert_eq!( + decoded_struct.unwrap_err().to_string(), + "Invalid JSON: all items in json array must have the same type" + ); } #[test] diff --git a/src/datatypes/src/json/value.rs b/src/datatypes/src/json/value.rs new file mode 100644 index 0000000000..acff194e12 --- /dev/null +++ b/src/datatypes/src/json/value.rs @@ -0,0 +1,691 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; +use std::fmt::{Display, Formatter}; +use std::hash::{Hash, Hasher}; +use std::sync::{Arc, OnceLock}; + +use num_traits::ToPrimitive; +use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; +use serde_json::Number; + +use crate::data_type::ConcreteDataType; +use crate::types::json_type::JsonNativeType; +use crate::types::{JsonType, StructField, StructType}; +use crate::value::{ListValue, ListValueRef, StructValue, StructValueRef, Value, ValueRef}; + +/// Number in json, can be a positive integer, a negative integer, or a floating number. +/// Each of which is represented as `u64`, `i64` and `f64`. +/// +/// This follows how `serde_json` designs number. 
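+///
+/// A small sketch (added for clarity, not part of the original patch) of how the
+/// `From` impls below pick a variant:
+/// ```ignore
+/// let p: JsonNumber = 3u64.into();     // JsonNumber::PosInt(3)
+/// let n: JsonNumber = (-3i64).into();  // JsonNumber::NegInt(-3)
+/// let f: JsonNumber = 0.5f64.into();   // JsonNumber::Float(OrderedFloat(0.5))
+/// ```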
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum JsonNumber { + PosInt(u64), + NegInt(i64), + Float(OrderedFloat), +} + +impl JsonNumber { + fn as_u64(&self) -> Option { + match self { + JsonNumber::PosInt(n) => Some(*n), + JsonNumber::NegInt(n) => (*n >= 0).then_some(*n as u64), + _ => None, + } + } + + fn as_i64(&self) -> Option { + match self { + JsonNumber::PosInt(n) => (*n <= i64::MAX as u64).then_some(*n as i64), + JsonNumber::NegInt(n) => Some(*n), + _ => None, + } + } + + fn as_f64(&self) -> f64 { + match self { + JsonNumber::PosInt(n) => *n as f64, + JsonNumber::NegInt(n) => *n as f64, + JsonNumber::Float(n) => n.0, + } + } +} + +impl From for JsonNumber { + fn from(i: u64) -> Self { + Self::PosInt(i) + } +} + +impl From for JsonNumber { + fn from(n: i64) -> Self { + Self::NegInt(n) + } +} + +impl From for JsonNumber { + fn from(i: f64) -> Self { + Self::Float(i.into()) + } +} + +impl Display for JsonNumber { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::PosInt(x) => write!(f, "{x}"), + Self::NegInt(x) => write!(f, "{x}"), + Self::Float(x) => write!(f, "{x}"), + } + } +} + +/// Variants of json. +/// +/// This follows how [serde_json::Value] designs except that we only choose to use [BTreeMap] to +/// preserve the fields order by their names in the json object. (By default `serde_json` uses +/// [BTreeMap], too. But it additionally supports "IndexMap" which preserves the order by insertion +/// times of fields.) +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum JsonVariant { + Null, + Bool(bool), + Number(JsonNumber), + String(String), + Array(Vec), + Object(BTreeMap), +} + +impl JsonVariant { + fn native_type(&self) -> JsonNativeType { + match self { + JsonVariant::Null => JsonNativeType::Null, + JsonVariant::Bool(_) => JsonNativeType::Bool, + JsonVariant::Number(n) => match n { + JsonNumber::PosInt(_) => JsonNativeType::u64(), + JsonNumber::NegInt(_) => JsonNativeType::i64(), + JsonNumber::Float(_) => JsonNativeType::f64(), + }, + JsonVariant::String(_) => JsonNativeType::String, + JsonVariant::Array(array) => { + let item_type = if let Some(first) = array.first() { + first.native_type() + } else { + JsonNativeType::Null + }; + JsonNativeType::Array(Box::new(item_type)) + } + JsonVariant::Object(object) => JsonNativeType::Object( + object + .iter() + .map(|(k, v)| (k.clone(), v.native_type())) + .collect(), + ), + } + } + + fn json_type(&self) -> JsonType { + JsonType::new_native(self.native_type()) + } + + fn as_ref(&self) -> JsonVariantRef<'_> { + match self { + JsonVariant::Null => JsonVariantRef::Null, + JsonVariant::Bool(x) => (*x).into(), + JsonVariant::Number(x) => match x { + JsonNumber::PosInt(i) => (*i).into(), + JsonNumber::NegInt(i) => (*i).into(), + JsonNumber::Float(f) => (f.0).into(), + }, + JsonVariant::String(x) => x.as_str().into(), + JsonVariant::Array(array) => { + JsonVariantRef::Array(array.iter().map(|x| x.as_ref()).collect()) + } + JsonVariant::Object(object) => JsonVariantRef::Object( + object + .iter() + .map(|(k, v)| (k.as_str(), v.as_ref())) + .collect(), + ), + } + } +} + +impl From<()> for JsonVariant { + fn from(_: ()) -> Self { + Self::Null + } +} + +impl From for JsonVariant { + fn from(v: bool) -> Self { + Self::Bool(v) + } +} + +impl> From for JsonVariant { + fn from(v: T) -> Self { + Self::Number(v.into()) + } +} + +impl From<&str> for JsonVariant { + fn from(v: &str) -> Self { + Self::String(v.to_string()) + } +} + +impl From for 
JsonVariant { + fn from(v: String) -> Self { + Self::String(v) + } +} + +impl> From<[T; N]> for JsonVariant { + fn from(vs: [T; N]) -> Self { + Self::Array(vs.into_iter().map(|x| x.into()).collect()) + } +} + +impl, V: Into, const N: usize> From<[(K, V); N]> for JsonVariant { + fn from(vs: [(K, V); N]) -> Self { + Self::Object(vs.into_iter().map(|(k, v)| (k.into(), v.into())).collect()) + } +} + +impl Display for JsonVariant { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Null => write!(f, "null"), + Self::Bool(x) => write!(f, "{x}"), + Self::Number(x) => write!(f, "{x}"), + Self::String(x) => write!(f, "{x}"), + Self::Array(array) => write!( + f, + "[{}]", + array + .iter() + .map(|x| x.to_string()) + .collect::>() + .join(", ") + ), + Self::Object(object) => { + write!( + f, + "{{ {} }}", + object + .iter() + .map(|(k, v)| format!("{k}: {v}")) + .collect::>() + .join(", ") + ) + } + } + } +} + +/// Represents any valid JSON value. +#[derive(Debug, Eq, Serialize, Deserialize)] +pub struct JsonValue { + #[serde(skip)] + json_type: OnceLock, + json_variant: JsonVariant, +} + +impl JsonValue { + pub fn null() -> Self { + ().into() + } + + pub(crate) fn new(json_variant: JsonVariant) -> Self { + Self { + json_type: OnceLock::new(), + json_variant, + } + } + + pub(crate) fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::Json(self.json_type().clone()) + } + + pub fn json_type(&self) -> &JsonType { + self.json_type.get_or_init(|| self.json_variant.json_type()) + } + + pub(crate) fn is_null(&self) -> bool { + matches!(self.json_variant, JsonVariant::Null) + } + + /// Check if this JSON value is an empty object. + pub fn is_empty_object(&self) -> bool { + match &self.json_variant { + JsonVariant::Object(object) => object.is_empty(), + _ => false, + } + } + + pub(crate) fn as_i64(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => n.as_i64(), + _ => None, + } + } + + pub(crate) fn as_u64(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => n.as_u64(), + _ => None, + } + } + + pub(crate) fn as_f64(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => Some(n.as_f64()), + _ => None, + } + } + + pub(crate) fn as_f64_lossy(&self) -> Option { + match self.json_variant { + JsonVariant::Number(n) => Some(match n { + JsonNumber::PosInt(i) => i as f64, + JsonNumber::NegInt(i) => i as f64, + JsonNumber::Float(f) => f.0, + }), + _ => None, + } + } + + pub(crate) fn as_bool(&self) -> Option { + match self.json_variant { + JsonVariant::Bool(b) => Some(b), + _ => None, + } + } + + pub fn as_ref(&self) -> JsonValueRef<'_> { + JsonValueRef { + json_type: OnceLock::new(), + json_variant: self.json_variant.as_ref(), + } + } + + pub fn into_variant(self) -> JsonVariant { + self.json_variant + } + + pub(crate) fn into_value(self) -> Value { + fn helper(v: JsonVariant) -> Value { + match v { + JsonVariant::Null => Value::Null, + JsonVariant::Bool(x) => Value::Boolean(x), + JsonVariant::Number(x) => match x { + JsonNumber::PosInt(i) => Value::UInt64(i), + JsonNumber::NegInt(i) => Value::Int64(i), + JsonNumber::Float(f) => Value::Float64(f), + }, + JsonVariant::String(x) => Value::String(x.into()), + JsonVariant::Array(array) => { + let item_type = if let Some(first) = array.first() { + first.native_type() + } else { + JsonNativeType::Null + }; + Value::List(ListValue::new( + array.into_iter().map(helper).collect(), + Arc::new((&item_type).into()), + )) + } + JsonVariant::Object(object) => { + let mut fields 
= Vec::with_capacity(object.len()); + let mut items = Vec::with_capacity(object.len()); + for (k, v) in object { + fields.push(StructField::new(k, (&v.native_type()).into(), true)); + items.push(helper(v)); + } + Value::Struct(StructValue::new(items, StructType::new(Arc::new(fields)))) + } + } + } + helper(self.json_variant) + } +} + +impl> From for JsonValue { + fn from(v: T) -> Self { + Self { + json_type: OnceLock::new(), + json_variant: v.into(), + } + } +} + +impl From for serde_json::Value { + fn from(v: JsonValue) -> Self { + fn helper(v: JsonVariant) -> serde_json::Value { + match v { + JsonVariant::Null => serde_json::Value::Null, + JsonVariant::Bool(x) => serde_json::Value::Bool(x), + JsonVariant::Number(x) => match x { + JsonNumber::PosInt(i) => serde_json::Value::Number(i.into()), + JsonNumber::NegInt(i) => serde_json::Value::Number(i.into()), + JsonNumber::Float(f) => { + if let Some(x) = Number::from_f64(f.0) { + serde_json::Value::Number(x) + } else { + serde_json::Value::String("NaN".into()) + } + } + }, + JsonVariant::String(x) => serde_json::Value::String(x), + JsonVariant::Array(array) => { + serde_json::Value::Array(array.into_iter().map(helper).collect()) + } + JsonVariant::Object(object) => serde_json::Value::Object( + object.into_iter().map(|(k, v)| (k, helper(v))).collect(), + ), + } + } + helper(v.json_variant) + } +} + +impl Clone for JsonValue { + fn clone(&self) -> Self { + let Self { + json_type: _, + json_variant, + } = self; + Self { + json_type: OnceLock::new(), + json_variant: json_variant.clone(), + } + } +} + +impl PartialEq for JsonValue { + fn eq(&self, other: &JsonValue) -> bool { + let Self { + json_type: _, + json_variant, + } = self; + json_variant.eq(&other.json_variant) + } +} + +impl Hash for JsonValue { + fn hash(&self, state: &mut H) { + let Self { + json_type: _, + json_variant, + } = self; + json_variant.hash(state); + } +} + +impl Display for JsonValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.json_variant) + } +} + +/// References of variants of json. 
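+///
+/// A borrowed view over [JsonVariant], typically obtained (wrapped in a [JsonValueRef])
+/// through [JsonValue::as_ref]. Illustrative sketch, not part of the original patch:
+/// ```ignore
+/// let value: JsonValue = [("k", 1i64)].into();
+/// let r = value.as_ref();  // a JsonValueRef wrapping JsonVariantRef::Object
+/// assert!(r.is_object());
+/// ```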
+#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub enum JsonVariantRef<'a> { + Null, + Bool(bool), + Number(JsonNumber), + String(&'a str), + Array(Vec>), + Object(BTreeMap<&'a str, JsonVariantRef<'a>>), +} + +impl JsonVariantRef<'_> { + fn json_type(&self) -> JsonType { + fn native_type(v: &JsonVariantRef<'_>) -> JsonNativeType { + match v { + JsonVariantRef::Null => JsonNativeType::Null, + JsonVariantRef::Bool(_) => JsonNativeType::Bool, + JsonVariantRef::Number(n) => match n { + JsonNumber::PosInt(_) => JsonNativeType::u64(), + JsonNumber::NegInt(_) => JsonNativeType::i64(), + JsonNumber::Float(_) => JsonNativeType::f64(), + }, + JsonVariantRef::String(_) => JsonNativeType::String, + JsonVariantRef::Array(array) => { + let item_type = if let Some(first) = array.first() { + native_type(first) + } else { + JsonNativeType::Null + }; + JsonNativeType::Array(Box::new(item_type)) + } + JsonVariantRef::Object(object) => JsonNativeType::Object( + object + .iter() + .map(|(k, v)| (k.to_string(), native_type(v))) + .collect(), + ), + } + } + JsonType::new_native(native_type(self)) + } +} + +impl From<()> for JsonVariantRef<'_> { + fn from(_: ()) -> Self { + Self::Null + } +} + +impl From for JsonVariantRef<'_> { + fn from(v: bool) -> Self { + Self::Bool(v) + } +} + +impl> From for JsonVariantRef<'_> { + fn from(v: T) -> Self { + Self::Number(v.into()) + } +} + +impl<'a> From<&'a str> for JsonVariantRef<'a> { + fn from(v: &'a str) -> Self { + Self::String(v) + } +} + +impl<'a, const N: usize, T: Into>> From<[T; N]> for JsonVariantRef<'a> { + fn from(vs: [T; N]) -> Self { + Self::Array(vs.into_iter().map(|x| x.into()).collect()) + } +} + +impl<'a, V: Into>, const N: usize> From<[(&'a str, V); N]> + for JsonVariantRef<'a> +{ + fn from(vs: [(&'a str, V); N]) -> Self { + Self::Object(vs.into_iter().map(|(k, v)| (k, v.into())).collect()) + } +} + +impl<'a> From>> for JsonVariantRef<'a> { + fn from(v: Vec>) -> Self { + Self::Array(v) + } +} + +impl<'a> From>> for JsonVariantRef<'a> { + fn from(v: BTreeMap<&'a str, JsonVariantRef<'a>>) -> Self { + Self::Object(v) + } +} + +impl From> for JsonVariant { + fn from(v: JsonVariantRef) -> Self { + match v { + JsonVariantRef::Null => Self::Null, + JsonVariantRef::Bool(x) => Self::Bool(x), + JsonVariantRef::Number(x) => Self::Number(x), + JsonVariantRef::String(x) => Self::String(x.to_string()), + JsonVariantRef::Array(array) => { + Self::Array(array.into_iter().map(Into::into).collect()) + } + JsonVariantRef::Object(object) => Self::Object( + object + .into_iter() + .map(|(k, v)| (k.to_string(), v.into())) + .collect(), + ), + } + } +} + +/// Reference to representation of any valid JSON value. 
+#[derive(Debug, Serialize)] +pub struct JsonValueRef<'a> { + #[serde(skip)] + json_type: OnceLock, + json_variant: JsonVariantRef<'a>, +} + +impl<'a> JsonValueRef<'a> { + pub fn null() -> Self { + ().into() + } + + pub(crate) fn data_type(&self) -> ConcreteDataType { + ConcreteDataType::Json(self.json_type().clone()) + } + + pub(crate) fn json_type(&self) -> &JsonType { + self.json_type.get_or_init(|| self.json_variant.json_type()) + } + + pub fn into_variant(self) -> JsonVariantRef<'a> { + self.json_variant + } + + pub(crate) fn is_null(&self) -> bool { + matches!(self.json_variant, JsonVariantRef::Null) + } + + pub fn is_object(&self) -> bool { + matches!(self.json_variant, JsonVariantRef::Object(_)) + } + + pub(crate) fn as_f32(&self) -> Option { + match self.json_variant { + JsonVariantRef::Number(JsonNumber::Float(f)) => f.to_f32(), + _ => None, + } + } + + pub(crate) fn as_f64(&self) -> Option { + match self.json_variant { + JsonVariantRef::Number(JsonNumber::Float(f)) => Some(f.0), + _ => None, + } + } + + pub fn as_value_ref(&self) -> ValueRef<'_> { + fn helper<'a>(v: &'a JsonVariantRef) -> ValueRef<'a> { + match v { + JsonVariantRef::Null => ValueRef::Null, + JsonVariantRef::Bool(x) => ValueRef::Boolean(*x), + JsonVariantRef::Number(x) => match x { + JsonNumber::PosInt(i) => ValueRef::UInt64(*i), + JsonNumber::NegInt(i) => ValueRef::Int64(*i), + JsonNumber::Float(f) => ValueRef::Float64(*f), + }, + JsonVariantRef::String(x) => ValueRef::String(x), + JsonVariantRef::Array(array) => { + let val = array.iter().map(helper).collect::>(); + let item_datatype = if let Some(first) = val.first() { + first.data_type() + } else { + ConcreteDataType::null_datatype() + }; + ValueRef::List(ListValueRef::RefList { + val, + item_datatype: Arc::new(item_datatype), + }) + } + JsonVariantRef::Object(object) => { + let mut fields = Vec::with_capacity(object.len()); + let mut val = Vec::with_capacity(object.len()); + for (k, v) in object.iter() { + let v = helper(v); + fields.push(StructField::new(k.to_string(), v.data_type(), true)); + val.push(v); + } + ValueRef::Struct(StructValueRef::RefList { + val, + fields: StructType::new(Arc::new(fields)), + }) + } + } + } + helper(&self.json_variant) + } + + pub(crate) fn data_size(&self) -> usize { + size_of_val(self) + } +} + +impl<'a, T: Into>> From for JsonValueRef<'a> { + fn from(v: T) -> Self { + Self { + json_type: OnceLock::new(), + json_variant: v.into(), + } + } +} + +impl From> for JsonValue { + fn from(v: JsonValueRef<'_>) -> Self { + Self { + json_type: OnceLock::new(), + json_variant: v.json_variant.into(), + } + } +} + +impl PartialEq for JsonValueRef<'_> { + fn eq(&self, other: &Self) -> bool { + let Self { + json_type: _, + json_variant, + } = self; + json_variant == &other.json_variant + } +} + +impl Eq for JsonValueRef<'_> {} + +impl Clone for JsonValueRef<'_> { + fn clone(&self) -> Self { + let Self { + json_type: _, + json_variant, + } = self; + Self { + json_type: OnceLock::new(), + json_variant: json_variant.clone(), + } + } +} diff --git a/src/datatypes/src/schema.rs b/src/datatypes/src/schema.rs index 6bdf321137..b5451617f8 100644 --- a/src/datatypes/src/schema.rs +++ b/src/datatypes/src/schema.rs @@ -33,7 +33,8 @@ pub use crate::schema::column_schema::{ COLUMN_SKIPPING_INDEX_OPT_KEY_FALSE_POSITIVE_RATE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, ColumnExtType, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer, FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, - 
SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, + SKIPPING_INDEX_KEY, SkippingIndexOptions, SkippingIndexType, TIME_INDEX_KEY, VECTOR_INDEX_KEY, + VectorDistanceMetric, VectorIndexEngineType, VectorIndexOptions, }; pub use crate::schema::constraint::ColumnDefaultConstraint; pub use crate::schema::raw::RawSchema; @@ -273,8 +274,9 @@ fn collect_fields(column_schemas: &[ColumnSchema]) -> Result { _ => None, }; if let Some(extype) = extype { - let metadata = HashMap::from([(TYPE_KEY.to_string(), extype.to_string())]); - field = field.with_metadata(metadata); + field + .metadata_mut() + .insert(TYPE_KEY.to_string(), extype.to_string()); } fields.push(field); ensure!( diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs index 2ba7beb701..38cdd7bb06 100644 --- a/src/datatypes/src/schema/column_schema.rs +++ b/src/datatypes/src/schema/column_schema.rs @@ -46,6 +46,8 @@ pub const FULLTEXT_KEY: &str = "greptime:fulltext"; pub const INVERTED_INDEX_KEY: &str = "greptime:inverted_index"; /// Key used to store skip options in arrow field's metadata. pub const SKIPPING_INDEX_KEY: &str = "greptime:skipping_index"; +/// Key used to store vector index options in arrow field's metadata. +pub const VECTOR_INDEX_KEY: &str = "greptime:vector_index"; /// Keys used in fulltext options pub const COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE: &str = "enable"; @@ -216,6 +218,53 @@ impl ColumnSchema { self.metadata.contains_key(INVERTED_INDEX_KEY) } + /// Checks if this column has a vector index. + pub fn is_vector_indexed(&self) -> bool { + match self.vector_index_options() { + Ok(opts) => opts.is_some(), + Err(e) => { + common_telemetry::warn!( + "Failed to deserialize vector_index_options for column '{}': {}", + self.name, + e + ); + false + } + } + } + + /// Gets the vector index options. + pub fn vector_index_options(&self) -> Result> { + match self.metadata.get(VECTOR_INDEX_KEY) { + None => Ok(None), + Some(json) => { + let options = + serde_json::from_str(json).context(error::DeserializeSnafu { json })?; + Ok(Some(options)) + } + } + } + + /// Sets the vector index options. + pub fn set_vector_index_options(&mut self, options: &VectorIndexOptions) -> Result<()> { + self.metadata.insert( + VECTOR_INDEX_KEY.to_string(), + serde_json::to_string(options).context(error::SerializeSnafu)?, + ); + Ok(()) + } + + /// Removes the vector index options. + pub fn unset_vector_index_options(&mut self) { + self.metadata.remove(VECTOR_INDEX_KEY); + } + + /// Sets vector index options and returns self for chaining. + pub fn with_vector_index_options(mut self, options: &VectorIndexOptions) -> Result { + self.set_vector_index_options(options)?; + Ok(self) + } + /// Set default constraint. /// /// If a default constraint exists for the column, this method will @@ -431,6 +480,10 @@ impl ColumnSchema { Ok(()) } + + pub fn is_indexed(&self) -> bool { + self.is_inverted_indexed() || self.is_fulltext_indexed() || self.is_skipping_indexed() + } } /// Column extended type set in column schema's metadata. @@ -960,6 +1013,181 @@ impl TryFrom> for SkippingIndexOptions { } } +/// Distance metric for vector similarity search. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default, Visit, VisitMut)] +#[serde(rename_all = "lowercase")] +pub enum VectorDistanceMetric { + /// Squared Euclidean distance (L2^2). + #[default] + L2sq, + /// Cosine distance (1 - cosine similarity). + Cosine, + /// Inner product (negative, for maximum inner product search). 
+ #[serde(alias = "ip")] + InnerProduct, +} + +impl fmt::Display for VectorDistanceMetric { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + VectorDistanceMetric::L2sq => write!(f, "l2sq"), + VectorDistanceMetric::Cosine => write!(f, "cosine"), + VectorDistanceMetric::InnerProduct => write!(f, "ip"), + } + } +} + +impl std::str::FromStr for VectorDistanceMetric { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "l2sq" | "l2" | "euclidean" => Ok(VectorDistanceMetric::L2sq), + "cosine" | "cos" => Ok(VectorDistanceMetric::Cosine), + "inner_product" | "ip" | "dot" => Ok(VectorDistanceMetric::InnerProduct), + _ => Err(format!( + "Unknown distance metric: {}. Expected: l2sq, cosine, or ip", + s + )), + } + } +} + +impl VectorDistanceMetric { + /// Returns the metric as u8 for blob serialization. + pub fn as_u8(&self) -> u8 { + match self { + Self::L2sq => 0, + Self::Cosine => 1, + Self::InnerProduct => 2, + } + } + + /// Parses metric from u8 (used when reading blob). + pub fn try_from_u8(v: u8) -> Option { + match v { + 0 => Some(Self::L2sq), + 1 => Some(Self::Cosine), + 2 => Some(Self::InnerProduct), + _ => None, + } + } +} + +/// Default HNSW connectivity parameter. +const DEFAULT_VECTOR_INDEX_CONNECTIVITY: u32 = 16; +/// Default expansion factor during index construction. +const DEFAULT_VECTOR_INDEX_EXPANSION_ADD: u32 = 128; +/// Default expansion factor during search. +const DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH: u32 = 64; + +fn default_vector_index_connectivity() -> u32 { + DEFAULT_VECTOR_INDEX_CONNECTIVITY +} + +fn default_vector_index_expansion_add() -> u32 { + DEFAULT_VECTOR_INDEX_EXPANSION_ADD +} + +fn default_vector_index_expansion_search() -> u32 { + DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH +} + +/// Supported vector index engine types. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, Visit, VisitMut)] +#[serde(rename_all = "lowercase")] +pub enum VectorIndexEngineType { + /// USearch HNSW implementation. + #[default] + Usearch, + // Future: Vsag, +} + +impl VectorIndexEngineType { + /// Returns the engine type as u8 for blob serialization. + pub fn as_u8(&self) -> u8 { + match self { + Self::Usearch => 0, + } + } + + /// Parses engine type from u8 (used when reading blob). + pub fn try_from_u8(v: u8) -> Option { + match v { + 0 => Some(Self::Usearch), + _ => None, + } + } +} + +impl fmt::Display for VectorIndexEngineType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Usearch => write!(f, "usearch"), + } + } +} + +impl std::str::FromStr for VectorIndexEngineType { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "usearch" => Ok(Self::Usearch), + _ => Err(format!( + "Unknown vector index engine: {}. Expected: usearch", + s + )), + } + } +} + +/// Options for vector index (HNSW). +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)] +#[serde(rename_all = "kebab-case")] +pub struct VectorIndexOptions { + /// Vector index engine type (default: usearch). + #[serde(default)] + pub engine: VectorIndexEngineType, + /// Distance metric for similarity search. + #[serde(default)] + pub metric: VectorDistanceMetric, + /// HNSW connectivity parameter (M in the paper). + /// Higher values improve recall but increase memory usage. 
+ #[serde(default = "default_vector_index_connectivity")] + pub connectivity: u32, + /// Expansion factor during index construction (ef_construction). + /// Higher values improve index quality but slow down construction. + #[serde(default = "default_vector_index_expansion_add")] + pub expansion_add: u32, + /// Expansion factor during search (ef_search). + /// Higher values improve recall but slow down search. + #[serde(default = "default_vector_index_expansion_search")] + pub expansion_search: u32, +} + +impl Default for VectorIndexOptions { + fn default() -> Self { + Self { + engine: VectorIndexEngineType::default(), + metric: VectorDistanceMetric::default(), + connectivity: DEFAULT_VECTOR_INDEX_CONNECTIVITY, + expansion_add: DEFAULT_VECTOR_INDEX_EXPANSION_ADD, + expansion_search: DEFAULT_VECTOR_INDEX_EXPANSION_SEARCH, + } + } +} + +impl fmt::Display for VectorIndexOptions { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "engine={}, metric={}, connectivity={}, expansion_add={}, expansion_search={}", + self.engine, self.metric, self.connectivity, self.expansion_add, self.expansion_search + ) + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/src/datatypes/src/types.rs b/src/datatypes/src/types.rs index 1c7df86249..597bbb673b 100644 --- a/src/datatypes/src/types.rs +++ b/src/datatypes/src/types.rs @@ -20,7 +20,7 @@ mod decimal_type; mod dictionary_type; mod duration_type; mod interval_type; -mod json_type; +pub mod json_type; mod list_type; mod null_type; mod primitive_type; diff --git a/src/datatypes/src/types/json_type.rs b/src/datatypes/src/types/json_type.rs index 660ddfe2c4..4c838b78d1 100644 --- a/src/datatypes/src/types/json_type.rs +++ b/src/datatypes/src/types/json_type.rs @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; +use std::fmt::{Display, Formatter}; use std::str::FromStr; use std::sync::Arc; use arrow::datatypes::DataType as ArrowDataType; -use arrow_schema::Fields; use common_base::bytes::Bytes; use serde::{Deserialize, Serialize}; use snafu::ResultExt; @@ -35,14 +35,134 @@ use crate::vectors::json::builder::JsonVectorBuilder; use crate::vectors::{BinaryVectorBuilder, MutableVector}; pub const JSON_TYPE_NAME: &str = "Json"; -const JSON_PLAIN_FIELD_NAME: &str = "__plain__"; +const JSON_PLAIN_FIELD_NAME: &str = "__json_plain__"; const JSON_PLAIN_FIELD_METADATA_KEY: &str = "is_plain_json"; +pub type JsonObjectType = BTreeMap; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] +pub enum JsonNumberType { + U64, + I64, + F64, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)] +pub enum JsonNativeType { + Null, + Bool, + Number(JsonNumberType), + String, + Array(Box), + Object(JsonObjectType), +} + +impl JsonNativeType { + pub fn is_null(&self) -> bool { + matches!(self, JsonNativeType::Null) + } + + pub fn u64() -> Self { + Self::Number(JsonNumberType::U64) + } + + pub fn i64() -> Self { + Self::Number(JsonNumberType::I64) + } + + pub fn f64() -> Self { + Self::Number(JsonNumberType::F64) + } +} + +impl From<&JsonNativeType> for ConcreteDataType { + fn from(value: &JsonNativeType) -> Self { + match value { + JsonNativeType::Null => ConcreteDataType::null_datatype(), + JsonNativeType::Bool => ConcreteDataType::boolean_datatype(), + JsonNativeType::Number(JsonNumberType::U64) => ConcreteDataType::uint64_datatype(), + JsonNativeType::Number(JsonNumberType::I64) => ConcreteDataType::int64_datatype(), + JsonNativeType::Number(JsonNumberType::F64) => ConcreteDataType::float64_datatype(), + JsonNativeType::String => ConcreteDataType::string_datatype(), + JsonNativeType::Array(item_type) => { + ConcreteDataType::List(ListType::new(Arc::new(item_type.as_ref().into()))) + } + JsonNativeType::Object(object) => { + let fields = object + .iter() + .map(|(type_name, field_type)| { + StructField::new(type_name.clone(), field_type.into(), true) + }) + .collect(); + ConcreteDataType::Struct(StructType::new(Arc::new(fields))) + } + } + } +} + +impl From<&ConcreteDataType> for JsonNativeType { + fn from(value: &ConcreteDataType) -> Self { + match value { + ConcreteDataType::Null(_) => JsonNativeType::Null, + ConcreteDataType::Boolean(_) => JsonNativeType::Bool, + ConcreteDataType::UInt64(_) + | ConcreteDataType::UInt32(_) + | ConcreteDataType::UInt16(_) + | ConcreteDataType::UInt8(_) => JsonNativeType::u64(), + ConcreteDataType::Int64(_) + | ConcreteDataType::Int32(_) + | ConcreteDataType::Int16(_) + | ConcreteDataType::Int8(_) => JsonNativeType::i64(), + ConcreteDataType::Float64(_) | ConcreteDataType::Float32(_) => JsonNativeType::f64(), + ConcreteDataType::String(_) => JsonNativeType::String, + ConcreteDataType::List(list_type) => { + JsonNativeType::Array(Box::new(list_type.item_type().into())) + } + ConcreteDataType::Struct(struct_type) => JsonNativeType::Object( + struct_type + .fields() + .iter() + .map(|field| (field.name().to_string(), field.data_type().into())) + .collect(), + ), + ConcreteDataType::Json(json_type) => json_type.native_type().clone(), + _ => unreachable!(), + } + } +} + +impl Display for JsonNativeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + JsonNativeType::Null => write!(f, "Null"), + 
JsonNativeType::Bool => write!(f, "Bool"), + JsonNativeType::Number(t) => { + write!(f, "Number({t:?})") + } + JsonNativeType::String => write!(f, "String"), + JsonNativeType::Array(item_type) => { + write!(f, "Array[{}]", item_type) + } + JsonNativeType::Object(object) => { + write!( + f, + "Object{{{}}}", + object + .iter() + .map(|(k, v)| format!(r#""{k}": {v}"#)) + .collect::>() + .join(", ") + ) + } + } + } +} + #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize, Default)] pub enum JsonFormat { #[default] Jsonb, - Native(Box), + Native(Box), } /// JsonType is a data type for JSON data. It is stored as binary data of jsonb format. @@ -57,48 +177,41 @@ impl JsonType { Self { format } } - pub(crate) fn empty() -> Self { + pub(crate) fn new_native(native: JsonNativeType) -> Self { Self { - format: JsonFormat::Native(Box::new(ConcreteDataType::null_datatype())), + format: JsonFormat::Native(Box::new(native)), + } + } + + pub(crate) fn native_type(&self) -> &JsonNativeType { + match &self.format { + JsonFormat::Jsonb => &JsonNativeType::String, + JsonFormat::Native(x) => x.as_ref(), + } + } + + pub fn null() -> Self { + Self { + format: JsonFormat::Native(Box::new(JsonNativeType::Null)), } } /// Make json type a struct type, by: /// - if the json is an object, its entries are mapped to struct fields, obviously; - /// - if not, the json is one of bool, number, string or array, make it a special field called - /// [JSON_PLAIN_FIELD_NAME] with metadata [JSON_PLAIN_FIELD_METADATA_KEY] = `"true"` in a - /// struct with only that field. + /// - if not, the json is one of bool, number, string or array, make it a special field + /// (see [plain_json_struct_type]). pub(crate) fn as_struct_type(&self) -> StructType { match &self.format { JsonFormat::Jsonb => StructType::default(), - JsonFormat::Native(inner) => match inner.as_ref() { + JsonFormat::Native(inner) => match ConcreteDataType::from(inner.as_ref()) { ConcreteDataType::Struct(t) => t.clone(), - x => { - let mut field = - StructField::new(JSON_PLAIN_FIELD_NAME.to_string(), x.clone(), true); - field.insert_metadata(JSON_PLAIN_FIELD_METADATA_KEY, true); - StructType::new(Arc::new(vec![field])) - } + x => plain_json_struct_type(x), }, } } - /// Check if this json type is the special "plain" one. - /// See [JsonType::as_struct_type]. - pub(crate) fn is_plain_json(&self) -> bool { - let JsonFormat::Native(box ConcreteDataType::Struct(t)) = &self.format else { - return true; - }; - let fields = t.fields(); - let Some((single, [])) = fields.split_first() else { - return false; - }; - single.name() == JSON_PLAIN_FIELD_NAME - && single.metadata(JSON_PLAIN_FIELD_METADATA_KEY) == Some("true") - } - /// Try to merge this json type with others, error on datatype conflict. - pub(crate) fn merge(&mut self, other: &JsonType) -> Result<()> { + pub fn merge(&mut self, other: &JsonType) -> Result<()> { match (&self.format, &other.format) { (JsonFormat::Jsonb, JsonFormat::Jsonb) => Ok(()), (JsonFormat::Native(this), JsonFormat::Native(that)) => { @@ -113,7 +226,8 @@ impl JsonType { } } - pub(crate) fn is_mergeable(&self, other: &JsonType) -> bool { + /// Check if it can merge with `other` json type. + pub fn is_mergeable(&self, other: &JsonType) -> bool { match (&self.format, &other.format) { (JsonFormat::Jsonb, JsonFormat::Jsonb) => true, (JsonFormat::Native(this), JsonFormat::Native(that)) => { @@ -122,19 +236,58 @@ impl JsonType { _ => false, } } + + /// Check if it includes all fields in `other` json type. 
+ pub fn is_include(&self, other: &JsonType) -> bool { + match (&self.format, &other.format) { + (JsonFormat::Jsonb, JsonFormat::Jsonb) => true, + (JsonFormat::Native(this), JsonFormat::Native(that)) => { + is_include(this.as_ref(), that.as_ref()) + } + _ => false, + } + } } -fn is_mergeable(this: &ConcreteDataType, that: &ConcreteDataType) -> bool { - fn is_mergeable_struct(this: &StructType, that: &StructType) -> bool { - let this_fields = this.fields(); - let this_fields = this_fields - .iter() - .map(|x| (x.name(), x)) - .collect::>(); +fn is_include(this: &JsonNativeType, that: &JsonNativeType) -> bool { + fn is_include_object(this: &JsonObjectType, that: &JsonObjectType) -> bool { + for (type_name, that_type) in that { + let Some(this_type) = this.get(type_name) else { + return false; + }; + if !is_include(this_type, that_type) { + return false; + } + } + true + } - for that_field in that.fields().iter() { - if let Some(this_field) = this_fields.get(that_field.name()) - && !is_mergeable(this_field.data_type(), that_field.data_type()) + match (this, that) { + (this, that) if this == that => true, + (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { + is_include(this.as_ref(), that.as_ref()) + } + (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { + is_include_object(this, that) + } + (_, JsonNativeType::Null) => true, + _ => false, + } +} + +/// A special struct type for denoting "plain"(not object) json value. It has only one field, with +/// fixed name [JSON_PLAIN_FIELD_NAME] and with metadata [JSON_PLAIN_FIELD_METADATA_KEY] = `"true"`. +pub(crate) fn plain_json_struct_type(item_type: ConcreteDataType) -> StructType { + let mut field = StructField::new(JSON_PLAIN_FIELD_NAME.to_string(), item_type, true); + field.insert_metadata(JSON_PLAIN_FIELD_METADATA_KEY, true); + StructType::new(Arc::new(vec![field])) +} + +fn is_mergeable(this: &JsonNativeType, that: &JsonNativeType) -> bool { + fn is_mergeable_object(this: &JsonObjectType, that: &JsonObjectType) -> bool { + for (type_name, that_type) in that { + if let Some(this_type) = this.get(type_name) + && !is_mergeable(this_type, that_type) { return false; } @@ -144,27 +297,41 @@ fn is_mergeable(this: &ConcreteDataType, that: &ConcreteDataType) -> bool { match (this, that) { (this, that) if this == that => true, - (ConcreteDataType::List(this), ConcreteDataType::List(that)) => { - is_mergeable(this.item_type(), that.item_type()) + (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { + is_mergeable(this.as_ref(), that.as_ref()) } - (ConcreteDataType::Struct(this), ConcreteDataType::Struct(that)) => { - is_mergeable_struct(this, that) + (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { + is_mergeable_object(this, that) } - (ConcreteDataType::Null(_), _) | (_, ConcreteDataType::Null(_)) => true, + (JsonNativeType::Null, _) | (_, JsonNativeType::Null) => true, _ => false, } } -fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result { +fn merge(this: &JsonNativeType, that: &JsonNativeType) -> Result { + fn merge_object(this: &JsonObjectType, that: &JsonObjectType) -> Result { + let mut this = this.clone(); + // merge "that" into "this" directly: + for (type_name, that_type) in that { + if let Some(this_type) = this.get_mut(type_name) { + let merged_type = merge(this_type, that_type)?; + *this_type = merged_type; + } else { + this.insert(type_name.clone(), that_type.clone()); + } + } + Ok(this) + } + match (this, that) { (this, that) if this == that => Ok(this.clone()), - 
(ConcreteDataType::List(this), ConcreteDataType::List(that)) => { - merge_list(this, that).map(ConcreteDataType::List) + (JsonNativeType::Array(this), JsonNativeType::Array(that)) => { + merge(this.as_ref(), that.as_ref()).map(|x| JsonNativeType::Array(Box::new(x))) } - (ConcreteDataType::Struct(this), ConcreteDataType::Struct(that)) => { - merge_struct(this, that).map(ConcreteDataType::Struct) + (JsonNativeType::Object(this), JsonNativeType::Object(that)) => { + merge_object(this, that).map(JsonNativeType::Object) } - (ConcreteDataType::Null(_), x) | (x, ConcreteDataType::Null(_)) => Ok(x.clone()), + (JsonNativeType::Null, x) | (x, JsonNativeType::Null) => Ok(x.clone()), _ => MergeJsonDatatypeSnafu { reason: format!("datatypes have conflict, this: {this}, that: {that}"), } @@ -172,38 +339,6 @@ fn merge(this: &ConcreteDataType, that: &ConcreteDataType) -> Result Result { - let merged = merge(this.item_type(), that.item_type())?; - Ok(ListType::new(Arc::new(merged))) -} - -fn merge_struct(this: &StructType, that: &StructType) -> Result { - let this = Arc::unwrap_or_clone(this.fields()); - let that = Arc::unwrap_or_clone(that.fields()); - - let mut this: BTreeMap = this - .into_iter() - .map(|x| (x.name().to_string(), x)) - .collect(); - // merge "that" into "this" directly: - for that_field in that { - let field_name = that_field.name().to_string(); - if let Some(this_field) = this.get(&field_name) { - let merged_field = StructField::new( - field_name.clone(), - merge(this_field.data_type(), that_field.data_type())?, - true, // the value in json object must be always nullable - ); - this.insert(field_name, merged_field); - } else { - this.insert(field_name, that_field); - } - } - - let fields = this.into_values().collect::>(); - Ok(StructType::new(Arc::new(fields))) -} - impl DataType for JsonType { fn name(&self) -> String { match &self.format { @@ -223,14 +358,14 @@ impl DataType for JsonType { fn as_arrow_type(&self) -> ArrowDataType { match self.format { JsonFormat::Jsonb => ArrowDataType::Binary, - JsonFormat::Native(_) => ArrowDataType::Struct(Fields::empty()), + JsonFormat::Native(_) => self.as_struct_type().as_arrow_type(), } } fn create_mutable_vector(&self, capacity: usize) -> Box { - match self.format { + match &self.format { JsonFormat::Jsonb => Box::new(BinaryVectorBuilder::with_capacity(capacity)), - JsonFormat::Native(_) => Box::new(JsonVectorBuilder::with_capacity(capacity)), + JsonFormat::Native(x) => Box::new(JsonVectorBuilder::new(*x.clone(), capacity)), } } @@ -242,6 +377,12 @@ impl DataType for JsonType { } } +impl Display for JsonType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name()) + } +} + /// Converts a json type value to string pub fn jsonb_to_string(val: &[u8]) -> Result { match jsonb::from_slice(val) { @@ -272,6 +413,204 @@ mod tests { use super::*; use crate::json::JsonStructureSettings; + #[test] + fn test_json_type_include() { + fn test(this: &JsonNativeType, that: &JsonNativeType, expected: bool) { + assert_eq!(is_include(this, that), expected); + } + + test(&JsonNativeType::Null, &JsonNativeType::Null, true); + test(&JsonNativeType::Null, &JsonNativeType::Bool, false); + + test(&JsonNativeType::Bool, &JsonNativeType::Null, true); + test(&JsonNativeType::Bool, &JsonNativeType::Bool, true); + test(&JsonNativeType::Bool, &JsonNativeType::u64(), false); + + test(&JsonNativeType::u64(), &JsonNativeType::Null, true); + test(&JsonNativeType::u64(), &JsonNativeType::u64(), true); + test(&JsonNativeType::u64(), 
&JsonNativeType::String, false); + + test(&JsonNativeType::String, &JsonNativeType::Null, true); + test(&JsonNativeType::String, &JsonNativeType::String, true); + test( + &JsonNativeType::String, + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + false, + ); + + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Null, + true, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Array(Box::new(JsonNativeType::Null)), + true, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + true, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::String, + false, + ); + test( + &JsonNativeType::Array(Box::new(JsonNativeType::f64())), + &JsonNativeType::Object(JsonObjectType::new()), + false, + ); + + let simple_json_object = &JsonNativeType::Object(JsonObjectType::from([( + "foo".to_string(), + JsonNativeType::String, + )])); + test(simple_json_object, &JsonNativeType::Null, true); + test(simple_json_object, simple_json_object, true); + test(simple_json_object, &JsonNativeType::i64(), false); + test( + simple_json_object, + &JsonNativeType::Object(JsonObjectType::from([( + "bar".to_string(), + JsonNativeType::i64(), + )])), + false, + ); + + let complex_json_object = &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::String, + )])), + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])); + test(complex_json_object, &JsonNativeType::Null, true); + test(complex_json_object, &JsonNativeType::String, false); + test(complex_json_object, complex_json_object, true); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([( + "bar".to_string(), + JsonNativeType::i64(), + )])), + true, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Null, + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + true, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::String, + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + false, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::String, + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + false, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::Null, + )])), + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + true, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([ + ( + "nested".to_string(), + 
JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::Bool, + )])), + )])), + )])), + ), + ("bar".to_string(), JsonNativeType::i64()), + ])), + false, + ); + test( + complex_json_object, + &JsonNativeType::Object(JsonObjectType::from([( + "nested".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "a".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "b".to_string(), + JsonNativeType::Object(JsonObjectType::from([( + "c".to_string(), + JsonNativeType::String, + )])), + )])), + )])), + )])), + true, + ); + } + #[test] fn test_merge_json_type() -> Result<()> { fn test( @@ -303,9 +642,7 @@ mod tests { Ok(()) } - let json_type = &mut JsonType::new(JsonFormat::Native(Box::new( - ConcreteDataType::null_datatype(), - ))); + let json_type = &mut JsonType::new_native(JsonNativeType::Null); // can merge with json object: let json = r#"{ @@ -313,16 +650,15 @@ mod tests { "list": [1, 2, 3], "object": {"a": 1} }"#; - let expected = - r#"Json, "object": Struct<"a": Int64>>>"#; + let expected = r#"Json"#; test(json, json_type, Ok(expected))?; // cannot merge with other non-object json values: let jsons = [r#""s""#, "1", "[1]"]; let expects = [ - r#"Failed to merge JSON datatype: datatypes have conflict, this: Struct<"hello": String, "list": List, "object": Struct<"a": Int64>>, that: String"#, - r#"Failed to merge JSON datatype: datatypes have conflict, this: Struct<"hello": String, "list": List, "object": Struct<"a": Int64>>, that: Int64"#, - r#"Failed to merge JSON datatype: datatypes have conflict, this: Struct<"hello": String, "list": List, "object": Struct<"a": Int64>>, that: List"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: String"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: Number(I64)"#, + r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: Array[Number(I64)]"#, ]; for (json, expect) in jsons.into_iter().zip(expects.into_iter()) { test(json, json_type, Err(expect))?; @@ -334,8 +670,7 @@ mod tests { "float": 0.123, "no": 42 }"#; - let expected = - r#"Failed to merge JSON datatype: datatypes have conflict, this: String, that: Int64"#; + let expected = r#"Failed to merge JSON datatype: datatypes have conflict, this: String, that: Number(I64)"#; test(json, json_type, Err(expected))?; // can merge with another json object: @@ -344,7 +679,7 @@ mod tests { "float": 0.123, "int": 42 }"#; - let expected = r#"Json, "object": Struct<"a": Int64>>>"#; + let expected = r#"Json"#; test(json, json_type, Ok(expected))?; // can merge with some complex nested json object: @@ -354,7 +689,7 @@ mod tests { "float": 0.456, "int": 0 }"#; - let expected = r#"Json, "object": Struct<"a": Int64, "foo": String, "l": List, "o": Struct<"key": String>>>>"#; + let expected = r#"Json"#; test(json, json_type, Ok(expected))?; Ok(()) diff --git a/src/datatypes/src/types/string_type.rs b/src/datatypes/src/types/string_type.rs index 61677ead4a..fff1d87f00 100644 --- a/src/datatypes/src/types/string_type.rs +++ b/src/datatypes/src/types/string_type.rs @@ -177,7 +177,7 @@ impl DataType for StringType { 
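A minimal sketch (not part of the patch) of how the JsonNativeType-based helpers above compose. Since new_native is crate-private, this mirrors the unit tests inside src/datatypes rather than external usage.

    // Sketch only: Null merges with anything, and the merged type then
    // "includes" both inputs.
    fn json_type_merge_example() {
        let mut ty = JsonType::new_native(JsonNativeType::Null);
        let other = JsonType::new_native(JsonNativeType::Object(JsonObjectType::from([
            ("hello".to_string(), JsonNativeType::String),
            (
                "list".to_string(),
                JsonNativeType::Array(Box::new(JsonNativeType::i64())),
            ),
        ])));

        assert!(ty.is_mergeable(&other));
        ty.merge(&other).unwrap();

        assert!(ty.is_include(&other));
        assert!(ty.is_include(&JsonType::new_native(JsonNativeType::Null)));
    }
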
Value::Duration(v) => Some(Value::String(StringBytes::from(v.to_string()))), Value::Decimal128(v) => Some(Value::String(StringBytes::from(v.to_string()))), - Value::Json(v) => self.try_cast(*v), + Value::Json(v) => serde_json::to_string(v.as_ref()).ok().map(|s| s.into()), // StringBytes is only support for utf-8, Value::Binary and collections are not allowed. Value::Binary(_) | Value::List(_) | Value::Struct(_) => None, diff --git a/src/datatypes/src/types/struct_type.rs b/src/datatypes/src/types/struct_type.rs index 90ea6ac9f5..2cf2a8825d 100644 --- a/src/datatypes/src/types/struct_type.rs +++ b/src/datatypes/src/types/struct_type.rs @@ -151,6 +151,7 @@ impl StructField { self.metadata.insert(key.to_string(), value.to_string()); } + #[expect(unused)] pub(crate) fn metadata(&self, key: &str) -> Option<&str> { self.metadata.get(key).map(String::as_str) } diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index 90ed848b7d..1c7dc35de6 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -36,6 +36,7 @@ use crate::error::{ self, ConvertArrowArrayToScalarsSnafu, ConvertScalarToArrowArraySnafu, Error, InconsistentStructFieldsAndItemsSnafu, Result, TryFromValueSnafu, }; +use crate::json::value::{JsonValue, JsonValueRef}; use crate::prelude::*; use crate::type_id::LogicalTypeId; use crate::types::{IntervalType, ListType, StructType}; @@ -86,7 +87,7 @@ pub enum Value { Struct(StructValue), // Json Logical types: - Json(Box), + Json(Box), } impl Display for Value { @@ -197,7 +198,7 @@ macro_rules! define_data_type_func { $struct::Struct(struct_value) => { ConcreteDataType::struct_datatype(struct_value.struct_type().clone()) } - $struct::Json(v) => ConcreteDataType::json_native_datatype(v.data_type()), + $struct::Json(v) => v.data_type(), } } }; @@ -220,7 +221,6 @@ impl Value { match self { Value::Null => Ok(None), Value::List(v) => Ok(Some(v)), - Value::Json(inner) => inner.as_list(), other => error::CastTypeSnafu { msg: format!("Failed to cast {other:?} to list value"), } @@ -232,7 +232,6 @@ impl Value { match self { Value::Null => Ok(None), Value::Struct(v) => Ok(Some(v)), - Value::Json(inner) => inner.as_struct(), other => error::CastTypeSnafu { msg: format!("Failed to cast {other:?} to struct value"), } @@ -267,7 +266,7 @@ impl Value { Value::Duration(v) => ValueRef::Duration(*v), Value::Decimal128(v) => ValueRef::Decimal128(*v), Value::Struct(v) => ValueRef::Struct(StructValueRef::Ref(v)), - Value::Json(v) => ValueRef::Json(Box::new(v.as_value_ref())), + Value::Json(v) => ValueRef::Json(Box::new((**v).as_ref())), } } @@ -391,7 +390,7 @@ impl Value { /// Extract the inner JSON value from a JSON type. pub fn into_json_inner(self) -> Option { match self { - Value::Json(v) => Some(*v), + Value::Json(v) => Some((*v).into_value()), _ => None, } } @@ -501,7 +500,12 @@ impl Value { let struct_type = output_type.as_struct().unwrap(); struct_value.try_to_scalar_value(struct_type)? 
} - Value::Json(v) => v.try_to_scalar_value(output_type)?, + Value::Json(_) => { + return error::ToScalarValueSnafu { + reason: "unsupported for json value", + } + .fail(); + } }; Ok(scalar_value) @@ -554,13 +558,12 @@ impl Value { Value::IntervalDayTime(x) => Some(Value::IntervalDayTime(x.negative())), Value::IntervalMonthDayNano(x) => Some(Value::IntervalMonthDayNano(x.negative())), - Value::Json(v) => v.try_negative().map(|neg| Value::Json(Box::new(neg))), - Value::Binary(_) | Value::String(_) | Value::Boolean(_) | Value::List(_) - | Value::Struct(_) => None, + | Value::Struct(_) + | Value::Json(_) => None, } } } @@ -929,7 +932,7 @@ impl TryFrom for serde_json::Value { .collect::>>()?; serde_json::Value::Object(map) } - Value::Json(v) => serde_json::Value::try_from(*v)?, + Value::Json(v) => (*v).into(), }; Ok(json_value) @@ -1263,7 +1266,7 @@ impl From> for Value { ValueRef::List(v) => v.to_value(), ValueRef::Decimal128(v) => Value::Decimal128(v), ValueRef::Struct(v) => v.to_value(), - ValueRef::Json(v) => Value::Json(Box::new(Value::from(*v))), + ValueRef::Json(v) => Value::Json(Box::new(JsonValue::from(*v))), } } } @@ -1307,7 +1310,7 @@ pub enum ValueRef<'a> { List(ListValueRef<'a>), Struct(StructValueRef<'a>), - Json(Box>), + Json(Box>), } macro_rules! impl_as_for_value_ref { @@ -1315,18 +1318,6 @@ macro_rules! impl_as_for_value_ref { match $value { ValueRef::Null => Ok(None), ValueRef::$Variant(v) => Ok(Some(v.clone())), - ValueRef::Json(v) => match v.as_ref() { - ValueRef::Null => Ok(None), - ValueRef::$Variant(v) => Ok(Some(v.clone())), - other => error::CastTypeSnafu { - msg: format!( - "Failed to cast value ref {:?} to {}", - other, - stringify!($Variant) - ), - } - .fail(), - }, other => error::CastTypeSnafu { msg: format!( "Failed to cast value ref {:?} to {}", @@ -1402,7 +1393,7 @@ impl<'a> ValueRef<'a> { match self { ValueRef::Null => Ok(None), ValueRef::Float32(f) => Ok(Some(f.0)), - ValueRef::Json(v) => v.try_into_f32(), + ValueRef::Json(v) => Ok(v.as_f32()), other => error::CastTypeSnafu { msg: format!("Failed to cast value ref {:?} to ValueRef::Float32", other,), } @@ -1414,7 +1405,7 @@ impl<'a> ValueRef<'a> { match self { ValueRef::Null => Ok(None), ValueRef::Float64(f) => Ok(Some(f.0)), - ValueRef::Json(v) => v.try_into_f64(), + ValueRef::Json(v) => Ok(v.as_f64()), other => error::CastTypeSnafu { msg: format!("Failed to cast value ref {:?} to ValueRef::Float64", other,), } @@ -1746,6 +1737,7 @@ pub(crate) mod tests { use num_traits::Float; use super::*; + use crate::json::value::{JsonVariant, JsonVariantRef}; use crate::types::StructField; use crate::vectors::ListVectorBuilder; @@ -2281,19 +2273,48 @@ pub(crate) mod tests { check_type_and_value( &ConcreteDataType::json_native_datatype(ConcreteDataType::boolean_datatype()), - &Value::Json(Box::new(Value::Boolean(true))), + &Value::Json(Box::new(true.into())), ); check_type_and_value( &ConcreteDataType::json_native_datatype(build_list_type()), - &Value::Json(Box::new(Value::List(build_list_value()))), + &Value::Json(Box::new([true].into())), ); check_type_and_value( &ConcreteDataType::json_native_datatype(ConcreteDataType::struct_datatype( - build_struct_type(), + StructType::new(Arc::new(vec![ + StructField::new( + "address".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + StructField::new("age".to_string(), ConcreteDataType::uint64_datatype(), true), + StructField::new( + "awards".to_string(), + ConcreteDataType::list_datatype(Arc::new( + ConcreteDataType::boolean_datatype(), + )), + true, + ), + 
StructField::new("id".to_string(), ConcreteDataType::int64_datatype(), true), + StructField::new( + "name".to_string(), + ConcreteDataType::string_datatype(), + true, + ), + ])), + )), + &Value::Json(Box::new( + [ + ("id", JsonVariant::from(1i64)), + ("name", "Alice".into()), + ("age", 1u64.into()), + ("address", "blah".into()), + ("awards", [true, false].into()), + ] + .into(), )), - &Value::Json(Box::new(Value::Struct(build_struct_value()))), ); } @@ -2435,25 +2456,27 @@ pub(crate) mod tests { // string wrapped in json assert_eq!( - serde_json::Value::try_from(Value::Json(Box::new(Value::String("hello".into())))) - .unwrap(), + serde_json::Value::try_from(Value::Json(Box::new("hello".into()))).unwrap(), serde_json::json!("hello") ); // list wrapped in json assert_eq!( - serde_json::Value::try_from(Value::Json(Box::new(Value::List(ListValue::new( - vec![Value::Int32(1), Value::Int32(2), Value::Int32(3),], - Arc::new(ConcreteDataType::int32_datatype()) - ))))) - .unwrap(), + serde_json::Value::try_from(Value::Json(Box::new([1i64, 2, 3,].into()))).unwrap(), serde_json::json!([1, 2, 3]) ); // struct wrapped in json assert_eq!( - serde_json::Value::try_from(Value::Json(Box::new(Value::Struct(struct_value)))) - .unwrap(), + serde_json::Value::try_from(Value::Json(Box::new( + [ + ("num".to_string(), JsonVariant::from(42i64)), + ("name".to_string(), "tomcat".into()), + ("yes_or_no".to_string(), true.into()), + ] + .into() + ))) + .unwrap(), serde_json::json!({ "num": 42, "name": "tomcat", @@ -2465,7 +2488,7 @@ pub(crate) mod tests { #[test] fn test_null_value() { assert!(Value::Null.is_null()); - assert!(Value::Json(Box::new(Value::Null)).is_null()); + assert!(Value::Json(Box::new(JsonValue::null())).is_null()); assert!(!Value::Boolean(true).is_null()); assert!(Value::Null < Value::Boolean(false)); assert!(Value::Boolean(true) > Value::Null); @@ -2544,13 +2567,6 @@ pub(crate) mod tests { ValueRef::Struct(StructValueRef::Ref(&struct_value)), Value::Struct(struct_value.clone()).as_value_ref() ); - - assert_eq!( - ValueRef::Json(Box::new(ValueRef::Struct(StructValueRef::Ref( - &struct_value - )))), - Value::Json(Box::new(Value::Struct(struct_value.clone()))).as_value_ref() - ); } #[test] @@ -2675,8 +2691,18 @@ pub(crate) mod tests { ); assert_eq!( - Value::Json(Box::new(Value::Struct(build_struct_value()))).to_string(), - "Json({ id: 1, name: tom, age: 25, address: 94038, awards: Boolean[true, false] })" + Value::Json(Box::new( + [ + ("id", JsonVariant::from(1i64)), + ("name", "tom".into()), + ("age", 25u64.into()), + ("address", "94038".into()), + ("awards", [true, false].into()), + ] + .into() + )) + .to_string(), + "Json({ address: 94038, age: 25, awards: [true, false], id: 1, name: tom })" ) } @@ -3167,10 +3193,17 @@ pub(crate) mod tests { ); check_value_ref_size_eq( - &ValueRef::Json(Box::new(ValueRef::Struct(StructValueRef::Ref( - &build_struct_value(), - )))), - 31, + &ValueRef::Json(Box::new( + [ + ("id", JsonVariantRef::from(1i64)), + ("name", "tom".into()), + ("age", 25u64.into()), + ("address", "94038".into()), + ("awards", [true, false].into()), + ] + .into(), + )), + 48, ); } diff --git a/src/datatypes/src/vectors/json/builder.rs b/src/datatypes/src/vectors/json/builder.rs index cb19a329ef..3a32dda171 100644 --- a/src/datatypes/src/vectors/json/builder.rs +++ b/src/datatypes/src/vectors/json/builder.rs @@ -14,13 +14,14 @@ use std::any::Any; use std::collections::HashMap; - -use snafu::OptionExt; +use std::sync::LazyLock; use crate::data_type::ConcreteDataType; use 
crate::error::{Result, TryFromValueSnafu, UnsupportedOperationSnafu}; +use crate::json::value::JsonValueRef; use crate::prelude::{ValueRef, Vector, VectorRef}; -use crate::types::JsonType; +use crate::types::json_type::JsonNativeType; +use crate::types::{JsonType, json_type}; use crate::value::StructValueRef; use crate::vectors::{MutableVector, StructVectorBuilder}; @@ -40,16 +41,16 @@ impl JsonStructsBuilder { self.inner.len() } - fn push(&mut self, value: &ValueRef) -> Result<()> { - if self.json_type.is_plain_json() { - let value = ValueRef::Struct(StructValueRef::RefList { - val: vec![value.clone()], - fields: self.json_type.as_struct_type(), - }); - self.inner.try_push_value_ref(&value) - } else { - self.inner.try_push_value_ref(value) + fn push(&mut self, json: &JsonValueRef) -> Result<()> { + let mut value = json.as_value_ref(); + if !json.is_object() { + let fields = json_type::plain_json_struct_type(value.data_type()); + value = ValueRef::Struct(StructValueRef::RefList { + val: vec![value], + fields, + }) } + self.inner.try_push_value_ref(&value) } /// Try to merge (and consume the data of) other json vector builder into this one. @@ -181,9 +182,9 @@ pub(crate) struct JsonVectorBuilder { } impl JsonVectorBuilder { - pub(crate) fn with_capacity(capacity: usize) -> Self { + pub(crate) fn new(json_type: JsonNativeType, capacity: usize) -> Self { Self { - merged_type: JsonType::empty(), + merged_type: JsonType::new_native(json_type), capacity, builders: vec![], } @@ -252,13 +253,17 @@ impl MutableVector for JsonVectorBuilder { } fn try_push_value_ref(&mut self, value: &ValueRef) -> Result<()> { - let data_type = value.data_type(); - let json_type = data_type.as_json().with_context(|| TryFromValueSnafu { - reason: format!("expected json value, got {value:?}"), - })?; + let ValueRef::Json(value) = value else { + return TryFromValueSnafu { + reason: format!("expected json value, got {value:?}"), + } + .fail(); + }; + let json_type = value.json_type(); let builder = match self.builders.last_mut() { Some(last) => { + // TODO(LFC): use "is_include" and amend json value with nulls if &last.json_type != json_type { self.try_create_new_builder(json_type)? } else { @@ -268,21 +273,16 @@ impl MutableVector for JsonVectorBuilder { None => self.try_create_new_builder(json_type)?, }; - let ValueRef::Json(value) = value else { - // Safety: json datatype value must be the value of json. - unreachable!() - }; - builder.push(value) + builder.push(value.as_ref()) } fn push_null(&mut self) { - let null_json_value = ValueRef::Json(Box::new(ValueRef::Null)); - self.try_push_value_ref(&null_json_value) + static NULL_JSON: LazyLock = + LazyLock::new(|| ValueRef::Json(Box::new(JsonValueRef::null()))); + self.try_push_value_ref(&NULL_JSON) // Safety: learning from the method "try_push_value_ref", a null json value should be // always able to push into any json vectors. 
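A short usage sketch (not part of the patch) for the reworked builder; like the tests below, it assumes crate-internal access since JsonVectorBuilder::new is pub(crate).

    // Sketch only: the builder is now seeded with an initial native json type
    // instead of an "empty" JsonType.
    fn json_vector_builder_example() {
        let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 4);

        // Null json values can always be pushed, whatever the merged type is so far.
        builder.push_null();
        builder.push_null();

        // Plain (non-object) values are stored under the single "__json_plain__"
        // struct field, as the pretty-printed test output below shows.
        let _vector = builder.to_vector();
    }
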
- .unwrap_or_else(|e| { - panic!("failed to push null json value: {null_json_value:?}, error: {e}") - }); + .unwrap_or_else(|e| panic!("failed to push null json value, error: {e}")); } fn extend_slice_of(&mut self, _: &dyn Vector, _: usize, _: usize) -> Result<()> { @@ -307,12 +307,11 @@ mod tests { let value = settings.encode(json).unwrap(); let value = value.as_value_ref(); - let result = builder.try_push_value_ref(&value); - match (result, expected) { - (Ok(()), Ok(())) => (), - (Err(e), Err(expected)) => assert_eq!(e.to_string(), expected), - _ => unreachable!(), - } + let result = builder + .try_push_value_ref(&value) + .map_err(|e| e.to_string()); + let expected = expected.map_err(|e| e.to_string()); + assert_eq!(result, expected); } #[test] @@ -322,24 +321,24 @@ mod tests { Ok(()), Ok(()), Err( - "Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: String", + "Failed to merge JSON datatype: datatypes have conflict, this: Number(I64), that: String", ), Err( - "Failed to merge JSON datatype: datatypes have conflict, this: Int64, that: List", + "Failed to merge JSON datatype: datatypes have conflict, this: Number(I64), that: Array[Bool]", ), ]; - let mut builder = JsonVectorBuilder::with_capacity(1); + let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); for (json, result) in jsons.into_iter().zip(results.into_iter()) { push(json, &mut builder, result); } let vector = builder.to_vector(); let expected = r#" -+----------------+ -| StructVector | -+----------------+ -| {__plain__: 1} | -| {__plain__: 2} | -+----------------+"#; ++---------------------+ +| StructVector | ++---------------------+ +| {__json_plain__: 1} | +| {__json_plain__: 2} | ++---------------------+"#; assert_eq!(pretty_print(vector), expected.trim()); Ok(()) } @@ -388,7 +387,7 @@ mod tests { "object": {"timestamp": 1761523203000} }"#, ]; - let mut builder = JsonVectorBuilder::with_capacity(1); + let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1); for json in jsons { push(json, &mut builder, Ok(())); } @@ -397,12 +396,12 @@ mod tests { // test children builders: assert_eq!(builder.builders.len(), 6); let expect_types = [ - r#"Json, "s": String>>"#, - r#"Json>"#, - r#"Json>"#, - r#"Json>>"#, - r#"Json>>>, "object": Struct<"timestamp": Int64>>>"#, - r#"Json>>>, "object": Struct<"timestamp": Int64>>>"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, + r#"Json"#, ]; let expect_vectors = [ r#" @@ -457,7 +456,7 @@ mod tests { } // test final merged json type: - let expected = r#"Json, "nested": Struct<"a": Struct<"b": Struct<"a": Struct<"b": String>, "b": Struct<"a": String>>>>, "object": Struct<"hello": String, "timestamp": Int64>, "s": String>>"#; + let expected = r#"Json"#; assert_eq!(builder.data_type().to_string(), expected); // test final produced vector: diff --git a/src/datatypes/src/vectors/struct_vector.rs b/src/datatypes/src/vectors/struct_vector.rs index d9490a63bb..44de9abf5e 100644 --- a/src/datatypes/src/vectors/struct_vector.rs +++ b/src/datatypes/src/vectors/struct_vector.rs @@ -379,10 +379,8 @@ impl MutableVector for StructVectorBuilder { }, StructValueRef::Ref(val) => self.push_struct_value(val)?, StructValueRef::RefList { val, fields } => { - let struct_value = StructValue::try_new( - val.iter().map(|v| Value::from(v.clone())).collect(), - fields.clone(), - )?; + let struct_value = + StructValue::try_new(val.into_iter().map(Value::from).collect(), fields)?; self.push_struct_value(&struct_value)?; } } @@ -429,12 +427,17 @@ impl 
ScalarVectorBuilder for StructVectorBuilder { .value_builders .iter_mut() .map(|b| b.to_vector().to_arrow_array()) - .collect(); - let struct_array = StructArray::new( - self.fields.as_arrow_fields(), - arrays, - self.null_buffer.finish(), - ); + .collect::>(); + + let struct_array = if arrays.is_empty() { + StructArray::new_empty_fields(self.len(), self.null_buffer.finish()) + } else { + StructArray::new( + self.fields.as_arrow_fields(), + arrays, + self.null_buffer.finish(), + ) + }; StructVector::try_new(self.fields.clone(), struct_array).unwrap() } diff --git a/src/file-engine/src/engine.rs b/src/file-engine/src/engine.rs index d3ec72c1e2..5dd787b919 100644 --- a/src/file-engine/src/engine.rs +++ b/src/file-engine/src/engine.rs @@ -26,7 +26,8 @@ use object_store::ObjectStore; use snafu::{OptionExt, ensure}; use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ - RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, + CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole, + RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState, SinglePartitionScanner, SyncManifestResponse, }; @@ -150,6 +151,31 @@ impl RegionEngine for FileRegionEngine { Ok(SyncManifestResponse::NotSupported) } + async fn remap_manifests( + &self, + _request: RemapManifestsRequest, + ) -> Result { + Err(BoxedError::new( + UnsupportedSnafu { + operation: "remap_manifests", + } + .build(), + )) + } + + async fn copy_region_from( + &self, + _region_id: RegionId, + _request: CopyRegionFromRequest, + ) -> Result { + Err(BoxedError::new( + UnsupportedSnafu { + operation: "copy_region_from", + } + .build(), + )) + } + fn role(&self, region_id: RegionId) -> Option { self.inner.state(region_id) } diff --git a/src/file-engine/src/error.rs b/src/file-engine/src/error.rs index 2447fe1fde..3179d0d0fd 100644 --- a/src/file-engine/src/error.rs +++ b/src/file-engine/src/error.rs @@ -151,13 +151,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to build stream adapter"))] - BuildStreamAdapter { - #[snafu(implicit)] - location: Location, - source: common_recordbatch::error::Error, - }, - #[snafu(display("Failed to parse file format"))] ParseFileFormat { #[snafu(implicit)] @@ -200,13 +193,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - - #[snafu(display("Missing default value for column: {}", column))] - MissingColumnNoDefault { - column: String, - #[snafu(implicit)] - location: Location, - }, } pub type Result = std::result::Result; @@ -222,13 +208,11 @@ impl ErrorExt for Error { | Unsupported { .. } | InvalidMetadata { .. } | ProjectionOutOfBounds { .. } - | CreateDefault { .. } - | MissingColumnNoDefault { .. } => StatusCode::InvalidArguments, + | CreateDefault { .. } => StatusCode::InvalidArguments, RegionNotFound { .. } => StatusCode::RegionNotFound, BuildBackend { source, .. } => source.status_code(), - BuildStreamAdapter { source, .. } => source.status_code(), ParseFileFormat { source, .. } => source.status_code(), CheckObject { .. 
} diff --git a/src/file-engine/src/manifest.rs b/src/file-engine/src/manifest.rs index 7e8aa7a732..ac2732fe69 100644 --- a/src/file-engine/src/manifest.rs +++ b/src/file-engine/src/manifest.rs @@ -94,7 +94,9 @@ impl FileRegionManifest { builder.push_column_metadata(column.clone()); } builder.primary_key(self.primary_key.clone()); - let metadata = builder.build().context(InvalidMetadataSnafu)?; + let metadata = builder + .build_without_validation() + .context(InvalidMetadataSnafu)?; Ok(Arc::new(metadata)) } @@ -127,3 +129,49 @@ impl FileRegionManifest { .context(MissingRequiredFieldSnafu { name }) } } + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::ColumnSchema; + + use super::*; + + #[test] + fn metadata_allows_internal_column_name() { + let manifest = FileRegionManifest { + region_id: RegionId::new(1, 0), + column_metadatas: vec![ + ColumnMetadata { + column_schema: ColumnSchema::new( + "__primary_key", + ConcreteDataType::string_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }, + ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }, + ], + primary_key: vec![1], + options: HashMap::default(), + }; + + let metadata = manifest.metadata().unwrap(); + assert!( + metadata + .column_metadatas + .iter() + .any(|c| c.column_schema.name == "__primary_key") + ); + } +} diff --git a/src/file-engine/src/query.rs b/src/file-engine/src/query.rs index b56777d43c..75d40c4608 100644 --- a/src/file-engine/src/query.rs +++ b/src/file-engine/src/query.rs @@ -20,23 +20,26 @@ use std::sync::Arc; use std::task::{Context, Poll}; use common_datasource::object_store::build_backend; -use common_error::ext::BoxedError; use common_recordbatch::adapter::RecordBatchMetrics; -use common_recordbatch::error::{CastVectorSnafu, ExternalSnafu, Result as RecordBatchResult}; -use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream}; +use common_recordbatch::error::{self as recordbatch_error, Result as RecordBatchResult}; +use common_recordbatch::{ + DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream, + SendableRecordBatchStream, +}; use datafusion::logical_expr::utils as df_logical_expr_utils; use datafusion_expr::expr::Expr; -use datatypes::prelude::ConcreteDataType; -use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use datatypes::vectors::VectorRef; +use datatypes::arrow::compute as arrow_compute; +use datatypes::data_type::DataType; +use datatypes::schema::{Schema, SchemaRef}; +use datatypes::vectors::Helper; use futures::Stream; -use snafu::{OptionExt, ResultExt, ensure}; +use snafu::{GenerateImplicitData, ResultExt, ensure}; use store_api::storage::ScanRequest; use self::file_stream::ScanPlanConfig; use crate::error::{ - BuildBackendSnafu, CreateDefaultSnafu, ExtractColumnFromFilterSnafu, - MissingColumnNoDefaultSnafu, ProjectSchemaSnafu, ProjectionOutOfBoundsSnafu, Result, + BuildBackendSnafu, ExtractColumnFromFilterSnafu, ProjectSchemaSnafu, + ProjectionOutOfBoundsSnafu, Result, }; use crate::region::FileRegion; @@ -48,6 +51,16 @@ impl FileRegion { let file_filters = self.filters_pushdown_to_file(&request.filters)?; let file_schema = Arc::new(Schema::new(self.file_options.file_column_schemas.clone())); + let projected_file_schema = if let Some(projection) = &file_projection { + Arc::new( + file_schema + 
.try_project(projection) + .context(ProjectSchemaSnafu)?, + ) + } else { + file_schema.clone() + }; + let file_stream = file_stream::create_stream( &self.format, &ScanPlanConfig { @@ -64,6 +77,7 @@ impl FileRegion { Ok(Box::pin(FileToScanRegionStream::new( scan_schema, + projected_file_schema, file_stream, ))) } @@ -144,7 +158,10 @@ impl FileRegion { struct FileToScanRegionStream { scan_schema: SchemaRef, - file_stream: SendableRecordBatchStream, + file_stream: DfSendableRecordBatchStream, + /// Maps columns in `scan_schema` to their index in the projected file schema. + /// `None` means the column doesn't exist in the file and should be filled with default values. + scan_to_file_projection: Vec>, } impl RecordBatchStream for FileToScanRegionStream { @@ -167,15 +184,49 @@ impl Stream for FileToScanRegionStream { fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { match Pin::new(&mut self.file_stream).poll_next(ctx) { Poll::Pending => Poll::Pending, - Poll::Ready(Some(file_record_batch)) => { - let file_record_batch = file_record_batch?; - let scan_record_batch = if self.schema_eq(&file_record_batch) { - Ok(file_record_batch) - } else { - self.convert_record_batch(&file_record_batch) - }; + Poll::Ready(Some(Ok(file_record_batch))) => { + let num_rows = file_record_batch.num_rows(); + let mut columns = Vec::with_capacity(self.scan_schema.num_columns()); - Poll::Ready(Some(scan_record_batch)) + for (idx, column_schema) in self.scan_schema.column_schemas().iter().enumerate() { + if let Some(file_idx) = self.scan_to_file_projection[idx] { + let expected_arrow_type = column_schema.data_type.as_arrow_type(); + let mut array = file_record_batch.column(file_idx).clone(); + + if array.data_type() != &expected_arrow_type { + array = arrow_compute::cast(array.as_ref(), &expected_arrow_type) + .context(recordbatch_error::ArrowComputeSnafu)?; + } + + let vector = Helper::try_into_vector(array) + .context(recordbatch_error::DataTypesSnafu)?; + columns.push(vector); + } else { + let vector = column_schema + .create_default_vector(num_rows) + .context(recordbatch_error::DataTypesSnafu)? 
+ .ok_or_else(|| { + recordbatch_error::CreateRecordBatchesSnafu { + reason: format!( + "column {} is missing from file source and has no default", + column_schema.name + ), + } + .build() + })?; + columns.push(vector); + } + } + + let record_batch = RecordBatch::new(self.scan_schema.clone(), columns)?; + + Poll::Ready(Some(Ok(record_batch))) + } + Poll::Ready(Some(Err(error))) => { + Poll::Ready(Some(Err(recordbatch_error::Error::PollStream { + error, + location: snafu::Location::generate(), + }))) } Poll::Ready(None) => Poll::Ready(None), } @@ -183,86 +234,21 @@ impl Stream for FileToScanRegionStream { } impl FileToScanRegionStream { - fn new(scan_schema: SchemaRef, file_stream: SendableRecordBatchStream) -> Self { + fn new( + scan_schema: SchemaRef, + file_schema: SchemaRef, + file_stream: DfSendableRecordBatchStream, + ) -> Self { + let scan_to_file_projection = scan_schema + .column_schemas() + .iter() + .map(|column| file_schema.column_index_by_name(&column.name)) + .collect(); + Self { scan_schema, file_stream, + scan_to_file_projection, } } - - fn schema_eq(&self, file_record_batch: &RecordBatch) -> bool { - self.scan_schema - .column_schemas() - .iter() - .all(|scan_column_schema| { - file_record_batch - .column_by_name(&scan_column_schema.name) - .map(|rb| rb.data_type() == scan_column_schema.data_type) - .unwrap_or_default() - }) - } - - /// Converts a RecordBatch from file schema to scan schema. - /// - /// This function performs the following operations: - /// - Projection: Only columns present in scan schema are retained. - /// - Cast Type: Columns present in both file schema and scan schema but with different types are cast to the type in scan schema. - /// - Backfill: Columns present in scan schema but not in file schema are backfilled with default values. - fn convert_record_batch( - &self, - file_record_batch: &RecordBatch, - ) -> RecordBatchResult { - let file_row_count = file_record_batch.num_rows(); - let columns = self - .scan_schema - .column_schemas() - .iter() - .map(|scan_column_schema| { - let file_column = file_record_batch.column_by_name(&scan_column_schema.name); - if let Some(file_column) = file_column { - Self::cast_column_type(file_column, &scan_column_schema.data_type) - } else { - Self::backfill_column(scan_column_schema, file_row_count) - } - }) - .collect::>>()?; - - RecordBatch::new(self.scan_schema.clone(), columns) - } - - fn cast_column_type( - source_column: &VectorRef, - target_data_type: &ConcreteDataType, - ) -> RecordBatchResult { - if &source_column.data_type() == target_data_type { - Ok(source_column.clone()) - } else { - source_column - .cast(target_data_type) - .context(CastVectorSnafu { - from_type: source_column.data_type(), - to_type: target_data_type.clone(), - }) - } - } - - fn backfill_column( - column_schema: &ColumnSchema, - num_rows: usize, - ) -> RecordBatchResult { - Self::create_default_vector(column_schema, num_rows) - .map_err(BoxedError::new) - .context(ExternalSnafu) - } - - fn create_default_vector(column_schema: &ColumnSchema, num_rows: usize) -> Result { - column_schema - .create_default_vector(num_rows) - .with_context(|_| CreateDefaultSnafu { - column: column_schema.name.clone(), - })? 
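A sketch with made-up column names ("a", "b") of the mapping the rewritten FileToScanRegionStream builds once in new(): Some(i) selects file column i (casting in poll_next when the Arrow types differ), while None means the scan column is absent from the file and is backfilled from its default.

    // Sketch only; the column names are hypothetical.
    use datatypes::prelude::ConcreteDataType;
    use datatypes::schema::{ColumnSchema, Schema};

    fn projection_example() {
        // File exposes a single Int32 column "a".
        let file_schema = Schema::new(vec![ColumnSchema::new(
            "a",
            ConcreteDataType::int32_datatype(),
            false,
        )]);
        // The scan wants "a" as Int64 plus a column "b" the file does not have.
        let scan_schema = Schema::new(vec![
            ColumnSchema::new("a", ConcreteDataType::int64_datatype(), false),
            ColumnSchema::new("b", ConcreteDataType::string_datatype(), true),
        ]);

        let mapping: Vec<Option<usize>> = scan_schema
            .column_schemas()
            .iter()
            .map(|c| file_schema.column_index_by_name(&c.name))
            .collect();

        // "a" comes from file index 0 (cast Int32 -> Int64 while polling);
        // "b" is filled via create_default_vector, or the scan errors out if
        // the column has no default value.
        assert_eq!(mapping, vec![Some(0), None]);
    }
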
- .with_context(|| MissingColumnNoDefaultSnafu { - column: column_schema.name.clone(), - }) - } } diff --git a/src/file-engine/src/query/file_stream.rs b/src/file-engine/src/query/file_stream.rs index 1f26c25493..199bb5e0bd 100644 --- a/src/file-engine/src/query/file_stream.rs +++ b/src/file-engine/src/query/file_stream.rs @@ -17,8 +17,6 @@ use std::sync::Arc; use common_datasource::file_format::Format; use common_datasource::file_format::csv::CsvFormat; use common_datasource::file_format::parquet::DefaultParquetFileReaderFactory; -use common_recordbatch::SendableRecordBatchStream; -use common_recordbatch::adapter::RecordBatchStreamAdapter; use datafusion::common::ToDFSchema; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; @@ -28,8 +26,10 @@ use datafusion::datasource::physical_plan::{ use datafusion::datasource::source::DataSourceExec; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_expr::execution_props::ExecutionProps; -use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion::physical_plan::{ + ExecutionPlan, SendableRecordBatchStream as DfSendableRecordBatchStream, +}; use datafusion::prelude::SessionContext; use datafusion_expr::expr::Expr; use datafusion_expr::utils::conjunction; @@ -48,7 +48,7 @@ fn build_record_batch_stream( file_schema: Arc, limit: Option, file_source: Arc, -) -> Result { +) -> Result { let files = scan_plan_config .files .iter() @@ -77,15 +77,13 @@ fn build_record_batch_stream( &ExecutionPlanMetricsSet::new(), ) .context(error::BuildStreamSnafu)?; - let adapter = RecordBatchStreamAdapter::try_new(Box::pin(stream)) - .context(error::BuildStreamAdapterSnafu)?; - Ok(Box::pin(adapter)) + Ok(Box::pin(stream)) } fn new_csv_stream( config: &ScanPlanConfig, format: &CsvFormat, -) -> Result { +) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); // push down limit only if there is no filter @@ -98,7 +96,7 @@ fn new_csv_stream( build_record_batch_stream(config, file_schema, limit, csv_source) } -fn new_json_stream(config: &ScanPlanConfig) -> Result { +fn new_json_stream(config: &ScanPlanConfig) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); // push down limit only if there is no filter @@ -108,7 +106,9 @@ fn new_json_stream(config: &ScanPlanConfig) -> Result build_record_batch_stream(config, file_schema, limit, file_source) } -fn new_parquet_stream_with_exec_plan(config: &ScanPlanConfig) -> Result { +fn new_parquet_stream_with_exec_plan( + config: &ScanPlanConfig, +) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); let ScanPlanConfig { files, @@ -161,12 +161,10 @@ fn new_parquet_stream_with_exec_plan(config: &ScanPlanConfig) -> Result Result { +fn new_orc_stream(config: &ScanPlanConfig) -> Result { let file_schema = config.file_schema.arrow_schema().clone(); // push down limit only if there is no filter @@ -189,7 +187,7 @@ pub struct ScanPlanConfig<'a> { pub fn create_stream( format: &Format, config: &ScanPlanConfig, -) -> Result { +) -> Result { match format { Format::Csv(format) => new_csv_stream(config, format), Format::Json(_) => new_json_stream(config), diff --git a/src/flow/src/adapter/node_context.rs b/src/flow/src/adapter/node_context.rs index 2cfad8671e..bcddcbb891 100644 --- a/src/flow/src/adapter/node_context.rs +++ b/src/flow/src/adapter/node_context.rs @@ -199,7 +199,7 @@ impl SourceSender { /// send record batch pub async fn 
send_record_batch(&self, batch: RecordBatch) -> Result { let row_cnt = batch.num_rows(); - let batch = Batch::from(batch); + let batch = Batch::try_from(batch)?; self.send_buf_row_cnt.fetch_add(row_cnt, Ordering::SeqCst); diff --git a/src/flow/src/batching_mode/frontend_client.rs b/src/flow/src/batching_mode/frontend_client.rs index e9994b5b14..174fa25671 100644 --- a/src/flow/src/batching_mode/frontend_client.rs +++ b/src/flow/src/batching_mode/frontend_client.rs @@ -23,7 +23,7 @@ use api::v1::query_request::Query; use api::v1::{CreateTableExpr, QueryRequest}; use client::{Client, Database}; use common_error::ext::{BoxedError, ErrorExt}; -use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_tls_config}; +use common_grpc::channel_manager::{ChannelConfig, ChannelManager, load_client_tls_config}; use common_meta::cluster::{NodeInfo, NodeInfoKey, Role}; use common_meta::peer::Peer; use common_meta::rpc::store::RangeRequest; @@ -110,6 +110,26 @@ impl FrontendClient { ) } + /// Check if the frontend client is initialized. + /// + /// In distributed mode, it is always initialized. + /// In standalone mode, it checks if the database client is set. + pub fn is_initialized(&self) -> bool { + match self { + FrontendClient::Distributed { .. } => true, + FrontendClient::Standalone { + database_client, .. + } => { + let guard = database_client.lock(); + if let Ok(guard) = guard { + guard.is_some() + } else { + false + } + } + } + } + pub fn from_meta_client( meta_client: Arc, auth: Option, @@ -124,7 +144,7 @@ impl FrontendClient { .connect_timeout(batch_opts.grpc_conn_timeout) .timeout(batch_opts.query_timeout); - let tls_config = load_tls_config(batch_opts.frontend_tls.as_ref()) + let tls_config = load_client_tls_config(batch_opts.frontend_tls.clone()) .context(InvalidClientConfigSnafu)?; ChannelManager::with_config(cfg, tls_config) }, diff --git a/src/flow/src/expr.rs b/src/flow/src/expr.rs index c17db3bf7e..5c0359e55f 100644 --- a/src/flow/src/expr.rs +++ b/src/flow/src/expr.rs @@ -25,6 +25,7 @@ mod signature; pub(crate) mod utils; use arrow::compute::FilterBuilder; +use common_recordbatch::RecordBatch; use datatypes::prelude::{ConcreteDataType, DataType}; use datatypes::value::Value; use datatypes::vectors::{BooleanVector, Helper, VectorRef}; @@ -38,6 +39,8 @@ pub(crate) use relation::{Accum, Accumulator, AggregateExpr, AggregateFunc}; pub(crate) use scalar::{ScalarExpr, TypedExpr}; use snafu::{ResultExt, ensure}; +use crate::Error; +use crate::error::DatatypesSnafu; use crate::expr::error::{ArrowSnafu, DataTypeSnafu}; use crate::repr::Diff; @@ -55,13 +58,19 @@ pub struct Batch { diffs: Option, } -impl From for Batch { - fn from(value: common_recordbatch::RecordBatch) -> Self { - Self { +impl TryFrom for Batch { + type Error = Error; + + fn try_from(value: RecordBatch) -> Result { + let columns = value.columns(); + let batch = Helper::try_into_vectors(columns).context(DatatypesSnafu { + extra: "failed to convert Arrow array to vector when building Flow batch", + })?; + Ok(Self { row_count: value.num_rows(), - batch: value.columns, + batch, diffs: None, - } + }) } } diff --git a/src/flow/src/repr.rs b/src/flow/src/repr.rs index 301431aff5..715f60594b 100644 --- a/src/flow/src/repr.rs +++ b/src/flow/src/repr.rs @@ -17,7 +17,7 @@ mod relation; -use api::helper::{pb_value_to_value_ref, value_to_grpc_value}; +use api::helper::{pb_value_to_value_ref, to_grpc_value}; use api::v1::Row as ProtoRow; use datatypes::data_type::ConcreteDataType; use datatypes::types::cast; @@ -201,11 +201,7 @@ 
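For the Batch conversion made fallible above, a one-line sketch of the call-site change; record_batch stands in for any common_recordbatch::RecordBatch and the variable name is illustrative.

    // Sketch: conversion can now fail because the Arrow columns are re-wrapped
    // as typed vectors via Helper::try_into_vectors.
    let batch = Batch::try_from(record_batch)?; // previously: Batch::from(record_batch)
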
impl From for Row { impl From for ProtoRow { fn from(row: Row) -> Self { - let values = row - .unpack() - .into_iter() - .map(value_to_grpc_value) - .collect_vec(); + let values = row.unpack().into_iter().map(to_grpc_value).collect_vec(); ProtoRow { values } } } diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 24d9c8c5ff..c8c78f7d74 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -17,6 +17,7 @@ arc-swap = "1.0" async-stream.workspace = true async-trait.workspace = true auth.workspace = true +axum.workspace = true bytes.workspace = true cache.workspace = true catalog.workspace = true @@ -85,6 +86,9 @@ common-test-util.workspace = true datanode.workspace = true datatypes.workspace = true futures.workspace = true +hyper-util = { workspace = true, features = ["tokio"] } +meta-srv.workspace = true +reqwest.workspace = true serde_json.workspace = true strfmt = "0.2" tower.workspace = true diff --git a/src/frontend/src/error.rs b/src/frontend/src/error.rs index 710c855958..03b809d999 100644 --- a/src/frontend/src/error.rs +++ b/src/frontend/src/error.rs @@ -364,6 +364,12 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Service suspended"))] + Suspended { + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -444,6 +450,8 @@ impl ErrorExt for Error { Error::StatementTimeout { .. } => StatusCode::Cancelled, Error::AcquireLimiter { .. } => StatusCode::Internal, + + Error::Suspended { .. } => StatusCode::Suspended, } } diff --git a/src/frontend/src/frontend.rs b/src/frontend/src/frontend.rs index dce9ffd158..ac617c8b08 100644 --- a/src/frontend/src/frontend.rs +++ b/src/frontend/src/frontend.rs @@ -23,7 +23,6 @@ use common_telemetry::logging::{LoggingOptions, SlowQueryOptions, TracingOptions use meta_client::MetaClientOptions; use query::options::QueryOptions; use serde::{Deserialize, Serialize}; -use servers::export_metrics::{ExportMetricsOption, ExportMetricsTask}; use servers::grpc::GrpcOptions; use servers::heartbeat_options::HeartbeatOptions; use servers::http::HttpOptions; @@ -34,7 +33,6 @@ use crate::error; use crate::error::Result; use crate::heartbeat::HeartbeatTask; use crate::instance::Instance; -use crate::instance::prom_store::ExportMetricHandler; use crate::service_config::{ InfluxdbOptions, JaegerOptions, MysqlOptions, OpentsdbOptions, OtlpOptions, PostgresOptions, PromStoreOptions, @@ -63,7 +61,6 @@ pub struct FrontendOptions { pub logging: LoggingOptions, pub datanode: DatanodeClientOptions, pub user_provider: Option, - pub export_metrics: ExportMetricsOption, pub tracing: TracingOptions, pub query: QueryOptions, pub max_in_flight_write_bytes: Option, @@ -94,7 +91,6 @@ impl Default for FrontendOptions { logging: LoggingOptions::default(), datanode: DatanodeClientOptions::default(), user_provider: None, - export_metrics: ExportMetricsOption::default(), tracing: TracingOptions::default(), query: QueryOptions::default(), max_in_flight_write_bytes: None, @@ -117,7 +113,6 @@ pub struct Frontend { pub instance: Arc, pub servers: ServerHandlers, pub heartbeat_task: Option, - pub export_metrics_task: Option, } impl Frontend { @@ -126,17 +121,6 @@ impl Frontend { t.start().await?; } - if let Some(t) = self.export_metrics_task.as_ref() { - if t.send_by_handler { - let inserter = self.instance.inserter().clone(); - let statement_executor = self.instance.statement_executor().clone(); - let handler = ExportMetricHandler::new_handler(inserter, statement_executor); - 
t.start(Some(handler)).context(error::StartServerSnafu)? - } else { - t.start(None).context(error::StartServerSnafu)?; - } - } - self.servers .start_all() .await @@ -157,7 +141,43 @@ impl Frontend { #[cfg(test)] mod tests { + use std::sync::atomic::{AtomicBool, Ordering}; + use std::time::Duration; + + use api::v1::meta::heartbeat_server::HeartbeatServer; + use api::v1::meta::mailbox_message::Payload; + use api::v1::meta::{ + AskLeaderRequest, AskLeaderResponse, HeartbeatRequest, HeartbeatResponse, MailboxMessage, + Peer, ResponseHeader, Role, heartbeat_server, + }; + use async_trait::async_trait; + use client::{Client, Database}; + use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; + use common_error::ext::ErrorExt; + use common_error::from_header_to_err_code_msg; + use common_error::status_code::StatusCode; + use common_grpc::channel_manager::ChannelManager; + use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS; + use common_meta::heartbeat::handler::HandlerGroupExecutor; + use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; + use common_meta::heartbeat::handler::suspend::SuspendHandler; + use common_meta::instruction::Instruction; + use common_stat::ResourceStatImpl; + use meta_client::MetaClientRef; + use meta_client::client::MetaClientBuilder; + use meta_srv::service::GrpcStream; + use servers::grpc::{FlightCompression, GRPC_SERVER}; + use servers::http::HTTP_SERVER; + use servers::http::result::greptime_result_v1::GreptimedbV1Response; + use tokio::sync::mpsc; + use tonic::codec::CompressionEncoding; + use tonic::codegen::tokio_stream::StreamExt; + use tonic::codegen::tokio_stream::wrappers::ReceiverStream; + use tonic::{Request, Response, Status, Streaming}; + use super::*; + use crate::instance::builder::FrontendBuilder; + use crate::server::Services; #[test] fn test_toml() { @@ -165,4 +185,277 @@ mod tests { let toml_string = toml::to_string(&opts).unwrap(); let _parsed: FrontendOptions = toml::from_str(&toml_string).unwrap(); } + + struct SuspendableHeartbeatServer { + suspend: Arc, + } + + #[async_trait] + impl heartbeat_server::Heartbeat for SuspendableHeartbeatServer { + type HeartbeatStream = GrpcStream; + + async fn heartbeat( + &self, + request: Request>, + ) -> std::result::Result, Status> { + let (tx, rx) = mpsc::channel(4); + + common_runtime::spawn_global({ + let mut requests = request.into_inner(); + let suspend = self.suspend.clone(); + async move { + while let Some(request) = requests.next().await { + if let Err(e) = request { + let _ = tx.send(Err(e)).await; + return; + } + + let mailbox_message = + suspend.load(Ordering::Relaxed).then(|| MailboxMessage { + payload: Some(Payload::Json( + serde_json::to_string(&Instruction::Suspend).unwrap(), + )), + ..Default::default() + }); + let response = HeartbeatResponse { + header: Some(ResponseHeader::success()), + mailbox_message, + ..Default::default() + }; + + let _ = tx.send(Ok(response)).await; + } + } + }); + + Ok(Response::new(Box::pin(ReceiverStream::new(rx)))) + } + + async fn ask_leader( + &self, + _: Request, + ) -> std::result::Result, Status> { + Ok(Response::new(AskLeaderResponse { + header: Some(ResponseHeader::success()), + leader: Some(Peer { + addr: "localhost:0".to_string(), + ..Default::default() + }), + })) + } + } + + async fn create_meta_client( + options: &MetaClientOptions, + heartbeat_server: Arc, + ) -> MetaClientRef { + let (client, server) = tokio::io::duplex(1024); + + // create the heartbeat server: + 
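+        // The mock heartbeat server is served over an in-memory `tokio::io::duplex`
+        // pipe rather than a TCP listener, so the mock metasrv never binds a port;
+        // "localhost:0" is only a lookup key for the `ChannelManager`, whose tower
+        // connector hands out the duplex client end on the first connection attempt.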
common_runtime::spawn_global(async move { + let mut router = tonic::transport::Server::builder(); + let router = router.add_service( + HeartbeatServer::from_arc(heartbeat_server) + .accept_compressed(CompressionEncoding::Zstd) + .send_compressed(CompressionEncoding::Zstd), + ); + router + .serve_with_incoming(futures::stream::iter([Ok::<_, std::io::Error>(server)])) + .await + }); + + // Move client to an option so we can _move_ the inner value + // on the first attempt to connect. All other attempts will fail. + let mut client = Some(client); + let connector = tower::service_fn(move |_| { + let client = client.take(); + async move { + if let Some(client) = client { + Ok(hyper_util::rt::TokioIo::new(client)) + } else { + Err(std::io::Error::other("client already taken")) + } + } + }); + let manager = ChannelManager::new(); + manager + .reset_with_connector("localhost:0", connector) + .unwrap(); + + // create the heartbeat client: + let mut client = MetaClientBuilder::new(0, Role::Frontend) + .enable_heartbeat() + .heartbeat_channel_manager(manager) + .build(); + client.start(&options.metasrv_addrs).await.unwrap(); + Arc::new(client) + } + + async fn create_frontend( + options: &FrontendOptions, + meta_client: MetaClientRef, + ) -> Result { + let instance = Arc::new( + FrontendBuilder::new_test(options, meta_client.clone()) + .try_build() + .await?, + ); + + let servers = + Services::new(options.clone(), instance.clone(), Default::default()).build()?; + + let executor = Arc::new(HandlerGroupExecutor::new(vec![ + Arc::new(ParseMailboxMessageHandler), + Arc::new(SuspendHandler::new(instance.suspend_state())), + ])); + let heartbeat_task = Some(HeartbeatTask::new( + options, + meta_client, + executor, + Arc::new(ResourceStatImpl::default()), + )); + + let mut frontend = Frontend { + instance, + servers, + heartbeat_task, + }; + frontend.start().await?; + Ok(frontend) + } + + async fn verify_suspend_state_by_http( + frontend: &Frontend, + expected: std::result::Result<&str, (StatusCode, &str)>, + ) { + let addr = frontend.server_handlers().addr(HTTP_SERVER).unwrap(); + let response = reqwest::get(format!("http://{}/v1/sql?sql=SELECT 1", addr)) + .await + .unwrap(); + + let headers = response.headers(); + let response = if let Some((code, error)) = from_header_to_err_code_msg(headers) { + Err((code, error)) + } else { + Ok(response.text().await.unwrap()) + }; + + match (response, expected) { + (Ok(response), Ok(expected)) => { + let response: GreptimedbV1Response = serde_json::from_str(&response).unwrap(); + let response = serde_json::to_string(response.output()).unwrap(); + assert_eq!(&response, expected); + } + (Err(actual), Err(expected)) => assert_eq!(actual, expected), + _ => unreachable!(), + } + } + + async fn verify_suspend_state_by_grpc( + frontend: &Frontend, + expected: std::result::Result<&str, (StatusCode, &str)>, + ) { + let addr = frontend.server_handlers().addr(GRPC_SERVER).unwrap(); + let client = Client::with_urls([addr.to_string()]); + let client = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client); + let response = client.sql("SELECT 1").await; + + match (response, expected) { + (Ok(response), Ok(expected)) => { + let response = response.data.pretty_print().await; + assert_eq!(&response, expected.trim()); + } + (Err(actual), Err(expected)) => { + assert_eq!(actual.status_code(), expected.0); + assert_eq!(actual.output_msg(), expected.1); + } + _ => unreachable!(), + } + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 4)] + async fn 
test_suspend_frontend() -> Result<()> { + common_telemetry::init_default_ut_logging(); + + let meta_client_options = MetaClientOptions { + metasrv_addrs: vec!["localhost:0".to_string()], + ..Default::default() + }; + let options = FrontendOptions { + http: HttpOptions { + addr: "127.0.0.1:0".to_string(), + ..Default::default() + }, + grpc: GrpcOptions { + bind_addr: "127.0.0.1:0".to_string(), + flight_compression: FlightCompression::None, + ..Default::default() + }, + mysql: MysqlOptions { + enable: false, + ..Default::default() + }, + postgres: PostgresOptions { + enable: false, + ..Default::default() + }, + meta_client: Some(meta_client_options.clone()), + ..Default::default() + }; + + let server = Arc::new(SuspendableHeartbeatServer { + suspend: Arc::new(AtomicBool::new(false)), + }); + let meta_client = create_meta_client(&meta_client_options, server.clone()).await; + let frontend = create_frontend(&options, meta_client).await?; + + tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await; + // initial state: not suspend: + assert!(!frontend.instance.is_suspended()); + verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await; + verify_suspend_state_by_grpc( + &frontend, + Ok(r#" ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+"#), + ) + .await; + + // make heartbeat server returned "suspend" instruction, + server.suspend.store(true, Ordering::Relaxed); + tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await; + // ... then the frontend is suspended: + assert!(frontend.instance.is_suspended()); + verify_suspend_state_by_http( + &frontend, + Err(( + StatusCode::Suspended, + "error: Service suspended, execution_time_ms: 0", + )), + ) + .await; + verify_suspend_state_by_grpc(&frontend, Err((StatusCode::Suspended, "Service suspended"))) + .await; + + // make heartbeat server NOT returned "suspend" instruction, + server.suspend.store(false, Ordering::Relaxed); + tokio::time::sleep(Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS)).await; + // ... 
then frontend's suspend state is cleared: + assert!(!frontend.instance.is_suspended()); + verify_suspend_state_by_http(&frontend, Ok(r#"[{"records":{"schema":{"column_schemas":[{"name":"Int64(1)","data_type":"Int64"}]},"rows":[[1]],"total_rows":1}}]"#)).await; + verify_suspend_state_by_grpc( + &frontend, + Ok(r#" ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+"#), + ) + .await; + Ok(()) + } } diff --git a/src/frontend/src/heartbeat.rs b/src/frontend/src/heartbeat.rs index 9c3954b0c6..64680abfd4 100644 --- a/src/frontend/src/heartbeat.rs +++ b/src/frontend/src/heartbeat.rs @@ -27,7 +27,6 @@ use common_stat::ResourceStatRef; use common_telemetry::{debug, error, info, warn}; use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient}; use servers::addrs; -use servers::heartbeat_options::HeartbeatOptions; use snafu::ResultExt; use tokio::sync::mpsc; use tokio::sync::mpsc::Receiver; @@ -54,7 +53,6 @@ impl HeartbeatTask { pub fn new( opts: &FrontendOptions, meta_client: Arc, - heartbeat_opts: HeartbeatOptions, resp_handler_executor: HeartbeatResponseHandlerExecutorRef, resource_stat: ResourceStatRef, ) -> Self { @@ -68,8 +66,8 @@ impl HeartbeatTask { addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr)) }, meta_client, - report_interval: heartbeat_opts.interval, - retry_interval: heartbeat_opts.retry_interval, + report_interval: opts.heartbeat.interval, + retry_interval: opts.heartbeat.retry_interval, resp_handler_executor, start_time_ms: common_time::util::current_time_millis() as u64, resource_stat, @@ -196,7 +194,8 @@ impl HeartbeatTask { let report_interval = self.report_interval; let start_time_ms = self.start_time_ms; let self_peer = Some(Peer { - // The peer id doesn't make sense for frontend, so we just set it 0. + // The node id will be actually calculated from its address (by hashing the address + // string) in the metasrv. So it can be set to 0 here, as a placeholder. 
id: 0, addr: self.peer_addr.clone(), }); diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index 062bc0cf95..73b9ed72a8 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -26,7 +26,8 @@ mod region_query; pub mod standalone; use std::pin::Pin; -use std::sync::Arc; +use std::sync::atomic::AtomicBool; +use std::sync::{Arc, atomic}; use std::time::{Duration, SystemTime}; use async_stream::stream; @@ -83,6 +84,7 @@ use snafu::prelude::*; use sql::ast::ObjectNamePartExt; use sql::dialect::Dialect; use sql::parser::{ParseOptions, ParserContext}; +use sql::statements::comment::CommentObject; use sql::statements::copy::{CopyDatabase, CopyTable}; use sql::statements::statement::Statement; use sql::statements::tql::Tql; @@ -119,6 +121,7 @@ pub struct Instance { limiter: Option, process_manager: ProcessManagerRef, slow_query_options: SlowQueryOptions, + suspend: Arc, // cache for otlp metrics // first layer key: db-string @@ -171,6 +174,14 @@ impl Instance { pub fn procedure_executor(&self) -> &ProcedureExecutorRef { self.statement_executor.procedure_executor() } + + pub fn suspend_state(&self) -> Arc { + self.suspend.clone() + } + + pub(crate) fn is_suspended(&self) -> bool { + self.suspend.load(atomic::Ordering::Relaxed) + } } fn parse_stmt(sql: &str, dialect: &(dyn Dialect + Send + Sync)) -> Result> { @@ -513,6 +524,10 @@ impl SqlQueryHandler for Instance { #[tracing::instrument(skip_all)] async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec> { + if self.is_suspended() { + return vec![error::SuspendedSnafu {}.fail()]; + } + let query_interceptor_opt = self.plugins.get::>(); let query_interceptor = query_interceptor_opt.as_ref(); let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) { @@ -580,6 +595,8 @@ impl SqlQueryHandler for Instance { plan: LogicalPlan, query_ctx: QueryContextRef, ) -> Result { + ensure!(!self.is_suspended(), error::SuspendedSnafu); + if should_capture_statement(stmt.as_ref()) { // It's safe to unwrap here because we've already checked the type. let stmt = stmt.unwrap(); @@ -641,6 +658,10 @@ impl SqlQueryHandler for Instance { query: &PromQuery, query_ctx: QueryContextRef, ) -> Vec> { + if self.is_suspended() { + return vec![error::SuspendedSnafu {}.fail()]; + } + // check will be done in prometheus handler's do_query let result = PrometheusHandler::do_query(self, query, query_ctx) .await @@ -655,6 +676,8 @@ impl SqlQueryHandler for Instance { stmt: Statement, query_ctx: QueryContextRef, ) -> Result> { + ensure!(!self.is_suspended(), error::SuspendedSnafu); + if matches!( stmt, Statement::Insert(_) | Statement::Query(_) | Statement::Delete(_) @@ -875,7 +898,7 @@ pub fn check_permission( validate_param(&stmt.table_name, query_ctx)?; } Statement::ShowCreateFlow(stmt) => { - validate_param(&stmt.flow_name, query_ctx)?; + validate_flow(&stmt.flow_name, query_ctx)?; } #[cfg(feature = "enterprise")] Statement::ShowCreateTrigger(stmt) => { @@ -908,6 +931,12 @@ pub fn check_permission( // show charset and show collation won't be checked Statement::ShowCharset(_) | Statement::ShowCollation(_) => {} + Statement::Comment(comment) => match &comment.object { + CommentObject::Table(table) => validate_param(table, query_ctx)?, + CommentObject::Column { table, .. 
} => validate_param(table, query_ctx)?, + CommentObject::Flow(flow) => validate_flow(flow, query_ctx)?, + }, + Statement::Insert(insert) => { let name = insert.table_name().context(ParseSqlSnafu)?; validate_param(name, query_ctx)?; @@ -993,6 +1022,27 @@ fn validate_param(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> .context(SqlExecInterceptedSnafu) } +fn validate_flow(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> { + let catalog = match &name.0[..] { + [_flow] => query_ctx.current_catalog().to_string(), + [catalog, _flow] => catalog.to_string_unquoted(), + _ => { + return InvalidSqlSnafu { + err_msg: format!( + "expect flow name to be . or , actual: {name}", + ), + } + .fail(); + } + }; + + let schema = query_ctx.current_schema(); + + validate_catalog_and_schema(&catalog, &schema, query_ctx) + .map_err(BoxedError::new) + .context(SqlExecInterceptedSnafu) +} + fn validate_database(name: &ObjectName, query_ctx: &QueryContextRef) -> Result<()> { let (catalog, schema) = match &name.0[..] { [schema] => ( @@ -1251,6 +1301,28 @@ mod tests { // test describe table let sql = "DESC TABLE {catalog}{schema}demo;"; - replace_test(sql, plugins, &query_ctx); + replace_test(sql, plugins.clone(), &query_ctx); + + let comment_flow_cases = [ + ("COMMENT ON FLOW my_flow IS 'comment';", true), + ("COMMENT ON FLOW greptime.my_flow IS 'comment';", true), + ("COMMENT ON FLOW wrongcatalog.my_flow IS 'comment';", false), + ]; + for (sql, is_ok) in comment_flow_cases { + let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0]; + let result = check_permission(plugins.clone(), stmt, &query_ctx); + assert_eq!(result.is_ok(), is_ok); + } + + let show_flow_cases = [ + ("SHOW CREATE FLOW my_flow;", true), + ("SHOW CREATE FLOW greptime.my_flow;", true), + ("SHOW CREATE FLOW wrongcatalog.my_flow;", false), + ]; + for (sql, is_ok) in show_flow_cases { + let stmt = &parse_stmt(sql, &GreptimeDbDialect {}).unwrap()[0]; + let result = check_permission(plugins.clone(), stmt, &query_ctx); + assert_eq!(result.is_ok(), is_ok); + } } } diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs index 7a32e0adcb..bd3547b371 100644 --- a/src/frontend/src/instance/builder.rs +++ b/src/frontend/src/instance/builder.rs @@ -13,6 +13,7 @@ // limitations under the License. 
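+// `AtomicBool` backs the per-instance suspend flag that the builder now initializes
+// to `false` when it constructs the `Instance`.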
use std::sync::Arc; +use std::sync::atomic::AtomicBool; use cache::{TABLE_FLOWNODE_SET_CACHE_NAME, TABLE_ROUTE_CACHE_NAME}; use catalog::CatalogManagerRef; @@ -32,15 +33,18 @@ use operator::flow::FlowServiceOperator; use operator::insert::Inserter; use operator::procedure::ProcedureServiceOperator; use operator::request::Requester; -use operator::statement::{StatementExecutor, StatementExecutorRef}; +use operator::statement::{ + ExecutorConfigureContext, StatementExecutor, StatementExecutorConfiguratorRef, + StatementExecutorRef, +}; use operator::table::TableMutationOperator; use partition::manager::PartitionRuleManager; use pipeline::pipeline_operator::PipelineOperator; use query::QueryEngineFactory; use query::region_query::RegionQueryHandlerFactoryRef; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; -use crate::error::{self, Result}; +use crate::error::{self, ExternalSnafu, Result}; use crate::events::EventHandlerImpl; use crate::frontend::FrontendOptions; use crate::instance::Instance; @@ -84,6 +88,33 @@ impl FrontendBuilder { } } + #[cfg(test)] + pub(crate) fn new_test( + options: &FrontendOptions, + meta_client: meta_client::MetaClientRef, + ) -> Self { + let kv_backend = Arc::new(common_meta::kv_backend::memory::MemoryKvBackend::new()); + + let layered_cache_registry = Arc::new( + common_meta::cache::LayeredCacheRegistryBuilder::default() + .add_cache_registry(cache::build_fundamental_cache_registry(kv_backend.clone())) + .build(), + ); + + Self::new( + options.clone(), + kv_backend, + layered_cache_registry, + catalog::memory::MemoryCatalogManager::with_default_setup(), + Arc::new(client::client_manager::NodeClients::default()), + meta_client, + Arc::new(catalog::process_manager::ProcessManager::new( + "".to_string(), + None, + )), + ) + } + pub fn with_local_cache_invalidator(self, cache_invalidator: CacheInvalidatorRef) -> Self { Self { local_cache_invalidator: Some(cache_invalidator), @@ -187,10 +218,15 @@ impl FrontendBuilder { Some(process_manager.clone()), ); - #[cfg(feature = "enterprise")] let statement_executor = - if let Some(factory) = plugins.get::() { - statement_executor.with_trigger_querier(factory.create(kv_backend.clone())) + if let Some(configurator) = plugins.get::() { + let ctx = ExecutorConfigureContext { + kv_backend: kv_backend.clone(), + }; + configurator + .configure(statement_executor, ctx) + .await + .context(ExternalSnafu)? } else { statement_executor }; @@ -234,6 +270,7 @@ impl FrontendBuilder { process_manager, otlp_metrics_table_legacy_cache: DashMap::new(), slow_query_options: self.options.slow_query.clone(), + suspend: Arc::new(AtomicBool::new(false)), }) } } diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs index 9eeb57ce01..9d3e3ac85c 100644 --- a/src/frontend/src/instance/grpc.rs +++ b/src/frontend/src/instance/grpc.rs @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
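+// `Pin` and `Instant` support the new streaming bulk-insert handler
+// (`handle_put_record_batch_stream`) added below.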
+use std::pin::Pin; use std::sync::Arc; +use std::time::Instant; use api::helper::from_pb_time_ranges; use api::v1::ddl_request::{Expr as DdlExpr, Expr}; @@ -22,16 +24,18 @@ use api::v1::{ DeleteRequests, DropFlowExpr, InsertIntoPlan, InsertRequests, RowDeleteRequests, RowInsertRequests, }; +use async_stream::try_stream; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; use common_base::AffectedRows; use common_error::ext::BoxedError; -use common_grpc::FlightData; -use common_grpc::flight::FlightDecoder; +use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; use common_query::logical_plan::add_insert_to_logical_plan; use common_telemetry::tracing::{self}; use datafusion::datasource::DefaultTableSource; +use futures::Stream; +use futures::stream::StreamExt; use query::parser::PromQuery; use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef}; use servers::query_handler::grpc::GrpcQueryHandler; @@ -230,6 +234,11 @@ impl GrpcQueryHandler for Instance { DdlExpr::DropView(_) => { todo!("implemented in the following PR") } + DdlExpr::CommentOn(expr) => { + self.statement_executor + .comment_by_expr(expr, ctx.clone()) + .await? + } } } }; @@ -240,10 +249,8 @@ impl GrpcQueryHandler for Instance { async fn put_record_batch( &self, - table_name: &TableName, + request: servers::grpc::flight::PutRecordBatchRequest, table_ref: &mut Option, - decoder: &mut FlightDecoder, - data: FlightData, ctx: QueryContextRef, ) -> Result { let table = if let Some(table) = table_ref { @@ -252,15 +259,15 @@ impl GrpcQueryHandler for Instance { let table = self .catalog_manager() .table( - &table_name.catalog_name, - &table_name.schema_name, - &table_name.table_name, + &request.table_name.catalog_name, + &request.table_name.schema_name, + &request.table_name.table_name, None, ) .await .context(CatalogSnafu)? .with_context(|| TableNotFoundSnafu { - table_name: table_name.to_string(), + table_name: request.table_name.to_string(), })?; *table_ref = Some(table.clone()); table @@ -279,10 +286,87 @@ impl GrpcQueryHandler for Instance { // do we check limit for bulk insert? self.inserter - .handle_bulk_insert(table, decoder, data) + .handle_bulk_insert( + table, + request.flight_data, + request.record_batch, + request.schema_bytes, + ) .await .context(TableOperationSnafu) } + + fn handle_put_record_batch_stream( + &self, + mut stream: servers::grpc::flight::PutRecordBatchRequestStream, + ctx: QueryContextRef, + ) -> Pin> + Send>> { + // Clone all necessary data to make it 'static + let catalog_manager = self.catalog_manager().clone(); + let plugins = self.plugins.clone(); + let inserter = self.inserter.clone(); + let ctx = ctx.clone(); + let mut table_ref: Option = None; + let mut table_checked = false; + + Box::pin(try_stream! 
{ + // Process each request in the stream + while let Some(request_result) = stream.next().await { + let request = request_result.map_err(|e| { + let error_msg = format!("Stream error: {:?}", e); + IncompleteGrpcRequestSnafu { err_msg: error_msg }.build() + })?; + + // Resolve table and check permissions on first RecordBatch (after schema is received) + if !table_checked { + let table_name = &request.table_name; + + plugins + .get::() + .as_ref() + .check_permission(ctx.current_user(), PermissionReq::BulkInsert) + .context(PermissionSnafu)?; + + // Resolve table reference + table_ref = Some( + catalog_manager + .table( + &table_name.catalog_name, + &table_name.schema_name, + &table_name.table_name, + None, + ) + .await + .context(CatalogSnafu)? + .with_context(|| TableNotFoundSnafu { + table_name: table_name.to_string(), + })?, + ); + + // Check permissions for the table + let interceptor_ref = plugins.get::>(); + let interceptor = interceptor_ref.as_ref(); + interceptor.pre_bulk_insert(table_ref.clone().unwrap(), ctx.clone())?; + + table_checked = true; + } + + let request_id = request.request_id; + let start = Instant::now(); + let rows = inserter + .handle_bulk_insert( + table_ref.clone().unwrap(), + request.flight_data, + request.record_batch, + request.schema_bytes, + ) + .await + .context(TableOperationSnafu)?; + let elapsed_secs = start.elapsed().as_secs_f64(); + yield DoPutResponse::new(request_id, rows, elapsed_secs); + } + }) + } } fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryContextRef) { @@ -330,6 +414,9 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte Expr::DropView(expr) => { check_and_fill!(expr); } + Expr::CommentOn(expr) => { + check_and_fill!(expr); + } } } diff --git a/src/frontend/src/instance/jaeger.rs b/src/frontend/src/instance/jaeger.rs index 7d06236c4a..607ed80098 100644 --- a/src/frontend/src/instance/jaeger.rs +++ b/src/frontend/src/instance/jaeger.rs @@ -32,16 +32,16 @@ use common_telemetry::warn; use datafusion::dataframe::DataFrame; use datafusion::execution::SessionStateBuilder; use datafusion::execution::context::SessionContext; +use datafusion::functions_window::expr_fn::row_number; use datafusion_expr::select_expr::SelectExpr; -use datafusion_expr::{Expr, SortExpr, col, lit, lit_timestamp_nano, wildcard}; -use datatypes::value::ValueRef; +use datafusion_expr::{Expr, ExprFunctionExt, SortExpr, col, lit, lit_timestamp_nano, wildcard}; use query::QueryEngineRef; use serde_json::Value as JsonValue; use servers::error::{ CatalogSnafu, CollectRecordbatchSnafu, DataFusionSnafu, Result as ServerResult, TableNotFoundSnafu, }; -use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams}; +use servers::http::jaeger::{JAEGER_QUERY_TABLE_NAME_KEY, QueryTraceParams, TraceUserAgent}; use servers::otlp::trace::{ DURATION_NANO_COLUMN, KEY_OTEL_STATUS_ERROR_KEY, SERVICE_NAME_COLUMN, SPAN_ATTRIBUTES_COLUMN, SPAN_KIND_COLUMN, SPAN_KIND_PREFIX, SPAN_NAME_COLUMN, SPAN_STATUS_CODE, SPAN_STATUS_ERROR, @@ -50,12 +50,14 @@ use servers::otlp::trace::{ use servers::query_handler::JaegerQueryHandler; use session::context::QueryContextRef; use snafu::{OptionExt, ResultExt}; +use table::TableRef; use table::requests::{TABLE_DATA_MODEL, TABLE_DATA_MODEL_TRACE_V1}; use table::table::adapter::DfTableProviderAdapter; use crate::instance::Instance; const DEFAULT_LIMIT: usize = 2000; +const KEY_RN: &str = "greptime_rn"; #[async_trait] impl JaegerQueryHandler for Instance { @@ -63,8 +65,7 @@ impl JaegerQueryHandler 
for Instance { // It's equivalent to `SELECT DISTINCT(service_name) FROM {db}.{trace_table}`. Ok(query_trace_table( ctx, - self.catalog_manager(), - self.query_engine(), + self, vec![SelectExpr::from(col(SERVICE_NAME_COLUMN))], vec![], vec![], @@ -105,8 +106,7 @@ impl JaegerQueryHandler for Instance { // ```. Ok(query_trace_table( ctx, - self.catalog_manager(), - self.query_engine(), + self, vec![ SelectExpr::from(col(SPAN_NAME_COLUMN)), SelectExpr::from(col(SPAN_KIND_COLUMN)), @@ -128,6 +128,7 @@ impl JaegerQueryHandler for Instance { trace_id: &str, start_time: Option, end_time: Option, + limit: Option, ) -> ServerResult { // It's equivalent to the following SQL query: // @@ -157,12 +158,11 @@ impl JaegerQueryHandler for Instance { Ok(query_trace_table( ctx, - self.catalog_manager(), - self.query_engine(), + self, selects, filters, vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. - Some(DEFAULT_LIMIT), + limit, None, vec![], ) @@ -217,8 +217,7 @@ impl JaegerQueryHandler for Instance { // ```. let output = query_trace_table( ctx.clone(), - self.catalog_manager(), - self.query_engine(), + self, vec![wildcard()], filters, vec![], @@ -259,26 +258,47 @@ impl JaegerQueryHandler for Instance { filters.push(col(TIMESTAMP_COLUMN).lt_eq(lit_timestamp_nano(end_time))); } - Ok(query_trace_table( - ctx, - self.catalog_manager(), - self.query_engine(), - vec![wildcard()], - filters, - vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. - None, - None, - vec![], - ) - .await?) + match query_params.user_agent { + TraceUserAgent::Grafana => { + // grafana only use trace id and timestamp + // clicking the trace id will invoke the query trace api + // so we only need to return 1 span for each trace + let table_name = ctx + .extension(JAEGER_QUERY_TABLE_NAME_KEY) + .unwrap_or(TRACE_TABLE_NAME); + + let table = get_table(ctx.clone(), self.catalog_manager(), table_name).await?; + + Ok(find_traces_rank_3( + table, + self.query_engine(), + filters, + vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. + ) + .await?) + } + _ => { + // query all spans + Ok(query_trace_table( + ctx, + self, + vec![wildcard()], + filters, + vec![col(TIMESTAMP_COLUMN).sort(false, false)], // Sort by timestamp in descending order. + None, + None, + vec![], + ) + .await?) + } + } } } #[allow(clippy::too_many_arguments)] async fn query_trace_table( ctx: QueryContextRef, - catalog_manager: &CatalogManagerRef, - query_engine: &QueryEngineRef, + instance: &Instance, selects: Vec, filters: Vec, sorts: Vec, @@ -308,7 +328,8 @@ async fn query_trace_table( } }; - let table = catalog_manager + let table = instance + .catalog_manager() .table( ctx.current_catalog(), &ctx.current_schema(), @@ -341,7 +362,7 @@ async fn query_trace_table( .map(|s| format!("\"{}\"", s)) .collect::>(); - let df_context = create_df_context(query_engine)?; + let df_context = create_df_context(instance.query_engine())?; let dataframe = df_context .read_table(Arc::new(DfTableProviderAdapter::new(table))) @@ -392,6 +413,84 @@ async fn query_trace_table( Ok(output) } +async fn get_table( + ctx: QueryContextRef, + catalog_manager: &CatalogManagerRef, + table_name: &str, +) -> ServerResult { + catalog_manager + .table( + ctx.current_catalog(), + &ctx.current_schema(), + table_name, + Some(&ctx), + ) + .await + .context(CatalogSnafu)? 
+ .with_context(|| TableNotFoundSnafu { + table: table_name, + catalog: ctx.current_catalog(), + schema: ctx.current_schema(), + }) +} + +async fn find_traces_rank_3( + table: TableRef, + query_engine: &QueryEngineRef, + filters: Vec, + sorts: Vec, +) -> ServerResult { + let df_context = create_df_context(query_engine)?; + + let dataframe = df_context + .read_table(Arc::new(DfTableProviderAdapter::new(table))) + .context(DataFusionSnafu)?; + + let dataframe = dataframe + .select(vec![wildcard()]) + .context(DataFusionSnafu)?; + + // Apply all filters. + let dataframe = filters.into_iter().try_fold(dataframe, |df, expr| { + df.filter(expr).context(DataFusionSnafu) + })?; + + // Apply the sorts if needed. + let dataframe = if !sorts.is_empty() { + dataframe.sort(sorts).context(DataFusionSnafu)? + } else { + dataframe + }; + + // create rank column, for each trace, get the earliest 3 spans + let trace_id_col = vec![col(TRACE_ID_COLUMN)]; + let timestamp_asc = vec![col(TIMESTAMP_COLUMN).sort(true, false)]; + + let dataframe = dataframe + .with_column( + KEY_RN, + row_number() + .partition_by(trace_id_col) + .order_by(timestamp_asc) + .build() + .context(DataFusionSnafu)?, + ) + .context(DataFusionSnafu)?; + + let dataframe = dataframe + .filter(col(KEY_RN).lt_eq(lit(3))) + .context(DataFusionSnafu)?; + + // Execute the query and collect the result. + let stream = dataframe.execute_stream().await.context(DataFusionSnafu)?; + + let output = Output::new_with_stream(Box::pin( + RecordBatchStreamAdapter::try_new(stream).context(CollectRecordbatchSnafu)?, + )); + + Ok(output) +} + // The current implementation registers UDFs during the planning stage, which makes it difficult // to utilize them through DataFrame APIs. To address this limitation, we create a new session // context and register the required UDFs, allowing them to be decoupled from the global context. @@ -587,13 +686,10 @@ async fn trace_ids_from_output(output: Output) -> ServerResult> { { let mut trace_ids = vec![]; for recordbatch in recordbatches { - for col in recordbatch.columns().iter() { - for row_idx in 0..recordbatch.num_rows() { - if let ValueRef::String(value) = col.get_ref(row_idx) { - trace_ids.push(value.to_string()); - } - } - } + recordbatch + .iter_column_as_string(0) + .flatten() + .for_each(|x| trace_ids.push(x)); } return Ok(trace_ids); diff --git a/src/frontend/src/instance/promql.rs b/src/frontend/src/instance/promql.rs index 0d754167c7..f527b18a71 100644 --- a/src/frontend/src/instance/promql.rs +++ b/src/frontend/src/instance/promql.rs @@ -20,7 +20,6 @@ use common_catalog::consts::INFORMATION_SCHEMA_NAME; use common_catalog::format_full_table_name; use common_recordbatch::util; use common_telemetry::tracing; -use datatypes::prelude::Value; use promql_parser::label::{MatchOp, Matcher, Matchers}; use query::promql; use query::promql::planner::PromPlanner; @@ -90,15 +89,10 @@ impl Instance { for batch in batches { // Only one column the results, ensured by `prometheus::metric_name_matchers_to_plan`. 
- let names = batch.column(0); - - for i in 0..names.len() { - let Value::String(name) = names.get(i) else { - unreachable!(); - }; - - results.push(name.into_string()); - } + batch + .iter_column_as_string(0) + .flatten() + .for_each(|x| results.push(x)) } Ok(results) @@ -142,7 +136,7 @@ impl Instance { table_name: format_full_table_name(ctx.current_catalog(), &table_schema, &metric), })?; - let scan_plan = dataframe.into_logical_plan(); + let scan_plan = dataframe.into_unoptimized_plan(); let filter_conditions = PromPlanner::matchers_to_expr(Matchers::new(matchers), scan_plan.schema()) .context(PrometheusLabelValuesQueryPlanSnafu)?; @@ -173,11 +167,10 @@ impl Instance { let mut results = Vec::with_capacity(batches.iter().map(|b| b.num_rows()).sum()); for batch in batches { // Only one column in results, ensured by `prometheus::label_values_matchers_to_plan`. - let names = batch.column(0); - - for i in 0..names.len() { - results.push(names.get(i).to_string()); - } + batch + .iter_column_as_string(0) + .flatten() + .for_each(|x| results.push(x)) } Ok(results) diff --git a/src/frontend/src/limiter.rs b/src/frontend/src/limiter.rs index e0e32e6b1b..1055267b2d 100644 --- a/src/frontend/src/limiter.rs +++ b/src/frontend/src/limiter.rs @@ -18,7 +18,8 @@ use api::v1::column::Values; use api::v1::greptime_request::Request; use api::v1::value::ValueData; use api::v1::{ - Decimal128, InsertRequests, IntervalMonthDayNano, RowInsertRequest, RowInsertRequests, + Decimal128, InsertRequests, IntervalMonthDayNano, JsonValue, RowInsertRequest, + RowInsertRequests, json_value, }; use pipeline::ContextReq; use snafu::ResultExt; @@ -229,12 +230,29 @@ impl Limiter { .unwrap_or(0) }) .sum(), - ValueData::JsonValue(inner) => inner - .as_ref() - .value_data - .as_ref() - .map(Self::size_of_value_data) - .unwrap_or(0), + ValueData::JsonValue(v) => { + fn calc(v: &JsonValue) -> usize { + let Some(value) = v.value.as_ref() else { + return 0; + }; + match value { + json_value::Value::Boolean(_) => size_of::(), + json_value::Value::Int(_) => size_of::(), + json_value::Value::Uint(_) => size_of::(), + json_value::Value::Float(_) => size_of::(), + json_value::Value::Str(s) => s.len(), + json_value::Value::Array(array) => array.items.iter().map(calc).sum(), + json_value::Value::Object(object) => object + .entries + .iter() + .flat_map(|entry| { + entry.value.as_ref().map(|v| entry.key.len() + calc(v)) + }) + .sum(), + } + } + calc(v) + } } } } diff --git a/src/frontend/src/server.rs b/src/frontend/src/server.rs index 6c19109ab2..be14fb3cd2 100644 --- a/src/frontend/src/server.rs +++ b/src/frontend/src/server.rs @@ -16,16 +16,21 @@ use std::net::SocketAddr; use std::sync::Arc; use auth::UserProviderRef; +use axum::extract::{Request, State}; +use axum::middleware::Next; +use axum::response::IntoResponse; use common_base::Plugins; use common_config::Configurable; use common_telemetry::info; use meta_client::MetaClientOptions; use servers::error::Error as ServerError; use servers::grpc::builder::GrpcServerBuilder; +use servers::grpc::flight::FlightCraftRef; use servers::grpc::frontend_grpc_handler::FrontendGrpcHandler; use servers::grpc::greptime_handler::GreptimeRequestHandler; use servers::grpc::{GrpcOptions, GrpcServer}; use servers::http::event::LogValidatorRef; +use servers::http::result::error_result::ErrorResponse; use servers::http::utils::router::RouterConfigurator; use servers::http::{HttpServer, HttpServerBuilder}; use servers::interceptor::LogIngestInterceptorRef; @@ -36,8 +41,9 @@ use 
servers::postgres::PostgresServer; use servers::query_handler::grpc::ServerGrpcQueryHandlerAdapter; use servers::query_handler::sql::ServerSqlQueryHandlerAdapter; use servers::server::{Server, ServerHandlers}; -use servers::tls::{ReloadableTlsServerConfig, maybe_watch_tls_config}; +use servers::tls::{ReloadableTlsServerConfig, maybe_watch_server_tls_config}; use snafu::ResultExt; +use tonic::Status; use crate::error::{self, Result, StartServerSnafu, TomlFormatSnafu}; use crate::frontend::FrontendOptions; @@ -52,6 +58,7 @@ where grpc_server_builder: Option, http_server_builder: Option, plugins: Plugins, + flight_handler: Option, } impl Services @@ -65,6 +72,7 @@ where grpc_server_builder: None, http_server_builder: None, plugins, + flight_handler: None, } } @@ -122,7 +130,16 @@ where builder = builder.with_extra_router(configurator.router()); } - builder + builder.add_layer(axum::middleware::from_fn_with_state( + self.instance.clone(), + async move |State(state): State>, request: Request, next: Next| { + if state.is_suspended() { + return ErrorResponse::from_error(servers::error::SuspendedSnafu.build()) + .into_response(); + } + next.run(request).await + }, + )) } pub fn with_grpc_server_builder(self, builder: GrpcServerBuilder) -> Self { @@ -139,6 +156,13 @@ where } } + pub fn with_flight_handler(self, flight_handler: FlightCraftRef) -> Self { + Self { + flight_handler: Some(flight_handler), + ..self + } + } + fn build_grpc_server( &mut self, grpc: &GrpcOptions, @@ -173,6 +197,12 @@ where grpc.flight_compression, ); + // Use custom flight handler if provided, otherwise use the default GreptimeRequestHandler + let flight_handler = self + .flight_handler + .clone() + .unwrap_or_else(|| Arc::new(greptime_request_handler.clone()) as FlightCraftRef); + let grpc_server = builder .name(name) .database_handler(greptime_request_handler.clone()) @@ -181,7 +211,17 @@ where self.instance.clone(), user_provider.clone(), )) - .flight_handler(Arc::new(greptime_request_handler)); + .flight_handler(flight_handler) + .add_layer(axum::middleware::from_fn_with_state( + self.instance.clone(), + async move |State(state): State>, request: Request, next: Next| { + if state.is_suspended() { + let status = Status::from(servers::error::SuspendedSnafu.build()); + return status.into_http(); + } + next.run(request).await + }, + )); let grpc_server = if !external { let frontend_grpc_handler = @@ -258,7 +298,7 @@ where ); // will not watch if watch is disabled in tls option - maybe_watch_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; + maybe_watch_server_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; let mysql_server = MysqlServer::create_server( common_runtime::global_runtime(), @@ -287,7 +327,7 @@ where ReloadableTlsServerConfig::try_new(opts.tls.clone()).context(StartServerSnafu)?, ); - maybe_watch_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; + maybe_watch_server_tls_config(tls_server_config.clone()).context(StartServerSnafu)?; let pg_server = Box::new(PostgresServer::new( ServerSqlQueryHandlerAdapter::arc(instance.clone()), diff --git a/src/index/Cargo.toml b/src/index/Cargo.toml index bde6959b89..03fa4da4e8 100644 --- a/src/index/Cargo.toml +++ b/src/index/Cargo.toml @@ -17,6 +17,7 @@ common-error.workspace = true common-macro.workspace = true common-runtime.workspace = true common-telemetry.workspace = true +datatypes.workspace = true fastbloom = "0.8" fst.workspace = true futures.workspace = true @@ -25,6 +26,7 @@ itertools.workspace = true jieba-rs = 
"0.8" lazy_static.workspace = true mockall.workspace = true +nalgebra.workspace = true pin-project.workspace = true prost.workspace = true puffin.workspace = true @@ -39,6 +41,7 @@ tantivy = { version = "0.24", features = ["zstd-compression"] } tantivy-jieba = "0.16" tokio.workspace = true tokio-util.workspace = true +usearch = { version = "2.21", default-features = false, features = ["fp16lib"] } uuid.workspace = true [dev-dependencies] diff --git a/src/index/src/bloom_filter/applier.rs b/src/index/src/bloom_filter/applier.rs index 18332d4815..db219c9e61 100644 --- a/src/index/src/bloom_filter/applier.rs +++ b/src/index/src/bloom_filter/applier.rs @@ -21,7 +21,7 @@ use itertools::Itertools; use crate::Bytes; use crate::bloom_filter::error::Result; -use crate::bloom_filter::reader::BloomFilterReader; +use crate::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader}; /// `InListPredicate` contains a list of acceptable values. A value needs to match at least /// one of the elements (logical OR semantic) for the predicate to be satisfied. @@ -38,7 +38,7 @@ pub struct BloomFilterApplier { impl BloomFilterApplier { pub async fn new(reader: Box) -> Result { - let meta = reader.metadata().await?; + let meta = reader.metadata(None).await?; Ok(Self { reader, meta }) } @@ -50,6 +50,7 @@ impl BloomFilterApplier { &mut self, predicates: &[InListPredicate], search_ranges: &[Range], + metrics: Option<&mut BloomFilterReadMetrics>, ) -> Result>> { if predicates.is_empty() { // If no predicates, return empty result @@ -57,7 +58,7 @@ impl BloomFilterApplier { } let segments = self.row_ranges_to_segments(search_ranges); - let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments).await?; + let (seg_locations, bloom_filters) = self.load_bloom_filters(&segments, metrics).await?; let matching_row_ranges = self.find_matching_rows(seg_locations, bloom_filters, predicates); Ok(intersect_ranges(search_ranges, &matching_row_ranges)) } @@ -95,6 +96,7 @@ impl BloomFilterApplier { async fn load_bloom_filters( &mut self, segments: &[usize], + metrics: Option<&mut BloomFilterReadMetrics>, ) -> Result<(Vec<(u64, usize)>, Vec)> { let segment_locations = segments .iter() @@ -108,7 +110,10 @@ impl BloomFilterApplier { .map(|i| self.meta.bloom_filter_locs[i as usize]) .collect::>(); - let bloom_filters = self.reader.bloom_filter_vec(&bloom_filter_locs).await?; + let bloom_filters = self + .reader + .bloom_filter_vec(&bloom_filter_locs, metrics) + .await?; Ok((segment_locations, bloom_filters)) } @@ -422,7 +427,10 @@ mod tests { ]; for (predicates, search_range, expected) in cases { - let result = applier.search(&predicates, &[search_range]).await.unwrap(); + let result = applier + .search(&predicates, &[search_range], None) + .await + .unwrap(); assert_eq!( result, expected, "Expected {:?}, got {:?}", diff --git a/src/index/src/bloom_filter/reader.rs b/src/index/src/bloom_filter/reader.rs index 466024f0d7..a9e08694e7 100644 --- a/src/index/src/bloom_filter/reader.rs +++ b/src/index/src/bloom_filter/reader.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::ops::{Range, Rem}; +use std::time::{Duration, Instant}; use async_trait::async_trait; use bytemuck::try_cast_slice; @@ -34,6 +35,72 @@ const BLOOM_META_LEN_SIZE: u64 = 4; /// Default prefetch size of bloom filter meta. pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB +/// Metrics for bloom filter read operations. +#[derive(Default, Clone)] +pub struct BloomFilterReadMetrics { + /// Total byte size to read. 
+ pub total_bytes: u64, + /// Total number of ranges to read. + pub total_ranges: usize, + /// Elapsed time to fetch data. + pub fetch_elapsed: Duration, + /// Number of cache hits. + pub cache_hit: usize, + /// Number of cache misses. + pub cache_miss: usize, +} + +impl std::fmt::Debug for BloomFilterReadMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + total_bytes, + total_ranges, + fetch_elapsed, + cache_hit, + cache_miss, + } = self; + + // If both total_bytes and cache_hit are 0, we didn't read anything. + if *total_bytes == 0 && *cache_hit == 0 { + return write!(f, "{{}}"); + } + write!(f, "{{")?; + + if *total_bytes > 0 { + write!(f, "\"total_bytes\":{}", total_bytes)?; + } + if *cache_hit > 0 { + if *total_bytes > 0 { + write!(f, ", ")?; + } + write!(f, "\"cache_hit\":{}", cache_hit)?; + } + + if *total_ranges > 0 { + write!(f, ", \"total_ranges\":{}", total_ranges)?; + } + if !fetch_elapsed.is_zero() { + write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?; + } + if *cache_miss > 0 { + write!(f, ", \"cache_miss\":{}", cache_miss)?; + } + + write!(f, "}}") + } +} + +impl BloomFilterReadMetrics { + /// Merges another metrics into this one. + pub fn merge_from(&mut self, other: &Self) { + self.total_bytes += other.total_bytes; + self.total_ranges += other.total_ranges; + self.fetch_elapsed += other.fetch_elapsed; + self.cache_hit += other.cache_hit; + self.cache_miss += other.cache_miss; + } +} + /// Safely converts bytes to Vec using bytemuck for optimal performance. /// Faster than chunking and converting each piece individually. /// @@ -79,25 +146,33 @@ pub fn bytes_to_u64_vec(bytes: &Bytes) -> Vec { #[async_trait] pub trait BloomFilterReader: Sync { /// Reads range of bytes from the file. - async fn range_read(&self, offset: u64, size: u32) -> Result; + async fn range_read( + &self, + offset: u64, + size: u32, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result; /// Reads bunch of ranges from the file. - async fn read_vec(&self, ranges: &[Range]) -> Result> { - let mut results = Vec::with_capacity(ranges.len()); - for range in ranges { - let size = (range.end - range.start) as u32; - let data = self.range_read(range.start, size).await?; - results.push(data); - } - Ok(results) - } + async fn read_vec( + &self, + ranges: &[Range], + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result>; /// Reads the meta information of the bloom filter. - async fn metadata(&self) -> Result; + async fn metadata( + &self, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result; /// Reads a bloom filter with the given location. 
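+    ///
+    /// A minimal usage sketch (the `reader` value and error handling are hypothetical;
+    /// it only assumes the optional-metrics signatures introduced here):
+    ///
+    /// ```ignore
+    /// let mut metrics = BloomFilterReadMetrics::default();
+    /// let meta = reader.metadata(Some(&mut metrics)).await?;
+    /// let bf = reader
+    ///     .bloom_filter(&meta.bloom_filter_locs[0], Some(&mut metrics))
+    ///     .await?;
+    /// assert!(metrics.total_ranges >= 2);
+    /// ```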
- async fn bloom_filter(&self, loc: &BloomFilterLoc) -> Result { - let bytes = self.range_read(loc.offset, loc.size as _).await?; + async fn bloom_filter( + &self, + loc: &BloomFilterLoc, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result { + let bytes = self.range_read(loc.offset, loc.size as _, metrics).await?; let vec = bytes_to_u64_vec(&bytes); let bm = BloomFilter::from_vec(vec) .seed(&SEED) @@ -105,12 +180,16 @@ pub trait BloomFilterReader: Sync { Ok(bm) } - async fn bloom_filter_vec(&self, locs: &[BloomFilterLoc]) -> Result> { + async fn bloom_filter_vec( + &self, + locs: &[BloomFilterLoc], + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result> { let ranges = locs .iter() .map(|l| l.offset..l.offset + l.size) .collect::>(); - let bss = self.read_vec(&ranges).await?; + let bss = self.read_vec(&ranges, metrics).await?; let mut result = Vec::with_capacity(bss.len()); for (bs, loc) in bss.into_iter().zip(locs.iter()) { @@ -140,24 +219,59 @@ impl BloomFilterReaderImpl { #[async_trait] impl BloomFilterReader for BloomFilterReaderImpl { - async fn range_read(&self, offset: u64, size: u32) -> Result { - self.reader + async fn range_read( + &self, + offset: u64, + size: u32, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result { + let start = metrics.as_ref().map(|_| Instant::now()); + let result = self + .reader .read(offset..offset + size as u64) .await - .context(IoSnafu) + .context(IoSnafu)?; + + if let Some(m) = metrics { + m.total_ranges += 1; + m.total_bytes += size as u64; + if let Some(start) = start { + m.fetch_elapsed += start.elapsed(); + } + } + + Ok(result) } - async fn read_vec(&self, ranges: &[Range]) -> Result> { - self.reader.read_vec(ranges).await.context(IoSnafu) + async fn read_vec( + &self, + ranges: &[Range], + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result> { + let start = metrics.as_ref().map(|_| Instant::now()); + let result = self.reader.read_vec(ranges).await.context(IoSnafu)?; + + if let Some(m) = metrics { + m.total_ranges += ranges.len(); + m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::(); + if let Some(start) = start { + m.fetch_elapsed += start.elapsed(); + } + } + + Ok(result) } - async fn metadata(&self) -> Result { + async fn metadata( + &self, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result { let metadata = self.reader.metadata().await.context(IoSnafu)?; let file_size = metadata.content_length; let mut meta_reader = BloomFilterMetaReader::new(&self.reader, file_size, Some(DEFAULT_PREFETCH_SIZE)); - meta_reader.metadata().await + meta_reader.metadata(metrics).await } } @@ -183,7 +297,10 @@ impl BloomFilterMetaReader { /// /// It will first prefetch some bytes from the end of the file, /// then parse the metadata from the prefetch bytes. - pub async fn metadata(&mut self) -> Result { + pub async fn metadata( + &mut self, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result { ensure!( self.file_size >= BLOOM_META_LEN_SIZE, FileSizeTooSmallSnafu { @@ -191,6 +308,7 @@ impl BloomFilterMetaReader { } ); + let start = metrics.as_ref().map(|_| Instant::now()); let meta_start = self.file_size.saturating_sub(self.prefetch_size); let suffix = self .reader @@ -208,8 +326,28 @@ impl BloomFilterMetaReader { .read(metadata_start..self.file_size - BLOOM_META_LEN_SIZE) .await .context(IoSnafu)?; + + if let Some(m) = metrics { + // suffix read + meta read + m.total_ranges += 2; + // Ignores the meta length size to simplify the calculation. 
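+                // This branch issued two reads: the prefetched suffix plus a follow-up
+                // read of `length` bytes because the metadata did not fit into the
+                // prefetch window, hence `total_ranges += 2` above.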
+ m.total_bytes += self.file_size.min(self.prefetch_size) + length; + if let Some(start) = start { + m.fetch_elapsed += start.elapsed(); + } + } + BloomFilterMeta::decode(meta).context(DecodeProtoSnafu) } else { + if let Some(m) = metrics { + // suffix read only + m.total_ranges += 1; + m.total_bytes += self.file_size.min(self.prefetch_size); + if let Some(start) = start { + m.fetch_elapsed += start.elapsed(); + } + } + let metadata_start = self.file_size - length - BLOOM_META_LEN_SIZE - meta_start; let meta = &suffix[metadata_start as usize..suffix_len - BLOOM_META_LEN_SIZE as usize]; BloomFilterMeta::decode(meta).context(DecodeProtoSnafu) @@ -290,7 +428,7 @@ mod tests { for prefetch in [0u64, file_size / 2, file_size, file_size + 10] { let mut reader = BloomFilterMetaReader::new(bytes.clone(), file_size as _, Some(prefetch)); - let meta = reader.metadata().await.unwrap(); + let meta = reader.metadata(None).await.unwrap(); assert_eq!(meta.rows_per_segment, 2); assert_eq!(meta.segment_count, 2); @@ -312,11 +450,11 @@ mod tests { let bytes = mock_bloom_filter_bytes().await; let reader = BloomFilterReaderImpl::new(bytes); - let meta = reader.metadata().await.unwrap(); + let meta = reader.metadata(None).await.unwrap(); assert_eq!(meta.bloom_filter_locs.len(), 2); let bf = reader - .bloom_filter(&meta.bloom_filter_locs[0]) + .bloom_filter(&meta.bloom_filter_locs[0], None) .await .unwrap(); assert!(bf.contains(&b"a")); @@ -325,7 +463,7 @@ mod tests { assert!(bf.contains(&b"d")); let bf = reader - .bloom_filter(&meta.bloom_filter_locs[1]) + .bloom_filter(&meta.bloom_filter_locs[1], None) .await .unwrap(); assert!(bf.contains(&b"e")); diff --git a/src/index/src/fulltext_index/tests.rs b/src/index/src/fulltext_index/tests.rs index abdf20e22d..2198ea67b3 100644 --- a/src/index/src/fulltext_index/tests.rs +++ b/src/index/src/fulltext_index/tests.rs @@ -74,7 +74,7 @@ async fn test_search( writer.finish().await.unwrap(); let reader = puffin_manager.reader(&file_name).await.unwrap(); - let index_dir = reader.dir(&blob_key).await.unwrap(); + let (index_dir, _metrics) = reader.dir(&blob_key).await.unwrap(); let searcher = TantivyFulltextIndexSearcher::new(index_dir.path(), config).unwrap(); for (query, expected) in query_expected { let results = searcher.search(query).await.unwrap(); diff --git a/src/index/src/inverted_index/format/reader.rs b/src/index/src/inverted_index/format/reader.rs index 40fa22130a..ff67284e51 100644 --- a/src/index/src/inverted_index/format/reader.rs +++ b/src/index/src/inverted_index/format/reader.rs @@ -15,6 +15,7 @@ use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use bytes::Bytes; @@ -29,37 +30,115 @@ pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader; mod blob; mod footer; +/// Metrics for inverted index read operations. +#[derive(Default, Clone)] +pub struct InvertedIndexReadMetrics { + /// Total byte size to read. + pub total_bytes: u64, + /// Total number of ranges to read. + pub total_ranges: usize, + /// Elapsed time to fetch data. + pub fetch_elapsed: Duration, + /// Number of cache hits. + pub cache_hit: usize, + /// Number of cache misses. + pub cache_miss: usize, +} + +impl std::fmt::Debug for InvertedIndexReadMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + total_bytes, + total_ranges, + fetch_elapsed, + cache_hit, + cache_miss, + } = self; + + // If both total_bytes and cache_hit are 0, we didn't read anything. 
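+        // Emit a compact JSON-like object and omit zero/empty fields, so a metrics
+        // value that recorded nothing prints as "{}".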
+ if *total_bytes == 0 && *cache_hit == 0 { + return write!(f, "{{}}"); + } + write!(f, "{{")?; + + if *total_bytes > 0 { + write!(f, "\"total_bytes\":{}", total_bytes)?; + } + if *cache_hit > 0 { + if *total_bytes > 0 { + write!(f, ", ")?; + } + write!(f, "\"cache_hit\":{}", cache_hit)?; + } + + if *total_ranges > 0 { + write!(f, ", \"total_ranges\":{}", total_ranges)?; + } + if !fetch_elapsed.is_zero() { + write!(f, ", \"fetch_elapsed\":\"{:?}\"", fetch_elapsed)?; + } + if *cache_miss > 0 { + write!(f, ", \"cache_miss\":{}", cache_miss)?; + } + + write!(f, "}}") + } +} + +impl InvertedIndexReadMetrics { + /// Merges another metrics into this one. + pub fn merge_from(&mut self, other: &Self) { + self.total_bytes += other.total_bytes; + self.total_ranges += other.total_ranges; + self.fetch_elapsed += other.fetch_elapsed; + self.cache_hit += other.cache_hit; + self.cache_miss += other.cache_miss; + } +} + /// InvertedIndexReader defines an asynchronous reader of inverted index data #[mockall::automock] #[async_trait] pub trait InvertedIndexReader: Send + Sync { /// Seeks to given offset and reads data with exact size as provided. - async fn range_read(&self, offset: u64, size: u32) -> Result>; + async fn range_read<'a>( + &self, + offset: u64, + size: u32, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result>; /// Reads the bytes in the given ranges. - async fn read_vec(&self, ranges: &[Range]) -> Result> { - let mut result = Vec::with_capacity(ranges.len()); - for range in ranges { - let data = self - .range_read(range.start, (range.end - range.start) as u32) - .await?; - result.push(Bytes::from(data)); - } - Ok(result) - } + async fn read_vec<'a>( + &self, + ranges: &[Range], + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result>; /// Retrieves metadata of all inverted indices stored within the blob. - async fn metadata(&self) -> Result>; + async fn metadata<'a>( + &self, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result>; /// Retrieves the finite state transducer (FST) map from the given offset and size. - async fn fst(&self, offset: u64, size: u32) -> Result { - let fst_data = self.range_read(offset, size).await?; + async fn fst<'a>( + &self, + offset: u64, + size: u32, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result { + let fst_data = self.range_read(offset, size, metrics).await?; FstMap::new(fst_data).context(DecodeFstSnafu) } /// Retrieves the multiple finite state transducer (FST) maps from the given ranges. - async fn fst_vec(&mut self, ranges: &[Range]) -> Result> { - self.read_vec(ranges) + async fn fst_vec<'a>( + &mut self, + ranges: &[Range], + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { + self.read_vec(ranges, metrics) .await? .into_iter() .map(|bytes| FstMap::new(bytes.to_vec()).context(DecodeFstSnafu)) @@ -67,19 +146,28 @@ pub trait InvertedIndexReader: Send + Sync { } /// Retrieves the bitmap from the given offset and size. 
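+    ///
+    /// Like the other read methods on this trait, `metrics` is optional: pass `None`
+    /// when read statistics are not needed and the extra bookkeeping is skipped
+    /// entirely.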
- async fn bitmap(&self, offset: u64, size: u32, bitmap_type: BitmapType) -> Result { - self.range_read(offset, size).await.and_then(|bytes| { - Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu) - }) + async fn bitmap<'a>( + &self, + offset: u64, + size: u32, + bitmap_type: BitmapType, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result { + self.range_read(offset, size, metrics) + .await + .and_then(|bytes| { + Bitmap::deserialize_from(&bytes, bitmap_type).context(DecodeBitmapSnafu) + }) } /// Retrieves the multiple bitmaps from the given ranges. - async fn bitmap_deque( + async fn bitmap_deque<'a>( &mut self, ranges: &[(Range, BitmapType)], + metrics: Option<&'a mut InvertedIndexReadMetrics>, ) -> Result> { let (ranges, types): (Vec<_>, Vec<_>) = ranges.iter().cloned().unzip(); - let bytes = self.read_vec(&ranges).await?; + let bytes = self.read_vec(&ranges, metrics).await?; bytes .into_iter() .zip(types) diff --git a/src/index/src/inverted_index/format/reader/blob.rs b/src/index/src/inverted_index/format/reader/blob.rs index f48791e8f4..05f8f40047 100644 --- a/src/index/src/inverted_index/format/reader/blob.rs +++ b/src/index/src/inverted_index/format/reader/blob.rs @@ -14,6 +14,7 @@ use std::ops::Range; use std::sync::Arc; +use std::time::Instant; use async_trait::async_trait; use bytes::Bytes; @@ -23,10 +24,10 @@ use snafu::{ResultExt, ensure}; use crate::inverted_index::error::{CommonIoSnafu, Result, UnexpectedBlobSizeSnafu}; use crate::inverted_index::format::MIN_BLOB_SIZE; -use crate::inverted_index::format::reader::InvertedIndexReader; use crate::inverted_index::format::reader::footer::{ DEFAULT_PREFETCH_SIZE, InvertedIndexFooterReader, }; +use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader}; /// Inverted index blob reader, implements [`InvertedIndexReader`] pub struct InvertedIndexBlobReader { @@ -53,27 +54,58 @@ impl InvertedIndexBlobReader { #[async_trait] impl InvertedIndexReader for InvertedIndexBlobReader { - async fn range_read(&self, offset: u64, size: u32) -> Result> { + async fn range_read<'a>( + &self, + offset: u64, + size: u32, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { + let start = metrics.as_ref().map(|_| Instant::now()); + let buf = self .source .read(offset..offset + size as u64) .await .context(CommonIoSnafu)?; + + if let Some(m) = metrics { + m.total_bytes += size as u64; + m.total_ranges += 1; + m.fetch_elapsed += start.unwrap().elapsed(); + } + Ok(buf.into()) } - async fn read_vec(&self, ranges: &[Range]) -> Result> { - self.source.read_vec(ranges).await.context(CommonIoSnafu) + async fn read_vec<'a>( + &self, + ranges: &[Range], + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { + let start = metrics.as_ref().map(|_| Instant::now()); + + let result = self.source.read_vec(ranges).await.context(CommonIoSnafu)?; + + if let Some(m) = metrics { + m.total_bytes += ranges.iter().map(|r| r.end - r.start).sum::(); + m.total_ranges += ranges.len(); + m.fetch_elapsed += start.unwrap().elapsed(); + } + + Ok(result) } - async fn metadata(&self) -> Result> { + async fn metadata<'a>( + &self, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { let metadata = self.source.metadata().await.context(CommonIoSnafu)?; let blob_size = metadata.content_length; Self::validate_blob_size(blob_size)?; let mut footer_reader = InvertedIndexFooterReader::new(&self.source, blob_size) .with_prefetch_size(DEFAULT_PREFETCH_SIZE); - 
footer_reader.metadata().await.map(Arc::new) + footer_reader.metadata(metrics).await.map(Arc::new) } } @@ -173,7 +205,7 @@ mod tests { let blob = create_inverted_index_blob(); let blob_reader = InvertedIndexBlobReader::new(blob); - let metas = blob_reader.metadata().await.unwrap(); + let metas = blob_reader.metadata(None).await.unwrap(); assert_eq!(metas.metas.len(), 2); let meta0 = metas.metas.get("tag0").unwrap(); @@ -200,13 +232,14 @@ mod tests { let blob = create_inverted_index_blob(); let blob_reader = InvertedIndexBlobReader::new(blob); - let metas = blob_reader.metadata().await.unwrap(); + let metas = blob_reader.metadata(None).await.unwrap(); let meta = metas.metas.get("tag0").unwrap(); let fst_map = blob_reader .fst( meta.base_offset + meta.relative_fst_offset as u64, meta.fst_size, + None, ) .await .unwrap(); @@ -219,6 +252,7 @@ mod tests { .fst( meta.base_offset + meta.relative_fst_offset as u64, meta.fst_size, + None, ) .await .unwrap(); @@ -232,30 +266,30 @@ mod tests { let blob = create_inverted_index_blob(); let blob_reader = InvertedIndexBlobReader::new(blob); - let metas = blob_reader.metadata().await.unwrap(); + let metas = blob_reader.metadata(None).await.unwrap(); let meta = metas.metas.get("tag0").unwrap(); let bitmap = blob_reader - .bitmap(meta.base_offset, 26, BitmapType::Roaring) + .bitmap(meta.base_offset, 26, BitmapType::Roaring, None) .await .unwrap(); assert_eq!(bitmap, mock_bitmap()); let bitmap = blob_reader - .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring) + .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None) .await .unwrap(); assert_eq!(bitmap, mock_bitmap()); - let metas = blob_reader.metadata().await.unwrap(); + let metas = blob_reader.metadata(None).await.unwrap(); let meta = metas.metas.get("tag1").unwrap(); let bitmap = blob_reader - .bitmap(meta.base_offset, 26, BitmapType::Roaring) + .bitmap(meta.base_offset, 26, BitmapType::Roaring, None) .await .unwrap(); assert_eq!(bitmap, mock_bitmap()); let bitmap = blob_reader - .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring) + .bitmap(meta.base_offset + 26, 26, BitmapType::Roaring, None) .await .unwrap(); assert_eq!(bitmap, mock_bitmap()); diff --git a/src/index/src/inverted_index/format/reader/footer.rs b/src/index/src/inverted_index/format/reader/footer.rs index 2609eb6cbb..866021c6e6 100644 --- a/src/index/src/inverted_index/format/reader/footer.rs +++ b/src/index/src/inverted_index/format/reader/footer.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::time::Instant; + use common_base::range_read::RangeReader; use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas}; use prost::Message; @@ -23,6 +25,7 @@ use crate::inverted_index::error::{ UnexpectedZeroSegmentRowCountSnafu, }; use crate::inverted_index::format::FOOTER_PAYLOAD_SIZE_SIZE; +use crate::inverted_index::format::reader::InvertedIndexReadMetrics; pub const DEFAULT_PREFETCH_SIZE: u64 = 8192; // 8KiB @@ -54,12 +57,17 @@ impl InvertedIndexFooterReader { } impl InvertedIndexFooterReader { - pub async fn metadata(&mut self) -> Result { + pub async fn metadata( + &mut self, + mut metrics: Option<&mut InvertedIndexReadMetrics>, + ) -> Result { ensure!( self.blob_size >= FOOTER_PAYLOAD_SIZE_SIZE, BlobSizeTooSmallSnafu ); + let start = metrics.as_ref().map(|_| Instant::now()); + let footer_start = self.blob_size.saturating_sub(self.prefetch_size()); let suffix = self .source @@ -73,19 +81,36 @@ impl InvertedIndexFooterReader { let footer_size = FOOTER_PAYLOAD_SIZE_SIZE; // Did not fetch the entire file metadata in the initial read, need to make a second request. - if length > suffix_len as u64 - footer_size { + let result = if length > suffix_len as u64 - footer_size { let metadata_start = self.blob_size - length - footer_size; let meta = self .source .read(metadata_start..self.blob_size - footer_size) .await .context(CommonIoSnafu)?; + + if let Some(m) = metrics.as_deref_mut() { + m.total_bytes += self.blob_size.min(self.prefetch_size()) + length; + m.total_ranges += 2; + } + self.parse_payload(&meta, length) } else { + if let Some(m) = metrics.as_deref_mut() { + m.total_bytes += self.blob_size.min(self.prefetch_size()); + m.total_ranges += 1; + } + let metadata_start = self.blob_size - length - footer_size - footer_start; let meta = &suffix[metadata_start as usize..suffix_len - footer_size as usize]; self.parse_payload(meta, length) + }; + + if let Some(m) = metrics { + m.fetch_elapsed += start.unwrap().elapsed(); } + + result } fn read_tailing_four_bytes(suffix: &[u8]) -> Result<[u8; 4]> { @@ -186,7 +211,7 @@ mod tests { reader = reader.with_prefetch_size(prefetch); } - let metas = reader.metadata().await.unwrap(); + let metas = reader.metadata(None).await.unwrap(); assert_eq!(metas.metas.len(), 1); let index_meta = &metas.metas.get("test").unwrap(); assert_eq!(index_meta.name, "test"); @@ -210,7 +235,7 @@ mod tests { reader = reader.with_prefetch_size(prefetch); } - let result = reader.metadata().await; + let result = reader.metadata(None).await; assert_matches!(result, Err(Error::UnexpectedFooterPayloadSize { .. })); } } @@ -233,7 +258,7 @@ mod tests { reader = reader.with_prefetch_size(prefetch); } - let result = reader.metadata().await; + let result = reader.metadata(None).await; assert_matches!(result, Err(Error::UnexpectedOffsetSize { .. 
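// --- Illustrative sketch (not part of the patch) ---------------------------------
// Worked arithmetic for the footer-read accounting above: the initial read covers
// min(blob_size, prefetch_size) bytes, and only when the metadata payload does not fit
// in that suffix is a second ranged read of the payload issued. The helper name and the
// sample values below are hypothetical; they just restate the byte/range bookkeeping.

/// Returns (bytes_read, ranges_read) for a footer fetch, assuming blob_size >= footer_size.
fn footer_read_accounting(
    blob_size: u64,
    prefetch_size: u64,
    payload_length: u64,
    footer_size: u64,
) -> (u64, usize) {
    let first_read = blob_size.min(prefetch_size);
    if payload_length > first_read - footer_size {
        // The prefetched suffix was too small: a second read fetches the whole payload.
        (first_read + payload_length, 2)
    } else {
        // The payload was already inside the prefetched suffix.
        (first_read, 1)
    }
}

fn main() {
    // With an 8 KiB prefetch, a 1 KiB payload is served by the initial read alone...
    assert_eq!(footer_read_accounting(1 << 20, 8192, 1024, 4), (8192, 1));
    // ...while a 16 KiB payload costs a second ranged read on top of the prefetch.
    assert_eq!(
        footer_read_accounting(1 << 20, 8192, 16 * 1024, 4),
        (8192 + 16 * 1024, 2)
    );
}
// ----------------------------------------------------------------------------------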
})); } } diff --git a/src/index/src/inverted_index/format/writer/blob.rs b/src/index/src/inverted_index/format/writer/blob.rs index 5991284869..58d8593591 100644 --- a/src/index/src/inverted_index/format/writer/blob.rs +++ b/src/index/src/inverted_index/format/writer/blob.rs @@ -122,7 +122,7 @@ mod tests { .unwrap(); let reader = InvertedIndexBlobReader::new(blob); - let metadata = reader.metadata().await.unwrap(); + let metadata = reader.metadata(None).await.unwrap(); assert_eq!(metadata.total_row_count, 8); assert_eq!(metadata.segment_row_count, 1); assert_eq!(metadata.metas.len(), 0); @@ -182,7 +182,7 @@ mod tests { .unwrap(); let reader = InvertedIndexBlobReader::new(blob); - let metadata = reader.metadata().await.unwrap(); + let metadata = reader.metadata(None).await.unwrap(); assert_eq!(metadata.total_row_count, 8); assert_eq!(metadata.segment_row_count, 1); assert_eq!(metadata.metas.len(), 2); @@ -198,13 +198,19 @@ mod tests { .fst( tag0.base_offset + tag0.relative_fst_offset as u64, tag0.fst_size, + None, ) .await .unwrap(); assert_eq!(fst0.len(), 3); let [offset, size] = unpack(fst0.get(b"a").unwrap()); let bitmap = reader - .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag0.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -213,7 +219,12 @@ mod tests { ); let [offset, size] = unpack(fst0.get(b"b").unwrap()); let bitmap = reader - .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag0.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -222,7 +233,12 @@ mod tests { ); let [offset, size] = unpack(fst0.get(b"c").unwrap()); let bitmap = reader - .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag0.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -241,13 +257,19 @@ mod tests { .fst( tag1.base_offset + tag1.relative_fst_offset as u64, tag1.fst_size, + None, ) .await .unwrap(); assert_eq!(fst1.len(), 3); let [offset, size] = unpack(fst1.get(b"x").unwrap()); let bitmap = reader - .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag1.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -256,7 +278,12 @@ mod tests { ); let [offset, size] = unpack(fst1.get(b"y").unwrap()); let bitmap = reader - .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag1.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -265,7 +292,12 @@ mod tests { ); let [offset, size] = unpack(fst1.get(b"z").unwrap()); let bitmap = reader - .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag1.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( diff --git a/src/index/src/inverted_index/search/fst_values_mapper.rs b/src/index/src/inverted_index/search/fst_values_mapper.rs index f9c15c40d8..38df713c8d 100644 --- a/src/index/src/inverted_index/search/fst_values_mapper.rs +++ b/src/index/src/inverted_index/search/fst_values_mapper.rs @@ -16,7 +16,7 @@ use greptime_proto::v1::index::{BitmapType, InvertedIndexMeta}; use crate::bitmap::Bitmap; use crate::inverted_index::error::Result; -use crate::inverted_index::format::reader::InvertedIndexReader; +use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, 
InvertedIndexReader}; /// `ParallelFstValuesMapper` enables parallel mapping of multiple FST value groups to their /// corresponding bitmaps within an inverted index. @@ -35,7 +35,8 @@ impl<'a> ParallelFstValuesMapper<'a> { pub async fn map_values_vec( &mut self, - value_and_meta_vec: &[(Vec, &'a InvertedIndexMeta)], + value_and_meta_vec: &[(Vec, &InvertedIndexMeta)], + metrics: Option<&mut InvertedIndexReadMetrics>, ) -> Result> { let groups = value_and_meta_vec .iter() @@ -64,7 +65,7 @@ impl<'a> ParallelFstValuesMapper<'a> { } common_telemetry::debug!("fetch ranges: {:?}", fetch_ranges); - let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges).await?; + let mut bitmaps = self.reader.bitmap_deque(&fetch_ranges, metrics).await?; let mut output = Vec::with_capacity(groups.len()); for counter in groups { @@ -95,23 +96,25 @@ mod tests { #[tokio::test] async fn test_map_values_vec() { let mut mock_reader = MockInvertedIndexReader::new(); - mock_reader.expect_bitmap_deque().returning(|ranges| { - let mut output = VecDeque::new(); - for (range, bitmap_type) in ranges { - let offset = range.start; - let size = range.end - range.start; - match (offset, size, bitmap_type) { - (1, 1, BitmapType::Roaring) => { - output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type)) + mock_reader + .expect_bitmap_deque() + .returning(|ranges, _metrics| { + let mut output = VecDeque::new(); + for (range, bitmap_type) in ranges { + let offset = range.start; + let size = range.end - range.start; + match (offset, size, bitmap_type) { + (1, 1, BitmapType::Roaring) => { + output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type)) + } + (2, 1, BitmapType::Roaring) => { + output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type)) + } + _ => unreachable!(), } - (2, 1, BitmapType::Roaring) => { - output.push_back(Bitmap::from_lsb0_bytes(&[0b01010101], *bitmap_type)) - } - _ => unreachable!(), } - } - Ok(output) - }); + Ok(output) + }); let meta = InvertedIndexMeta { bitmap_type: BitmapType::Roaring.into(), @@ -120,13 +123,13 @@ mod tests { let mut values_mapper = ParallelFstValuesMapper::new(&mut mock_reader); let result = values_mapper - .map_values_vec(&[(vec![], &meta)]) + .map_values_vec(&[(vec![], &meta)], None) .await .unwrap(); assert_eq!(result[0].count_ones(), 0); let result = values_mapper - .map_values_vec(&[(vec![value(1, 1)], &meta)]) + .map_values_vec(&[(vec![value(1, 1)], &meta)], None) .await .unwrap(); assert_eq!( @@ -135,7 +138,7 @@ mod tests { ); let result = values_mapper - .map_values_vec(&[(vec![value(2, 1)], &meta)]) + .map_values_vec(&[(vec![value(2, 1)], &meta)], None) .await .unwrap(); assert_eq!( @@ -144,7 +147,7 @@ mod tests { ); let result = values_mapper - .map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)]) + .map_values_vec(&[(vec![value(1, 1), value(2, 1)], &meta)], None) .await .unwrap(); assert_eq!( @@ -153,7 +156,7 @@ mod tests { ); let result = values_mapper - .map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)]) + .map_values_vec(&[(vec![value(2, 1), value(1, 1)], &meta)], None) .await .unwrap(); assert_eq!( @@ -162,7 +165,10 @@ mod tests { ); let result = values_mapper - .map_values_vec(&[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)]) + .map_values_vec( + &[(vec![value(2, 1)], &meta), (vec![value(1, 1)], &meta)], + None, + ) .await .unwrap(); assert_eq!( @@ -174,10 +180,13 @@ mod tests { Bitmap::from_lsb0_bytes(&[0b10101010], BitmapType::Roaring) ); let result = values_mapper - .map_values_vec(&[ - (vec![value(2, 1), 
value(1, 1)], &meta), - (vec![value(1, 1)], &meta), - ]) + .map_values_vec( + &[ + (vec![value(2, 1), value(1, 1)], &meta), + (vec![value(1, 1)], &meta), + ], + None, + ) .await .unwrap(); assert_eq!( diff --git a/src/index/src/inverted_index/search/index_apply.rs b/src/index/src/inverted_index/search/index_apply.rs index a80f102e02..02a1f96450 100644 --- a/src/index/src/inverted_index/search/index_apply.rs +++ b/src/index/src/inverted_index/search/index_apply.rs @@ -19,7 +19,7 @@ pub use predicates_apply::PredicatesIndexApplier; use crate::bitmap::Bitmap; use crate::inverted_index::error::Result; -use crate::inverted_index::format::reader::InvertedIndexReader; +use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader}; /// The output of an apply operation. #[derive(Clone, Debug, PartialEq)] @@ -44,10 +44,11 @@ pub trait IndexApplier: Send + Sync { /// Applies the predefined predicates to the data read by the given index reader, returning /// a list of relevant indices (e.g., post IDs, group IDs, row IDs). #[allow(unused_parens)] - async fn apply<'a>( + async fn apply<'a, 'b>( &self, context: SearchContext, reader: &mut (dyn InvertedIndexReader + 'a), + metrics: Option<&'b mut InvertedIndexReadMetrics>, ) -> Result; /// Returns the memory usage of the applier. diff --git a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs index ae22e79c74..441a4b4304 100644 --- a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs +++ b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs @@ -19,7 +19,7 @@ use greptime_proto::v1::index::InvertedIndexMetas; use crate::bitmap::Bitmap; use crate::inverted_index::error::{IndexNotFoundSnafu, Result}; -use crate::inverted_index::format::reader::InvertedIndexReader; +use crate::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader}; use crate::inverted_index::search::fst_apply::{ FstApplier, IntersectionFstApplier, KeysFstApplier, }; @@ -43,12 +43,14 @@ pub struct PredicatesIndexApplier { impl IndexApplier for PredicatesIndexApplier { /// Applies all `FstApplier`s to the data in the inverted index reader, intersecting the individual /// bitmaps obtained for each index to result in a final set of indices. 
- async fn apply<'a>( + async fn apply<'a, 'b>( &self, context: SearchContext, reader: &mut (dyn InvertedIndexReader + 'a), + metrics: Option<&'b mut InvertedIndexReadMetrics>, ) -> Result { - let metadata = reader.metadata().await?; + let mut metrics = metrics; + let metadata = reader.metadata(metrics.as_deref_mut()).await?; let mut output = ApplyOutput { matched_segment_ids: Bitmap::new_bitvec(), total_row_count: metadata.total_row_count as _, @@ -84,7 +86,7 @@ impl IndexApplier for PredicatesIndexApplier { return Ok(output); } - let fsts = reader.fst_vec(&fst_ranges).await?; + let fsts = reader.fst_vec(&fst_ranges, metrics.as_deref_mut()).await?; let value_and_meta_vec = fsts .into_iter() .zip(appliers) @@ -92,7 +94,7 @@ impl IndexApplier for PredicatesIndexApplier { .collect::>(); let mut mapper = ParallelFstValuesMapper::new(reader); - let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec).await?; + let mut bm_vec = mapper.map_values_vec(&value_and_meta_vec, metrics).await?; let mut bitmap = bm_vec.pop().unwrap(); // SAFETY: `fst_ranges` is not empty for bm in bm_vec { @@ -221,26 +223,28 @@ mod tests { let mut mock_reader = MockInvertedIndexReader::new(); mock_reader .expect_metadata() - .returning(|| Ok(mock_metas([("tag-0", 0)]))); - mock_reader.expect_fst_vec().returning(|_ranges| { + .returning(|_| Ok(mock_metas([("tag-0", 0)]))); + mock_reader.expect_fst_vec().returning(|_ranges, _metrics| { Ok(vec![ FstMap::from_iter([(b"tag-0_value-0", fst_value(2, 1))]).unwrap(), ]) }); - mock_reader.expect_bitmap_deque().returning(|arg| { - assert_eq!(arg.len(), 1); - let range = &arg[0].0; - let bitmap_type = arg[0].1; - assert_eq!(*range, 2..3); - assert_eq!(bitmap_type, BitmapType::Roaring); - Ok(VecDeque::from([Bitmap::from_lsb0_bytes( - &[0b10101010], - bitmap_type, - )])) - }); + mock_reader + .expect_bitmap_deque() + .returning(|arg, _metrics| { + assert_eq!(arg.len(), 1); + let range = &arg[0].0; + let bitmap_type = arg[0].1; + assert_eq!(*range, 2..3); + assert_eq!(bitmap_type, BitmapType::Roaring); + Ok(VecDeque::from([Bitmap::from_lsb0_bytes( + &[0b10101010], + bitmap_type, + )])) + }); let output = applier - .apply(SearchContext::default(), &mut mock_reader) + .apply(SearchContext::default(), &mut mock_reader, None) .await .unwrap(); assert_eq!( @@ -252,14 +256,14 @@ mod tests { let mut mock_reader = MockInvertedIndexReader::new(); mock_reader .expect_metadata() - .returning(|| Ok(mock_metas([("tag-0", 0)]))); - mock_reader.expect_fst_vec().returning(|_range| { + .returning(|_| Ok(mock_metas([("tag-0", 0)]))); + mock_reader.expect_fst_vec().returning(|_range, _metrics| { Ok(vec![ FstMap::from_iter([(b"tag-0_value-1", fst_value(2, 1))]).unwrap(), ]) }); let output = applier - .apply(SearchContext::default(), &mut mock_reader) + .apply(SearchContext::default(), &mut mock_reader, None) .await .unwrap(); assert_eq!(output.matched_segment_ids.count_ones(), 0); @@ -279,8 +283,8 @@ mod tests { let mut mock_reader = MockInvertedIndexReader::new(); mock_reader .expect_metadata() - .returning(|| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)]))); - mock_reader.expect_fst_vec().returning(|ranges| { + .returning(|_| Ok(mock_metas([("tag-0", 0), ("tag-1", 1)]))); + mock_reader.expect_fst_vec().returning(|ranges, _metrics| { let mut output = vec![]; for range in ranges { match range.start { @@ -293,27 +297,29 @@ mod tests { } Ok(output) }); - mock_reader.expect_bitmap_deque().returning(|ranges| { - let mut output = VecDeque::new(); - for (range, bitmap_type) in ranges { - let offset = 
range.start; - let size = range.end - range.start; - match (offset, size, bitmap_type) { - (1, 1, BitmapType::Roaring) => { - output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type)) + mock_reader + .expect_bitmap_deque() + .returning(|ranges, _metrics| { + let mut output = VecDeque::new(); + for (range, bitmap_type) in ranges { + let offset = range.start; + let size = range.end - range.start; + match (offset, size, bitmap_type) { + (1, 1, BitmapType::Roaring) => { + output.push_back(Bitmap::from_lsb0_bytes(&[0b10101010], *bitmap_type)) + } + (2, 1, BitmapType::Roaring) => { + output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type)) + } + _ => unreachable!(), } - (2, 1, BitmapType::Roaring) => { - output.push_back(Bitmap::from_lsb0_bytes(&[0b11011011], *bitmap_type)) - } - _ => unreachable!(), } - } - Ok(output) - }); + Ok(output) + }); let output = applier - .apply(SearchContext::default(), &mut mock_reader) + .apply(SearchContext::default(), &mut mock_reader, None) .await .unwrap(); assert_eq!( @@ -331,10 +337,10 @@ mod tests { let mut mock_reader: MockInvertedIndexReader = MockInvertedIndexReader::new(); mock_reader .expect_metadata() - .returning(|| Ok(mock_metas([("tag-0", 0)]))); + .returning(|_| Ok(mock_metas([("tag-0", 0)]))); let output = applier - .apply(SearchContext::default(), &mut mock_reader) + .apply(SearchContext::default(), &mut mock_reader, None) .await .unwrap(); assert_eq!(output.matched_segment_ids, Bitmap::full_bitvec(8)); // full range to scan @@ -343,7 +349,7 @@ mod tests { #[tokio::test] async fn test_index_applier_with_empty_index() { let mut mock_reader = MockInvertedIndexReader::new(); - mock_reader.expect_metadata().returning(move || { + mock_reader.expect_metadata().returning(move |_| { Ok(Arc::new(InvertedIndexMetas { total_row_count: 0, // No rows segment_row_count: 1, @@ -359,7 +365,7 @@ mod tests { }; let output = applier - .apply(SearchContext::default(), &mut mock_reader) + .apply(SearchContext::default(), &mut mock_reader, None) .await .unwrap(); assert!(output.matched_segment_ids.is_empty()); @@ -370,7 +376,7 @@ mod tests { let mut mock_reader = MockInvertedIndexReader::new(); mock_reader .expect_metadata() - .returning(|| Ok(mock_metas(vec![]))); + .returning(|_| Ok(mock_metas(vec![]))); let mut mock_fst_applier = MockFstApplier::new(); mock_fst_applier.expect_apply().never(); @@ -385,6 +391,7 @@ mod tests { index_not_found_strategy: IndexNotFoundStrategy::ThrowError, }, &mut mock_reader, + None, ) .await; assert!(matches!(result, Err(Error::IndexNotFound { .. }))); @@ -395,6 +402,7 @@ mod tests { index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty, }, &mut mock_reader, + None, ) .await .unwrap(); @@ -406,6 +414,7 @@ mod tests { index_not_found_strategy: IndexNotFoundStrategy::Ignore, }, &mut mock_reader, + None, ) .await .unwrap(); diff --git a/src/index/src/lib.rs b/src/index/src/lib.rs index 547f880bb4..f4f299bef6 100644 --- a/src/index/src/lib.rs +++ b/src/index/src/lib.rs @@ -22,6 +22,7 @@ pub mod external_provider; pub mod fulltext_index; pub mod inverted_index; pub mod target; +pub mod vector; pub type Bytes = Vec; pub type BytesRef<'a> = &'a [u8]; diff --git a/src/index/src/vector.rs b/src/index/src/vector.rs new file mode 100644 index 0000000000..77c844f610 --- /dev/null +++ b/src/index/src/vector.rs @@ -0,0 +1,163 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Vector index types and options. +//! +//! This module re-exports types from `datatypes` and provides conversions +//! to USearch types, as well as distance computation functions. + +pub use datatypes::schema::{VectorDistanceMetric, VectorIndexOptions}; +use nalgebra::DVectorView; +pub use usearch::MetricKind; + +/// Converts a VectorDistanceMetric to a USearch MetricKind. +pub fn distance_metric_to_usearch(metric: VectorDistanceMetric) -> MetricKind { + match metric { + VectorDistanceMetric::L2sq => MetricKind::L2sq, + VectorDistanceMetric::Cosine => MetricKind::Cos, + VectorDistanceMetric::InnerProduct => MetricKind::IP, + } +} + +/// Computes distance between two vectors using the specified metric. +/// +/// Uses SIMD-optimized implementations via nalgebra. +/// +/// **Note:** The caller must ensure that the two vectors have the same length +/// and are non-empty. Empty vectors return 0.0 for all metrics. +pub fn compute_distance(v1: &[f32], v2: &[f32], metric: VectorDistanceMetric) -> f32 { + // Empty vectors are degenerate; return 0.0 uniformly across all metrics. + if v1.is_empty() || v2.is_empty() { + return 0.0; + } + + match metric { + VectorDistanceMetric::L2sq => l2sq(v1, v2), + VectorDistanceMetric::Cosine => cosine(v1, v2), + VectorDistanceMetric::InnerProduct => -dot(v1, v2), + } +} + +/// Calculates the squared L2 distance between two vectors. +fn l2sq(lhs: &[f32], rhs: &[f32]) -> f32 { + let lhs = DVectorView::from_slice(lhs, lhs.len()); + let rhs = DVectorView::from_slice(rhs, rhs.len()); + (lhs - rhs).norm_squared() +} + +/// Calculates the cosine distance between two vectors. +/// +/// Returns a value in `[0.0, 2.0]` where 0.0 means identical direction and 2.0 means +/// opposite direction. For degenerate cases (zero or near-zero magnitude vectors), +/// returns 1.0 (maximum uncertainty) to avoid NaN and ensure safe index operations. +fn cosine(lhs: &[f32], rhs: &[f32]) -> f32 { + let lhs_vec = DVectorView::from_slice(lhs, lhs.len()); + let rhs_vec = DVectorView::from_slice(rhs, rhs.len()); + + let dot_product = lhs_vec.dot(&rhs_vec); + let lhs_norm = lhs_vec.norm(); + let rhs_norm = rhs_vec.norm(); + + // Zero-magnitude vectors have undefined direction; return max distance as safe fallback. + if dot_product.abs() < f32::EPSILON + || lhs_norm.abs() < f32::EPSILON + || rhs_norm.abs() < f32::EPSILON + { + return 1.0; + } + + let cos_similar = dot_product / (lhs_norm * rhs_norm); + let res = 1.0 - cos_similar; + // Clamp near-zero results to exactly 0.0 to avoid floating-point artifacts. + if res.abs() < f32::EPSILON { 0.0 } else { res } +} + +/// Calculates the dot product between two vectors. 
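// --- Illustrative sketch (not part of the patch) ---------------------------------
// A dependency-free restatement of the cosine distance documented above. The module
// itself uses nalgebra's `DVectorView`; this sketch works on plain slices purely to show
// the formula, the [0.0, 2.0] range, and the 1.0 fallback for zero-magnitude vectors.
// The function name is hypothetical.
fn cosine_distance(lhs: &[f32], rhs: &[f32]) -> f32 {
    let dot: f32 = lhs.iter().zip(rhs).map(|(a, b)| a * b).sum();
    let lhs_norm = lhs.iter().map(|x| x * x).sum::<f32>().sqrt();
    let rhs_norm = rhs.iter().map(|x| x * x).sum::<f32>().sqrt();
    if dot.abs() < f32::EPSILON || lhs_norm < f32::EPSILON || rhs_norm < f32::EPSILON {
        // Undefined direction: fall back to maximum uncertainty instead of producing NaN.
        return 1.0;
    }
    let res = 1.0 - dot / (lhs_norm * rhs_norm);
    // Clamp near-zero results to exactly 0.0, mirroring the implementation above.
    if res.abs() < f32::EPSILON { 0.0 } else { res }
}

fn main() {
    // Identical direction -> 0.0, orthogonal -> 1.0, opposite -> 2.0.
    assert!(cosine_distance(&[1.0, 0.0], &[2.0, 0.0]).abs() < 1e-6);
    assert!((cosine_distance(&[1.0, 0.0], &[0.0, 1.0]) - 1.0).abs() < 1e-6);
    assert!((cosine_distance(&[1.0, 0.0], &[-1.0, 0.0]) - 2.0).abs() < 1e-6);
}
// ----------------------------------------------------------------------------------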
+fn dot(lhs: &[f32], rhs: &[f32]) -> f32 { + let lhs = DVectorView::from_slice(lhs, lhs.len()); + let rhs = DVectorView::from_slice(rhs, rhs.len()); + lhs.dot(&rhs) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_distance_metric_to_usearch() { + assert_eq!( + distance_metric_to_usearch(VectorDistanceMetric::L2sq), + MetricKind::L2sq + ); + assert_eq!( + distance_metric_to_usearch(VectorDistanceMetric::Cosine), + MetricKind::Cos + ); + assert_eq!( + distance_metric_to_usearch(VectorDistanceMetric::InnerProduct), + MetricKind::IP + ); + } + + #[test] + fn test_vector_index_options_default() { + let options = VectorIndexOptions::default(); + assert_eq!(options.metric, VectorDistanceMetric::L2sq); + assert_eq!(options.connectivity, 16); + assert_eq!(options.expansion_add, 128); + assert_eq!(options.expansion_search, 64); + } + + #[test] + fn test_compute_distance_l2sq() { + let v1 = vec![1.0, 2.0, 3.0]; + let v2 = vec![4.0, 5.0, 6.0]; + // L2sq = (4-1)^2 + (5-2)^2 + (6-3)^2 = 9 + 9 + 9 = 27 + let dist = compute_distance(&v1, &v2, VectorDistanceMetric::L2sq); + assert!((dist - 27.0).abs() < 1e-6); + } + + #[test] + fn test_compute_distance_cosine() { + let v1 = vec![1.0, 0.0, 0.0]; + let v2 = vec![0.0, 1.0, 0.0]; + // Orthogonal vectors have cosine similarity of 0, distance of 1 + let dist = compute_distance(&v1, &v2, VectorDistanceMetric::Cosine); + assert!((dist - 1.0).abs() < 1e-6); + } + + #[test] + fn test_compute_distance_inner_product() { + let v1 = vec![1.0, 2.0, 3.0]; + let v2 = vec![4.0, 5.0, 6.0]; + // Inner product = 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32 + // Distance is negated: -32 + let dist = compute_distance(&v1, &v2, VectorDistanceMetric::InnerProduct); + assert!((dist - (-32.0)).abs() < 1e-6); + } + + #[test] + fn test_compute_distance_empty_vectors() { + // Empty vectors should return 0.0 uniformly for all metrics + assert_eq!(compute_distance(&[], &[], VectorDistanceMetric::L2sq), 0.0); + assert_eq!( + compute_distance(&[], &[], VectorDistanceMetric::Cosine), + 0.0 + ); + assert_eq!( + compute_distance(&[], &[], VectorDistanceMetric::InnerProduct), + 0.0 + ); + } +} diff --git a/src/log-store/src/kafka/client_manager.rs b/src/log-store/src/kafka/client_manager.rs index 8a19238356..49d363ad14 100644 --- a/src/log-store/src/kafka/client_manager.rs +++ b/src/log-store/src/kafka/client_manager.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::sync::Arc; use common_wal::config::kafka::DatanodeKafkaConfig; -use common_wal::config::kafka::common::DEFAULT_BACKOFF_CONFIG; +use common_wal::config::kafka::common::{DEFAULT_BACKOFF_CONFIG, DEFAULT_CONNECT_TIMEOUT}; use dashmap::DashMap; use rskafka::client::ClientBuilder; use rskafka::client::partition::{Compression, PartitionClient, UnknownTopicHandling}; @@ -78,7 +78,8 @@ impl ClientManager { ) -> Result { // Sets backoff config for the top-level kafka client and all clients constructed by it. 
let mut builder = ClientBuilder::new(config.connection.broker_endpoints.clone()) - .backoff_config(DEFAULT_BACKOFF_CONFIG); + .backoff_config(DEFAULT_BACKOFF_CONFIG) + .connect_timeout(Some(DEFAULT_CONNECT_TIMEOUT)); if let Some(sasl) = &config.connection.sasl { builder = builder.sasl_config(sasl.config.clone().into_sasl_config()); }; diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs index d819251597..fff34d6d26 100644 --- a/src/meta-client/src/client.rs +++ b/src/meta-client/src/client.rs @@ -189,6 +189,9 @@ impl MetaClientBuilder { let mgr = client.channel_manager.clone(); if self.enable_heartbeat { + if self.heartbeat_channel_manager.is_some() { + info!("Enable heartbeat channel using the heartbeat channel manager."); + } let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone()); client.heartbeat = Some(HeartbeatClient::new( self.id, diff --git a/src/meta-client/src/client/ask_leader.rs b/src/meta-client/src/client/ask_leader.rs index 95d7851b95..e34d0dfedf 100644 --- a/src/meta-client/src/client/ask_leader.rs +++ b/src/meta-client/src/client/ask_leader.rs @@ -24,7 +24,7 @@ use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS; use common_telemetry::tracing_context::TracingContext; use common_telemetry::warn; use rand::seq::SliceRandom; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use tokio::time::timeout; use tonic::transport::Channel; @@ -101,12 +101,14 @@ impl AskLeader { }; let (tx, mut rx) = tokio::sync::mpsc::channel(peers.len()); + let channel_manager = self.channel_manager.clone(); for addr in &peers { let mut client = self.create_asker(addr)?; let tx_clone = tx.clone(); let req = req.clone(); let addr = addr.clone(); + let channel_manager = channel_manager.clone(); tokio::spawn(async move { match client.ask_leader(req).await { Ok(res) => { @@ -117,13 +119,19 @@ impl AskLeader { }; } Err(status) => { + // Reset cached channel even on generic errors: the VIP may keep us on a dead + // backend, so forcing a reconnect gives us a chance to hit a healthy peer. + Self::reset_channels_with_manager( + &channel_manager, + std::slice::from_ref(&addr), + ); warn!("Failed to ask leader from: {addr}, {status}"); } } }); } - let leader = timeout( + let leader = match timeout( self.channel_manager .config() .timeout @@ -131,8 +139,16 @@ impl AskLeader { rx.recv(), ) .await - .context(error::AskLeaderTimeoutSnafu)? - .context(error::NoLeaderSnafu)?; + { + Ok(Some(leader)) => leader, + Ok(None) => return error::NoLeaderSnafu.fail(), + Err(e) => { + // All peers timed out. Reset channels to force reconnection, + // which may help escape dead backends in VIP/LB scenarios. + Self::reset_channels_with_manager(&self.channel_manager, &peers); + return Err(e).context(error::AskLeaderTimeoutSnafu); + } + }; let mut leadership_group = self.leadership_group.write().unwrap(); leadership_group.leader = Some(leader.clone()); @@ -169,6 +185,15 @@ impl AskLeader { .context(error::CreateChannelSnafu)?, )) } + + /// Drop cached channels for the given peers so a fresh connection is used next time. 
+ fn reset_channels_with_manager(channel_manager: &ChannelManager, peers: &[String]) { + if peers.is_empty() { + return; + } + + channel_manager.retain_channel(|addr, _| !peers.iter().any(|peer| peer == addr)); + } } #[async_trait] diff --git a/src/meta-client/src/lib.rs b/src/meta-client/src/lib.rs index 5b56b8e181..715154a8e5 100644 --- a/src/meta-client/src/lib.rs +++ b/src/meta-client/src/lib.rs @@ -18,6 +18,10 @@ use std::time::Duration; use client::RegionFollowerClientRef; use common_base::Plugins; use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; +use common_meta::distributed_time_constants::{ + HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS, + HEARTBEAT_TIMEOUT, +}; use common_telemetry::{debug, info}; use serde::{Deserialize, Serialize}; @@ -34,8 +38,6 @@ pub struct MetaClientOptions { #[serde(with = "humantime_serde")] pub timeout: Duration, #[serde(with = "humantime_serde")] - pub heartbeat_timeout: Duration, - #[serde(with = "humantime_serde")] pub ddl_timeout: Duration, #[serde(with = "humantime_serde")] pub connect_timeout: Duration, @@ -52,7 +54,6 @@ impl Default for MetaClientOptions { Self { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_millis(3_000u64), - heartbeat_timeout: Duration::from_millis(500u64), ddl_timeout: Duration::from_millis(10_000u64), connect_timeout: Duration::from_millis(1_000u64), tcp_nodelay: true, @@ -97,7 +98,11 @@ pub async fn create_meta_client( .timeout(meta_client_options.timeout) .connect_timeout(meta_client_options.connect_timeout) .tcp_nodelay(meta_client_options.tcp_nodelay); - let heartbeat_config = base_config.clone(); + let heartbeat_config = base_config + .clone() + .timeout(HEARTBEAT_TIMEOUT) + .http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS) + .http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS); if let MetaClientType::Frontend = client_type { let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout); diff --git a/src/meta-srv/Cargo.toml b/src/meta-srv/Cargo.toml index 1c4bff72cf..3ed3c5a834 100644 --- a/src/meta-srv/Cargo.toml +++ b/src/meta-srv/Cargo.toml @@ -62,7 +62,9 @@ hyper-util = { workspace = true, features = ["tokio"] } itertools.workspace = true lazy_static.workspace = true once_cell.workspace = true +ordered-float.workspace = true parking_lot.workspace = true +partition.workspace = true prometheus.workspace = true prost.workspace = true rand.workspace = true diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index 20e5810a90..e00b65380a 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -14,6 +14,7 @@ use std::net::SocketAddr; use std::sync::Arc; +use std::time::Duration; use api::v1::meta::cluster_server::ClusterServer; use api::v1::meta::heartbeat_server::HeartbeatServer; @@ -29,8 +30,7 @@ use common_meta::kv_backend::memory::MemoryKvBackend; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; use common_telemetry::info; use either::Either; -use servers::configurator::ConfiguratorRef; -use servers::export_metrics::ExportMetricsTask; +use servers::configurator::GrpcRouterConfiguratorRef; use servers::http::{HttpServer, HttpServerBuilder}; use servers::metrics_handler::MetricsHandler; use servers::server::Server; @@ -45,20 +45,26 @@ use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef}; #[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))] use crate::election::CANDIDATE_LEASE_SECS; use 
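// --- Illustrative sketch (not part of the patch) ---------------------------------
// The ask-leader path above drops cached channels for peers that errored or timed out so
// the next attempt reconnects instead of reusing a dead backend behind a VIP/LB. The
// in-memory map below is a hypothetical stand-in for `ChannelManager::retain_channel`;
// it only demonstrates the "retain everything except these peers" predicate.
use std::collections::HashMap;

fn reset_peers(channels: &mut HashMap<String, &'static str>, peers: &[String]) {
    if peers.is_empty() {
        return;
    }
    // Keep a channel only if its address is not in the list of peers to reset.
    channels.retain(|addr, _| !peers.iter().any(|peer| peer == addr));
}

fn main() {
    let mut channels: HashMap<String, &'static str> = HashMap::from([
        ("10.0.0.1:3002".to_string(), "conn-a"),
        ("10.0.0.2:3002".to_string(), "conn-b"),
    ]);
    // Peer 10.0.0.1 timed out; drop its cached channel so the next call reconnects.
    reset_peers(&mut channels, &["10.0.0.1:3002".to_string()]);
    assert!(!channels.contains_key("10.0.0.1:3002"));
    assert!(channels.contains_key("10.0.0.2:3002"));
}
// ----------------------------------------------------------------------------------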
crate::election::etcd::EtcdElection; +use crate::error::OtherSnafu; use crate::metasrv::builder::MetasrvBuilder; use crate::metasrv::{ BackendImpl, ElectionRef, Metasrv, MetasrvOptions, SelectTarget, SelectorRef, }; -use crate::selector::SelectorType; use crate::selector::lease_based::LeaseBasedSelector; use crate::selector::load_based::LoadBasedSelector; use crate::selector::round_robin::RoundRobinSelector; use crate::selector::weight_compute::RegionNumsBasedWeightCompute; +use crate::selector::{Selector, SelectorType}; use crate::service::admin; use crate::service::admin::admin_axum_router; use crate::utils::etcd::create_etcd_client_with_tls; use crate::{Result, error}; +/// The default keep-alive interval for gRPC. +const DEFAULT_GRPC_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(10); +/// The default keep-alive timeout for gRPC. +const DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT: Duration = Duration::from_secs(10); + pub struct MetasrvInstance { metasrv: Arc, @@ -70,8 +76,6 @@ pub struct MetasrvInstance { plugins: Plugins, - export_metrics_task: Option, - /// gRPC serving state receiver. Only present if the gRPC server is started. serve_state: Arc>>>>, @@ -95,15 +99,12 @@ impl MetasrvInstance { // put metasrv into plugins for later use plugins.insert::>(metasrv.clone()); - let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins)) - .context(error::InitExportMetricsTaskSnafu)?; Ok(MetasrvInstance { metasrv, http_server: Either::Left(Some(builder)), opts, signal_sender: None, plugins, - export_metrics_task, serve_state: Default::default(), bind_addr: None, }) @@ -131,18 +132,21 @@ impl MetasrvInstance { self.metasrv.try_start().await?; - if let Some(t) = self.export_metrics_task.as_ref() { - t.start(None).context(error::InitExportMetricsTaskSnafu)? - } - let (tx, rx) = mpsc::channel::<()>(1); self.signal_sender = Some(tx); // Start gRPC server with admin services for backward compatibility let mut router = router(self.metasrv.clone()); - if let Some(configurator) = self.metasrv.plugins().get::() { - router = configurator.config_grpc(router); + if let Some(configurator) = self + .metasrv + .plugins() + .get::>() + { + router = configurator + .configure_grpc_router(router, ()) + .await + .context(OtherSnafu)?; } let (serve_state_tx, serve_state_rx) = oneshot::channel(); @@ -247,7 +251,12 @@ macro_rules! add_compressed_service { } pub fn router(metasrv: Arc) -> Router { - let mut router = tonic::transport::Server::builder().accept_http1(true); // for admin services + let mut router = tonic::transport::Server::builder() + // for admin services + .accept_http1(true) + // For quick network failures detection. 
+ .http2_keepalive_interval(Some(DEFAULT_GRPC_KEEP_ALIVE_INTERVAL)) + .http2_keepalive_timeout(Some(DEFAULT_GRPC_KEEP_ALIVE_TIMEOUT)); let router = add_compressed_service!(router, HeartbeatServer::from_arc(metasrv.clone())); let router = add_compressed_service!(router, StoreServer::from_arc(metasrv.clone())); let router = add_compressed_service!(router, ClusterServer::from_arc(metasrv.clone())); @@ -282,7 +291,7 @@ pub async fn metasrv_builder( use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS; use common_meta::kv_backend::rds::PgStore; - use deadpool_postgres::Config; + use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod}; use crate::election::rds::postgres::{ElectionPgClient, PgElection}; use crate::utils::postgres::create_postgres_pool; @@ -296,9 +305,16 @@ pub async fn metasrv_builder( let mut cfg = Config::new(); cfg.keepalives = Some(true); cfg.keepalives_idle = Some(Duration::from_secs(POSTGRES_KEEP_ALIVE_SECS)); - // We use a separate pool for election since we need a different session keep-alive idle time. - let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone()) - .await?; + cfg.manager = Some(ManagerConfig { + recycling_method: RecyclingMethod::Verified, + }); + // Use a dedicated pool for the election client to allow customized session settings. + let pool = create_postgres_pool( + &opts.store_addrs, + Some(cfg.clone()), + opts.backend_tls.clone(), + ) + .await?; let election_client = ElectionPgClient::new( pool, @@ -318,8 +334,8 @@ pub async fn metasrv_builder( ) .await?; - let pool = - create_postgres_pool(&opts.store_addrs, None, opts.backend_tls.clone()).await?; + let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone()) + .await?; let kv_backend = PgStore::with_pg_pool( pool, opts.meta_schema_name.as_deref(), @@ -395,7 +411,12 @@ pub async fn metasrv_builder( info!("Using selector from plugins"); selector } else { - let selector = match opts.selector { + let selector: Arc< + dyn Selector< + Context = crate::metasrv::SelectorContext, + Output = Vec, + >, + > = match opts.selector { SelectorType::LoadBased => Arc::new(LoadBasedSelector::new( RegionNumsBasedWeightCompute, meta_peer_client.clone(), diff --git a/src/meta-srv/src/discovery.rs b/src/meta-srv/src/discovery.rs index 6151e7afbd..54532ec454 100644 --- a/src/meta-srv/src/discovery.rs +++ b/src/meta-srv/src/discovery.rs @@ -26,6 +26,7 @@ use common_meta::distributed_time_constants::{ use common_meta::error::Result; use common_meta::peer::{Peer, PeerDiscovery, PeerResolver}; use common_meta::{DatanodeId, FlownodeId}; +use common_time::util::DefaultSystemTimer; use snafu::ResultExt; use crate::cluster::MetaPeerClient; @@ -35,6 +36,7 @@ use crate::discovery::lease::{LeaseValueAccessor, LeaseValueType}; impl PeerDiscovery for MetaPeerClient { async fn active_frontends(&self) -> Result> { utils::alive_frontends( + &DefaultSystemTimer, self, Duration::from_millis(FRONTEND_HEARTBEAT_INTERVAL_MILLIS), ) @@ -47,20 +49,30 @@ impl PeerDiscovery for MetaPeerClient { &self, filter: Option fn(&'a NodeWorkloads) -> bool>, ) -> Result> { - utils::alive_datanodes(self, Duration::from_secs(DATANODE_LEASE_SECS), filter) - .await - .map_err(BoxedError::new) - .context(common_meta::error::ExternalSnafu) + utils::alive_datanodes( + &DefaultSystemTimer, + self, + Duration::from_secs(DATANODE_LEASE_SECS), + filter, + ) + .await + .map_err(BoxedError::new) + .context(common_meta::error::ExternalSnafu) } async fn active_flownodes( &self, filter: Option 
fn(&'a NodeWorkloads) -> bool>, ) -> Result> { - utils::alive_flownodes(self, Duration::from_secs(FLOWNODE_LEASE_SECS), filter) - .await - .map_err(BoxedError::new) - .context(common_meta::error::ExternalSnafu) + utils::alive_flownodes( + &DefaultSystemTimer, + self, + Duration::from_secs(FLOWNODE_LEASE_SECS), + filter, + ) + .await + .map_err(BoxedError::new) + .context(common_meta::error::ExternalSnafu) } } diff --git a/src/meta-srv/src/discovery/lease.rs b/src/meta-srv/src/discovery/lease.rs index 9d9e0d6c23..7035e3bcad 100644 --- a/src/meta-srv/src/discovery/lease.rs +++ b/src/meta-srv/src/discovery/lease.rs @@ -95,20 +95,22 @@ impl LeaseValueAccessor for MetaPeerClient { #[cfg(test)] mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicI64, Ordering}; use std::time::Duration; - use api::v1::meta::DatanodeWorkloads; use api::v1::meta::heartbeat_request::NodeWorkloads; + use api::v1::meta::{DatanodeWorkloads, FlownodeWorkloads}; use common_meta::cluster::{FrontendStatus, NodeInfo, NodeInfoKey, NodeStatus, Role}; use common_meta::distributed_time_constants::FRONTEND_HEARTBEAT_INTERVAL_MILLIS; use common_meta::kv_backend::ResettableKvBackendRef; use common_meta::peer::{Peer, PeerDiscovery}; use common_meta::rpc::store::PutRequest; - use common_time::util::current_time_millis; + use common_time::util::{DefaultSystemTimer, SystemTimer, current_time_millis}; use common_workload::DatanodeWorkloadType; use crate::discovery::utils::{self, accept_ingest_workload}; - use crate::key::{DatanodeLeaseKey, LeaseValue}; + use crate::key::{DatanodeLeaseKey, FlownodeLeaseKey, LeaseValue}; use crate::test_util::create_meta_peer_client; async fn put_lease_value( @@ -126,17 +128,47 @@ mod tests { .unwrap(); } + async fn put_flownode_lease_value( + kv_backend: &ResettableKvBackendRef, + key: FlownodeLeaseKey, + value: LeaseValue, + ) { + kv_backend + .put(PutRequest { + key: key.try_into().unwrap(), + value: value.try_into().unwrap(), + prev_kv: false, + }) + .await + .unwrap(); + } + + struct MockTimer { + current: Arc, + } + + impl SystemTimer for MockTimer { + fn current_time_millis(&self) -> i64 { + self.current.fetch_add(1, Ordering::Relaxed) + } + + fn current_time_rfc3339(&self) -> String { + unimplemented!() + } + } + #[tokio::test] async fn test_alive_datanodes() { let client = create_meta_peer_client(); let in_memory = client.memory_backend(); let lease_secs = 10; + let timer = DefaultSystemTimer; // put a stale lease value for node 1 let key = DatanodeLeaseKey { node_id: 1 }; let value = LeaseValue { // 20s ago - timestamp_millis: current_time_millis() - lease_secs * 2 * 1000, + timestamp_millis: timer.current_time_millis() - lease_secs * 2 * 1000, node_addr: "127.0.0.1:20201".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -147,7 +179,7 @@ mod tests { // put a fresh lease value for node 2 let key = DatanodeLeaseKey { node_id: 2 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20202".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -155,6 +187,37 @@ mod tests { }; put_lease_value(&in_memory, key.clone(), value.clone()).await; let peers = utils::alive_datanodes( + &timer, + client.as_ref(), + Duration::from_secs(lease_secs as u64), + None, + ) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers, vec![Peer::new(2, 
"127.0.0.1:20202".to_string())]); + } + + #[tokio::test] + async fn test_alive_datanodes_with_timer() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = MockTimer { + current: Arc::new(AtomicI64::new(current_time_millis())), + }; + + let key = DatanodeLeaseKey { node_id: 2 }; + let value = LeaseValue { + timestamp_millis: timer.current_time_millis(), + node_addr: "127.0.0.1:20202".to_string(), + workloads: NodeWorkloads::Datanode(DatanodeWorkloads { + types: vec![DatanodeWorkloadType::Hybrid as i32], + }), + }; + put_lease_value(&in_memory, key.clone(), value.clone()).await; + let peers = utils::alive_datanodes( + &timer, client.as_ref(), Duration::from_secs(lease_secs as u64), None, @@ -170,12 +233,13 @@ mod tests { let client = create_meta_peer_client(); let in_memory = client.memory_backend(); let lease_secs = 10; + let timer = DefaultSystemTimer; // put a lease value for node 1 without mode info let key = DatanodeLeaseKey { node_id: 1 }; let value = LeaseValue { // 20s ago - timestamp_millis: current_time_millis() - 20 * 1000, + timestamp_millis: timer.current_time_millis() - 20 * 1000, node_addr: "127.0.0.1:20201".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -186,7 +250,7 @@ mod tests { // put a lease value for node 2 with mode info let key = DatanodeLeaseKey { node_id: 2 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20202".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![DatanodeWorkloadType::Hybrid as i32], @@ -197,7 +261,7 @@ mod tests { // put a lease value for node 3 with mode info let key = DatanodeLeaseKey { node_id: 3 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20203".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![i32::MAX], @@ -208,7 +272,7 @@ mod tests { // put a lease value for node 3 with mode info let key = DatanodeLeaseKey { node_id: 4 }; let value = LeaseValue { - timestamp_millis: current_time_millis(), + timestamp_millis: timer.current_time_millis(), node_addr: "127.0.0.1:20204".to_string(), workloads: NodeWorkloads::Datanode(DatanodeWorkloads { types: vec![i32::MAX], @@ -217,6 +281,7 @@ mod tests { put_lease_value(&in_memory, key, value).await; let peers = utils::alive_datanodes( + &timer, client.as_ref(), Duration::from_secs(lease_secs), Some(accept_ingest_workload), @@ -227,18 +292,84 @@ mod tests { assert!(peers.contains(&Peer::new(2, "127.0.0.1:20202".to_string()))); } + #[tokio::test] + async fn test_alive_flownodes() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = DefaultSystemTimer; + + // put a stale lease value for node 1 + let key = FlownodeLeaseKey { node_id: 1 }; + let value = LeaseValue { + // 20s ago + timestamp_millis: timer.current_time_millis() - lease_secs * 2 * 1000, + node_addr: "127.0.0.1:20201".to_string(), + workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }), + }; + put_flownode_lease_value(&in_memory, key, value).await; + + // put a fresh lease value for node 2 + let key = FlownodeLeaseKey { node_id: 2 }; + let value = LeaseValue { + timestamp_millis: timer.current_time_millis(), + node_addr: "127.0.0.1:20202".to_string(), + workloads: 
NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }), + }; + put_flownode_lease_value(&in_memory, key.clone(), value.clone()).await; + let peers = utils::alive_flownodes( + &timer, + client.as_ref(), + Duration::from_secs(lease_secs as u64), + None, + ) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]); + } + + #[tokio::test] + async fn test_alive_flownodes_with_timer() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = MockTimer { + current: Arc::new(AtomicI64::new(current_time_millis())), + }; + + let key = FlownodeLeaseKey { node_id: 2 }; + let value = LeaseValue { + timestamp_millis: timer.current_time_millis(), + node_addr: "127.0.0.1:20202".to_string(), + workloads: NodeWorkloads::Flownode(FlownodeWorkloads { types: vec![] }), + }; + put_flownode_lease_value(&in_memory, key.clone(), value.clone()).await; + let peers = utils::alive_flownodes( + &timer, + client.as_ref(), + Duration::from_secs(lease_secs as u64), + None, + ) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers, vec![Peer::new(2, "127.0.0.1:20202".to_string())]); + } + #[tokio::test] async fn test_lookup_frontends() { let client = create_meta_peer_client(); let in_memory = client.memory_backend(); let lease_secs = 10; + let timer = DefaultSystemTimer; let active_frontend_node = NodeInfo { peer: Peer { id: 0, addr: "127.0.0.1:20201".to_string(), }, - last_activity_ts: current_time_millis(), + last_activity_ts: timer.current_time_millis(), status: NodeStatus::Frontend(FrontendStatus {}), version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), @@ -266,7 +397,7 @@ mod tests { id: 1, addr: "127.0.0.1:20201".to_string(), }, - last_activity_ts: current_time_millis() - 20 * 1000, + last_activity_ts: timer.current_time_millis() - 20 * 1000, status: NodeStatus::Frontend(FrontendStatus {}), version: "1.0.0".to_string(), git_commit: "1234567890".to_string(), @@ -287,9 +418,52 @@ mod tests { .await .unwrap(); - let peers = utils::alive_frontends(client.as_ref(), Duration::from_secs(lease_secs)) + let peers = + utils::alive_frontends(&timer, client.as_ref(), Duration::from_secs(lease_secs)) + .await + .unwrap(); + assert_eq!(peers.len(), 1); + assert_eq!(peers[0].id, 0); + } + + #[tokio::test] + async fn test_lookup_frontends_with_timer() { + let client = create_meta_peer_client(); + let in_memory = client.memory_backend(); + let lease_secs = 10; + let timer = MockTimer { + current: Arc::new(AtomicI64::new(current_time_millis())), + }; + + let active_frontend_node = NodeInfo { + peer: Peer { + id: 0, + addr: "127.0.0.1:20201".to_string(), + }, + last_activity_ts: timer.current_time_millis(), + status: NodeStatus::Frontend(FrontendStatus {}), + version: "1.0.0".to_string(), + git_commit: "1234567890".to_string(), + start_time_ms: current_time_millis() as u64, + total_cpu_millicores: 0, + total_memory_bytes: 0, + cpu_usage_millicores: 0, + memory_usage_bytes: 0, + hostname: "test_hostname".to_string(), + }; + let key_prefix = NodeInfoKey::key_prefix_with_role(Role::Frontend); + in_memory + .put(PutRequest { + key: format!("{}{}", key_prefix, "0").into(), + value: active_frontend_node.try_into().unwrap(), + prev_kv: false, + }) .await .unwrap(); + let peers = + utils::alive_frontends(&timer, client.as_ref(), Duration::from_secs(lease_secs)) + .await + .unwrap(); assert_eq!(peers.len(), 1); assert_eq!(peers[0].id, 0); } diff --git 
a/src/meta-srv/src/discovery/utils.rs b/src/meta-srv/src/discovery/utils.rs index 9a8ec7a82c..317033e0cf 100644 --- a/src/meta-srv/src/discovery/utils.rs +++ b/src/meta-srv/src/discovery/utils.rs @@ -19,7 +19,7 @@ use common_meta::DatanodeId; use common_meta::cluster::NodeInfo; use common_meta::kv_backend::KvBackendRef; use common_meta::peer::Peer; -use common_time::util::{DefaultSystemTimer, SystemTimer}; +use common_time::util::SystemTimer; use common_workload::DatanodeWorkloadType; use snafu::ResultExt; @@ -49,16 +49,9 @@ pub trait LastActiveTs { /// Builds a filter closure that checks whether a [`LastActiveTs`] item /// is still within the specified active duration, relative to the /// current time provided by the given [`SystemTimer`]. -/// -/// The returned closure uses the timestamp at the time of building, -/// so the "now" reference point is fixed when this function is called. -pub fn build_active_filter( - timer: impl SystemTimer, - active_duration: Duration, -) -> impl Fn(&T) -> bool { - let now = timer.current_time_millis(); - let active_duration = active_duration.as_millis() as u64; - move |item: &T| { +pub fn build_active_filter(active_duration: Duration) -> impl Fn(i64, &T) -> bool { + move |now: i64, item: &T| { + let active_duration = active_duration.as_millis() as u64; let elapsed = now.saturating_sub(item.last_active_ts()) as u64; elapsed < active_duration } @@ -66,18 +59,19 @@ pub fn build_active_filter( /// Returns the alive datanodes. pub async fn alive_datanodes( + timer: &impl SystemTimer, accessor: &impl LeaseValueAccessor, active_duration: Duration, condition: Option bool>, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); + let active_filter = build_active_filter(active_duration); let condition = condition.unwrap_or(|_| true); - Ok(accessor - .lease_values(LeaseValueType::Datanode) - .await? + let lease_values = accessor.lease_values(LeaseValueType::Datanode).await?; + let now = timer.current_time_millis(); + Ok(lease_values .into_iter() .filter_map(|(peer_id, lease_value)| { - if active_filter(&lease_value) && condition(&lease_value.workloads) { + if active_filter(now, &lease_value) && condition(&lease_value.workloads) { Some(Peer::new(peer_id, lease_value.node_addr)) } else { None @@ -88,18 +82,19 @@ pub async fn alive_datanodes( /// Returns the alive flownodes. pub async fn alive_flownodes( + timer: &impl SystemTimer, accessor: &impl LeaseValueAccessor, active_duration: Duration, condition: Option bool>, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); + let active_filter = build_active_filter(active_duration); let condition = condition.unwrap_or(|_| true); - Ok(accessor - .lease_values(LeaseValueType::Flownode) - .await? + let lease_values = accessor.lease_values(LeaseValueType::Flownode).await?; + let now = timer.current_time_millis(); + Ok(lease_values .into_iter() .filter_map(|(peer_id, lease_value)| { - if active_filter(&lease_value) && condition(&lease_value.workloads) { + if active_filter(now, &lease_value) && condition(&lease_value.workloads) { Some(Peer::new(peer_id, lease_value.node_addr)) } else { None @@ -110,16 +105,17 @@ pub async fn alive_flownodes( /// Returns the alive frontends. pub async fn alive_frontends( + timer: &impl SystemTimer, lister: &impl NodeInfoAccessor, active_duration: Duration, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); - Ok(lister - .node_infos(NodeInfoType::Frontend) - .await? 
+ let active_filter = build_active_filter(active_duration); + let node_infos = lister.node_infos(NodeInfoType::Frontend).await?; + let now = timer.current_time_millis(); + Ok(node_infos .into_iter() .filter_map(|(_, node_info)| { - if active_filter(&node_info) { + if active_filter(now, &node_info) { Some(node_info.peer) } else { None @@ -130,15 +126,18 @@ pub async fn alive_frontends( /// Returns the alive datanode peer. pub async fn alive_datanode( + timer: &impl SystemTimer, lister: &impl LeaseValueAccessor, peer_id: u64, active_duration: Duration, ) -> Result> { - let active_filter = build_active_filter(DefaultSystemTimer, active_duration); - let v = lister + let active_filter = build_active_filter(active_duration); + let lease_value = lister .lease_value(LeaseValueType::Datanode, peer_id) - .await? - .filter(|(_, lease)| active_filter(lease)) + .await?; + let now = timer.current_time_millis(); + let v = lease_value + .filter(|(_, lease)| active_filter(now, lease)) .map(|(peer_id, lease)| Peer::new(peer_id, lease.node_addr)); Ok(v) diff --git a/src/meta-srv/src/election/etcd.rs b/src/meta-srv/src/election/etcd.rs index 7a6a02f490..883f723d74 100644 --- a/src/meta-srv/src/election/etcd.rs +++ b/src/meta-srv/src/election/etcd.rs @@ -63,22 +63,6 @@ pub struct EtcdElection { } impl EtcdElection { - pub async fn with_endpoints( - leader_value: E, - endpoints: S, - store_key_prefix: String, - ) -> Result - where - E: AsRef, - S: AsRef<[E]>, - { - let client = Client::connect(endpoints, None) - .await - .context(error::ConnectEtcdSnafu)?; - - Self::with_etcd_client(leader_value, client, store_key_prefix).await - } - pub async fn with_etcd_client( leader_value: E, client: Client, diff --git a/src/meta-srv/src/election/rds/mysql.rs b/src/meta-srv/src/election/rds/mysql.rs index a15d1bac51..78832e3e11 100644 --- a/src/meta-srv/src/election/rds/mysql.rs +++ b/src/meta-srv/src/election/rds/mysql.rs @@ -1651,6 +1651,41 @@ mod tests { drop_table(&leader_mysql_election.client, table_name).await; } + #[tokio::test] + async fn test_reset_campaign() { + maybe_skip_mysql_integration_test!(); + common_telemetry::init_default_ut_logging(); + let leader_value = "test_leader".to_string(); + let uuid = uuid::Uuid::new_v4().to_string(); + let table_name = "test_reset_campaign_greptime_metakv"; + let candidate_lease_ttl = Duration::from_secs(5); + let meta_lease_ttl = Duration::from_secs(2); + let execution_timeout = Duration::from_secs(10); + let idle_session_timeout = Duration::from_secs(0); + let client = create_mysql_client(Some(table_name), execution_timeout, idle_session_timeout) + .await + .unwrap(); + + let (tx, _) = broadcast::channel(100); + let leader_mysql_election = MySqlElection { + leader_value, + client, + is_leader: AtomicBool::new(false), + leader_infancy: AtomicBool::new(true), + leader_watcher: tx, + store_key_prefix: uuid, + candidate_lease_ttl, + meta_lease_ttl, + sql_set: ElectionSqlFactory::new(table_name).build(), + }; + leader_mysql_election + .is_leader + .store(true, Ordering::Relaxed); + leader_mysql_election.reset_campaign().await; + assert!(!leader_mysql_election.is_leader()); + drop_table(&leader_mysql_election.client, table_name).await; + } + #[tokio::test] async fn test_follower_action() { maybe_skip_mysql_integration_test!(); diff --git a/src/meta-srv/src/election/rds/postgres.rs b/src/meta-srv/src/election/rds/postgres.rs index 5d8a8bf2fa..77bcd30dfe 100644 --- a/src/meta-srv/src/election/rds/postgres.rs +++ b/src/meta-srv/src/election/rds/postgres.rs @@ -1582,6 +1582,44 @@ 
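// --- Illustrative sketch (not part of the patch) ---------------------------------
// The reworked `build_active_filter` returns a closure that takes `now` explicitly, so
// callers fetch the time once (via a `SystemTimer`) and tests can inject a mock clock.
// `LastActive` and `Lease` below are hypothetical stand-ins for the crate's
// `LastActiveTs` trait and lease value; only the filtering logic mirrors the patch.
use std::time::Duration;

trait LastActive {
    fn last_active_ts(&self) -> i64;
}

struct Lease {
    timestamp_millis: i64,
}

impl LastActive for Lease {
    fn last_active_ts(&self) -> i64 {
        self.timestamp_millis
    }
}

fn build_active_filter<T: LastActive>(active_duration: Duration) -> impl Fn(i64, &T) -> bool {
    move |now: i64, item: &T| {
        let active_duration = active_duration.as_millis() as u64;
        let elapsed = now.saturating_sub(item.last_active_ts()) as u64;
        elapsed < active_duration
    }
}

fn main() {
    let filter = build_active_filter::<Lease>(Duration::from_secs(10));
    let now = 1_000_000;
    // Heartbeat 5s ago: still alive. Heartbeat 20s ago: expired.
    assert!(filter(now, &Lease { timestamp_millis: now - 5_000 }));
    assert!(!filter(now, &Lease { timestamp_millis: now - 20_000 }));
}
// ----------------------------------------------------------------------------------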
mod tests { drop_table(&follower_pg_election, table_name).await; } + #[tokio::test] + async fn test_reset_campaign() { + maybe_skip_postgres_integration_test!(); + let leader_value = "test_leader".to_string(); + let uuid = uuid::Uuid::new_v4().to_string(); + let table_name = "test_reset_campaign_greptime_metakv"; + let candidate_lease_ttl = Duration::from_secs(5); + let execution_timeout = Duration::from_secs(10); + let statement_timeout = Duration::from_secs(10); + let meta_lease_ttl = Duration::from_secs(2); + let idle_session_timeout = Duration::from_secs(0); + let client = create_postgres_client( + Some(table_name), + execution_timeout, + idle_session_timeout, + statement_timeout, + ) + .await + .unwrap(); + + let (tx, _) = broadcast::channel(100); + let leader_pg_election = PgElection { + leader_value, + pg_client: RwLock::new(client), + is_leader: AtomicBool::new(false), + leader_infancy: AtomicBool::new(true), + leader_watcher: tx, + store_key_prefix: uuid, + candidate_lease_ttl, + meta_lease_ttl, + sql_set: ElectionSqlFactory::new(28321, None, table_name).build(), + }; + leader_pg_election.is_leader.store(true, Ordering::Relaxed); + leader_pg_election.reset_campaign().await; + assert!(!leader_pg_election.is_leader()); + drop_table(&leader_pg_election, table_name).await; + } + #[tokio::test] async fn test_idle_session_timeout() { maybe_skip_postgres_integration_test!(); diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index 2f4756c2ae..f00ccdeb3a 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -23,6 +23,7 @@ use store_api::storage::RegionId; use table::metadata::TableId; use tokio::sync::mpsc::error::SendError; use tonic::codegen::http; +use uuid::Uuid; use crate::metasrv::SelectTarget; use crate::pubsub::Message; @@ -304,13 +305,6 @@ pub enum Error { source: servers::error::Error, }, - #[snafu(display("Failed to init export metrics task"))] - InitExportMetricsTask { - #[snafu(implicit)] - location: Location, - source: servers::error::Error, - }, - #[snafu(display("Failed to parse address {}", addr))] ParseAddr { addr: String, @@ -989,13 +983,63 @@ pub enum Error { #[snafu(source)] source: common_meta::error::Error, }, + + #[snafu(display( + "Repartition group {} source region missing, region id: {}", + group_id, + region_id + ))] + RepartitionSourceRegionMissing { + group_id: Uuid, + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Repartition group {} target region missing, region id: {}", + group_id, + region_id + ))] + RepartitionTargetRegionMissing { + group_id: Uuid, + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to serialize partition expression: {}", source))] + SerializePartitionExpr { + #[snafu(source)] + source: partition::error::Error, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Partition expression mismatch, region id: {}, expected: {}, actual: {}", + region_id, + expected, + actual + ))] + PartitionExprMismatch { + region_id: RegionId, + expected: String, + actual: String, + #[snafu(implicit)] + location: Location, + }, } impl Error { /// Returns `true` if the error is retryable. pub fn is_retryable(&self) -> bool { - matches!(self, Error::RetryLater { .. }) - || matches!(self, Error::RetryLaterWithSource { .. }) + matches!( + self, + Error::RetryLater { .. } + | Error::RetryLaterWithSource { .. } + | Error::MailboxTimeout { .. 
} + ) } } @@ -1044,6 +1088,7 @@ impl ErrorExt for Error { | Error::MailboxChannelClosed { .. } | Error::IsNotLeader { .. } => StatusCode::IllegalState, Error::RetryLaterWithSource { source, .. } => source.status_code(), + Error::SerializePartitionExpr { source, .. } => source.status_code(), Error::Unsupported { .. } => StatusCode::Unsupported, @@ -1061,12 +1106,14 @@ impl ErrorExt for Error { | Error::ParseAddr { .. } | Error::UnsupportedSelectorType { .. } | Error::InvalidArguments { .. } - | Error::InitExportMetricsTask { .. } | Error::ProcedureNotFound { .. } | Error::TooManyPartitions { .. } | Error::TomlFormat { .. } | Error::HandlerNotFound { .. } - | Error::LeaderPeerChanged { .. } => StatusCode::InvalidArguments, + | Error::LeaderPeerChanged { .. } + | Error::RepartitionSourceRegionMissing { .. } + | Error::RepartitionTargetRegionMissing { .. } + | Error::PartitionExprMismatch { .. } => StatusCode::InvalidArguments, Error::LeaseKeyFromUtf8 { .. } | Error::LeaseValueFromUtf8 { .. } | Error::InvalidRegionKeyFromUtf8 { .. } diff --git a/src/meta-srv/src/events/region_migration_event.rs b/src/meta-srv/src/events/region_migration_event.rs index 3fc8500599..7e5c5b6fc2 100644 --- a/src/meta-srv/src/events/region_migration_event.rs +++ b/src/meta-srv/src/events/region_migration_event.rs @@ -21,7 +21,7 @@ use common_event_recorder::Event; use common_event_recorder::error::{Result, SerializeEventSnafu}; use serde::Serialize; use snafu::ResultExt; -use store_api::storage::{RegionId, TableId}; +use store_api::storage::RegionId; use crate::procedure::region_migration::{PersistentContext, RegionMigrationTriggerReason}; @@ -37,35 +37,34 @@ pub const EVENTS_TABLE_DST_NODE_ID_COLUMN_NAME: &str = "region_migration_dst_nod pub const EVENTS_TABLE_DST_PEER_ADDR_COLUMN_NAME: &str = "region_migration_dst_peer_addr"; /// RegionMigrationEvent is the event of region migration. -#[derive(Debug, Serialize)] +#[derive(Debug)] pub(crate) struct RegionMigrationEvent { - #[serde(skip)] - region_id: RegionId, - #[serde(skip)] - table_id: TableId, - #[serde(skip)] - region_number: u32, - #[serde(skip)] + // The region ids of the region migration. + region_ids: Vec, + // The trigger reason of the region migration. trigger_reason: RegionMigrationTriggerReason, - #[serde(skip)] + // The source node id of the region migration. src_node_id: u64, - #[serde(skip)] + // The source peer address of the region migration. src_peer_addr: String, - #[serde(skip)] + // The destination node id of the region migration. dst_node_id: u64, - #[serde(skip)] + // The destination peer address of the region migration. dst_peer_addr: String, + // The timeout of the region migration. + timeout: Duration, +} - // The following fields will be serialized as the json payload. 
+#[derive(Debug, Serialize)] +struct Payload { + #[serde(with = "humantime_serde")] timeout: Duration, } impl RegionMigrationEvent { pub fn from_persistent_ctx(ctx: &PersistentContext) -> Self { Self { - region_id: ctx.region_id, - table_id: ctx.region_id.table_id(), - region_number: ctx.region_id.region_number(), + region_ids: ctx.region_ids.clone(), trigger_reason: ctx.trigger_reason, src_node_id: ctx.from_peer.id, src_peer_addr: ctx.from_peer.addr.clone(), @@ -134,23 +133,31 @@ impl Event for RegionMigrationEvent { ] } - fn extra_row(&self) -> Result { - Ok(Row { - values: vec![ - ValueData::U64Value(self.region_id.as_u64()).into(), - ValueData::U32Value(self.table_id).into(), - ValueData::U32Value(self.region_number).into(), - ValueData::StringValue(self.trigger_reason.to_string()).into(), - ValueData::U64Value(self.src_node_id).into(), - ValueData::StringValue(self.src_peer_addr.clone()).into(), - ValueData::U64Value(self.dst_node_id).into(), - ValueData::StringValue(self.dst_peer_addr.clone()).into(), - ], - }) + fn extra_rows(&self) -> Result> { + let mut extra_rows = Vec::with_capacity(self.region_ids.len()); + for region_id in &self.region_ids { + extra_rows.push(Row { + values: vec![ + ValueData::U64Value(region_id.as_u64()).into(), + ValueData::U32Value(region_id.table_id()).into(), + ValueData::U32Value(region_id.region_number()).into(), + ValueData::StringValue(self.trigger_reason.to_string()).into(), + ValueData::U64Value(self.src_node_id).into(), + ValueData::StringValue(self.src_peer_addr.clone()).into(), + ValueData::U64Value(self.dst_node_id).into(), + ValueData::StringValue(self.dst_peer_addr.clone()).into(), + ], + }); + } + + Ok(extra_rows) } fn json_payload(&self) -> Result { - serde_json::to_string(self).context(SerializeEventSnafu) + serde_json::to_string(&Payload { + timeout: self.timeout, + }) + .context(SerializeEventSnafu) } fn as_any(&self) -> &dyn Any { diff --git a/src/meta-srv/src/gc.rs b/src/meta-srv/src/gc.rs new file mode 100644 index 0000000000..d8c0adb204 --- /dev/null +++ b/src/meta-srv/src/gc.rs @@ -0,0 +1,39 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
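[Aside on the event change above: per-region data now becomes one extra row per migrated region id, while the JSON payload carries only the humantime-serialized timeout. A standalone sketch of that payload shape, assuming the serde, serde_json, and humantime_serde crates as used in the diff; the exact rendered string is an assumption.]

use std::time::Duration;

use serde::Serialize;

#[derive(Serialize)]
struct Payload {
    #[serde(with = "humantime_serde")]
    timeout: Duration,
}

fn main() {
    let json = serde_json::to_string(&Payload {
        timeout: Duration::from_secs(300),
    })
    .unwrap();
    // humantime_serde renders durations in human-readable form, e.g. {"timeout":"5m"}
    println!("{json}");
}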
+ +// TODO(discord9): remove this once gc scheduler is fully merged +#![allow(unused)] + +use std::collections::{HashMap, HashSet}; + +use common_meta::peer::Peer; +use store_api::storage::RegionId; + +mod candidate; +mod ctx; +mod handler; +#[cfg(test)] +mod mock; +mod options; +mod procedure; +mod scheduler; +mod tracker; + +pub use options::GcSchedulerOptions; +pub use procedure::BatchGcProcedure; +pub(crate) use scheduler::{GcScheduler, GcTickerRef}; + +pub type Region2Peers = HashMap)>; + +pub(crate) type Peer2Regions = HashMap>; diff --git a/src/meta-srv/src/gc/candidate.rs b/src/meta-srv/src/gc/candidate.rs new file mode 100644 index 0000000000..05fc79ac52 --- /dev/null +++ b/src/meta-srv/src/gc/candidate.rs @@ -0,0 +1,135 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::time::Instant; + +use common_meta::datanode::{RegionManifestInfo, RegionStat}; +use common_telemetry::{debug, info}; +use ordered_float::OrderedFloat; +use store_api::region_engine::RegionRole; +use store_api::storage::RegionId; +use table::metadata::TableId; + +use crate::error::Result; +use crate::gc::scheduler::GcScheduler; + +/// Represents a region candidate for GC with its priority score. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct GcCandidate { + pub(crate) region_id: RegionId, + pub(crate) score: OrderedFloat, + pub(crate) region_stat: RegionStat, +} + +impl GcCandidate { + fn new(region_id: RegionId, score: f64, region_stat: RegionStat) -> Self { + Self { + region_id, + score: OrderedFloat(score), + region_stat, + } + } + + #[allow(unused)] + fn score_f64(&self) -> f64 { + self.score.into_inner() + } +} + +impl GcScheduler { + /// Calculate GC priority score for a region based on various metrics. + fn calculate_gc_score(&self, region_stat: &RegionStat) -> f64 { + let sst_count_score = region_stat.sst_num as f64 * self.config.sst_count_weight; + + let file_remove_cnt_score = match ®ion_stat.region_manifest { + RegionManifestInfo::Mito { + file_removed_cnt, .. + } => *file_removed_cnt as f64 * self.config.file_removed_count_weight, + // Metric engine doesn't have file_removal_rate, also this should be unreachable since metrics engine doesn't support gc + RegionManifestInfo::Metric { .. } => 0.0, + }; + + sst_count_score + file_remove_cnt_score + } + + /// Filter and score regions that are candidates for GC, grouped by table. 
+ pub(crate) async fn select_gc_candidates( + &self, + table_to_region_stats: &HashMap>, + ) -> Result>> { + let mut table_candidates: HashMap> = HashMap::new(); + let now = Instant::now(); + + for (table_id, region_stats) in table_to_region_stats { + let mut candidates = Vec::new(); + let tracker = self.region_gc_tracker.lock().await; + + for region_stat in region_stats { + if region_stat.role != RegionRole::Leader { + continue; + } + + // Skip regions that are too small + if region_stat.approximate_bytes < self.config.min_region_size_threshold { + continue; + } + + // Skip regions that are in cooldown period + if let Some(gc_info) = tracker.get(®ion_stat.id) + && now.saturating_duration_since(gc_info.last_gc_time) + < self.config.gc_cooldown_period + { + debug!("Skipping region {} due to cooldown", region_stat.id); + continue; + } + + let score = self.calculate_gc_score(region_stat); + + debug!( + "Region {} (table {}) has GC score {:.4}", + region_stat.id, table_id, score + ); + + // Only consider regions with a meaningful score + if score > 0.0 { + candidates.push(GcCandidate::new(region_stat.id, score, region_stat.clone())); + } + } + + // Sort candidates by score in descending order and take top N + candidates.sort_by(|a, b| b.score.cmp(&a.score)); + let top_candidates: Vec = candidates + .into_iter() + .take(self.config.regions_per_table_threshold) + .collect(); + + if !top_candidates.is_empty() { + info!( + "Selected {} GC candidates for table {} (top {} out of all qualified)", + top_candidates.len(), + table_id, + self.config.regions_per_table_threshold + ); + table_candidates.insert(*table_id, top_candidates); + } + } + + info!( + "Selected GC candidates for {} tables", + table_candidates.len() + ); + Ok(table_candidates) + } +} diff --git a/src/meta-srv/src/gc/ctx.rs b/src/meta-srv/src/gc/ctx.rs new file mode 100644 index 0000000000..7b1cfc68e1 --- /dev/null +++ b/src/meta-srv/src/gc/ctx.rs @@ -0,0 +1,380 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
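[Aside to make the candidate scoring in candidate.rs above concrete: a toy computation mirroring `calculate_gc_score`. The weight values below are illustrative only; the real defaults come from `GcSchedulerOptions`.]

fn gc_score(
    sst_num: u64,
    file_removed_cnt: u64,
    sst_count_weight: f64,
    file_removed_count_weight: f64,
) -> f64 {
    // Same shape as calculate_gc_score: SST count plus removed-file count, each weighted.
    sst_num as f64 * sst_count_weight + file_removed_cnt as f64 * file_removed_count_weight
}

fn main() {
    // e.g. 50 SSTs and 20 removed manifest files with weights 1.0 / 0.5 => score 60.0
    assert_eq!(gc_score(50, 20, 1.0, 0.5), 60.0);
}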
+ +use std::collections::{HashMap, HashSet}; +use std::time::Duration; + +use api::v1::meta::MailboxMessage; +use common_meta::datanode::RegionStat; +use common_meta::instruction::{ + GcRegions, GetFileRefs, GetFileRefsReply, Instruction, InstructionReply, +}; +use common_meta::key::TableMetadataManagerRef; +use common_meta::key::table_route::PhysicalTableRouteValue; +use common_meta::peer::Peer; +use common_procedure::{ProcedureManagerRef, ProcedureWithId, watcher}; +use common_telemetry::{debug, error, warn}; +use snafu::{OptionExt as _, ResultExt as _}; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; +use table::metadata::TableId; + +use crate::cluster::MetaPeerClientRef; +use crate::error::{self, Result, TableMetadataManagerSnafu, UnexpectedSnafu}; +use crate::gc::Region2Peers; +use crate::gc::procedure::{BatchGcProcedure, GcRegionProcedure}; +use crate::handler::HeartbeatMailbox; +use crate::service::mailbox::{Channel, MailboxRef}; + +#[async_trait::async_trait] +pub(crate) trait SchedulerCtx: Send + Sync { + async fn get_table_to_region_stats(&self) -> Result>>; + + async fn get_table_route( + &self, + table_id: TableId, + ) -> Result<(TableId, PhysicalTableRouteValue)>; + + async fn get_file_references( + &self, + query_regions: &[RegionId], + related_regions: HashMap>, + region_routes: &Region2Peers, + timeout: Duration, + ) -> Result; + + async fn gc_regions( + &self, + peer: Peer, + region_ids: &[RegionId], + file_refs_manifest: &FileRefsManifest, + full_file_listing: bool, + timeout: Duration, + ) -> Result; +} + +pub(crate) struct DefaultGcSchedulerCtx { + /// The metadata manager. + pub(crate) table_metadata_manager: TableMetadataManagerRef, + /// Procedure manager. + pub(crate) procedure_manager: ProcedureManagerRef, + /// For getting `RegionStats`. + pub(crate) meta_peer_client: MetaPeerClientRef, + /// The mailbox to send messages. + pub(crate) mailbox: MailboxRef, + /// The server address. 
+ pub(crate) server_addr: String, +} + +impl DefaultGcSchedulerCtx { + pub fn try_new( + table_metadata_manager: TableMetadataManagerRef, + procedure_manager: ProcedureManagerRef, + meta_peer_client: MetaPeerClientRef, + mailbox: MailboxRef, + server_addr: String, + ) -> Result { + Ok(Self { + table_metadata_manager, + procedure_manager, + meta_peer_client, + mailbox, + server_addr, + }) + } +} + +#[async_trait::async_trait] +impl SchedulerCtx for DefaultGcSchedulerCtx { + async fn get_table_to_region_stats(&self) -> Result>> { + let dn_stats = self.meta_peer_client.get_all_dn_stat_kvs().await?; + let mut table_to_region_stats: HashMap> = HashMap::new(); + for (_dn_id, stats) in dn_stats { + let mut stats = stats.stats; + + let Some(latest_stat) = stats.iter().max_by_key(|s| s.timestamp_millis).cloned() else { + continue; + }; + + for region_stat in latest_stat.region_stats { + table_to_region_stats + .entry(region_stat.id.table_id()) + .or_default() + .push(region_stat); + } + } + Ok(table_to_region_stats) + } + + async fn get_table_route( + &self, + table_id: TableId, + ) -> Result<(TableId, PhysicalTableRouteValue)> { + self.table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await + .context(TableMetadataManagerSnafu) + } + + async fn gc_regions( + &self, + peer: Peer, + region_ids: &[RegionId], + file_refs_manifest: &FileRefsManifest, + full_file_listing: bool, + timeout: Duration, + ) -> Result { + self.gc_regions_inner( + peer, + region_ids, + file_refs_manifest, + full_file_listing, + timeout, + ) + .await + } + + async fn get_file_references( + &self, + query_regions: &[RegionId], + related_regions: HashMap>, + region_routes: &Region2Peers, + timeout: Duration, + ) -> Result { + debug!( + "Getting file references for {} regions", + query_regions.len() + ); + + // Group regions by datanode to minimize RPC calls + let mut datanode2query_regions: HashMap> = HashMap::new(); + + for region_id in query_regions { + if let Some((leader, followers)) = region_routes.get(region_id) { + datanode2query_regions + .entry(leader.clone()) + .or_default() + .push(*region_id); + // also need to send for follower regions for file refs in case query is running on follower + for follower in followers { + datanode2query_regions + .entry(follower.clone()) + .or_default() + .push(*region_id); + } + } else { + return error::UnexpectedSnafu { + violated: format!( + "region_routes: {region_routes:?} does not contain region_id: {region_id}", + ), + } + .fail(); + } + } + let mut datanode2related_regions: HashMap>> = + HashMap::new(); + for (related_region, queries) in related_regions { + if let Some((leader, followers)) = region_routes.get(&related_region) { + datanode2related_regions + .entry(leader.clone()) + .or_default() + .insert(related_region, queries.clone()); + } // since read from manifest, no need to send to followers + } + + // Send GetFileRefs instructions to each datanode + let mut all_file_refs: HashMap> = HashMap::new(); + let mut all_manifest_versions = HashMap::new(); + + for (peer, regions) in datanode2query_regions { + let related_regions = datanode2related_regions.remove(&peer).unwrap_or_default(); + match self + .send_get_file_refs_instruction(&peer, ®ions, related_regions, timeout) + .await + { + Ok(manifest) => { + // TODO(discord9): if other regions provide file refs for one region on other datanode, and no version, + // is it correct to merge manifest_version directly? + // FIXME: follower region how to merge version??? 
+ + for (region_id, file_refs) in manifest.file_refs { + all_file_refs + .entry(region_id) + .or_default() + .extend(file_refs); + } + // region manifest version should be the smallest one among all peers, so outdated region can be detected + for (region_id, version) in manifest.manifest_version { + let entry = all_manifest_versions.entry(region_id).or_insert(version); + *entry = (*entry).min(version); + } + } + Err(e) => { + warn!( + "Failed to get file refs from datanode {}: {}. Skipping regions on this datanode.", + peer, e + ); + // Continue processing other datanodes instead of failing the entire operation + continue; + } + } + } + + Ok(FileRefsManifest { + file_refs: all_file_refs, + manifest_version: all_manifest_versions, + }) + } +} + +impl DefaultGcSchedulerCtx { + async fn gc_regions_inner( + &self, + peer: Peer, + region_ids: &[RegionId], + file_refs_manifest: &FileRefsManifest, + full_file_listing: bool, + timeout: Duration, + ) -> Result { + debug!( + "Sending GC instruction to datanode {} for {} regions (full_file_listing: {})", + peer, + region_ids.len(), + full_file_listing + ); + + let gc_regions = GcRegions { + regions: region_ids.to_vec(), + file_refs_manifest: file_refs_manifest.clone(), + full_file_listing, + }; + let procedure = GcRegionProcedure::new( + self.mailbox.clone(), + self.server_addr.clone(), + peer, + gc_regions, + format!("GC for {} regions", region_ids.len()), + timeout, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + + let id = procedure_with_id.id; + + let mut watcher = self + .procedure_manager + .submit(procedure_with_id) + .await + .context(error::SubmitProcedureSnafu)?; + let res = watcher::wait(&mut watcher) + .await + .context(error::WaitProcedureSnafu)? + .with_context(|| error::UnexpectedSnafu { + violated: format!( + "GC procedure {id} successfully completed but no result returned" + ), + })?; + + let gc_report = GcRegionProcedure::cast_result(res)?; + + Ok(gc_report) + } + + /// TODO(discord9): add support to read manifest of related regions for file refs too + /// (now it's only reading active FileHandles) + async fn send_get_file_refs_instruction( + &self, + peer: &Peer, + query_regions: &[RegionId], + related_regions: HashMap>, + timeout: Duration, + ) -> Result { + debug!( + "Sending GetFileRefs instruction to datanode {} for {} regions", + peer, + query_regions.len() + ); + + let instruction = Instruction::GetFileRefs(GetFileRefs { + query_regions: query_regions.to_vec(), + related_regions, + }); + + let reply = self + .send_instruction(peer, instruction, "Get file references", timeout) + .await?; + + let InstructionReply::GetFileRefs(GetFileRefsReply { + file_refs_manifest, + success, + error, + }) = reply + else { + return error::UnexpectedInstructionReplySnafu { + mailbox_message: format!("{:?}", reply), + reason: "Unexpected reply of the GetFileRefs instruction", + } + .fail(); + }; + + if !success { + return error::UnexpectedSnafu { + violated: format!( + "Failed to get file references from datanode {}: {:?}", + peer, error + ), + } + .fail(); + } + + Ok(file_refs_manifest) + } + + async fn send_instruction( + &self, + peer: &Peer, + instruction: Instruction, + description: &str, + timeout: Duration, + ) -> Result { + let msg = MailboxMessage::json_message( + &format!("{}: {}", description, instruction), + &format!("Metasrv@{}", self.server_addr), + &format!("Datanode-{}@{}", peer.id, peer.addr), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| 
error::SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let mailbox_rx = self + .mailbox + .send(&Channel::Datanode(peer.id), msg, timeout) + .await?; + + match mailbox_rx.await { + Ok(reply_msg) => { + let reply = HeartbeatMailbox::json_reply(&reply_msg)?; + Ok(reply) + } + Err(e) => { + error!( + "Failed to receive reply from datanode {} for {}: {}", + peer, description, e + ); + Err(e) + } + } + } +} diff --git a/src/meta-srv/src/gc/handler.rs b/src/meta-srv/src/gc/handler.rs new file mode 100644 index 0000000000..4085f6289c --- /dev/null +++ b/src/meta-srv/src/gc/handler.rs @@ -0,0 +1,459 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::time::Instant; + +use common_meta::key::table_route::PhysicalTableRouteValue; +use common_meta::peer::Peer; +use common_telemetry::{debug, error, info, warn}; +use futures::StreamExt; +use itertools::Itertools; +use store_api::storage::{FileRefsManifest, GcReport, RegionId}; +use table::metadata::TableId; +use tokio::time::sleep; + +use crate::error::Result; +use crate::gc::candidate::GcCandidate; +use crate::gc::scheduler::{GcJobReport, GcScheduler}; +use crate::gc::tracker::RegionGcInfo; +use crate::region; + +pub(crate) type Region2Peers = HashMap)>; + +pub(crate) type Peer2Regions = HashMap>; + +impl GcScheduler { + /// Iterate through all region stats, find region that might need gc, and send gc instruction to + /// the corresponding datanode with improved parallel processing and retry logic. + pub(crate) async fn trigger_gc(&self) -> Result { + let start_time = Instant::now(); + info!("Starting GC cycle"); + + // Step 1: Get all region statistics + let table_to_region_stats = self.ctx.get_table_to_region_stats().await?; + info!( + "Fetched region stats for {} tables", + table_to_region_stats.len() + ); + + // Step 2: Select GC candidates based on our scoring algorithm + let per_table_candidates = self.select_gc_candidates(&table_to_region_stats).await?; + + if per_table_candidates.is_empty() { + info!("No GC candidates found, skipping GC cycle"); + return Ok(Default::default()); + } + + // Step 3: Aggregate candidates by datanode + let datanode_to_candidates = self + .aggregate_candidates_by_datanode(per_table_candidates) + .await?; + + if datanode_to_candidates.is_empty() { + info!("No valid datanode candidates found, skipping GC cycle"); + return Ok(Default::default()); + } + + // Step 4: Process datanodes concurrently with limited parallelism + let report = self + .parallel_process_datanodes(datanode_to_candidates) + .await; + + let duration = start_time.elapsed(); + info!( + "Finished GC cycle. Processed {} datanodes ({} failed). Duration: {:?}", + report.per_datanode_reports.len(), // Reuse field for datanode count + report.failed_datanodes.len(), + duration + ); + debug!("Detailed GC Job Report: {report:#?}"); + + Ok(report) + } + + /// Find related regions that might share files with the candidate regions. 
+ /// Currently returns the same regions since repartition is not implemented yet. + /// TODO(discord9): When repartition is implemented, this should also find src/dst regions + /// that might share files with the candidate regions. + pub(crate) async fn find_related_regions( + &self, + candidate_region_ids: &[RegionId], + ) -> Result>> { + Ok(candidate_region_ids.iter().map(|&r| (r, vec![r])).collect()) + } + + /// Aggregate GC candidates by their corresponding datanode peer. + pub(crate) async fn aggregate_candidates_by_datanode( + &self, + per_table_candidates: HashMap>, + ) -> Result>> { + let mut datanode_to_candidates: HashMap> = HashMap::new(); + + for (table_id, candidates) in per_table_candidates { + if candidates.is_empty() { + continue; + } + + // Get table route information to map regions to peers + let (phy_table_id, table_peer) = self.ctx.get_table_route(table_id).await?; + + if phy_table_id != table_id { + // Skip logical tables + continue; + } + + let region_to_peer = table_peer + .region_routes + .iter() + .filter_map(|r| { + r.leader_peer + .as_ref() + .map(|peer| (r.region.id, peer.clone())) + }) + .collect::>(); + + for candidate in candidates { + if let Some(peer) = region_to_peer.get(&candidate.region_id) { + datanode_to_candidates + .entry(peer.clone()) + .or_default() + .push((table_id, candidate)); + } else { + warn!( + "Skipping region {} for table {}: no leader peer found", + candidate.region_id, table_id + ); + } + } + } + + info!( + "Aggregated GC candidates for {} datanodes", + datanode_to_candidates.len() + ); + Ok(datanode_to_candidates) + } + + /// Process multiple datanodes concurrently with limited parallelism. + pub(crate) async fn parallel_process_datanodes( + &self, + datanode_to_candidates: HashMap>, + ) -> GcJobReport { + let mut report = GcJobReport::default(); + + // Create a stream of datanode GC tasks with limited concurrency + let results: Vec<_> = futures::stream::iter( + datanode_to_candidates + .into_iter() + .filter(|(_, candidates)| !candidates.is_empty()), + ) + .map(|(peer, candidates)| { + let scheduler = self; + let peer_clone = peer.clone(); + async move { + ( + peer, + scheduler.process_datanode_gc(peer_clone, candidates).await, + ) + } + }) + .buffer_unordered(self.config.max_concurrent_tables) // Reuse table concurrency limit for datanodes + .collect() + .await; + + // Process all datanode GC results and collect regions that need retry from table reports + for (peer, result) in results { + match result { + Ok(dn_report) => { + report.per_datanode_reports.insert(peer.id, dn_report); + } + Err(e) => { + error!("Failed to process datanode GC for peer {}: {:#?}", peer, e); + // Note: We don't have a direct way to map peer to table_id here, + // so we just log the error. The table_reports will contain individual region failures. + report.failed_datanodes.entry(peer.id).or_default().push(e); + } + } + } + + report + } + + /// Process GC for a single datanode with all its candidate regions. + /// Returns the table reports for this datanode. 
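[Aside on `parallel_process_datanodes` above: it fans out one task per datanode with bounded concurrency via `buffer_unordered`. A minimal standalone sketch of that pattern, assuming the futures and tokio crates; the peer ids and the concurrency limit here are made up, and the async block merely stands in for `process_datanode_gc`.]

use futures::StreamExt;

#[tokio::main]
async fn main() {
    let peers: Vec<u64> = vec![1, 2, 3, 4, 5];
    let max_concurrent: usize = 2; // stands in for config.max_concurrent_tables
    let results: Vec<(u64, Result<(), String>)> = futures::stream::iter(peers)
        .map(|peer| async move {
            // Placeholder for process_datanode_gc(peer, candidates).
            (peer, Ok::<(), String>(()))
        })
        .buffer_unordered(max_concurrent)
        .collect()
        .await;
    // All five datanodes are processed, at most two at a time.
    assert_eq!(results.len(), 5);
}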
+ pub(crate) async fn process_datanode_gc( + &self, + peer: Peer, + candidates: Vec<(TableId, GcCandidate)>, + ) -> Result { + info!( + "Starting GC for datanode {} with {} candidate regions", + peer, + candidates.len() + ); + + if candidates.is_empty() { + return Ok(Default::default()); + } + + let all_region_ids: Vec = candidates.iter().map(|(_, c)| c.region_id).collect(); + + let all_related_regions = self.find_related_regions(&all_region_ids).await?; + + let (region_to_peer, _) = self + .discover_datanodes_for_regions(&all_related_regions.keys().cloned().collect_vec()) + .await?; + + // Step 1: Get file references for all regions on this datanode + let file_refs_manifest = self + .ctx + .get_file_references( + &all_region_ids, + all_related_regions, + ®ion_to_peer, + self.config.mailbox_timeout, + ) + .await?; + + // Step 2: Create a single GcRegionProcedure for all regions on this datanode + let (gc_report, fully_listed_regions) = { + // Partition regions into full listing and fast listing in a single pass + + let mut batch_full_listing_decisions = + self.batch_should_use_full_listing(&all_region_ids).await; + + let need_full_list_regions = batch_full_listing_decisions + .iter() + .filter_map( + |(®ion_id, &need_full)| { + if need_full { Some(region_id) } else { None } + }, + ) + .collect_vec(); + let mut fast_list_regions = batch_full_listing_decisions + .iter() + .filter_map( + |(®ion_id, &need_full)| { + if !need_full { Some(region_id) } else { None } + }, + ) + .collect_vec(); + + let mut combined_report = GcReport::default(); + + // First process regions that can fast list + if !fast_list_regions.is_empty() { + match self + .ctx + .gc_regions( + peer.clone(), + &fast_list_regions, + &file_refs_manifest, + false, + self.config.mailbox_timeout, + ) + .await + { + Ok(report) => combined_report.merge(report), + Err(e) => { + error!( + "Failed to GC regions {:?} on datanode {}: {}", + fast_list_regions, peer, e + ); + + // Add to need_retry_regions since it failed + combined_report + .need_retry_regions + .extend(fast_list_regions.clone().into_iter()); + } + } + } + + if !need_full_list_regions.is_empty() { + match self + .ctx + .gc_regions( + peer.clone(), + &need_full_list_regions, + &file_refs_manifest, + true, + self.config.mailbox_timeout, + ) + .await + { + Ok(report) => combined_report.merge(report), + Err(e) => { + error!( + "Failed to GC regions {:?} on datanode {}: {}", + need_full_list_regions, peer, e + ); + + // Add to need_retry_regions since it failed + combined_report + .need_retry_regions + .extend(need_full_list_regions.clone()); + } + } + } + let fully_listed_regions = need_full_list_regions + .into_iter() + .filter(|r| !combined_report.need_retry_regions.contains(r)) + .collect::>(); + + (combined_report, fully_listed_regions) + }; + + // Step 3: Process the combined GC report and update table reports + for region_id in &all_region_ids { + self.update_full_listing_time(*region_id, fully_listed_regions.contains(region_id)) + .await; + } + + info!( + "Completed GC for datanode {}: {} regions processed", + peer, + all_region_ids.len() + ); + + Ok(gc_report) + } + + /// Discover datanodes for the given regions(and it's related regions) by fetching table routes in batches. + /// Returns mappings from region to peer(leader, Vec) and peer to regions. + async fn discover_datanodes_for_regions( + &self, + regions: &[RegionId], + ) -> Result<(Region2Peers, Peer2Regions)> { + let all_related_regions = self + .find_related_regions(regions) + .await? 
+ .into_iter() + .flat_map(|(k, mut v)| { + v.push(k); + v + }) + .collect_vec(); + let mut region_to_peer = HashMap::new(); + let mut peer_to_regions = HashMap::new(); + + // Group regions by table ID for batch processing + let mut table_to_regions: HashMap> = HashMap::new(); + for region_id in all_related_regions { + let table_id = region_id.table_id(); + table_to_regions + .entry(table_id) + .or_default() + .push(region_id); + } + + // Process each table's regions together for efficiency + for (table_id, table_regions) in table_to_regions { + match self.ctx.get_table_route(table_id).await { + Ok((_phy_table_id, table_route)) => { + self.get_table_regions_peer( + &table_route, + &table_regions, + &mut region_to_peer, + &mut peer_to_regions, + ); + } + Err(e) => { + // Continue with other tables instead of failing completely + // TODO(discord9): consider failing here instead + warn!( + "Failed to get table route for table {}: {}, skipping its regions", + table_id, e + ); + continue; + } + } + } + + Ok((region_to_peer, peer_to_regions)) + } + + /// Process regions for a single table to find their current leader peers. + fn get_table_regions_peer( + &self, + table_route: &PhysicalTableRouteValue, + table_regions: &[RegionId], + region_to_peer: &mut Region2Peers, + peer_to_regions: &mut Peer2Regions, + ) { + for ®ion_id in table_regions { + let mut found = false; + + // Find the region in the table route + for region_route in &table_route.region_routes { + if region_route.region.id == region_id + && let Some(leader_peer) = ®ion_route.leader_peer + { + region_to_peer.insert( + region_id, + (leader_peer.clone(), region_route.follower_peers.clone()), + ); + peer_to_regions + .entry(leader_peer.clone()) + .or_default() + .insert(region_id); + found = true; + break; + } + } + + if !found { + warn!( + "Failed to find region {} in table route or no leader peer found", + region_id, + ); + } + } + } + + async fn batch_should_use_full_listing( + &self, + region_ids: &[RegionId], + ) -> HashMap { + let mut result = HashMap::new(); + let mut gc_tracker = self.region_gc_tracker.lock().await; + let now = Instant::now(); + for ®ion_id in region_ids { + let use_full_listing = { + if let Some(gc_info) = gc_tracker.get(®ion_id) { + if let Some(last_full_listing) = gc_info.last_full_listing_time { + // check if pass cooling down interval after last full listing + let elapsed = now.saturating_duration_since(last_full_listing); + elapsed >= self.config.full_file_listing_interval + } else { + // Never did full listing for this region, do it now + true + } + } else { + // First time GC for this region, skip doing full listing, for this time + gc_tracker.insert( + region_id, + RegionGcInfo { + last_gc_time: now, + last_full_listing_time: Some(now), + }, + ); + false + } + }; + result.insert(region_id, use_full_listing); + } + result + } +} diff --git a/src/meta-srv/src/gc/mock.rs b/src/meta-srv/src/gc/mock.rs new file mode 100644 index 0000000000..61ec515985 --- /dev/null +++ b/src/meta-srv/src/gc/mock.rs @@ -0,0 +1,458 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
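[Aside on `batch_should_use_full_listing` above: the decision boils down to a three-way check. A standalone sketch with a plain struct standing in for `RegionGcInfo`; note the real code additionally records first-seen regions in the tracker, which this sketch omits.]

use std::time::{Duration, Instant};

struct GcInfo {
    last_full_listing_time: Option<Instant>,
}

fn should_full_list(info: Option<&GcInfo>, now: Instant, interval: Duration) -> bool {
    match info {
        // Tracked region with a previous full listing: only after the cooldown interval.
        Some(GcInfo { last_full_listing_time: Some(last) }) => {
            now.saturating_duration_since(*last) >= interval
        }
        // Tracked region that never did a full listing: do it now.
        Some(GcInfo { last_full_listing_time: None }) => true,
        // Region seen for the first time: skip the full listing this round.
        None => false,
    }
}

fn main() {
    let interval = Duration::from_secs(3600);
    let now = Instant::now();
    assert!(!should_full_list(None, now, interval));
    assert!(should_full_list(
        Some(&GcInfo { last_full_listing_time: None }),
        now,
        interval
    ));
}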
+// See the License for the specific language governing permissions and +// limitations under the License. + +mod basic; +mod candidate_select; +mod concurrent; +mod config; +mod err_handle; +mod full_list; +mod integration; +mod misc; + +use std::collections::{HashMap, HashSet}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use common_meta::datanode::{RegionManifestInfo, RegionStat}; +use common_meta::key::table_route::PhysicalTableRouteValue; +use common_meta::peer::Peer; +use common_meta::rpc::router::{Region, RegionRoute}; +use common_telemetry::debug; +use ordered_float::OrderedFloat; +use store_api::region_engine::RegionRole; +use store_api::storage::{FileRefsManifest, GcReport, RegionId}; +use table::metadata::TableId; +use tokio::sync::mpsc::Sender; + +use crate::error::{Result, UnexpectedSnafu}; +use crate::gc::candidate::GcCandidate; +use crate::gc::ctx::SchedulerCtx; +use crate::gc::handler::Region2Peers; +use crate::gc::options::GcSchedulerOptions; +use crate::gc::scheduler::{Event, GcScheduler}; + +pub const TEST_REGION_SIZE_200MB: u64 = 200_000_000; + +/// Helper function to create an empty GcReport for the given region IDs +pub fn new_empty_report_with(region_ids: impl IntoIterator) -> GcReport { + let mut deleted_files = HashMap::new(); + for region_id in region_ids { + deleted_files.insert(region_id, vec![]); + } + GcReport { + deleted_files, + need_retry_regions: HashSet::new(), + } +} + +#[allow(clippy::type_complexity)] +#[derive(Debug, Default)] +pub struct MockSchedulerCtx { + pub table_to_region_stats: Arc>>>>, + pub table_routes: Arc>>, + pub file_refs: Arc>>, + pub gc_reports: Arc>>, + pub candidates: Arc>>>>, + pub get_table_to_region_stats_calls: Arc>, + pub get_file_references_calls: Arc>, + pub gc_regions_calls: Arc>, + // Error injection fields for testing + pub get_table_to_region_stats_error: Arc>>, + pub get_table_route_error: Arc>>, + pub get_file_references_error: Arc>>, + pub gc_regions_error: Arc>>, + // Retry testing fields + pub gc_regions_retry_count: Arc>>, + pub gc_regions_error_sequence: Arc>>, + pub gc_regions_success_after_retries: Arc>>, + // Per-region error injection + pub gc_regions_per_region_errors: Arc>>, +} + +impl MockSchedulerCtx { + pub fn with_table_routes( + self, + table_routes: HashMap)>, + ) -> Self { + *self.table_routes.lock().unwrap() = table_routes + .into_iter() + .map(|(k, (phy_id, region2peer))| { + let phy = PhysicalTableRouteValue::new( + region2peer + .into_iter() + .map(|(region_id, peer)| RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(peer), + ..Default::default() + }) + .collect(), + ); + + (k, (phy_id, phy)) + }) + .collect(); + self + } + + /// Set an error to be returned by `get_table_to_region_stats` + #[allow(dead_code)] + pub fn with_get_table_to_region_stats_error(self, error: crate::error::Error) -> Self { + *self.get_table_to_region_stats_error.lock().unwrap() = Some(error); + self + } + + /// Set an error to be returned by `get_table_route` + pub fn set_table_route_error(&self, error: crate::error::Error) { + *self.get_table_route_error.lock().unwrap() = Some(error); + } + + /// Set an error to be returned by `get_file_references` + #[allow(dead_code)] + pub fn with_get_file_references_error(self, error: crate::error::Error) -> Self { + *self.get_file_references_error.lock().unwrap() = Some(error); + self + } + + /// Set an error to be returned by `gc_regions` + pub fn with_gc_regions_error(self, error: crate::error::Error) -> Self { + 
*self.gc_regions_error.lock().unwrap() = Some(error); + self + } + + /// Set a sequence of errors to be returned by `gc_regions` for retry testing + pub fn set_gc_regions_error_sequence(&self, errors: Vec) { + *self.gc_regions_error_sequence.lock().unwrap() = errors; + } + + /// Set success after a specific number of retries for a region + pub fn set_gc_regions_success_after_retries(&self, region_id: RegionId, retries: usize) { + self.gc_regions_success_after_retries + .lock() + .unwrap() + .insert(region_id, retries); + } + + /// Get the retry count for a specific region + pub fn get_retry_count(&self, region_id: RegionId) -> usize { + self.gc_regions_retry_count + .lock() + .unwrap() + .get(®ion_id) + .copied() + .unwrap_or(0) + } + + /// Reset all retry tracking + pub fn reset_retry_tracking(&self) { + *self.gc_regions_retry_count.lock().unwrap() = HashMap::new(); + *self.gc_regions_error_sequence.lock().unwrap() = Vec::new(); + *self.gc_regions_success_after_retries.lock().unwrap() = HashMap::new(); + } + + /// Set an error to be returned for a specific region + pub fn set_gc_regions_error_for_region(&self, region_id: RegionId, error: crate::error::Error) { + self.gc_regions_per_region_errors + .lock() + .unwrap() + .insert(region_id, error); + } + + /// Clear per-region errors + #[allow(unused)] + pub fn clear_gc_regions_per_region_errors(&self) { + self.gc_regions_per_region_errors.lock().unwrap().clear(); + } +} + +#[async_trait::async_trait] +impl SchedulerCtx for MockSchedulerCtx { + async fn get_table_to_region_stats(&self) -> Result>> { + *self.get_table_to_region_stats_calls.lock().unwrap() += 1; + + // Check if we should return an injected error + if let Some(error) = self.get_table_to_region_stats_error.lock().unwrap().take() { + return Err(error); + } + + Ok(self + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default()) + } + + async fn get_table_route( + &self, + table_id: TableId, + ) -> Result<(TableId, PhysicalTableRouteValue)> { + // Check if we should return an injected error + if let Some(error) = self.get_table_route_error.lock().unwrap().take() { + return Err(error); + } + + Ok(self + .table_routes + .lock() + .unwrap() + .get(&table_id) + .cloned() + .unwrap_or_else(|| (table_id, PhysicalTableRouteValue::default()))) + } + + async fn get_file_references( + &self, + query_regions: &[RegionId], + _related_regions: HashMap>, + region_to_peer: &Region2Peers, + _timeout: Duration, + ) -> Result { + *self.get_file_references_calls.lock().unwrap() += 1; + + // Check if we should return an injected error + if let Some(error) = self.get_file_references_error.lock().unwrap().take() { + return Err(error); + } + if query_regions + .iter() + .any(|region_id| !region_to_peer.contains_key(region_id)) + { + UnexpectedSnafu { + violated: format!( + "region_to_peer{region_to_peer:?} does not contain all region_ids requested: {:?}", + query_regions + ), + }.fail()?; + } + + Ok(self.file_refs.lock().unwrap().clone().unwrap_or_default()) + } + + async fn gc_regions( + &self, + _peer: Peer, + region_ids: &[RegionId], + _file_refs_manifest: &FileRefsManifest, + _full_file_listing: bool, + _timeout: Duration, + ) -> Result { + *self.gc_regions_calls.lock().unwrap() += 1; + + // Check per-region error injection first (for any region) + for ®ion_id in region_ids { + if let Some(error) = self + .gc_regions_per_region_errors + .lock() + .unwrap() + .remove(®ion_id) + { + *self + .gc_regions_retry_count + .lock() + .unwrap() + .entry(region_id) + .or_insert(0) += 1; + 
return Err(error); + } + } + + // Check if we should return an injected error + if let Some(error) = self.gc_regions_error.lock().unwrap().take() { + for region_id in region_ids { + *self + .gc_regions_retry_count + .lock() + .unwrap() + .entry(*region_id) + .or_insert(0) += 1; + } + return Err(error); + } + + // Handle error sequence for retry testing + { + let mut error_sequence = self.gc_regions_error_sequence.lock().unwrap(); + if !error_sequence.is_empty() { + let error = error_sequence.remove(0); + for region_id in region_ids { + *self + .gc_regions_retry_count + .lock() + .unwrap() + .entry(*region_id) + .or_insert(0) += 1; + } + return Err(error); + } + } + + // Build the final report by processing each region individually + let mut final_report = GcReport::default(); + let gc_reports = self.gc_reports.lock().unwrap(); + let success_after_retries = self.gc_regions_success_after_retries.lock().unwrap(); + + for ®ion_id in region_ids { + // Get current retry count for this region + let retry_count = self + .gc_regions_retry_count + .lock() + .unwrap() + .get(®ion_id) + .copied() + .unwrap_or(0); + + // Check if this region should succeed or need retry + if let Some(&required_retries) = success_after_retries.get(®ion_id) { + if retry_count < required_retries { + debug!( + "Region {} needs retry (attempt {}/{})", + region_id, + retry_count + 1, + required_retries + ); + // This region needs more retries - add to need_retry_regions + final_report.need_retry_regions.insert(region_id); + // Track the retry attempt + let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap(); + *retry_count_map.entry(region_id).or_insert(0) += 1; + } else { + debug!( + "Region {} has completed retries - succeeding now", + region_id + ); + // This region has completed all required retries - succeed + if let Some(report) = gc_reports.get(®ion_id) { + final_report.merge(report.clone()); + } + // Track the success attempt + let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap(); + *retry_count_map.entry(region_id).or_insert(0) += 1; + } + } else { + // No retry requirement - check if we have a GC report for this region + if let Some(report) = gc_reports.get(®ion_id) { + // We have a GC report - succeed immediately + final_report.merge(report.clone()); + // Track the success attempt + let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap(); + *retry_count_map.entry(region_id).or_insert(0) += 1; + } else { + // No GC report available - this region should be marked for retry + final_report.need_retry_regions.insert(region_id); + // Track the attempt + let mut retry_count_map = self.gc_regions_retry_count.lock().unwrap(); + *retry_count_map.entry(region_id).or_insert(0) += 1; + } + } + } + + // Return the report with need_retry_regions populated - let the caller handle retry logic + Ok(final_report) + } +} + +pub struct TestEnv { + pub scheduler: GcScheduler, + pub ctx: Arc, + #[allow(dead_code)] + tx: Sender, +} + +#[allow(unused)] +impl TestEnv { + pub fn new() -> Self { + let ctx = Arc::new(MockSchedulerCtx::default()); + let (tx, rx) = GcScheduler::channel(); + let config = GcSchedulerOptions::default(); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: rx, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + Self { scheduler, ctx, tx } + } + + pub fn with_candidates(self, candidates: HashMap>) -> Self { + *self.ctx.candidates.lock().unwrap() = 
Some(candidates); + self + } + + #[allow(dead_code)] + pub async fn run_scheduler(mut self) { + self.scheduler.run().await; + } + + #[allow(dead_code)] + pub async fn tick(&self) { + self.tx.send(Event::Tick).await.unwrap(); + } +} + +/// Helper function to create a mock GC candidate that will pass the GC threshold +fn new_candidate(region_id: RegionId, score: f64) -> GcCandidate { + // will pass threshold for gc + let region_stat = mock_region_stat(region_id, RegionRole::Leader, 10_000, 10); + + GcCandidate { + region_id, + score: OrderedFloat(score), + region_stat, + } +} + +/// Helper function to create a mock GC candidate +fn mock_candidate(region_id: RegionId) -> GcCandidate { + let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); + GcCandidate { + region_id, + score: ordered_float::OrderedFloat(1.0), + region_stat, + } +} + +/// Helper function to create a mock RegionStat +fn mock_region_stat( + id: RegionId, + role: RegionRole, + approximate_bytes: u64, + sst_num: u64, +) -> RegionStat { + RegionStat { + id, + role, + approximate_bytes, + sst_num, + region_manifest: RegionManifestInfo::Mito { + manifest_version: 0, + flushed_entry_id: 0, + file_removed_cnt: 0, + }, + rcus: 0, + wcus: 0, + engine: "mito".to_string(), + num_rows: 0, + memtable_size: 0, + manifest_size: 0, + sst_size: 0, + index_size: 0, + data_topic_latest_entry_id: 0, + metadata_topic_latest_entry_id: 0, + written_bytes: 0, + } +} diff --git a/src/meta-srv/src/gc/mock/basic.rs b/src/meta-srv/src/gc/mock/basic.rs new file mode 100644 index 0000000000..2cf3679245 --- /dev/null +++ b/src/meta-srv/src/gc/mock/basic.rs @@ -0,0 +1,164 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
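[Aside on the mock above: `MockSchedulerCtx` injects failures through `Option<Error>` slots that are consumed with `take()`, so an injected error fires exactly once and subsequent calls succeed. A generic sketch of that pattern; the names below are illustrative, not the crate's API.]

use std::sync::{Arc, Mutex};

#[derive(Default)]
struct Mock {
    injected_error: Arc<Mutex<Option<String>>>,
}

impl Mock {
    fn call(&self) -> Result<(), String> {
        // take() clears the slot, so the error is returned only once.
        if let Some(err) = self.injected_error.lock().unwrap().take() {
            return Err(err);
        }
        Ok(())
    }
}

fn main() {
    let mock = Mock::default();
    *mock.injected_error.lock().unwrap() = Some("boom".to_string());
    assert!(mock.call().is_err()); // first call consumes the injected error
    assert!(mock.call().is_ok()); // later calls succeed
}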
+ +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use common_meta::peer::Peer; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionRole; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; + +use crate::gc::mock::{ + MockSchedulerCtx, TEST_REGION_SIZE_200MB, TestEnv, mock_region_stat, new_candidate, +}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +#[tokio::test] +async fn test_parallel_process_datanodes_empty() { + let env = TestEnv::new(); + let report = env + .scheduler + .parallel_process_datanodes(HashMap::new()) + .await; + + assert_eq!(report.per_datanode_reports.len(), 0); + assert_eq!(report.failed_datanodes.len(), 0); +} + +#[tokio::test] +async fn test_parallel_process_datanodes_with_candidates() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]); + + let mut gc_reports = HashMap::new(); + let deleted_files = vec![FileId::random()]; + gc_reports.insert( + region_id, + GcReport { + deleted_files: HashMap::from([(region_id, deleted_files.clone())]), + ..Default::default() + }, + ); + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + let ctx = MockSchedulerCtx { + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer.clone())]), + )])); + + let env = TestEnv::new(); + // We need to replace the ctx with the one with gc_reports + let mut scheduler = env.scheduler; + scheduler.ctx = Arc::new(ctx); + + // Convert table-based candidates to datanode-based candidates + let datanode_to_candidates = HashMap::from([( + peer, + candidates + .into_iter() + .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c))) + .collect(), + )]); + + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + assert_eq!(report.per_datanode_reports.len(), 1); + assert_eq!(report.failed_datanodes.len(), 0); +} + +#[tokio::test] +async fn test_handle_tick() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]); + + let mut gc_reports = HashMap::new(); + gc_reports.insert(region_id, GcReport::default()); + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(HashMap::from([( + table_id, + vec![mock_region_stat( + region_id, + RegionRole::Leader, + TEST_REGION_SIZE_200MB, + 10, + )], + )])))), + gc_reports: Arc::new(Mutex::new(gc_reports)), + candidates: Arc::new(Mutex::new(Some(candidates))), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer)]), + )])), + ); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let 
report = scheduler.handle_tick().await.unwrap(); + + // Validate the returned GcJobReport + assert_eq!( + report.per_datanode_reports.len(), + 1, + "Should process 1 datanode" + ); + assert_eq!( + report.failed_datanodes.len(), + 0, + "Should have 0 failed datanodes" + ); + + assert_eq!(*ctx.get_table_to_region_stats_calls.lock().unwrap(), 1); + assert_eq!(*ctx.get_file_references_calls.lock().unwrap(), 1); + assert_eq!(*ctx.gc_regions_calls.lock().unwrap(), 1); + + let tracker = scheduler.region_gc_tracker.lock().await; + assert!( + tracker.contains_key(®ion_id), + "Tracker should have one region: {:?}", + tracker + ); +} diff --git a/src/meta-srv/src/gc/mock/candidate_select.rs b/src/meta-srv/src/gc/mock/candidate_select.rs new file mode 100644 index 0000000000..73da83802a --- /dev/null +++ b/src/meta-srv/src/gc/mock/candidate_select.rs @@ -0,0 +1,390 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use common_meta::datanode::RegionManifestInfo; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionRole; +use store_api::storage::RegionId; + +use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +/// Candidate Selection Tests +#[tokio::test] +async fn test_gc_candidate_filtering_by_role() { + init_default_ut_logging(); + + let table_id = 1; + let leader_region = RegionId::new(table_id, 1); + let follower_region = RegionId::new(table_id, 2); + + let mut leader_stat = mock_region_stat( + leader_region, + RegionRole::Leader, + TEST_REGION_SIZE_200MB, + 10, + ); // 200MB + + let mut follower_stat = mock_region_stat( + follower_region, + RegionRole::Follower, + TEST_REGION_SIZE_200MB, + 10, + ); // 200MB + + // Set up manifest info for scoring + if let RegionManifestInfo::Mito { + file_removed_cnt, .. + } = &mut leader_stat.region_manifest + { + *file_removed_cnt = 5; + } + if let RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } = &mut follower_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![leader_stat.clone(), follower_stat.clone()])]); + + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + + let candidates = scheduler.select_gc_candidates(&stats).await.unwrap(); + + // Should only select leader regions + assert_eq!( + candidates.len(), + 1, + "Expected 1 table with candidates, got {}", + candidates.len() + ); + if let Some(table_candidates) = candidates.get(&table_id) { + assert_eq!( + table_candidates.len(), + 1, + "Expected 1 candidate for table {}, got {}", + table_id, + table_candidates.len() + ); + assert_eq!( + table_candidates[0].region_id, leader_region, + "Expected leader region {}, got {}", + leader_region, table_candidates[0].region_id + ); + } else { + panic!("Expected table {} to have candidates", table_id); + } +} + +#[tokio::test] +async fn test_gc_candidate_size_threshold() { + init_default_ut_logging(); + + let table_id = 1; + let small_region = RegionId::new(table_id, 1); + let large_region = RegionId::new(table_id, 2); + + let mut small_stat = mock_region_stat(small_region, RegionRole::Leader, 50_000_000, 5); // 50MB + if let RegionManifestInfo::Mito { + file_removed_cnt, .. + } = &mut small_stat.region_manifest + { + *file_removed_cnt = 3; + } + + let mut large_stat = + mock_region_stat(large_region, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB + if let RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } = &mut large_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![small_stat, large_stat])]); + + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + + let config = GcSchedulerOptions { + min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default) + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + + let candidates = scheduler.select_gc_candidates(&stats).await.unwrap(); + + // Should only select large region + assert_eq!( + candidates.len(), + 1, + "Expected 1 table with candidates, got {}", + candidates.len() + ); + if let Some(table_candidates) = candidates.get(&table_id) { + assert_eq!( + table_candidates.len(), + 1, + "Expected 1 candidate for table {}, got {}", + table_id, + table_candidates.len() + ); + assert_eq!( + table_candidates[0].region_id, large_region, + "Expected large region {}, got {}", + large_region, table_candidates[0].region_id + ); + } else { + panic!("Expected table {} to have candidates", table_id); + } +} + +#[tokio::test] +async fn test_gc_candidate_scoring() { + init_default_ut_logging(); + + let table_id = 1; + let low_score_region = RegionId::new(table_id, 1); + let high_score_region = RegionId::new(table_id, 2); + + let mut low_stat = mock_region_stat( + low_score_region, + RegionRole::Leader, + TEST_REGION_SIZE_200MB, + 5, + ); // 200MB + // Set low file removal rate for low_score_region + if let RegionManifestInfo::Mito { + file_removed_cnt, .. + } = &mut low_stat.region_manifest + { + *file_removed_cnt = 2; + } + + let mut high_stat = mock_region_stat( + high_score_region, + RegionRole::Leader, + TEST_REGION_SIZE_200MB, + 50, + ); // 200MB + // Set high file removal rate for high_score_region + if let RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } = &mut high_stat.region_manifest + { + *file_removed_cnt = 20; + } + + let table_stats = HashMap::from([(table_id, vec![low_stat, high_stat])]); + + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + + let config = GcSchedulerOptions { + sst_count_weight: 1.0, + file_removed_count_weight: 0.5, + min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default) + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + + let candidates = scheduler.select_gc_candidates(&stats).await.unwrap(); + + // Should select both regions but high score region should be first + assert_eq!( + candidates.len(), + 1, + "Expected 1 table with candidates, got {}", + candidates.len() + ); + if let Some(table_candidates) = candidates.get(&table_id) { + assert_eq!( + table_candidates.len(), + 2, + "Expected 2 candidates for table {}, got {}", + table_id, + table_candidates.len() + ); + // Higher score region should come first (sorted by score descending) + assert_eq!( + table_candidates[0].region_id, high_score_region, + "High score region should be first" + ); + assert!( + table_candidates[0].score > table_candidates[1].score, + "High score region should have higher score: {} > {}", + table_candidates[0].score, + table_candidates[1].score + ); + } else { + panic!("Expected table {} to have candidates", table_id); + } +} + +#[tokio::test] +async fn test_gc_candidate_regions_per_table_threshold() { + init_default_ut_logging(); + + let table_id = 1; + // Create 10 regions for the same table + let mut region_stats = Vec::new(); + + for i in 0..10 { + let region_id = RegionId::new(table_id, i + 1); + let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 20); // 200MB + + // Set different file removal rates to create different scores + // Higher region IDs get higher scores (better GC candidates) + if let RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } = &mut stat.region_manifest + { + *file_removed_cnt = (i as u64 + 1) * 2; // Region 1: 2, Region 2: 4, ..., Region 10: 20 + } + + region_stats.push(stat); + } + + let table_stats = HashMap::from([(table_id, region_stats)]); + + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + + // Set regions_per_table_threshold to 3 + let config = GcSchedulerOptions { + regions_per_table_threshold: 3, + min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default) + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + + let candidates = scheduler.select_gc_candidates(&stats).await.unwrap(); + + // Should have 1 table with candidates + assert_eq!( + candidates.len(), + 1, + "Expected 1 table with candidates, got {}", + candidates.len() + ); + + if let Some(table_candidates) = candidates.get(&table_id) { + // Should only have 3 candidates due to regions_per_table_threshold + assert_eq!( + table_candidates.len(), + 3, + "Expected 3 candidates for table {} due to regions_per_table_threshold, got {}", + table_id, + table_candidates.len() + ); + + // Verify that the top 3 scoring regions are selected + // Regions 8, 9, 10 should have the highest scores (file_removed_cnt: 16, 18, 20) + // They should be returned in descending order by score + let expected_regions = vec![10, 9, 8]; + let actual_regions: Vec<_> = table_candidates + .iter() + .map(|c| c.region_id.region_number()) + .collect(); + + assert_eq!( + actual_regions, expected_regions, + "Expected regions {:?} to be selected, got {:?}", + expected_regions, actual_regions + ); + + // Verify they are sorted by score in descending order + for i in 0..table_candidates.len() - 1 { + assert!( + table_candidates[i].score >= table_candidates[i + 1].score, + "Candidates should be sorted by score descending: {} >= {}", + table_candidates[i].score, + table_candidates[i + 1].score + ); + } + } else { + panic!("Expected table {} to have candidates", table_id); + } +} diff --git a/src/meta-srv/src/gc/mock/concurrent.rs b/src/meta-srv/src/gc/mock/concurrent.rs new file mode 100644 index 0000000000..2bef9b9896 --- /dev/null +++ b/src/meta-srv/src/gc/mock/concurrent.rs @@ -0,0 +1,516 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +use std::collections::{HashMap, HashSet}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use common_meta::key::table_route::PhysicalTableRouteValue; +use common_meta::peer::Peer; +use common_meta::rpc::router::{Region, RegionRoute}; +use common_telemetry::{info, init_default_ut_logging}; +use store_api::region_engine::RegionRole; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; + +use crate::gc::mock::{ + MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat, new_candidate, +}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +/// Concurrent Processing Tests +#[tokio::test] +async fn test_concurrent_table_processing_limits() { + init_default_ut_logging(); + + let mut candidates = HashMap::new(); + let mut gc_reports = HashMap::new(); + + // Create many tables with candidates + for table_id in 1..=10 { + let region_id = RegionId::new(table_id, 1); + candidates.insert(table_id, vec![new_candidate(region_id, 1.0)]); + gc_reports.insert( + region_id, + GcReport { + deleted_files: HashMap::from([(region_id, vec![FileId::random()])]), + ..Default::default() + }, + ); + } + + let ctx = MockSchedulerCtx { + candidates: Arc::new(Mutex::new(Some(candidates))), + file_refs: Arc::new(Mutex::new(Some(FileRefsManifest { + manifest_version: (1..=10).map(|i| (RegionId::new(i, 1), 1)).collect(), + ..Default::default() + }))), + gc_reports: Arc::new(Mutex::new(gc_reports)), + ..Default::default() + } + .with_table_routes( + (1..=10) + .map(|table_id| { + let region_id = RegionId::new(table_id, 1); + (table_id, (table_id, vec![(region_id, Peer::new(1, ""))])) + }) + .collect(), + ); + + let ctx = Arc::new(ctx); + + let config = GcSchedulerOptions { + max_concurrent_tables: 3, // Set a low limit + retry_backoff_duration: Duration::from_millis(50), // for faster test + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default(); + + // Convert table-based candidates to datanode-based candidates + let peer = Peer::new(1, ""); + let datanode_to_candidates = HashMap::from([( + peer, + candidates + .into_iter() + .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c))) + .collect(), + )]); + + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + // Should process all datanodes + assert_eq!(report.per_datanode_reports.len(), 1); + assert_eq!(report.failed_datanodes.len(), 0); +} + +#[tokio::test] +async fn test_datanode_processes_tables_with_partial_gc_failures() { + init_default_ut_logging(); + + let table1 = 1; + let region1 = RegionId::new(table1, 1); + let table2 = 2; + let region2 = RegionId::new(table2, 1); + let peer = Peer::new(1, ""); + + let mut candidates = HashMap::new(); + candidates.insert(table1, vec![new_candidate(region1, 1.0)]); + candidates.insert(table2, vec![new_candidate(region2, 1.0)]); + + // Set up GC reports for success and failure + let mut gc_reports = HashMap::new(); + gc_reports.insert( + region1, + GcReport { + deleted_files: HashMap::from([(region1, vec![])]), + ..Default::default() + }, + ); + // region2 will have no GC report, simulating failure + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region1, 1), (region2, 1)]), 
+ ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + candidates: Arc::new(Mutex::new(Some(candidates))), + ..Default::default() + } + .with_table_routes(HashMap::from([ + (table1, (table1, vec![(region1, peer.clone())])), + (table2, (table2, vec![(region2, peer.clone())])), + ])), + ); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default(); + + // Convert table-based candidates to datanode-based candidates + + let datanode_to_candidates = HashMap::from([( + peer, + candidates + .into_iter() + .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c))) + .collect(), + )]); + + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + // Should have one datanode with mixed results + assert_eq!(report.per_datanode_reports.len(), 1); + // also check one failed region (region2 has no GC report, so it should be in need_retry_regions) + let datanode_report = report.per_datanode_reports.values().next().unwrap(); + assert_eq!(datanode_report.need_retry_regions.len(), 1); + assert_eq!(report.failed_datanodes.len(), 0); +} + +// Region Concurrency Tests + +#[tokio::test] +async fn test_region_gc_concurrency_limit() { + init_default_ut_logging(); + + let table_id = 1; + let peer = Peer::new(1, ""); + + // Create multiple regions for the same table + let mut region_stats = Vec::new(); + let mut candidates = Vec::new(); + let mut gc_reports = HashMap::new(); + + for i in 1..=10 { + let region_id = RegionId::new(table_id, i as u32); + let region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + region_stats.push(region_stat); + + candidates.push(mock_candidate(region_id)); + + gc_reports.insert( + region_id, + GcReport { + deleted_files: HashMap::from([( + region_id, + vec![FileId::random(), FileId::random()], + )]), + ..Default::default() + }, + ); + } + + let table_stats = HashMap::from([(table_id, region_stats)]); + + let file_refs = FileRefsManifest { + manifest_version: (1..=10) + .map(|i| (RegionId::new(table_id, i as u32), 1)) + .collect(), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + ( + table_id, + (1..=10) + .map(|i| (RegionId::new(table_id, i as u32), peer.clone())) + .collect(), + ), + )])), + ); + + // Configure low concurrency limit + let config = GcSchedulerOptions { + region_gc_concurrency: 3, // Only 3 regions can be processed concurrently + retry_backoff_duration: Duration::from_millis(50), // for faster test + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let start_time = Instant::now(); + let report = scheduler + .process_datanode_gc( + peer, + 
candidates.into_iter().map(|c| (table_id, c)).collect(), + ) + .await + .unwrap(); + let duration = start_time.elapsed(); + + // All regions should be processed successfully + // Check that all 10 regions have deleted files + assert_eq!(report.deleted_files.len(), 10); + for i in 1..=10 { + let region_id = RegionId::new(table_id, i as u32); + assert!(report.deleted_files.contains_key(&region_id)); + assert_eq!(report.deleted_files[&region_id].len(), 2); // Each region has 2 deleted files + } + assert!(report.need_retry_regions.is_empty()); + + // Verify that concurrency limit was respected (this is hard to test directly, + // but we can verify that the processing completed successfully) + info!( + "Processed 10 regions with concurrency limit 3 in {:?}", + duration + ); +} + +#[tokio::test] +async fn test_region_gc_concurrency_with_partial_failures() { + init_default_ut_logging(); + + let table_id = 1; + let peer = Peer::new(1, ""); + + // Create multiple regions with mixed success/failure + let mut region_stats = Vec::new(); + let mut candidates = Vec::new(); + let mut gc_reports = HashMap::new(); + + // Create the context first so we can set errors on it + let ctx = Arc::new(MockSchedulerCtx::default()); + + for i in 1..=6 { + let region_id = RegionId::new(table_id, i as u32); + let region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + region_stats.push(region_stat); + + candidates.push(mock_candidate(region_id)); + + if i % 2 == 0 { + // Even regions will succeed + gc_reports.insert( + region_id, + GcReport { + deleted_files: HashMap::from([( + region_id, + vec![FileId::random(), FileId::random()], + )]), + ..Default::default() + }, + ); + } else { + // Odd regions will fail - don't add them to gc_reports + // This will cause them to be marked as needing retry + } + } + + let table_stats = HashMap::from([(table_id, region_stats)]); + + let file_refs = FileRefsManifest { + manifest_version: (1..=6) + .map(|i| (RegionId::new(table_id, i as u32), 1)) + .collect(), + ..Default::default() + }; + + // Update the context with the data + *ctx.table_to_region_stats.lock().unwrap() = Some(table_stats); + *ctx.gc_reports.lock().unwrap() = gc_reports; + *ctx.file_refs.lock().unwrap() = Some(file_refs); + let region_routes = (1..=6) + .map(|i| RegionRoute { + region: Region::new_test(RegionId::new(table_id, i as u32)), + leader_peer: Some(peer.clone()), + ..Default::default() + }) + .collect(); + + *ctx.table_routes.lock().unwrap() = HashMap::from([( + table_id, + (table_id, PhysicalTableRouteValue::new(region_routes)), + )]); + + // Configure concurrency limit + let config = GcSchedulerOptions { + region_gc_concurrency: 2, // Process 2 regions concurrently + retry_backoff_duration: Duration::from_millis(50), // for faster test + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let datanode_to_candidates = HashMap::from([( + peer.clone(), + candidates.into_iter().map(|c| (table_id, c)).collect(), + )]); + + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + let report = report.per_datanode_reports.get(&peer.id).unwrap(); + + // Should have 3 successful and 3 failed regions + // Even regions (2, 4, 6) should succeed, odd regions (1, 3, 5) should fail + let mut successful_regions = 0; +
let mut failed_regions = 0; + + for i in 1..=6 { + let region_id = RegionId::new(table_id, i as u32); + if i % 2 == 0 { + // Even regions should succeed + if report.deleted_files.contains_key(&region_id) { + successful_regions += 1; + } + } else { + // Odd regions should fail - they should be in need_retry_regions + if report.need_retry_regions.contains(&region_id) { + failed_regions += 1; + } + } + } + + // In the new implementation, regions that cause gc_regions to return an error + // are added to need_retry_regions. Let's check if we have the expected mix. + info!( + "Successful regions: {}, Failed regions: {}", + successful_regions, failed_regions + ); + info!( + "Deleted files: {:?}", + report.deleted_files.keys().collect::<Vec<_>>() + ); + info!("Need retry regions: {:?}", report.need_retry_regions); + + // The exact count might vary depending on how the mock handles errors, + // but we should have some successful and some failed regions + assert!( + successful_regions > 0, + "Should have at least some successful regions" + ); + assert!( + failed_regions > 0, + "Should have at least some failed regions" + ); +} + +#[tokio::test] +async fn test_region_gc_concurrency_with_retryable_errors() { + init_default_ut_logging(); + + let table_id = 1; + let peer = Peer::new(1, ""); + + // Create multiple regions + let mut region_stats = Vec::new(); + let mut candidates = Vec::new(); + + for i in 1..=5 { + let region_id = RegionId::new(table_id, i as u32); + let region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + region_stats.push(region_stat); + candidates.push(mock_candidate(region_id)); + } + + let table_stats = HashMap::from([(table_id, region_stats)]); + + let file_refs = FileRefsManifest { + manifest_version: (1..=5) + .map(|i| (RegionId::new(table_id, i as u32), 1)) + .collect(), + ..Default::default() + }; + + let gc_report = (1..=5) + .map(|i| { + let region_id = RegionId::new(table_id, i as u32); + ( + region_id, + // mock the actual gc report with deleted files for a successful GC (even when no files were deleted) + GcReport::new(HashMap::from([(region_id, vec![])]), HashSet::new()), + ) + }) + .collect(); + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + gc_reports: Arc::new(Mutex::new(gc_report)), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + ( + table_id, + (1..=5) + .map(|i| (RegionId::new(table_id, i as u32), peer.clone())) + .collect(), + ), + )])), + ); + + // Configure concurrency limit + let config = GcSchedulerOptions { + region_gc_concurrency: 2, // Process 2 regions concurrently + retry_backoff_duration: Duration::from_millis(50), + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let datanode_to_candidates = HashMap::from([( + peer.clone(), + candidates.into_iter().map(|c| (table_id, c)).collect(), + )]); + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + let report = report.per_datanode_reports.get(&peer.id).unwrap(); + + // In the new implementation without retry logic, all regions should be processed + // The exact behavior depends on how the mock handles the regions + info!( + "Deleted files: {:?}", +
report.deleted_files.keys().collect::<Vec<_>>() + ); + info!("Need retry regions: {:?}", report.need_retry_regions); + + // We should have processed all 5 regions in some way + let total_processed = report.deleted_files.len() + report.need_retry_regions.len(); + assert_eq!(total_processed, 5, "Should have processed all 5 regions"); +} diff --git a/src/meta-srv/src/gc/mock/config.rs b/src/meta-srv/src/gc/mock/config.rs new file mode 100644 index 0000000000..f4ec9be948 --- /dev/null +++ b/src/meta-srv/src/gc/mock/config.rs @@ -0,0 +1,197 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use common_meta::datanode::RegionManifestInfo; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionRole; +use store_api::storage::RegionId; + +use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +/// Configuration Tests +#[tokio::test] +async fn test_different_gc_weights() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + + let mut region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB to pass size threshold + + if let RegionManifestInfo::Mito { + file_removed_cnt, ..
+ } = &mut region_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + + // Test with different weights + let config1 = GcSchedulerOptions { + sst_count_weight: 2.0, + file_removed_count_weight: 0.5, + min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default) + ..Default::default() + }; + + let scheduler1 = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: config1, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + + let candidates1 = scheduler1.select_gc_candidates(&stats).await.unwrap(); + + let config2 = GcSchedulerOptions { + sst_count_weight: 0.5, + file_removed_count_weight: 2.0, + min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default) + ..Default::default() + }; + + let scheduler2 = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: config2, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = &ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + let candidates2 = scheduler2.select_gc_candidates(stats).await.unwrap(); + + // Both should select the region but with different scores + assert_eq!( + candidates1.len(), + 1, + "Expected 1 table with candidates for config1, got {}", + candidates1.len() + ); + assert_eq!( + candidates2.len(), + 1, + "Expected 1 table with candidates for config2, got {}", + candidates2.len() + ); + + // Verify the region is actually selected + assert!( + candidates1.contains_key(&table_id), + "Config1 should contain table_id {}", + table_id + ); + assert!( + candidates2.contains_key(&table_id), + "Config2 should contain table_id {}", + table_id + ); +} + +#[tokio::test] +async fn test_regions_per_table_threshold() { + init_default_ut_logging(); + + let table_id = 1; + let mut region_stats = Vec::new(); + + // Create many regions + for i in 1..=10 { + let region_id = RegionId::new(table_id, i as u32); + let mut stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + + if let RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } = &mut stat.region_manifest + { + *file_removed_cnt = 5; + } + + region_stats.push(stat); + } + + let table_stats = HashMap::from([(table_id, region_stats)]); + + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + + let config = GcSchedulerOptions { + regions_per_table_threshold: 3, // Limit to 3 regions per table + min_region_size_threshold: 100 * 1024 * 1024, // 100MB (default) + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let stats = ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + + let candidates = scheduler.select_gc_candidates(&stats).await.unwrap(); + + assert_eq!( + candidates.len(), + 1, + "Expected 1 table with candidates, got {}", + candidates.len() + ); + if let Some(table_candidates) = candidates.get(&table_id) { + // Should be limited to 3 regions + assert_eq!( + table_candidates.len(), + 3, + "Expected 3 candidates for table {}, got {}", + table_id, + table_candidates.len() + ); + } else { + panic!("Expected table {} to have candidates", table_id); + } +} diff --git a/src/meta-srv/src/gc/mock/err_handle.rs b/src/meta-srv/src/gc/mock/err_handle.rs new file mode 100644 index 0000000000..952671006d --- /dev/null +++ b/src/meta-srv/src/gc/mock/err_handle.rs @@ -0,0 +1,293 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use common_meta::datanode::RegionManifestInfo; +use common_meta::peer::Peer; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionRole; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; + +use crate::gc::mock::{ + MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with, +}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +/// Error Handling Tests +#[tokio::test] +async fn test_gc_regions_failure_handling() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + // Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate + let mut region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + if let RegionManifestInfo::Mito { + file_removed_cnt, .. 
+ } = &mut region_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + // Create a context that will return an error for gc_regions + let mut gc_reports = HashMap::new(); + gc_reports.insert(region_id, GcReport::default()); + + // Inject an error for gc_regions method + let gc_error = crate::error::UnexpectedSnafu { + violated: "Simulated GC failure for testing".to_string(), + } + .build(); + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + file_refs: HashMap::from([(region_id, HashSet::from([FileId::random()]))]), + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer)]), + )])) + .with_gc_regions_error(gc_error), + ); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + // This should handle the failure gracefully + let report = scheduler.handle_tick().await.unwrap(); + + // Validate the report shows the failure handling + assert_eq!( + report.per_datanode_reports.len(), + 1, + "Should process 1 datanode despite failure" + ); + assert_eq!( + report.failed_datanodes.len(), + 0, + "Should have 0 failed datanodes (failure handled via need_retry_regions)" + ); + + // Check that the region is in need_retry_regions due to the failure + let datanode_report = report.per_datanode_reports.values().next().unwrap(); + assert_eq!( + datanode_report.need_retry_regions.len(), + 1, + "Should have 1 region in need_retry_regions due to failure" + ); + assert!( + datanode_report.need_retry_regions.contains(&region_id), + "Region should be in need_retry_regions" + ); + + // Verify that calls were made despite potential failures + assert_eq!( + *ctx.get_table_to_region_stats_calls.lock().unwrap(), + 1, + "Expected 1 call to get_table_to_region_stats" + ); + assert!( + *ctx.get_file_references_calls.lock().unwrap() >= 1, + "Expected at least 1 call to get_file_references" + ); + assert!( + *ctx.gc_regions_calls.lock().unwrap() >= 1, + "Expected at least 1 call to gc_regions" + ); +} + +#[tokio::test] +async fn test_get_file_references_failure() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + // Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate + let mut region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + if let RegionManifestInfo::Mito { + file_removed_cnt, ..
+ } = &mut region_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + // Create context with empty file refs (simulating failure) + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + file_refs: Arc::new(Mutex::new(Some(FileRefsManifest::default()))), + gc_reports: Arc::new(Mutex::new(HashMap::from([( + region_id, + new_empty_report_with([region_id]), + )]))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer)]), + )])), + ); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions { + retry_backoff_duration: Duration::from_millis(10), // shorten for test + ..Default::default() + }, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let report = scheduler.handle_tick().await.unwrap(); + + // Validate the report shows the expected results + // In the new implementation, even if get_file_references fails, we still create a datanode report + assert_eq!( + report.per_datanode_reports.len(), + 1, + "Should process 1 datanode" + ); + assert_eq!( + report.failed_datanodes.len(), + 0, + "Should have 0 failed datanodes (failure handled gracefully)" + ); + + // The region should be processed but may have empty results due to file refs failure + let datanode_report = report.per_datanode_reports.values().next().unwrap(); + // The current implementation still processes the region even with file refs failure + // and creates an empty entry in deleted_files + assert!( + datanode_report.deleted_files.contains_key(&region_id), + "Should have region in deleted_files (even if empty)" + ); + assert!( + datanode_report.deleted_files[&region_id].is_empty(), + "Should have empty deleted files due to file refs failure" + ); + + // Should still attempt to get file references (may be called multiple times due to retry logic) + assert!( + *ctx.get_file_references_calls.lock().unwrap() >= 1, + "Expected at least 1 call to get_file_references, got {}", + *ctx.get_file_references_calls.lock().unwrap() + ); +} + +#[tokio::test] +async fn test_get_table_route_failure() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + + // Create region stat with proper size and file_removed_cnt to ensure it gets selected as candidate + let mut region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + if let RegionManifestInfo::Mito { + file_removed_cnt, ..
+ } = &mut region_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + // Inject an error for get_table_route method to simulate failure + let route_error = crate::error::UnexpectedSnafu { + violated: "Simulated table route failure for testing".to_string(), + } + .build(); + + // Create context with table route error injection + let ctx = Arc::new(MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + ..Default::default() + }); + ctx.set_table_route_error(route_error); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + // Get candidates first + let stats = &ctx + .table_to_region_stats + .lock() + .unwrap() + .clone() + .unwrap_or_default(); + let candidates = scheduler.select_gc_candidates(stats).await.unwrap(); + + // Convert table-based candidates to datanode-based candidates + let datanode_to_candidates = HashMap::from([( + Peer::new(1, ""), + candidates + .into_iter() + .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c))) + .collect(), + )]); + + // This should handle table route failure gracefully + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + // Should process the datanode but handle route error gracefully + assert_eq!( + report.per_datanode_reports.len(), + 0, + "Expected 0 datanode report" + ); + assert_eq!( + report.failed_datanodes.len(), + 1, + "Expected 1 failed datanodes (route error handled gracefully)" + ); + assert!( + report.failed_datanodes.contains_key(&1), + "Failed datanodes should contain the datanode with route error" + ); +} diff --git a/src/meta-srv/src/gc/mock/full_list.rs b/src/meta-srv/src/gc/mock/full_list.rs new file mode 100644 index 0000000000..649334938a --- /dev/null +++ b/src/meta-srv/src/gc/mock/full_list.rs @@ -0,0 +1,272 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use common_meta::peer::Peer; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionRole; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; + +use crate::gc::mock::{MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_candidate, mock_region_stat}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +// Full File Listing Tests + +#[tokio::test] +async fn test_full_file_listing_first_time_gc() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + let gc_report = GcReport { + deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]), + ..Default::default() + }; + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer.clone())]), + )])), + ); + + // Configure short full file listing interval for testing + let config = GcSchedulerOptions { + full_file_listing_interval: Duration::from_secs(3600), // 1 hour + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + // First GC - should use full listing since region has never been GC'd + let reports = scheduler + .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))]) + .await + .unwrap(); + + assert_eq!(reports.deleted_files.len(), 1); + + // Verify that full listing was used by checking the tracker + let tracker = scheduler.region_gc_tracker.lock().await; + let gc_info = tracker + .get(&region_id) + .expect("Region should be in tracker"); + assert!( + gc_info.last_full_listing_time.is_some(), + "First GC should use full listing" + ); +} + +#[tokio::test] +async fn test_full_file_listing_interval_enforcement() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + let gc_report = GcReport { + deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]), + ..Default::default() + }; + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer.clone())]), + )])), + ); + + // Configure very
short full file listing interval for testing + let config = GcSchedulerOptions { + full_file_listing_interval: Duration::from_millis(100), // 100ms + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + // First GC - should use full listing + let reports1 = scheduler + .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))]) + .await + .unwrap(); + assert_eq!(reports1.deleted_files.len(), 1); + + // Get the first full listing time + let first_full_listing_time = { + let tracker = scheduler.region_gc_tracker.lock().await; + let gc_info = tracker + .get(&region_id) + .expect("Region should be in tracker"); + gc_info + .last_full_listing_time + .expect("Should have full listing time") + }; + + // Wait for interval to pass + tokio::time::sleep(Duration::from_millis(150)).await; + + // Second GC - should use full listing again since interval has passed + let _reports2 = scheduler + .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))]) + .await + .unwrap(); + + // Verify that full listing was used again + let tracker = scheduler.region_gc_tracker.lock().await; + let gc_info = tracker + .get(&region_id) + .expect("Region should be in tracker"); + let second_full_listing_time = gc_info + .last_full_listing_time + .expect("Should have full listing time"); + + assert!( + second_full_listing_time > first_full_listing_time, + "Second GC should update full listing time" + ); +} + +#[tokio::test] +async fn test_full_file_listing_no_interval_passed() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + let region_stat = mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + let gc_report = GcReport { + deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]), + ..Default::default() + }; + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(HashMap::from([(region_id, gc_report)]))), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer.clone())]), + )])), + ); + + // Configure long full file listing interval + let config = GcSchedulerOptions { + full_file_listing_interval: Duration::from_secs(3600), // 1 hour + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + // First GC - should use full listing + let reports1 = scheduler + .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))]) + .await + .unwrap(); + assert_eq!(reports1.deleted_files.len(), 1); + + // Get the first full listing time + let first_full_listing_time = { + let tracker = scheduler.region_gc_tracker.lock().await; + let gc_info = tracker + .get(&region_id) + .expect("Region should be in tracker"); +
gc_info + .last_full_listing_time + .expect("Should have full listing time") + }; + + // Second GC immediately - should NOT use full listing since interval hasn't passed + let reports2 = scheduler + .process_datanode_gc(peer.clone(), vec![(table_id, mock_candidate(region_id))]) + .await + .unwrap(); + assert_eq!(reports2.deleted_files.len(), 1); + + // Verify that full listing time was NOT updated + let tracker = scheduler.region_gc_tracker.lock().await; + let gc_info = tracker + .get(&region_id) + .expect("Region should be in tracker"); + let second_full_listing_time = gc_info + .last_full_listing_time + .expect("Should have full listing time"); + + assert_eq!( + second_full_listing_time, first_full_listing_time, + "Second GC should not update full listing time when interval hasn't passed" + ); +} diff --git a/src/meta-srv/src/gc/mock/integration.rs b/src/meta-srv/src/gc/mock/integration.rs new file mode 100644 index 0000000000..484871bb5e --- /dev/null +++ b/src/meta-srv/src/gc/mock/integration.rs @@ -0,0 +1,255 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use common_meta::datanode::RegionManifestInfo; +use common_meta::peer::Peer; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionRole; +use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; + +use crate::gc::mock::{ + MockSchedulerCtx, TEST_REGION_SIZE_200MB, mock_region_stat, new_empty_report_with, +}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +// Integration Flow Tests + +#[tokio::test] +async fn test_full_gc_workflow() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + let mut region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + + if let RegionManifestInfo::Mito { + file_removed_cnt, ..
+ } = &mut region_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + let mut gc_reports = HashMap::new(); + gc_reports.insert( + region_id, + GcReport { + deleted_files: HashMap::from([(region_id, vec![FileId::random(), FileId::random()])]), + ..Default::default() + }, + ); + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer)]), + )])), + ); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + // Run the full workflow + let report = scheduler.handle_tick().await.unwrap(); + + // Validate the returned GcJobReport - should have 1 datanode report + assert_eq!( + report.per_datanode_reports.len(), + 1, + "Should process 1 datanode" + ); + assert_eq!( + report.failed_datanodes.len(), + 0, + "Should have no failed datanodes" + ); + + // Get the datanode report + let datanode_report = report.per_datanode_reports.values().next().unwrap(); + + // Check that the region was processed successfully + assert!( + datanode_report.deleted_files.contains_key(&region_id), + "Should have deleted files for region" + ); + assert_eq!( + datanode_report.deleted_files[&region_id].len(), + 2, + "Should have 2 deleted files" + ); + assert!( + datanode_report.need_retry_regions.is_empty(), + "Should have no retry regions" + ); + + // Verify all steps were executed + assert_eq!( + *ctx.get_table_to_region_stats_calls.lock().unwrap(), + 1, + "Expected 1 call to get_table_to_region_stats" + ); + assert_eq!( + *ctx.get_file_references_calls.lock().unwrap(), + 1, + "Expected 1 call to get_file_references" + ); + assert_eq!( + *ctx.gc_regions_calls.lock().unwrap(), + 1, + "Expected 1 call to gc_regions" + ); +} + +/// Due to https://github.com/rust-lang/rust/issues/100141, an `Instant` cannot be earlier than the process start time on non-Linux OSes, +/// so this test is Linux-only. This is fine since in real usage the instant will always be after the process start time. +#[cfg(target_os = "linux")] +#[tokio::test] +async fn test_tracker_cleanup() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + + // Create region stat with proper file_removed_cnt to ensure it gets selected as candidate + let mut region_stat = + mock_region_stat(region_id, RegionRole::Leader, TEST_REGION_SIZE_200MB, 10); // 200MB + if let RegionManifestInfo::Mito { + file_removed_cnt, ..
+ } = &mut region_stat.region_manifest + { + *file_removed_cnt = 5; + } + + let table_stats = HashMap::from([(table_id, vec![region_stat])]); + + let mut gc_reports = HashMap::new(); + gc_reports.insert(region_id, new_empty_report_with([region_id])); + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region_id, 1)]), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + table_to_region_stats: Arc::new(Mutex::new(Some(table_stats))), + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer)]), + )])), + ); + + let old_region_gc_tracker = { + let mut tracker = HashMap::new(); + tracker.insert( + region_id, + crate::gc::tracker::RegionGcInfo { + last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago + last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago + }, + ); + // also insert a different table that should also be cleaned up + tracker.insert( + RegionId::new(2, 1), + crate::gc::tracker::RegionGcInfo { + last_full_listing_time: Some(Instant::now() - Duration::from_secs(7200)), // 2 hours ago + last_gc_time: Instant::now() - Duration::from_secs(7200), // 2 hours ago + }, + ); + tracker + }; + + // Use a custom config with shorter cleanup interval to trigger cleanup + let config = GcSchedulerOptions { + // 30 minutes + tracker_cleanup_interval: Duration::from_secs(1800), + ..Default::default() + }; + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config, + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(old_region_gc_tracker)), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new( + Instant::now() - Duration::from_secs(3600), // Old cleanup time (1 hour ago) + )), + }; + + let report = scheduler.handle_tick().await.unwrap(); + + // Validate the returned GcJobReport - should have 1 datanode report + assert_eq!( + report.per_datanode_reports.len(), + 1, + "Should process 1 datanode" + ); + assert_eq!( + report.failed_datanodes.len(), + 0, + "Should have no failed datanodes" + ); + + // Get the datanode report + let datanode_report = report.per_datanode_reports.values().next().unwrap(); + + // Check that the region was processed successfully + assert!( + datanode_report.deleted_files.contains_key(&region_id), + "Should have deleted files for region" + ); + assert!( + datanode_report.need_retry_regions.is_empty(), + "Should have no retry regions" + ); + + // Verify tracker was updated + let tracker = scheduler.region_gc_tracker.lock().await; + assert!( + tracker.contains_key(&region_id), + "Tracker should contain region {}", + region_id + ); + // only one entry + assert_eq!(tracker.len(), 1, "Tracker should only have 1 entry"); +} diff --git a/src/meta-srv/src/gc/mock/misc.rs b/src/meta-srv/src/gc/mock/misc.rs new file mode 100644 index 0000000000..eb5a9de2c2 --- /dev/null +++ b/src/meta-srv/src/gc/mock/misc.rs @@ -0,0 +1,155 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use common_meta::peer::Peer; +use common_telemetry::init_default_ut_logging; +use store_api::storage::{FileRefsManifest, GcReport, RegionId}; + +use crate::gc::mock::{MockSchedulerCtx, new_candidate}; +use crate::gc::{GcScheduler, GcSchedulerOptions}; + +/// Edge Case Tests + +#[tokio::test] +async fn test_empty_file_refs_manifest() { + init_default_ut_logging(); + + let table_id = 1; + let region_id = RegionId::new(table_id, 1); + let peer = Peer::new(1, ""); + let candidates = HashMap::from([(table_id, vec![new_candidate(region_id, 1.0)])]); + + // Empty file refs manifest + let file_refs = FileRefsManifest::default(); + + let ctx = Arc::new( + MockSchedulerCtx { + file_refs: Arc::new(Mutex::new(Some(file_refs))), + candidates: Arc::new(Mutex::new(Some(candidates))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + (table_id, vec![(region_id, peer)]), + )])), + ); + + let scheduler = GcScheduler { + ctx: ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default(); + + // Convert table-based candidates to datanode-based candidates + let peer = Peer::new(1, ""); + let datanode_to_candidates = HashMap::from([( + peer, + candidates + .into_iter() + .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c))) + .collect(), + )]); + + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + assert_eq!(report.per_datanode_reports.len(), 1); + assert_eq!(report.failed_datanodes.len(), 0); + // Should handle empty file refs gracefully +} + +#[tokio::test] +async fn test_multiple_regions_per_table() { + init_default_ut_logging(); + + let table_id = 1; + let region1 = RegionId::new(table_id, 1); + let region2 = RegionId::new(table_id, 2); + let region3 = RegionId::new(table_id, 3); + let peer = Peer::new(1, ""); + + let candidates = HashMap::from([( + table_id, + vec![ + new_candidate(region1, 1.0), + new_candidate(region2, 2.0), + new_candidate(region3, 3.0), + ], + )]); + + let mut gc_reports = HashMap::new(); + gc_reports.insert(region1, GcReport::default()); + gc_reports.insert(region2, GcReport::default()); + gc_reports.insert(region3, GcReport::default()); + + let file_refs = FileRefsManifest { + manifest_version: HashMap::from([(region1, 1), (region2, 1), (region3, 1)]), + ..Default::default() + }; + + let ctx = Arc::new( + MockSchedulerCtx { + gc_reports: Arc::new(Mutex::new(gc_reports)), + file_refs: Arc::new(Mutex::new(Some(file_refs))), + candidates: Arc::new(Mutex::new(Some(candidates))), + ..Default::default() + } + .with_table_routes(HashMap::from([( + table_id, + ( + table_id, + vec![ + (region1, peer.clone()), + (region2, peer.clone()), + (region3, peer.clone()), + ], + ), + )])), + ); + + let scheduler = GcScheduler { + ctx: 
ctx.clone(), + receiver: GcScheduler::channel().1, + config: GcSchedulerOptions::default(), + region_gc_tracker: Arc::new(tokio::sync::Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(tokio::sync::Mutex::new(Instant::now())), + }; + + let candidates = ctx.candidates.lock().unwrap().clone().unwrap_or_default(); + + // Convert table-based candidates to datanode-based candidates + let datanode_to_candidates = HashMap::from([( + peer.clone(), + candidates + .into_iter() + .flat_map(|(table_id, candidates)| candidates.into_iter().map(move |c| (table_id, c))) + .collect(), + )]); + + let report = scheduler + .parallel_process_datanodes(datanode_to_candidates) + .await; + + assert_eq!(report.per_datanode_reports.len(), 1); + assert_eq!(report.failed_datanodes.len(), 0); +} diff --git a/src/meta-srv/src/gc/options.rs b/src/meta-srv/src/gc/options.rs new file mode 100644 index 0000000000..02ed25323a --- /dev/null +++ b/src/meta-srv/src/gc/options.rs @@ -0,0 +1,171 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use crate::error::{self, Result}; + +/// The interval of the gc ticker. +#[allow(unused)] +pub(crate) const TICKER_INTERVAL: Duration = Duration::from_secs(60 * 5); + +/// Configuration for GC operations. +/// +/// TODO(discord9): not expose most config to users for now, until GC scheduler is fully stable. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(default)] +pub struct GcSchedulerOptions { + /// Whether GC is enabled. Default to false. + /// If set to false, no GC will be performed, and potentially some + /// files from datanodes will never be deleted. + pub enable: bool, + /// Maximum number of tables to process concurrently. + pub max_concurrent_tables: usize, + /// Maximum number of retries per region when GC fails. + pub max_retries_per_region: usize, + /// Concurrency for region GC within a table. + pub region_gc_concurrency: usize, + /// Backoff duration between retries. + pub retry_backoff_duration: Duration, + /// Minimum region size threshold for GC (in bytes). + pub min_region_size_threshold: u64, + /// Weight for SST file count in GC scoring. + pub sst_count_weight: f64, + /// Weight for file removal rate in GC scoring. + pub file_removed_count_weight: f64, + /// Cooldown period between GC operations on the same region. + pub gc_cooldown_period: Duration, + /// Maximum number of regions to select for GC per table. + pub regions_per_table_threshold: usize, + /// Timeout duration for mailbox communication with datanodes. + pub mailbox_timeout: Duration, + /// Interval for performing full file listing during GC to find orphan files. + /// Full file listing is expensive but necessary to clean up orphan files. + /// Set to a larger value (e.g., 24 hours) to balance performance and cleanup. + /// Every Nth GC cycle will use full file listing, where N = full_file_listing_interval / TICKER_INTERVAL. 
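// Editorial note (worked example, not part of the patch): with the defaults defined
// further down in this file, full_file_listing_interval = 24 h = 86_400 s and
// TICKER_INTERVAL = 5 min = 300 s, so N = 86_400 / 300 = 288, i.e. roughly every
// 288th GC cycle performs a full file listing.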
+    pub full_file_listing_interval: Duration,
+    /// Interval for cleaning up stale region entries from the GC tracker.
+    /// This removes entries for regions that no longer exist (e.g., after table drops).
+    /// Set to a larger value (e.g., 6 hours) since this is just for memory cleanup.
+    pub tracker_cleanup_interval: Duration,
+}
+
+impl Default for GcSchedulerOptions {
+    fn default() -> Self {
+        Self {
+            enable: false,
+            max_concurrent_tables: 10,
+            max_retries_per_region: 3,
+            retry_backoff_duration: Duration::from_secs(5),
+            region_gc_concurrency: 16,
+            min_region_size_threshold: 100 * 1024 * 1024, // 100MB
+            sst_count_weight: 1.0,
+            file_removed_count_weight: 0.5,
+            gc_cooldown_period: Duration::from_secs(60 * 5), // 5 minutes
+            regions_per_table_threshold: 20, // Select top 20 regions per table
+            mailbox_timeout: Duration::from_secs(60), // 60 seconds
+            // Perform full file listing every 24 hours to find orphan files
+            full_file_listing_interval: Duration::from_secs(60 * 60 * 24),
+            // Clean up stale tracker entries every 6 hours
+            tracker_cleanup_interval: Duration::from_secs(60 * 60 * 6),
+        }
+    }
+}
+
+impl GcSchedulerOptions {
+    /// Validates the configuration options.
+    pub fn validate(&self) -> Result<()> {
+        ensure!(
+            self.max_concurrent_tables > 0,
+            error::InvalidArgumentsSnafu {
+                err_msg: "max_concurrent_tables must be greater than 0",
+            }
+        );
+
+        ensure!(
+            self.max_retries_per_region > 0,
+            error::InvalidArgumentsSnafu {
+                err_msg: "max_retries_per_region must be greater than 0",
+            }
+        );
+
+        ensure!(
+            self.region_gc_concurrency > 0,
+            error::InvalidArgumentsSnafu {
+                err_msg: "region_gc_concurrency must be greater than 0",
+            }
+        );
+
+        ensure!(
+            !self.retry_backoff_duration.is_zero(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "retry_backoff_duration must be greater than 0",
+            }
+        );
+
+        ensure!(
+            self.sst_count_weight >= 0.0,
+            error::InvalidArgumentsSnafu {
+                err_msg: "sst_count_weight must be non-negative",
+            }
+        );
+
+        ensure!(
+            self.file_removed_count_weight >= 0.0,
+            error::InvalidArgumentsSnafu {
+                err_msg: "file_removed_count_weight must be non-negative",
+            }
+        );
+
+        ensure!(
+            !self.gc_cooldown_period.is_zero(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "gc_cooldown_period must be greater than 0",
+            }
+        );
+
+        ensure!(
+            self.regions_per_table_threshold > 0,
+            error::InvalidArgumentsSnafu {
+                err_msg: "regions_per_table_threshold must be greater than 0",
+            }
+        );
+
+        ensure!(
+            !self.mailbox_timeout.is_zero(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "mailbox_timeout must be greater than 0",
+            }
+        );
+
+        ensure!(
+            !self.full_file_listing_interval.is_zero(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "full_file_listing_interval must be greater than 0",
+            }
+        );
+
+        ensure!(
+            !self.tracker_cleanup_interval.is_zero(),
+            error::InvalidArgumentsSnafu {
+                err_msg: "tracker_cleanup_interval must be greater than 0",
+            }
+        );
+
+        Ok(())
+    }
+}
diff --git a/src/meta-srv/src/gc/procedure.rs b/src/meta-srv/src/gc/procedure.rs
new file mode 100644
index 0000000000..4ddd606630
--- /dev/null
+++ b/src/meta-srv/src/gc/procedure.rs
@@ -0,0 +1,544 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+use std::time::Duration;
+
+use api::v1::meta::MailboxMessage;
+use common_meta::instruction::{self, GcRegions, GetFileRefs, GetFileRefsReply, InstructionReply};
+use common_meta::lock_key::RegionLock;
+use common_meta::peer::Peer;
+use common_procedure::error::ToJsonSnafu;
+use common_procedure::{
+    Context as ProcedureContext, Error as ProcedureError, LockKey, Procedure,
+    Result as ProcedureResult, Status,
+};
+use common_telemetry::{debug, error, info, warn};
+use itertools::Itertools as _;
+use serde::{Deserialize, Serialize};
+use snafu::ResultExt as _;
+use store_api::storage::{FileRefsManifest, GcReport, RegionId};
+
+use crate::error::{self, Result, SerializeToJsonSnafu};
+use crate::gc::Region2Peers;
+use crate::handler::HeartbeatMailbox;
+use crate::service::mailbox::{Channel, MailboxRef};
+
+/// Helper function to send GetFileRefs instruction and wait for reply.
+async fn send_get_file_refs(
+    mailbox: &MailboxRef,
+    server_addr: &str,
+    peer: &Peer,
+    instruction: GetFileRefs,
+    timeout: Duration,
+) -> Result<GetFileRefsReply> {
+    let instruction = instruction::Instruction::GetFileRefs(instruction);
+    let msg = MailboxMessage::json_message(
+        &format!("Get file references: {}", instruction),
+        &format!("Metasrv@{}", server_addr),
+        &format!("Datanode-{}@{}", peer.id, peer.addr),
+        common_time::util::current_time_millis(),
+        &instruction,
+    )
+    .with_context(|_| SerializeToJsonSnafu {
+        input: instruction.to_string(),
+    })?;
+
+    let mailbox_rx = mailbox
+        .send(&Channel::Datanode(peer.id), msg, timeout)
+        .await?;
+
+    let reply = match mailbox_rx.await {
+        Ok(reply_msg) => HeartbeatMailbox::json_reply(&reply_msg)?,
+        Err(e) => {
+            error!(
+                "Failed to receive reply from datanode {} for GetFileRefs: {}",
+                peer, e
+            );
+            return Err(e);
+        }
+    };
+
+    let InstructionReply::GetFileRefs(reply) = reply else {
+        return error::UnexpectedInstructionReplySnafu {
+            mailbox_message: format!("{:?}", reply),
+            reason: "Unexpected reply of the GetFileRefs instruction",
+        }
+        .fail();
+    };
+
+    Ok(reply)
+}
+
+/// Helper function to send GcRegions instruction and wait for reply.
+async fn send_gc_regions( + mailbox: &MailboxRef, + peer: &Peer, + gc_regions: GcRegions, + server_addr: &str, + timeout: Duration, + description: &str, +) -> Result { + let instruction = instruction::Instruction::GcRegions(gc_regions.clone()); + let msg = MailboxMessage::json_message( + &format!("{}: {}", description, instruction), + &format!("Metasrv@{}", server_addr), + &format!("Datanode-{}@{}", peer.id, peer.addr), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let mailbox_rx = mailbox + .send(&Channel::Datanode(peer.id), msg, timeout) + .await?; + + let reply = match mailbox_rx.await { + Ok(reply_msg) => HeartbeatMailbox::json_reply(&reply_msg)?, + Err(e) => { + error!( + "Failed to receive reply from datanode {} for {}: {}", + peer, description, e + ); + return Err(e); + } + }; + + let InstructionReply::GcRegions(reply) = reply else { + return error::UnexpectedInstructionReplySnafu { + mailbox_message: format!("{:?}", reply), + reason: "Unexpected reply of the GcRegions instruction", + } + .fail(); + }; + + let res = reply.result; + match res { + Ok(report) => Ok(report), + Err(e) => { + error!( + "Datanode {} reported error during GC for regions {:?}: {}", + peer, gc_regions, e + ); + error::UnexpectedSnafu { + violated: format!( + "Datanode {} reported error during GC for regions {:?}: {}", + peer, gc_regions, e + ), + } + .fail() + } + } +} + +/// TODO(discord9): another procedure which do both get file refs and gc regions. +pub struct GcRegionProcedure { + mailbox: MailboxRef, + data: GcRegionData, +} + +#[derive(Serialize, Deserialize)] +pub struct GcRegionData { + server_addr: String, + peer: Peer, + gc_regions: GcRegions, + description: String, + timeout: Duration, +} + +impl GcRegionProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::GcRegionProcedure"; + + pub fn new( + mailbox: MailboxRef, + server_addr: String, + peer: Peer, + gc_regions: GcRegions, + description: String, + timeout: Duration, + ) -> Self { + Self { + mailbox, + data: GcRegionData { + peer, + server_addr, + gc_regions, + description, + timeout, + }, + } + } + + async fn send_gc_instr(&self) -> Result { + send_gc_regions( + &self.mailbox, + &self.data.peer, + self.data.gc_regions.clone(), + &self.data.server_addr, + self.data.timeout, + &self.data.description, + ) + .await + } + + pub fn cast_result(res: Arc) -> Result { + res.downcast_ref::().cloned().ok_or_else(|| { + error::UnexpectedSnafu { + violated: format!( + "Failed to downcast procedure result to GcReport, got {:?}", + std::any::type_name_of_val(&res.as_ref()) + ), + } + .build() + }) + } +} + +#[async_trait::async_trait] +impl Procedure for GcRegionProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + // Send GC instruction to the datanode. This procedure only handle lock&send, results or other kind of + // errors will be reported back via the oneshot channel. + let reply = self + .send_gc_instr() + .await + .map_err(ProcedureError::external)?; + + Ok(Status::done_with_output(reply)) + } + + fn dump(&self) -> ProcedureResult { + serde_json::to_string(&self.data).context(ToJsonSnafu) + } + + /// Read lock all regions involved in this GC procedure. + /// So i.e. region migration won't happen during GC and cause race conditions. 
+ /// + /// only read lock the regions not catatlog/schema because it can run concurrently with other procedures(i.e. drop database/table) + /// TODO:(discord9): integration test to verify this + fn lock_key(&self) -> LockKey { + let lock_key: Vec<_> = self + .data + .gc_regions + .regions + .iter() + .sorted() // sort to have a deterministic lock order + .map(|id| RegionLock::Read(*id).into()) + .collect(); + + LockKey::new(lock_key) + } +} + +/// Procedure to perform get file refs then batch GC for multiple regions, should only be used by admin function +/// for triggering manual gc, as it holds locks for too long and for all regions during the procedure. +pub struct BatchGcProcedure { + mailbox: MailboxRef, + data: BatchGcData, +} + +#[derive(Serialize, Deserialize)] +pub struct BatchGcData { + state: State, + server_addr: String, + /// The regions to be GC-ed + regions: Vec, + full_file_listing: bool, + region_routes: Region2Peers, + /// Related regions (e.g., for shared files). Map: RegionId -> List of related RegionIds. + related_regions: HashMap>, + /// Acquired file references (Populated in Acquiring state) + file_refs: FileRefsManifest, + /// mailbox timeout duration + timeout: Duration, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum State { + /// Initial state + Start, + /// Fetching file references from datanodes + Acquiring, + /// Sending GC instruction to the target datanode + Gcing, +} + +impl BatchGcProcedure { + pub const TYPE_NAME: &'static str = "metasrv-procedure::BatchGcProcedure"; + + pub fn new( + mailbox: MailboxRef, + server_addr: String, + regions: Vec, + full_file_listing: bool, + region_routes: Region2Peers, + related_regions: HashMap>, + timeout: Duration, + ) -> Self { + Self { + mailbox, + data: BatchGcData { + state: State::Start, + server_addr, + regions, + full_file_listing, + region_routes, + related_regions, + file_refs: FileRefsManifest::default(), + timeout, + }, + } + } + + /// Get file references from all datanodes that host the regions + async fn get_file_references(&self) -> Result { + use std::collections::{HashMap, HashSet}; + + let query_regions = &self.data.regions; + let related_regions = &self.data.related_regions; + let region_routes = &self.data.region_routes; + let timeout = self.data.timeout; + + // Group regions by datanode to minimize RPC calls + let mut datanode2query_regions: HashMap> = HashMap::new(); + + for region_id in query_regions { + if let Some((leader, followers)) = region_routes.get(region_id) { + datanode2query_regions + .entry(leader.clone()) + .or_default() + .push(*region_id); + // also need to send for follower regions for file refs in case query is running on follower + for follower in followers { + datanode2query_regions + .entry(follower.clone()) + .or_default() + .push(*region_id); + } + } else { + return error::UnexpectedSnafu { + violated: format!( + "region_routes: {region_routes:?} does not contain region_id: {region_id}", + ), + } + .fail(); + } + } + + let mut datanode2related_regions: HashMap>> = + HashMap::new(); + for (related_region, queries) in related_regions { + if let Some((leader, _followers)) = region_routes.get(related_region) { + datanode2related_regions + .entry(leader.clone()) + .or_default() + .insert(*related_region, queries.clone()); + } // since read from manifest, no need to send to followers + } + + // Send GetFileRefs instructions to each datanode + let mut all_file_refs: HashMap> = + HashMap::new(); + let mut all_manifest_versions = HashMap::new(); + + for 
(peer, regions) in datanode2query_regions { + let related_regions_for_peer = + datanode2related_regions.remove(&peer).unwrap_or_default(); + + let instruction = GetFileRefs { + query_regions: regions.clone(), + related_regions: related_regions_for_peer, + }; + + let reply = send_get_file_refs( + &self.mailbox, + &self.data.server_addr, + &peer, + instruction, + timeout, + ) + .await?; + + if !reply.success { + return error::UnexpectedSnafu { + violated: format!( + "Failed to get file references from datanode {}: {:?}", + peer, reply.error + ), + } + .fail(); + } + + // Merge the file references from this datanode + for (region_id, file_refs) in reply.file_refs_manifest.file_refs { + all_file_refs + .entry(region_id) + .or_default() + .extend(file_refs); + } + + // region manifest version should be the smallest one among all peers, so outdated region can be detected + for (region_id, version) in reply.file_refs_manifest.manifest_version { + let entry = all_manifest_versions.entry(region_id).or_insert(version); + *entry = (*entry).min(version); + } + } + + Ok(FileRefsManifest { + file_refs: all_file_refs, + manifest_version: all_manifest_versions, + }) + } + + /// Send GC instruction to all datanodes that host the regions, + /// returns regions that need retry. + async fn send_gc_instructions(&self) -> Result> { + let regions = &self.data.regions; + let region_routes = &self.data.region_routes; + let file_refs = &self.data.file_refs; + let timeout = self.data.timeout; + + // Group regions by datanode + let mut datanode2regions: HashMap> = HashMap::new(); + + for region_id in regions { + if let Some((leader, _followers)) = region_routes.get(region_id) { + datanode2regions + .entry(leader.clone()) + .or_default() + .push(*region_id); + } else { + return error::UnexpectedSnafu { + violated: format!( + "region_routes: {region_routes:?} does not contain region_id: {region_id}", + ), + } + .fail(); + } + } + + let mut all_need_retry = HashSet::new(); + // Send GC instructions to each datanode + for (peer, regions_for_peer) in datanode2regions { + let gc_regions = GcRegions { + regions: regions_for_peer.clone(), + // file_refs_manifest can be large; cloning for each datanode is acceptable here since this is an admin-only operation. 
+ file_refs_manifest: file_refs.clone(), + full_file_listing: self.data.full_file_listing, + }; + + let report = send_gc_regions( + &self.mailbox, + &peer, + gc_regions, + self.data.server_addr.as_str(), + timeout, + "Batch GC", + ) + .await?; + + let success = report.deleted_files.keys().collect_vec(); + let need_retry = report.need_retry_regions.iter().cloned().collect_vec(); + + if need_retry.is_empty() { + info!( + "GC report from datanode {}: successfully deleted files for regions {:?}", + peer, success + ); + } else { + warn!( + "GC report from datanode {}: successfully deleted files for regions {:?}, need retry for regions {:?}", + peer, success, need_retry + ); + } + all_need_retry.extend(report.need_retry_regions); + } + + Ok(all_need_retry.into_iter().collect()) + } +} + +#[async_trait::async_trait] +impl Procedure for BatchGcProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + match self.data.state { + State::Start => { + // Transition to Acquiring state + self.data.state = State::Acquiring; + Ok(Status::executing(false)) + } + State::Acquiring => { + // Get file references from all datanodes + match self.get_file_references().await { + Ok(file_refs) => { + self.data.file_refs = file_refs; + self.data.state = State::Gcing; + Ok(Status::executing(false)) + } + Err(e) => { + error!("Failed to get file references: {}", e); + Err(ProcedureError::external(e)) + } + } + } + State::Gcing => { + // Send GC instructions to all datanodes + // TODO(discord9): handle need-retry regions + match self.send_gc_instructions().await { + Ok(_) => { + info!( + "Batch GC completed successfully for regions {:?}", + self.data.regions + ); + Ok(Status::done()) + } + Err(e) => { + error!("Failed to send GC instructions: {}", e); + Err(ProcedureError::external(e)) + } + } + } + } + } + + fn dump(&self) -> ProcedureResult { + serde_json::to_string(&self.data).context(ToJsonSnafu) + } + + /// Read lock all regions involved in this GC procedure. + /// So i.e. region migration won't happen during GC and cause race conditions. + fn lock_key(&self) -> LockKey { + let lock_key: Vec<_> = self + .data + .regions + .iter() + .sorted() // sort to have a deterministic lock order + .map(|id| RegionLock::Read(*id).into()) + .collect(); + + LockKey::new(lock_key) + } +} diff --git a/src/meta-srv/src/gc/scheduler.rs b/src/meta-srv/src/gc/scheduler.rs new file mode 100644 index 0000000000..e3ed3834bb --- /dev/null +++ b/src/meta-srv/src/gc/scheduler.rs @@ -0,0 +1,162 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
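// Editorial sketch (not part of the patch): the merge rule used by
// BatchGcProcedure::get_file_references above keeps the smallest manifest version
// reported by any peer for a region, so an outdated replica can still be detected later.
// A self-contained version of that rule, with plain u64 keys standing in for RegionId
// and for the manifest version type (stand-ins for illustration only):

use std::collections::HashMap;

fn merge_manifest_versions(per_peer: Vec<HashMap<u64, u64>>) -> HashMap<u64, u64> {
    let mut merged: HashMap<u64, u64> = HashMap::new();
    for versions in per_peer {
        for (region, version) in versions {
            // Keep the minimum version seen across all reporting peers.
            merged
                .entry(region)
                .and_modify(|v| *v = (*v).min(version))
                .or_insert(version);
        }
    }
    merged
}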
+ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use std::time::Instant; + +use common_meta::DatanodeId; +use common_meta::key::TableMetadataManagerRef; +use common_procedure::ProcedureManagerRef; +use common_telemetry::{error, info}; +use store_api::storage::GcReport; +use tokio::sync::Mutex; +use tokio::sync::mpsc::{Receiver, Sender}; + +use crate::cluster::MetaPeerClientRef; +use crate::define_ticker; +use crate::error::{Error, Result}; +use crate::gc::ctx::{DefaultGcSchedulerCtx, SchedulerCtx}; +use crate::gc::options::{GcSchedulerOptions, TICKER_INTERVAL}; +use crate::gc::tracker::RegionGcTracker; +use crate::service::mailbox::MailboxRef; + +/// Report for a GC job. +#[derive(Debug, Default)] +pub struct GcJobReport { + pub per_datanode_reports: HashMap, + pub failed_datanodes: HashMap>, +} +impl GcJobReport { + pub fn merge(&mut self, mut other: GcJobReport) { + // merge per_datanode_reports&failed_datanodes + for (dn_id, report) in other.per_datanode_reports { + let self_report = self.per_datanode_reports.entry(dn_id).or_default(); + self_report.merge(report); + } + let all_failed_dn_ids = self + .failed_datanodes + .keys() + .cloned() + .chain(other.failed_datanodes.keys().cloned()) + .collect::>(); + for dn_id in all_failed_dn_ids { + let entry = self.failed_datanodes.entry(dn_id).or_default(); + if let Some(other_errors) = other.failed_datanodes.remove(&dn_id) { + entry.extend(other_errors); + } + } + self.failed_datanodes + .retain(|dn_id, _| !self.per_datanode_reports.contains_key(dn_id)); + } +} + +/// [`Event`] represents various types of events that can be processed by the gc ticker. +/// +/// Variants: +/// - `Tick`: This event is used to trigger gc periodically. +pub(crate) enum Event { + Tick, +} + +#[allow(unused)] +pub(crate) type GcTickerRef = Arc; + +define_ticker!( + /// [GcTicker] is used to trigger gc periodically. + GcTicker, + event_type = Event, + event_value = Event::Tick +); + +/// [`GcScheduler`] is used to periodically trigger garbage collection on datanodes. +pub struct GcScheduler { + pub(crate) ctx: Arc, + /// The receiver of events. + pub(crate) receiver: Receiver, + /// GC configuration. + pub(crate) config: GcSchedulerOptions, + /// Tracks the last GC time for regions. + pub(crate) region_gc_tracker: Arc>, + /// Last time the tracker was cleaned up. + pub(crate) last_tracker_cleanup: Arc>, +} + +impl GcScheduler { + /// Creates a new [`GcScheduler`] with custom configuration. + pub(crate) fn new_with_config( + table_metadata_manager: TableMetadataManagerRef, + procedure_manager: ProcedureManagerRef, + meta_peer_client: MetaPeerClientRef, + mailbox: MailboxRef, + server_addr: String, + config: GcSchedulerOptions, + ) -> Result<(Self, GcTicker)> { + // Validate configuration before creating the scheduler + config.validate()?; + + let (tx, rx) = Self::channel(); + let gc_ticker = GcTicker::new(TICKER_INTERVAL, tx); + let gc_trigger = Self { + ctx: Arc::new(DefaultGcSchedulerCtx::try_new( + table_metadata_manager, + procedure_manager, + meta_peer_client, + mailbox, + server_addr, + )?), + receiver: rx, + config, + region_gc_tracker: Arc::new(Mutex::new(HashMap::new())), + last_tracker_cleanup: Arc::new(Mutex::new(Instant::now())), + }; + Ok((gc_trigger, gc_ticker)) + } + + pub(crate) fn channel() -> (Sender, Receiver) { + tokio::sync::mpsc::channel(8) + } + + /// Starts the gc trigger. 
+ pub fn try_start(mut self) -> Result<()> { + common_runtime::spawn_global(async move { self.run().await }); + info!("GC trigger started"); + Ok(()) + } + + pub(crate) async fn run(&mut self) { + while let Some(event) = self.receiver.recv().await { + match event { + Event::Tick => { + info!("Received gc tick"); + if let Err(e) = self.handle_tick().await { + error!("Failed to handle gc tick: {}", e); + } + } + } + } + } + + pub(crate) async fn handle_tick(&self) -> Result { + info!("Start to trigger gc"); + let report = self.trigger_gc().await?; + + // Periodically clean up stale tracker entries + self.cleanup_tracker_if_needed().await?; + + info!("Finished gc trigger"); + + Ok(report) + } +} diff --git a/src/meta-srv/src/gc/tracker.rs b/src/meta-srv/src/gc/tracker.rs new file mode 100644 index 0000000000..c5f93483a6 --- /dev/null +++ b/src/meta-srv/src/gc/tracker.rs @@ -0,0 +1,129 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::time::Instant; + +use common_telemetry::info; +use store_api::storage::RegionId; + +use crate::error::Result; +use crate::gc::scheduler::GcScheduler; + +/// Tracks GC timing information for a region. +#[derive(Debug, Clone)] +pub(crate) struct RegionGcInfo { + /// Last time a regular GC was performed on this region. + pub(crate) last_gc_time: Instant, + /// Last time a full file listing GC was performed on this region. + pub(crate) last_full_listing_time: Option, +} + +impl RegionGcInfo { + pub(crate) fn new(last_gc_time: Instant) -> Self { + Self { + last_gc_time, + last_full_listing_time: None, + } + } +} + +/// Tracks the last GC time for regions to implement cooldown. +pub(crate) type RegionGcTracker = HashMap; + +impl GcScheduler { + /// Clean up stale entries from the region GC tracker if enough time has passed. + /// This removes entries for regions that no longer exist in the current table routes. 
+ pub(crate) async fn cleanup_tracker_if_needed(&self) -> Result<()> { + let mut last_cleanup = *self.last_tracker_cleanup.lock().await; + let now = Instant::now(); + + // Check if enough time has passed since last cleanup + if now.saturating_duration_since(last_cleanup) < self.config.tracker_cleanup_interval { + return Ok(()); + } + + info!("Starting region GC tracker cleanup"); + let cleanup_start = Instant::now(); + + // Get all current region IDs from table routes + let table_to_region_stats = self.ctx.get_table_to_region_stats().await?; + let mut current_regions = HashSet::new(); + for region_stats in table_to_region_stats.values() { + for region_stat in region_stats { + current_regions.insert(region_stat.id); + } + } + + // Remove stale entries from tracker + let mut tracker = self.region_gc_tracker.lock().await; + let initial_count = tracker.len(); + tracker.retain(|region_id, _| current_regions.contains(region_id)); + let removed_count = initial_count - tracker.len(); + + *self.last_tracker_cleanup.lock().await = now; + + info!( + "Completed region GC tracker cleanup: removed {} stale entries out of {} total (retained {}). Duration: {:?}", + removed_count, + initial_count, + tracker.len(), + cleanup_start.elapsed() + ); + + Ok(()) + } + + /// Determine if full file listing should be used for a region based on the last full listing time. + pub(crate) async fn should_use_full_listing(&self, region_id: RegionId) -> bool { + let gc_tracker = self.region_gc_tracker.lock().await; + let now = Instant::now(); + + if let Some(gc_info) = gc_tracker.get(®ion_id) { + if let Some(last_full_listing) = gc_info.last_full_listing_time { + let elapsed = now.saturating_duration_since(last_full_listing); + elapsed >= self.config.full_file_listing_interval + } else { + // Never did full listing for this region, do it now + true + } + } else { + // First time GC for this region, do full listing + true + } + } + + pub(crate) async fn update_full_listing_time( + &self, + region_id: RegionId, + did_full_listing: bool, + ) { + let mut gc_tracker = self.region_gc_tracker.lock().await; + let now = Instant::now(); + + gc_tracker + .entry(region_id) + .and_modify(|info| { + if did_full_listing { + info.last_full_listing_time = Some(now); + } + info.last_gc_time = now; + }) + .or_insert_with(|| RegionGcInfo { + last_gc_time: now, + // prevent need to full listing on the first GC + last_full_listing_time: Some(now), + }); + } +} diff --git a/src/meta-srv/src/handler.rs b/src/meta-srv/src/handler.rs index 8f7aba2f92..12fcfab26f 100644 --- a/src/meta-srv/src/handler.rs +++ b/src/meta-srv/src/handler.rs @@ -32,7 +32,7 @@ use collect_leader_region_handler::CollectLeaderRegionHandler; use collect_stats_handler::CollectStatsHandler; use common_base::Plugins; use common_meta::datanode::Stat; -use common_meta::instruction::{Instruction, InstructionReply}; +use common_meta::instruction::InstructionReply; use common_meta::sequence::Sequence; use common_telemetry::{debug, info, warn}; use dashmap::DashMap; @@ -114,16 +114,19 @@ pub enum HandleControl { #[derive(Debug, Default)] pub struct HeartbeatAccumulator { pub header: Option, - pub instructions: Vec, + mailbox_message: Option, pub stat: Option, pub inactive_region_ids: HashSet, pub region_lease: Option, } impl HeartbeatAccumulator { - pub fn into_mailbox_message(self) -> Option { - // TODO(jiachun): to HeartbeatResponse payload - None + pub(crate) fn take_mailbox_message(&mut self) -> Option { + self.mailbox_message.take() + } + + pub fn set_mailbox_message(&mut self, 
message: MailboxMessage) { + let _ = self.mailbox_message.insert(message); } } @@ -275,6 +278,15 @@ impl Pushers { async fn remove(&self, pusher_id: &str) -> Option { self.0.write().await.remove(pusher_id) } + + pub(crate) async fn clear(&self) -> Vec { + let mut pushers = self.0.write().await; + let keys = pushers.keys().cloned().collect::>(); + if !keys.is_empty() { + pushers.clear(); + } + keys + } } #[derive(Clone)] @@ -309,12 +321,11 @@ impl HeartbeatHandlerGroup { } /// Deregisters the heartbeat response [`Pusher`] with the given key from the group. - /// - /// Returns the [`Pusher`] if it exists. - pub async fn deregister_push(&self, pusher_id: PusherId) -> Option { - METRIC_META_HEARTBEAT_CONNECTION_NUM.dec(); + pub async fn deregister_push(&self, pusher_id: PusherId) { info!("Pusher unregister: {}", pusher_id); - self.pushers.remove(&pusher_id.string_key()).await + if self.pushers.remove(&pusher_id.string_key()).await.is_some() { + METRIC_META_HEARTBEAT_CONNECTION_NUM.dec(); + } } /// Returns the [`Pushers`] of the group. @@ -351,10 +362,11 @@ impl HeartbeatHandlerGroup { } } let header = std::mem::take(&mut acc.header); + let mailbox_message = acc.take_mailbox_message(); let res = HeartbeatResponse { header, region_lease: acc.region_lease, - ..Default::default() + mailbox_message, }; Ok(res) } @@ -382,7 +394,9 @@ impl HeartbeatMailbox { /// Parses the [Instruction] from [MailboxMessage]. #[cfg(test)] - pub fn json_instruction(msg: &MailboxMessage) -> Result { + pub(crate) fn json_instruction( + msg: &MailboxMessage, + ) -> Result { let Payload::Json(payload) = msg.payload .as_ref() @@ -519,6 +533,14 @@ impl Mailbox for HeartbeatMailbox { Ok(()) } + + async fn reset(&self) { + let keys = self.pushers.clear().await; + if !keys.is_empty() { + info!("Reset mailbox, deregister pushers: {:?}", keys); + METRIC_META_HEARTBEAT_CONNECTION_NUM.sub(keys.len() as i64); + } + } } /// The builder to build the group of heartbeat handlers. 
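// Editorial sketch (not part of the patch): the deregister_push change above decrements
// METRIC_META_HEARTBEAT_CONNECTION_NUM only when a pusher was actually removed, so a
// repeated deregistration cannot drive the gauge negative. A toy model of that behavior
// (the struct and plain integer gauge are stand-ins for illustration only):

use std::collections::HashMap;

struct Group {
    pushers: HashMap<String, ()>,
    gauge: i64,
}

impl Group {
    fn deregister_push(&mut self, id: &str) {
        // Decrement only on a real removal, mirroring the guarded decrement above.
        if self.pushers.remove(id).is_some() {
            self.gauge -= 1;
        }
    }
}

fn main() {
    let mut group = Group {
        pushers: HashMap::from([("dn-1".to_string(), ())]),
        gauge: 1,
    };
    group.deregister_push("dn-1");
    group.deregister_push("dn-1"); // the second call is a no-op
    assert_eq!(group.gauge, 0);
}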
diff --git a/src/meta-srv/src/handler/collect_leader_region_handler.rs b/src/meta-srv/src/handler/collect_leader_region_handler.rs index fc81143b82..ddb4cd0ea3 100644 --- a/src/meta-srv/src/handler/collect_leader_region_handler.rs +++ b/src/meta-srv/src/handler/collect_leader_region_handler.rs @@ -73,6 +73,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version, flushed_entry_id: 0, + file_removed_cnt: 0, }, rcus: 0, wcus: 0, diff --git a/src/meta-srv/src/handler/failure_handler.rs b/src/meta-srv/src/handler/failure_handler.rs index 7039678654..eb79a1c30d 100644 --- a/src/meta-srv/src/handler/failure_handler.rs +++ b/src/meta-srv/src/handler/failure_handler.rs @@ -102,6 +102,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, diff --git a/src/meta-srv/src/handler/persist_stats_handler.rs b/src/meta-srv/src/handler/persist_stats_handler.rs index abc2fa3c3e..75281f982a 100644 --- a/src/meta-srv/src/handler/persist_stats_handler.rs +++ b/src/meta-srv/src/handler/persist_stats_handler.rs @@ -294,6 +294,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 1, flushed_entry_id: 100, + file_removed_cnt: 0, }, written_bytes, data_topic_latest_entry_id: 200, diff --git a/src/meta-srv/src/handler/region_lease_handler.rs b/src/meta-srv/src/handler/region_lease_handler.rs index 1dd49cd44e..d0e9757742 100644 --- a/src/meta-srv/src/handler/region_lease_handler.rs +++ b/src/meta-srv/src/handler/region_lease_handler.rs @@ -129,6 +129,7 @@ impl HeartbeatHandler for RegionLeaseHandler { #[cfg(test)] mod test { + use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -138,6 +139,7 @@ mod test { use common_meta::key::table_route::TableRouteValue; use common_meta::key::test_utils::new_test_table_info; use common_meta::kv_backend::memory::MemoryKvBackend; + use common_meta::kv_backend::test_util::MockKvBackendBuilder; use common_meta::peer::Peer; use common_meta::region_keeper::MemoryRegionKeeper; use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; @@ -173,6 +175,7 @@ mod test { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, @@ -414,4 +417,58 @@ mod test { assert_eq!(granted, expected); } + + #[tokio::test] + async fn test_handle_renew_region_lease_failure() { + common_telemetry::init_default_ut_logging(); + let kv = MockKvBackendBuilder::default() + .batch_get_fn(Arc::new(|_| { + common_meta::error::UnexpectedSnafu { + err_msg: "mock err", + } + .fail() + }) as _) + .build() + .unwrap(); + let kvbackend = Arc::new(kv); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kvbackend)); + + let datanode_id = 1; + let region_number = 1u32; + let table_id = 10; + let region_id = RegionId::new(table_id, region_number); + let another_region_id = RegionId::new(table_id, region_number + 1); + let no_exist_region_id = RegionId::new(table_id, region_number + 2); + let peer = Peer::empty(datanode_id); + + let builder = MetasrvBuilder::new(); + let metasrv = builder.build().await.unwrap(); + let ctx = &mut metasrv.new_ctx(); + + let req = HeartbeatRequest { + duration_since_epoch: 1234, + ..Default::default() + }; + + let acc = &mut HeartbeatAccumulator::default(); + acc.stat = Some(Stat { + id: peer.id, + region_stats: vec![ + new_empty_region_stat(region_id, 
RegionRole::Leader), + new_empty_region_stat(another_region_id, RegionRole::Leader), + new_empty_region_stat(no_exist_region_id, RegionRole::Leader), + ], + ..Default::default() + }); + let handler = RegionLeaseHandler::new( + distributed_time_constants::REGION_LEASE_SECS, + table_metadata_manager.clone(), + Default::default(), + None, + ); + handler.handle(&req, ctx, acc).await.unwrap(); + + assert!(acc.region_lease.is_none()); + assert!(acc.inactive_region_ids.is_empty()); + } } diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs index 71d57ca83f..c67bc32b40 100644 --- a/src/meta-srv/src/lib.rs +++ b/src/meta-srv/src/lib.rs @@ -25,6 +25,7 @@ pub mod election; pub mod error; pub mod events; mod failure_detector; +pub mod gc; pub mod handler; pub mod key; pub mod metasrv; diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index ba1798c386..99f392cfd8 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -49,9 +49,9 @@ use common_procedure::options::ProcedureConfig; use common_stat::ResourceStatRef; use common_telemetry::logging::{LoggingOptions, TracingOptions}; use common_telemetry::{error, info, warn}; +use common_time::util::DefaultSystemTimer; use common_wal::config::MetasrvWalConfig; use serde::{Deserialize, Serialize}; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::http::HttpOptions; use servers::tls::TlsOption; @@ -67,6 +67,7 @@ use crate::error::{ StartTelemetryTaskSnafu, StopProcedureManagerSnafu, }; use crate::failure_detector::PhiAccrualFailureDetectorOptions; +use crate::gc::{GcSchedulerOptions, GcTickerRef}; use crate::handler::{HeartbeatHandlerGroupBuilder, HeartbeatHandlerGroupRef}; use crate::procedure::ProcedureManagerListenerAdapter; use crate::procedure::region_migration::manager::RegionMigrationManagerRef; @@ -168,8 +169,6 @@ pub struct MetasrvOptions { pub data_home: String, /// The WAL options. pub wal: MetasrvWalConfig, - /// The metrics export options. - pub export_metrics: ExportMetricsOption, /// The store key prefix. If it is not empty, all keys in the store will be prefixed with it. /// This is useful when multiple metasrv clusters share the same store. pub store_key_prefix: String, @@ -209,6 +208,8 @@ pub struct MetasrvOptions { pub event_recorder: EventRecorderOptions, /// The stats persistence options. pub stats_persistence: StatsPersistenceOptions, + /// The GC scheduler options. 
+ pub gc: GcSchedulerOptions, } impl fmt::Debug for MetasrvOptions { @@ -233,7 +234,6 @@ impl fmt::Debug for MetasrvOptions { .field("enable_telemetry", &self.enable_telemetry) .field("data_home", &self.data_home) .field("wal", &self.wal) - .field("export_metrics", &self.export_metrics) .field("store_key_prefix", &self.store_key_prefix) .field("max_txn_ops", &self.max_txn_ops) .field("flush_stats_factor", &self.flush_stats_factor) @@ -291,7 +291,6 @@ impl Default for MetasrvOptions { enable_telemetry: true, data_home: DEFAULT_DATA_HOME.to_string(), wal: MetasrvWalConfig::default(), - export_metrics: ExportMetricsOption::default(), store_key_prefix: String::new(), max_txn_ops: 128, flush_stats_factor: 3, @@ -307,6 +306,7 @@ impl Default for MetasrvOptions { node_max_idle_time: Duration::from_secs(24 * 60 * 60), event_recorder: EventRecorderOptions::default(), stats_persistence: StatsPersistenceOptions::default(), + gc: GcSchedulerOptions::default(), } } } @@ -452,6 +452,7 @@ pub struct MetaStateHandler { greptimedb_telemetry_task: Arc, leader_cached_kv_backend: Arc, leadership_change_notifier: LeadershipChangeNotifier, + mailbox: MailboxRef, state: StateRef, } @@ -475,6 +476,9 @@ impl MetaStateHandler { pub async fn on_leader_stop(&self) { self.state.write().unwrap().next_state(become_follower()); + // Enforces the mailbox to clear all pushers. + // The remaining heartbeat connections will be closed by the remote peer or keep-alive detection. + self.mailbox.reset().await; self.leadership_change_notifier .notify_on_leader_stop() .await; @@ -528,6 +532,7 @@ pub struct Metasrv { table_id_sequence: SequenceRef, reconciliation_manager: ReconciliationManagerRef, resource_stat: ResourceStatRef, + gc_ticker: Option, plugins: Plugins, } @@ -588,6 +593,9 @@ impl Metasrv { if let Some(region_flush_trigger) = &self.region_flush_ticker { leadership_change_notifier.add_listener(region_flush_trigger.clone() as _); } + if let Some(gc_ticker) = &self.gc_ticker { + leadership_change_notifier.add_listener(gc_ticker.clone() as _); + } if let Some(customizer) = self.plugins.get::() { customizer.customize(&mut leadership_change_notifier); } @@ -598,6 +606,7 @@ impl Metasrv { state: self.state.clone(), leader_cached_kv_backend: leader_cached_kv_backend.clone(), leadership_change_notifier, + mailbox: self.mailbox.clone(), }; let _handle = common_runtime::spawn_global(async move { loop { @@ -735,6 +744,7 @@ impl Metasrv { /// A datanode is considered alive when it's still within the lease period. 
pub(crate) async fn lookup_datanode_peer(&self, peer_id: u64) -> Result> { discovery::utils::alive_datanode( + &DefaultSystemTimer, self.meta_peer_client.as_ref(), peer_id, Duration::from_secs(distributed_time_constants::DATANODE_LEASE_SECS), diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs index 5a33dc9c4f..cbefb79cfa 100644 --- a/src/meta-srv/src/metasrv/builder.rs +++ b/src/meta-srv/src/metasrv/builder.rs @@ -28,7 +28,7 @@ use common_meta::ddl::table_meta::{TableMetadataAllocator, TableMetadataAllocato use common_meta::ddl::{ DdlContext, NoopRegionFailureDetectorControl, RegionFailureDetectorControllerRef, }; -use common_meta::ddl_manager::DdlManager; +use common_meta::ddl_manager::{DdlManager, DdlManagerConfiguratorRef}; use common_meta::distributed_time_constants::{self}; use common_meta::key::TableMetadataManager; use common_meta::key::flow::FlowMetadataManager; @@ -54,8 +54,9 @@ use store_api::storage::MAX_REGION_SEQ; use crate::bootstrap::build_default_meta_peer_client; use crate::cache_invalidator::MetasrvCacheInvalidator; use crate::cluster::MetaPeerClientRef; -use crate::error::{self, BuildWalOptionsAllocatorSnafu, Result}; +use crate::error::{self, BuildWalOptionsAllocatorSnafu, OtherSnafu, Result}; use crate::events::EventHandlerImpl; +use crate::gc::GcScheduler; use crate::greptimedb_telemetry::get_greptimedb_telemetry_task; use crate::handler::failure_handler::RegionFailureHandler; use crate::handler::flow_state_handler::FlowStateHandler; @@ -401,13 +402,23 @@ impl MetasrvBuilder { let procedure_manager_c = procedure_manager.clone(); let ddl_manager = DdlManager::try_new(ddl_context, procedure_manager_c, true) .context(error::InitDdlManagerSnafu)?; - #[cfg(feature = "enterprise")] - let ddl_manager = { - let trigger_ddl_manager = plugins.as_ref().and_then(|plugins| { - plugins.get::() - }); - ddl_manager.with_trigger_ddl_manager(trigger_ddl_manager) + + let ddl_manager = if let Some(configurator) = plugins + .as_ref() + .and_then(|p| p.get::>()) + { + let ctx = DdlManagerConfigureContext { + kv_backend: kv_backend.clone(), + meta_peer_client: meta_peer_client.clone(), + }; + configurator + .configure(ddl_manager, ctx) + .await + .context(OtherSnafu)? + } else { + ddl_manager }; + let ddl_manager = Arc::new(ddl_manager); let region_flush_ticker = if is_remote_wal { @@ -458,6 +469,22 @@ impl MetasrvBuilder { None }; + let gc_ticker = if options.gc.enable { + let (gc_scheduler, gc_ticker) = GcScheduler::new_with_config( + table_metadata_manager.clone(), + procedure_manager.clone(), + meta_peer_client.clone(), + mailbox.clone(), + options.grpc.server_addr.clone(), + options.gc.clone(), + )?; + gc_scheduler.try_start()?; + + Some(Arc::new(gc_ticker)) + } else { + None + }; + let customized_region_lease_renewer = plugins .as_ref() .and_then(|plugins| plugins.get::()); @@ -562,6 +589,7 @@ impl MetasrvBuilder { reconciliation_manager, topic_stats_registry, resource_stat: Arc::new(resource_stat), + gc_ticker, }) } } @@ -610,3 +638,9 @@ impl Default for MetasrvBuilder { Self::new() } } + +/// The context for [`DdlManagerConfiguratorRef`]. 
+pub struct DdlManagerConfigureContext { + pub kv_backend: KvBackendRef, + pub meta_peer_client: MetaPeerClientRef, +} diff --git a/src/meta-srv/src/procedure.rs b/src/meta-srv/src/procedure.rs index 88869d8482..da1a1b00e7 100644 --- a/src/meta-srv/src/procedure.rs +++ b/src/meta-srv/src/procedure.rs @@ -19,6 +19,7 @@ use common_procedure::ProcedureManagerRef; use snafu::ResultExt; pub mod region_migration; +pub mod repartition; #[cfg(any(test, feature = "testing"))] pub mod test_util; #[cfg(test)] diff --git a/src/meta-srv/src/procedure/region_migration.rs b/src/meta-srv/src/procedure/region_migration.rs index 935e59ba33..3613fd0894 100644 --- a/src/meta-srv/src/procedure/region_migration.rs +++ b/src/meta-srv/src/procedure/region_migration.rs @@ -24,8 +24,10 @@ pub(crate) mod open_candidate_region; pub mod test_util; pub(crate) mod update_metadata; pub(crate) mod upgrade_candidate_region; +pub(crate) mod utils; use std::any::Any; +use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Display}; use std::sync::Arc; use std::time::Duration; @@ -36,7 +38,6 @@ use common_meta::cache_invalidator::CacheInvalidatorRef; use common_meta::ddl::RegionFailureDetectorControllerRef; use common_meta::instruction::CacheIdent; use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue}; -use common_meta::key::table_info::TableInfoValue; use common_meta::key::table_route::TableRouteValue; use common_meta::key::topic_region::{ReplayCheckpoint, TopicRegionKey}; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; @@ -56,9 +57,9 @@ pub use manager::{ RegionMigrationManagerRef, RegionMigrationProcedureTask, RegionMigrationProcedureTracker, RegionMigrationTriggerReason, }; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Deserializer, Serialize}; use snafu::{OptionExt, ResultExt}; -use store_api::storage::RegionId; +use store_api::storage::{RegionId, TableId}; use tokio::time::Instant; use self::migration_start::RegionMigrationStart; @@ -73,6 +74,25 @@ use crate::service::mailbox::MailboxRef; /// The default timeout for region migration. pub const DEFAULT_REGION_MIGRATION_TIMEOUT: Duration = Duration::from_secs(120); +#[derive(Debug, Deserialize)] +#[serde(untagged)] +enum SingleOrMultiple { + Single(T), + Multiple(Vec), +} + +fn single_or_multiple_from<'de, D, T>(deserializer: D) -> std::result::Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de>, +{ + let helper = SingleOrMultiple::::deserialize(deserializer)?; + Ok(match helper { + SingleOrMultiple::Single(x) => vec![x], + SingleOrMultiple::Multiple(xs) => xs, + }) +} + /// It's shared in each step and available even after recovering. /// /// It will only be updated/stored after the Red node has succeeded. @@ -81,15 +101,23 @@ pub const DEFAULT_REGION_MIGRATION_TIMEOUT: Duration = Duration::from_secs(120); #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PersistentContext { /// The table catalog. - pub(crate) catalog: String, + #[deprecated(note = "use `catalog_and_schema` instead")] + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) catalog: Option, /// The table schema. - pub(crate) schema: String, + #[deprecated(note = "use `catalog_and_schema` instead")] + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) schema: Option, + /// The catalog and schema of the regions. 
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub(crate) catalog_and_schema: Vec<(String, String)>, /// The [Peer] of migration source. pub(crate) from_peer: Peer, /// The [Peer] of migration destination. pub(crate) to_peer: Peer, /// The [RegionId] of migration region. - pub(crate) region_id: RegionId, + #[serde(deserialize_with = "single_or_multiple_from", alias = "region_id")] + pub(crate) region_ids: Vec, /// The timeout for downgrading leader region and upgrading candidate region operations. #[serde(with = "humantime_serde", default = "default_timeout")] pub(crate) timeout: Duration, @@ -98,20 +126,80 @@ pub struct PersistentContext { pub(crate) trigger_reason: RegionMigrationTriggerReason, } +impl PersistentContext { + pub fn new( + catalog_and_schema: Vec<(String, String)>, + from_peer: Peer, + to_peer: Peer, + region_ids: Vec, + timeout: Duration, + trigger_reason: RegionMigrationTriggerReason, + ) -> Self { + #[allow(deprecated)] + Self { + catalog: None, + schema: None, + catalog_and_schema, + from_peer, + to_peer, + region_ids, + timeout, + trigger_reason, + } + } +} + fn default_timeout() -> Duration { Duration::from_secs(10) } impl PersistentContext { pub fn lock_key(&self) -> Vec { - let region_id = self.region_id; - let lock_key = vec![ - CatalogLock::Read(&self.catalog).into(), - SchemaLock::read(&self.catalog, &self.schema).into(), - RegionLock::Write(region_id).into(), - ]; + let mut lock_keys = + Vec::with_capacity(self.region_ids.len() + 2 + self.catalog_and_schema.len() * 2); + #[allow(deprecated)] + if let (Some(catalog), Some(schema)) = (&self.catalog, &self.schema) { + lock_keys.push(CatalogLock::Read(catalog).into()); + lock_keys.push(SchemaLock::read(catalog, schema).into()); + } + for (catalog, schema) in self.catalog_and_schema.iter() { + lock_keys.push(CatalogLock::Read(catalog).into()); + lock_keys.push(SchemaLock::read(catalog, schema).into()); + } - lock_key + // Sort the region ids to ensure the same order of region ids. + let mut region_ids = self.region_ids.clone(); + region_ids.sort_unstable(); + for region_id in region_ids { + lock_keys.push(RegionLock::Write(region_id).into()); + } + lock_keys + } + + /// Returns the table ids of the regions. + /// + /// The return value is a set of table ids. + pub fn region_table_ids(&self) -> Vec { + self.region_ids + .iter() + .map(|region_id| region_id.table_id()) + .collect::>() + .into_iter() + .collect() + } + + /// Returns the table regions map. + /// + /// The key is the table id, the value is the region ids of the table. + pub fn table_regions(&self) -> HashMap> { + let mut table_regions = HashMap::new(); + for region_id in &self.region_ids { + table_regions + .entry(region_id.table_id()) + .or_insert_with(Vec::new) + .push(*region_id); + } + table_regions } } @@ -227,23 +315,18 @@ pub struct VolatileContext { /// `opening_region_guard` will be set after the /// [OpenCandidateRegion](crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion) step. /// - /// `opening_region_guard` should be consumed after + /// `opening_region_guards` should be consumed after /// the corresponding [RegionRoute](common_meta::rpc::router::RegionRoute) of the opening region /// was written into [TableRouteValue](common_meta::key::table_route::TableRouteValue). - opening_region_guard: Option, - /// `datanode_table` is stored via previous steps for future use. - from_peer_datanode_table: Option, - /// `table_info` is stored via previous steps for future use. 
- /// - /// `table_info` should remain unchanged during the procedure; - /// no other DDL procedure executed concurrently for the current table. - table_info: Option>, + opening_region_guards: Vec, /// The deadline of leader region lease. leader_region_lease_deadline: Option, - /// The last_entry_id of leader region. - leader_region_last_entry_id: Option, - /// The last_entry_id of leader metadata region (Only used for metric engine). - leader_region_metadata_last_entry_id: Option, + /// The datanode table values. + from_peer_datanode_table_values: Option>, + /// The last_entry_ids of leader regions. + leader_region_last_entry_ids: HashMap, + /// The last_entry_ids of leader metadata regions (Only used for metric engine). + leader_region_metadata_last_entry_ids: HashMap, /// Metrics of region migration. metrics: Metrics, } @@ -262,13 +345,15 @@ impl VolatileContext { } /// Sets the `leader_region_last_entry_id`. - pub fn set_last_entry_id(&mut self, last_entry_id: u64) { - self.leader_region_last_entry_id = Some(last_entry_id) + pub fn set_last_entry_id(&mut self, region_id: RegionId, last_entry_id: u64) { + self.leader_region_last_entry_ids + .insert(region_id, last_entry_id); } /// Sets the `leader_region_metadata_last_entry_id`. - pub fn set_metadata_last_entry_id(&mut self, last_entry_id: u64) { - self.leader_region_metadata_last_entry_id = Some(last_entry_id); + pub fn set_metadata_last_entry_id(&mut self, region_id: RegionId, last_entry_id: u64) { + self.leader_region_metadata_last_entry_ids + .insert(region_id, last_entry_id); } } @@ -317,7 +402,7 @@ impl DefaultContextFactory { impl ContextFactory for DefaultContextFactory { fn new_context(self, persistent_ctx: PersistentContext) -> Context { Context { - persistent_ctx: Arc::new(persistent_ctx), + persistent_ctx, volatile_ctx: self.volatile_ctx, in_memory: self.in_memory_key, table_metadata_manager: self.table_metadata_manager, @@ -332,7 +417,7 @@ impl ContextFactory for DefaultContextFactory { /// The context of procedure execution. pub struct Context { - persistent_ctx: Arc, + persistent_ctx: PersistentContext, volatile_ctx: VolatileContext, in_memory: KvBackendRef, table_metadata_manager: TableMetadataManagerRef, @@ -391,6 +476,47 @@ impl Context { &self.server_addr } + /// Returns the table ids of the regions. + pub fn region_table_ids(&self) -> Vec { + self.persistent_ctx + .region_ids + .iter() + .map(|region_id| region_id.table_id()) + .collect::>() + .into_iter() + .collect() + } + + /// Returns the `table_routes` of [VolatileContext] if any. + /// Otherwise, returns the value retrieved from remote. + /// + /// Retry: + /// - Failed to retrieve the metadata of table. + pub async fn get_table_route_values( + &self, + ) -> Result>> { + let table_ids = self.persistent_ctx.region_table_ids(); + let table_routes = self + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .batch_get_with_raw_bytes(&table_ids) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get table routes: {table_ids:?}"), + })?; + let table_routes = table_ids + .into_iter() + .zip(table_routes) + .filter_map(|(table_id, table_route)| { + table_route.map(|table_route| (table_id, table_route)) + }) + .collect::>(); + Ok(table_routes) + } + /// Returns the `table_route` of [VolatileContext] if any. /// Otherwise, returns the value retrieved from remote. 
/// @@ -398,9 +524,9 @@ impl Context { /// - Failed to retrieve the metadata of table. pub async fn get_table_route_value( &self, + table_id: TableId, ) -> Result> { - let table_id = self.persistent_ctx.region_id.table_id(); - let table_route = self + let table_route_value = self .table_metadata_manager .table_route_manager() .table_route_storage() @@ -409,11 +535,76 @@ impl Context { .context(error::TableMetadataManagerSnafu) .map_err(BoxedError::new) .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to get TableRoute: {table_id}"), + reason: format!("Failed to get table routes: {table_id:}"), })? .context(error::TableRouteNotFoundSnafu { table_id })?; + Ok(table_route_value) + } - Ok(table_route) + /// Returns the `from_peer_datanode_table_values` of [VolatileContext] if any. + /// Otherwise, returns the value retrieved from remote. + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. + pub async fn get_from_peer_datanode_table_values( + &mut self, + ) -> Result<&HashMap> { + let from_peer_datanode_table_values = + &mut self.volatile_ctx.from_peer_datanode_table_values; + if from_peer_datanode_table_values.is_none() { + let table_ids = self.persistent_ctx.region_table_ids(); + let datanode_table_keys = table_ids + .iter() + .map(|table_id| DatanodeTableKey { + datanode_id: self.persistent_ctx.from_peer.id, + table_id: *table_id, + }) + .collect::>(); + let datanode_table_values = self + .table_metadata_manager + .datanode_table_manager() + .batch_get(&datanode_table_keys) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get DatanodeTable: {table_ids:?}"), + })? + .into_iter() + .map(|(k, v)| (k.table_id, v)) + .collect(); + *from_peer_datanode_table_values = Some(datanode_table_values); + } + Ok(from_peer_datanode_table_values.as_ref().unwrap()) + } + + /// Returns the `from_peer_datanode_table_value` of [VolatileContext] if any. + /// Otherwise, returns the value retrieved from remote. + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. + pub async fn get_from_peer_datanode_table_value( + &self, + table_id: TableId, + ) -> Result { + let datanode_table_value = self + .table_metadata_manager + .datanode_table_manager() + .get(&DatanodeTableKey { + datanode_id: self.persistent_ctx.from_peer.id, + table_id, + }) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get DatanodeTable: {table_id}"), + })? + .context(error::DatanodeTableNotFoundSnafu { + table_id, + datanode_id: self.persistent_ctx.from_peer.id, + })?; + Ok(datanode_table_value) } /// Notifies the RegionSupervisor to register failure detectors of failed region. @@ -422,11 +613,18 @@ impl Context { /// Now, we need to register the failure detector for the failed region again. 
pub async fn register_failure_detectors(&self) { let datanode_id = self.persistent_ctx.from_peer.id; - let region_id = self.persistent_ctx.region_id; - + let region_ids = &self.persistent_ctx.region_ids; + let detecting_regions = region_ids + .iter() + .map(|region_id| (datanode_id, *region_id)) + .collect::>(); self.region_failure_detector_controller - .register_failure_detectors(vec![(datanode_id, region_id)]) + .register_failure_detectors(detecting_regions) .await; + info!( + "Registered failure detectors after migration failures for datanode {}, regions {:?}", + datanode_id, region_ids + ); } /// Notifies the RegionSupervisor to deregister failure detectors. @@ -435,10 +633,14 @@ impl Context { /// We need to deregister the failure detectors for the original region if the procedure is finished. pub async fn deregister_failure_detectors(&self) { let datanode_id = self.persistent_ctx.from_peer.id; - let region_id = self.persistent_ctx.region_id; + let region_ids = &self.persistent_ctx.region_ids; + let detecting_regions = region_ids + .iter() + .map(|region_id| (datanode_id, *region_id)) + .collect::>(); self.region_failure_detector_controller - .deregister_failure_detectors(vec![(datanode_id, region_id)]) + .deregister_failure_detectors(detecting_regions) .await; } @@ -448,112 +650,52 @@ impl Context { /// so we need to deregister the failure detectors for the candidate region if the procedure is aborted. pub async fn deregister_failure_detectors_for_candidate_region(&self) { let to_peer_id = self.persistent_ctx.to_peer.id; - let region_id = self.persistent_ctx.region_id; + let region_ids = &self.persistent_ctx.region_ids; + let detecting_regions = region_ids + .iter() + .map(|region_id| (to_peer_id, *region_id)) + .collect::>(); self.region_failure_detector_controller - .deregister_failure_detectors(vec![(to_peer_id, region_id)]) + .deregister_failure_detectors(detecting_regions) .await; } - /// Returns the `table_info` of [VolatileContext] if any. - /// Otherwise, returns the value retrieved from remote. - /// - /// Retry: - /// - Failed to retrieve the metadata of table. - pub async fn get_table_info_value( - &mut self, - ) -> Result<&DeserializedValueWithBytes> { - let table_info_value = &mut self.volatile_ctx.table_info; - - if table_info_value.is_none() { - let table_id = self.persistent_ctx.region_id.table_id(); - let table_info = self - .table_metadata_manager - .table_info_manager() - .get(table_id) - .await - .context(error::TableMetadataManagerSnafu) - .map_err(BoxedError::new) - .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to get TableInfo: {table_id}"), - })? - .context(error::TableInfoNotFoundSnafu { table_id })?; - - *table_info_value = Some(table_info); - } - - Ok(table_info_value.as_ref().unwrap()) - } - - /// Returns the `table_info` of [VolatileContext] if any. - /// Otherwise, returns the value retrieved from remote. - /// - /// Retry: - /// - Failed to retrieve the metadata of datanode. 
- pub async fn get_from_peer_datanode_table_value(&mut self) -> Result<&DatanodeTableValue> { - let datanode_value = &mut self.volatile_ctx.from_peer_datanode_table; - - if datanode_value.is_none() { - let table_id = self.persistent_ctx.region_id.table_id(); - let datanode_id = self.persistent_ctx.from_peer.id; - - let datanode_table = self - .table_metadata_manager - .datanode_table_manager() - .get(&DatanodeTableKey { - datanode_id, - table_id, - }) - .await - .context(error::TableMetadataManagerSnafu) - .map_err(BoxedError::new) - .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to get DatanodeTable: ({datanode_id},{table_id})"), - })? - .context(error::DatanodeTableNotFoundSnafu { - table_id, - datanode_id, - })?; - - *datanode_value = Some(datanode_table); - } - - Ok(datanode_value.as_ref().unwrap()) - } - - /// Fetches the replay checkpoint for the given topic. - pub async fn fetch_replay_checkpoint(&self, topic: &str) -> Result> { - let region_id = self.region_id(); - let topic_region_key = TopicRegionKey::new(region_id, topic); - let value = self + /// Fetches the replay checkpoints for the given topic region keys. + pub async fn get_replay_checkpoints( + &self, + topic_region_keys: Vec>, + ) -> Result> { + let topic_region_values = self .table_metadata_manager .topic_region_manager() - .get(topic_region_key) + .batch_get(topic_region_keys) .await .context(error::TableMetadataManagerSnafu)?; - Ok(value.and_then(|value| value.checkpoint)) - } + let replay_checkpoints = topic_region_values + .into_iter() + .flat_map(|(key, value)| value.checkpoint.map(|value| (key, value))) + .collect::>(); - /// Returns the [RegionId]. - pub fn region_id(&self) -> RegionId { - self.persistent_ctx.region_id + Ok(replay_checkpoints) } /// Broadcasts the invalidate table cache message. pub async fn invalidate_table_cache(&self) -> Result<()> { - let table_id = self.region_id().table_id(); + let table_ids = self.region_table_ids(); + let mut cache_idents = Vec::with_capacity(table_ids.len()); + for table_id in &table_ids { + cache_idents.push(CacheIdent::TableId(*table_id)); + } // ignore the result let ctx = common_meta::cache_invalidator::Context::default(); - let _ = self - .cache_invalidator - .invalidate(&ctx, &[CacheIdent::TableId(table_id)]) - .await; + let _ = self.cache_invalidator.invalidate(&ctx, &cache_idents).await; Ok(()) } /// Returns the [PersistentContext] of the procedure. 
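// Illustrative aside (not from this patch): a sketch of the checkpoint collection performed by
// `get_replay_checkpoints` above. Entries without a checkpoint are dropped and the rest are
// keyed by their topic-region key; the key and checkpoint types here are simplified stand-ins.
use std::collections::HashMap;

fn collect_checkpoints(values: Vec<(String, Option<u64>)>) -> HashMap<String, u64> {
    values
        .into_iter()
        .flat_map(|(key, checkpoint)| checkpoint.map(|c| (key, c)))
        .collect()
}

fn main() {
    let out = collect_checkpoints(vec![("topic/a".into(), Some(7)), ("topic/b".into(), None)]);
    assert_eq!(out.len(), 1);
    assert_eq!(out["topic/a"], 7);
}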
- pub fn persistent_ctx(&self) -> Arc { + pub fn persistent_ctx(&self) -> PersistentContext { self.persistent_ctx.clone() } } @@ -595,7 +737,7 @@ pub struct RegionMigrationData<'a> { pub(crate) struct RegionMigrationProcedure { state: Box, context: Context, - _guard: Option, + _guards: Vec, } impl RegionMigrationProcedure { @@ -604,22 +746,22 @@ impl RegionMigrationProcedure { pub fn new( persistent_context: PersistentContext, context_factory: impl ContextFactory, - guard: Option, + guards: Vec, ) -> Self { let state = Box::new(RegionMigrationStart {}); - Self::new_inner(state, persistent_context, context_factory, guard) + Self::new_inner(state, persistent_context, context_factory, guards) } fn new_inner( state: Box, persistent_context: PersistentContext, context_factory: impl ContextFactory, - guard: Option, + guards: Vec, ) -> Self { Self { state, context: context_factory.new_context(persistent_context), - _guard: guard, + _guards: guards, } } @@ -632,20 +774,26 @@ impl RegionMigrationProcedure { persistent_ctx, state, } = serde_json::from_str(json).context(FromJsonSnafu)?; + let guards = persistent_ctx + .region_ids + .iter() + .flat_map(|region_id| { + tracker.insert_running_procedure(&RegionMigrationProcedureTask { + region_id: *region_id, + from_peer: persistent_ctx.from_peer.clone(), + to_peer: persistent_ctx.to_peer.clone(), + timeout: persistent_ctx.timeout, + trigger_reason: persistent_ctx.trigger_reason, + }) + }) + .collect::>(); - let guard = tracker.insert_running_procedure(&RegionMigrationProcedureTask { - region_id: persistent_ctx.region_id, - from_peer: persistent_ctx.from_peer.clone(), - to_peer: persistent_ctx.to_peer.clone(), - timeout: persistent_ctx.timeout, - trigger_reason: persistent_ctx.trigger_reason, - }); let context = context_factory.new_context(persistent_ctx); Ok(Self { state, context, - _guard: guard, + _guards: guards, }) } @@ -653,27 +801,25 @@ impl RegionMigrationProcedure { let _timer = METRIC_META_REGION_MIGRATION_EXECUTE .with_label_values(&["rollback"]) .start_timer(); - - let table_id = self.context.region_id().table_id(); - let region_id = self.context.region_id(); - let table_metadata_manager = self.context.table_metadata_manager.clone(); - let table_route = self.context.get_table_route_value().await?; - - // Safety: It must be a physical table route. 
- let downgraded = table_route - .region_routes() - .unwrap() - .iter() - .filter(|route| route.region.id == region_id) - .any(|route| route.is_leader_downgrading()); - - if downgraded { - let table_lock = TableLock::Write(region_id.table_id()).into(); + let ctx = &self.context; + let table_regions = ctx.persistent_ctx.table_regions(); + for (table_id, regions) in table_regions { + let table_lock = TableLock::Write(table_id).into(); let _guard = procedure_ctx.provider.acquire_lock(&table_lock).await; - info!("Rollbacking downgraded region leader table route, region: {region_id}"); - table_metadata_manager + let table_route = ctx.get_table_route_value(table_id).await?; + let region_routes = table_route.region_routes().unwrap(); + let downgraded = region_routes + .iter() + .filter(|route| regions.contains(&route.region.id)) + .any(|route| route.is_leader_downgrading()); + if downgraded { + info!( + "Rollbacking downgraded region leader table route, table: {table_id}, regions: {regions:?}" + ); + let table_metadata_manager = &ctx.table_metadata_manager; + table_metadata_manager .update_leader_region_status(table_id, &table_route, |route| { - if route.region.id == region_id { + if regions.contains(&route.region.id) { Some(None) } else { None @@ -683,13 +829,13 @@ impl RegionMigrationProcedure { .context(error::TableMetadataManagerSnafu) .map_err(BoxedError::new) .with_context(|_| error::RetryLaterWithSourceSnafu { - reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), + reason: format!("Failed to update the table route during the rollback downgraded leader region: {regions:?}"), })?; - self.context - .deregister_failure_detectors_for_candidate_region() - .await; + } } - + self.context + .deregister_failure_detectors_for_candidate_region() + .await; self.context.register_failure_detectors().await; Ok(()) @@ -732,14 +878,14 @@ impl Procedure for RegionMigrationProcedure { Err(ProcedureError::retry_later(e)) } else { // Consumes the opening region guard before deregistering the failure detectors. 
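// For illustration only, not part of the diff: the rollback loop above iterates
// `ctx.persistent_ctx.table_regions()`, which is assumed to group the migrating regions by
// table id so that one table lock and one route update covers all regions of a table.
// A sketch of such a grouping, with RegionId simplified to a (table_id, region_number) pair:
use std::collections::HashMap;

type TableId = u32;
type RegionId = (TableId, u32);

fn table_regions(region_ids: &[RegionId]) -> HashMap<TableId, Vec<RegionId>> {
    let mut grouped: HashMap<TableId, Vec<RegionId>> = HashMap::new();
    for region_id in region_ids {
        grouped.entry(region_id.0).or_default().push(*region_id);
    }
    grouped
}

fn main() {
    let grouped = table_regions(&[(1024, 1), (1024, 2), (2048, 1)]);
    assert_eq!(grouped[&1024].len(), 2);
    assert_eq!(grouped[&2048].len(), 1);
}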
- self.context.volatile_ctx.opening_region_guard.take(); + self.context.volatile_ctx.opening_region_guards.clear(); self.context .deregister_failure_detectors_for_candidate_region() .await; error!( e; - "Region migration procedure failed, region_id: {}, from_peer: {}, to_peer: {}, {}", - self.context.region_id(), + "Region migration procedure failed, regions: {:?}, from_peer: {}, to_peer: {}, {}", + self.context.persistent_ctx.region_ids, self.context.persistent_ctx.from_peer, self.context.persistent_ctx.to_peer, self.context.volatile_ctx.metrics, @@ -766,7 +912,7 @@ impl Procedure for RegionMigrationProcedure { } fn user_metadata(&self) -> Option { - Some(UserMetadata::new(self.context.persistent_ctx())) + Some(UserMetadata::new(Arc::new(self.context.persistent_ctx()))) } } @@ -780,7 +926,6 @@ mod tests { use common_meta::key::test_utils::new_test_table_info; use common_meta::rpc::router::{Region, RegionRoute}; - use super::update_metadata::UpdateMetadata; use super::*; use crate::handler::HeartbeatMailbox; use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion; @@ -803,7 +948,7 @@ mod tests { let env = TestingEnv::new(); let context = env.context_factory(); - let procedure = RegionMigrationProcedure::new(persistent_context, context, None); + let procedure = RegionMigrationProcedure::new(persistent_context, context, vec![]); let key = procedure.lock_key(); let keys = key.keys_to_lock().cloned().collect::>(); @@ -820,16 +965,27 @@ mod tests { let env = TestingEnv::new(); let context = env.context_factory(); - let procedure = RegionMigrationProcedure::new(persistent_context, context, None); + let procedure = RegionMigrationProcedure::new(persistent_context, context, vec![]); let serialized = procedure.dump().unwrap(); - let expected = r#"{"persistent_ctx":{"catalog":"greptime","schema":"public","from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105,"timeout":"10s","trigger_reason":"Unknown"},"state":{"region_migration_state":"RegionMigrationStart"}}"#; + let expected = r#"{"persistent_ctx":{"catalog_and_schema":[["greptime","public"]],"from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_ids":[4398046511105],"timeout":"10s","trigger_reason":"Unknown"},"state":{"region_migration_state":"RegionMigrationStart"}}"#; assert_eq!(expected, serialized); } #[test] fn test_backward_compatibility() { - let persistent_ctx = test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)); + let persistent_ctx = PersistentContext { + #[allow(deprecated)] + catalog: Some("greptime".into()), + #[allow(deprecated)] + schema: Some("public".into()), + catalog_and_schema: vec![], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + region_ids: vec![RegionId::new(1024, 1)], + timeout: Duration::from_secs(10), + trigger_reason: RegionMigrationTriggerReason::default(), + }; // NOTES: Changes it will break backward compatibility. 
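// For illustration only, not part of the diff: the old payload below carries a single
// `region_id` while the new context stores `region_ids`. One possible way to accept both
// shapes (shown purely as an assumption; PersistentContext may use a different mechanism)
// is to keep the legacy field optional and fall back to it. Assumes serde and serde_json
// as dependencies.
use serde::Deserialize;

#[derive(Deserialize)]
struct Compat {
    #[serde(default)]
    region_ids: Vec<u64>,
    #[serde(default)]
    region_id: Option<u64>,
}

impl Compat {
    fn effective_region_ids(&self) -> Vec<u64> {
        if !self.region_ids.is_empty() {
            self.region_ids.clone()
        } else {
            self.region_id.into_iter().collect()
        }
    }
}

fn main() {
    let old: Compat = serde_json::from_str(r#"{"region_id": 4398046511105}"#).unwrap();
    assert_eq!(old.effective_region_ids(), vec![4398046511105]);
    let new: Compat = serde_json::from_str(r#"{"region_ids": [4398046511105]}"#).unwrap();
    assert_eq!(new.effective_region_ids(), vec![4398046511105]);
}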
let serialized = r#"{"catalog":"greptime","schema":"public","from_peer":{"id":1,"addr":""},"to_peer":{"id":2,"addr":""},"region_id":4398046511105}"#; let deserialized: PersistentContext = serde_json::from_str(serialized).unwrap(); @@ -864,7 +1020,7 @@ mod tests { let persistent_context = new_persistent_context(); let context_factory = env.context_factory(); let state = Box::::default(); - RegionMigrationProcedure::new_inner(state, persistent_context, context_factory, None) + RegionMigrationProcedure::new_inner(state, persistent_context, context_factory, vec![]) } let ctx = TestingEnv::procedure_context(); @@ -887,7 +1043,9 @@ mod tests { let mut procedure = RegionMigrationProcedure::from_json(&serialized, context_factory, tracker.clone()) .unwrap(); - assert!(tracker.contains(procedure.context.persistent_ctx.region_id)); + for region_id in &procedure.context.persistent_ctx.region_ids { + assert!(tracker.contains(*region_id)); + } for _ in 1..3 { status = Some(procedure.execute(&ctx).await.unwrap()); @@ -927,9 +1085,34 @@ mod tests { vec![ // MigrationStart Step::next( - "Should be the update metadata for downgrading", + "Should be the open candidate region", None, - Assertion::simple(assert_update_metadata_downgrade, assert_need_persist), + Assertion::simple(assert_open_candidate_region, assert_need_persist), + ), + // OpenCandidateRegion + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + to_peer_id, + Arc::new(|id| Ok(new_open_region_reply(id, true, None))), + )), + Assertion::simple(assert_flush_leader_region, assert_no_persist), + ), + // Flush Leader Region + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + from_peer_id, + Arc::new(move |id| { + Ok(new_flush_region_reply_for_region( + id, + RegionId::new(1024, 1), + true, + None, + )) + }), + )), + Assertion::simple(assert_update_metadata_downgrade, assert_no_persist), ), // UpdateMetadata::Downgrade Step::next( @@ -988,7 +1171,7 @@ mod tests { let to_peer_id = persistent_context.to_peer.id; let from_peer = persistent_context.from_peer.clone(); let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), @@ -1015,61 +1198,6 @@ mod tests { runner.suite.verify_table_metadata().await; } - #[tokio::test] - async fn test_procedure_flow_idempotent() { - common_telemetry::init_default_ut_logging(); - - let persistent_context = test_util::new_persistent_context(1, 2, RegionId::new(1024, 1)); - let state = Box::new(RegionMigrationStart); - - // The table metadata. 
- let from_peer_id = persistent_context.from_peer.id; - let to_peer_id = persistent_context.to_peer.id; - let from_peer = persistent_context.from_peer.clone(); - let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; - let table_info = new_test_table_info(1024, vec![1]).into(); - let region_routes = vec![RegionRoute { - region: Region::new_test(region_id), - leader_peer: Some(from_peer), - follower_peers: vec![to_peer], - ..Default::default() - }]; - - let suite = ProcedureMigrationTestSuite::new(persistent_context, state); - suite.init_table_metadata(table_info, region_routes).await; - - let steps = procedure_flow_steps(from_peer_id, to_peer_id); - let setup_to_latest_persisted_state = Step::setup( - "Sets state to UpdateMetadata::Downgrade", - merge_before_test_fn(vec![ - setup_state(Arc::new(|| Box::new(UpdateMetadata::Downgrade))), - Arc::new(reset_volatile_ctx), - ]), - ); - - let steps = [ - steps.clone(), - vec![setup_to_latest_persisted_state.clone()], - steps.clone()[1..].to_vec(), - vec![setup_to_latest_persisted_state], - steps.clone()[1..].to_vec(), - ] - .concat(); - let timer = Instant::now(); - - // Run the table tests. - let runner = ProcedureMigrationSuiteRunner::new(suite) - .steps(steps.clone()) - .run_once() - .await; - - // Ensure it didn't run into the slow path. - assert!(timer.elapsed().as_secs() < REGION_LEASE_SECS / 2); - - runner.suite.verify_table_metadata().await; - } - #[tokio::test] async fn test_procedure_flow_open_candidate_region_retryable_error() { common_telemetry::init_default_ut_logging(); @@ -1080,7 +1208,7 @@ mod tests { // The table metadata. let to_peer_id = persistent_context.to_peer.id; let from_peer = persistent_context.from_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), @@ -1168,13 +1296,12 @@ mod tests { let from_peer_id = persistent_context.from_peer.id; let to_peer_id = persistent_context.to_peer.id; let from_peer = persistent_context.from_peer.clone(); - let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), leader_peer: Some(from_peer), - follower_peers: vec![to_peer], + follower_peers: vec![], ..Default::default() }]; @@ -1184,9 +1311,34 @@ mod tests { let steps = vec![ // MigrationStart Step::next( - "Should be the update metadata for downgrading", + "Should be the open candidate region", None, - Assertion::simple(assert_update_metadata_downgrade, assert_need_persist), + Assertion::simple(assert_open_candidate_region, assert_need_persist), + ), + // OpenCandidateRegion + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + to_peer_id, + Arc::new(|id| Ok(new_open_region_reply(id, true, None))), + )), + Assertion::simple(assert_flush_leader_region, assert_no_persist), + ), + // Flush Leader Region + Step::next( + "Should be the flush leader region", + Some(mock_datanode_reply( + from_peer_id, + Arc::new(move |id| { + Ok(new_flush_region_reply_for_region( + id, + RegionId::new(1024, 1), + true, + None, + )) + }), + )), + Assertion::simple(assert_update_metadata_downgrade, assert_no_persist), ), // UpdateMetadata::Downgrade Step::next( @@ -1230,9 
+1382,9 @@ mod tests { ]; let setup_to_latest_persisted_state = Step::setup( - "Sets state to UpdateMetadata::Downgrade", + "Sets state to OpenCandidateRegion", merge_before_test_fn(vec![ - setup_state(Arc::new(|| Box::new(UpdateMetadata::Downgrade))), + setup_state(Arc::new(|| Box::new(OpenCandidateRegion))), Arc::new(reset_volatile_ctx), ]), ); @@ -1264,7 +1416,7 @@ mod tests { let to_peer_id = persistent_context.to_peer.id; let from_peer_id = persistent_context.from_peer.id; let from_peer = persistent_context.from_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { region: Region::new_test(region_id), diff --git a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs index 5a8beb7ca4..c20c7fede2 100644 --- a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs +++ b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs @@ -19,7 +19,6 @@ use api::v1::meta::MailboxMessage; use common_meta::RegionIdent; use common_meta::distributed_time_constants::REGION_LEASE_SECS; use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; -use common_meta::key::datanode_table::RegionInfo; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::{info, warn}; use serde::{Deserialize, Serialize}; @@ -47,12 +46,12 @@ impl State for CloseDowngradedRegion { ) -> Result<(Box, Status)> { if let Err(err) = self.close_downgraded_leader_region(ctx).await { let downgrade_leader_datanode = &ctx.persistent_ctx.from_peer; - let region_id = ctx.region_id(); - warn!(err; "Failed to close downgraded leader region: {region_id} on datanode {:?}", downgrade_leader_datanode); + let region_ids = &ctx.persistent_ctx.region_ids; + warn!(err; "Failed to close downgraded leader regions: {region_ids:?} on datanode {:?}", downgrade_leader_datanode); } info!( - "Region migration is finished: region_id: {}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", - ctx.region_id(), + "Region migration is finished: regions: {:?}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", + ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer, ctx.persistent_ctx.to_peer, ctx.persistent_ctx.trigger_reason, @@ -74,28 +73,30 @@ impl CloseDowngradedRegion { async fn build_close_region_instruction(&self, ctx: &mut Context) -> Result { let pc = &ctx.persistent_ctx; let downgrade_leader_datanode_id = pc.from_peer.id; - let table_id = pc.region_id.table_id(); - let region_number = pc.region_id.region_number(); - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?; + let region_ids = &ctx.persistent_ctx.region_ids; + let mut idents = Vec::with_capacity(region_ids.len()); - let RegionInfo { engine, .. } = datanode_table_value.region_info.clone(); + for region_id in region_ids { + idents.push(RegionIdent { + datanode_id: downgrade_leader_datanode_id, + table_id: region_id.table_id(), + region_number: region_id.region_number(), + // The `engine` field is not used for closing region. + engine: String::new(), + }); + } - Ok(Instruction::CloseRegions(vec![RegionIdent { - datanode_id: downgrade_leader_datanode_id, - table_id, - region_number, - engine, - }])) + Ok(Instruction::CloseRegions(idents)) } /// Closes the downgraded leader region. 
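// Illustrative aside (not from this patch): a sketch of the batched close instruction built by
// `build_close_region_instruction` above, producing one ident per region and leaving the
// engine empty because it is not needed for closing. The struct here is a simplified
// stand-in for the real RegionIdent.
struct RegionIdent {
    datanode_id: u64,
    table_id: u32,
    region_number: u32,
    engine: String,
}

fn close_idents(datanode_id: u64, regions: &[(u32, u32)]) -> Vec<RegionIdent> {
    regions
        .iter()
        .map(|(table_id, region_number)| RegionIdent {
            datanode_id,
            table_id: *table_id,
            region_number: *region_number,
            engine: String::new(),
        })
        .collect()
}

fn main() {
    let idents = close_idents(1, &[(1024, 1), (1024, 2)]);
    assert_eq!(idents.len(), 2);
    assert_eq!(idents[0].datanode_id, 1);
    assert_eq!(idents[1].table_id, 1024);
    assert_eq!(idents[1].region_number, 2);
    assert!(idents.iter().all(|i| i.engine.is_empty()));
}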
async fn close_downgraded_leader_region(&self, ctx: &mut Context) -> Result<()> { let close_instruction = self.build_close_region_instruction(ctx).await?; - let region_id = ctx.region_id(); + let region_ids = &ctx.persistent_ctx.region_ids; let pc = &ctx.persistent_ctx; let downgrade_leader_datanode = &pc.from_peer; let msg = MailboxMessage::json_message( - &format!("Close downgraded region: {}", region_id), + &format!("Close downgraded regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!( "Datanode-{}@{}", @@ -118,8 +119,8 @@ impl CloseDowngradedRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received close downgraded leade region reply: {:?}, region: {}", - reply, region_id + "Received close downgraded leade region reply: {:?}, region: {:?}", + reply, region_ids ); let InstructionReply::CloseRegions(SimpleReply { result, error }) = reply else { return error::UnexpectedInstructionReplySnafu { @@ -134,7 +135,7 @@ impl CloseDowngradedRegion { } else { error::UnexpectedSnafu { violated: format!( - "Failed to close downgraded leader region: {region_id} on datanode {:?}, error: {error:?}", + "Failed to close downgraded leader region: {region_ids:?} on datanode {:?}, error: {error:?}", downgrade_leader_datanode, ), } diff --git a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs index fb4065748c..d10220098f 100644 --- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs @@ -22,7 +22,7 @@ use common_meta::instruction::{ DowngradeRegion, DowngradeRegionReply, DowngradeRegionsReply, Instruction, InstructionReply, }; use common_procedure::{Context as ProcedureContext, Status}; -use common_telemetry::{error, info, warn}; +use common_telemetry::{debug, error, info, warn}; use common_time::util::current_time_millis; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; @@ -70,30 +70,30 @@ impl State for DowngradeLeaderRegion { Ok(_) => { // Do nothing info!( - "Downgraded region leader success, region: {}", - ctx.persistent_ctx.region_id + "Downgraded region leader success, region: {:?}", + ctx.persistent_ctx.region_ids ); } Err(error::Error::ExceededDeadline { .. 
}) => { info!( - "Downgrade region leader exceeded deadline, region: {}", - ctx.persistent_ctx.region_id + "Downgrade region leader exceeded deadline, region: {:?}", + ctx.persistent_ctx.region_ids ); // Rollbacks the metadata if procedure is timeout return Ok((Box::new(UpdateMetadata::Rollback), Status::executing(false))); } Err(err) => { - error!(err; "Occurs non-retryable error, region: {}", ctx.persistent_ctx.region_id); + error!(err; "Occurs non-retryable error, region: {:?}", ctx.persistent_ctx.region_ids); if let Some(deadline) = ctx.volatile_ctx.leader_region_lease_deadline.as_ref() { info!( - "Running into the downgrade region leader slow path, region: {}, sleep until {:?}", - ctx.persistent_ctx.region_id, deadline + "Running into the downgrade region leader slow path, region: {:?}, sleep until {:?}", + ctx.persistent_ctx.region_ids, deadline ); tokio::time::sleep_until(*deadline).await; } else { warn!( - "Leader region lease deadline is not set, region: {}", - ctx.persistent_ctx.region_id + "Leader region lease deadline is not set, region: {:?}", + ctx.persistent_ctx.region_ids ); } } @@ -118,12 +118,76 @@ impl DowngradeLeaderRegion { ctx: &Context, flush_timeout: Duration, ) -> Instruction { - let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; - Instruction::DowngradeRegions(vec![DowngradeRegion { + let region_ids = &ctx.persistent_ctx.region_ids; + let mut downgrade_regions = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + downgrade_regions.push(DowngradeRegion { + region_id: *region_id, + flush_timeout: Some(flush_timeout), + }); + } + + Instruction::DowngradeRegions(downgrade_regions) + } + + fn handle_downgrade_region_reply( + &self, + ctx: &mut Context, + reply: &DowngradeRegionReply, + now: &Instant, + ) -> Result<()> { + let leader = &ctx.persistent_ctx.from_peer; + let DowngradeRegionReply { region_id, - flush_timeout: Some(flush_timeout), - }]) + last_entry_id, + metadata_last_entry_id, + exists, + error, + } = reply; + + if error.is_some() { + return error::RetryLaterSnafu { + reason: format!( + "Failed to downgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}", + region_id, leader, error, now.elapsed() + ), + } + .fail(); + } + + if !exists { + warn!( + "Trying to downgrade the region {} on datanode {:?}, but region doesn't exist!, elapsed: {:?}", + region_id, + leader, + now.elapsed() + ); + } else { + info!( + "Region {} leader is downgraded on datanode {:?}, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}", + region_id, + leader, + last_entry_id, + metadata_last_entry_id, + now.elapsed() + ); + } + + if let Some(last_entry_id) = last_entry_id { + debug!( + "set last_entry_id: {:?}, region_id: {:?}", + last_entry_id, region_id + ); + ctx.volatile_ctx + .set_last_entry_id(*region_id, *last_entry_id); + } + + if let Some(metadata_last_entry_id) = metadata_last_entry_id { + ctx.volatile_ctx + .set_metadata_last_entry_id(*region_id, *metadata_last_entry_id); + } + + Ok(()) } /// Tries to downgrade a leader region. @@ -140,7 +204,7 @@ impl DowngradeLeaderRegion { /// - [ExceededDeadline](error::Error::ExceededDeadline) /// - Invalid JSON. 
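// Illustrative aside (not from this patch): a sketch of the per-reply bookkeeping done by
// `handle_downgrade_region_reply` above. A reply carrying an error turns into a retry,
// while successful replies record their last entry id keyed by region. The reply and id
// types below are stand-ins for the workspace types.
use std::collections::HashMap;

struct Reply {
    region_id: u64,
    last_entry_id: Option<u64>,
    error: Option<String>,
}

fn apply_replies(replies: &[Reply], last_entry_ids: &mut HashMap<u64, u64>) -> Result<(), String> {
    for reply in replies {
        if let Some(err) = &reply.error {
            // The real code maps this onto a retry-later error instead.
            return Err(format!("region {} failed: {err}", reply.region_id));
        }
        if let Some(last_entry_id) = reply.last_entry_id {
            last_entry_ids.insert(reply.region_id, last_entry_id);
        }
    }
    Ok(())
}

fn main() {
    let mut ids = HashMap::new();
    let replies = vec![Reply {
        region_id: 1,
        last_entry_id: Some(42),
        error: None,
    }];
    apply_replies(&replies, &mut ids).unwrap();
    assert_eq!(ids[&1], 42);
}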
async fn downgrade_region(&self, ctx: &mut Context) -> Result<()> { - let region_id = ctx.persistent_ctx.region_id; + let region_ids = &ctx.persistent_ctx.region_ids; let operation_timeout = ctx.next_operation_timeout() .context(error::ExceededDeadlineSnafu { @@ -150,7 +214,7 @@ impl DowngradeLeaderRegion { let leader = &ctx.persistent_ctx.from_peer; let msg = MailboxMessage::json_message( - &format!("Downgrade leader region: {}", region_id), + &format!("Downgrade leader regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", leader.id, leader.addr), common_time::util::current_time_millis(), @@ -168,9 +232,9 @@ impl DowngradeLeaderRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received downgrade region reply: {:?}, region: {}, elapsed: {:?}", + "Received downgrade region reply: {:?}, region: {:?}, elapsed: {:?}", reply, - region_id, + region_ids, now.elapsed() ); let InstructionReply::DowngradeRegions(DowngradeRegionsReply { replies }) = reply @@ -182,57 +246,14 @@ impl DowngradeLeaderRegion { .fail(); }; - // TODO(weny): handle multiple replies. - let DowngradeRegionReply { - region_id, - last_entry_id, - metadata_last_entry_id, - exists, - error, - } = &replies[0]; - - if error.is_some() { - return error::RetryLaterSnafu { - reason: format!( - "Failed to downgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}", - region_id, leader, error, now.elapsed() - ), - } - .fail(); + for reply in replies { + self.handle_downgrade_region_reply(ctx, &reply, &now)?; } - - if !exists { - warn!( - "Trying to downgrade the region {} on datanode {:?}, but region doesn't exist!, elapsed: {:?}", - region_id, - leader, - now.elapsed() - ); - } else { - info!( - "Region {} leader is downgraded on datanode {:?}, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}", - region_id, - leader, - last_entry_id, - metadata_last_entry_id, - now.elapsed() - ); - } - - if let Some(last_entry_id) = last_entry_id { - ctx.volatile_ctx.set_last_entry_id(*last_entry_id); - } - - if let Some(metadata_last_entry_id) = metadata_last_entry_id { - ctx.volatile_ctx - .set_metadata_last_entry_id(*metadata_last_entry_id); - } - Ok(()) } Err(error::Error::MailboxTimeout { .. 
}) => { let reason = format!( - "Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}, elapsed: {:?}", + "Mailbox received timeout for downgrade leader region {region_ids:?} on datanode {:?}, elapsed: {:?}", leader, now.elapsed() ); @@ -248,7 +269,7 @@ impl DowngradeLeaderRegion { let last_connection_at = match find_datanode_lease_value(&ctx.in_memory, leader.id).await { Ok(lease_value) => lease_value.map(|lease_value| lease_value.timestamp_millis), Err(err) => { - error!(err; "Failed to find datanode lease value for datanode: {}, during region migration, region: {}", leader, ctx.persistent_ctx.region_id); + error!(err; "Failed to find datanode lease value for datanode: {}, during region migration, region: {:?}", leader, ctx.persistent_ctx.region_ids); return; } }; @@ -266,8 +287,8 @@ impl DowngradeLeaderRegion { if elapsed >= (REGION_LEASE_SECS * 1000) as i64 { ctx.volatile_ctx.reset_leader_region_lease_deadline(); info!( - "Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {}", - leader, last_connection_at, region_lease, ctx.persistent_ctx.region_id + "Datanode {}({}) has been disconnected for longer than the region lease period ({:?}), reset leader region lease deadline to None, region: {:?}", + leader, last_connection_at, region_lease, ctx.persistent_ctx.region_ids ); } else if elapsed > 0 { // `now - last_connection_at` < REGION_LEASE_SECS * 1000 @@ -277,23 +298,23 @@ impl DowngradeLeaderRegion { ctx.volatile_ctx .set_leader_region_lease_deadline(lease_timeout); info!( - "Datanode {}({}) last connected {:?} ago, updated leader region lease deadline to {:?}, region: {}", + "Datanode {}({}) last connected {:?} ago, updated leader region lease deadline to {:?}, region: {:?}", leader, last_connection_at, elapsed, ctx.volatile_ctx.leader_region_lease_deadline, - ctx.persistent_ctx.region_id + ctx.persistent_ctx.region_ids ); } else { warn!( - "Datanode {} has invalid last connection timestamp: {} (which is after current time: {}), region: {}", - leader, last_connection_at, now, ctx.persistent_ctx.region_id + "Datanode {} has invalid last connection timestamp: {} (which is after current time: {}), region: {:?}", + leader, last_connection_at, now, ctx.persistent_ctx.region_ids ) } } else { warn!( - "Failed to find last connection time for datanode {}, unable to update region lease deadline, region: {}", - leader, ctx.persistent_ctx.region_id + "Failed to find last connection time for datanode {}, unable to update region lease deadline, region: {:?}", + leader, ctx.persistent_ctx.region_ids ) } } @@ -318,19 +339,20 @@ impl DowngradeLeaderRegion { retry += 1; // Throws the error immediately if the procedure exceeded the deadline. if matches!(err, error::Error::ExceededDeadline { .. }) { - error!(err; "Failed to downgrade region leader, region: {}, exceeded deadline", ctx.persistent_ctx.region_id); + error!(err; "Failed to downgrade region leader, regions: {:?}, exceeded deadline", ctx.persistent_ctx.region_ids); return Err(err); } else if matches!(err, error::Error::PusherNotFound { .. }) { // Throws the error immediately if the datanode is unreachable. 
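// Illustrative aside (not from this patch): a sketch of the bounded optimistic retry used by
// `downgrade_region_with_retry` above. Retryable errors are retried until the attempt budget
// runs out; anything else is returned immediately. The closure and error types are
// stand-ins, and the sleep between attempts is elided.
use std::time::Duration;

fn retry_with_budget<T, E>(
    mut op: impl FnMut() -> Result<T, E>,
    is_retryable: impl Fn(&E) -> bool,
    budget: usize,
    _interval: Duration,
) -> Result<T, E> {
    let mut attempts = 0;
    loop {
        match op() {
            Ok(value) => return Ok(value),
            Err(err) => {
                if is_retryable(&err) && attempts < budget {
                    attempts += 1;
                    // The real code sleeps for `retry_initial_interval` here.
                } else {
                    return Err(err);
                }
            }
        }
    }
}

fn main() {
    let mut calls = 0;
    let out = retry_with_budget(
        || {
            calls += 1;
            if calls < 3 { Err("retryable") } else { Ok(calls) }
        },
        |_| true,
        5,
        Duration::from_millis(10),
    );
    assert_eq!(out, Ok(3));
}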
- error!(err; "Failed to downgrade region leader, region: {}, datanode({}) is unreachable(PusherNotFound)", ctx.persistent_ctx.region_id, ctx.persistent_ctx.from_peer.id); + error!(err; "Failed to downgrade region leader, regions: {:?}, datanode({}) is unreachable(PusherNotFound)", ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer.id); self.update_leader_region_lease_deadline(ctx).await; return Err(err); } else if err.is_retryable() && retry < self.optimistic_retry { - error!(err; "Failed to downgrade region leader, region: {}, retry later", ctx.persistent_ctx.region_id); + error!(err; "Failed to downgrade region leader, regions: {:?}, retry later", ctx.persistent_ctx.region_ids); sleep(self.retry_initial_interval).await; } else { return Err(BoxedError::new(err)).context(error::DowngradeLeaderSnafu { - region_id: ctx.persistent_ctx.region_id, + // TODO(weny): handle multiple regions. + region_id: ctx.persistent_ctx.region_ids[0], })?; } } else { @@ -367,22 +389,21 @@ mod tests { }; fn new_persistent_context() -> PersistentContext { - PersistentContext { - catalog: "greptime".into(), - schema: "public".into(), - from_peer: Peer::empty(1), - to_peer: Peer::empty(2), - region_id: RegionId::new(1024, 1), - timeout: Duration::from_millis(1000), - trigger_reason: RegionMigrationTriggerReason::Manual, - } + PersistentContext::new( + vec![("greptime".into(), "public".into())], + Peer::empty(1), + Peer::empty(2), + vec![RegionId::new(1024, 1)], + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + ) } async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap) { - let table_info = - new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into(); + let region_id = ctx.persistent_ctx.region_ids[0]; + let table_info = new_test_table_info(region_id.table_id(), vec![1]).into(); let region_routes = vec![RegionRoute { - region: Region::new_test(ctx.persistent_ctx.region_id), + region: Region::new_test(region_id), leader_peer: Some(ctx.persistent_ctx.from_peer.clone()), follower_peers: vec![ctx.persistent_ctx.to_peer.clone()], ..Default::default() @@ -590,7 +611,13 @@ mod tests { }); state.downgrade_region_with_retry(&mut ctx).await.unwrap(); - assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, Some(1)); + assert_eq!( + ctx.volatile_ctx + .leader_region_last_entry_ids + .get(&RegionId::new(0, 0)) + .cloned(), + Some(1) + ); assert!(ctx.volatile_ctx.leader_region_lease_deadline.is_none()); } @@ -636,7 +663,7 @@ mod tests { .await .unwrap_err(); assert_matches!(err, error::Error::DowngradeLeader { .. }); - assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, None); + // assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, None); // Should remain no change. 
assert_eq!( ctx.volatile_ctx.leader_region_lease_deadline.unwrap(), @@ -671,7 +698,13 @@ mod tests { let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap(); let elapsed = timer.elapsed().as_secs(); assert!(elapsed < REGION_LEASE_SECS / 2); - assert_eq!(ctx.volatile_ctx.leader_region_last_entry_id, Some(1)); + assert_eq!( + ctx.volatile_ctx + .leader_region_last_entry_ids + .get(&RegionId::new(0, 0)) + .cloned(), + Some(1) + ); assert!(ctx.volatile_ctx.leader_region_lease_deadline.is_none()); let _ = next diff --git a/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs b/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs index b5cc1a955c..f9e5900cbb 100644 --- a/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/flush_leader_region.rs @@ -15,7 +15,7 @@ use std::any::Any; use api::v1::meta::MailboxMessage; -use common_meta::instruction::{FlushRegions, Instruction, InstructionReply}; +use common_meta::instruction::{FlushErrorStrategy, FlushRegions, Instruction, InstructionReply}; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::{info, warn}; use serde::{Deserialize, Serialize}; @@ -64,8 +64,10 @@ impl PreFlushRegion { /// Builds flush leader region instruction. fn build_flush_leader_region_instruction(&self, ctx: &Context) -> Instruction { let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; - Instruction::FlushRegions(FlushRegions::sync_single(region_id)) + Instruction::FlushRegions(FlushRegions::sync_batch( + pc.region_ids.clone(), + FlushErrorStrategy::TryAll, + )) } /// Tries to flush a leader region. @@ -88,11 +90,11 @@ impl PreFlushRegion { operation: "Flush leader region", })?; let flush_instruction = self.build_flush_leader_region_instruction(ctx); - let region_id = ctx.persistent_ctx.region_id; + let region_ids = &ctx.persistent_ctx.region_ids; let leader = &ctx.persistent_ctx.from_peer; let msg = MailboxMessage::json_message( - &format!("Flush leader region: {}", region_id), + &format!("Flush leader region: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", leader.id, leader.addr), common_time::util::current_time_millis(), @@ -111,32 +113,42 @@ impl PreFlushRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received flush leader region reply: {:?}, region: {}, elapsed: {:?}", + "Received flush leader region reply: {:?}, region: {:?}, elapsed: {:?}", reply, - region_id, + region_ids, now.elapsed() ); let reply_result = match reply { InstructionReply::FlushRegions(flush_reply) => { - if flush_reply.results.len() != 1 { + if flush_reply.results.len() != region_ids.len() { return error::UnexpectedInstructionReplySnafu { mailbox_message: msg.to_string(), - reason: "expect single region flush result", + reason: format!( + "expect {} region flush result, but got {}", + region_ids.len(), + flush_reply.results.len() + ), } .fail(); } - let (reply_region_id, result) = &flush_reply.results[0]; - if *reply_region_id != region_id { - return error::UnexpectedInstructionReplySnafu { - mailbox_message: msg.to_string(), - reason: "flush reply region ID mismatch", - } - .fail(); - } - match result { - Ok(()) => (true, None), - Err(err) => (false, Some(err.clone())), + + match flush_reply.overall_success { + true => (true, None), + false => ( + false, + Some( + flush_reply + .results + .iter() + .filter_map(|(region_id, result)| match result { + Ok(_) => None, + Err(e) => 
Some(format!("{}: {}", region_id, e)), + }) + .collect::>() + .join("; "), + ), + ), } } _ => { @@ -149,15 +161,15 @@ impl PreFlushRegion { }; let (result, error) = reply_result; - if error.is_some() { + if let Some(error) = error { warn!( - "Failed to flush leader region {} on datanode {:?}, error: {:?}. Skip flush operation.", - region_id, leader, error + "Failed to flush leader regions {:?} on datanode {:?}, error: {}. Skip flush operation.", + region_ids, leader, &error ); } else if result { info!( - "The flush leader region {} on datanode {:?} is successful, elapsed: {:?}", - region_id, + "The flush leader regions {:?} on datanode {:?} is successful, elapsed: {:?}", + region_ids, leader, now.elapsed() ); @@ -166,15 +178,15 @@ impl PreFlushRegion { Ok(()) } Err(Error::MailboxTimeout { .. }) => error::ExceededDeadlineSnafu { - operation: "Flush leader region", + operation: "Flush leader regions", } .fail(), Err(err) => Err(err), }, Err(Error::PusherNotFound { .. }) => { warn!( - "Failed to flush leader region({}), the datanode({}) is unreachable(PusherNotFound). Skip flush operation.", - region_id, leader + "Failed to flush leader regions({:?}), the datanode({}) is unreachable(PusherNotFound). Skip flush operation.", + region_ids, leader ); Ok(()) } @@ -268,7 +280,7 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let mut env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); let mailbox_ctx = env.mailbox_context(); @@ -297,7 +309,7 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let mut env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); let mailbox_ctx = env.mailbox_context(); diff --git a/src/meta-srv/src/procedure/region_migration/manager.rs b/src/meta-srv/src/procedure/region_migration/manager.rs index 563b0f290d..fcd8f7a6e6 100644 --- a/src/meta-srv/src/procedure/region_migration/manager.rs +++ b/src/meta-srv/src/procedure/region_migration/manager.rs @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; use std::fmt::Display; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -31,6 +31,9 @@ use table::table_name::TableName; use crate::error::{self, Result}; use crate::metrics::{METRIC_META_REGION_MIGRATION_DATANODES, METRIC_META_REGION_MIGRATION_FAIL}; +use crate::procedure::region_migration::utils::{ + RegionMigrationAnalysis, RegionMigrationTaskBatch, analyze_region_migration_task, +}; use crate::procedure::region_migration::{ DefaultContextFactory, PersistentContext, RegionMigrationProcedure, }; @@ -99,6 +102,7 @@ impl Drop for RegionMigrationProcedureGuard { } } +/// A task of region migration procedure. #[derive(Debug, Clone)] pub struct RegionMigrationProcedureTask { pub(crate) region_id: RegionId, @@ -151,6 +155,25 @@ impl Display for RegionMigrationProcedureTask { } } +/// The result of submitting a region migration task. 
+#[derive(Debug, Default, PartialEq, Eq)] +pub struct SubmitRegionMigrationTaskResult { + /// Regions already migrated to the `to_peer`. + pub migrated: Vec, + /// Regions where the leader peer has changed. + pub leader_changed: Vec, + /// Regions where `to_peer` is already a follower (conflict). + pub peer_conflict: Vec, + /// Regions whose table is not found. + pub table_not_found: Vec, + /// Regions still pending migration. + pub migrating: Vec, + /// Regions that have been submitted for migration. + pub submitted: Vec, + /// The procedure id of the region migration procedure. + pub procedure_id: Option, +} + impl RegionMigrationManager { /// Returns new [`RegionMigrationManager`] pub(crate) fn new( @@ -332,6 +355,168 @@ impl RegionMigrationManager { Ok(()) } + /// Extracts regions from the migration task that are already running migration procedures. + /// + /// Returns a tuple containing those region ids that are already running and the newly created procedure guards. + /// The regions that are already running will be removed from the [`RegionMigrationTask`]. + fn extract_running_regions( + &self, + task: &mut RegionMigrationTaskBatch, + ) -> (Vec, Vec) { + let mut migrating_region_ids = Vec::new(); + let mut procedure_guards = Vec::with_capacity(task.region_ids.len()); + + for region_id in &task.region_ids { + let Some(guard) = self.insert_running_procedure(&RegionMigrationProcedureTask::new( + *region_id, + task.from_peer.clone(), + task.to_peer.clone(), + task.timeout, + task.trigger_reason, + )) else { + migrating_region_ids.push(*region_id); + continue; + }; + procedure_guards.push(guard); + } + + let migrating_set = migrating_region_ids.iter().cloned().collect::>(); + task.region_ids.retain(|id| !migrating_set.contains(id)); + + (migrating_region_ids, procedure_guards) + } + + pub async fn submit_region_migration_task( + &self, + mut task: RegionMigrationTaskBatch, + ) -> Result { + let (migrating_region_ids, procedure_guards) = self.extract_running_regions(&mut task); + let RegionMigrationAnalysis { + migrated, + leader_changed, + peer_conflict, + mut table_not_found, + pending, + } = analyze_region_migration_task(&task, &self.context_factory.table_metadata_manager) + .await?; + if pending.is_empty() { + return Ok(SubmitRegionMigrationTaskResult { + migrated, + leader_changed, + peer_conflict, + table_not_found, + migrating: migrating_region_ids, + submitted: vec![], + procedure_id: None, + }); + } + + // Updates the region ids to the pending region ids. + task.region_ids = pending; + let table_regions = task.table_regions(); + let table_ids = table_regions.keys().cloned().collect::>(); + let table_info_values = self + .context_factory + .table_metadata_manager + .table_info_manager() + .batch_get(&table_ids) + .await + .context(error::TableMetadataManagerSnafu)?; + let mut catalog_and_schema = Vec::with_capacity(table_info_values.len()); + for (table_id, regions) in table_regions { + match table_info_values.get(&table_id) { + Some(table_info) => { + let TableName { + catalog_name, + schema_name, + .. 
+ } = table_info.table_name(); + catalog_and_schema.push((catalog_name, schema_name)); + } + None => { + task.region_ids.retain(|id| id.table_id() != table_id); + table_not_found.extend(regions); + } + } + } + if task.region_ids.is_empty() { + return Ok(SubmitRegionMigrationTaskResult { + migrated, + leader_changed, + peer_conflict, + table_not_found, + migrating: migrating_region_ids, + submitted: vec![], + procedure_id: None, + }); + } + + let submitting_region_ids = task.region_ids.clone(); + let procedure_id = self + .submit_procedure_inner(task, procedure_guards, catalog_and_schema) + .await?; + Ok(SubmitRegionMigrationTaskResult { + migrated, + leader_changed, + peer_conflict, + table_not_found, + migrating: migrating_region_ids, + submitted: submitting_region_ids, + procedure_id: Some(procedure_id), + }) + } + + async fn submit_procedure_inner( + &self, + task: RegionMigrationTaskBatch, + procedure_guards: Vec, + catalog_and_schema: Vec<(String, String)>, + ) -> Result { + let procedure = RegionMigrationProcedure::new( + PersistentContext::new( + catalog_and_schema, + task.from_peer.clone(), + task.to_peer.clone(), + task.region_ids.clone(), + task.timeout, + task.trigger_reason, + ), + self.context_factory.clone(), + procedure_guards, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + let procedure_id = procedure_with_id.id; + info!("Starting region migration procedure {procedure_id} for {task}"); + let procedure_manager = self.procedure_manager.clone(); + let num_region = task.region_ids.len(); + + common_runtime::spawn_global(async move { + let watcher = &mut match procedure_manager.submit(procedure_with_id).await { + Ok(watcher) => watcher, + Err(e) => { + error!(e; "Failed to submit region migration procedure {procedure_id} for {task}"); + return; + } + }; + METRIC_META_REGION_MIGRATION_DATANODES + .with_label_values(&["src", &task.from_peer.id.to_string()]) + .inc_by(num_region as u64); + METRIC_META_REGION_MIGRATION_DATANODES + .with_label_values(&["desc", &task.to_peer.id.to_string()]) + .inc_by(num_region as u64); + + if let Err(e) = watcher::wait(watcher).await { + error!(e; "Failed to wait region migration procedure {procedure_id} for {task}"); + METRIC_META_REGION_MIGRATION_FAIL.inc(); + return; + } + + info!("Region migration procedure {procedure_id} for {task} is finished successfully!"); + }); + + Ok(procedure_id) + } + /// Submits a new region migration procedure. pub async fn submit_procedure( &self, @@ -384,17 +569,16 @@ impl RegionMigrationManager { trigger_reason, } = task.clone(); let procedure = RegionMigrationProcedure::new( - PersistentContext { - catalog: catalog_name, - schema: schema_name, - region_id, + PersistentContext::new( + vec![(catalog_name, schema_name)], from_peer, to_peer, + vec![region_id], timeout, trigger_reason, - }, + ), self.context_factory.clone(), - Some(guard), + vec![guard], ); let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); let procedure_id = procedure_with_id.id; @@ -645,4 +829,162 @@ mod test { assert_matches!(err, error::Error::Unexpected { .. 
}); } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_invalid_task() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let task = RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1024, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(1), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + + let err = manager + .submit_region_migration_task(task) + .await + .unwrap_err(); + assert_matches!(err, error::Error::InvalidArguments { .. }); + } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_no_region_to_migrate() { + common_telemetry::init_default_ut_logging(); + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(2)), + ..Default::default() + }]; + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + + assert_eq!( + result, + SubmitRegionMigrationTaskResult { + migrated: vec![region_id], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_leader_peer_changed() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(RegionId::new(1024, 1)), + leader_peer: Some(Peer::empty(3)), + ..Default::default() + }]; + + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + assert_eq!( + result, + SubmitRegionMigrationTaskResult { + leader_changed: vec![region_id], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_submit_procedure_with_multiple_regions_peer_conflict() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id], + from_peer: Peer::empty(3), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(3)), + follower_peers: vec![Peer::empty(2)], + 
..Default::default() + }]; + + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + assert_eq!( + result, + SubmitRegionMigrationTaskResult { + peer_conflict: vec![region_id], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_running_regions() { + let env = TestingEnv::new(); + let context_factory = env.context_factory(); + let manager = RegionMigrationManager::new(env.procedure_manager().clone(), context_factory); + let region_id = RegionId::new(1024, 1); + let task = RegionMigrationTaskBatch { + region_ids: vec![region_id, RegionId::new(1024, 2)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + // Inserts one + manager.tracker.running_procedures.write().unwrap().insert( + region_id, + RegionMigrationProcedureTask::new( + region_id, + task.from_peer.clone(), + task.to_peer.clone(), + task.timeout, + task.trigger_reason, + ), + ); + let table_info = new_test_table_info(1024, vec![1]).into(); + let region_routes = vec![RegionRoute { + region: Region::new_test(RegionId::new(1024, 2)), + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + env.create_physical_table_metadata(table_info, region_routes) + .await; + let result = manager.submit_region_migration_task(task).await.unwrap(); + assert_eq!(result.migrating, vec![region_id]); + assert_eq!(result.submitted, vec![RegionId::new(1024, 2)]); + assert!(result.procedure_id.is_some()); + } } diff --git a/src/meta-srv/src/procedure/region_migration/migration_abort.rs b/src/meta-srv/src/procedure/region_migration/migration_abort.rs index a25443c815..f3ad8052de 100644 --- a/src/meta-srv/src/procedure/region_migration/migration_abort.rs +++ b/src/meta-srv/src/procedure/region_migration/migration_abort.rs @@ -44,9 +44,9 @@ impl State for RegionMigrationAbort { _procedure_ctx: &ProcedureContext, ) -> Result<(Box, Status)> { warn!( - "Region migration is aborted: {}, region_id: {}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", + "Region migration is aborted: {}, regions: {:?}, from_peer: {}, to_peer: {}, trigger_reason: {}, {}", self.reason, - ctx.region_id(), + ctx.persistent_ctx.region_ids, ctx.persistent_ctx.from_peer, ctx.persistent_ctx.to_peer, ctx.persistent_ctx.trigger_reason, diff --git a/src/meta-srv/src/procedure/region_migration/migration_start.rs b/src/meta-srv/src/procedure/region_migration/migration_start.rs index e544adbf4c..99d2972aa8 100644 --- a/src/meta-srv/src/procedure/region_migration/migration_start.rs +++ b/src/meta-srv/src/procedure/region_migration/migration_start.rs @@ -20,22 +20,18 @@ use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::info; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt}; -use store_api::storage::RegionId; use crate::error::{self, Result}; use crate::procedure::region_migration::migration_abort::RegionMigrationAbort; use crate::procedure::region_migration::migration_end::RegionMigrationEnd; use crate::procedure::region_migration::open_candidate_region::OpenCandidateRegion; -use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::{Context, State}; /// The behaviors: /// -/// If the expected leader region has been opened on `to_peer`, go to the [RegionMigrationEnd] state. 
-/// -/// If the candidate region has been opened on `to_peer`, go to the [UpdateMetadata::Downgrade] state. -/// -/// Otherwise go to the [OpenCandidateRegion] state. +/// - If all regions have been migrated, transitions to [RegionMigrationEnd]. +/// - If any of the region leaders is not the `from_peer`, transitions to [RegionMigrationAbort]. +/// - Otherwise, continues with [OpenCandidateRegion] to initiate the candidate region. #[derive(Debug, Serialize, Deserialize)] pub struct RegionMigrationStart; @@ -44,44 +40,62 @@ pub struct RegionMigrationStart; impl State for RegionMigrationStart { /// Yields next [State]. /// - /// If the expected leader region has been opened on `to_peer`, go to the [RegionMigrationEnd] state. + /// Determines the next [State] for region migration: /// - /// If the candidate region has been opened on `to_peer`, go to the [UpdateMetadata::Downgrade] state. - /// - /// Otherwise go to the [OpenCandidateRegion] state. + /// - If all regions have been migrated, transitions to [RegionMigrationEnd]. + /// - If any of the region leaders is not the `from_peer`, transitions to [RegionMigrationAbort]. + /// - Otherwise, continues with [OpenCandidateRegion] to initiate the candidate region. async fn next( &mut self, ctx: &mut Context, _procedure_ctx: &ProcedureContext, ) -> Result<(Box, Status)> { - let region_id = ctx.persistent_ctx.region_id; - let region_route = self.retrieve_region_route(ctx, region_id).await?; + let mut region_routes = self.retrieve_region_routes(ctx).await?; let to_peer = &ctx.persistent_ctx.to_peer; let from_peer = &ctx.persistent_ctx.from_peer; + let region_ids = &ctx.persistent_ctx.region_ids; - if self.has_migrated(®ion_route, to_peer)? { + self.filter_unmigrated_regions(&mut region_routes, to_peer); + + // No region to migrate, skip the migration. + if region_routes.is_empty() { info!( - "Region has been migrated, region: {:?}, to_peer: {:?}", - region_route.region.id, to_peer + "All regions have been migrated, regions: {:?}, to_peer: {:?}", + region_ids, to_peer ); - Ok((Box::new(RegionMigrationEnd), Status::done())) - } else if self.invalid_leader_peer(®ion_route, from_peer)? { - info!( - "Abort region migration, region:{:?}, unexpected leader peer: {:?}, expected: {:?}", - region_route.region.id, region_route.leader_peer, from_peer, - ); - Ok(( - Box::new(RegionMigrationAbort::new(&format!( - "Invalid region leader peer: {from_peer:?}, expected: {:?}", - region_route.leader_peer.as_ref().unwrap(), - ))), - Status::done(), - )) - } else if self.check_candidate_region_on_peer(®ion_route, to_peer) { - Ok((Box::new(UpdateMetadata::Downgrade), Status::executing(true))) - } else { - Ok((Box::new(OpenCandidateRegion), Status::executing(true))) + return Ok((Box::new(RegionMigrationEnd), Status::done())); } + + // Updates the region ids to the unmigrated regions. + if region_routes.len() != region_ids.len() { + let unmigrated_region_ids = region_routes.iter().map(|route| route.region.id).collect(); + info!( + "Some of the regions have been migrated, only migrate the following regions: {:?}, to_peer: {:?}", + unmigrated_region_ids, to_peer + ); + ctx.persistent_ctx.region_ids = unmigrated_region_ids; + } + + // Checks if any of the region leaders is not the `from_peer`. + for region_route in ®ion_routes { + if self.invalid_leader_peer(region_route, from_peer)? 
{ + info!( + "Abort region migration, region:{}, unexpected leader peer: {:?}, expected: {:?}", + region_route.region.id, region_route.leader_peer, from_peer, + ); + return Ok(( + Box::new(RegionMigrationAbort::new(&format!( + "Invalid region leader peer: {:?}, expected: {:?}", + region_route.leader_peer.as_ref().unwrap(), + from_peer, + ))), + Status::done(), + )); + } + } + + // If all checks pass, open the candidate region. + Ok((Box::new(OpenCandidateRegion), Status::executing(true))) } fn as_any(&self) -> &dyn Any { @@ -90,7 +104,7 @@ impl State for RegionMigrationStart { } impl RegionMigrationStart { - /// Retrieves region route. + /// Retrieves region routes for multiple regions. /// /// Abort(non-retry): /// - TableRoute is not found. @@ -98,39 +112,32 @@ impl RegionMigrationStart { /// /// Retry: /// - Failed to retrieve the metadata of table. - async fn retrieve_region_route( - &self, - ctx: &mut Context, - region_id: RegionId, - ) -> Result { - let table_id = region_id.table_id(); - let table_route = ctx.get_table_route_value().await?; + async fn retrieve_region_routes(&self, ctx: &mut Context) -> Result> { + let region_ids = &ctx.persistent_ctx.region_ids; + let table_route_values = ctx.get_table_route_values().await?; + let mut region_routes = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + let table_id = region_id.table_id(); + let region_route = table_route_values + .get(&table_id) + .context(error::TableRouteNotFoundSnafu { table_id })? + .region_routes() + .with_context(|_| error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."), + })? + .iter() + .find(|route| route.region.id == *region_id) + .cloned() + .with_context(|| error::UnexpectedSnafu { + violated: format!( + "RegionRoute({}) is not found in TableRoute({})", + region_id, table_id + ), + })?; + region_routes.push(region_route); + } - let region_route = table_route - .region_routes() - .context(error::UnexpectedLogicalRouteTableSnafu { - err_msg: format!("{self:?} is a non-physical TableRouteValue."), - })? - .iter() - .find(|route| route.region.id == region_id) - .cloned() - .context(error::UnexpectedSnafu { - violated: format!( - "RegionRoute({}) is not found in TableRoute({})", - region_id, table_id - ), - })?; - - Ok(region_route) - } - - /// Checks whether the candidate region on region has been opened. - /// Returns true if it's been opened. - fn check_candidate_region_on_peer(&self, region_route: &RegionRoute, to_peer: &Peer) -> bool { - region_route - .follower_peers - .iter() - .any(|peer| peer.id == to_peer.id) + Ok(region_routes) } /// Returns true if the region leader is not the `from_peer`. @@ -143,7 +150,7 @@ impl RegionMigrationStart { let is_invalid_leader_peer = region_route .leader_peer .as_ref() - .context(error::UnexpectedSnafu { + .with_context(|| error::UnexpectedSnafu { violated: format!("Leader peer is not found in TableRoute({})", region_id), })? .id @@ -151,6 +158,12 @@ impl RegionMigrationStart { Ok(is_invalid_leader_peer) } + /// Filters out regions that unmigrated. + fn filter_unmigrated_regions(&self, region_routes: &mut Vec, to_peer: &Peer) { + region_routes + .retain(|region_route| !self.has_migrated(region_route, to_peer).unwrap_or(false)); + } + /// Checks whether the region has been migrated. /// Returns true if it's. 
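// Illustrative sketch (not part of this patch): the effect of
// `filter_unmigrated_regions` shown on simplified stand-in types — routes whose
// leader is already the destination peer are dropped, the remaining routes keep
// going through the migration. The real code goes through `RegionRoute`/`Peer`
// and roughly compares each route's leader against `to_peer`.
struct RouteSketch {
    leader_peer_id: Option<u64>,
}

fn retain_unmigrated(routes: &mut Vec<RouteSketch>, to_peer_id: u64) {
    // Keep only routes that still need to be moved to `to_peer_id`.
    routes.retain(|r| r.leader_peer_id != Some(to_peer_id));
}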
/// @@ -162,7 +175,7 @@ impl RegionMigrationStart { let region_migrated = region_route .leader_peer .as_ref() - .context(error::UnexpectedSnafu { + .with_context(|| error::UnexpectedSnafu { violated: format!("Leader peer is not found in TableRoute({})", region_id), })? .id @@ -173,6 +186,7 @@ impl RegionMigrationStart { #[cfg(test)] mod tests { + use std::assert_matches::assert_matches; use common_meta::key::test_utils::new_test_table_info; @@ -183,7 +197,6 @@ mod tests { use super::*; use crate::error::Error; use crate::procedure::region_migration::test_util::{self, TestingEnv, new_procedure_context}; - use crate::procedure::region_migration::update_metadata::UpdateMetadata; use crate::procedure::region_migration::{ContextFactory, PersistentContext}; fn new_persistent_context() -> PersistentContext { @@ -196,14 +209,8 @@ mod tests { let env = TestingEnv::new(); let persistent_context = new_persistent_context(); let mut ctx = env.context_factory().new_context(persistent_context); - - let err = state - .retrieve_region_route(&mut ctx, RegionId::new(1024, 1)) - .await - .unwrap_err(); - + let err = state.retrieve_region_routes(&mut ctx).await.unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. }); - assert!(!err.is_retryable()); } @@ -216,56 +223,20 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_info = new_test_table_info(1024, vec![1]).into(); + let table_info = new_test_table_info(1024, vec![3]).into(); let region_route = RegionRoute { - region: Region::new_test(RegionId::new(1024, 1)), + region: Region::new_test(RegionId::new(1024, 3)), leader_peer: Some(from_peer.clone()), ..Default::default() }; env.create_physical_table_metadata(table_info, vec![region_route]) .await; - - let err = state - .retrieve_region_route(&mut ctx, RegionId::new(1024, 3)) - .await - .unwrap_err(); - + let err = state.retrieve_region_routes(&mut ctx).await.unwrap_err(); assert_matches!(err, Error::Unexpected { .. 
}); assert!(!err.is_retryable()); } - #[tokio::test] - async fn test_next_update_metadata_downgrade_state() { - let mut state = Box::new(RegionMigrationStart); - // from_peer: 1 - // to_peer: 2 - let persistent_context = new_persistent_context(); - let from_peer_id = persistent_context.from_peer.id; - let to_peer = persistent_context.to_peer.clone(); - let region_id = persistent_context.region_id; - - let env = TestingEnv::new(); - let mut ctx = env.context_factory().new_context(persistent_context); - - let table_info = new_test_table_info(1024, vec![1]).into(); - let region_routes = vec![RegionRoute { - region: Region::new_test(region_id), - leader_peer: Some(Peer::empty(from_peer_id)), - follower_peers: vec![to_peer], - ..Default::default() - }]; - - env.create_physical_table_metadata(table_info, region_routes) - .await; - let procedure_ctx = new_procedure_context(); - let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap(); - - let update_metadata = next.as_any().downcast_ref::().unwrap(); - - assert_matches!(update_metadata, UpdateMetadata::Downgrade); - } - #[tokio::test] async fn test_next_migration_end_state() { let mut state = Box::new(RegionMigrationStart); @@ -274,7 +245,7 @@ mod tests { let persistent_context = new_persistent_context(); let to_peer = persistent_context.to_peer.clone(); let from_peer = persistent_context.from_peer.clone(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); @@ -302,7 +273,7 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); @@ -327,12 +298,12 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); let table_info = new_test_table_info(1024, vec![1]).into(); - let region_routes = vec![RegionRoute { + let region_routes: Vec = vec![RegionRoute { region: Region::new_test(region_id), leader_peer: Some(Peer::empty(1024)), ..Default::default() diff --git a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs index 111bd41fd2..67e1bfb857 100644 --- a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs @@ -66,33 +66,43 @@ impl OpenCandidateRegion { /// Abort(non-retry): /// - Datanode Table is not found. 
async fn build_open_region_instruction(&self, ctx: &mut Context) -> Result { - let pc = &ctx.persistent_ctx; - let table_id = pc.region_id.table_id(); - let region_number = pc.region_id.region_number(); - let candidate_id = pc.to_peer.id; - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?; + let region_ids = ctx.persistent_ctx.region_ids.clone(); + let from_peer_id = ctx.persistent_ctx.from_peer.id; + let to_peer_id = ctx.persistent_ctx.to_peer.id; + let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?; + let mut open_regions = Vec::with_capacity(region_ids.len()); - let RegionInfo { - region_storage_path, - region_options, - region_wal_options, - engine, - } = datanode_table_value.region_info.clone(); - - let open_instruction = Instruction::OpenRegions(vec![OpenRegion::new( - RegionIdent { - datanode_id: candidate_id, - table_id, - region_number, + for region_id in region_ids { + let table_id = region_id.table_id(); + let region_number = region_id.region_number(); + let datanode_table_value = datanode_table_values.get(&table_id).context( + error::DatanodeTableNotFoundSnafu { + table_id, + datanode_id: from_peer_id, + }, + )?; + let RegionInfo { + region_storage_path, + region_options, + region_wal_options, engine, - }, - ®ion_storage_path, - region_options, - region_wal_options, - true, - )]); + } = datanode_table_value.region_info.clone(); - Ok(open_instruction) + open_regions.push(OpenRegion::new( + RegionIdent { + datanode_id: to_peer_id, + table_id, + region_number, + engine, + }, + ®ion_storage_path, + region_options, + region_wal_options, + true, + )); + } + + Ok(Instruction::OpenRegions(open_regions)) } /// Opens the candidate region. @@ -112,25 +122,27 @@ impl OpenCandidateRegion { ) -> Result<()> { let pc = &ctx.persistent_ctx; let vc = &mut ctx.volatile_ctx; - let region_id = pc.region_id; + let region_ids = &pc.region_ids; let candidate = &pc.to_peer; // This method might be invoked multiple times. // Only registers the guard if `opening_region_guard` is absent. - if vc.opening_region_guard.is_none() { - // Registers the opening region. - let guard = ctx - .opening_region_keeper - .register(candidate.id, region_id) - .context(error::RegionOpeningRaceSnafu { - peer_id: candidate.id, - region_id, - })?; - vc.opening_region_guard = Some(guard); + if vc.opening_region_guards.is_empty() { + for region_id in region_ids { + // Registers the opening region. 
+ let guard = ctx + .opening_region_keeper + .register(candidate.id, *region_id) + .context(error::RegionOpeningRaceSnafu { + peer_id: candidate.id, + region_id: *region_id, + })?; + vc.opening_region_guards.push(guard); + } } let msg = MailboxMessage::json_message( - &format!("Open candidate region: {}", region_id), + &format!("Open candidate regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", candidate.id, candidate.addr), common_time::util::current_time_millis(), @@ -154,9 +166,9 @@ impl OpenCandidateRegion { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; info!( - "Received open region reply: {:?}, region: {}, elapsed: {:?}", + "Received open region reply: {:?}, region: {:?}, elapsed: {:?}", reply, - region_id, + region_ids, now.elapsed() ); let InstructionReply::OpenRegions(SimpleReply { result, error }) = reply else { @@ -172,7 +184,7 @@ impl OpenCandidateRegion { } else { error::RetryLaterSnafu { reason: format!( - "Region {region_id} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}", + "Region {region_ids:?} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}", candidate, now.elapsed() ), @@ -182,7 +194,7 @@ impl OpenCandidateRegion { } Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( - "Mailbox received timeout for open candidate region {region_id} on datanode {:?}, elapsed: {:?}", + "Mailbox received timeout for open candidate region {region_ids:?} on datanode {:?}, elapsed: {:?}", candidate, now.elapsed() ); @@ -255,7 +267,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); @@ -276,7 +288,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let env = TestingEnv::new(); @@ -302,7 +314,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); @@ -335,7 +347,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); @@ -370,7 +382,7 @@ mod tests { // from_peer: 1 // to_peer: 2 let persistent_context = new_persistent_context(); - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); @@ -410,14 +422,14 @@ mod tests { // to_peer: 2 let persistent_context = new_persistent_context(); let from_peer_id = persistent_context.from_peer.id; - let region_id = persistent_context.region_id; + let region_id = persistent_context.region_ids[0]; let to_peer_id = persistent_context.to_peer.id; let mut env = TestingEnv::new(); // Prepares table let table_info = new_test_table_info(1024, vec![1]).into(); let region_routes = vec![RegionRoute { - region: 
Region::new_test(persistent_context.region_id), + region: Region::new_test(region_id), leader_peer: Some(Peer::empty(from_peer_id)), ..Default::default() }]; @@ -445,10 +457,7 @@ mod tests { let procedure_ctx = new_procedure_context(); let (next, _) = state.next(&mut ctx, &procedure_ctx).await.unwrap(); let vc = ctx.volatile_ctx; - assert_eq!( - vc.opening_region_guard.unwrap().info(), - (to_peer_id, region_id) - ); + assert_eq!(vc.opening_region_guards[0].info(), (to_peer_id, region_id)); let flush_leader_region = next.as_any().downcast_ref::().unwrap(); assert_matches!(flush_leader_region, PreFlushRegion); diff --git a/src/meta-srv/src/procedure/region_migration/test_util.rs b/src/meta-srv/src/procedure/region_migration/test_util.rs index a44c3a20c6..c039fc441d 100644 --- a/src/meta-srv/src/procedure/region_migration/test_util.rs +++ b/src/meta-srv/src/procedure/region_migration/test_util.rs @@ -185,15 +185,14 @@ impl TestingEnv { /// Generates a [PersistentContext]. pub fn new_persistent_context(from: u64, to: u64, region_id: RegionId) -> PersistentContext { - PersistentContext { - catalog: "greptime".into(), - schema: "public".into(), - from_peer: Peer::empty(from), - to_peer: Peer::empty(to), - region_id, - timeout: Duration::from_secs(10), - trigger_reason: RegionMigrationTriggerReason::default(), - } + PersistentContext::new( + vec![("greptime".into(), "public".into())], + Peer::empty(from), + Peer::empty(to), + vec![region_id], + Duration::from_secs(10), + RegionMigrationTriggerReason::default(), + ) } /// The test suite for region migration procedure. @@ -306,37 +305,38 @@ impl ProcedureMigrationTestSuite { /// Verifies table metadata after region migration. pub(crate) async fn verify_table_metadata(&self) { - let region_id = self.context.persistent_ctx.region_id; - let table_route = self - .env - .table_metadata_manager - .table_route_manager() - .table_route_storage() - .get(region_id.table_id()) - .await - .unwrap() - .unwrap(); - let region_routes = table_route.region_routes().unwrap(); + for region_id in &self.context.persistent_ctx.region_ids { + let table_route = self + .env + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get(region_id.table_id()) + .await + .unwrap() + .unwrap(); + let region_routes = table_route.region_routes().unwrap(); - let expected_leader_id = self.context.persistent_ctx.to_peer.id; - let removed_follower_id = self.context.persistent_ctx.from_peer.id; + let expected_leader_id = self.context.persistent_ctx.to_peer.id; + let removed_follower_id = self.context.persistent_ctx.from_peer.id; - let region_route = region_routes - .iter() - .find(|route| route.region.id == region_id) - .unwrap(); - - assert!(!region_route.is_leader_downgrading()); - assert_eq!( - region_route.leader_peer.as_ref().unwrap().id, - expected_leader_id - ); - assert!( - !region_route - .follower_peers + let region_route = region_routes .iter() - .any(|route| route.id == removed_follower_id) - ) + .find(|route| route.region.id == *region_id) + .unwrap(); + + assert!(!region_route.is_leader_downgrading()); + assert_eq!( + region_route.leader_peer.as_ref().unwrap().id, + expected_leader_id + ); + assert!( + !region_route + .follower_peers + .iter() + .any(|route| route.id == removed_follower_id) + ) + } } } diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata.rs b/src/meta-srv/src/procedure/region_migration/update_metadata.rs index 8e7b2d4d3b..e96a025c5d 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata.rs +++ 
b/src/meta-srv/src/procedure/region_migration/update_metadata.rs @@ -18,7 +18,6 @@ pub(crate) mod upgrade_candidate_region; use std::any::Any; -use common_meta::lock_key::TableLock; use common_procedure::{Context as ProcedureContext, Status}; use common_telemetry::warn; use serde::{Deserialize, Serialize}; @@ -48,12 +47,10 @@ impl State for UpdateMetadata { ctx: &mut Context, procedure_ctx: &ProcedureContext, ) -> Result<(Box, Status)> { - let table_id = TableLock::Write(ctx.region_id().table_id()).into(); - let _guard = procedure_ctx.provider.acquire_lock(&table_id).await; - match self { UpdateMetadata::Downgrade => { - self.downgrade_leader_region(ctx).await?; + self.downgrade_leader_region(ctx, &procedure_ctx.provider) + .await?; Ok(( Box::::default(), @@ -61,7 +58,8 @@ impl State for UpdateMetadata { )) } UpdateMetadata::Upgrade => { - self.upgrade_candidate_region(ctx).await?; + self.upgrade_candidate_region(ctx, &procedure_ctx.provider) + .await?; if let Err(err) = ctx.invalidate_table_cache().await { warn!( @@ -71,7 +69,8 @@ impl State for UpdateMetadata { Ok((Box::new(CloseDowngradedRegion), Status::executing(false))) } UpdateMetadata::Rollback => { - self.rollback_downgraded_region(ctx).await?; + self.rollback_downgraded_region(ctx, &procedure_ctx.provider) + .await?; if let Err(err) = ctx.invalidate_table_cache().await { warn!( diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs index 77e5acbacd..05e29c9b08 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs @@ -13,7 +13,10 @@ // limitations under the License. use common_error::ext::BoxedError; +use common_meta::lock_key::TableLock; use common_meta::rpc::router::LeaderState; +use common_procedure::ContextProviderRef; +use common_telemetry::{error, info}; use snafu::ResultExt; use crate::error::{self, Result}; @@ -37,35 +40,46 @@ impl UpdateMetadata { /// It will only update **other region** info. Therefore, It's safe to retry after failure. /// /// - There is no other DDL procedure executed concurrently for the current table. - pub async fn downgrade_leader_region(&self, ctx: &mut Context) -> Result<()> { + pub async fn downgrade_leader_region( + &self, + ctx: &mut Context, + ctx_provider: &ContextProviderRef, + ) -> Result<()> { let table_metadata_manager = ctx.table_metadata_manager.clone(); let from_peer_id = ctx.persistent_ctx.from_peer.id; - let region_id = ctx.region_id(); - let table_id = region_id.table_id(); - let current_table_route_value = ctx.get_table_route_value().await?; + let table_regions = ctx.persistent_ctx.table_regions(); - // TODO(weny): ensures the leader region peer is the `from_peer`. 
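// Illustrative sketch (not part of this patch): the per-table fan-out pattern the
// multi-region metadata updates in this patch rely on — region ids are grouped by
// table id via `persistent_ctx.table_regions()`, and a `TableLock::Write` is
// acquired through the procedure context provider before each table's route is
// touched. A `RegionId` is represented as a `(table_id, region_number)` pair here
// purely for illustration.
use std::collections::HashMap;

fn group_by_table(region_ids: &[(u32, u32)]) -> HashMap<u32, Vec<(u32, u32)>> {
    let mut table_regions: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();
    for id in region_ids {
        // Key by table id; collect every region of that table.
        table_regions.entry(id.0).or_default().push(*id);
    }
    table_regions
}

// For each (table_id, regions) entry the procedure then, roughly:
//   1. acquires the table-level write lock,
//   2. reads the current table route value,
//   3. updates only the routes whose region id is in `regions`.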
- if let Err(err) = table_metadata_manager - .update_leader_region_status(table_id, ¤t_table_route_value, |route| { - if route.region.id == region_id - && route - .leader_peer - .as_ref() - .is_some_and(|leader_peer| leader_peer.id == from_peer_id) - { - Some(Some(LeaderState::Downgrading)) - } else { - None - } - }) - .await - .context(error::TableMetadataManagerSnafu) - { - return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { - reason: format!( - "Failed to update the table route during the downgrading leader region, region_id: {region_id}, from_peer_id: {from_peer_id}" - ), - }); + for (table_id, regions) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = ctx_provider.acquire_lock(&table_lock).await; + + let current_table_route_value = ctx.get_table_route_value(table_id).await?; + if let Err(err) = table_metadata_manager + .update_leader_region_status(table_id, ¤t_table_route_value, |route| { + if regions.contains(&route.region.id) + && route + .leader_peer + .as_ref() + .is_some_and(|leader_peer| leader_peer.id == from_peer_id) + { + Some(Some(LeaderState::Downgrading)) + } else { + None + } + }) + .await + .context(error::TableMetadataManagerSnafu) + { + error!(err; "Failed to update the table route during the downgrading leader region, regions: {regions:?}, from_peer_id: {from_peer_id}"); + return Err(BoxedError::new(err)).with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to update the table route during the downgrading leader region, regions: {regions:?}, from_peer_id: {from_peer_id}" + ), + }); + } + info!( + "Downgrading leader region table route success, table_id: {table_id}, regions: {regions:?}, from_peer_id: {from_peer_id}" + ); } Ok(()) @@ -75,10 +89,13 @@ impl UpdateMetadata { #[cfg(test)] mod tests { use std::assert_matches::assert_matches; + use std::collections::HashMap; + use std::sync::Arc; use common_meta::key::test_utils::new_test_table_info; use common_meta::peer::Peer; use common_meta::rpc::router::{Region, RegionRoute}; + use common_procedure_test::MockContextProvider; use store_api::storage::RegionId; use crate::error::Error; @@ -104,8 +121,12 @@ mod tests { let env = TestingEnv::new(); let persistent_context = new_persistent_context(); let mut ctx = env.context_factory().new_context(persistent_context); + let provider = Arc::new(MockContextProvider::new(HashMap::new())) as _; - let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err(); + let err = state + .downgrade_leader_region(&mut ctx, &provider) + .await + .unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. 
}); @@ -119,7 +140,7 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); + let table_id = ctx.persistent_ctx.region_ids[0].table_id(); let table_info = new_test_table_info(1024, vec![1, 2]).into(); let region_routes = vec![RegionRoute { @@ -162,7 +183,7 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); + let table_id = ctx.persistent_ctx.region_ids[0].table_id(); let table_info = new_test_table_info(1024, vec![1, 2]).into(); let region_routes = vec![RegionRoute { diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs index 8f50e14b33..fc32e37672 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs @@ -13,6 +13,9 @@ // limitations under the License. use common_error::ext::BoxedError; +use common_meta::lock_key::TableLock; +use common_procedure::ContextProviderRef; +use common_telemetry::{error, info}; use snafu::ResultExt; use crate::error::{self, Result}; @@ -28,28 +31,39 @@ impl UpdateMetadata { /// Retry: /// - Failed to update [TableRouteValue](common_meta::key::table_region::TableRegionValue). /// - Failed to retrieve the metadata of table. - pub async fn rollback_downgraded_region(&self, ctx: &mut Context) -> Result<()> { + pub async fn rollback_downgraded_region( + &self, + ctx: &mut Context, + ctx_provider: &ContextProviderRef, + ) -> Result<()> { let table_metadata_manager = ctx.table_metadata_manager.clone(); - let region_id = ctx.region_id(); - let table_id = region_id.table_id(); - let current_table_route_value = ctx.get_table_route_value().await?; + let table_regions = ctx.persistent_ctx.table_regions(); - if let Err(err) = table_metadata_manager - .update_leader_region_status(table_id, ¤t_table_route_value, |route| { - if route.region.id == region_id { - Some(None) - } else { - None - } - }) - .await - .context(error::TableMetadataManagerSnafu) - { - return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { - reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}"), - }); + for (table_id, regions) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = ctx_provider.acquire_lock(&table_lock).await; + + let current_table_route_value = ctx.get_table_route_value(table_id).await?; + if let Err(err) = table_metadata_manager + .update_leader_region_status(table_id, ¤t_table_route_value, |route| { + if regions.contains(&route.region.id) { + Some(None) + } else { + None + } + }) + .await + .context(error::TableMetadataManagerSnafu) + { + error!(err; "Failed to update the table route during the rollback downgraded leader regions: {regions:?}"); + return Err(BoxedError::new(err)).with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to update the table route during the rollback downgraded leader regions: {regions:?}"), + }); + } + info!( + "Rolling back downgraded leader region table route success, table_id: {table_id}, regions: {regions:?}" + ); } - ctx.register_failure_detectors().await; Ok(()) @@ -59,10 +73,13 @@ impl UpdateMetadata { #[cfg(test)] mod tests { use 
std::assert_matches::assert_matches; + use std::collections::HashMap; + use std::sync::Arc; use common_meta::key::test_utils::new_test_table_info; use common_meta::peer::Peer; use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; + use common_procedure_test::MockContextProvider; use store_api::storage::RegionId; use crate::error::Error; @@ -82,7 +99,11 @@ mod tests { let persistent_context = new_persistent_context(); let mut ctx = env.context_factory().new_context(persistent_context); - let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err(); + let provider = Arc::new(MockContextProvider::new(HashMap::new())) as _; + let err = state + .rollback_downgraded_region(&mut ctx, &provider) + .await + .unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. }); @@ -97,7 +118,7 @@ mod tests { let env = TestingEnv::new(); let mut ctx = env.context_factory().new_context(persistent_context); - let table_id = ctx.region_id().table_id(); + let table_id = ctx.persistent_ctx.region_ids[0].table_id(); let table_info = new_test_table_info(1024, vec![1, 2, 3]).into(); let region_routes = vec![ diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs index 7e33c9c75c..0e545f5d92 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs @@ -14,9 +14,12 @@ use common_error::ext::BoxedError; use common_meta::key::datanode_table::RegionInfo; +use common_meta::lock_key::TableLock; use common_meta::rpc::router::{RegionRoute, region_distribution}; -use common_telemetry::{info, warn}; +use common_procedure::ContextProviderRef; +use common_telemetry::{error, info, warn}; use snafu::{OptionExt, ResultExt, ensure}; +use store_api::storage::RegionId; use crate::error::{self, Result}; use crate::procedure::region_migration::Context; @@ -24,104 +27,114 @@ use crate::procedure::region_migration::update_metadata::UpdateMetadata; impl UpdateMetadata { /// Returns new [Vec]. - async fn build_upgrade_candidate_region_metadata( + fn build_upgrade_candidate_region_metadata( &self, ctx: &mut Context, + region_ids: &[RegionId], + mut region_routes: Vec, ) -> Result> { - let region_id = ctx.region_id(); - let table_route_value = ctx.get_table_route_value().await?.clone(); + let old_leader_peer = &ctx.persistent_ctx.from_peer; + let new_leader_peer = &ctx.persistent_ctx.to_peer; + for region_id in region_ids { + // Find the RegionRoute for this region_id. + let region_route = region_routes + .iter_mut() + .find(|route| route.region.id == *region_id) + .context(error::RegionRouteNotFoundSnafu { + region_id: *region_id, + })?; - let mut region_routes = table_route_value - .region_routes() - .context(error::UnexpectedLogicalRouteTableSnafu { - err_msg: format!("{self:?} is a non-physical TableRouteValue."), - })? - .clone(); - let region_route = region_routes - .iter_mut() - .find(|route| route.region.id == region_id) - .context(error::RegionRouteNotFoundSnafu { region_id })?; + // Remove any "downgraded leader" state. + region_route.set_leader_state(None); - // Removes downgraded status. - region_route.set_leader_state(None); - - let candidate = &ctx.persistent_ctx.to_peer; - let expected_old_leader = &ctx.persistent_ctx.from_peer; - - // Upgrades candidate to leader. 
- ensure!( - region_route - .leader_peer - .take_if(|old_leader| old_leader.id == expected_old_leader.id) - .is_some(), - error::UnexpectedSnafu { - violated: format!( - "Unexpected region leader: {:?} during the upgrading candidate metadata, expected: {:?}", - region_route.leader_peer, expected_old_leader - ), - } - ); - - region_route.leader_peer = Some(candidate.clone()); - info!( - "Upgrading candidate region to leader region: {:?} for region: {}", - candidate, region_id - ); - - // Removes the candidate region in followers. - let removed = region_route - .follower_peers - .extract_if(.., |peer| peer.id == candidate.id) - .collect::>(); - - if removed.len() > 1 { - warn!( - "Removes duplicated regions: {removed:?} during the upgrading candidate metadata for region: {region_id}" - ); - } - - Ok(region_routes) - } - - /// Returns true if region metadata has been updated. - async fn check_metadata_updated(&self, ctx: &mut Context) -> Result { - let region_id = ctx.region_id(); - let table_route_value = ctx.get_table_route_value().await?.clone(); - - let region_routes = table_route_value - .region_routes() - .context(error::UnexpectedLogicalRouteTableSnafu { - err_msg: format!("{self:?} is a non-physical TableRouteValue."), - })? - .clone(); - let region_route = region_routes - .into_iter() - .find(|route| route.region.id == region_id) - .context(error::RegionRouteNotFoundSnafu { region_id })?; - - let leader_peer = region_route - .leader_peer - .as_ref() - .context(error::UnexpectedSnafu { - violated: format!("The leader peer of region {region_id} is not found during the update metadata for upgrading"), - })?; - - let candidate_peer_id = ctx.persistent_ctx.to_peer.id; - - if leader_peer.id == candidate_peer_id { + // Check old leader matches expectation before upgrading to new leader. ensure!( - !region_route.is_leader_downgrading(), + region_route + .leader_peer + .take_if(|old_leader| old_leader.id == old_leader_peer.id) + .is_some(), error::UnexpectedSnafu { violated: format!( - "Unexpected intermediate state is found during the update metadata for upgrading region {region_id}" + "Unexpected region leader: {:?} during the candidate-to-leader upgrade; expected: {:?}", + region_route.leader_peer, old_leader_peer ), } ); - Ok(true) - } else { - Ok(false) + // Set new leader. + region_route.leader_peer = Some(new_leader_peer.clone()); + + // Remove new leader from followers (avoids duplicate leader/follower). + let removed = region_route + .follower_peers + .extract_if(.., |peer| peer.id == new_leader_peer.id) + .collect::>(); + + // Warn if more than one follower with the new leader id was present. + if removed.len() > 1 { + warn!( + "Removed duplicate followers: {removed:?} during candidate-to-leader upgrade for region: {region_id}" + ); + } } + + info!( + "Building metadata for upgrading candidate region to new leader: {:?} for regions: {:?}", + new_leader_peer, region_ids, + ); + + Ok(region_routes) + } + + /// Checks if metadata has been upgraded for a list of regions by verifying if their + /// leader peers have been switched to a specified peer ID (`to_peer_id`) and that + /// no region is in a leader downgrading state. + /// + /// Returns: + /// - `Ok(true)` if all regions' leader is the target peer and no downgrading occurs. + /// - `Ok(false)` if any region's leader is not the target peer. + /// - Error if region route or leader peer cannot be found, or an unexpected state is detected. 
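// Illustrative sketch (not part of this patch): the predicate described in the
// doc comment above, on simplified stand-in routes — the metadata counts as
// already upgraded only when every requested region has the destination peer as
// its leader, and a region that matches the leader but is still downgrading is
// reported as an unexpected state.
struct RouteCheckSketch {
    leader_peer_id: Option<u64>,
    leader_downgrading: bool,
}

fn all_upgraded(routes: &[RouteCheckSketch], to_peer_id: u64) -> Result<bool, String> {
    for r in routes {
        let leader = r
            .leader_peer_id
            .ok_or_else(|| "leader peer not found".to_string())?;
        if leader != to_peer_id {
            // At least one region's leader hasn't switched yet.
            return Ok(false);
        }
        if r.leader_downgrading {
            return Err("unexpected intermediate state".to_string());
        }
    }
    Ok(true)
}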
+ fn check_metadata_updated( + &self, + ctx: &mut Context, + region_ids: &[RegionId], + region_routes: &[RegionRoute], + ) -> Result { + // Iterate through each provided region ID + for region_id in region_ids { + // Find the route info for this region + let region_route = region_routes + .iter() + .find(|route| route.region.id == *region_id) + .context(error::RegionRouteNotFoundSnafu { + region_id: *region_id, + })?; + + // Get the leader peer for the region, error if not found + let leader_peer = region_route.leader_peer.as_ref().with_context(||error::UnexpectedSnafu { + violated: format!( + "The leader peer of region {region_id} is not found during the metadata upgrade check" + ), + })?; + + // If the leader is not the expected peer, return false (i.e., not yet upgraded) + if leader_peer.id != ctx.persistent_ctx.to_peer.id { + return Ok(false); + } else { + // If leader matches but region is in leader downgrading state, error (unexpected state) + ensure!( + !region_route.is_leader_downgrading(), + error::UnexpectedSnafu { + violated: format!( + "Unexpected intermediate state is found during the metadata upgrade check for region {region_id}" + ), + } + ); + } + } + + // All regions' leader match expected peer and are not downgrading; considered upgraded + Ok(true) } /// Upgrades the candidate region. @@ -133,55 +146,77 @@ impl UpdateMetadata { /// Retry: /// - Failed to update [TableRouteValue](common_meta::key::table_region::TableRegionValue). /// - Failed to retrieve the metadata of table. - pub async fn upgrade_candidate_region(&self, ctx: &mut Context) -> Result<()> { - let region_id = ctx.region_id(); + pub async fn upgrade_candidate_region( + &self, + ctx: &mut Context, + ctx_provider: &ContextProviderRef, + ) -> Result<()> { let table_metadata_manager = ctx.table_metadata_manager.clone(); + let table_regions = ctx.persistent_ctx.table_regions(); + let from_peer_id = ctx.persistent_ctx.from_peer.id; + let to_peer_id = ctx.persistent_ctx.to_peer.id; - if self.check_metadata_updated(ctx).await? { - return Ok(()); + for (table_id, region_ids) in table_regions { + let table_lock = TableLock::Write(table_id).into(); + let _guard = ctx_provider.acquire_lock(&table_lock).await; + + let table_route_value = ctx.get_table_route_value(table_id).await?; + let region_routes = table_route_value.region_routes().with_context(|_| { + error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."), + } + })?; + if self.check_metadata_updated(ctx, ®ion_ids, region_routes)? 
{ + continue; + } + let datanode_table_value = ctx.get_from_peer_datanode_table_value(table_id).await?; + let RegionInfo { + region_storage_path, + region_options, + region_wal_options, + engine, + } = datanode_table_value.region_info.clone(); + let new_region_routes = self.build_upgrade_candidate_region_metadata( + ctx, + ®ion_ids, + region_routes.clone(), + )?; + let region_distribution = region_distribution(region_routes); + info!( + "Trying to update region routes to {:?} for table: {}", + region_distribution, table_id, + ); + + if let Err(err) = table_metadata_manager + .update_table_route( + table_id, + RegionInfo { + engine: engine.clone(), + region_storage_path: region_storage_path.clone(), + region_options: region_options.clone(), + region_wal_options: region_wal_options.clone(), + }, + &table_route_value, + new_region_routes, + ®ion_options, + ®ion_wal_options, + ) + .await + .context(error::TableMetadataManagerSnafu) + { + error!(err; "Failed to update the table route during the upgrading candidate region: {region_ids:?}, from_peer_id: {from_peer_id}"); + return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { + reason: format!("Failed to update the table route during the upgrading candidate region: {table_id}"), + }); + }; + info!( + "Upgrading candidate region table route success, table_id: {table_id}, regions: {region_ids:?}, to_peer_id: {to_peer_id}" + ); } - let region_routes = self.build_upgrade_candidate_region_metadata(ctx).await?; - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?; - let RegionInfo { - region_storage_path, - region_options, - region_wal_options, - engine, - } = datanode_table_value.region_info.clone(); - let table_route_value = ctx.get_table_route_value().await?; - - let region_distribution = region_distribution(®ion_routes); - info!( - "Trying to update region routes to {:?} for table: {}", - region_distribution, - region_id.table_id() - ); - if let Err(err) = table_metadata_manager - .update_table_route( - region_id.table_id(), - RegionInfo { - engine: engine.clone(), - region_storage_path: region_storage_path.clone(), - region_options: region_options.clone(), - region_wal_options: region_wal_options.clone(), - }, - &table_route_value, - region_routes, - ®ion_options, - ®ion_wal_options, - ) - .await - .context(error::TableMetadataManagerSnafu) - { - return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { - reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}"), - }); - }; - ctx.deregister_failure_detectors().await; // Consumes the guard. - ctx.volatile_ctx.opening_region_guard.take(); + ctx.volatile_ctx.opening_region_guards.clear(); Ok(()) } @@ -210,16 +245,11 @@ mod tests { #[tokio::test] async fn test_table_route_is_not_found_error() { - let state = UpdateMetadata::Upgrade; - let env = TestingEnv::new(); let persistent_context = new_persistent_context(); - let mut ctx = env.context_factory().new_context(persistent_context); + let ctx = env.context_factory().new_context(persistent_context); - let err = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await - .unwrap_err(); + let err = ctx.get_table_route_value(1024).await.unwrap_err(); assert_matches!(err, Error::TableRouteNotFound { .. 
}); assert!(!err.is_retryable()); @@ -238,13 +268,20 @@ mod tests { leader_peer: Some(Peer::empty(4)), ..Default::default() }]; - env.create_physical_table_metadata(table_info, region_routes) .await; + let table_route_value = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_route_value + .into_inner() + .into_physical_table_route() + .region_routes; let err = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await + .build_upgrade_candidate_region_metadata( + &mut ctx, + &[RegionId::new(1024, 1)], + region_routes, + ) .unwrap_err(); assert_matches!(err, Error::RegionRouteNotFound { .. }); @@ -268,9 +305,17 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; + let table_route_value = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_route_value + .into_inner() + .into_physical_table_route() + .region_routes; let err = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await + .build_upgrade_candidate_region_metadata( + &mut ctx, + &[RegionId::new(1024, 1)], + region_routes, + ) .unwrap_err(); assert_matches!(err, Error::Unexpected { .. }); @@ -297,9 +342,17 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; + let table_route_value = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_route_value + .into_inner() + .into_physical_table_route() + .region_routes; let new_region_routes = state - .build_upgrade_candidate_region_metadata(&mut ctx) - .await + .build_upgrade_candidate_region_metadata( + &mut ctx, + &[RegionId::new(1024, 1)], + region_routes, + ) .unwrap(); assert!(!new_region_routes[0].is_leader_downgrading()); @@ -327,8 +380,11 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; - - let updated = state.check_metadata_updated(&mut ctx).await.unwrap(); + let table_routes = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_routes.region_routes().unwrap(); + let updated = state + .check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes) + .unwrap(); assert!(!updated); } @@ -352,7 +408,11 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; - let updated = state.check_metadata_updated(&mut ctx).await.unwrap(); + let table_routes = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_routes.region_routes().unwrap(); + let updated = state + .check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes) + .unwrap(); assert!(updated); } @@ -376,7 +436,11 @@ mod tests { env.create_physical_table_metadata(table_info, region_routes) .await; - let err = state.check_metadata_updated(&mut ctx).await.unwrap_err(); + let table_routes = ctx.get_table_route_value(1024).await.unwrap(); + let region_routes = table_routes.region_routes().unwrap(); + let err = state + .check_metadata_updated(&mut ctx, &[RegionId::new(1024, 1)], region_routes) + .unwrap_err(); assert_matches!(err, Error::Unexpected { .. 
}); assert!(err.to_string().contains("intermediate state")); } @@ -401,7 +465,7 @@ mod tests { let guard = opening_keeper .register(2, RegionId::new(table_id, 1)) .unwrap(); - ctx.volatile_ctx.opening_region_guard = Some(guard); + ctx.volatile_ctx.opening_region_guards.push(guard); env.create_physical_table_metadata(table_info, region_routes) .await; @@ -425,7 +489,7 @@ mod tests { .unwrap(); let region_routes = table_route.region_routes().unwrap(); - assert!(ctx.volatile_ctx.opening_region_guard.is_none()); + assert!(ctx.volatile_ctx.opening_region_guards.is_empty()); assert_eq!(region_routes.len(), 1); assert!(!region_routes[0].is_leader_downgrading()); assert!(region_routes[0].follower_peers.is_empty()); diff --git a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs index 155130db41..0390ddf0da 100644 --- a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::any::Any; +use std::collections::HashSet; use std::time::Duration; use api::v1::meta::MailboxMessage; @@ -20,10 +21,11 @@ use common_meta::ddl::utils::parse_region_wal_options; use common_meta::instruction::{ Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply, UpgradeRegionsReply, }; +use common_meta::key::topic_region::TopicRegionKey; use common_meta::lock_key::RemoteWalLock; use common_meta::wal_options_allocator::extract_topic_from_wal_options; use common_procedure::{Context as ProcedureContext, Status}; -use common_telemetry::{error, warn}; +use common_telemetry::{error, info}; use common_wal::options::WalOptions; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, ensure}; @@ -66,17 +68,9 @@ impl State for UpgradeCandidateRegion { ) -> Result<(Box, Status)> { let now = Instant::now(); - let region_wal_option = self.get_region_wal_option(ctx).await?; - let region_id = ctx.persistent_ctx.region_id; - if region_wal_option.is_none() { - warn!( - "Region {} wal options not found, during upgrade candidate region", - region_id - ); - } - + let topics = self.get_kafka_topics(ctx).await?; if self - .upgrade_region_with_retry(ctx, procedure_ctx, region_wal_option.as_ref()) + .upgrade_region_with_retry(ctx, procedure_ctx, topics) .await { ctx.update_upgrade_candidate_region_elapsed(now); @@ -93,24 +87,32 @@ impl State for UpgradeCandidateRegion { } impl UpgradeCandidateRegion { - async fn get_region_wal_option(&self, ctx: &mut Context) -> Result> { - let region_id = ctx.persistent_ctx.region_id; - match ctx.get_from_peer_datanode_table_value().await { - Ok(datanode_table_value) => { - let region_wal_options = - parse_region_wal_options(&datanode_table_value.region_info.region_wal_options) - .context(error::ParseWalOptionsSnafu)?; - Ok(region_wal_options.get(®ion_id.region_number()).cloned()) + async fn get_kafka_topics(&self, ctx: &mut Context) -> Result> { + let table_regions = ctx.persistent_ctx.table_regions(); + let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?; + let mut topics = HashSet::new(); + for (table_id, regions) in table_regions { + let Some(datanode_table_value) = datanode_table_values.get(&table_id) else { + continue; + }; + + let region_wal_options = + parse_region_wal_options(&datanode_table_value.region_info.region_wal_options) + .context(error::ParseWalOptionsSnafu)?; + + for region_id in regions { + let 
Some(WalOptions::Kafka(kafka_wal_options)) = + region_wal_options.get(®ion_id.region_number()) + else { + continue; + }; + if !topics.contains(&kafka_wal_options.topic) { + topics.insert(kafka_wal_options.topic.clone()); + } } - Err(error::Error::DatanodeTableNotFound { datanode_id, .. }) => { - warn!( - "Datanode table not found, during upgrade candidate region, the target region might already been migrated, region_id: {}, datanode_id: {}", - region_id, datanode_id - ); - Ok(None) - } - Err(e) => Err(e), } + + Ok(topics) } /// Builds upgrade region instruction. @@ -119,35 +121,105 @@ impl UpgradeCandidateRegion { ctx: &mut Context, replay_timeout: Duration, ) -> Result { - let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; - let last_entry_id = ctx.volatile_ctx.leader_region_last_entry_id; - let metadata_last_entry_id = ctx.volatile_ctx.leader_region_metadata_last_entry_id; - // Try our best to retrieve replay checkpoint. - let datanode_table_value = ctx.get_from_peer_datanode_table_value().await.ok(); - let checkpoint = if let Some(topic) = datanode_table_value.as_ref().and_then(|v| { - extract_topic_from_wal_options(region_id, &v.region_info.region_wal_options) - }) { - ctx.fetch_replay_checkpoint(&topic).await.ok().flatten() - } else { - None - }; + let region_ids = ctx.persistent_ctx.region_ids.clone(); + let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?; + let mut region_topic = Vec::with_capacity(region_ids.len()); + for region_id in region_ids.iter() { + let table_id = region_id.table_id(); + if let Some(datanode_table_value) = datanode_table_values.get(&table_id) + && let Some(topic) = extract_topic_from_wal_options( + *region_id, + &datanode_table_value.region_info.region_wal_options, + ) + { + region_topic.push((*region_id, topic)); + } + } - let upgrade_instruction = Instruction::UpgradeRegions(vec![ - UpgradeRegion { + let replay_checkpoints = ctx + .get_replay_checkpoints( + region_topic + .iter() + .map(|(region_id, topic)| TopicRegionKey::new(*region_id, topic)) + .collect(), + ) + .await?; + // Build upgrade regions instruction. 
+ let mut upgrade_regions = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + let last_entry_id = ctx + .volatile_ctx + .leader_region_last_entry_ids + .get(®ion_id) + .copied(); + let metadata_last_entry_id = ctx + .volatile_ctx + .leader_region_metadata_last_entry_ids + .get(®ion_id) + .copied(); + let checkpoint = replay_checkpoints.get(®ion_id).copied(); + upgrade_regions.push(UpgradeRegion { region_id, last_entry_id, metadata_last_entry_id, replay_timeout, location_id: Some(ctx.persistent_ctx.from_peer.id), - replay_entry_id: None, - metadata_replay_entry_id: None, - } - .with_replay_entry_id(checkpoint.map(|c| c.entry_id)) - .with_metadata_replay_entry_id(checkpoint.and_then(|c| c.metadata_entry_id)), - ]); + replay_entry_id: checkpoint.map(|c| c.entry_id), + metadata_replay_entry_id: checkpoint.and_then(|c| c.metadata_entry_id), + }); + } - Ok(upgrade_instruction) + Ok(Instruction::UpgradeRegions(upgrade_regions)) + } + + fn handle_upgrade_region_reply( + &self, + ctx: &mut Context, + UpgradeRegionReply { + region_id, + ready, + exists, + error, + }: &UpgradeRegionReply, + now: &Instant, + ) -> Result<()> { + let candidate = &ctx.persistent_ctx.to_peer; + if error.is_some() { + return error::RetryLaterSnafu { + reason: format!( + "Failed to upgrade the region {} on datanode {:?}, error: {:?}, elapsed: {:?}", + region_id, + candidate, + error, + now.elapsed() + ), + } + .fail(); + } + + ensure!( + exists, + error::UnexpectedSnafu { + violated: format!( + "Candidate region {} doesn't exist on datanode {:?}", + region_id, candidate + ) + } + ); + + if self.require_ready && !ready { + return error::RetryLaterSnafu { + reason: format!( + "Candidate region {} still replaying the wal on datanode {:?}, elapsed: {:?}", + region_id, + candidate, + now.elapsed() + ), + } + .fail(); + } + + Ok(()) } /// Tries to upgrade a candidate region. @@ -175,11 +247,11 @@ impl UpgradeCandidateRegion { .await?; let pc = &ctx.persistent_ctx; - let region_id = pc.region_id; + let region_ids = &pc.region_ids; let candidate = &pc.to_peer; let msg = MailboxMessage::json_message( - &format!("Upgrade candidate region: {}", region_id), + &format!("Upgrade candidate regions: {:?}", region_ids), &format!("Metasrv@{}", ctx.server_addr()), &format!("Datanode-{}@{}", candidate.id, candidate.addr), common_time::util::current_time_millis(), @@ -192,9 +264,16 @@ impl UpgradeCandidateRegion { let ch = Channel::Datanode(candidate.id); let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?; + let now = Instant::now(); match receiver.await { Ok(msg) => { let reply = HeartbeatMailbox::json_reply(&msg)?; + info!( + "Received upgrade region reply: {:?}, regions: {:?}, elapsed: {:?}", + reply, + region_ids, + now.elapsed() + ); let InstructionReply::UpgradeRegions(UpgradeRegionsReply { replies }) = reply else { return error::UnexpectedInstructionReplySnafu { @@ -203,51 +282,16 @@ impl UpgradeCandidateRegion { } .fail(); }; - // TODO(weny): handle multiple replies. - let UpgradeRegionReply { - ready, - exists, - error, - .. - } = &replies[0]; - - // Notes: The order of handling is important. 
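// Illustrative sketch (not part of this patch): the per-reply classification that
// `handle_upgrade_region_reply` above applies to each entry of the batched
// `UpgradeRegionsReply` — an error maps to "retry later", a missing candidate
// region is an unexpected state, and a not-yet-ready region is retried only when
// readiness is required. The enum below is a stand-in for the real error types.
enum ReplyOutcome {
    Accepted,
    RetryLater(String),
    Unexpected(String),
}

fn classify(ready: bool, exists: bool, error: Option<&str>, require_ready: bool) -> ReplyOutcome {
    if let Some(err) = error {
        return ReplyOutcome::RetryLater(format!("upgrade failed: {err}"));
    }
    if !exists {
        return ReplyOutcome::Unexpected("candidate region doesn't exist on datanode".into());
    }
    if require_ready && !ready {
        return ReplyOutcome::RetryLater("still replaying the wal".into());
    }
    ReplyOutcome::Accepted
}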
- if error.is_some() { - return error::RetryLaterSnafu { - reason: format!( - "Failed to upgrade the region {} on datanode {:?}, error: {:?}", - region_id, candidate, error - ), - } - .fail(); + for reply in replies { + self.handle_upgrade_region_reply(ctx, &reply, &now)?; } - - ensure!( - exists, - error::UnexpectedSnafu { - violated: format!( - "Candidate region {} doesn't exist on datanode {:?}", - region_id, candidate - ) - } - ); - - if self.require_ready && !ready { - return error::RetryLaterSnafu { - reason: format!( - "Candidate region {} still replaying the wal on datanode {:?}", - region_id, candidate - ), - } - .fail(); - } - Ok(()) } Err(error::Error::MailboxTimeout { .. }) => { let reason = format!( - "Mailbox received timeout for upgrade candidate region {region_id} on datanode {:?}", + "Mailbox received timeout for upgrade candidate regions {region_ids:?} on datanode {:?}, elapsed: {:?}", candidate, + now.elapsed() ); error::RetryLaterSnafu { reason }.fail() } @@ -262,26 +306,24 @@ impl UpgradeCandidateRegion { &self, ctx: &mut Context, procedure_ctx: &ProcedureContext, - wal_options: Option<&WalOptions>, + topics: HashSet, ) -> bool { let mut retry = 0; let mut upgraded = false; + let mut guards = Vec::with_capacity(topics.len()); loop { let timer = Instant::now(); // If using Kafka WAL, acquire a read lock on the topic to prevent WAL pruning during the upgrade. - let _guard = if let Some(WalOptions::Kafka(kafka_wal_options)) = wal_options { - Some( + for topic in &topics { + guards.push( procedure_ctx .provider - .acquire_lock( - &(RemoteWalLock::Read(kafka_wal_options.topic.clone()).into()), - ) + .acquire_lock(&(RemoteWalLock::Read(topic.clone()).into())) .await, - ) - } else { - None - }; + ); + } + if let Err(err) = self.upgrade_region(ctx).await { retry += 1; ctx.update_operations_elapsed(timer); @@ -327,22 +369,21 @@ mod tests { }; fn new_persistent_context() -> PersistentContext { - PersistentContext { - catalog: "greptime".into(), - schema: "public".into(), - from_peer: Peer::empty(1), - to_peer: Peer::empty(2), - region_id: RegionId::new(1024, 1), - timeout: Duration::from_millis(1000), - trigger_reason: RegionMigrationTriggerReason::Manual, - } + PersistentContext::new( + vec![("greptime".into(), "public".into())], + Peer::empty(1), + Peer::empty(2), + vec![RegionId::new(1024, 1)], + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + ) } async fn prepare_table_metadata(ctx: &Context, wal_options: HashMap) { - let table_info = - new_test_table_info(ctx.persistent_ctx.region_id.table_id(), vec![1]).into(); + let region_id = ctx.persistent_ctx.region_ids[0]; + let table_info = new_test_table_info(region_id.table_id(), vec![1]).into(); let region_routes = vec![RegionRoute { - region: Region::new_test(ctx.persistent_ctx.region_id), + region: Region::new_test(region_id), leader_peer: Some(ctx.persistent_ctx.from_peer.clone()), follower_peers: vec![ctx.persistent_ctx.to_peer.clone()], ..Default::default() diff --git a/src/meta-srv/src/procedure/region_migration/utils.rs b/src/meta-srv/src/procedure/region_migration/utils.rs new file mode 100644 index 0000000000..09921ee0d6 --- /dev/null +++ b/src/meta-srv/src/procedure/region_migration/utils.rs @@ -0,0 +1,487 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::Display; +use std::time::Duration; + +use common_meta::key::TableMetadataManagerRef; +use common_meta::peer::Peer; +use common_meta::rpc::router::RegionRoute; +use itertools::Itertools; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::{RegionId, TableId}; + +use crate::error::{self, Result}; +use crate::procedure::region_migration::{ + DEFAULT_REGION_MIGRATION_TIMEOUT, RegionMigrationProcedureTask, RegionMigrationTriggerReason, +}; + +/// A migration task describing how regions are intended to move between peers. +#[derive(Debug, Clone)] +pub struct RegionMigrationTaskBatch { + /// Region ids involved in this migration. + pub region_ids: Vec, + /// Source peer where regions currently reside. + pub from_peer: Peer, + /// Destination peer to migrate regions to. + pub to_peer: Peer, + /// Timeout for migration. + pub timeout: Duration, + /// Reason why this migration was triggered. + pub trigger_reason: RegionMigrationTriggerReason, +} + +impl RegionMigrationTaskBatch { + /// Constructs a [`RegionMigrationTaskBatch`] from a vector of region migration procedure tasks. + /// + /// Aggregates region IDs, determines source and destination peers, sets an appropriate timeout, + /// and assigns the trigger reason for the migration batch. + /// + /// # Panic + /// if the `tasks` are empty. + pub fn from_tasks(tasks: Vec<(RegionMigrationProcedureTask, u32)>) -> Self { + let max_count = tasks.iter().map(|(_, count)| *count).max().unwrap_or(1); + let region_ids = tasks.iter().map(|(r, _)| r.region_id).collect::>(); + let from_peer = tasks[0].0.from_peer.clone(); + let to_peer = tasks[0].0.to_peer.clone(); + let timeout = DEFAULT_REGION_MIGRATION_TIMEOUT * max_count; + let trigger_reason = RegionMigrationTriggerReason::Failover; + Self { + region_ids, + from_peer, + to_peer, + timeout, + trigger_reason, + } + } +} + +impl Display for RegionMigrationTaskBatch { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "RegionMigrationTask {{ region_ids: {:?}, from_peer: {:?}, to_peer: {:?}, timeout: {:?}, trigger_reason: {:?} }}", + self.region_ids, self.from_peer, self.to_peer, self.timeout, self.trigger_reason + ) + } +} + +impl RegionMigrationTaskBatch { + /// Returns the table regions map. + /// + /// The key is the table id, the value is the region ids of the table. + pub(crate) fn table_regions(&self) -> HashMap> { + let mut table_regions = HashMap::new(); + for region_id in &self.region_ids { + table_regions + .entry(region_id.table_id()) + .or_insert_with(Vec::new) + .push(*region_id); + } + table_regions + } +} + +/// Represents the result of analyzing a migration task. +#[derive(Debug, Clone, Default, PartialEq)] +pub(crate) struct RegionMigrationAnalysis { + /// Regions already migrated to the `to_peer`. + pub(crate) migrated: Vec, + /// Regions where the leader peer has changed. + pub(crate) leader_changed: Vec, + /// Regions where `to_peer` is already a follower (conflict). + pub(crate) peer_conflict: Vec, + /// Regions whose table is not found. 
+/// Represents the result of analyzing a migration task.
+#[derive(Debug, Clone, Default, PartialEq)]
+pub(crate) struct RegionMigrationAnalysis {
+    /// Regions already migrated to the `to_peer`.
+    pub(crate) migrated: Vec<RegionId>,
+    /// Regions where the leader peer has changed.
+    pub(crate) leader_changed: Vec<RegionId>,
+    /// Regions where `to_peer` is already a follower (conflict).
+    pub(crate) peer_conflict: Vec<RegionId>,
+    /// Regions whose table is not found.
+    pub(crate) table_not_found: Vec<RegionId>,
+    /// Regions still pending migration.
+    pub(crate) pending: Vec<RegionId>,
+}
+
+fn leader_peer(region_route: &RegionRoute) -> Result<&Peer> {
+    region_route
+        .leader_peer
+        .as_ref()
+        .with_context(|| error::UnexpectedSnafu {
+            violated: format!(
+                "Region route leader peer is not found in region({})",
+                region_route.region.id
+            ),
+        })
+}
+
+/// Returns true if the region has already been migrated to `to_peer`.
+fn has_migrated(region_route: &RegionRoute, to_peer_id: u64) -> Result<bool> {
+    if region_route.is_leader_downgrading() {
+        return Ok(false);
+    }
+
+    let leader_peer = leader_peer(region_route)?;
+    Ok(leader_peer.id == to_peer_id)
+}
+
+/// Returns true if the leader peer of the region has changed.
+fn has_leader_changed(region_route: &RegionRoute, from_peer_id: u64) -> Result<bool> {
+    let leader_peer = leader_peer(region_route)?;
+
+    Ok(leader_peer.id != from_peer_id)
+}
+
+/// Returns true if `to_peer` is already a follower of the region (conflict).
+fn has_peer_conflict(region_route: &RegionRoute, to_peer_id: u64) -> bool {
+    region_route
+        .follower_peers
+        .iter()
+        .map(|p| p.id)
+        .contains(&to_peer_id)
+}
+
+/// Updates the verification result based on a single region route.
+fn update_result_with_region_route(
+    result: &mut RegionMigrationAnalysis,
+    region_route: &RegionRoute,
+    from_peer_id: u64,
+    to_peer_id: u64,
+) -> Result<()> {
+    if has_migrated(region_route, to_peer_id)? {
+        result.migrated.push(region_route.region.id);
+        return Ok(());
+    }
+    if has_leader_changed(region_route, from_peer_id)? {
+        result.leader_changed.push(region_route.region.id);
+        return Ok(());
+    }
+    if has_peer_conflict(region_route, to_peer_id) {
+        result.peer_conflict.push(region_route.region.id);
+        return Ok(());
+    }
+    result.pending.push(region_route.region.id);
+    Ok(())
+}
+
+/// Analyzes the migration task and categorizes regions by their current state.
+///
+/// Returns a [`RegionMigrationAnalysis`] describing the migration status.
+pub async fn analyze_region_migration_task(
+    task: &RegionMigrationTaskBatch,
+    table_metadata_manager: &TableMetadataManagerRef,
+) -> Result<RegionMigrationAnalysis> {
+    if task.to_peer.id == task.from_peer.id {
+        return error::InvalidArgumentsSnafu {
+            err_msg: format!(
+                "The `from_peer_id`({}) can't equal `to_peer_id`({})",
+                task.from_peer.id, task.to_peer.id
+            ),
+        }
+        .fail();
+    }
+    let table_regions = task.table_regions();
+    let table_ids = table_regions.keys().cloned().collect::<Vec<_>>();
+    let mut result = RegionMigrationAnalysis::default();
+
+    let table_routes = table_metadata_manager
+        .table_route_manager()
+        .table_route_storage()
+        .batch_get_with_raw_bytes(&table_ids)
+        .await
+        .context(error::TableMetadataManagerSnafu)?;
+
+    for (table_id, table_route) in table_ids.into_iter().zip(table_routes) {
+        let region_ids = table_regions.get(&table_id).unwrap();
+        let Some(table_route) = table_route else {
+            result.table_not_found.extend(region_ids);
+            continue;
+        };
+        // Throws error if the table route is not a physical table route.
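The helpers above give `update_result_with_region_route` a fixed precedence: migrated first, then leader-changed, then peer-conflict, otherwise pending. A dependency-free restatement of that precedence; the `Category` enum and `categorize` function below are illustrative and not part of the crate:

/// Illustrative categories matching the buckets of `RegionMigrationAnalysis`.
#[derive(Debug, PartialEq)]
enum Category {
    Migrated,
    LeaderChanged,
    PeerConflict,
    Pending,
}

// Same precedence as `update_result_with_region_route`, expressed over plain ids;
// a downgrading leader is never treated as migrated.
fn categorize(
    leader_id: u64,
    follower_ids: &[u64],
    from_peer_id: u64,
    to_peer_id: u64,
    leader_downgrading: bool,
) -> Category {
    if !leader_downgrading && leader_id == to_peer_id {
        return Category::Migrated;
    }
    if leader_id != from_peer_id {
        return Category::LeaderChanged;
    }
    if follower_ids.contains(&to_peer_id) {
        return Category::PeerConflict;
    }
    Category::Pending
}

fn main() {
    // The same four cases exercised by `test_update_result_with_region_route`.
    assert_eq!(categorize(1, &[], 2, 1, false), Category::Migrated);
    assert_eq!(categorize(1, &[], 2, 3, false), Category::LeaderChanged);
    assert_eq!(categorize(1, &[2], 1, 2, false), Category::PeerConflict);
    assert_eq!(categorize(1, &[], 1, 3, false), Category::Pending);
}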
+ let region_routes = table_route.region_routes().with_context(|_| { + error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!("TableRoute({table_id:?}) is a non-physical TableRouteValue."), + } + })?; + for region_route in region_routes + .iter() + .filter(|r| region_ids.contains(&r.region.id)) + { + update_result_with_region_route( + &mut result, + region_route, + task.from_peer.id, + task.to_peer.id, + )?; + } + } + + Ok(result) +} + +#[cfg(test)] +mod tests { + + use std::assert_matches::assert_matches; + use std::sync::Arc; + use std::time::Duration; + + use common_meta::key::TableMetadataManager; + use common_meta::key::table_route::{ + LogicalTableRouteValue, PhysicalTableRouteValue, TableRouteValue, + }; + use common_meta::kv_backend::TxnService; + use common_meta::kv_backend::memory::MemoryKvBackend; + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use store_api::storage::RegionId; + + use crate::error::Error; + use crate::procedure::region_migration::RegionMigrationTriggerReason; + use crate::procedure::region_migration::utils::{ + RegionMigrationAnalysis, RegionMigrationTaskBatch, analyze_region_migration_task, + update_result_with_region_route, + }; + + #[test] + fn test_update_result_with_region_route() { + // The region is already migrated to the to_peer. + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 2, 1).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + migrated: vec![region_id], + ..Default::default() + } + ); + + // Test region leader changed. + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 2, 3).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + leader_changed: vec![region_id], + ..Default::default() + } + ); + + // Test region peer conflict. + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![Peer::empty(2)], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 1, 2).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + peer_conflict: vec![region_id], + ..Default::default() + } + ); + + // Test normal case. 
+ let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + update_result_with_region_route(&mut result, ®ion_route, 1, 3).unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + pending: vec![region_id], + ..Default::default() + } + ); + + // Test leader peer not set + let mut result = RegionMigrationAnalysis::default(); + let region_id = RegionId::new(1, 1); + let region_route = RegionRoute { + region: Region::new_test(region_id), + leader_peer: None, + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }; + let err = update_result_with_region_route(&mut result, ®ion_route, 1, 3).unwrap_err(); + assert_matches!(err, Error::Unexpected { .. }); + } + + #[tokio::test] + async fn test_analyze_region_migration_task_invalid_task() { + let task = &RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(1), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let err = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap_err(); + assert_matches!(err, Error::InvalidArguments { .. }); + } + + #[tokio::test] + async fn test_analyze_region_migration_table_not_found() { + let task = &RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let result = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + table_not_found: vec![RegionId::new(1, 1)], + ..Default::default() + } + ); + } + + #[tokio::test] + async fn test_analyze_region_migration_unexpected_logical_table() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let (txn, _) = table_metadata_manager + .table_route_manager() + .table_route_storage() + .build_create_txn( + 1024, + &TableRouteValue::Logical(LogicalTableRouteValue::new( + 1024, + vec![RegionId::new(1023, 1)], + )), + ) + .unwrap(); + kv_backend.txn(txn).await.unwrap(); + let task = &RegionMigrationTaskBatch { + region_ids: vec![RegionId::new(1024, 1)], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let err = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap_err(); + assert_matches!(err, Error::UnexpectedLogicalRouteTable { .. 
}); + } + + #[tokio::test] + async fn test_analyze_region_migration_normal_case() { + let kv_backend = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let (txn, _) = table_metadata_manager + .table_route_manager() + .table_route_storage() + .build_create_txn( + 1024, + &TableRouteValue::Physical(PhysicalTableRouteValue::new(vec![ + // Already migrated to the to_peer. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 1)), + leader_peer: Some(Peer::empty(2)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + // Leader peer changed. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 2)), + leader_peer: Some(Peer::empty(3)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + // Peer conflict. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 3)), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![Peer::empty(2)], + leader_state: None, + leader_down_since: None, + }, + // Normal case. + RegionRoute { + region: Region::new_test(RegionId::new(1024, 4)), + leader_peer: Some(Peer::empty(1)), + follower_peers: vec![], + leader_state: None, + leader_down_since: None, + }, + ])), + ) + .unwrap(); + + kv_backend.txn(txn).await.unwrap(); + let task = &RegionMigrationTaskBatch { + region_ids: vec![ + RegionId::new(1024, 1), + RegionId::new(1024, 2), + RegionId::new(1024, 3), + RegionId::new(1024, 4), + RegionId::new(1025, 1), + ], + from_peer: Peer::empty(1), + to_peer: Peer::empty(2), + timeout: Duration::from_millis(1000), + trigger_reason: RegionMigrationTriggerReason::Manual, + }; + let result = analyze_region_migration_task(task, &table_metadata_manager) + .await + .unwrap(); + assert_eq!( + result, + RegionMigrationAnalysis { + pending: vec![RegionId::new(1024, 4)], + migrated: vec![RegionId::new(1024, 1)], + leader_changed: vec![RegionId::new(1024, 2)], + peer_conflict: vec![RegionId::new(1024, 3)], + table_not_found: vec![RegionId::new(1025, 1)], + } + ); + } +} diff --git a/src/meta-srv/src/procedure/repartition.rs b/src/meta-srv/src/procedure/repartition.rs new file mode 100644 index 0000000000..f55d349df5 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition.rs @@ -0,0 +1,19 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod group; +pub mod plan; + +#[cfg(test)] +pub mod test_util; diff --git a/src/meta-srv/src/procedure/repartition/group.rs b/src/meta-srv/src/procedure/repartition/group.rs new file mode 100644 index 0000000000..7c3ee14e64 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group.rs @@ -0,0 +1,284 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) mod repartition_start; +pub(crate) mod update_metadata; + +use std::any::Any; +use std::fmt::Debug; + +use common_error::ext::BoxedError; +use common_meta::DatanodeId; +use common_meta::cache_invalidator::CacheInvalidatorRef; +use common_meta::instruction::CacheIdent; +use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue, RegionInfo}; +use common_meta::key::table_route::TableRouteValue; +use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; +use common_meta::rpc::router::RegionRoute; +use common_procedure::{Context as ProcedureContext, Status}; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::{RegionId, TableId}; +use uuid::Uuid; + +use crate::error::{self, Result}; +use crate::procedure::repartition::plan::RegionDescriptor; + +pub type GroupId = Uuid; + +pub struct RepartitionGroupProcedure {} + +pub struct Context { + pub persistent_ctx: PersistentContext, + + pub cache_invalidator: CacheInvalidatorRef, + + pub table_metadata_manager: TableMetadataManagerRef, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct GroupPrepareResult { + pub source_routes: Vec, + pub target_routes: Vec, + pub central_region: RegionId, + pub central_region_datanode_id: DatanodeId, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistentContext { + pub group_id: GroupId, + /// The table id of the repartition group. + pub table_id: TableId, + /// The source regions of the repartition group. + pub sources: Vec, + /// The target regions of the repartition group. + pub targets: Vec, + /// The result of group prepare. + /// The value will be set in [RepartitionStart](crate::procedure::repartition::group::repartition_start::RepartitionStart) state. + pub group_prepare_result: Option, +} + +impl Context { + /// Retrieves the table route value for the given table id. + /// + /// Retry: + /// - Failed to retrieve the metadata of table. + /// + /// Abort: + /// - Table route not found. + pub async fn get_table_route_value( + &self, + ) -> Result> { + let table_id = self.persistent_ctx.table_id; + let group_id = self.persistent_ctx.group_id; + let table_route_value = self + .table_metadata_manager + .table_route_manager() + .table_route_storage() + .get_with_raw_bytes(table_id) + .await + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to get table route for table: {}, repartition group: {}", + table_id, group_id + ), + })? + .context(error::TableRouteNotFoundSnafu { table_id })?; + + Ok(table_route_value) + } + + /// Returns the `datanode_table_value` + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. 
+ pub async fn get_datanode_table_value( + &self, + table_id: TableId, + datanode_id: u64, + ) -> Result { + let datanode_table_value = self + .table_metadata_manager + .datanode_table_manager() + .get(&DatanodeTableKey { + datanode_id, + table_id, + }) + .await + .context(error::TableMetadataManagerSnafu) + .map_err(BoxedError::new) + .with_context(|_| error::RetryLaterWithSourceSnafu { + reason: format!("Failed to get DatanodeTable: {table_id}"), + })? + .context(error::DatanodeTableNotFoundSnafu { + table_id, + datanode_id, + })?; + Ok(datanode_table_value) + } + + /// Broadcasts the invalidate table cache message. + pub async fn invalidate_table_cache(&self) -> Result<()> { + let table_id = self.persistent_ctx.table_id; + let group_id = self.persistent_ctx.group_id; + let subject = format!( + "Invalidate table cache for repartition table, group: {}, table: {}", + group_id, table_id, + ); + let ctx = common_meta::cache_invalidator::Context { + subject: Some(subject), + }; + let _ = self + .cache_invalidator + .invalidate(&ctx, &[CacheIdent::TableId(table_id)]) + .await; + Ok(()) + } + + /// Updates the table route. + /// + /// Retry: + /// - Failed to retrieve the metadata of datanode table. + /// + /// Abort: + /// - Table route not found. + /// - Failed to update the table route. + pub async fn update_table_route( + &self, + current_table_route_value: &DeserializedValueWithBytes, + new_region_routes: Vec, + ) -> Result<()> { + let table_id = self.persistent_ctx.table_id; + // Safety: prepare result is set in [RepartitionStart] state. + let prepare_result = self.persistent_ctx.group_prepare_result.as_ref().unwrap(); + let central_region_datanode_table_value = self + .get_datanode_table_value(table_id, prepare_result.central_region_datanode_id) + .await?; + let RegionInfo { + region_options, + region_wal_options, + .. + } = ¢ral_region_datanode_table_value.region_info; + + self.table_metadata_manager + .update_table_route( + table_id, + central_region_datanode_table_value.region_info.clone(), + current_table_route_value, + new_region_routes, + region_options, + region_wal_options, + ) + .await + .context(error::TableMetadataManagerSnafu) + } +} + +/// Returns the region routes of the given table route value. +/// +/// Abort: +/// - Table route value is not physical. +pub fn region_routes( + table_id: TableId, + table_route_value: &TableRouteValue, +) -> Result<&Vec> { + table_route_value + .region_routes() + .with_context(|_| error::UnexpectedLogicalRouteTableSnafu { + err_msg: format!( + "TableRoute({:?}) is a non-physical TableRouteValue.", + table_id + ), + }) +} + +#[async_trait::async_trait] +#[typetag::serde(tag = "repartition_group_state")] +pub(crate) trait State: Sync + Send + Debug { + fn name(&self) -> &'static str { + let type_name = std::any::type_name::(); + // short name + type_name.split("::").last().unwrap_or(type_name) + } + + /// Yields the next [State] and [Status]. 
+ async fn next( + &mut self, + ctx: &mut Context, + procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)>; + + fn as_any(&self) -> &dyn Any; +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + use std::sync::Arc; + + use common_meta::key::TableMetadataManager; + use common_meta::kv_backend::test_util::MockKvBackendBuilder; + + use crate::error::Error; + use crate::procedure::repartition::test_util::{TestingEnv, new_persistent_context}; + + #[tokio::test] + async fn test_get_table_route_value_not_found_error() { + let env = TestingEnv::new(); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context); + let err = ctx.get_table_route_value().await.unwrap_err(); + assert_matches!(err, Error::TableRouteNotFound { .. }); + assert!(!err.is_retryable()); + } + + #[tokio::test] + async fn test_get_table_route_value_retry_error() { + let kv = MockKvBackendBuilder::default() + .range_fn(Arc::new(|_| { + common_meta::error::UnexpectedSnafu { + err_msg: "mock err", + } + .fail() + })) + .build() + .unwrap(); + let mut env = TestingEnv::new(); + env.table_metadata_manager = Arc::new(TableMetadataManager::new(Arc::new(kv))); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context); + let err = ctx.get_table_route_value().await.unwrap_err(); + assert!(err.is_retryable()); + } + + #[tokio::test] + async fn test_get_datanode_table_value_retry_error() { + let kv = MockKvBackendBuilder::default() + .range_fn(Arc::new(|_| { + common_meta::error::UnexpectedSnafu { + err_msg: "mock err", + } + .fail() + })) + .build() + .unwrap(); + let mut env = TestingEnv::new(); + env.table_metadata_manager = Arc::new(TableMetadataManager::new(Arc::new(kv))); + let persistent_context = new_persistent_context(1024, vec![], vec![]); + let ctx = env.create_context(persistent_context); + let err = ctx.get_datanode_table_value(1024, 1).await.unwrap_err(); + assert!(err.is_retryable()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/repartition_start.rs b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs new file mode 100644 index 0000000000..5e72ce613c --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/repartition_start.rs @@ -0,0 +1,273 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
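The `State::name` default in the trait above shortens the fully qualified type name returned by `std::any::type_name` to its last path segment. A standalone sketch of that trick, outside the crate:

// Keep only the last `::` segment of the fully qualified type name.
fn short_name<T>() -> &'static str {
    let type_name = std::any::type_name::<T>();
    type_name.split("::").last().unwrap_or(type_name)
}

struct RepartitionStart;

fn main() {
    // Prints "RepartitionStart" instead of the full module path.
    println!("{}", short_name::<RepartitionStart>());
}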
+ +use std::any::Any; +use std::collections::HashMap; + +use common_meta::rpc::router::RegionRoute; +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::debug; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt, ensure}; + +use crate::error::{self, Result}; +use crate::procedure::repartition::group::{ + Context, GroupId, GroupPrepareResult, State, region_routes, +}; +use crate::procedure::repartition::plan::RegionDescriptor; + +#[derive(Debug, Serialize, Deserialize)] +pub struct RepartitionStart; + +/// Ensures that the partition expression of the region route matches the partition expression of the region descriptor. +fn ensure_region_route_expr_match( + region_route: &RegionRoute, + region_descriptor: &RegionDescriptor, +) -> Result { + let actual = ®ion_route.region.partition_expr; + let expected = region_descriptor + .partition_expr + .as_json_str() + .context(error::SerializePartitionExprSnafu)?; + ensure!( + actual == &expected, + error::PartitionExprMismatchSnafu { + region_id: region_route.region.id, + expected, + actual, + } + ); + Ok(region_route.clone()) +} + +impl RepartitionStart { + /// Ensures that both source and target regions are present in the region routes. + /// + /// Both source and target regions must be present in the region routes (target regions should be allocated before repartitioning). + #[allow(dead_code)] + fn ensure_route_present( + group_id: GroupId, + region_routes: &[RegionRoute], + sources: &[RegionDescriptor], + targets: &[RegionDescriptor], + ) -> Result { + ensure!( + !sources.is_empty(), + error::UnexpectedSnafu { + violated: "Sources are empty" + } + ); + + let region_routes_map = region_routes + .iter() + .map(|r| (r.region.id, r)) + .collect::>(); + let source_region_routes = sources + .iter() + .map(|s| { + region_routes_map + .get(&s.region_id) + .context(error::RepartitionSourceRegionMissingSnafu { + group_id, + region_id: s.region_id, + }) + .and_then(|r| ensure_region_route_expr_match(r, s)) + }) + .collect::>>()?; + let target_region_routes = targets + .iter() + .map(|t| { + region_routes_map + .get(&t.region_id) + .context(error::RepartitionTargetRegionMissingSnafu { + group_id, + region_id: t.region_id, + }) + .map(|r| (*r).clone()) + }) + .collect::>>()?; + let central_region = sources[0].region_id; + let central_region_datanode_id = source_region_routes[0] + .leader_peer + .as_ref() + .context(error::UnexpectedSnafu { + violated: format!( + "Leader peer is not set for central region: {}", + central_region + ), + })? + .id; + + Ok(GroupPrepareResult { + source_routes: source_region_routes, + target_routes: target_region_routes, + central_region, + central_region_datanode_id, + }) + } + + #[allow(dead_code)] + fn next_state() -> (Box, Status) { + // TODO(weny): change it later. + (Box::new(RepartitionStart), Status::executing(true)) + } +} + +#[async_trait::async_trait] +#[typetag::serde] +impl State for RepartitionStart { + /// Captures the group prepare result. + /// + /// Retry: + /// - Failed to get the table route. + /// + /// Abort + /// - Table route not found. + /// - Table route is not physical. + /// - Failed to ensure the route is present. + /// - Failed to capture the group prepare result. 
+ async fn next( + &mut self, + ctx: &mut Context, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + if ctx.persistent_ctx.group_prepare_result.is_some() { + return Ok(Self::next_state()); + } + let table_id = ctx.persistent_ctx.table_id; + let group_id = ctx.persistent_ctx.group_id; + let table_route_value = ctx.get_table_route_value().await?.into_inner(); + let region_routes = region_routes(table_id, &table_route_value)?; + let group_prepare_result = Self::ensure_route_present( + group_id, + region_routes, + &ctx.persistent_ctx.sources, + &ctx.persistent_ctx.targets, + )?; + ctx.persistent_ctx.group_prepare_result = Some(group_prepare_result); + debug!( + "Repartition group {}: captured {} sources, {} targets", + group_id, + ctx.persistent_ctx.sources.len(), + ctx.persistent_ctx.targets.len() + ); + + Ok(Self::next_state()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use store_api::storage::RegionId; + use uuid::Uuid; + + use crate::error::Error; + use crate::procedure::repartition::group::repartition_start::RepartitionStart; + use crate::procedure::repartition::plan::RegionDescriptor; + use crate::procedure::repartition::test_util::range_expr; + + #[test] + fn test_ensure_route_present_missing_source_region() { + let source_region = RegionDescriptor { + region_id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(1024, 2), + partition_expr: range_expr("x", 0, 10), + }; + let region_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(1024, 2), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let err = RepartitionStart::ensure_route_present( + Uuid::new_v4(), + ®ion_routes, + &[source_region], + &[target_region], + ) + .unwrap_err(); + assert_matches!(err, Error::RepartitionSourceRegionMissing { .. }); + } + + #[test] + fn test_ensure_route_present_partition_expr_mismatch() { + let source_region = RegionDescriptor { + region_id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(1024, 2), + partition_expr: range_expr("x", 0, 10), + }; + let region_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 5).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let err = RepartitionStart::ensure_route_present( + Uuid::new_v4(), + ®ion_routes, + &[source_region], + &[target_region], + ) + .unwrap_err(); + assert_matches!(err, Error::PartitionExprMismatch { .. 
}); + } + + #[test] + fn test_ensure_route_present_missing_target_region() { + let source_region = RegionDescriptor { + region_id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(1024, 2), + partition_expr: range_expr("x", 0, 10), + }; + let region_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(1024, 1), + partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let err = RepartitionStart::ensure_route_present( + Uuid::new_v4(), + ®ion_routes, + &[source_region], + &[target_region], + ) + .unwrap_err(); + assert_matches!(err, Error::RepartitionTargetRegionMissing { .. }); + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata.rs new file mode 100644 index 0000000000..8f42ff8432 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata.rs @@ -0,0 +1,80 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) mod apply_staging_region; +pub(crate) mod rollback_staging_region; + +use std::any::Any; + +use common_procedure::{Context as ProcedureContext, Status}; +use common_telemetry::warn; +use serde::{Deserialize, Serialize}; + +use crate::error::Result; +use crate::procedure::repartition::group::repartition_start::RepartitionStart; +use crate::procedure::repartition::group::{Context, State}; + +#[derive(Debug, Serialize, Deserialize)] +pub enum UpdateMetadata { + /// Applies the new partition expressions for staging regions. + ApplyStaging, + /// Rolls back the new partition expressions for staging regions. + RollbackStaging, +} + +impl UpdateMetadata { + #[allow(dead_code)] + fn next_state() -> (Box, Status) { + // TODO(weny): change it later. + (Box::new(RepartitionStart), Status::executing(true)) + } +} + +#[async_trait::async_trait] +#[typetag::serde] +impl State for UpdateMetadata { + async fn next( + &mut self, + ctx: &mut Context, + _procedure_ctx: &ProcedureContext, + ) -> Result<(Box, Status)> { + match self { + UpdateMetadata::ApplyStaging => { + // TODO(weny): If all metadata have already been updated, skip applying staging regions. 
+ self.apply_staging_regions(ctx).await?; + + if let Err(err) = ctx.invalidate_table_cache().await { + warn!( + "Failed to broadcast the invalidate table cache message during the apply staging regions, error: {err:?}" + ); + }; + Ok(Self::next_state()) + } + UpdateMetadata::RollbackStaging => { + self.rollback_staging_regions(ctx).await?; + + if let Err(err) = ctx.invalidate_table_cache().await { + warn!( + "Failed to broadcast the invalidate table cache message during the rollback staging regions, error: {err:?}" + ); + }; + Ok(Self::next_state()) + } + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs new file mode 100644 index 0000000000..6f342931a8 --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/apply_staging_region.rs @@ -0,0 +1,181 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use common_error::ext::BoxedError; +use common_meta::rpc::router::RegionRoute; +use common_telemetry::error; +use snafu::{OptionExt, ResultExt}; + +use crate::error::{self, Result}; +use crate::procedure::repartition::group::update_metadata::UpdateMetadata; +use crate::procedure::repartition::group::{Context, GroupId, region_routes}; +use crate::procedure::repartition::plan::RegionDescriptor; + +impl UpdateMetadata { + /// Applies the new partition expressions for staging regions. + /// + /// Abort: + /// - Target region not found. + /// - Source region not found. + fn apply_staging_region_routes( + group_id: GroupId, + sources: &[RegionDescriptor], + targets: &[RegionDescriptor], + current_region_routes: &[RegionRoute], + ) -> Result> { + let mut region_routes = current_region_routes.to_vec(); + let mut region_routes_map = region_routes + .iter_mut() + .map(|route| (route.region.id, route)) + .collect::>(); + + for target in targets { + let region_route = region_routes_map.get_mut(&target.region_id).context( + error::RepartitionTargetRegionMissingSnafu { + group_id, + region_id: target.region_id, + }, + )?; + region_route.region.partition_expr = target + .partition_expr + .as_json_str() + .context(error::SerializePartitionExprSnafu)?; + region_route.set_leader_staging(); + } + + for source in sources { + let region_route = region_routes_map.get_mut(&source.region_id).context( + error::RepartitionSourceRegionMissingSnafu { + group_id, + region_id: source.region_id, + }, + )?; + region_route.set_leader_staging(); + } + + Ok(region_routes) + } + + /// Applies the new partition expressions for staging regions. + /// + /// Abort: + /// - Table route is not physical. + /// - Target region not found. + /// - Source region not found. + /// - Failed to update the table route. + /// - Central region datanode table value not found. 
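A simplified restatement of what `apply_staging_region_routes` above does to the route list: target regions receive their new partition expression, and both sources and targets are flagged as staging. The `Route` struct and `apply_staging` function are illustrative stand-ins; the real code operates on `RegionRoute` and aborts if a source or target region is missing rather than skipping it.

#[derive(Clone, Debug, PartialEq)]
struct Route {
    id: u64,
    partition_expr: String,
    staging: bool,
}

fn apply_staging(routes: &mut [Route], source_ids: &[u64], targets: &[(u64, String)]) {
    for route in routes.iter_mut() {
        if let Some((_, new_expr)) = targets.iter().find(|(id, _)| *id == route.id) {
            // Targets get the new partition expression and enter the staging state.
            route.partition_expr = new_expr.clone();
            route.staging = true;
        } else if source_ids.contains(&route.id) {
            // Sources keep their expression but are also marked as staging.
            route.staging = true;
        }
    }
}

fn main() {
    let mut routes = vec![
        Route { id: 1, partition_expr: "x >= 0 AND x < 100".into(), staging: false },
        Route { id: 2, partition_expr: String::new(), staging: false },
        Route { id: 3, partition_expr: String::new(), staging: false },
    ];
    apply_staging(&mut routes, &[1], &[(2, "x >= 0 AND x < 10".into())]);
    assert!(routes[0].staging && routes[1].staging && !routes[2].staging);
    assert_eq!(routes[1].partition_expr, "x >= 0 AND x < 10");
}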
+ #[allow(dead_code)] + pub(crate) async fn apply_staging_regions(&self, ctx: &mut Context) -> Result<()> { + let table_id = ctx.persistent_ctx.table_id; + let group_id = ctx.persistent_ctx.group_id; + let current_table_route_value = ctx.get_table_route_value().await?; + let region_routes = region_routes(table_id, current_table_route_value.get_inner_ref())?; + let new_region_routes = Self::apply_staging_region_routes( + group_id, + &ctx.persistent_ctx.sources, + &ctx.persistent_ctx.targets, + region_routes, + )?; + + if let Err(err) = ctx + .update_table_route(¤t_table_route_value, new_region_routes) + .await + { + error!(err; "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}"); + return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}" + ), + }); + }; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_meta::peer::Peer; + use common_meta::rpc::router::{Region, RegionRoute}; + use store_api::storage::RegionId; + use uuid::Uuid; + + use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::plan::RegionDescriptor; + use crate::procedure::repartition::test_util::range_expr; + + #[test] + fn test_generate_region_routes() { + let group_id = Uuid::new_v4(); + let table_id = 1024; + let region_routes = vec![ + RegionRoute { + region: Region { + id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 2), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }, + ]; + let source_region = RegionDescriptor { + region_id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100), + }; + let target_region = RegionDescriptor { + region_id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 0, 10), + }; + + let new_region_routes = UpdateMetadata::apply_staging_region_routes( + group_id, + &[source_region], + &[target_region], + ®ion_routes, + ) + .unwrap(); + assert!(new_region_routes[0].is_leader_staging()); + assert_eq!( + new_region_routes[0].region.partition_expr, + range_expr("x", 0, 100).as_json_str().unwrap() + ); + assert_eq!( + new_region_routes[1].region.partition_expr, + range_expr("x", 0, 10).as_json_str().unwrap() + ); + assert!(new_region_routes[1].is_leader_staging()); + assert!(!new_region_routes[2].is_leader_staging()); + } +} diff --git a/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs new file mode 100644 index 0000000000..3d147d82ad --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/group/update_metadata/rollback_staging_region.rs @@ -0,0 +1,187 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use common_error::ext::BoxedError; +use common_meta::rpc::router::RegionRoute; +use common_telemetry::error; +use snafu::{OptionExt, ResultExt}; + +use crate::error::{self, Result}; +use crate::procedure::repartition::group::update_metadata::UpdateMetadata; +use crate::procedure::repartition::group::{Context, GroupId, region_routes}; + +impl UpdateMetadata { + /// Rolls back the staging regions. + /// + /// Abort: + /// - Source region not found. + /// - Target region not found. + #[allow(dead_code)] + fn rollback_staging_region_routes( + group_id: GroupId, + source_routes: &[RegionRoute], + target_routes: &[RegionRoute], + current_region_routes: &[RegionRoute], + ) -> Result> { + let mut region_routes = current_region_routes.to_vec(); + let mut region_routes_map = region_routes + .iter_mut() + .map(|route| (route.region.id, route)) + .collect::>(); + + for source in source_routes { + let region_route = region_routes_map.get_mut(&source.region.id).context( + error::RepartitionSourceRegionMissingSnafu { + group_id, + region_id: source.region.id, + }, + )?; + region_route.region.partition_expr = source.region.partition_expr.clone(); + region_route.clear_leader_staging(); + } + + for target in target_routes { + let region_route = region_routes_map.get_mut(&target.region.id).context( + error::RepartitionTargetRegionMissingSnafu { + group_id, + region_id: target.region.id, + }, + )?; + region_route.clear_leader_staging(); + } + + Ok(region_routes) + } + + /// Rolls back the metadata for staging regions. + /// + /// Abort: + /// - Table route is not physical. + /// - Source region not found. + /// - Target region not found. + /// - Failed to update the table route. + /// - Central region datanode table value not found. + #[allow(dead_code)] + pub(crate) async fn rollback_staging_regions(&self, ctx: &mut Context) -> Result<()> { + let table_id = ctx.persistent_ctx.table_id; + let group_id = ctx.persistent_ctx.group_id; + let current_table_route_value = ctx.get_table_route_value().await?; + let region_routes = region_routes(table_id, current_table_route_value.get_inner_ref())?; + // Safety: prepare result is set in [RepartitionStart] state. 
+ let prepare_result = ctx.persistent_ctx.group_prepare_result.as_ref().unwrap(); + let new_region_routes = Self::rollback_staging_region_routes( + group_id, + &prepare_result.source_routes, + &prepare_result.target_routes, + region_routes, + )?; + + if let Err(err) = ctx + .update_table_route(¤t_table_route_value, new_region_routes) + .await + { + error!(err; "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}"); + return Err(BoxedError::new(err)).context(error::RetryLaterWithSourceSnafu { + reason: format!( + "Failed to update the table route during the updating metadata for repartition: {table_id}, group_id: {group_id}" + ), + }); + }; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_meta::peer::Peer; + use common_meta::rpc::router::{LeaderState, Region, RegionRoute}; + use store_api::storage::RegionId; + use uuid::Uuid; + + use crate::procedure::repartition::group::update_metadata::UpdateMetadata; + use crate::procedure::repartition::test_util::range_expr; + + #[test] + fn test_rollback_staging_region_routes() { + let group_id = Uuid::new_v4(); + let table_id = 1024; + let region_routes = vec![ + RegionRoute { + region: Region { + id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state: Some(LeaderState::Staging), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 2), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state: Some(LeaderState::Staging), + ..Default::default() + }, + RegionRoute { + region: Region { + id: RegionId::new(table_id, 3), + partition_expr: String::new(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + leader_state: Some(LeaderState::Downgrading), + ..Default::default() + }, + ]; + let source_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(table_id, 1), + partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let target_routes = vec![RegionRoute { + region: Region { + id: RegionId::new(table_id, 2), + partition_expr: range_expr("x", 0, 20).as_json_str().unwrap(), + ..Default::default() + }, + leader_peer: Some(Peer::empty(1)), + ..Default::default() + }]; + let new_region_routes = UpdateMetadata::rollback_staging_region_routes( + group_id, + &source_routes, + &target_routes, + ®ion_routes, + ) + .unwrap(); + assert!(!new_region_routes[0].is_leader_staging()); + assert_eq!( + new_region_routes[0].region.partition_expr, + range_expr("x", 0, 20).as_json_str().unwrap(), + ); + assert!(!new_region_routes[1].is_leader_staging()); + assert!(new_region_routes[2].is_leader_downgrading()); + } +} diff --git a/src/query/src/dataframe.rs b/src/meta-srv/src/procedure/repartition/plan.rs similarity index 56% rename from src/query/src/dataframe.rs rename to src/meta-srv/src/procedure/repartition/plan.rs index ce630b99e7..6d753a044c 100644 --- a/src/query/src/dataframe.rs +++ b/src/meta-srv/src/procedure/repartition/plan.rs @@ -12,20 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use datafusion::dataframe::DataFrame as DfDataFrame; -use datafusion_expr::LogicalPlan; +use partition::expr::PartitionExpr; +use serde::{Deserialize, Serialize}; +use store_api::storage::RegionId; -/// DataFrame represents a logical set of rows with the same named columns. -/// Similar to a Pandas DataFrame or Spark DataFrame -#[derive(Clone)] -pub enum DataFrame { - DataFusion(DfDataFrame), -} - -impl DataFrame { - pub fn into_logical_plan(self) -> LogicalPlan { - match self { - Self::DataFusion(dataframe) => dataframe.into_parts().1, - } - } +/// Metadata describing a region involved in the plan. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct RegionDescriptor { + /// The region id of the region involved in the plan. + pub region_id: RegionId, + /// The new partition expression of the region. + pub partition_expr: PartitionExpr, } diff --git a/src/meta-srv/src/procedure/repartition/test_util.rs b/src/meta-srv/src/procedure/repartition/test_util.rs new file mode 100644 index 0000000000..3c0ebee58a --- /dev/null +++ b/src/meta-srv/src/procedure/repartition/test_util.rs @@ -0,0 +1,91 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; +use common_meta::kv_backend::memory::MemoryKvBackend; +use common_meta::sequence::SequenceBuilder; +use datatypes::value::Value; +use partition::expr::{PartitionExpr, col}; +use store_api::storage::TableId; +use uuid::Uuid; + +use crate::cache_invalidator::MetasrvCacheInvalidator; +use crate::metasrv::MetasrvInfo; +use crate::procedure::repartition::group::{Context, PersistentContext}; +use crate::procedure::repartition::plan::RegionDescriptor; +use crate::procedure::test_util::MailboxContext; + +/// `TestingEnv` provides components during the tests. 
+pub struct TestingEnv { + pub table_metadata_manager: TableMetadataManagerRef, + pub mailbox_ctx: MailboxContext, +} + +impl Default for TestingEnv { + fn default() -> Self { + Self::new() + } +} + +impl TestingEnv { + pub fn new() -> Self { + let kv_backend = Arc::new(MemoryKvBackend::new()); + let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend.clone())); + let mailbox_sequence = + SequenceBuilder::new("test_heartbeat_mailbox", kv_backend.clone()).build(); + let mailbox_ctx = MailboxContext::new(mailbox_sequence); + + Self { + table_metadata_manager, + mailbox_ctx, + } + } + + pub fn create_context(self, persistent_context: PersistentContext) -> Context { + let cache_invalidator = Arc::new(MetasrvCacheInvalidator::new( + self.mailbox_ctx.mailbox().clone(), + MetasrvInfo { + server_addr: String::new(), + }, + )); + + Context { + persistent_ctx: persistent_context, + table_metadata_manager: self.table_metadata_manager.clone(), + cache_invalidator, + } + } +} + +pub fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { + col(col_name) + .gt_eq(Value::Int64(start)) + .and(col(col_name).lt(Value::Int64(end))) +} + +pub fn new_persistent_context( + table_id: TableId, + sources: Vec, + targets: Vec, +) -> PersistentContext { + PersistentContext { + group_id: Uuid::new_v4(), + table_id, + sources, + targets, + group_prepare_result: None, + } +} diff --git a/src/meta-srv/src/region/supervisor.rs b/src/meta-srv/src/region/supervisor.rs index 97acaf4b07..866431dec1 100644 --- a/src/meta-srv/src/region/supervisor.rs +++ b/src/meta-srv/src/region/supervisor.rs @@ -32,7 +32,6 @@ use common_meta::rpc::store::RangeRequest; use common_runtime::JoinHandle; use common_telemetry::{debug, error, info, warn}; use common_time::util::current_time_millis; -use error::Error::{LeaderPeerChanged, MigrationRunning, RegionMigrated, TableRouteNotFound}; use futures::{StreamExt, TryStreamExt}; use snafu::{ResultExt, ensure}; use store_api::storage::RegionId; @@ -45,8 +44,9 @@ use crate::error::{self, Result}; use crate::failure_detector::PhiAccrualFailureDetectorOptions; use crate::metasrv::{RegionStatAwareSelectorRef, SelectTarget, SelectorContext, SelectorRef}; use crate::procedure::region_migration::manager::{ - RegionMigrationManagerRef, RegionMigrationTriggerReason, + RegionMigrationManagerRef, RegionMigrationTriggerReason, SubmitRegionMigrationTaskResult, }; +use crate::procedure::region_migration::utils::RegionMigrationTaskBatch; use crate::procedure::region_migration::{ DEFAULT_REGION_MIGRATION_TIMEOUT, RegionMigrationProcedureTask, }; @@ -131,7 +131,7 @@ pub struct RegionSupervisorTicker { tick_handle: Mutex>>, /// The [`Option`] wrapper allows us to abort the job while dropping the [`RegionSupervisor`]. - initialization_handler: Mutex>>, + initialization_handle: Mutex>>, /// The interval of tick. 
tick_interval: Duration, @@ -176,7 +176,7 @@ impl RegionSupervisorTicker { ); Self { tick_handle: Mutex::new(None), - initialization_handler: Mutex::new(None), + initialization_handle: Mutex::new(None), tick_interval, initialization_delay, initialization_retry_period, @@ -213,7 +213,7 @@ impl RegionSupervisorTicker { } } }); - *self.initialization_handler.lock().unwrap() = Some(initialization_handler); + *self.initialization_handle.lock().unwrap() = Some(initialization_handler); let sender = self.sender.clone(); let ticker_loop = tokio::spawn(async move { @@ -243,7 +243,7 @@ impl RegionSupervisorTicker { handle.abort(); info!("The tick loop is stopped."); } - let initialization_handler = self.initialization_handler.lock().unwrap().take(); + let initialization_handler = self.initialization_handle.lock().unwrap().take(); if let Some(initialization_handler) = initialization_handler { initialization_handler.abort(); info!("The initialization loop is stopped."); @@ -575,11 +575,22 @@ impl RegionSupervisor { .await { Ok(tasks) => { + let mut grouped_tasks: HashMap<(u64, u64), Vec<_>> = HashMap::new(); for (task, count) in tasks { - let region_id = task.region_id; - let datanode_id = task.from_peer.id; - if let Err(err) = self.do_failover(task, count).await { - error!(err; "Failed to execute region failover for region: {}, datanode: {}", region_id, datanode_id); + grouped_tasks + .entry((task.from_peer.id, task.to_peer.id)) + .or_default() + .push((task, count)); + } + + for ((from_peer_id, to_peer_id), tasks) in grouped_tasks { + if tasks.is_empty() { + continue; + } + let task = RegionMigrationTaskBatch::from_tasks(tasks); + let region_ids = task.region_ids.clone(); + if let Err(err) = self.do_failover_tasks(task).await { + error!(err; "Failed to execute region failover for regions: {:?}, from_peer: {}, to_peer: {}", region_ids, from_peer_id, to_peer_id); } } } @@ -688,56 +699,92 @@ impl RegionSupervisor { Ok(tasks) } - async fn do_failover(&mut self, task: RegionMigrationProcedureTask, count: u32) -> Result<()> { + async fn do_failover_tasks(&mut self, task: RegionMigrationTaskBatch) -> Result<()> { let from_peer_id = task.from_peer.id; let to_peer_id = task.to_peer.id; - let region_id = task.region_id; + let timeout = task.timeout; + let trigger_reason = task.trigger_reason; + let result = self + .region_migration_manager + .submit_region_migration_task(task) + .await?; + self.handle_submit_region_migration_task_result( + from_peer_id, + to_peer_id, + timeout, + trigger_reason, + result, + ) + .await + } - info!( - "Failover for region: {}, from_peer: {}, to_peer: {}, timeout: {:?}, tries: {}", - task.region_id, task.from_peer, task.to_peer, task.timeout, count - ); - - if let Err(err) = self.region_migration_manager.submit_procedure(task).await { - return match err { - RegionMigrated { .. } => { - info!( - "Region has been migrated to target peer: {}, removed failover detector for region: {}, datanode: {}", - to_peer_id, region_id, from_peer_id - ); - self.deregister_failure_detectors(vec![(from_peer_id, region_id)]) - .await; - Ok(()) - } - // Returns Ok if it's running or table is dropped. - MigrationRunning { .. } => { - info!( - "Another region migration is running, skip failover for region: {}, datanode: {}", - region_id, from_peer_id - ); - Ok(()) - } - TableRouteNotFound { .. 
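The supervisor change above batches failover by grouping pending tasks on the `(from_peer, to_peer)` pair before building a `RegionMigrationTaskBatch`. A minimal sketch of that grouping, with plain tuples standing in for the real task type:

use std::collections::HashMap;

// One bucket per (from_peer_id, to_peer_id), matching how the batches are formed.
fn group_by_peers<T>(tasks: Vec<(u64, u64, T)>) -> HashMap<(u64, u64), Vec<T>> {
    let mut grouped: HashMap<(u64, u64), Vec<T>> = HashMap::new();
    for (from, to, task) in tasks {
        grouped.entry((from, to)).or_default().push(task);
    }
    grouped
}

fn main() {
    let grouped = group_by_peers(vec![(1u64, 2u64, "r1"), (1, 2, "r2"), (1, 3, "r3")]);
    assert_eq!(grouped[&(1, 2)].len(), 2);
    assert_eq!(grouped[&(1, 3)].len(), 1);
}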
} => { - self.deregister_failure_detectors(vec![(from_peer_id, region_id)]) - .await; - info!( - "Table route is not found, the table is dropped, removed failover detector for region: {}, datanode: {}", - region_id, from_peer_id - ); - Ok(()) - } - LeaderPeerChanged { .. } => { - self.deregister_failure_detectors(vec![(from_peer_id, region_id)]) - .await; - info!( - "Region's leader peer changed, removed failover detector for region: {}, datanode: {}", - region_id, from_peer_id - ); - Ok(()) - } - err => Err(err), - }; - }; + async fn handle_submit_region_migration_task_result( + &mut self, + from_peer_id: DatanodeId, + to_peer_id: DatanodeId, + timeout: Duration, + trigger_reason: RegionMigrationTriggerReason, + result: SubmitRegionMigrationTaskResult, + ) -> Result<()> { + if !result.migrated.is_empty() { + let detecting_regions = result + .migrated + .iter() + .map(|region_id| (from_peer_id, *region_id)) + .collect::>(); + self.deregister_failure_detectors(detecting_regions).await; + info!( + "Region has been migrated to target peer: {}, removed failover detectors for regions: {:?}", + to_peer_id, result.migrated, + ) + } + if !result.migrating.is_empty() { + info!( + "Region is still migrating, skipping failover for regions: {:?}", + result.migrating + ); + } + if !result.table_not_found.is_empty() { + let detecting_regions = result + .table_not_found + .iter() + .map(|region_id| (from_peer_id, *region_id)) + .collect::>(); + self.deregister_failure_detectors(detecting_regions).await; + info!( + "Table is not found, removed failover detectors for regions: {:?}", + result.table_not_found + ); + } + if !result.leader_changed.is_empty() { + let detecting_regions = result + .leader_changed + .iter() + .map(|region_id| (from_peer_id, *region_id)) + .collect::>(); + self.deregister_failure_detectors(detecting_regions).await; + info!( + "Region's leader peer changed, removed failover detectors for regions: {:?}", + result.leader_changed + ); + } + if !result.peer_conflict.is_empty() { + info!( + "Region has peer conflict, ignore failover for regions: {:?}", + result.peer_conflict + ); + } + if !result.submitted.is_empty() { + info!( + "Failover for regions: {:?}, from_peer: {}, to_peer: {}, procedure_id: {:?}, timeout: {:?}, trigger_reason: {:?}", + result.submitted, + from_peer_id, + to_peer_id, + result.procedure_id, + timeout, + trigger_reason, + ); + } Ok(()) } @@ -813,7 +860,10 @@ pub(crate) mod tests { use tokio::time::sleep; use super::RegionSupervisorSelector; - use crate::procedure::region_migration::manager::RegionMigrationManager; + use crate::procedure::region_migration::RegionMigrationTriggerReason; + use crate::procedure::region_migration::manager::{ + RegionMigrationManager, SubmitRegionMigrationTaskResult, + }; use crate::procedure::region_migration::test_util::TestingEnv; use crate::region::supervisor::{ DatanodeHeartbeat, Event, RegionFailureDetectorControl, RegionSupervisor, @@ -929,7 +979,7 @@ pub(crate) mod tests { let (tx, mut rx) = tokio::sync::mpsc::channel(128); let ticker = RegionSupervisorTicker { tick_handle: Mutex::new(None), - initialization_handler: Mutex::new(None), + initialization_handle: Mutex::new(None), tick_interval: Duration::from_millis(10), initialization_delay: Duration::from_millis(100), initialization_retry_period: Duration::from_millis(100), @@ -947,6 +997,8 @@ pub(crate) mod tests { Event::Tick | Event::Clear | Event::InitializeAllRegions(_) ); } + assert!(ticker.initialization_handle.lock().unwrap().is_none()); + 
assert!(ticker.tick_handle.lock().unwrap().is_none()); } } @@ -956,7 +1008,7 @@ pub(crate) mod tests { let (tx, mut rx) = tokio::sync::mpsc::channel(128); let ticker = RegionSupervisorTicker { tick_handle: Mutex::new(None), - initialization_handler: Mutex::new(None), + initialization_handle: Mutex::new(None), tick_interval: Duration::from_millis(1000), initialization_delay: Duration::from_millis(50), initialization_retry_period: Duration::from_millis(50), @@ -1085,4 +1137,172 @@ pub(crate) mod tests { sender.send(Event::Dump(tx)).await.unwrap(); assert!(rx.await.unwrap().is_empty()); } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_migrated() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + migrated: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(!supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.is_empty()); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_migrating() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + migrating: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.contains_key(&detecting_region)); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_table_not_found() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + table_not_found: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(!supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.is_empty()); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_leader_changed() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + leader_changed: vec![region_id], + 
..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(!supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.is_empty()); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_peer_conflict() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + peer_conflict: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.contains_key(&detecting_region)); + } + + #[tokio::test] + async fn test_handle_submit_region_migration_task_result_submitted() { + common_telemetry::init_default_ut_logging(); + let (mut supervisor, _) = new_test_supervisor(); + let region_id = RegionId::new(1, 1); + let detecting_region = (1, region_id); + supervisor + .register_failure_detectors(vec![detecting_region]) + .await; + supervisor.failover_counts.insert(detecting_region, 1); + let result = SubmitRegionMigrationTaskResult { + submitted: vec![region_id], + ..Default::default() + }; + supervisor + .handle_submit_region_migration_task_result( + 1, + 2, + Duration::from_millis(1000), + RegionMigrationTriggerReason::Manual, + result, + ) + .await + .unwrap(); + assert!(supervisor.failure_detector.contains(&detecting_region)); + assert!(supervisor.failover_counts.contains_key(&detecting_region)); + } } diff --git a/src/meta-srv/src/selector/weight_compute.rs b/src/meta-srv/src/selector/weight_compute.rs index 4e651e4ecc..6508f78efe 100644 --- a/src/meta-srv/src/selector/weight_compute.rs +++ b/src/meta-srv/src/selector/weight_compute.rs @@ -195,6 +195,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, @@ -224,6 +225,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, @@ -253,6 +255,7 @@ mod tests { region_manifest: RegionManifestInfo::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, }, data_topic_latest_entry_id: 0, metadata_topic_latest_entry_id: 0, diff --git a/src/meta-srv/src/service/mailbox.rs b/src/meta-srv/src/service/mailbox.rs index f339e5c4da..bede162936 100644 --- a/src/meta-srv/src/service/mailbox.rs +++ b/src/meta-srv/src/service/mailbox.rs @@ -207,6 +207,9 @@ pub trait Mailbox: Send + Sync { async fn broadcast(&self, ch: &BroadcastChannel, msg: &MailboxMessage) -> Result<()>; async fn on_recv(&self, id: MessageId, maybe_msg: Result) -> Result<()>; + + /// Reset all pushers of the mailbox. 
+ async fn reset(&self); } #[cfg(test)] diff --git a/src/meta-srv/src/utils/etcd.rs b/src/meta-srv/src/utils/etcd.rs index 15ac76fd75..508db7c148 100644 --- a/src/meta-srv/src/utils/etcd.rs +++ b/src/meta-srv/src/utils/etcd.rs @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_meta::distributed_time_constants::default_etcd_client_options; use common_meta::kv_backend::etcd::create_etcd_tls_options; -use etcd_client::{Client, ConnectOptions}; +use etcd_client::Client; use servers::tls::{TlsMode, TlsOption}; use snafu::ResultExt; @@ -30,14 +31,15 @@ pub async fn create_etcd_client_with_tls( .filter(|x| !x.is_empty()) .collect::>(); - let connect_options = tls_config - .map(|c| create_etcd_tls_options(&convert_tls_option(c))) - .transpose() - .context(BuildTlsOptionsSnafu)? - .flatten() - .map(|tls_options| ConnectOptions::new().with_tls(tls_options)); + let mut connect_options = default_etcd_client_options(); + if let Some(tls_config) = tls_config + && let Some(tls_options) = create_etcd_tls_options(&convert_tls_option(tls_config)) + .context(BuildTlsOptionsSnafu)? + { + connect_options = connect_options.with_tls(tls_options); + } - Client::connect(&etcd_endpoints, connect_options) + Client::connect(&etcd_endpoints, Some(connect_options)) .await .context(error::ConnectEtcdSnafu) } diff --git a/src/metric-engine/Cargo.toml b/src/metric-engine/Cargo.toml index c601c10912..9beadade16 100644 --- a/src/metric-engine/Cargo.toml +++ b/src/metric-engine/Cargo.toml @@ -14,6 +14,7 @@ async-stream.workspace = true async-trait.workspace = true base64.workspace = true bytes.workspace = true +fxhash = "0.2" common-base.workspace = true common-error.workspace = true common-macro.workspace = true @@ -31,7 +32,6 @@ lazy_static = "1.4" mito-codec.workspace = true mito2.workspace = true moka.workspace = true -mur3 = "0.1" object-store.workspace = true prometheus.workspace = true serde.workspace = true @@ -47,6 +47,12 @@ common-meta = { workspace = true, features = ["testing"] } common-test-util.workspace = true mito2 = { workspace = true, features = ["test"] } common-wal = { workspace = true } +criterion = { version = "0.4", features = ["async", "async_tokio"] } +mur3 = "0.1" + +[[bench]] +name = "bench_tsid_generator" +harness = false [package.metadata.cargo-udeps.ignore] normal = ["aquamarine"] diff --git a/src/metric-engine/benches/bench_tsid_generator.rs b/src/metric-engine/benches/bench_tsid_generator.rs new file mode 100644 index 0000000000..2908bc67ce --- /dev/null +++ b/src/metric-engine/benches/bench_tsid_generator.rs @@ -0,0 +1,273 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
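+
+// Overview: these micro-benchmarks compare the previous mur3-based TSID
+// generator with the current fxhash-based one, covering the fast path
+// (precomputed label name hash) and the slow path (rows containing null labels).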
+ +use std::hash::Hasher; + +use criterion::{Criterion, black_box, criterion_group, criterion_main}; +use fxhash::FxHasher; +use mur3::Hasher128; + +// A random number (from original implementation) +const TSID_HASH_SEED: u32 = 846793005; + +/// Original TSID generator using mur3::Hasher128 +/// Hashes both label name and value for each label pair +struct OriginalTsidGenerator { + hasher: Hasher128, +} + +impl OriginalTsidGenerator { + fn new() -> Self { + Self { + hasher: Hasher128::with_seed(TSID_HASH_SEED), + } + } + + /// Writes a label pair (name and value) to the generator. + fn write_label(&mut self, name: &str, value: &str) { + use std::hash::Hash; + name.hash(&mut self.hasher); + value.hash(&mut self.hasher); + } + + /// Generates a new TSID. + fn finish(&mut self) -> u64 { + // TSID is 64 bits, simply truncate the 128 bits hash + let (hash, _) = self.hasher.finish128(); + hash + } +} + +/// Current TSID generator using fxhash::FxHasher +/// Fast path: pre-computes label name hash, only hashes values +struct CurrentTsidGenerator { + hasher: FxHasher, +} + +impl CurrentTsidGenerator { + fn new() -> Self { + Self { + hasher: FxHasher::default(), + } + } + + fn new_with_label_name_hash(label_name_hash: u64) -> Self { + let mut hasher = FxHasher::default(); + hasher.write_u64(label_name_hash); + Self { hasher } + } + + /// Writes a label value to the generator. + fn write_str(&mut self, value: &str) { + self.hasher.write(value.as_bytes()); + self.hasher.write_u8(0xff); + } + + /// Generates a new TSID. + fn finish(&mut self) -> u64 { + self.hasher.finish() + } +} + +/// Pre-computes label name hash (used in fast path) +fn compute_label_name_hash(labels: &[(&str, &str)]) -> u64 { + let mut hasher = FxHasher::default(); + for (name, _) in labels { + hasher.write(name.as_bytes()); + hasher.write_u8(0xff); + } + hasher.finish() +} + +fn bench_tsid_generator_small(c: &mut Criterion) { + let labels = vec![("namespace", "greptimedb"), ("host", "127.0.0.1")]; + + let mut group = c.benchmark_group("tsid_generator_small_2_labels"); + group.bench_function("original_mur3", |b| { + b.iter(|| { + let mut tsid_gen = OriginalTsidGenerator::new(); + for (name, value) in &labels { + tsid_gen.write_label(black_box(name), black_box(value)); + } + black_box(tsid_gen.finish()) + }) + }); + + let label_name_hash = compute_label_name_hash(&labels); + group.bench_function("current_fxhash_fast_path", |b| { + b.iter(|| { + let mut tsid_gen = + CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash)); + for (_, value) in &labels { + tsid_gen.write_str(black_box(value)); + } + black_box(tsid_gen.finish()) + }) + }); + + group.finish(); +} + +fn bench_tsid_generator_medium(c: &mut Criterion) { + let labels = vec![ + ("namespace", "greptimedb"), + ("host", "127.0.0.1"), + ("region", "us-west-2"), + ("env", "production"), + ("service", "api"), + ]; + + let mut group = c.benchmark_group("tsid_generator_medium_5_labels"); + group.bench_function("original_mur3", |b| { + b.iter(|| { + let mut tsid_gen = OriginalTsidGenerator::new(); + for (name, value) in &labels { + tsid_gen.write_label(black_box(name), black_box(value)); + } + black_box(tsid_gen.finish()) + }) + }); + + let label_name_hash = compute_label_name_hash(&labels); + group.bench_function("current_fxhash_fast_path", |b| { + b.iter(|| { + let mut tsid_gen = + CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash)); + for (_, value) in &labels { + tsid_gen.write_str(black_box(value)); + } + black_box(tsid_gen.finish()) + }) 
+ }); + + group.finish(); +} + +fn bench_tsid_generator_large(c: &mut Criterion) { + let labels = vec![ + ("namespace", "greptimedb"), + ("host", "127.0.0.1"), + ("region", "us-west-2"), + ("env", "production"), + ("service", "api"), + ("version", "v1.0.0"), + ("cluster", "cluster-1"), + ("dc", "dc1"), + ("rack", "rack-1"), + ("pod", "pod-123"), + ]; + + let mut group = c.benchmark_group("tsid_generator_large_10_labels"); + group.bench_function("original_mur3", |b| { + b.iter(|| { + let mut tsid_gen = OriginalTsidGenerator::new(); + for (name, value) in &labels { + tsid_gen.write_label(black_box(name), black_box(value)); + } + black_box(tsid_gen.finish()) + }) + }); + + let label_name_hash = compute_label_name_hash(&labels); + group.bench_function("current_fxhash_fast_path", |b| { + b.iter(|| { + let mut tsid_gen = + CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash)); + for (_, value) in &labels { + tsid_gen.write_str(black_box(value)); + } + black_box(tsid_gen.finish()) + }) + }); + + group.finish(); +} + +fn bench_tsid_generator_slow_path(c: &mut Criterion) { + // Simulate slow path: some labels have null values (empty strings) + let labels_with_nulls = vec![ + ("namespace", "greptimedb"), + ("host", "127.0.0.1"), + ("region", ""), // null + ("env", "production"), + ]; + + let labels_all_non_null = vec![ + ("namespace", "greptimedb"), + ("host", "127.0.0.1"), + ("env", "production"), + ]; + + let mut group = c.benchmark_group("tsid_generator_slow_path_with_nulls"); + + // Original: always hashes name and value + group.bench_function("original_mur3_with_nulls", |b| { + b.iter(|| { + let mut tsid_gen = OriginalTsidGenerator::new(); + for (name, value) in &labels_with_nulls { + if !value.is_empty() { + tsid_gen.write_label(black_box(name), black_box(value)); + } + } + black_box(tsid_gen.finish()) + }) + }); + + // Current slow path: recomputes label name hash + group.bench_function("current_fxhash_slow_path", |b| { + b.iter(|| { + // Step 1: Compute label name hash for non-null labels + let mut name_hasher = CurrentTsidGenerator::new(); + for (name, value) in &labels_with_nulls { + if !value.is_empty() { + name_hasher.write_str(black_box(name)); + } + } + let label_name_hash = name_hasher.finish(); + + // Step 2: Use label name hash and hash values + let mut tsid_gen = CurrentTsidGenerator::new_with_label_name_hash(label_name_hash); + for (_, value) in &labels_with_nulls { + if !value.is_empty() { + tsid_gen.write_str(black_box(value)); + } + } + black_box(tsid_gen.finish()) + }) + }); + + // Current fast path: pre-computed (for comparison) + let label_name_hash = compute_label_name_hash(&labels_all_non_null); + group.bench_function("current_fxhash_fast_path_no_nulls", |b| { + b.iter(|| { + let mut tsid_gen = + CurrentTsidGenerator::new_with_label_name_hash(black_box(label_name_hash)); + for (_, value) in &labels_all_non_null { + tsid_gen.write_str(black_box(value)); + } + black_box(tsid_gen.finish()) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_tsid_generator_small, + bench_tsid_generator_medium, + bench_tsid_generator_large, + bench_tsid_generator_slow_path +); +criterion_main!(benches); diff --git a/src/metric-engine/src/config.rs b/src/metric-engine/src/config.rs index 150885ed49..e342cd9d73 100644 --- a/src/metric-engine/src/config.rs +++ b/src/metric-engine/src/config.rs @@ -17,13 +17,14 @@ use std::time::Duration; use common_telemetry::warn; use serde::{Deserialize, Serialize}; -/// The default flush interval of the metadata 
region. +/// The default flush interval of the metadata region. pub(crate) const DEFAULT_FLUSH_METADATA_REGION_INTERVAL: Duration = Duration::from_secs(30); /// Configuration for the metric engine. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct EngineConfig { /// Whether to use sparse primary key encoding. + #[serde(default = "EngineConfig::default_sparse_primary_key_encoding")] pub sparse_primary_key_encoding: bool, /// The flush interval of the metadata region. #[serde( @@ -37,7 +38,7 @@ impl Default for EngineConfig { fn default() -> Self { Self { flush_metadata_region_interval: DEFAULT_FLUSH_METADATA_REGION_INTERVAL, - sparse_primary_key_encoding: true, + sparse_primary_key_encoding: Self::default_sparse_primary_key_encoding(), } } } @@ -47,6 +48,10 @@ impl EngineConfig { DEFAULT_FLUSH_METADATA_REGION_INTERVAL } + fn default_sparse_primary_key_encoding() -> bool { + true + } + /// Sanitizes the configuration. pub fn sanitize(&mut self) { if self.flush_metadata_region_interval.is_zero() { diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index 2ca7aa971a..9a4a2ef9df 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -23,6 +23,7 @@ mod options; mod put; mod read; mod region_metadata; +mod staging; mod state; mod sync; @@ -42,8 +43,9 @@ pub(crate) use state::MetricEngineState; use store_api::metadata::RegionMetadataRef; use store_api::metric_engine_consts::METRIC_ENGINE_NAME; use store_api::region_engine::{ - BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, - RegionStatistic, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, + BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, + RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest, + RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState, SyncManifestResponse, }; use store_api::region_request::{ @@ -53,7 +55,10 @@ use store_api::storage::{RegionId, ScanRequest, SequenceNumber}; use crate::config::EngineConfig; use crate::data_region::DataRegion; -use crate::error::{self, Error, Result, StartRepeatedTaskSnafu, UnsupportedRegionRequestSnafu}; +use crate::error::{ + self, Error, Result, StartRepeatedTaskSnafu, UnsupportedRegionRequestSnafu, + UnsupportedRemapManifestsRequestSnafu, +}; use crate::metadata_region::MetadataRegion; use crate::repeated_task::FlushMetadataRegionTask; use crate::row_modifier::RowModifier; @@ -208,6 +213,13 @@ impl RegionEngine for MetricEngine { let mut extension_return_value = HashMap::new(); let result = match request { + RegionRequest::EnterStaging(_) => { + if self.inner.is_physical_region(region_id) { + self.handle_enter_staging_request(region_id, request).await + } else { + UnsupportedRegionRequestSnafu { request }.fail() + } + } RegionRequest::Put(put) => self.inner.put_region(region_id, put).await, RegionRequest::Create(create) => { self.inner @@ -350,6 +362,28 @@ impl RegionEngine for MetricEngine { .map_err(BoxedError::new) } + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result { + let region_id = request.region_id; + if self.inner.is_physical_region(region_id) { + self.inner.mito.remap_manifests(request).await + } else { + Err(BoxedError::new( + UnsupportedRemapManifestsRequestSnafu { region_id }.build(), + )) + } + } + + async fn copy_region_from( + &self, + _region_id: RegionId, + _request: CopyRegionFromRequest, + ) -> Result { + todo!() 
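+        // Region copy is not implemented for the metric engine yet; the
+        // `todo!()` above makes this panic if it is ever invoked.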
+ } + async fn set_region_role_state_gracefully( &self, region_id: RegionId, diff --git a/src/metric-engine/src/engine/alter.rs b/src/metric-engine/src/engine/alter.rs index e25ce43e3d..4b6b67f31b 100644 --- a/src/metric-engine/src/engine/alter.rs +++ b/src/metric-engine/src/engine/alter.rs @@ -15,7 +15,7 @@ mod extract_new_columns; mod validate; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use extract_new_columns::extract_new_columns; use snafu::{OptionExt, ResultExt, ensure}; @@ -143,16 +143,20 @@ impl MetricEngineInner { }; let data_region_id = to_data_region_id(physical_region_id); - let mut write_guards = HashMap::with_capacity(requests.len()); - for (region_id, _) in requests.iter() { - if write_guards.contains_key(region_id) { - continue; - } - let _write_guard = self - .metadata_region - .write_lock_logical_region(*region_id) - .await?; - write_guards.insert(*region_id, _write_guard); + // Acquire logical region locks in a deterministic order to avoid deadlocks when multiple + // alter operations target overlapping regions concurrently. + let region_ids = requests + .iter() + .map(|(region_id, _)| *region_id) + .collect::>(); + + let mut write_guards = Vec::with_capacity(region_ids.len()); + for region_id in region_ids { + write_guards.push( + self.metadata_region + .write_lock_logical_region(region_id) + .await?, + ); } self.data_region diff --git a/src/metric-engine/src/engine/flush.rs b/src/metric-engine/src/engine/flush.rs index cdc11db852..02726fc8be 100644 --- a/src/metric-engine/src/engine/flush.rs +++ b/src/metric-engine/src/engine/flush.rs @@ -119,7 +119,7 @@ mod tests { .index_file_path .map(|path| path.replace(&e.file_id, "")); e.file_id = "".to_string(); - e.index_file_id = e.index_file_id.map(|_| "".to_string()); + e.index_version = 0; format!("\n{:?}", e) }) .sorted() @@ -128,12 +128,12 @@ mod tests { assert_eq!( debug_format, r#" -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_file_id: Some(""), level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_file_id: Some(""), level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), 
origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, region_sequence: 2, file_id: "", index_file_id: None, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_file_id: Some(""), level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_file_id: None, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"# +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000001/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(20), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/11_0000000002/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417473(11, 16777217), table_id: 11, region_number: 16777217, region_group: 1, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000001/metadata/.parquet", file_size: 3487, index_file_path: None, index_file_size: None, num_rows: 8, num_row_groups: 1, num_series: Some(8), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(8), origin_region_id: 47261417473(11, 16777217), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 47261417474(11, 16777218), table_id: 11, region_number: 16777218, region_group: 1, 
region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/11_0000000002/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 47261417474(11, 16777218), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/data/.parquet", file_size: 3217, index_file_path: Some("test_metric_region/22_0000000042/data/index/.puffin"), index_file_size: Some(235), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test_metric_region/", region_id: 94506057770(22, 16777258), table_id: 22, region_number: 16777258, region_group: 1, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test_metric_region/22_0000000042/metadata/.parquet", file_size: 3471, index_file_path: None, index_file_size: None, num_rows: 4, num_row_groups: 1, num_series: Some(4), min_ts: 0::Millisecond, max_ts: 0::Millisecond, sequence: Some(4), origin_region_id: 94506057770(22, 16777258), node_id: None, visible: true }"#, ); // list from storage let storage_entries = mito diff --git a/src/metric-engine/src/engine/put.rs b/src/metric-engine/src/engine/put.rs index 0d4693ee42..c8c513f48c 100644 --- a/src/metric-engine/src/engine/put.rs +++ b/src/metric-engine/src/engine/put.rs @@ -272,15 +272,15 @@ mod tests { .unwrap(); let batches = RecordBatches::try_collect(stream).await.unwrap(); let expected = "\ -+-------------------------+----------------+------------+----------------------+-------+ -| greptime_timestamp | greptime_value | __table_id | __tsid | job | -+-------------------------+----------------+------------+----------------------+-------+ -| 1970-01-01T00:00:00 | 0.0 | 3 | 12881218023286672757 | tag_0 | -| 1970-01-01T00:00:00.001 | 1.0 | 3 | 12881218023286672757 | tag_0 | -| 1970-01-01T00:00:00.002 | 2.0 | 3 | 12881218023286672757 | tag_0 | -| 1970-01-01T00:00:00.003 | 3.0 | 3 | 12881218023286672757 | tag_0 | -| 1970-01-01T00:00:00.004 | 4.0 | 3 | 12881218023286672757 | tag_0 | -+-------------------------+----------------+------------+----------------------+-------+"; ++-------------------------+----------------+------------+---------------------+-------+ +| greptime_timestamp | greptime_value | __table_id | __tsid | job | ++-------------------------+----------------+------------+---------------------+-------+ +| 1970-01-01T00:00:00 | 0.0 | 3 | 2955007454552897459 | tag_0 | +| 1970-01-01T00:00:00.001 | 1.0 | 3 | 2955007454552897459 | tag_0 | +| 1970-01-01T00:00:00.002 | 2.0 | 3 | 2955007454552897459 | tag_0 | +| 1970-01-01T00:00:00.003 | 3.0 | 3 | 2955007454552897459 | tag_0 | +| 1970-01-01T00:00:00.004 | 4.0 | 3 | 2955007454552897459 | tag_0 | ++-------------------------+----------------+------------+---------------------+-------+"; assert_eq!(expected, batches.pretty_print().unwrap(), "physical region"); // read data from logical region diff --git a/src/metric-engine/src/engine/staging.rs b/src/metric-engine/src/engine/staging.rs new file mode 100644 index 0000000000..9db500957c --- /dev/null +++ 
b/src/metric-engine/src/engine/staging.rs @@ -0,0 +1,54 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_base::AffectedRows; +use snafu::ResultExt; +use store_api::region_engine::RegionEngine; +use store_api::region_request::{EnterStagingRequest, RegionRequest}; +use store_api::storage::RegionId; + +use crate::engine::MetricEngine; +use crate::error::{MitoEnterStagingOperationSnafu, Result}; +use crate::utils; + +impl MetricEngine { + /// Handles the enter staging request for the given region. + pub(crate) async fn handle_enter_staging_request( + &self, + region_id: RegionId, + request: RegionRequest, + ) -> Result { + let metadata_region_id = utils::to_metadata_region_id(region_id); + let data_region_id = utils::to_data_region_id(region_id); + + // For metadata region, it doesn't care about the partition expr, so we can just pass an empty string. + self.inner + .mito + .handle_request( + metadata_region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: String::new(), + }), + ) + .await + .context(MitoEnterStagingOperationSnafu)?; + + self.inner + .mito + .handle_request(data_region_id, request) + .await + .context(MitoEnterStagingOperationSnafu) + .map(|response| response.affected_rows) + } +} diff --git a/src/metric-engine/src/engine/sync.rs b/src/metric-engine/src/engine/sync.rs index 741938f8d7..4a2741c12b 100644 --- a/src/metric-engine/src/engine/sync.rs +++ b/src/metric-engine/src/engine/sync.rs @@ -45,7 +45,7 @@ impl MetricEngineInner { .metadata_flushed_entry_id() .unwrap_or_default(); let metadata_region_manifest = - RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id); + RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id, 0); let metadata_synced = self .mito .sync_region(metadata_region_id, metadata_region_manifest) @@ -57,7 +57,7 @@ impl MetricEngineInner { let data_manifest_version = manifest_info.data_manifest_version(); let data_flushed_entry_id = manifest_info.data_flushed_entry_id(); let data_region_manifest = - RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id); + RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id, 0); let data_synced = self .mito diff --git a/src/metric-engine/src/error.rs b/src/metric-engine/src/error.rs index 0f12e16cfc..3d00b737c3 100644 --- a/src/metric-engine/src/error.rs +++ b/src/metric-engine/src/error.rs @@ -156,6 +156,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Mito enter staging operation fails"))] + MitoEnterStagingOperation { + source: BoxedError, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to collect record batch stream"))] CollectRecordBatchStream { source: common_recordbatch::error::Error, @@ -242,6 +249,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Unsupported remap manifests request for region {}", region_id))] + UnsupportedRemapManifestsRequest { + region_id: RegionId, + #[snafu(implicit)] + 
location: Location, + }, + #[snafu(display("Unsupported alter kind: {}", kind))] UnsupportedAlterKind { kind: String, @@ -324,7 +338,8 @@ impl ErrorExt for Error { | AddingFieldColumn { .. } | ParseRegionOptions { .. } | UnexpectedRequest { .. } - | UnsupportedAlterKind { .. } => StatusCode::InvalidArguments, + | UnsupportedAlterKind { .. } + | UnsupportedRemapManifestsRequest { .. } => StatusCode::InvalidArguments, ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => { StatusCode::Unsupported @@ -352,6 +367,7 @@ impl ErrorExt for Error { | MitoWriteOperation { source, .. } | MitoFlushOperation { source, .. } | MitoSyncOperation { source, .. } + | MitoEnterStagingOperation { source, .. } | BatchOpenMitoRegion { source, .. } | BatchCatchupMitoRegion { source, .. } => source.status_code(), diff --git a/src/metric-engine/src/metadata_region.rs b/src/metric-engine/src/metadata_region.rs index 677cf58899..c34b44e4a7 100644 --- a/src/metric-engine/src/metadata_region.rs +++ b/src/metric-engine/src/metadata_region.rs @@ -317,45 +317,20 @@ pub fn decode_batch_stream( /// Decode a record batch to a list of key and value. fn decode_record_batch_to_key_and_value(batch: RecordBatch) -> Vec<(String, String)> { - let key_col = batch.column(0); - let val_col = batch.column(1); - - (0..batch.num_rows()) - .flat_map(move |row_index| { - let key = key_col - .get_ref(row_index) - .try_into_string() - .unwrap() - .map(|s| s.to_string()); - - key.map(|k| { - ( - k, - val_col - .get_ref(row_index) - .try_into_string() - .unwrap() - .map(|s| s.to_string()) - .unwrap_or_default(), - ) - }) + let keys = batch.iter_column_as_string(0); + let values = batch.iter_column_as_string(1); + keys.zip(values) + .filter_map(|(k, v)| match (k, v) { + (Some(k), Some(v)) => Some((k, v)), + (Some(k), None) => Some((k, "".to_string())), + (None, _) => None, }) - .collect() + .collect::>() } /// Decode a record batch to a list of key. fn decode_record_batch_to_key(batch: RecordBatch) -> Vec { - let key_col = batch.column(0); - - (0..batch.num_rows()) - .flat_map(move |row_index| { - key_col - .get_ref(row_index) - .try_into_string() - .unwrap() - .map(|s| s.to_string()) - }) - .collect() + batch.iter_column_as_string(0).flatten().collect::>() } // simulate to `KvBackend` @@ -590,6 +565,8 @@ impl MetadataRegion { /// Retrieves the value associated with the given key in the specified region. /// Returns `Ok(None)` if the key is not found. pub async fn get(&self, region_id: RegionId, key: &str) -> Result> { + use datatypes::arrow::array::{Array, AsArray}; + let filter_expr = datafusion::prelude::col(METADATA_SCHEMA_KEY_COLUMN_NAME) .eq(datafusion::prelude::lit(key)); @@ -611,12 +588,9 @@ impl MetadataRegion { return Ok(None); }; - let val = first_batch - .column(0) - .get_ref(0) - .try_into_string() - .unwrap() - .map(|s| s.to_string()); + let column = first_batch.column(0); + let column = column.as_string::(); + let val = column.is_valid(0).then(|| column.value(0).to_string()); Ok(val) } diff --git a/src/metric-engine/src/row_modifier.rs b/src/metric-engine/src/row_modifier.rs index 4759a76215..0732359c39 100644 --- a/src/metric-engine/src/row_modifier.rs +++ b/src/metric-engine/src/row_modifier.rs @@ -13,11 +13,12 @@ // limitations under the License. 
use std::collections::{BTreeMap, HashMap}; -use std::hash::Hash; +use std::hash::Hasher; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value}; use datatypes::value::ValueRef; +use fxhash::FxHasher; use mito_codec::row_converter::SparsePrimaryKeyCodec; use smallvec::SmallVec; use snafu::ResultExt; @@ -30,9 +31,6 @@ use store_api::storage::{ColumnId, TableId}; use crate::error::{EncodePrimaryKeySnafu, Result}; -// A random number -const TSID_HASH_SEED: u32 = 846793005; - /// A row modifier modifies [`Rows`]. /// /// - For [`PrimaryKeyEncoding::Sparse`] encoding, @@ -75,6 +73,7 @@ impl RowModifier { let num_output_column = num_column - num_primary_key_column + 1; let mut buffer = vec![]; + for mut iter in iter.iter_mut() { let (table_id, tsid) = Self::fill_internal_columns(table_id, &iter); let mut values = Vec::with_capacity(num_output_column); @@ -147,47 +146,72 @@ impl RowModifier { /// Fills internal columns of a row with table name and a hash of tag values. pub fn fill_internal_columns(table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) { - let mut hasher = TsidGenerator::default(); - for (name, value) in iter.primary_keys_with_name() { - // The type is checked before. So only null is ignored. - if let Some(ValueData::StringValue(string)) = &value.value_data { - hasher.write_label(name, string); + let ts_id = if !iter.has_null_labels() { + // No null labels in row, we can safely reuse the precomputed label name hash. + let mut ts_id_gen = TsidGenerator::new(iter.index.label_name_hash); + for (_, value) in iter.primary_keys_with_name() { + // The type is checked before. So only null is ignored. + if let Some(ValueData::StringValue(string)) = &value.value_data { + ts_id_gen.write_str(string); + } else { + unreachable!( + "Should not contain null or non-string value: {:?}, table id: {}", + value, table_id + ); + } } - } - let hash = hasher.finish(); + ts_id_gen.finish() + } else { + // Slow path: row contains null, recompute label hash + let mut hasher = TsidGenerator::default(); + // 1. Find out label names with non-null values and get the hash. + for (name, value) in iter.primary_keys_with_name() { + // The type is checked before. So only null is ignored. + if let Some(ValueData::StringValue(_)) = &value.value_data { + hasher.write_str(name); + } + } + let label_name_hash = hasher.finish(); + + // 2. Use label name hash as seed and continue with label values. + let mut final_hasher = TsidGenerator::new(label_name_hash); + for (_, value) in iter.primary_keys_with_name() { + if let Some(ValueData::StringValue(value)) = &value.value_data { + final_hasher.write_str(value); + } + } + final_hasher.finish() + }; ( ValueData::U32Value(table_id).into(), - ValueData::U64Value(hash).into(), + ValueData::U64Value(ts_id).into(), ) } } /// Tsid generator. +#[derive(Default)] pub struct TsidGenerator { - hasher: mur3::Hasher128, -} - -impl Default for TsidGenerator { - fn default() -> Self { - Self { - hasher: mur3::Hasher128::with_seed(TSID_HASH_SEED), - } - } + hasher: FxHasher, } impl TsidGenerator { + pub fn new(label_name_hash: u64) -> Self { + let mut hasher = FxHasher::default(); + hasher.write_u64(label_name_hash); + Self { hasher } + } + /// Writes a label pair to the generator. 
- pub fn write_label(&mut self, name: &str, value: &str) { - name.hash(&mut self.hasher); - value.hash(&mut self.hasher); + pub fn write_str(&mut self, value: &str) { + self.hasher.write(value.as_bytes()); + self.hasher.write_u8(0xff); } /// Generates a new TSID. pub fn finish(&mut self) -> u64 { - // TSID is 64 bits, simply truncate the 128 bits hash - let (hash, _) = self.hasher.finish128(); - hash + self.hasher.finish() } } @@ -202,6 +226,8 @@ struct ValueIndex { struct IterIndex { indices: Vec, num_primary_key_column: usize, + /// Precomputed hash for label names. + label_name_hash: u64, } impl IterIndex { @@ -252,15 +278,22 @@ impl IterIndex { } } let num_primary_key_column = primary_key_indices.len() + reserved_indices.len(); - let indices = reserved_indices - .into_iter() - .chain(primary_key_indices.values().cloned()) - .chain(ts_index) - .chain(field_indices) - .collect(); + let mut indices = Vec::with_capacity(num_primary_key_column + 2); + indices.extend(reserved_indices); + let mut label_name_hasher = TsidGenerator::default(); + for (pk_name, pk_index) in primary_key_indices { + // primary_key_indices already sorted. + label_name_hasher.write_str(pk_name); + indices.push(pk_index); + } + let label_name_hash = label_name_hasher.finish(); + + indices.extend(ts_index); + indices.extend(field_indices); IterIndex { indices, num_primary_key_column, + label_name_hash, } } } @@ -314,6 +347,13 @@ impl RowIter<'_> { }) } + /// Returns true if any label in current row is null. + fn has_null_labels(&self) -> bool { + self.index.indices[..self.index.num_primary_key_column] + .iter() + .any(|idx| self.row.values[idx.index].value_data.is_none()) + } + /// Returns the primary keys. pub fn primary_keys(&self) -> impl Iterator)> { self.index.indices[..self.index.num_primary_key_column] @@ -399,9 +439,9 @@ mod tests { let result = encoder.modify_rows_sparse(rows_iter, table_id).unwrap(); assert_eq!(result.rows[0].values.len(), 1); let encoded_primary_key = vec![ - 128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 131, 9, 166, 190, 173, 37, 39, 240, 0, 0, - 0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, - 1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2, + 128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 37, 196, 242, 181, 117, 224, 7, 137, 0, + 0, 0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 1, 1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2, ]; assert_eq!( result.rows[0].values[0], @@ -477,7 +517,7 @@ mod tests { assert_eq!(result.rows[0].values[2], ValueData::U32Value(1025).into()); assert_eq!( result.rows[0].values[3], - ValueData::U64Value(9442261431637846000).into() + ValueData::U64Value(2721566936019240841).into() ); assert_eq!(result.schema, expected_dense_schema()); } @@ -496,7 +536,7 @@ mod tests { let row_iter = rows_iter.iter_mut().next().unwrap(); let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter); assert_eq!(encoded_table_id, ValueData::U32Value(1025).into()); - assert_eq!(tsid, ValueData::U64Value(9442261431637846000).into()); + assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into()); // Change the column order let schema = vec![ @@ -524,6 +564,264 @@ mod tests { let row_iter = rows_iter.iter_mut().next().unwrap(); let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &row_iter); assert_eq!(encoded_table_id, ValueData::U32Value(1025).into()); - assert_eq!(tsid, 
ValueData::U64Value(9442261431637846000).into()); + assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into()); + } + + /// Helper function to create a schema with multiple label columns + fn create_multi_label_schema(labels: &[&str]) -> Vec { + labels + .iter() + .map(|name| ColumnSchema { + column_name: name.to_string(), + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Tag as _, + datatype_extension: None, + options: None, + }) + .collect() + } + + /// Helper function to create a name_to_column_id map + fn create_name_to_column_id(labels: &[&str]) -> HashMap { + labels + .iter() + .enumerate() + .map(|(idx, name)| (name.to_string(), idx as ColumnId + 1)) + .collect() + } + + /// Helper function to create a row with string values + fn create_row_with_values(values: &[&str]) -> Row { + Row { + values: values + .iter() + .map(|v| ValueData::StringValue(v.to_string()).into()) + .collect(), + } + } + + /// Helper function to create a row with some null values + fn create_row_with_nulls(values: &[Option<&str>]) -> Row { + Row { + values: values + .iter() + .map(|v| { + v.map(|s| ValueData::StringValue(s.to_string()).into()) + .unwrap_or(Value { value_data: None }) + }) + .collect(), + } + } + + /// Helper function to extract TSID from a row + fn extract_tsid( + schema: Vec, + row: Row, + name_to_column_id: &HashMap, + table_id: TableId, + ) -> u64 { + let rows = Rows { + schema, + rows: vec![row], + }; + let mut rows_iter = RowsIter::new(rows, name_to_column_id); + let row_iter = rows_iter.iter_mut().next().unwrap(); + let (_, tsid_value) = RowModifier::fill_internal_columns(table_id, &row_iter); + match tsid_value.value_data { + Some(ValueData::U64Value(tsid)) => tsid, + _ => panic!("Expected U64Value for TSID"), + } + } + + #[test] + fn test_tsid_same_for_different_label_orders() { + // Test that rows with the same label name-value pairs but in different orders + // produce the same TSID + let table_id = 1025; + + // Schema 1: a, b, c + let schema1 = create_multi_label_schema(&["a", "b", "c"]); + let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]); + let row1 = create_row_with_values(&["A", "B", "C"]); + let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id); + + // Schema 2: b, a, c (different order) + let schema2 = create_multi_label_schema(&["b", "a", "c"]); + let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]); + let row2 = create_row_with_values(&["B", "A", "C"]); + let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id); + + // Schema 3: c, b, a (another different order) + let schema3 = create_multi_label_schema(&["c", "b", "a"]); + let name_to_column_id3 = create_name_to_column_id(&["a", "b", "c"]); + let row3 = create_row_with_values(&["C", "B", "A"]); + let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id); + + // All should have the same TSID since label names are sorted lexicographically + // and we're using the same label name-value pairs + assert_eq!( + tsid1, tsid2, + "TSID should be same for different column orders" + ); + assert_eq!( + tsid2, tsid3, + "TSID should be same for different column orders" + ); + } + + #[test] + fn test_tsid_same_with_null_labels() { + // Test that rows that differ only by null label values produce the same TSID + let table_id = 1025; + + // Row 1: a=A, b=B (no nulls, fast path) + let schema1 = create_multi_label_schema(&["a", "b"]); + let name_to_column_id1 = create_name_to_column_id(&["a", "b"]); + let row1 = 
create_row_with_values(&["A", "B"]); + let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id); + + // Row 2: a=A, b=B, c=null (has null, slow path) + let schema2 = create_multi_label_schema(&["a", "b", "c"]); + let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]); + let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None]); + let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id); + + // Both should have the same TSID since null labels are ignored + assert_eq!( + tsid1, tsid2, + "TSID should be same when only difference is null label values" + ); + } + + #[test] + fn test_tsid_same_with_multiple_null_labels() { + // Test with multiple null labels + let table_id = 1025; + + // Row 1: a=A, b=B (no nulls) + let schema1 = create_multi_label_schema(&["a", "b"]); + let name_to_column_id1 = create_name_to_column_id(&["a", "b"]); + let row1 = create_row_with_values(&["A", "B"]); + let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id); + + // Row 2: a=A, b=B, c=null, d=null (multiple nulls) + let schema2 = create_multi_label_schema(&["a", "b", "c", "d"]); + let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c", "d"]); + let row2 = create_row_with_nulls(&[Some("A"), Some("B"), None, None]); + let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id); + + assert_eq!( + tsid1, tsid2, + "TSID should be same when only difference is multiple null label values" + ); + } + + #[test] + fn test_tsid_different_with_different_non_null_values() { + // Test that rows with different non-null values produce different TSIDs + let table_id = 1025; + + // Row 1: a=A, b=B + let schema1 = create_multi_label_schema(&["a", "b"]); + let name_to_column_id1 = create_name_to_column_id(&["a", "b"]); + let row1 = create_row_with_values(&["A", "B"]); + let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id); + + // Row 2: a=A, b=C (different value for b) + let schema2 = create_multi_label_schema(&["a", "b"]); + let name_to_column_id2 = create_name_to_column_id(&["a", "b"]); + let row2 = create_row_with_values(&["A", "C"]); + let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id); + + assert_ne!( + tsid1, tsid2, + "TSID should be different when label values differ" + ); + } + + #[test] + fn test_tsid_fast_path_vs_slow_path_consistency() { + // Test that fast path (no nulls) and slow path (with nulls) produce + // the same TSID for the same non-null label values + let table_id = 1025; + + // Fast path: a=A, b=B (no nulls) + let schema_fast = create_multi_label_schema(&["a", "b"]); + let name_to_column_id_fast = create_name_to_column_id(&["a", "b"]); + let row_fast = create_row_with_values(&["A", "B"]); + let tsid_fast = extract_tsid(schema_fast, row_fast, &name_to_column_id_fast, table_id); + + // Slow path: a=A, b=B, c=null (has null, triggers slow path) + let schema_slow = create_multi_label_schema(&["a", "b", "c"]); + let name_to_column_id_slow = create_name_to_column_id(&["a", "b", "c"]); + let row_slow = create_row_with_nulls(&[Some("A"), Some("B"), None]); + let tsid_slow = extract_tsid(schema_slow, row_slow, &name_to_column_id_slow, table_id); + + assert_eq!( + tsid_fast, tsid_slow, + "Fast path and slow path should produce same TSID for same non-null values" + ); + } + + #[test] + fn test_tsid_with_null_in_middle() { + // Test with null in the middle of labels + let table_id = 1025; + + // Row 1: a=A, b=B, c=C + let schema1 = create_multi_label_schema(&["a", "b", "c"]); + let name_to_column_id1 = 
create_name_to_column_id(&["a", "b", "c"]); + let row1 = create_row_with_values(&["A", "B", "C"]); + let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id); + + // Row 2: a=A, b=null, c=C (null in middle) + let schema2 = create_multi_label_schema(&["a", "b", "c"]); + let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]); + let row2 = create_row_with_nulls(&[Some("A"), None, Some("C")]); + let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id); + + // Should be different because b is null in row2 but B in row1 + // Actually wait, let me reconsider - if b is null, it should be ignored + // So row2 should be equivalent to a=A, c=C + // But row1 is a=A, b=B, c=C, so they should be different + assert_ne!( + tsid1, tsid2, + "TSID should be different when a non-null value becomes null" + ); + + // Row 3: a=A, c=C (no b at all, equivalent to row2) + let schema3 = create_multi_label_schema(&["a", "c"]); + let name_to_column_id3 = create_name_to_column_id(&["a", "c"]); + let row3 = create_row_with_values(&["A", "C"]); + let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id); + + // Row2 (a=A, b=null, c=C) should be same as row3 (a=A, c=C) + assert_eq!( + tsid2, tsid3, + "TSID should be same when null label is ignored" + ); + } + + #[test] + fn test_tsid_all_null_labels() { + // Test with all labels being null + let table_id = 1025; + + // Row with all nulls + let schema = create_multi_label_schema(&["a", "b", "c"]); + let name_to_column_id = create_name_to_column_id(&["a", "b", "c"]); + let row = create_row_with_nulls(&[None, None, None]); + let tsid = extract_tsid(schema.clone(), row, &name_to_column_id, table_id); + + // Should still produce a TSID (based on label names only when all values are null) + // This tests that the slow path handles the case where all values are null + // The TSID will be based on the label name hash only + // Test that it's consistent - same schema with all nulls should produce same TSID + let row2 = create_row_with_nulls(&[None, None, None]); + let tsid2 = extract_tsid(schema, row2, &name_to_column_id, table_id); + assert_eq!( + tsid, tsid2, + "TSID should be consistent when all label values are null" + ); } } diff --git a/src/mito-codec/src/row_converter/sparse.rs b/src/mito-codec/src/row_converter/sparse.rs index 191c2bd011..731de5c0b0 100644 --- a/src/mito-codec/src/row_converter/sparse.rs +++ b/src/mito-codec/src/row_converter/sparse.rs @@ -83,6 +83,11 @@ impl SparseValues { pub fn insert(&mut self, column_id: ColumnId, value: Value) { self.values.insert(column_id, value); } + + /// Returns an iterator over all stored column id/value pairs. + pub fn iter(&self) -> impl Iterator { + self.values.iter() + } } /// The column id of the tsid. 
diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 7926ae198a..c453534317 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -30,6 +30,7 @@ common-error.workspace = true common-grpc.workspace = true common-macro.workspace = true common-meta.workspace = true +common-memory-manager.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true @@ -48,6 +49,7 @@ dotenv.workspace = true either.workspace = true futures.workspace = true humantime-serde.workspace = true +humantime.workspace = true index.workspace = true itertools.workspace = true greptime-proto.workspace = true @@ -55,7 +57,7 @@ lazy_static = "1.4" log-store = { workspace = true } mito-codec.workspace = true moka = { workspace = true, features = ["sync", "future"] } -object-store.workspace = true +object-store = { workspace = true, features = ["testing"] } parquet = { workspace = true, features = ["async"] } paste.workspace = true pin-project.workspace = true diff --git a/src/mito2/benches/simple_bulk_memtable.rs b/src/mito2/benches/simple_bulk_memtable.rs index 02b6538aa9..7166c54afd 100644 --- a/src/mito2/benches/simple_bulk_memtable.rs +++ b/src/mito2/benches/simple_bulk_memtable.rs @@ -144,6 +144,7 @@ async fn flush(mem: &SimpleBulkMemtable) { let reader = Box::new(DedupReader::new( merge_reader, read::dedup::LastRow::new(true), + None, )); Source::Reader(reader) }; diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index b6891d7410..8888ade815 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -37,7 +37,7 @@ use crate::error::{CleanDirSnafu, DeleteIndexSnafu, DeleteSstSnafu, OpenDalSnafu use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED}; use crate::read::{FlatSource, Source}; use crate::region::options::IndexOptions; -use crate::sst::file::{FileHandle, RegionFileId}; +use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId}; use crate::sst::index::IndexerBuilderImpl; use crate::sst::index::intermediate::IntermediateManager; use crate::sst::index::puffin_manager::{PuffinManagerFactory, SstPuffinManager}; @@ -216,7 +216,7 @@ impl AccessLayer { pub(crate) async fn delete_sst( &self, region_file_id: &RegionFileId, - index_file_id: &RegionFileId, + index_file_id: &RegionIndexId, ) -> Result<()> { let path = location::sst_file_path(&self.table_dir, *region_file_id, self.path_type); self.object_store @@ -226,14 +226,30 @@ impl AccessLayer { file_id: region_file_id.file_id(), })?; - let path = location::index_file_path(&self.table_dir, *index_file_id, self.path_type); + // Delete all versions of the index file. 
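+        // Index versions start at 0 and each version lives under its own path,
+        // so every version up to and including the current one is deleted.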
+ for version in 0..=index_file_id.version { + let index_id = RegionIndexId::new(*region_file_id, version); + self.delete_index(index_id).await?; + } + + Ok(()) + } + + pub(crate) async fn delete_index( + &self, + index_file_id: RegionIndexId, + ) -> Result<(), crate::error::Error> { + let path = location::index_file_path( + &self.table_dir, + RegionIndexId::new(index_file_id.file_id, index_file_id.version), + self.path_type, + ); self.object_store .delete(&path) .await .context(DeleteIndexSnafu { - file_id: region_file_id.file_id(), + file_id: index_file_id.file_id(), })?; - Ok(()) } @@ -291,6 +307,7 @@ impl AccessLayer { puffin_manager: self .puffin_manager_factory .build(store, path_provider.clone()), + write_cache_enabled: false, intermediate_manager: self.intermediate_manager.clone(), index_options: request.index_options, inverted_index_config: request.inverted_index_config, @@ -468,9 +485,10 @@ impl TempFileCleaner { } /// Removes the SST and index file from the local atomic dir by the file id. + /// This only removes the initial index, since the index version is always 0 for a new SST, this method should be safe to pass 0. pub(crate) async fn clean_by_file_id(&self, file_id: FileId) { let sst_key = IndexKey::new(self.region_id, file_id, FileType::Parquet).to_string(); - let index_key = IndexKey::new(self.region_id, file_id, FileType::Puffin).to_string(); + let index_key = IndexKey::new(self.region_id, file_id, FileType::Puffin(0)).to_string(); Self::clean_atomic_dir_files(&self.object_store, &[&sst_key, &index_key]).await; } @@ -553,9 +571,12 @@ async fn clean_dir(dir: &str) -> Result<()> { /// Path provider for SST file and index file. pub trait FilePathProvider: Send + Sync { - /// Creates index file path of given file id. + /// Creates index file path of given file id. Version default to 0, and not shown in the path. fn build_index_file_path(&self, file_id: RegionFileId) -> String; + /// Creates index file path of given index id (with version support). + fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String; + /// Creates SST file path of given file id. 
fn build_sst_file_path(&self, file_id: RegionFileId) -> String; } @@ -575,7 +596,16 @@ impl WriteCachePathProvider { impl FilePathProvider for WriteCachePathProvider { fn build_index_file_path(&self, file_id: RegionFileId) -> String { - let puffin_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin); + let puffin_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin(0)); + self.file_cache.cache_file_path(puffin_key) + } + + fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String { + let puffin_key = IndexKey::new( + index_id.region_id(), + index_id.file_id(), + FileType::Puffin(index_id.version), + ); self.file_cache.cache_file_path(puffin_key) } @@ -605,7 +635,11 @@ impl RegionFilePathFactory { impl FilePathProvider for RegionFilePathFactory { fn build_index_file_path(&self, file_id: RegionFileId) -> String { - location::index_file_path(&self.table_dir, file_id, self.path_type) + location::index_file_path_legacy(&self.table_dir, file_id, self.path_type) + } + + fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String { + location::index_file_path(&self.table_dir, index_id, self.path_type) } fn build_sst_file_path(&self, file_id: RegionFileId) -> String { diff --git a/src/mito2/src/cache.rs b/src/mito2/src/cache.rs index b3a9bfb2df..f502fb51f5 100644 --- a/src/mito2/src/cache.rs +++ b/src/mito2/src/cache.rs @@ -18,6 +18,7 @@ mod cache_size; pub(crate) mod file_cache; pub(crate) mod index; +pub(crate) mod manifest_cache; #[cfg(test)] pub(crate) mod test_util; pub(crate) mod write_cache; @@ -33,6 +34,7 @@ use index::bloom_filter_index::{BloomFilterIndexCache, BloomFilterIndexCacheRef} use index::result_cache::IndexResultCache; use moka::notification::RemovalCause; use moka::sync::Cache; +use object_store::ObjectStore; use parquet::file::metadata::ParquetMetaData; use puffin::puffin_manager::cache::{PuffinMetadataCache, PuffinMetadataCacheRef}; use store_api::storage::{ConcreteDataType, FileId, RegionId, TimeSeriesRowSelector}; @@ -43,7 +45,8 @@ use crate::cache::index::inverted_index::{InvertedIndexCache, InvertedIndexCache use crate::cache::write_cache::WriteCacheRef; use crate::metrics::{CACHE_BYTES, CACHE_EVICTION, CACHE_HIT, CACHE_MISS}; use crate::read::Batch; -use crate::sst::file::RegionFileId; +use crate::sst::file::{RegionFileId, RegionIndexId}; +use crate::sst::parquet::reader::MetadataCacheMetrics; /// Metrics type key for sst meta. const SST_META_TYPE: &str = "sst_meta"; @@ -74,19 +77,24 @@ pub enum CacheStrategy { } impl CacheStrategy { - /// Calls [CacheManager::get_parquet_meta_data()]. - pub async fn get_parquet_meta_data( + /// Gets parquet metadata with cache metrics tracking. + /// Returns the metadata and updates the provided metrics. + pub(crate) async fn get_parquet_meta_data( &self, file_id: RegionFileId, + metrics: &mut MetadataCacheMetrics, ) -> Option> { match self { CacheStrategy::EnableAll(cache_manager) => { - cache_manager.get_parquet_meta_data(file_id).await + cache_manager.get_parquet_meta_data(file_id, metrics).await } CacheStrategy::Compaction(cache_manager) => { - cache_manager.get_parquet_meta_data(file_id).await + cache_manager.get_parquet_meta_data(file_id, metrics).await + } + CacheStrategy::Disabled => { + metrics.cache_miss += 1; + None } - CacheStrategy::Disabled => None, } } @@ -173,7 +181,7 @@ impl CacheStrategy { } /// Calls [CacheManager::evict_puffin_cache()]. 
- pub async fn evict_puffin_cache(&self, file_id: RegionFileId) { + pub async fn evict_puffin_cache(&self, file_id: RegionIndexId) { match self { CacheStrategy::EnableAll(cache_manager) => { cache_manager.evict_puffin_cache(file_id).await @@ -256,6 +264,26 @@ impl CacheStrategy { CacheStrategy::Compaction(_) | CacheStrategy::Disabled => None, } } + + /// Triggers download if the strategy is [CacheStrategy::EnableAll] and write cache is available. + pub fn maybe_download_background( + &self, + index_key: IndexKey, + remote_path: String, + remote_store: ObjectStore, + file_size: u64, + ) { + if let CacheStrategy::EnableAll(cache_manager) = self + && let Some(write_cache) = cache_manager.write_cache() + { + write_cache.file_cache().maybe_download_background( + index_key, + remote_path, + remote_store, + file_size, + ); + } + } } /// Manages cached data for the engine. @@ -291,16 +319,17 @@ impl CacheManager { CacheManagerBuilder::default() } - /// Gets cached [ParquetMetaData] from in-memory cache first. - /// If not found, tries to get it from write cache and fill the in-memory cache. - pub async fn get_parquet_meta_data( + /// Gets cached [ParquetMetaData] with metrics tracking. + /// Tries in-memory cache first, then file cache, updating metrics accordingly. + pub(crate) async fn get_parquet_meta_data( &self, file_id: RegionFileId, + metrics: &mut MetadataCacheMetrics, ) -> Option> { // Try to get metadata from sst meta cache - let metadata = self.get_parquet_meta_data_from_mem_cache(file_id); - if metadata.is_some() { - return metadata; + if let Some(metadata) = self.get_parquet_meta_data_from_mem_cache(file_id) { + metrics.mem_cache_hit += 1; + return Some(metadata); } // Try to get metadata from write cache @@ -308,11 +337,13 @@ impl CacheManager { if let Some(write_cache) = &self.write_cache && let Some(metadata) = write_cache.file_cache().get_parquet_meta_data(key).await { + metrics.file_cache_hit += 1; let metadata = Arc::new(metadata); // Put metadata into sst meta cache self.put_parquet_meta_data(file_id, metadata.clone()); return Some(metadata); }; + metrics.cache_miss += 1; None } @@ -390,7 +421,7 @@ impl CacheManager { } /// Evicts every puffin-related cache entry for the given file. 
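`get_parquet_meta_data` now records where each lookup was served from: the in-memory metadata cache is tried first, then the write cache's local file cache (re-populating the memory cache on the way back), and only then is a miss counted. A condensed, synchronous sketch of that control flow; `LookupMetrics` is a stand-in for the real `MetadataCacheMetrics` in `sst::parquet::reader`, which may carry more fields:

```rust
#[derive(Debug, Default)]
struct LookupMetrics {
    mem_cache_hit: usize,
    file_cache_hit: usize,
    cache_miss: usize,
}

/// Two-tier lookup that bumps exactly one counter per call.
fn tiered_lookup<T>(
    mem_cache: impl FnOnce() -> Option<T>,
    file_cache: impl FnOnce() -> Option<T>,
    metrics: &mut LookupMetrics,
) -> Option<T> {
    if let Some(v) = mem_cache() {
        metrics.mem_cache_hit += 1;
        return Some(v);
    }
    if let Some(v) = file_cache() {
        // The real code also writes the value back into the memory cache here.
        metrics.file_cache_hit += 1;
        return Some(v);
    }
    metrics.cache_miss += 1;
    None
}
```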
- pub async fn evict_puffin_cache(&self, file_id: RegionFileId) { + pub async fn evict_puffin_cache(&self, file_id: RegionIndexId) { if let Some(cache) = &self.bloom_filter_index_cache { cache.invalidate_file(file_id.file_id()); } @@ -412,7 +443,7 @@ impl CacheManager { .remove(IndexKey::new( file_id.region_id(), file_id.file_id(), - FileType::Puffin, + FileType::Puffin(file_id.version), )) .await; } @@ -825,8 +856,14 @@ mod tests { let region_id = RegionId::new(1, 1); let file_id = RegionFileId::new(region_id, FileId::random()); let metadata = parquet_meta(); + let mut metrics = MetadataCacheMetrics::default(); cache.put_parquet_meta_data(file_id, metadata); - assert!(cache.get_parquet_meta_data(file_id).await.is_none()); + assert!( + cache + .get_parquet_meta_data(file_id, &mut metrics) + .await + .is_none() + ); let value = Value::Int64(10); let vector: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10])); @@ -848,14 +885,30 @@ mod tests { #[tokio::test] async fn test_parquet_meta_cache() { let cache = CacheManager::builder().sst_meta_cache_size(2000).build(); + let mut metrics = MetadataCacheMetrics::default(); let region_id = RegionId::new(1, 1); let file_id = RegionFileId::new(region_id, FileId::random()); - assert!(cache.get_parquet_meta_data(file_id).await.is_none()); + assert!( + cache + .get_parquet_meta_data(file_id, &mut metrics) + .await + .is_none() + ); let metadata = parquet_meta(); cache.put_parquet_meta_data(file_id, metadata); - assert!(cache.get_parquet_meta_data(file_id).await.is_some()); + assert!( + cache + .get_parquet_meta_data(file_id, &mut metrics) + .await + .is_some() + ); cache.remove_parquet_meta_data(file_id); - assert!(cache.get_parquet_meta_data(file_id).await.is_none()); + assert!( + cache + .get_parquet_meta_data(file_id, &mut metrics) + .await + .is_none() + ); } #[test] @@ -917,7 +970,7 @@ mod tests { let cache = Arc::new(cache); let region_id = RegionId::new(1, 1); - let region_file_id = RegionFileId::new(region_id, FileId::random()); + let index_id = RegionIndexId::new(RegionFileId::new(region_id, FileId::random()), 0); let column_id: ColumnId = 1; let bloom_cache = cache.bloom_filter_index_cache().unwrap().clone(); @@ -925,16 +978,21 @@ mod tests { let result_cache = cache.index_result_cache().unwrap(); let puffin_metadata_cache = cache.puffin_metadata_cache().unwrap().clone(); - let bloom_key = (region_file_id.file_id(), column_id, Tag::Skipping); + let bloom_key = ( + index_id.file_id(), + index_id.version, + column_id, + Tag::Skipping, + ); bloom_cache.put_metadata(bloom_key, Arc::new(BloomFilterMeta::default())); inverted_cache.put_metadata( - region_file_id.file_id(), + (index_id.file_id(), index_id.version), Arc::new(InvertedIndexMetas::default()), ); let predicate = PredicateKey::new_bloom(Arc::new(BTreeMap::new())); let selection = Arc::new(RowGroupSelection::default()); - result_cache.put(predicate.clone(), region_file_id.file_id(), selection); - let file_id_str = region_file_id.to_string(); + result_cache.put(predicate.clone(), index_id.file_id(), selection); + let file_id_str = index_id.to_string(); let metadata = Arc::new(FileMetadata { blobs: Vec::new(), properties: HashMap::new(), @@ -944,40 +1002,32 @@ mod tests { assert!(bloom_cache.get_metadata(bloom_key).is_some()); assert!( inverted_cache - .get_metadata(region_file_id.file_id()) - .is_some() - ); - assert!( - result_cache - .get(&predicate, region_file_id.file_id()) + .get_metadata((index_id.file_id(), index_id.version)) .is_some() ); + 
assert!(result_cache.get(&predicate, index_id.file_id()).is_some()); assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_some()); - cache.evict_puffin_cache(region_file_id).await; + cache.evict_puffin_cache(index_id).await; assert!(bloom_cache.get_metadata(bloom_key).is_none()); assert!( inverted_cache - .get_metadata(region_file_id.file_id()) - .is_none() - ); - assert!( - result_cache - .get(&predicate, region_file_id.file_id()) + .get_metadata((index_id.file_id(), index_id.version)) .is_none() ); + assert!(result_cache.get(&predicate, index_id.file_id()).is_none()); assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none()); // Refill caches and evict via CacheStrategy to ensure delegation works. bloom_cache.put_metadata(bloom_key, Arc::new(BloomFilterMeta::default())); inverted_cache.put_metadata( - region_file_id.file_id(), + (index_id.file_id(), index_id.version), Arc::new(InvertedIndexMetas::default()), ); result_cache.put( predicate.clone(), - region_file_id.file_id(), + index_id.file_id(), Arc::new(RowGroupSelection::default()), ); puffin_metadata_cache.put_metadata( @@ -989,19 +1039,15 @@ mod tests { ); let strategy = CacheStrategy::EnableAll(cache.clone()); - strategy.evict_puffin_cache(region_file_id).await; + strategy.evict_puffin_cache(index_id).await; assert!(bloom_cache.get_metadata(bloom_key).is_none()); assert!( inverted_cache - .get_metadata(region_file_id.file_id()) - .is_none() - ); - assert!( - result_cache - .get(&predicate, region_file_id.file_id()) + .get_metadata((index_id.file_id(), index_id.version)) .is_none() ); + assert!(result_cache.get(&predicate, index_id.file_id()).is_none()); assert!(puffin_metadata_cache.get_metadata(&file_id_str).is_none()); } } diff --git a/src/mito2/src/cache/file_cache.rs b/src/mito2/src/cache/file_cache.rs index e9c67aaa45..9392cbeaba 100644 --- a/src/mito2/src/cache/file_cache.rs +++ b/src/mito2/src/cache/file_cache.rs @@ -31,7 +31,7 @@ use object_store::{ErrorKind, ObjectStore, Reader}; use parquet::file::metadata::ParquetMetaData; use snafu::ResultExt; use store_api::storage::{FileId, RegionId}; -use tokio::sync::mpsc::UnboundedReceiver; +use tokio::sync::mpsc::{Sender, UnboundedReceiver}; use crate::access_layer::TempFileCleaner; use crate::cache::{FILE_TYPE, INDEX_TYPE}; @@ -55,121 +55,46 @@ pub(crate) const DEFAULT_INDEX_CACHE_PERCENT: u8 = 20; /// Minimum capacity for each cache (512MB). const MIN_CACHE_CAPACITY: u64 = 512 * 1024 * 1024; -/// A file cache manages files on local store and evict files based -/// on size. +/// Channel capacity for background download tasks. +const DOWNLOAD_TASK_CHANNEL_SIZE: usize = 64; + +/// A task to download a file in the background. +struct DownloadTask { + index_key: IndexKey, + remote_path: String, + remote_store: ObjectStore, + file_size: u64, +} + +/// Inner struct for FileCache that can be used in spawned tasks. #[derive(Debug)] -pub(crate) struct FileCache { +struct FileCacheInner { /// Local store to cache files. local_store: ObjectStore, /// Index to track cached Parquet files. parquet_index: Cache, /// Index to track cached Puffin files. puffin_index: Cache, - /// Capacity of the puffin (index) cache in bytes. - puffin_capacity: u64, } -pub(crate) type FileCacheRef = Arc; - -impl FileCache { - /// Creates a new file cache. 
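`DownloadTask` plus the 64-slot bounded channel turn the file cache into a best-effort background filler: as the rest of this patch wires up, tasks are enqueued with `try_send` and silently dropped when the queue is full, already-cached files are skipped, and background downloads run at lower concurrency than foreground ones (1 vs. 8 here). A stripped-down sketch of that worker shape, with `Task` and `download` as stand-ins for the crate's types:

```rust
use tokio::sync::mpsc;

struct Task {
    key: String,
}

/// Placeholder for the real download routine; the worker ignores its errors.
async fn download(_key: &str) -> Result<(), std::io::Error> {
    Ok(())
}

/// Spawns a worker draining a bounded queue and returns the sender used to enqueue.
fn spawn_worker(capacity: usize) -> mpsc::Sender<Task> {
    let (tx, mut rx) = mpsc::channel::<Task>(capacity);
    tokio::spawn(async move {
        while let Some(task) = rx.recv().await {
            // Best effort: a failed download is logged-and-forgotten in the real
            // code, so a slow remote store never blocks foreground reads.
            let _ = download(&task.key).await;
        }
        // The loop ends once every sender is dropped.
    });
    tx
}

fn enqueue(tx: &mpsc::Sender<Task>, task: Task) {
    // `try_send` never waits; a full queue simply drops the task.
    let _ = tx.try_send(task);
}
```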
- pub(crate) fn new( - local_store: ObjectStore, - capacity: ReadableSize, - ttl: Option, - index_cache_percent: Option, - ) -> FileCache { - // Validate and use the provided percent or default - let index_percent = index_cache_percent - .filter(|&percent| percent > 0 && percent < 100) - .unwrap_or(DEFAULT_INDEX_CACHE_PERCENT); - let total_capacity = capacity.as_bytes(); - - // Convert percent to ratio and calculate capacity for each cache - let index_ratio = index_percent as f64 / 100.0; - let puffin_capacity = (total_capacity as f64 * index_ratio) as u64; - let parquet_capacity = total_capacity - puffin_capacity; - - // Ensure both capacities are at least 512MB - let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY); - let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY); - - info!( - "Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}", - index_percent, - ReadableSize(total_capacity), - ReadableSize(parquet_capacity), - ReadableSize(puffin_capacity) - ); - - let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file"); - let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index"); - - FileCache { - local_store, - parquet_index, - puffin_index, - puffin_capacity, - } - } - - /// Builds a cache for a specific file type. - fn build_cache( - local_store: ObjectStore, - capacity: u64, - ttl: Option, - label: &'static str, - ) -> Cache { - let cache_store = local_store; - let mut builder = Cache::builder() - .eviction_policy(EvictionPolicy::lru()) - .weigher(|_key, value: &IndexValue| -> u32 { - // We only measure space on local store. - value.file_size - }) - .max_capacity(capacity) - .async_eviction_listener(move |key, value, cause| { - let store = cache_store.clone(); - // Stores files under FILE_DIR. - let file_path = cache_file_path(FILE_DIR, *key); - async move { - if let RemovalCause::Replaced = cause { - // The cache is replaced by another file. This is unexpected, we don't remove the same - // file but updates the metrics as the file is already replaced by users. - CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into()); - warn!("Replace existing cache {} for region {} unexpectedly", file_path, key.region_id); - return; - } - - match store.delete(&file_path).await { - Ok(()) => { - CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into()); - } - Err(e) => { - warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id); - } - } - } - .boxed() - }); - if let Some(ttl) = ttl { - builder = builder.time_to_idle(ttl); - } - builder.build() - } - +impl FileCacheInner { /// Returns the appropriate memory index for the given file type. fn memory_index(&self, file_type: FileType) -> &Cache { match file_type { FileType::Parquet => &self.parquet_index, - FileType::Puffin => &self.puffin_index, + FileType::Puffin { .. } => &self.puffin_index, } } + /// Returns the cache file path for the key. + fn cache_file_path(&self, key: IndexKey) -> String { + cache_file_path(FILE_DIR, key) + } + /// Puts a file into the cache index. /// /// The `WriteCache` should ensure the file is in the correct path. 
- pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) { + async fn put(&self, key: IndexKey, value: IndexValue) { CACHE_BYTES .with_label_values(&[key.file_type.metric_label()]) .add(value.file_size.into()); @@ -180,100 +105,8 @@ impl FileCache { index.run_pending_tasks().await; } - pub(crate) async fn get(&self, key: IndexKey) -> Option { - self.memory_index(key.file_type).get(&key).await - } - - /// Reads a file from the cache. - #[allow(unused)] - pub(crate) async fn reader(&self, key: IndexKey) -> Option { - // We must use `get()` to update the estimator of the cache. - // See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key - let index = self.memory_index(key.file_type); - if index.get(&key).await.is_none() { - CACHE_MISS - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - return None; - } - - let file_path = self.cache_file_path(key); - match self.get_reader(&file_path).await { - Ok(Some(reader)) => { - CACHE_HIT - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - return Some(reader); - } - Err(e) => { - if e.kind() != ErrorKind::NotFound { - warn!(e; "Failed to get file for key {:?}", key); - } - } - Ok(None) => {} - } - - // We removes the file from the index. - index.remove(&key).await; - CACHE_MISS - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - None - } - - /// Reads ranges from the cache. - pub(crate) async fn read_ranges( - &self, - key: IndexKey, - ranges: &[Range], - ) -> Option> { - let index = self.memory_index(key.file_type); - if index.get(&key).await.is_none() { - CACHE_MISS - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - return None; - } - - let file_path = self.cache_file_path(key); - // In most cases, it will use blocking read, - // because FileCache is normally based on local file system, which supports blocking read. - let bytes_result = fetch_byte_ranges(&file_path, self.local_store.clone(), ranges).await; - match bytes_result { - Ok(bytes) => { - CACHE_HIT - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - Some(bytes) - } - Err(e) => { - if e.kind() != ErrorKind::NotFound { - warn!(e; "Failed to get file for key {:?}", key); - } - - // We removes the file from the index. - index.remove(&key).await; - CACHE_MISS - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - None - } - } - } - - /// Removes a file from the cache explicitly. - /// It always tries to remove the file from the local store because we may not have the file - /// in the memory index if upload is failed. - pub(crate) async fn remove(&self, key: IndexKey) { - let file_path = self.cache_file_path(key); - self.memory_index(key.file_type).remove(&key).await; - // Always delete the file from the local store. - if let Err(e) = self.local_store.delete(&file_path).await { - warn!(e; "Failed to delete a cached file {}", file_path); - } - } - - async fn recover_inner(&self) -> Result<()> { + /// Recovers the index from local store. + async fn recover(&self) -> Result<()> { let now = Instant::now(); let mut lister = self .local_store @@ -308,7 +141,7 @@ impl FileCache { // Track sizes separately for each file type match key.file_type { FileType::Parquet => parquet_size += size, - FileType::Puffin => puffin_size += size, + FileType::Puffin { .. } => puffin_size += size, } } // The metrics is a signed int gauge so we can updates it finally. @@ -341,157 +174,28 @@ impl FileCache { Ok(()) } - /// Recovers the index from local store. 
- /// - /// If `task_receiver` is provided, spawns a background task after recovery - /// to process `RegionLoadCacheTask` messages for loading files into the cache. - pub(crate) async fn recover( - self: &Arc, - sync: bool, - task_receiver: Option>, - ) { - let moved_self = self.clone(); - let handle = tokio::spawn(async move { - if let Err(err) = moved_self.recover_inner().await { - error!(err; "Failed to recover file cache.") - } - - // Spawns background task to process region load cache tasks after recovery. - // So it won't block the recovery when `sync` is true. - if let Some(mut receiver) = task_receiver { - let cache_ref = moved_self.clone(); - info!("Spawning background task for processing region load cache tasks"); - tokio::spawn(async move { - while let Some(task) = receiver.recv().await { - let file_cache = cache_ref.clone(); - task.fill_cache(file_cache).await; - } - info!("Background task for processing region load cache tasks stopped"); - }); - } - }); - - if sync { - let _ = handle.await; - } - } - - /// Returns the cache file path for the key. - pub(crate) fn cache_file_path(&self, key: IndexKey) -> String { - cache_file_path(FILE_DIR, key) - } - - /// Returns the local store of the file cache. - pub(crate) fn local_store(&self) -> ObjectStore { - self.local_store.clone() - } - - /// Get the parquet metadata in file cache. - /// If the file is not in the cache or fail to load metadata, return None. - pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option { - // Check if file cache contains the key - if let Some(index_value) = self.parquet_index.get(&key).await { - // Load metadata from file cache - let local_store = self.local_store(); - let file_path = self.cache_file_path(key); - let file_size = index_value.file_size as u64; - let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size); - - match metadata_loader.load().await { - Ok(metadata) => { - CACHE_HIT - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - Some(metadata) - } - Err(e) => { - if !e.is_object_not_found() { - warn!( - e; "Failed to get parquet metadata for key {:?}", - key - ); - } - // We removes the file from the index. - self.parquet_index.remove(&key).await; - CACHE_MISS - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - None - } - } - } else { - CACHE_MISS - .with_label_values(&[key.file_type.metric_label()]) - .inc(); - None - } - } - - async fn get_reader(&self, file_path: &str) -> object_store::Result> { - if self.local_store.exists(file_path).await? { - Ok(Some(self.local_store.reader(file_path).await?)) - } else { - Ok(None) - } - } - - /// Checks if the key is in the file cache. - pub(crate) fn contains_key(&self, key: &IndexKey) -> bool { - self.memory_index(key.file_type).contains_key(key) - } - - /// Returns the capacity of the puffin (index) cache in bytes. - pub(crate) fn puffin_cache_capacity(&self) -> u64 { - self.puffin_capacity - } - - /// Returns the current weighted size (used bytes) of the puffin (index) cache. - pub(crate) fn puffin_cache_size(&self) -> u64 { - self.puffin_index.weighted_size() - } - - /// Downloads a file in `remote_path` from the remote object store to the local cache - /// (specified by `index_key`). 
- pub(crate) async fn download( - &self, - index_key: IndexKey, - remote_path: &str, - remote_store: &ObjectStore, - file_size: u64, - ) -> Result<()> { - if let Err(e) = self - .download_without_cleaning(index_key, remote_path, remote_store, file_size) - .await - { - let filename = index_key.to_string(); - TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await; - - return Err(e); - } - Ok(()) - } - + /// Downloads a file without cleaning up on error. async fn download_without_cleaning( &self, index_key: IndexKey, remote_path: &str, remote_store: &ObjectStore, file_size: u64, + concurrency: usize, ) -> Result<()> { - const DOWNLOAD_READER_CONCURRENCY: usize = 8; const DOWNLOAD_READER_CHUNK_SIZE: ReadableSize = ReadableSize::mb(8); let file_type = index_key.file_type; let timer = WRITE_CACHE_DOWNLOAD_ELAPSED .with_label_values(&[match file_type { FileType::Parquet => "download_parquet", - FileType::Puffin => "download_puffin", + FileType::Puffin { .. } => "download_puffin", }]) .start_timer(); let reader = remote_store .reader_with(remote_path) - .concurrent(DOWNLOAD_READER_CONCURRENCY) + .concurrent(concurrency) .chunk(DOWNLOAD_READER_CHUNK_SIZE.as_bytes() as usize) .await .context(error::OpenDalSnafu)? @@ -537,11 +241,444 @@ impl FileCache { self.put(index_key, index_value).await; Ok(()) } + + /// Downloads a file from remote store to local cache. + async fn download( + &self, + index_key: IndexKey, + remote_path: &str, + remote_store: &ObjectStore, + file_size: u64, + concurrency: usize, + ) -> Result<()> { + if let Err(e) = self + .download_without_cleaning(index_key, remote_path, remote_store, file_size, concurrency) + .await + { + error!(e; "Failed to download file '{}' for region {}", remote_path, index_key.region_id); + + let filename = index_key.to_string(); + TempFileCleaner::clean_atomic_dir_files(&self.local_store, &[&filename]).await; + + return Err(e); + } + + Ok(()) + } + + /// Checks if the key is in the file cache. + fn contains_key(&self, key: &IndexKey) -> bool { + self.memory_index(key.file_type).contains_key(key) + } +} + +/// A file cache manages files on local store and evict files based +/// on size. +#[derive(Debug, Clone)] +pub(crate) struct FileCache { + /// Inner cache state shared with background worker. + inner: Arc, + /// Capacity of the puffin (index) cache in bytes. + puffin_capacity: u64, + /// Channel for background download tasks. None if background worker is disabled. + download_task_tx: Option>, +} + +pub(crate) type FileCacheRef = Arc; + +impl FileCache { + /// Creates a new file cache. 
+ pub(crate) fn new( + local_store: ObjectStore, + capacity: ReadableSize, + ttl: Option, + index_cache_percent: Option, + enable_background_worker: bool, + ) -> FileCache { + // Validate and use the provided percent or default + let index_percent = index_cache_percent + .filter(|&percent| percent > 0 && percent < 100) + .unwrap_or(DEFAULT_INDEX_CACHE_PERCENT); + let total_capacity = capacity.as_bytes(); + + // Convert percent to ratio and calculate capacity for each cache + let index_ratio = index_percent as f64 / 100.0; + let puffin_capacity = (total_capacity as f64 * index_ratio) as u64; + let parquet_capacity = total_capacity - puffin_capacity; + + // Ensure both capacities are at least 512MB + let puffin_capacity = puffin_capacity.max(MIN_CACHE_CAPACITY); + let parquet_capacity = parquet_capacity.max(MIN_CACHE_CAPACITY); + + info!( + "Initializing file cache with index_percent: {}%, total_capacity: {}, parquet_capacity: {}, puffin_capacity: {}", + index_percent, + ReadableSize(total_capacity), + ReadableSize(parquet_capacity), + ReadableSize(puffin_capacity) + ); + + let parquet_index = Self::build_cache(local_store.clone(), parquet_capacity, ttl, "file"); + let puffin_index = Self::build_cache(local_store.clone(), puffin_capacity, ttl, "index"); + + // Create inner cache shared with background worker + let inner = Arc::new(FileCacheInner { + local_store, + parquet_index, + puffin_index, + }); + + // Only create channel and spawn worker if background download is enabled + let download_task_tx = if enable_background_worker { + let (tx, rx) = tokio::sync::mpsc::channel(DOWNLOAD_TASK_CHANNEL_SIZE); + Self::spawn_download_worker(inner.clone(), rx); + Some(tx) + } else { + None + }; + + FileCache { + inner, + puffin_capacity, + download_task_tx, + } + } + + /// Spawns a background worker to process download tasks. + fn spawn_download_worker( + inner: Arc, + mut download_task_rx: tokio::sync::mpsc::Receiver, + ) { + tokio::spawn(async move { + info!("Background download worker started"); + while let Some(task) = download_task_rx.recv().await { + // Check if the file is already in the cache + if inner.contains_key(&task.index_key) { + debug!( + "Skipping background download for region {}, file {} - already in cache", + task.index_key.region_id, task.index_key.file_id + ); + continue; + } + + // Ignores background download errors. + let _ = inner + .download( + task.index_key, + &task.remote_path, + &task.remote_store, + task.file_size, + 1, // Background downloads use concurrency=1 + ) + .await; + } + info!("Background download worker stopped"); + }); + } + + /// Builds a cache for a specific file type. + fn build_cache( + local_store: ObjectStore, + capacity: u64, + ttl: Option, + label: &'static str, + ) -> Cache { + let cache_store = local_store; + let mut builder = Cache::builder() + .eviction_policy(EvictionPolicy::lru()) + .weigher(|_key, value: &IndexValue| -> u32 { + // We only measure space on local store. + value.file_size + }) + .max_capacity(capacity) + .async_eviction_listener(move |key, value, cause| { + let store = cache_store.clone(); + // Stores files under FILE_DIR. + let file_path = cache_file_path(FILE_DIR, *key); + async move { + if let RemovalCause::Replaced = cause { + // The cache is replaced by another file (maybe download again). We don't remove the same + // file but updates the metrics as the file is already replaced by users. 
+ CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into()); + return; + } + + match store.delete(&file_path).await { + Ok(()) => { + CACHE_BYTES.with_label_values(&[label]).sub(value.file_size.into()); + } + Err(e) => { + warn!(e; "Failed to delete cached file {} for region {}", file_path, key.region_id); + } + } + } + .boxed() + }); + if let Some(ttl) = ttl { + builder = builder.time_to_idle(ttl); + } + builder.build() + } + + /// Puts a file into the cache index. + /// + /// The `WriteCache` should ensure the file is in the correct path. + pub(crate) async fn put(&self, key: IndexKey, value: IndexValue) { + self.inner.put(key, value).await + } + + pub(crate) async fn get(&self, key: IndexKey) -> Option { + self.inner.memory_index(key.file_type).get(&key).await + } + + /// Reads a file from the cache. + #[allow(unused)] + pub(crate) async fn reader(&self, key: IndexKey) -> Option { + // We must use `get()` to update the estimator of the cache. + // See https://docs.rs/moka/latest/moka/future/struct.Cache.html#method.contains_key + let index = self.inner.memory_index(key.file_type); + if index.get(&key).await.is_none() { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + return None; + } + + let file_path = self.inner.cache_file_path(key); + match self.get_reader(&file_path).await { + Ok(Some(reader)) => { + CACHE_HIT + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + return Some(reader); + } + Err(e) => { + if e.kind() != ErrorKind::NotFound { + warn!(e; "Failed to get file for key {:?}", key); + } + } + Ok(None) => {} + } + + // We removes the file from the index. + index.remove(&key).await; + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + None + } + + /// Reads ranges from the cache. + pub(crate) async fn read_ranges( + &self, + key: IndexKey, + ranges: &[Range], + ) -> Option> { + let index = self.inner.memory_index(key.file_type); + if index.get(&key).await.is_none() { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + return None; + } + + let file_path = self.inner.cache_file_path(key); + // In most cases, it will use blocking read, + // because FileCache is normally based on local file system, which supports blocking read. + let bytes_result = + fetch_byte_ranges(&file_path, self.inner.local_store.clone(), ranges).await; + match bytes_result { + Ok(bytes) => { + CACHE_HIT + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + Some(bytes) + } + Err(e) => { + if e.kind() != ErrorKind::NotFound { + warn!(e; "Failed to get file for key {:?}", key); + } + + // We removes the file from the index. + index.remove(&key).await; + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + None + } + } + } + + /// Removes a file from the cache explicitly. + /// It always tries to remove the file from the local store because we may not have the file + /// in the memory index if upload is failed. + pub(crate) async fn remove(&self, key: IndexKey) { + let file_path = self.inner.cache_file_path(key); + self.inner.memory_index(key.file_type).remove(&key).await; + // Always delete the file from the local store. + if let Err(e) = self.inner.local_store.delete(&file_path).await { + warn!(e; "Failed to delete a cached file {}", file_path); + } + } + + /// Recovers the index from local store. + /// + /// If `task_receiver` is provided, spawns a background task after recovery + /// to process `RegionLoadCacheTask` messages for loading files into the cache. 
+ pub(crate) async fn recover( + &self, + sync: bool, + task_receiver: Option>, + ) { + let moved_self = self.clone(); + let handle = tokio::spawn(async move { + if let Err(err) = moved_self.inner.recover().await { + error!(err; "Failed to recover file cache.") + } + + // Spawns background task to process region load cache tasks after recovery. + // So it won't block the recovery when `sync` is true. + if let Some(mut receiver) = task_receiver { + info!("Spawning background task for processing region load cache tasks"); + tokio::spawn(async move { + while let Some(task) = receiver.recv().await { + task.fill_cache(&moved_self).await; + } + info!("Background task for processing region load cache tasks stopped"); + }); + } + }); + + if sync { + let _ = handle.await; + } + } + + /// Returns the cache file path for the key. + pub(crate) fn cache_file_path(&self, key: IndexKey) -> String { + self.inner.cache_file_path(key) + } + + /// Returns the local store of the file cache. + pub(crate) fn local_store(&self) -> ObjectStore { + self.inner.local_store.clone() + } + + /// Get the parquet metadata in file cache. + /// If the file is not in the cache or fail to load metadata, return None. + pub(crate) async fn get_parquet_meta_data(&self, key: IndexKey) -> Option { + // Check if file cache contains the key + if let Some(index_value) = self.inner.parquet_index.get(&key).await { + // Load metadata from file cache + let local_store = self.local_store(); + let file_path = self.inner.cache_file_path(key); + let file_size = index_value.file_size as u64; + let metadata_loader = MetadataLoader::new(local_store, &file_path, file_size); + + match metadata_loader.load().await { + Ok(metadata) => { + CACHE_HIT + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + Some(metadata) + } + Err(e) => { + if !e.is_object_not_found() { + warn!( + e; "Failed to get parquet metadata for key {:?}", + key + ); + } + // We removes the file from the index. + self.inner.parquet_index.remove(&key).await; + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + None + } + } + } else { + CACHE_MISS + .with_label_values(&[key.file_type.metric_label()]) + .inc(); + None + } + } + + async fn get_reader(&self, file_path: &str) -> object_store::Result> { + if self.inner.local_store.exists(file_path).await? { + Ok(Some(self.inner.local_store.reader(file_path).await?)) + } else { + Ok(None) + } + } + + /// Checks if the key is in the file cache. + pub(crate) fn contains_key(&self, key: &IndexKey) -> bool { + self.inner.contains_key(key) + } + + /// Returns the capacity of the puffin (index) cache in bytes. + pub(crate) fn puffin_cache_capacity(&self) -> u64 { + self.puffin_capacity + } + + /// Returns the current weighted size (used bytes) of the puffin (index) cache. + pub(crate) fn puffin_cache_size(&self) -> u64 { + self.inner.puffin_index.weighted_size() + } + + /// Downloads a file in `remote_path` from the remote object store to the local cache + /// (specified by `index_key`). + pub(crate) async fn download( + &self, + index_key: IndexKey, + remote_path: &str, + remote_store: &ObjectStore, + file_size: u64, + ) -> Result<()> { + self.inner + .download(index_key, remote_path, remote_store, file_size, 8) // Foreground uses concurrency=8 + .await + } + + /// Downloads a file in `remote_path` from the remote object store to the local cache + /// (specified by `index_key`) in the background. Errors are logged but not returned. 
+ /// + /// This method attempts to send a download task to the background worker. + /// If the channel is full, the task is silently dropped. + pub(crate) fn maybe_download_background( + &self, + index_key: IndexKey, + remote_path: String, + remote_store: ObjectStore, + file_size: u64, + ) { + // Do nothing if background worker is disabled (channel is None) + let Some(tx) = &self.download_task_tx else { + return; + }; + + let task = DownloadTask { + index_key, + remote_path, + remote_store, + file_size, + }; + + // Try to send the task; if the channel is full, just drop it + if let Err(e) = tx.try_send(task) { + debug!( + "Failed to queue background download task for region {}, file {}: {:?}", + index_key.region_id, index_key.file_id, e + ); + } + } } /// Key of file cache index. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub(crate) struct IndexKey { +pub struct IndexKey { pub region_id: RegionId, pub file_id: FileId, pub file_type: FileType, @@ -565,7 +702,7 @@ impl fmt::Display for IndexKey { "{}.{}.{}", self.region_id.as_u64(), self.file_id, - self.file_type.as_str() + self.file_type ) } } @@ -576,7 +713,16 @@ pub enum FileType { /// Parquet file. Parquet, /// Puffin file. - Puffin, + Puffin(u64), +} + +impl fmt::Display for FileType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + FileType::Parquet => write!(f, "parquet"), + FileType::Puffin(version) => write!(f, "{}.puffin", version), + } + } } impl FileType { @@ -584,16 +730,16 @@ impl FileType { fn parse(s: &str) -> Option { match s { "parquet" => Some(FileType::Parquet), - "puffin" => Some(FileType::Puffin), - _ => None, - } - } - - /// Converts the file type to string. - fn as_str(&self) -> &'static str { - match self { - FileType::Parquet => "parquet", - FileType::Puffin => "puffin", + "puffin" => Some(FileType::Puffin(0)), + _ => { + // if post-fix with .puffin, try to parse the version + if let Some(version_str) = s.strip_suffix(".puffin") { + let version = version_str.parse::().ok()?; + Some(FileType::Puffin(version)) + } else { + None + } + } } } @@ -601,7 +747,7 @@ impl FileType { fn metric_label(&self) -> &'static str { match self { FileType::Parquet => FILE_TYPE, - FileType::Puffin => INDEX_TYPE, + FileType::Puffin(_) => INDEX_TYPE, } } } @@ -657,6 +803,7 @@ mod tests { ReadableSize::mb(10), Some(Duration::from_millis(10)), None, + true, // enable_background_worker ); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); @@ -683,7 +830,7 @@ mod tests { let exist = cache.reader(key).await; assert!(exist.is_some()); tokio::time::sleep(Duration::from_millis(15)).await; - cache.parquet_index.run_pending_tasks().await; + cache.inner.parquet_index.run_pending_tasks().await; let non = cache.reader(key).await; assert!(non.is_none()); } @@ -693,7 +840,13 @@ mod tests { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); + let cache = FileCache::new( + local_store.clone(), + ReadableSize::mb(10), + None, + None, + true, // enable_background_worker + ); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); let key = IndexKey::new(region_id, file_id, FileType::Parquet); @@ -721,19 +874,19 @@ mod tests { assert_eq!("hello", String::from_utf8(buf).unwrap()); // Get weighted size. 
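Cache keys keep their `<region_id>.<file_id>.<suffix>` layout, but the suffix now encodes the index version: a plain `puffin` suffix still parses as version 0 (presumably so entries cached before this change remain readable), while newer index entries use `<version>.puffin`. The parsing rule, restated as a self-contained sketch with its own simplified file-type enum:

```rust
#[derive(Debug, PartialEq)]
enum SketchFileType {
    Parquet,
    Puffin(u64),
}

/// Parses the file-type suffix of a cache key.
fn parse_suffix(s: &str) -> Option<SketchFileType> {
    match s {
        "parquet" => Some(SketchFileType::Parquet),
        // Legacy index entries carry no version and default to 0.
        "puffin" => Some(SketchFileType::Puffin(0)),
        _ => s
            .strip_suffix(".puffin")
            .and_then(|v| v.parse::<u64>().ok())
            .map(SketchFileType::Puffin),
    }
}

#[cfg(test)]
mod suffix_tests {
    use super::*;

    #[test]
    fn parse_versions() {
        assert_eq!(parse_suffix("parquet"), Some(SketchFileType::Parquet));
        assert_eq!(parse_suffix("puffin"), Some(SketchFileType::Puffin(0)));
        assert_eq!(parse_suffix("42.puffin"), Some(SketchFileType::Puffin(42)));
        assert_eq!(parse_suffix("not-a-number.puffin"), None);
    }
}
```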
- cache.parquet_index.run_pending_tasks().await; - assert_eq!(5, cache.parquet_index.weighted_size()); + cache.inner.parquet_index.run_pending_tasks().await; + assert_eq!(5, cache.inner.parquet_index.weighted_size()); // Remove the file. cache.remove(key).await; assert!(cache.reader(key).await.is_none()); // Ensure all pending tasks of the moka cache is done before assertion. - cache.parquet_index.run_pending_tasks().await; + cache.inner.parquet_index.run_pending_tasks().await; // The file also not exists. assert!(!local_store.exists(&file_path).await.unwrap()); - assert_eq!(0, cache.parquet_index.weighted_size()); + assert_eq!(0, cache.inner.parquet_index.weighted_size()); } #[tokio::test] @@ -741,7 +894,13 @@ mod tests { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); + let cache = FileCache::new( + local_store.clone(), + ReadableSize::mb(10), + None, + None, + true, // enable_background_worker + ); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); let key = IndexKey::new(region_id, file_id, FileType::Parquet); @@ -766,14 +925,20 @@ mod tests { // Reader is none. assert!(cache.reader(key).await.is_none()); // Key is removed. - assert!(!cache.parquet_index.contains_key(&key)); + assert!(!cache.inner.parquet_index.contains_key(&key)); } #[tokio::test] async fn test_file_cache_recover() { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); + let cache = FileCache::new( + local_store.clone(), + ReadableSize::mb(10), + None, + None, + true, // enable_background_worker + ); let region_id = RegionId::new(2000, 0); let file_type = FileType::Parquet; @@ -799,12 +964,13 @@ mod tests { } // Recover the cache. - let cache = Arc::new(FileCache::new( + let cache = FileCache::new( local_store.clone(), ReadableSize::mb(10), None, None, - )); + true, // enable_background_worker + ); // No entry before recovery. assert!( cache @@ -815,8 +981,11 @@ mod tests { cache.recover(true, None).await; // Check size. 
- cache.parquet_index.run_pending_tasks().await; - assert_eq!(total_size, cache.parquet_index.weighted_size() as usize); + cache.inner.parquet_index.run_pending_tasks().await; + assert_eq!( + total_size, + cache.inner.parquet_index.weighted_size() as usize + ); for (i, file_id) in file_ids.iter().enumerate() { let key = IndexKey::new(region_id, *file_id, file_type); @@ -830,7 +999,13 @@ mod tests { async fn test_file_cache_read_ranges() { let dir = create_temp_dir(""); let local_store = new_fs_store(dir.path().to_str().unwrap()); - let file_cache = FileCache::new(local_store.clone(), ReadableSize::mb(10), None, None); + let file_cache = FileCache::new( + local_store.clone(), + ReadableSize::mb(10), + None, + None, + true, // enable_background_worker + ); let region_id = RegionId::new(2000, 0); let file_id = FileId::random(); let key = IndexKey::new(region_id, file_id, FileType::Parquet); @@ -881,6 +1056,15 @@ mod tests { IndexKey::new(region_id, file_id, FileType::Parquet), parse_index_key("5299989643269.3368731b-a556-42b8-a5df-9c31ce155095.parquet").unwrap() ); + assert_eq!( + IndexKey::new(region_id, file_id, FileType::Puffin(0)), + parse_index_key("5299989643269.3368731b-a556-42b8-a5df-9c31ce155095.puffin").unwrap() + ); + assert_eq!( + IndexKey::new(region_id, file_id, FileType::Puffin(42)), + parse_index_key("5299989643269.3368731b-a556-42b8-a5df-9c31ce155095.42.puffin") + .unwrap() + ); assert!(parse_index_key("").is_none()); assert!(parse_index_key(".").is_none()); assert!(parse_index_key("5299989643269").is_none()); diff --git a/src/mito2/src/cache/index.rs b/src/mito2/src/cache/index.rs index cf24772994..7393773a89 100644 --- a/src/mito2/src/cache/index.rs +++ b/src/mito2/src/cache/index.rs @@ -31,6 +31,29 @@ const INDEX_METADATA_TYPE: &str = "index_metadata"; /// Metrics for index content. const INDEX_CONTENT_TYPE: &str = "index_content"; +/// Metrics collected from IndexCache operations. +#[derive(Debug, Default, Clone)] +pub struct IndexCacheMetrics { + /// Number of cache hits. + pub cache_hit: usize, + /// Number of cache misses. + pub cache_miss: usize, + /// Number of pages accessed. + pub num_pages: usize, + /// Total bytes from pages. + pub page_bytes: u64, +} + +impl IndexCacheMetrics { + /// Merges another set of metrics into this one. + pub fn merge(&mut self, other: &Self) { + self.cache_hit += other.cache_hit; + self.cache_miss += other.cache_miss; + self.num_pages += other.num_pages; + self.page_bytes += other.page_bytes; + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct PageKey { page_id: u64, @@ -160,18 +183,20 @@ where offset: u64, size: u32, load: F, - ) -> Result, E> + ) -> Result<(Vec, IndexCacheMetrics), E> where F: Fn(Vec>) -> Fut, Fut: Future, E>>, E: std::error::Error, { + let mut metrics = IndexCacheMetrics::default(); let page_keys = PageKey::generate_page_keys(offset, size, self.page_size).collect::>(); // Size is 0, return empty data. 
if page_keys.is_empty() { - return Ok(Vec::new()); + return Ok((Vec::new(), metrics)); } + metrics.num_pages = page_keys.len(); let mut data = Vec::with_capacity(page_keys.len()); data.resize(page_keys.len(), Bytes::new()); let mut cache_miss_range = vec![]; @@ -182,10 +207,13 @@ where match self.get_page(key, *page_key) { Some(page) => { CACHE_HIT.with_label_values(&[INDEX_CONTENT_TYPE]).inc(); + metrics.cache_hit += 1; + metrics.page_bytes += page.len() as u64; data[i] = page; } None => { CACHE_MISS.with_label_values(&[INDEX_CONTENT_TYPE]).inc(); + metrics.cache_miss += 1; let base_offset = page_key.page_id * self.page_size; let pruned_size = if i == last_index { prune_size(page_keys.iter(), file_size, self.page_size) @@ -201,14 +229,18 @@ where let pages = load(cache_miss_range).await?; for (i, page) in cache_miss_idx.into_iter().zip(pages.into_iter()) { let page_key = page_keys[i]; + metrics.page_bytes += page.len() as u64; data[i] = page.clone(); self.put_page(key, page_key, page.clone()); } } let buffer = Buffer::from_iter(data.into_iter()); - Ok(buffer - .slice(PageKey::calculate_range(offset, size, self.page_size)) - .to_vec()) + Ok(( + buffer + .slice(PageKey::calculate_range(offset, size, self.page_size)) + .to_vec(), + metrics, + )) } fn get_page(&self, key: K, page_key: PageKey) -> Option { @@ -216,6 +248,8 @@ where } fn put_page(&self, key: K, page_key: PageKey, value: Bytes) { + // Clones the value to ensure it doesn't reference a larger buffer. + let value = Bytes::from(value.to_vec()); CACHE_BYTES .with_label_values(&[INDEX_CONTENT_TYPE]) .add((self.weight_of_content)(&(key, page_key), &value).into()); diff --git a/src/mito2/src/cache/index/bloom_filter_index.rs b/src/mito2/src/cache/index/bloom_filter_index.rs index b4e7804b93..707afb18a6 100644 --- a/src/mito2/src/cache/index/bloom_filter_index.rs +++ b/src/mito2/src/cache/index/bloom_filter_index.rs @@ -14,13 +14,14 @@ use std::ops::Range; use std::sync::Arc; +use std::time::Instant; use api::v1::index::{BloomFilterLoc, BloomFilterMeta}; use async_trait::async_trait; use bytes::Bytes; use index::bloom_filter::error::Result; -use index::bloom_filter::reader::BloomFilterReader; -use store_api::storage::{ColumnId, FileId}; +use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReader}; +use store_api::storage::{ColumnId, FileId, IndexVersion}; use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey}; use crate::metrics::{CACHE_HIT, CACHE_MISS}; @@ -34,8 +35,10 @@ pub enum Tag { Fulltext, } +pub type BloomFilterIndexKey = (FileId, IndexVersion, ColumnId, Tag); + /// Cache for bloom filter index. -pub type BloomFilterIndexCache = IndexCache<(FileId, ColumnId, Tag), BloomFilterMeta>; +pub type BloomFilterIndexCache = IndexCache; pub type BloomFilterIndexCacheRef = Arc; impl BloomFilterIndexCache { @@ -58,11 +61,9 @@ impl BloomFilterIndexCache { } /// Calculates weight for bloom filter index metadata. -fn bloom_filter_index_metadata_weight( - k: &(FileId, ColumnId, Tag), - meta: &Arc, -) -> u32 { +fn bloom_filter_index_metadata_weight(k: &BloomFilterIndexKey, meta: &Arc) -> u32 { let base = k.0.as_bytes().len() + + std::mem::size_of::() + std::mem::size_of::() + std::mem::size_of::() + std::mem::size_of::(); @@ -74,16 +75,14 @@ fn bloom_filter_index_metadata_weight( } /// Calculates weight for bloom filter index content. 
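The extra copy in `put_page` matters because `Bytes::slice` only bumps a reference count: a cached 64-byte page cut from a multi-megabyte read buffer would keep the whole parent allocation alive, while the cache weigher only accounts for the slice's length. A small illustration of the difference, using the `bytes` crate directly:

```rust
use bytes::Bytes;

/// Copies a page out of its parent buffer so the parent can be freed.
fn detach(page: &Bytes) -> Bytes {
    Bytes::from(page.to_vec())
}

fn main() {
    let big = Bytes::from(vec![0u8; 8 * 1024 * 1024]);
    let shared = big.slice(0..64); // still pins the 8 MiB allocation
    let owned = detach(&shared);   // owns exactly 64 bytes of its own
    drop(big);
    drop(shared);                  // only now is the 8 MiB buffer released
    assert_eq!(owned.len(), 64);
}
```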
-fn bloom_filter_index_content_weight( - (k, _): &((FileId, ColumnId, Tag), PageKey), - v: &Bytes, -) -> u32 { +fn bloom_filter_index_content_weight((k, _): &(BloomFilterIndexKey, PageKey), v: &Bytes) -> u32 { (k.0.as_bytes().len() + std::mem::size_of::() + v.len()) as u32 } /// Bloom filter index blob reader with cache. pub struct CachedBloomFilterIndexBlobReader { file_id: FileId, + index_version: IndexVersion, column_id: ColumnId, tag: Tag, blob_size: u64, @@ -95,6 +94,7 @@ impl CachedBloomFilterIndexBlobReader { /// Creates a new bloom filter index blob reader with cache. pub fn new( file_id: FileId, + index_version: IndexVersion, column_id: ColumnId, tag: Tag, blob_size: u64, @@ -103,6 +103,7 @@ impl CachedBloomFilterIndexBlobReader { ) -> Self { Self { file_id, + index_version, column_id, tag, blob_size, @@ -114,53 +115,95 @@ impl CachedBloomFilterIndexBlobReader { #[async_trait] impl BloomFilterReader for CachedBloomFilterIndexBlobReader { - async fn range_read(&self, offset: u64, size: u32) -> Result { + async fn range_read( + &self, + offset: u64, + size: u32, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result { + let start = metrics.as_ref().map(|_| Instant::now()); let inner = &self.inner; - self.cache + let (result, cache_metrics) = self + .cache .get_or_load( - (self.file_id, self.column_id, self.tag), + (self.file_id, self.index_version, self.column_id, self.tag), self.blob_size, offset, size, - move |ranges| async move { inner.read_vec(&ranges).await }, + move |ranges| async move { inner.read_vec(&ranges, None).await }, ) - .await - .map(|b| b.into()) + .await?; + + if let Some(m) = metrics { + m.total_ranges += cache_metrics.num_pages; + m.total_bytes += cache_metrics.page_bytes; + m.cache_hit += cache_metrics.cache_hit; + m.cache_miss += cache_metrics.cache_miss; + if let Some(start) = start { + m.fetch_elapsed += start.elapsed(); + } + } + + Ok(result.into()) } - async fn read_vec(&self, ranges: &[Range]) -> Result> { + async fn read_vec( + &self, + ranges: &[Range], + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result> { + let start = metrics.as_ref().map(|_| Instant::now()); + let mut pages = Vec::with_capacity(ranges.len()); + let mut total_cache_metrics = crate::cache::index::IndexCacheMetrics::default(); for range in ranges { let inner = &self.inner; - let page = self + let (page, cache_metrics) = self .cache .get_or_load( - (self.file_id, self.column_id, self.tag), + (self.file_id, self.index_version, self.column_id, self.tag), self.blob_size, range.start, (range.end - range.start) as u32, - move |ranges| async move { inner.read_vec(&ranges).await }, + move |ranges| async move { inner.read_vec(&ranges, None).await }, ) .await?; + total_cache_metrics.merge(&cache_metrics); pages.push(Bytes::from(page)); } + if let Some(m) = metrics { + m.total_ranges += total_cache_metrics.num_pages; + m.total_bytes += total_cache_metrics.page_bytes; + m.cache_hit += total_cache_metrics.cache_hit; + m.cache_miss += total_cache_metrics.cache_miss; + if let Some(start) = start { + m.fetch_elapsed += start.elapsed(); + } + } + Ok(pages) } /// Reads the meta information of the bloom filter. 
- async fn metadata(&self) -> Result { - if let Some(cached) = self - .cache - .get_metadata((self.file_id, self.column_id, self.tag)) + async fn metadata( + &self, + metrics: Option<&mut BloomFilterReadMetrics>, + ) -> Result { + if let Some(cached) = + self.cache + .get_metadata((self.file_id, self.index_version, self.column_id, self.tag)) { CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc(); + if let Some(m) = metrics { + m.cache_hit += 1; + } Ok((*cached).clone()) } else { - let meta = self.inner.metadata().await?; + let meta = self.inner.metadata(metrics).await?; self.cache.put_metadata( - (self.file_id, self.column_id, self.tag), + (self.file_id, self.index_version, self.column_id, self.tag), Arc::new(meta.clone()), ); CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc(); @@ -180,6 +223,7 @@ mod test { #[test] fn bloom_filter_metadata_weight_counts_vec_contents() { let file_id = FileId::parse_str("00000000-0000-0000-0000-000000000001").unwrap(); + let version = 0; let column_id: ColumnId = 42; let tag = Tag::Skipping; @@ -203,10 +247,13 @@ mod test { ], }; - let weight = - bloom_filter_index_metadata_weight(&(file_id, column_id, tag), &Arc::new(meta.clone())); + let weight = bloom_filter_index_metadata_weight( + &(file_id, version, column_id, tag), + &Arc::new(meta.clone()), + ); let base = file_id.as_bytes().len() + + std::mem::size_of::() + std::mem::size_of::() + std::mem::size_of::() + std::mem::size_of::(); diff --git a/src/mito2/src/cache/index/inverted_index.rs b/src/mito2/src/cache/index/inverted_index.rs index 06a7a3f6d4..4c1b07126c 100644 --- a/src/mito2/src/cache/index/inverted_index.rs +++ b/src/mito2/src/cache/index/inverted_index.rs @@ -14,14 +14,15 @@ use core::ops::Range; use std::sync::Arc; +use std::time::Instant; use api::v1::index::InvertedIndexMetas; use async_trait::async_trait; use bytes::Bytes; use index::inverted_index::error::Result; -use index::inverted_index::format::reader::InvertedIndexReader; +use index::inverted_index::format::reader::{InvertedIndexReadMetrics, InvertedIndexReader}; use prost::Message; -use store_api::storage::FileId; +use store_api::storage::{FileId, IndexVersion}; use crate::cache::index::{INDEX_METADATA_TYPE, IndexCache, PageKey}; use crate::metrics::{CACHE_HIT, CACHE_MISS}; @@ -29,7 +30,7 @@ use crate::metrics::{CACHE_HIT, CACHE_MISS}; const INDEX_TYPE_INVERTED_INDEX: &str = "inverted_index"; /// Cache for inverted index. -pub type InvertedIndexCache = IndexCache; +pub type InvertedIndexCache = IndexCache<(FileId, IndexVersion), InvertedIndexMetas>; pub type InvertedIndexCacheRef = Arc; impl InvertedIndexCache { @@ -47,23 +48,24 @@ impl InvertedIndexCache { /// Removes all cached entries for the given `file_id`. pub fn invalidate_file(&self, file_id: FileId) { - self.invalidate_if(move |key| *key == file_id); + self.invalidate_if(move |key| key.0 == file_id); } } /// Calculates weight for inverted index metadata. -fn inverted_index_metadata_weight(k: &FileId, v: &Arc) -> u32 { - (k.as_bytes().len() + v.encoded_len()) as u32 +fn inverted_index_metadata_weight(k: &(FileId, IndexVersion), v: &Arc) -> u32 { + (k.0.as_bytes().len() + size_of::() + v.encoded_len()) as u32 } /// Calculates weight for inverted index content. 
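Both index caches are now keyed by `(FileId, IndexVersion)`, so metadata and pages for several versions of the same file's index can coexist, and `invalidate_file` still clears all of them by matching only the first half of the key. A toy, non-moka model of that keying and invalidation; `FileId` is simplified here (the real one is a UUID) and the value type is arbitrary:

```rust
use std::collections::HashMap;

type FileId = u128;
type IndexVersion = u64;

#[derive(Default)]
struct VersionedMetaCache {
    entries: HashMap<(FileId, IndexVersion), Vec<u8>>,
}

impl VersionedMetaCache {
    fn put(&mut self, file_id: FileId, version: IndexVersion, meta: Vec<u8>) {
        self.entries.insert((file_id, version), meta);
    }

    /// Drops every version of `file_id`, like `invalidate_if(|key| key.0 == file_id)`.
    fn invalidate_file(&mut self, file_id: FileId) {
        self.entries.retain(|key, _| key.0 != file_id);
    }
}
```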
-fn inverted_index_content_weight((k, _): &(FileId, PageKey), v: &Bytes) -> u32 { - (k.as_bytes().len() + v.len()) as u32 +fn inverted_index_content_weight((k, _): &((FileId, IndexVersion), PageKey), v: &Bytes) -> u32 { + (k.0.as_bytes().len() + size_of::() + v.len()) as u32 } /// Inverted index blob reader with cache. pub struct CachedInvertedIndexBlobReader { file_id: FileId, + index_version: IndexVersion, blob_size: u64, inner: R, cache: InvertedIndexCacheRef, @@ -71,9 +73,16 @@ pub struct CachedInvertedIndexBlobReader { impl CachedInvertedIndexBlobReader { /// Creates a new inverted index blob reader with cache. - pub fn new(file_id: FileId, blob_size: u64, inner: R, cache: InvertedIndexCacheRef) -> Self { + pub fn new( + file_id: FileId, + index_version: IndexVersion, + blob_size: u64, + inner: R, + cache: InvertedIndexCacheRef, + ) -> Self { Self { file_id, + index_version, blob_size, inner, cache, @@ -83,47 +92,88 @@ impl CachedInvertedIndexBlobReader { #[async_trait] impl InvertedIndexReader for CachedInvertedIndexBlobReader { - async fn range_read(&self, offset: u64, size: u32) -> Result> { + async fn range_read<'a>( + &self, + offset: u64, + size: u32, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { + let start = metrics.as_ref().map(|_| Instant::now()); + let inner = &self.inner; - self.cache + let (result, cache_metrics) = self + .cache .get_or_load( - self.file_id, + (self.file_id, self.index_version), self.blob_size, offset, size, - move |ranges| async move { inner.read_vec(&ranges).await }, + move |ranges| async move { inner.read_vec(&ranges, None).await }, ) - .await + .await?; + + if let Some(m) = metrics { + m.total_bytes += cache_metrics.page_bytes; + m.total_ranges += cache_metrics.num_pages; + m.cache_hit += cache_metrics.cache_hit; + m.cache_miss += cache_metrics.cache_miss; + m.fetch_elapsed += start.unwrap().elapsed(); + } + + Ok(result) } - async fn read_vec(&self, ranges: &[Range]) -> Result> { + async fn read_vec<'a>( + &self, + ranges: &[Range], + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { + let start = metrics.as_ref().map(|_| Instant::now()); + let mut pages = Vec::with_capacity(ranges.len()); + let mut total_cache_metrics = crate::cache::index::IndexCacheMetrics::default(); for range in ranges { let inner = &self.inner; - let page = self + let (page, cache_metrics) = self .cache .get_or_load( - self.file_id, + (self.file_id, self.index_version), self.blob_size, range.start, (range.end - range.start) as u32, - move |ranges| async move { inner.read_vec(&ranges).await }, + move |ranges| async move { inner.read_vec(&ranges, None).await }, ) .await?; + total_cache_metrics.merge(&cache_metrics); pages.push(Bytes::from(page)); } + if let Some(m) = metrics { + m.total_bytes += total_cache_metrics.page_bytes; + m.total_ranges += total_cache_metrics.num_pages; + m.cache_hit += total_cache_metrics.cache_hit; + m.cache_miss += total_cache_metrics.cache_miss; + m.fetch_elapsed += start.unwrap().elapsed(); + } + Ok(pages) } - async fn metadata(&self) -> Result> { - if let Some(cached) = self.cache.get_metadata(self.file_id) { + async fn metadata<'a>( + &self, + metrics: Option<&'a mut InvertedIndexReadMetrics>, + ) -> Result> { + if let Some(cached) = self.cache.get_metadata((self.file_id, self.index_version)) { CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc(); + if let Some(m) = metrics { + m.cache_hit += 1; + } Ok(cached) } else { - let meta = self.inner.metadata().await?; - self.cache.put_metadata(self.file_id, 
meta.clone()); + let meta = self.inner.metadata(metrics).await?; + self.cache + .put_metadata((self.file_id, self.index_version), meta.clone()); CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc(); Ok(meta) } @@ -258,6 +308,7 @@ mod test { // Init a test range reader in local fs. let mut env = TestEnv::new().await; let file_size = blob.len() as u64; + let index_version = 0; let store = env.init_object_store_manager(); let temp_path = "data"; store.write(temp_path, blob).await.unwrap(); @@ -273,11 +324,12 @@ mod test { let reader = InvertedIndexBlobReader::new(range_reader); let cached_reader = CachedInvertedIndexBlobReader::new( FileId::random(), + index_version, file_size, reader, Arc::new(InvertedIndexCache::new(8192, 8192, 50)), ); - let metadata = cached_reader.metadata().await.unwrap(); + let metadata = cached_reader.metadata(None).await.unwrap(); assert_eq!(metadata.total_row_count, 8); assert_eq!(metadata.segment_row_count, 1); assert_eq!(metadata.metas.len(), 2); @@ -292,13 +344,19 @@ mod test { .fst( tag0.base_offset + tag0.relative_fst_offset as u64, tag0.fst_size, + None, ) .await .unwrap(); assert_eq!(fst0.len(), 3); let [offset, size] = unpack(fst0.get(b"a").unwrap()); let bitmap = cached_reader - .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag0.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -307,7 +365,12 @@ mod test { ); let [offset, size] = unpack(fst0.get(b"b").unwrap()); let bitmap = cached_reader - .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag0.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -316,7 +379,12 @@ mod test { ); let [offset, size] = unpack(fst0.get(b"c").unwrap()); let bitmap = cached_reader - .bitmap(tag0.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag0.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -335,13 +403,19 @@ mod test { .fst( tag1.base_offset + tag1.relative_fst_offset as u64, tag1.fst_size, + None, ) .await .unwrap(); assert_eq!(fst1.len(), 3); let [offset, size] = unpack(fst1.get(b"x").unwrap()); let bitmap = cached_reader - .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag1.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -350,7 +424,12 @@ mod test { ); let [offset, size] = unpack(fst1.get(b"y").unwrap()); let bitmap = cached_reader - .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag1.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -359,7 +438,12 @@ mod test { ); let [offset, size] = unpack(fst1.get(b"z").unwrap()); let bitmap = cached_reader - .bitmap(tag1.base_offset + offset as u64, size, BitmapType::Roaring) + .bitmap( + tag1.base_offset + offset as u64, + size, + BitmapType::Roaring, + None, + ) .await .unwrap(); assert_eq!( @@ -372,16 +456,16 @@ mod test { for _ in 0..FUZZ_REPEAT_TIMES { let offset = rng.random_range(0..file_size); let size = rng.random_range(0..file_size as u32 - offset as u32); - let expected = cached_reader.range_read(offset, size).await.unwrap(); + let expected = cached_reader.range_read(offset, size, None).await.unwrap(); let inner = &cached_reader.inner; - let read = cached_reader + let (read, _cache_metrics) = cached_reader .cache .get_or_load( - 
cached_reader.file_id, + (cached_reader.file_id, cached_reader.index_version), file_size, offset, size, - |ranges| async move { inner.read_vec(&ranges).await }, + |ranges| async move { inner.read_vec(&ranges, None).await }, ) .await .unwrap(); diff --git a/src/mito2/src/cache/manifest_cache.rs b/src/mito2/src/cache/manifest_cache.rs new file mode 100644 index 0000000000..6d2c563499 --- /dev/null +++ b/src/mito2/src/cache/manifest_cache.rs @@ -0,0 +1,706 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! A cache for manifest files. + +use std::path::PathBuf; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use common_base::readable_size::ReadableSize; +use common_telemetry::{error, info, warn}; +use futures::{FutureExt, TryStreamExt}; +use moka::future::Cache; +use moka::notification::RemovalCause; +use moka::policy::EvictionPolicy; +use object_store::ObjectStore; +use object_store::util::join_path; +use snafu::ResultExt; + +use crate::error::{OpenDalSnafu, Result}; +use crate::metrics::{CACHE_BYTES, CACHE_HIT, CACHE_MISS}; + +/// Subdirectory of cached manifest files. +/// +/// This must contain three layers, corresponding to [`build_prometheus_metrics_layer`](object_store::layers::build_prometheus_metrics_layer). +const MANIFEST_DIR: &str = "cache/object/manifest/"; + +/// Metric label for manifest files. +const MANIFEST_TYPE: &str = "manifest"; + +/// A manifest cache manages manifest files on local store and evicts files based +/// on size. +#[derive(Debug, Clone)] +pub struct ManifestCache { + /// Local store to cache files. + local_store: ObjectStore, + /// Index to track cached manifest files. + index: Cache, +} + +impl ManifestCache { + /// Creates a new manifest cache and recovers the index from local store. + pub async fn new( + local_store: ObjectStore, + capacity: ReadableSize, + ttl: Option, + ) -> ManifestCache { + let total_capacity = capacity.as_bytes(); + + info!( + "Initializing manifest cache with capacity: {}", + ReadableSize(total_capacity) + ); + + let index = Self::build_cache(local_store.clone(), total_capacity, ttl); + + let cache = ManifestCache { local_store, index }; + + // Recovers the cache index from local store asynchronously + cache.recover(false).await; + + cache + } + + /// Builds the cache. + fn build_cache( + local_store: ObjectStore, + capacity: u64, + ttl: Option, + ) -> Cache { + let cache_store = local_store; + let mut builder = Cache::builder() + .eviction_policy(EvictionPolicy::lru()) + .weigher(|key: &String, value: &IndexValue| -> u32 { + key.len() as u32 + value.file_size + }) + .max_capacity(capacity) + .async_eviction_listener(move |key: Arc, value: IndexValue, cause| { + let store = cache_store.clone(); + // Stores files under MANIFEST_DIR. + let file_path = join_path(MANIFEST_DIR, &key); + async move { + if let RemovalCause::Replaced = cause { + // The cache is replaced by another file. 
We don't remove the same + // file but updates the metrics as the file is already replaced by users. + CACHE_BYTES + .with_label_values(&[MANIFEST_TYPE]) + .sub(value.file_size.into()); + return; + } + + match store.delete(&file_path).await { + Ok(()) => { + CACHE_BYTES + .with_label_values(&[MANIFEST_TYPE]) + .sub(value.file_size.into()); + } + Err(e) => { + warn!(e; "Failed to delete cached manifest file {}", file_path); + } + } + } + .boxed() + }); + if let Some(ttl) = ttl { + builder = builder.time_to_idle(ttl); + } + builder.build() + } + + /// Puts a file into the cache index. + /// + /// The caller should ensure the file is in the correct path. + pub(crate) async fn put(&self, key: String, value: IndexValue) { + CACHE_BYTES + .with_label_values(&[MANIFEST_TYPE]) + .add(value.file_size.into()); + self.index.insert(key, value).await; + + // Since files can be large items, we run the pending tasks immediately. + self.index.run_pending_tasks().await; + } + + /// Gets the index value for the key. + pub(crate) async fn get(&self, key: &str) -> Option { + self.index.get(key).await + } + + /// Removes a file from the cache explicitly. + pub(crate) async fn remove(&self, key: &str) { + let file_path = self.cache_file_path(key); + self.index.remove(key).await; + // Always deletes the file from the local store. + if let Err(e) = self.local_store.delete(&file_path).await { + warn!(e; "Failed to delete a cached manifest file {}", file_path); + } + } + + /// Removes multiple files from the cache in batch. + pub(crate) async fn remove_batch(&self, keys: &[String]) { + if keys.is_empty() { + return; + } + + for key in keys { + self.index.remove(key).await; + } + + let file_paths: Vec = keys.iter().map(|key| self.cache_file_path(key)).collect(); + + if let Err(e) = self.local_store.delete_iter(file_paths).await { + warn!(e; "Failed to delete cached manifest files in batch"); + } + } + + async fn recover_inner(&self) -> Result<()> { + let now = Instant::now(); + let mut lister = self + .local_store + .lister_with(MANIFEST_DIR) + .recursive(true) + .await + .context(OpenDalSnafu)?; + let (mut total_size, mut total_keys) = (0i64, 0); + while let Some(entry) = lister.try_next().await.context(OpenDalSnafu)? { + let meta = entry.metadata(); + if !meta.is_file() { + continue; + } + + let meta = self + .local_store + .stat(entry.path()) + .await + .context(OpenDalSnafu)?; + let file_size = meta.content_length() as u32; + let key = entry.path().trim_start_matches(MANIFEST_DIR).to_string(); + common_telemetry::info!("Manifest cache recover {}, size: {}", key, file_size); + self.index.insert(key, IndexValue { file_size }).await; + let size = i64::from(file_size); + total_size += size; + total_keys += 1; + } + CACHE_BYTES + .with_label_values(&[MANIFEST_TYPE]) + .add(total_size); + + // Runs all pending tasks of the moka cache so that the cache size is updated + // and the eviction policy is applied. + self.index.run_pending_tasks().await; + + let weight = self.index.weighted_size(); + let count = self.index.entry_count(); + info!( + "Recovered manifest cache, num_keys: {}, num_bytes: {}, count: {}, weight: {}, cost: {:?}", + total_keys, + total_size, + count, + weight, + now.elapsed() + ); + Ok(()) + } + + /// Recovers the index from local store. 
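+    ///
+    /// Recovery runs in a spawned background task that rebuilds the index and
+    /// then prunes empty cache directories; when `sync` is true the call waits
+    /// for that task to finish before returning.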
+ pub(crate) async fn recover(&self, sync: bool) { + let moved_self = self.clone(); + let handle = tokio::spawn(async move { + if let Err(err) = moved_self.recover_inner().await { + error!(err; "Failed to recover manifest cache.") + } + + moved_self.clean_empty_dirs(true).await; + }); + + if sync { + let _ = handle.await; + } + } + + /// Returns the cache file path for the key. + pub(crate) fn cache_file_path(&self, key: &str) -> String { + join_path(MANIFEST_DIR, key) + } + + /// Gets a manifest file from cache. + /// Returns the file data if found in cache, None otherwise. + pub(crate) async fn get_file(&self, key: &str) -> Option> { + if self.get(key).await.is_none() { + CACHE_MISS.with_label_values(&[MANIFEST_TYPE]).inc(); + return None; + } + + let cache_file_path = self.cache_file_path(key); + match self.local_store.read(&cache_file_path).await { + Ok(data) => { + CACHE_HIT.with_label_values(&[MANIFEST_TYPE]).inc(); + Some(data.to_vec()) + } + Err(e) => { + warn!(e; "Failed to read cached manifest file {}", cache_file_path); + CACHE_MISS.with_label_values(&[MANIFEST_TYPE]).inc(); + None + } + } + } + + /// Puts a manifest file into cache. + pub(crate) async fn put_file(&self, key: String, data: Vec) { + let cache_file_path = self.cache_file_path(&key); + + if let Err(e) = self.local_store.write(&cache_file_path, data.clone()).await { + warn!(e; "Failed to write manifest to cache {}", cache_file_path); + return; + } + + let file_size = data.len() as u32; + self.put(key, IndexValue { file_size }).await; + } + + /// Removes empty directories recursively under the manifest cache directory. + /// + /// If `check_mtime` is true, only removes directories that have not been modified + /// for at least 1 hour. + pub(crate) async fn clean_empty_dirs(&self, check_mtime: bool) { + info!("Clean empty dirs start"); + + let root = self.local_store.info().root(); + let manifest_dir = PathBuf::from(root).join(MANIFEST_DIR); + let manifest_dir_clone = manifest_dir.clone(); + + let result = tokio::task::spawn_blocking(move || { + Self::clean_empty_dirs_sync(&manifest_dir_clone, check_mtime) + }) + .await; + + match result { + Ok(Ok(())) => { + info!("Clean empty dirs end"); + } + Ok(Err(e)) => { + warn!(e; "Failed to clean empty directories under {}", manifest_dir.display()); + } + Err(e) => { + warn!(e; "Failed to spawn blocking task for cleaning empty directories"); + } + } + } + + /// Removes all manifest files under the given directory from cache and cleans up empty directories. 
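+    ///
+    /// The files under `dir` are listed recursively, removed from the cache
+    /// index and the local store in one batch, and any directories left empty
+    /// afterwards are pruned.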
+ pub(crate) async fn clean_manifests(&self, dir: &str) { + info!("Clean manifest cache for directory: {}", dir); + + let cache_dir = join_path(MANIFEST_DIR, dir); + let mut lister = match self + .local_store + .lister_with(&cache_dir) + .recursive(true) + .await + { + Ok(lister) => lister, + Err(e) => { + warn!(e; "Failed to list manifest files under {}", cache_dir); + return; + } + }; + + let mut keys_to_remove = Vec::new(); + loop { + match lister.try_next().await { + Ok(Some(entry)) => { + let meta = entry.metadata(); + if meta.is_file() { + keys_to_remove + .push(entry.path().trim_start_matches(MANIFEST_DIR).to_string()); + } + } + Ok(None) => break, + Err(e) => { + warn!(e; "Failed to read entry while listing {}", cache_dir); + break; + } + } + } + + info!( + "Going to remove files from manifest cache, files: {:?}", + keys_to_remove + ); + + // Removes all files from cache in batch + self.remove_batch(&keys_to_remove).await; + + // Cleans up empty directories under the given dir + let root = self.local_store.info().root(); + let dir_path = PathBuf::from(root).join(&cache_dir); + let dir_path_clone = dir_path.clone(); + + let result = tokio::task::spawn_blocking(move || { + Self::clean_empty_dirs_sync(&dir_path_clone, false) + }) + .await; + + match result { + Ok(Ok(())) => { + info!("Cleaned manifest cache for directory: {}", dir); + } + Ok(Err(e)) => { + warn!(e; "Failed to clean empty directories under {}", dir_path.display()); + } + Err(e) => { + warn!(e; "Failed to spawn blocking task for cleaning empty directories"); + } + } + } + + /// Synchronously removes empty directories recursively. + /// + /// If `check_mtime` is true, only removes directories that have not been modified + /// for at least 1 hour. + fn clean_empty_dirs_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result<()> { + let is_empty = Self::remove_empty_dirs_recursive_sync(dir, check_mtime)?; + if is_empty { + if let Err(e) = std::fs::remove_dir(dir) { + if e.kind() != std::io::ErrorKind::NotFound { + warn!(e; "Failed to remove empty root dir {}", dir.display()); + return Err(e); + } else { + warn!("Empty root dir not found before removal {}", dir.display()); + } + } else { + info!( + "Removed empty root dir {} from manifest cache", + dir.display() + ); + } + } + Ok(()) + } + + fn remove_empty_dirs_recursive_sync(dir: &PathBuf, check_mtime: bool) -> std::io::Result { + common_telemetry::debug!( + "Maybe remove empty dir: {:?}, check_mtime: {}", + dir, + check_mtime + ); + let entries = match std::fs::read_dir(dir) { + Ok(entries) => entries, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + // Directory doesn't exist, treat as already removed (empty) + return Ok(true); + } + Err(e) => return Err(e), + }; + + let mut is_empty = true; + // Iterates all entries under the directory. + // We have to check all entries to clean up all empty subdirectories. + for entry in entries { + let entry = entry?; + let path = entry.path(); + let metadata = std::fs::metadata(&path)?; + + if metadata.is_dir() { + // Checks if we should skip this directory based on modification time + if check_mtime + && let Ok(modified) = metadata.modified() + && let Ok(elapsed) = modified.elapsed() + && elapsed < Duration::from_secs(3600) + { + common_telemetry::debug!("Skip directory by mtime, elapsed: {:?}", elapsed); + // Only removes if not modified for at least 1 hour. 
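+                    // Keep this directory for now; a later cleanup pass may
+                    // remove it once it has been idle long enough.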
+ is_empty = false; + continue; + } + + let subdir_empty = Self::remove_empty_dirs_recursive_sync(&path, check_mtime)?; + if subdir_empty { + if let Err(e) = std::fs::remove_dir(&path) { + if e.kind() != std::io::ErrorKind::NotFound { + warn!(e; "Failed to remove empty directory {}", path.display()); + is_empty = false; + } else { + info!( + "Empty directory {} not found before removal", + path.display() + ); + } + } else { + info!( + "Removed empty directory {} from manifest cache", + path.display() + ); + } + } else { + is_empty = false; + } + } else { + is_empty = false; + } + } + + Ok(is_empty) + } +} + +/// An entity that describes the file in the manifest cache. +/// +/// It should only keep minimal information needed by the cache. +#[derive(Debug, Clone)] +pub(crate) struct IndexValue { + /// Size of the file in bytes. + pub(crate) file_size: u32, +} + +#[cfg(test)] +mod tests { + use common_test_util::temp_dir::create_temp_dir; + use object_store::services::Fs; + + use super::*; + + fn new_fs_store(path: &str) -> ObjectStore { + let builder = Fs::default().root(path); + ObjectStore::new(builder).unwrap().finish() + } + + #[tokio::test] + async fn test_manifest_cache_basic() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir(""); + let local_store = new_fs_store(dir.path().to_str().unwrap()); + + let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await; + let key = "region_1/manifest/00000000000000000007.json"; + let file_path = cache.cache_file_path(key); + + // Get an empty file. + assert!(cache.get(key).await.is_none()); + + // Write a file. + local_store + .write(&file_path, b"manifest content".as_slice()) + .await + .unwrap(); + // Add to the cache. + cache + .put(key.to_string(), IndexValue { file_size: 16 }) + .await; + + // Get the cached value. + let value = cache.get(key).await.unwrap(); + assert_eq!(16, value.file_size); + + // Get weighted size. + cache.index.run_pending_tasks().await; + assert_eq!(59, cache.index.weighted_size()); + + // Remove the file. + cache.remove(key).await; + cache.index.run_pending_tasks().await; + assert!(cache.get(key).await.is_none()); + + // Ensure all pending tasks of the moka cache is done before assertion. + cache.index.run_pending_tasks().await; + + // The file also not exists. + assert!(!local_store.exists(&file_path).await.unwrap()); + assert_eq!(0, cache.index.weighted_size()); + } + + #[tokio::test] + async fn test_manifest_cache_recover() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir(""); + let local_store = new_fs_store(dir.path().to_str().unwrap()); + let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await; + + // Write some manifest files with different paths + let keys = [ + "region_1/manifest/00000000000000000001.json", + "region_1/manifest/00000000000000000002.json", + "region_1/manifest/00000000000000000001.checkpoint", + "region_2/manifest/00000000000000000001.json", + ]; + + let mut total_size = 0; + for (i, key) in keys.iter().enumerate() { + let file_path = cache.cache_file_path(key); + let content = format!("manifest-{}", i).into_bytes(); + local_store + .write(&file_path, content.clone()) + .await + .unwrap(); + + // Add to the cache. 
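+            // The cache weigher counts key length plus file size, so the
+            // expected total below accumulates both.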
+ cache + .put( + key.to_string(), + IndexValue { + file_size: content.len() as u32, + }, + ) + .await; + total_size += content.len() + key.len(); + } + + // Create a new cache instance which will automatically recover from local store + let cache = ManifestCache::new(local_store.clone(), ReadableSize::mb(10), None).await; + + // Wait for recovery to complete synchronously + cache.recover(true).await; + + // Check size. + cache.index.run_pending_tasks().await; + let total_cached = cache.index.weighted_size() as usize; + assert_eq!(total_size, total_cached); + + // Verify all files + for (i, key) in keys.iter().enumerate() { + let value = cache.get(key).await.unwrap(); + assert_eq!(format!("manifest-{}", i).len() as u32, value.file_size); + } + } + + #[tokio::test] + async fn test_cache_file_path() { + let dir = create_temp_dir(""); + let local_store = new_fs_store(dir.path().to_str().unwrap()); + let cache = ManifestCache::new(local_store, ReadableSize::mb(10), None).await; + + assert_eq!( + "cache/object/manifest/region_1/manifest/00000000000000000007.json", + cache.cache_file_path("region_1/manifest/00000000000000000007.json") + ); + assert_eq!( + "cache/object/manifest/region_1/manifest/00000000000000000007.checkpoint", + cache.cache_file_path("region_1/manifest/00000000000000000007.checkpoint") + ); + } + + #[tokio::test] + async fn test_clean_empty_dirs_sync_no_mtime_check() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir(""); + let root = PathBuf::from(dir.path()); + + // Create a directory structure: + // root/ + // empty_dir1/ + // empty_dir2/ + // empty_subdir/ + // non_empty_dir/ + // file.txt + // nested/ + // empty_subdir1/ + // non_empty_subdir/ + // file.txt + + let empty_dir1 = root.join("empty_dir1"); + let empty_dir2 = root.join("empty_dir2"); + let empty_subdir = empty_dir2.join("empty_subdir"); + let non_empty_dir = root.join("non_empty_dir"); + let nested = root.join("nested"); + let nested_empty = nested.join("empty_subdir1"); + let nested_non_empty = nested.join("non_empty_subdir"); + + // Create directories + std::fs::create_dir_all(&empty_dir1).unwrap(); + std::fs::create_dir_all(&empty_subdir).unwrap(); + std::fs::create_dir_all(&non_empty_dir).unwrap(); + std::fs::create_dir_all(&nested_empty).unwrap(); + std::fs::create_dir_all(&nested_non_empty).unwrap(); + + // Create files in non-empty directories + std::fs::write(non_empty_dir.join("file.txt"), b"content").unwrap(); + std::fs::write(nested_non_empty.join("file.txt"), b"content").unwrap(); + + // Verify initial state + assert!(empty_dir1.exists()); + assert!(empty_dir2.exists()); + assert!(empty_subdir.exists()); + assert!(non_empty_dir.exists()); + assert!(nested.exists()); + assert!(nested_empty.exists()); + assert!(nested_non_empty.exists()); + + // Clean empty directories with check_mtime = false + ManifestCache::clean_empty_dirs_sync(&root, false).unwrap(); + + // Verify empty directories are removed + assert!(!empty_dir1.exists()); + assert!(!empty_dir2.exists()); + assert!(!empty_subdir.exists()); + assert!(!nested_empty.exists()); + + // Verify non-empty directories still exist + assert!(non_empty_dir.exists()); + assert!(non_empty_dir.join("file.txt").exists()); + assert!(nested.exists()); + assert!(nested_non_empty.exists()); + assert!(nested_non_empty.join("file.txt").exists()); + } + + #[tokio::test] + async fn test_clean_empty_dirs_sync_with_mtime_check() { + common_telemetry::init_default_ut_logging(); + + let dir = create_temp_dir(""); + let root = 
PathBuf::from(dir.path()); + + // Create a directory structure with recently created empty directories + // root/ + // empty_dir1/ + // empty_dir2/ + // empty_subdir/ + // non_empty_dir/ + // file.txt + + let empty_dir1 = root.join("empty_dir1"); + let empty_dir2 = root.join("empty_dir2"); + let empty_subdir = empty_dir2.join("empty_subdir"); + let non_empty_dir = root.join("non_empty_dir"); + + // Create directories + std::fs::create_dir_all(&empty_dir1).unwrap(); + std::fs::create_dir_all(&empty_subdir).unwrap(); + std::fs::create_dir_all(&non_empty_dir).unwrap(); + + // Create file in non-empty directory + std::fs::write(non_empty_dir.join("file.txt"), b"content").unwrap(); + + // Verify initial state + assert!(empty_dir1.exists()); + assert!(empty_dir2.exists()); + assert!(empty_subdir.exists()); + assert!(non_empty_dir.exists()); + + // Clean empty directories with check_mtime = true + // Since the directories were just created (mtime < 1 hour), they should NOT be removed + ManifestCache::clean_empty_dirs_sync(&root, true).unwrap(); + + // Verify empty directories are NOT removed (they're too recent) + assert!(empty_dir1.exists()); + assert!(empty_dir2.exists()); + assert!(empty_subdir.exists()); + + // Verify non-empty directory still exists + assert!(non_empty_dir.exists()); + assert!(non_empty_dir.join("file.txt").exists()); + } +} diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index b54e3e6f73..25d0d7b060 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -30,6 +30,7 @@ use crate::access_layer::{ TempFileCleaner, WriteCachePathProvider, WriteType, new_fs_cache_store, }; use crate::cache::file_cache::{FileCache, FileCacheRef, FileType, IndexKey, IndexValue}; +use crate::cache::manifest_cache::ManifestCache; use crate::error::{self, Result}; use crate::metrics::UPLOAD_BYTES_TOTAL; use crate::region::opener::RegionLoadCacheTask; @@ -53,6 +54,8 @@ pub struct WriteCache { intermediate_manager: IntermediateManager, /// Sender for region load cache tasks. task_sender: UnboundedSender, + /// Optional cache for manifest files. + manifest_cache: Option, } pub type WriteCacheRef = Arc; @@ -60,13 +63,16 @@ pub type WriteCacheRef = Arc; impl WriteCache { /// Create the cache with a `local_store` to cache files and a /// `object_store_manager` for all object stores. + #[allow(clippy::too_many_arguments)] pub async fn new( local_store: ObjectStore, cache_capacity: ReadableSize, ttl: Option, index_cache_percent: Option, + enable_background_worker: bool, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, + manifest_cache: Option, ) -> Result { let (task_sender, task_receiver) = unbounded_channel(); @@ -75,6 +81,7 @@ impl WriteCache { cache_capacity, ttl, index_cache_percent, + enable_background_worker, )); file_cache.recover(false, Some(task_receiver)).await; @@ -83,28 +90,42 @@ impl WriteCache { puffin_manager_factory, intermediate_manager, task_sender, + manifest_cache, }) } /// Creates a write cache based on local fs. 
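+    ///
+    /// When `manifest_cache_capacity` is non-zero, a manifest cache backed by
+    /// the same local store is created alongside the file cache.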
+ #[allow(clippy::too_many_arguments)] pub async fn new_fs( cache_dir: &str, cache_capacity: ReadableSize, ttl: Option, index_cache_percent: Option, + enable_background_worker: bool, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, + manifest_cache_capacity: ReadableSize, ) -> Result { info!("Init write cache on {cache_dir}, capacity: {cache_capacity}"); let local_store = new_fs_cache_store(cache_dir).await?; + + // Create manifest cache if capacity is non-zero + let manifest_cache = if manifest_cache_capacity.as_bytes() > 0 { + Some(ManifestCache::new(local_store.clone(), manifest_cache_capacity, ttl).await) + } else { + None + }; + Self::new( local_store, cache_capacity, ttl, index_cache_percent, + enable_background_worker, puffin_manager_factory, intermediate_manager, + manifest_cache, ) .await } @@ -114,6 +135,11 @@ impl WriteCache { self.file_cache.clone() } + /// Returns the manifest cache if available. + pub(crate) fn manifest_cache(&self) -> Option { + self.manifest_cache.clone() + } + /// Build the puffin manager pub(crate) fn build_puffin_manager(&self) -> SstPuffinManager { let store = self.file_cache.local_store(); @@ -195,6 +221,7 @@ impl WriteCache { puffin_manager: self .puffin_manager_factory .build(store.clone(), path_provider.clone()), + write_cache_enabled: true, intermediate_manager: self.intermediate_manager.clone(), index_options: write_request.index_options, inverted_index_config: write_request.inverted_index_config, @@ -246,7 +273,7 @@ impl WriteCache { upload_tracker.push_uploaded_file(parquet_path); if sst.index_metadata.file_size > 0 { - let puffin_key = IndexKey::new(region_id, sst.file_id, FileType::Puffin); + let puffin_key = IndexKey::new(region_id, sst.file_id, FileType::Puffin(0)); let puffin_path = upload_request .dest_path_provider .build_index_file_path(RegionFileId::new(region_id, sst.file_id)); @@ -419,7 +446,11 @@ impl UploadTracker { file_cache.remove(parquet_key).await; if sst.index_metadata.file_size > 0 { - let puffin_key = IndexKey::new(self.region_id, sst.file_id, FileType::Puffin); + let puffin_key = IndexKey::new( + self.region_id, + sst.file_id, + FileType::Puffin(sst.index_metadata.version), + ); file_cache.remove(puffin_key).await; } } @@ -528,7 +559,7 @@ mod tests { assert_eq!(remote_data.to_vec(), cache_data.to_vec()); // Check write cache contains the index key - let index_key = IndexKey::new(region_id, file_id, FileType::Puffin); + let index_key = IndexKey::new(region_id, file_id, FileType::Puffin(0)); assert!(write_cache.file_cache.contains_key(&index_key)); let remote_index_data = mock_store.read(&index_upload_path).await.unwrap(); diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index be4c12aa1b..3bb3fe932f 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -14,6 +14,7 @@ mod buckets; pub mod compactor; +pub mod memory_manager; pub mod picker; pub mod run; mod task; @@ -29,6 +30,7 @@ use std::time::Instant; use api::v1::region::compact_request; use api::v1::region::compact_request::Options; use common_base::Plugins; +use common_memory_manager::OnExhaustedPolicy; use common_meta::key::SchemaMetadataManagerRef; use common_telemetry::{debug, error, info, warn}; use common_time::range::TimestampRange; @@ -46,7 +48,8 @@ use tokio::sync::mpsc::{self, Sender}; use crate::access_layer::AccessLayerRef; use crate::cache::{CacheManagerRef, CacheStrategy}; use crate::compaction::compactor::{CompactionRegion, CompactionVersion, DefaultCompactor}; -use 
crate::compaction::picker::{CompactionTask, new_picker}; +use crate::compaction::memory_manager::CompactionMemoryManager; +use crate::compaction::picker::{CompactionTask, PickerOutput, new_picker}; use crate::compaction::task::CompactionTaskImpl; use crate::config::MitoConfig; use crate::error::{ @@ -104,12 +107,15 @@ pub(crate) struct CompactionScheduler { request_sender: Sender, cache_manager: CacheManagerRef, engine_config: Arc, + memory_manager: Arc, + memory_policy: OnExhaustedPolicy, listener: WorkerListener, /// Plugins for the compaction scheduler. plugins: Plugins, } impl CompactionScheduler { + #[allow(clippy::too_many_arguments)] pub(crate) fn new( scheduler: SchedulerRef, request_sender: Sender, @@ -117,6 +123,8 @@ impl CompactionScheduler { engine_config: Arc, listener: WorkerListener, plugins: Plugins, + memory_manager: Arc, + memory_policy: OnExhaustedPolicy, ) -> Self { Self { scheduler, @@ -124,6 +132,8 @@ impl CompactionScheduler { request_sender, cache_manager, engine_config, + memory_manager, + memory_policy, listener, plugins, } @@ -429,7 +439,8 @@ impl CompactionScheduler { }; // Create a local compaction task. - let mut local_compaction_task = Box::new(CompactionTaskImpl { + let estimated_bytes = estimate_compaction_bytes(&picker_output); + let local_compaction_task = Box::new(CompactionTaskImpl { request_sender, waiters, start_time, @@ -437,18 +448,27 @@ impl CompactionScheduler { picker_output, compaction_region, compactor: Arc::new(DefaultCompactor {}), + memory_manager: self.memory_manager.clone(), + memory_policy: self.memory_policy, + estimated_memory_bytes: estimated_bytes, }); - // Submit the compaction task. + self.submit_compaction_task(local_compaction_task, region_id) + } + + fn submit_compaction_task( + &mut self, + mut task: Box, + region_id: RegionId, + ) -> Result<()> { self.scheduler .schedule(Box::pin(async move { INFLIGHT_COMPACTION_COUNT.inc(); - local_compaction_task.run().await; + task.run().await; INFLIGHT_COMPACTION_COUNT.dec(); })) .map_err(|e| { error!(e; "Failed to submit compaction request for region {}", region_id); - // If failed to submit the job, we need to remove the region from the scheduler. self.region_status.remove(®ion_id); e }) @@ -758,6 +778,20 @@ fn get_expired_ssts( .collect() } +/// Estimates compaction memory as the sum of all input files' maximum row-group +/// uncompressed sizes. +fn estimate_compaction_bytes(picker_output: &PickerOutput) -> u64 { + picker_output + .outputs + .iter() + .flat_map(|output| output.inputs.iter()) + .map(|file: &FileHandle| { + let meta = file.meta_ref(); + meta.max_row_group_uncompressed_size + }) + .sum() +} + /// Pending compaction request that is supposed to run after current task is finished, /// typically used for manual compactions. 
struct PendingCompaction { @@ -773,9 +807,10 @@ struct PendingCompaction { mod tests { use api::v1::region::StrictWindow; use common_datasource::compression::CompressionType; - use tokio::sync::oneshot; + use tokio::sync::{Barrier, oneshot}; use super::*; + use crate::compaction::memory_manager::{CompactionMemoryGuard, new_compaction_memory_manager}; use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::region::ManifestContext; use crate::sst::FormatType; @@ -1110,10 +1145,10 @@ mod tests { compress_type: CompressionType::Uncompressed, checkpoint_distance: 10, remove_file_options: Default::default(), + manifest_cache: None, }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(); @@ -1145,4 +1180,39 @@ mod tests { assert_eq!(result.unwrap(), 0); // is there a better way to check this? assert_eq!(0, scheduler.region_status.len()); } + + #[tokio::test] + async fn test_concurrent_memory_competition() { + let manager = Arc::new(new_compaction_memory_manager(3 * 1024 * 1024)); // 3MB + let barrier = Arc::new(Barrier::new(3)); + let mut handles = vec![]; + + // Spawn 3 tasks competing for memory, each trying to acquire 2MB + for _i in 0..3 { + let mgr = manager.clone(); + let bar = barrier.clone(); + let handle = tokio::spawn(async move { + bar.wait().await; // Synchronize start + mgr.try_acquire(2 * 1024 * 1024) + }); + handles.push(handle); + } + + let results: Vec> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Only 1 should succeed (3MB limit, 2MB request, can only fit one) + let succeeded = results.iter().filter(|r| r.is_some()).count(); + let failed = results.iter().filter(|r| r.is_none()).count(); + + assert_eq!(succeeded, 1, "Expected exactly 1 task to acquire memory"); + assert_eq!(failed, 2, "Expected 2 tasks to fail"); + + // Clean up + drop(results); + assert_eq!(manager.used_bytes(), 0); + } } diff --git a/src/mito2/src/compaction/compactor.rs b/src/mito2/src/compaction/compactor.rs index 71698471c3..4f9089c13d 100644 --- a/src/mito2/src/compaction/compactor.rs +++ b/src/mito2/src/compaction/compactor.rs @@ -41,11 +41,9 @@ use crate::error::{ EmptyRegionDirSnafu, InvalidPartitionExprSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Result, }; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; -use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; -use crate::manifest::storage::manifest_compress_type; +use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::metrics; use crate::read::{FlatSource, Source}; -use crate::region::opener::new_manifest_dir; use crate::region::options::RegionOptions; use crate::region::version::VersionRef; use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState}; @@ -162,31 +160,16 @@ pub async fn open_compaction_region( }; let manifest_manager = { - let region_manifest_options = RegionManifestOptions { - manifest_dir: new_manifest_dir(®ion_dir_from_table_dir( - &req.table_dir, - req.region_id, - req.path_type, - )), - object_store: object_store.clone(), - compress_type: manifest_compress_type(mito_config.compress_manifest), - checkpoint_distance: mito_config.manifest_checkpoint_distance, - remove_file_options: RemoveFileOptions { - keep_count: mito_config.experimental_manifest_keep_removed_file_count, - keep_ttl: mito_config.experimental_manifest_keep_removed_file_ttl, - }, - }; + let region_dir 
= region_dir_from_table_dir(&req.table_dir, req.region_id, req.path_type); + let region_manifest_options = + RegionManifestOptions::new(mito_config, ®ion_dir, object_store); - RegionManifestManager::open( - region_manifest_options, - Default::default(), - Default::default(), - ) - .await? - .context(EmptyRegionDirSnafu { - region_id: req.region_id, - region_dir: ®ion_dir_from_table_dir(&req.table_dir, req.region_id, req.path_type), - })? + RegionManifestManager::open(region_manifest_options, &Default::default()) + .await? + .with_context(|| EmptyRegionDirSnafu { + region_id: req.region_id, + region_dir: region_dir_from_table_dir(&req.table_dir, req.region_id, req.path_type), + })? }; let manifest = manifest_manager.manifest(); @@ -413,9 +396,11 @@ impl DefaultCompactor { time_range: sst_info.time_range, level: output.output_level, file_size: sst_info.file_size, + max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size, available_indexes: sst_info.index_metadata.build_available_indexes(), + indexes: sst_info.index_metadata.build_indexes(), index_file_size: sst_info.index_metadata.file_size, - index_file_id: None, + index_version: 0, num_rows: sst_info.num_rows as u64, num_row_groups: sst_info.num_row_groups, sequence: max_sequence, @@ -517,7 +502,7 @@ impl Compactor for DefaultCompactor { // TODO: We might leak files if we fail to update manifest. We can add a cleanup task to remove them later. compaction_region .manifest_ctx - .update_manifest(RegionLeaderState::Writable, action_list) + .update_manifest(RegionLeaderState::Writable, action_list, false) .await?; Ok(edit) diff --git a/src/mito2/src/compaction/memory_manager.rs b/src/mito2/src/compaction/memory_manager.rs new file mode 100644 index 0000000000..8cbb5d293a --- /dev/null +++ b/src/mito2/src/compaction/memory_manager.rs @@ -0,0 +1,50 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_memory_manager::{MemoryGuard, MemoryManager, MemoryMetrics}; + +use crate::metrics::{ + COMPACTION_MEMORY_IN_USE, COMPACTION_MEMORY_LIMIT, COMPACTION_MEMORY_REJECTED, +}; + +/// Compaction-specific memory metrics implementation. +#[derive(Clone, Copy, Debug, Default)] +pub struct CompactionMemoryMetrics; + +impl MemoryMetrics for CompactionMemoryMetrics { + fn set_limit(&self, bytes: i64) { + COMPACTION_MEMORY_LIMIT.set(bytes); + } + + fn set_in_use(&self, bytes: i64) { + COMPACTION_MEMORY_IN_USE.set(bytes); + } + + fn inc_rejected(&self, reason: &str) { + COMPACTION_MEMORY_REJECTED + .with_label_values(&[reason]) + .inc(); + } +} + +/// Compaction memory manager. +pub type CompactionMemoryManager = MemoryManager; + +/// Compaction memory guard. +pub type CompactionMemoryGuard = MemoryGuard; + +/// Helper to construct a compaction memory manager without passing metrics explicitly. 
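+///
+/// A minimal usage sketch (the byte figures are only illustrative):
+///
+/// ```ignore
+/// let manager = new_compaction_memory_manager(512 * 1024 * 1024);
+/// if let Some(_guard) = manager.try_acquire(64 * 1024 * 1024) {
+///     // Run the compaction; the budget is released when the guard drops.
+/// }
+/// ```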
+pub fn new_compaction_memory_manager(limit_bytes: u64) -> CompactionMemoryManager { + CompactionMemoryManager::new(limit_bytes, CompactionMemoryMetrics) +} diff --git a/src/mito2/src/compaction/run.rs b/src/mito2/src/compaction/run.rs index e691709948..a7e5ca490c 100644 --- a/src/mito2/src/compaction/run.rs +++ b/src/mito2/src/compaction/run.rs @@ -163,6 +163,10 @@ impl FileGroup { self.files.push(file); } + pub(crate) fn num_files(&self) -> usize { + self.files.len() + } + #[cfg(test)] pub(crate) fn files(&self) -> &[FileHandle] { &self.files[..] @@ -175,10 +179,6 @@ impl FileGroup { pub(crate) fn into_files(self) -> impl Iterator { self.files.into_iter() } - - pub(crate) fn is_all_level_0(&self) -> bool { - self.files.iter().all(|f| f.level() == 0) - } } impl Ranged for FileGroup { diff --git a/src/mito2/src/compaction/task.rs b/src/mito2/src/compaction/task.rs index 8488c9af9e..87a3ad7349 100644 --- a/src/mito2/src/compaction/task.rs +++ b/src/mito2/src/compaction/task.rs @@ -16,16 +16,18 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use std::time::Instant; +use common_memory_manager::OnExhaustedPolicy; use common_telemetry::{error, info, warn}; use itertools::Itertools; use snafu::ResultExt; use tokio::sync::mpsc; use crate::compaction::compactor::{CompactionRegion, Compactor}; +use crate::compaction::memory_manager::{CompactionMemoryGuard, CompactionMemoryManager}; use crate::compaction::picker::{CompactionTask, PickerOutput}; -use crate::error::CompactRegionSnafu; +use crate::error::{CompactRegionSnafu, CompactionMemoryExhaustedSnafu, MemoryAcquireFailedSnafu}; use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; -use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED}; +use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_MEMORY_WAIT, COMPACTION_STAGE_ELAPSED}; use crate::region::RegionRoleState; use crate::request::{ BackgroundNotify, CompactionFailed, CompactionFinished, OutputTx, RegionEditResult, @@ -52,6 +54,12 @@ pub(crate) struct CompactionTaskImpl { pub(crate) compactor: Arc, /// Output of the picker. pub(crate) picker_output: PickerOutput, + /// Memory manager to acquire memory budget. + pub(crate) memory_manager: Arc, + /// Policy when memory is exhausted. + pub(crate) memory_policy: OnExhaustedPolicy, + /// Estimated memory bytes needed for this compaction. + pub(crate) estimated_memory_bytes: u64, } impl Debug for CompactionTaskImpl { @@ -81,6 +89,88 @@ impl CompactionTaskImpl { .for_each(|o| o.inputs.iter().for_each(|f| f.set_compacting(compacting))); } + /// Acquires memory budget based on the configured policy. + /// + /// Returns an error if memory cannot be acquired according to the policy. 
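+    ///
+    /// With `OnExhaustedPolicy::Wait` the acquisition waits up to the configured
+    /// timeout; with `OnExhaustedPolicy::Fail` it tries once and fails right away.
+    /// Requests larger than the total limit are rejected up front.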
+ async fn acquire_memory_with_policy(&self) -> error::Result { + let region_id = self.compaction_region.region_id; + let requested_bytes = self.estimated_memory_bytes; + let limit_bytes = self.memory_manager.limit_bytes(); + + if limit_bytes > 0 && requested_bytes > limit_bytes { + warn!( + "Compaction for region {} requires {} bytes but limit is {} bytes; cannot satisfy request", + region_id, requested_bytes, limit_bytes + ); + return Err(CompactionMemoryExhaustedSnafu { + region_id, + required_bytes: requested_bytes, + limit_bytes, + policy: "exceed_limit".to_string(), + } + .build()); + } + + match self.memory_policy { + OnExhaustedPolicy::Wait { + timeout: wait_timeout, + } => { + let timer = COMPACTION_MEMORY_WAIT.start_timer(); + + match tokio::time::timeout( + wait_timeout, + self.memory_manager.acquire(requested_bytes), + ) + .await + { + Ok(Ok(guard)) => { + timer.observe_duration(); + Ok(guard) + } + Ok(Err(e)) => { + timer.observe_duration(); + Err(e).with_context(|_| MemoryAcquireFailedSnafu { + region_id, + policy: format!("wait_timeout({}ms)", wait_timeout.as_millis()), + }) + } + Err(_) => { + timer.observe_duration(); + warn!( + "Compaction for region {} waited {:?} for {} bytes but timed out", + region_id, wait_timeout, requested_bytes + ); + CompactionMemoryExhaustedSnafu { + region_id, + required_bytes: requested_bytes, + limit_bytes, + policy: format!("wait_timeout({}ms)", wait_timeout.as_millis()), + } + .fail() + } + } + } + OnExhaustedPolicy::Fail => { + // Try to acquire, fail immediately if not available + self.memory_manager + .try_acquire(requested_bytes) + .ok_or_else(|| { + warn!( + "Compaction memory exhausted for region {} (policy=fail, need {} bytes, limit {} bytes)", + region_id, requested_bytes, limit_bytes + ); + CompactionMemoryExhaustedSnafu { + region_id, + required_bytes: requested_bytes, + limit_bytes, + policy: "fail".to_string(), + } + .build() + }) + } + } + } + /// Remove expired ssts files, update manifest immediately /// and apply the edit to region version. /// @@ -117,7 +207,7 @@ impl CompactionTaskImpl { }; if let Err(e) = compaction_region .manifest_ctx - .update_manifest(current_region_state, action_list) + .update_manifest(current_region_state, action_list, false) .await { warn!( @@ -222,7 +312,7 @@ impl CompactionTaskImpl { } /// Handles compaction failure, notifies all waiters. - fn on_failure(&mut self, err: Arc) { + pub(crate) fn on_failure(&mut self, err: Arc) { COMPACTION_FAILURE_COUNT.inc(); for waiter in self.waiters.drain(..) 
{ waiter.send(Err(err.clone()).context(CompactRegionSnafu { @@ -249,6 +339,26 @@ impl CompactionTaskImpl { #[async_trait::async_trait] impl CompactionTask for CompactionTaskImpl { async fn run(&mut self) { + // Acquire memory budget before starting compaction + let _memory_guard = match self.acquire_memory_with_policy().await { + Ok(guard) => guard, + Err(e) => { + error!(e; "Failed to acquire memory for compaction, region id: {}", self.compaction_region.region_id); + let err = Arc::new(e); + self.on_failure(err.clone()); + let notify = BackgroundNotify::CompactionFailed(CompactionFailed { + region_id: self.compaction_region.region_id, + err, + }); + self.send_to_worker(WorkerRequest::Background { + region_id: self.compaction_region.region_id, + notify, + }) + .await; + return; + } + }; + let notify = match self.handle_expiration_and_compaction().await { Ok(edit) => BackgroundNotify::CompactionFinished(CompactionFinished { region_id: self.compaction_region.region_id, diff --git a/src/mito2/src/compaction/test_util.rs b/src/mito2/src/compaction/test_util.rs index 781b905349..6061e294bd 100644 --- a/src/mito2/src/compaction/test_util.rs +++ b/src/mito2/src/compaction/test_util.rs @@ -74,9 +74,11 @@ pub fn new_file_handle_with_size_and_sequence( ), level, file_size, + max_row_group_uncompressed_size: file_size, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, num_series: 0, diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index 371fb8f989..9012457f75 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -36,6 +36,9 @@ use crate::sst::version::LevelMeta; const LEVEL_COMPACTED: Level = 1; +/// Default value for max compaction input file num. +const DEFAULT_MAX_INPUT_FILE_NUM: usize = 32; + /// `TwcsPicker` picks files of which the max timestamp are in the same time window as compaction /// candidates. #[derive(Debug)] @@ -73,7 +76,7 @@ impl TwcsPicker { { let (kept_files, ignored_files) = files_to_merge .into_iter() - .partition(|fg| fg.size() <= max_size as usize && fg.is_all_level_0()); + .partition(|fg| fg.size() <= max_size as usize); files_to_merge = kept_files; info!( "Skipped {} large files in append mode for region {}, window {}, max_size: {}", @@ -93,7 +96,7 @@ impl TwcsPicker { continue; } - let inputs = if found_runs > 1 { + let mut inputs = if found_runs > 1 { reduce_runs(sorted_runs) } else { let run = sorted_runs.last().unwrap(); @@ -104,7 +107,32 @@ impl TwcsPicker { merge_seq_files(run.items(), self.max_output_file_size) }; - if !inputs.is_empty() { + // Limits the number of input files. + let total_input_files: usize = inputs.iter().map(|fg| fg.num_files()).sum(); + if total_input_files > DEFAULT_MAX_INPUT_FILE_NUM { + // Sorts file groups by size first. + inputs.sort_unstable_by_key(|fg| fg.size()); + let mut num_picked_files = 0; + inputs = inputs + .into_iter() + .take_while(|fg| { + let current_group_file_num = fg.num_files(); + if current_group_file_num + num_picked_files <= DEFAULT_MAX_INPUT_FILE_NUM { + num_picked_files += current_group_file_num; + true + } else { + false + } + }) + .collect::>(); + info!( + "Compaction for region {} enforces max input file num limit: {}, current total: {}, input: {:?}", + region_id, DEFAULT_MAX_INPUT_FILE_NUM, total_input_files, inputs + ); + } + + if inputs.len() > 1 { + // If we have more than one group to compact. 
log_pick_result( region_id, *window, @@ -1024,5 +1052,85 @@ mod tests { assert!(!output.is_empty(), "Should have at least one output"); } + #[test] + fn test_pick_multiple_runs() { + common_telemetry::init_default_ut_logging(); + + let num_files = 8; + let file_ids = (0..num_files).map(|_| FileId::random()).collect::>(); + + // Create files with different sequences so they form multiple runs + let files: Vec<_> = file_ids + .iter() + .enumerate() + .map(|(idx, file_id)| { + new_file_handle_with_size_and_sequence( + *file_id, + 0, + 999, + 0, + (idx + 1) as u64, + 1024 * 1024, + ) + }) + .collect(); + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: None, + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + assert_eq!(1, output.len()); + assert_eq!(output[0].inputs.len(), 2); + } + + #[test] + fn test_limit_max_input_files() { + common_telemetry::init_default_ut_logging(); + + let num_files = 50; + let file_ids = (0..num_files).map(|_| FileId::random()).collect::>(); + + // Create files with different sequences so they form 2 runs + let files: Vec<_> = file_ids + .iter() + .enumerate() + .map(|(idx, file_id)| { + new_file_handle_with_size_and_sequence( + *file_id, + (idx / 2 * 10) as i64, + (idx / 2 * 10 + 5) as i64, + 0, + (idx + 1) as u64, + 1024 * 1024, + ) + }) + .collect(); + + let mut windows = assign_to_windows(files.iter(), 3); + + let picker = TwcsPicker { + trigger_file_num: 4, + time_window_seconds: Some(3), + max_output_file_size: None, + append_mode: false, + max_background_tasks: None, + }; + + let active_window = find_latest_window_in_seconds(files.iter(), 3); + let output = picker.build_output(RegionId::from_u64(123), &mut windows, active_window); + + assert_eq!(1, output.len()); + assert_eq!(output[0].inputs.len(), 32); + } + // TODO(hl): TTL tester that checks if get_expired_ssts function works as expected. } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 53cc745fe5..767eb2c81b 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -20,6 +20,7 @@ use std::time::Duration; use common_base::memory_limit::MemoryLimit; use common_base::readable_size::ReadableSize; +use common_memory_manager::OnExhaustedPolicy; use common_stat::{get_total_cpu_cores, get_total_memory_readable}; use common_telemetry::warn; use serde::{Deserialize, Serialize}; @@ -92,6 +93,10 @@ pub struct MitoConfig { pub max_background_compactions: usize, /// Max number of running background purge jobs (default: number of cpu cores). pub max_background_purges: usize, + /// Memory budget for compaction tasks. Setting it to 0 or "unlimited" disables the limit. + pub experimental_compaction_memory_limit: MemoryLimit, + /// Behavior when compaction cannot acquire memory from the budget. + pub experimental_compaction_on_exhausted: OnExhaustedPolicy, // Flush configs: /// Interval to auto flush a region if it has not flushed yet (default 30 min). @@ -126,6 +131,11 @@ pub struct MitoConfig { /// The remaining capacity is used for data (parquet) files. /// Must be between 0 and 100 (exclusive). pub index_cache_percent: u8, + /// Enable background downloading of files to the local cache when accessed during queries (default: true). 
+ /// When enabled, files will be asynchronously downloaded to improve performance for subsequent reads. + pub enable_refill_cache_on_read: bool, + /// Capacity for manifest cache (default: 256MB). + pub manifest_cache_size: ReadableSize, // Other configs: /// Buffer size for SST writing. @@ -178,6 +188,8 @@ impl Default for MitoConfig { max_background_flushes: divide_num_cpus(2), max_background_compactions: divide_num_cpus(4), max_background_purges: get_total_cpu_cores(), + experimental_compaction_memory_limit: MemoryLimit::Unlimited, + experimental_compaction_on_exhausted: OnExhaustedPolicy::default(), auto_flush_interval: Duration::from_secs(30 * 60), global_write_buffer_size: ReadableSize::gb(1), global_write_buffer_reject_size: ReadableSize::gb(2), @@ -191,6 +203,8 @@ impl Default for MitoConfig { write_cache_ttl: None, preload_index_cache: true, index_cache_percent: DEFAULT_INDEX_CACHE_PERCENT, + enable_refill_cache_on_read: true, + manifest_cache_size: ReadableSize::mb(256), sst_write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, max_concurrent_scan_files: DEFAULT_MAX_CONCURRENT_SCAN_FILES, diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index 587552d02f..10c116a3a7 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -31,7 +31,7 @@ mod catchup_test; #[cfg(test)] mod close_test; #[cfg(test)] -mod compaction_test; +pub(crate) mod compaction_test; #[cfg(test)] mod create_test; #[cfg(test)] @@ -71,6 +71,11 @@ mod sync_test; #[cfg(test)] mod truncate_test; +#[cfg(test)] +mod copy_region_from_test; +#[cfg(test)] +mod remap_manifests_test; + mod puffin_index; use std::any::Any; @@ -100,8 +105,10 @@ use store_api::metric_engine_consts::{ MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY, }; use store_api::region_engine::{ - BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, - RegionStatistic, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, + BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, MitoCopyRegionFromResponse, + RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, + RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, + SettableRegionRoleState, SyncManifestResponse, }; use store_api::region_request::{ AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest, @@ -115,8 +122,8 @@ use crate::cache::{CacheManagerRef, CacheStrategy}; use crate::config::MitoConfig; use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin}; use crate::error::{ - InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result, - SerdeJsonSnafu, SerializeColumnMetadataSnafu, + self, InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, + Result, SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu, }; #[cfg(feature = "enterprise")] use crate::extension::BoxedExtensionRangeProviderFactory; @@ -131,7 +138,7 @@ use crate::read::stream::ScanBatchStream; use crate::region::MitoRegionRef; use crate::region::opener::PartitionExprFetcherRef; use crate::request::{RegionEditRequest, WorkerRequest}; -use crate::sst::file::{FileMeta, RegionFileId}; +use crate::sst::file::{FileMeta, RegionFileId, RegionIndexId}; use crate::sst::file_ref::FileReferenceManagerRef; use crate::wal::entry_distributor::{ DEFAULT_ENTRY_RECEIVER_BUFFER_SIZE, build_wal_entry_distributor_and_receivers, @@ -290,25 
+297,30 @@ impl MitoEngine { } /// Get all tmp ref files for given region ids, excluding files that's already in manifest. - pub async fn get_snapshot_of_unmanifested_refs( + pub async fn get_snapshot_of_file_refs( &self, - region_ids: impl IntoIterator, + file_handle_regions: impl IntoIterator, + manifest_regions: HashMap>, ) -> Result { let file_ref_mgr = self.file_ref_manager(); - let region_ids = region_ids.into_iter().collect::>(); - - // Convert region IDs to MitoRegionRef objects, error if any region doesn't exist - let regions: Vec = region_ids + let file_handle_regions = file_handle_regions.into_iter().collect::>(); + // Convert region IDs to MitoRegionRef objects, ignore regions that do not exist on current datanode + // as regions on other datanodes are not managed by this engine. + let query_regions: Vec = file_handle_regions .into_iter() - .map(|region_id| { - self.find_region(region_id) - .with_context(|| RegionNotFoundSnafu { region_id }) + .filter_map(|region_id| self.find_region(region_id)) + .collect(); + + let related_regions: Vec<(MitoRegionRef, Vec)> = manifest_regions + .into_iter() + .filter_map(|(related_region, queries)| { + self.find_region(related_region).map(|r| (r, queries)) }) - .collect::>()?; + .collect(); file_ref_mgr - .get_snapshot_of_unmanifested_refs(regions) + .get_snapshot_of_file_refs(query_regions, related_regions) .await } @@ -369,7 +381,11 @@ impl MitoEngine { } /// Returns a scanner to scan for `request`. - async fn scanner(&self, region_id: RegionId, request: ScanRequest) -> Result { + pub(crate) async fn scanner( + &self, + region_id: RegionId, + request: ScanRequest, + ) -> Result { self.scan_region(region_id, request)?.scanner().await } @@ -408,6 +424,17 @@ impl MitoEngine { rx.await.context(RecvSnafu)? } + /// Handles copy region from request. + /// + /// This method is only supported for internal use and is not exposed in the trait implementation. + pub async fn copy_region_from( + &self, + region_id: RegionId, + request: CopyRegionFromRequest, + ) -> Result { + self.inner.copy_region_from(region_id, request).await + } + #[cfg(test)] pub(crate) fn get_region(&self, id: RegionId) -> Option { self.find_region(id) @@ -528,22 +555,23 @@ impl MitoEngine { return Vec::new(); }; - let Some(index_file_id) = entry.index_file_id.as_ref() else { - return Vec::new(); - }; - let file_id = match FileId::parse_str(index_file_id) { + let index_version = entry.index_version; + let file_id = match FileId::parse_str(&entry.file_id) { Ok(file_id) => file_id, Err(err) => { warn!( err; "Failed to parse puffin index file id, table_dir: {}, file_id: {}", entry.table_dir, - index_file_id + entry.file_id ); return Vec::new(); } }; - let region_file_id = RegionFileId::new(entry.region_id, file_id); + let region_index_id = RegionIndexId::new( + RegionFileId::new(entry.region_id, file_id), + index_version, + ); let context = IndexEntryContext { table_dir: &entry.table_dir, index_file_path: index_file_path.as_str(), @@ -552,7 +580,7 @@ impl MitoEngine { region_number: entry.region_number, region_group: entry.region_group, region_sequence: entry.region_sequence, - file_id: index_file_id, + file_id: &entry.file_id, index_file_size: entry.index_file_size, node_id, }; @@ -563,7 +591,7 @@ impl MitoEngine { collect_index_entries_from_puffin( manager, - region_file_id, + region_index_id, context, bloom_filter_cache, inverted_index_cache, @@ -607,7 +635,9 @@ impl MitoEngine { } } -/// Check whether the region edit is valid. Only adding files to region is considered valid now. 
+/// Check whether the region edit is valid. +/// +/// Only adding or removing files to region is considered valid now. fn is_valid_region_edit(edit: &RegionEdit) -> bool { !edit.files_to_add.is_empty() && edit.files_to_remove.is_empty() @@ -1018,6 +1048,40 @@ impl EngineInner { receiver.await.context(RecvSnafu)? } + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result { + let region_id = request.region_id; + let (request, receiver) = WorkerRequest::try_from_remap_manifests_request(request)?; + self.workers.submit_to_worker(region_id, request).await?; + let manifests = receiver.await.context(RecvSnafu)??; + + let new_manifests = manifests + .into_iter() + .map(|(region_id, manifest)| { + Ok(( + region_id, + serde_json::to_string(&manifest) + .context(SerializeManifestSnafu { region_id })?, + )) + }) + .collect::>>()?; + Ok(RemapManifestsResponse { new_manifests }) + } + + async fn copy_region_from( + &self, + region_id: RegionId, + request: CopyRegionFromRequest, + ) -> Result { + let (request, receiver) = + WorkerRequest::try_from_copy_region_from_request(region_id, request)?; + self.workers.submit_to_worker(region_id, request).await?; + let response = receiver.await.context(RecvSnafu)??; + Ok(response) + } + fn role(&self, region_id: RegionId) -> Option { self.workers.get_region(region_id).map(|region| { if region.is_follower() { @@ -1194,6 +1258,29 @@ impl RegionEngine for MitoEngine { Ok(SyncManifestResponse::Mito { synced }) } + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result { + self.inner + .remap_manifests(request) + .await + .map_err(BoxedError::new) + } + + async fn copy_region_from( + &self, + _region_id: RegionId, + _request: CopyRegionFromRequest, + ) -> Result { + Err(BoxedError::new( + error::UnsupportedOperationSnafu { + err_msg: "copy_region_from is not supported", + } + .build(), + )) + } + fn role(&self, region_id: RegionId) -> Option { self.inner.role(region_id) } diff --git a/src/mito2/src/engine/alter_test.rs b/src/mito2/src/engine/alter_test.rs index 7717bbceb7..2aa26ba204 100644 --- a/src/mito2/src/engine/alter_test.rs +++ b/src/mito2/src/engine/alter_test.rs @@ -199,7 +199,7 @@ async fn test_alter_region_with_format(flat_format: bool) { assert_eq!(manifests.len(), 1); let (return_region_id, manifest) = manifests.remove(0); assert_eq!(return_region_id, region_id); - assert_eq!(manifest, RegionManifestInfo::mito(2, 1)); + assert_eq!(manifest, RegionManifestInfo::mito(2, 1, 0)); let column_metadatas = parse_column_metadatas(&response.extensions, TABLE_COLUMN_METADATA_EXTENSION_KEY).unwrap(); assert_column_metadatas( @@ -901,7 +901,7 @@ async fn test_alter_region_ttl_options_with_format(flat_format: bool) { check_ttl(&engine, &Duration::from_secs(500)); } -#[tokio::test] +#[tokio::test(flavor = "multi_thread")] async fn test_write_stall_on_altering() { common_telemetry::init_default_ut_logging(); @@ -952,6 +952,8 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { .await .unwrap(); }); + // Make sure the loop is handling the alter request. + tokio::time::sleep(Duration::from_millis(100)).await; let column_schemas_cloned = column_schemas.clone(); let engine_cloned = engine.clone(); @@ -962,6 +964,8 @@ async fn test_write_stall_on_altering_with_format(flat_format: bool) { }; put_rows(&engine_cloned, region_id, rows).await; }); + // Make sure the loop is handling the put request. 
+ tokio::time::sleep(Duration::from_millis(100)).await; listener.wake_notify(); alter_job.await.unwrap(); diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index f17726abef..88303d3f70 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -861,9 +861,10 @@ async fn test_cache_null_primary_key_with_format(flat_format: bool) { #[tokio::test] async fn test_list_ssts() { test_list_ssts_with_format(false, r#" -ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2513, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2513, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2513, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# ,r#" +ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2513, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2513, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2513, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(250), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, 
sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"# , +r#" StorageSstEntry { file_path: "test/11_0000000001/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000001/index/.puffin", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000002/.parquet", file_size: None, last_modified_ms: None, node_id: None } @@ -871,9 +872,10 @@ StorageSstEntry { file_path: "test/11_0000000002/index/.puffin", file_s StorageSstEntry { file_path: "test/22_0000000042/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/22_0000000042/index/.puffin", file_size: None, last_modified_ms: None, node_id: None }"#).await; test_list_ssts_with_format(true, r#" -ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } -ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_file_id: Some(""), level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, r#" +ManifestSstEntry { table_dir: "test/", region_id: 47244640257(11, 1), table_id: 11, region_number: 1, region_group: 0, region_sequence: 1, file_id: "", index_version: 0, level: 0, file_path: "test/11_0000000001/.parquet", file_size: 2837, index_file_path: Some("test/11_0000000001/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640257(11, 1), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 47244640258(11, 2), table_id: 11, region_number: 2, region_group: 0, region_sequence: 2, file_id: "", index_version: 0, level: 0, file_path: "test/11_0000000002/.parquet", file_size: 2837, index_file_path: Some("test/11_0000000002/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 47244640258(11, 2), node_id: None, visible: true } +ManifestSstEntry { table_dir: "test/", region_id: 94489280554(22, 42), table_id: 
22, region_number: 42, region_group: 0, region_sequence: 42, file_id: "", index_version: 0, level: 0, file_path: "test/22_0000000042/.parquet", file_size: 2837, index_file_path: Some("test/22_0000000042/index/.puffin"), index_file_size: Some(292), num_rows: 10, num_row_groups: 1, num_series: Some(1), min_ts: 0::Millisecond, max_ts: 9000::Millisecond, sequence: Some(10), origin_region_id: 94489280554(22, 42), node_id: None, visible: true }"#, +r#" StorageSstEntry { file_path: "test/11_0000000001/.parquet", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000001/index/.puffin", file_size: None, last_modified_ms: None, node_id: None } StorageSstEntry { file_path: "test/11_0000000002/.parquet", file_size: None, last_modified_ms: None, node_id: None } @@ -945,13 +947,13 @@ async fn test_list_ssts_with_format( .index_file_path .map(|p| p.replace(&e.file_id, "")); e.file_id = "".to_string(); - e.index_file_id = e.index_file_id.map(|_| "".to_string()); + e.index_version = 0; format!("\n{:?}", e) }) .sorted() .collect::>() .join(""); - assert_eq!(debug_format, expected_manifest_ssts,); + assert_eq!(debug_format, expected_manifest_ssts, "{}", debug_format); // list from storage let storage_entries = engine @@ -969,7 +971,7 @@ async fn test_list_ssts_with_format( .sorted() .collect::>() .join(""); - assert_eq!(debug_format, expected_storage_ssts,); + assert_eq!(debug_format, expected_storage_ssts, "{}", debug_format); } #[tokio::test] diff --git a/src/mito2/src/engine/compaction_test.rs b/src/mito2/src/engine/compaction_test.rs index 0e91c542a0..09fc4e2935 100644 --- a/src/mito2/src/engine/compaction_test.rs +++ b/src/mito2/src/engine/compaction_test.rs @@ -19,8 +19,8 @@ use std::time::Duration; use api::v1::{ColumnSchema, Rows}; use common_recordbatch::{RecordBatches, SendableRecordBatchStream}; -use datatypes::prelude::ScalarVector; -use datatypes::vectors::TimestampMillisecondVector; +use datatypes::arrow::array::AsArray; +use datatypes::arrow::datatypes::TimestampMillisecondType; use store_api::region_engine::{RegionEngine, RegionRole}; use store_api::region_request::AlterKind::SetRegionOptions; use store_api::region_request::{ @@ -125,10 +125,8 @@ async fn collect_stream_ts(stream: SendableRecordBatchStream) -> Vec { let ts_col = batch .column_by_name("ts") .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - res.extend(ts_col.iter_data().map(|t| t.unwrap().0.value())); + .as_primitive::(); + res.extend((0..ts_col.len()).map(|i| ts_col.value(i))); } res } diff --git a/src/mito2/src/engine/copy_region_from_test.rs b/src/mito2/src/engine/copy_region_from_test.rs new file mode 100644 index 0000000000..c42e1fc781 --- /dev/null +++ b/src/mito2/src/engine/copy_region_from_test.rs @@ -0,0 +1,361 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::assert_matches::assert_matches; +use std::fs; +use std::sync::Arc; + +use api::v1::Rows; +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use object_store::layers::mock::{Error as MockError, ErrorKind, MockLayerBuilder}; +use store_api::region_engine::{CopyRegionFromRequest, RegionEngine, RegionRole}; +use store_api::region_request::{RegionFlushRequest, RegionRequest}; +use store_api::storage::RegionId; + +use crate::config::MitoConfig; +use crate::error::Error; +use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema}; + +#[tokio::test] +async fn test_engine_copy_region_from() { + common_telemetry::init_default_ut_logging(); + + test_engine_copy_region_from_with_format(true, true).await; + test_engine_copy_region_from_with_format(true, false).await; + test_engine_copy_region_from_with_format(false, true).await; + test_engine_copy_region_from_with_format(false, false).await; +} + +async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index: bool) { + let mut env = TestEnv::with_prefix("copy-region-from").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + // Creates a source region and adds some data + let source_region_id = RegionId::new(1, 1); + let mut request = CreateRequestBuilder::new().build(); + if with_index { + request + .column_metadatas + .iter_mut() + .find(|c| c.column_schema.name == "tag_0") + .unwrap() + .column_schema + .set_inverted_index(true); + } + + let column_schemas = rows_schema(&request); + engine + .handle_request(source_region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + let rows = Rows { + schema: column_schemas, + rows: build_rows(0, 42), + }; + put_rows(&engine, source_region_id, rows).await; + engine + .handle_request( + source_region_id, + RegionRequest::Flush(RegionFlushRequest { + row_group_size: None, + }), + ) + .await + .unwrap(); + + // Creates a target region and enters staging mode + let target_region_id = RegionId::new(1, 2); + engine + .handle_request(target_region_id, RegionRequest::Create(request)) + .await + .unwrap(); + common_telemetry::debug!("copy region from"); + let resp = engine + .copy_region_from( + target_region_id, + CopyRegionFromRequest { + source_region_id, + parallelism: 1, + }, + ) + .await + .unwrap(); + + let manifest = engine + .get_region(target_region_id) + .unwrap() + .manifest_ctx + .manifest() + .await; + assert!(!manifest.files.is_empty()); + for meta in manifest.files.values() { + assert_eq!(meta.region_id, target_region_id); + assert_eq!(meta.exists_index(), with_index); + } + + let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display()); + let source_region_files = collect_filename_in_dir(&source_region_dir); + let target_region_dir = format!("{}/data/test/1_0000000002", env.data_home().display()); + let target_region_files = collect_filename_in_dir(&target_region_dir); + assert_eq!(source_region_files, target_region_files); + + if with_index { + let source_region_index_files = + collect_filename_in_dir(&format!("{}/index", source_region_dir)); + let target_region_index_files = + collect_filename_in_dir(&format!("{}/index", target_region_dir)); + assert_eq!(source_region_index_files, target_region_index_files); + } + common_telemetry::debug!("copy region from again"); + let resp2 = engine + .copy_region_from( + target_region_id, + CopyRegionFromRequest { + source_region_id, + 
parallelism: 1, + }, + ) + .await + .unwrap(); + assert_eq!(resp.copied_file_ids, resp2.copied_file_ids); +} + +#[tokio::test] +async fn test_engine_copy_region_failure() { + common_telemetry::init_default_ut_logging(); + test_engine_copy_region_failure_with_format(false).await; + test_engine_copy_region_failure_with_format(true).await; +} + +async fn test_engine_copy_region_failure_with_format(flat_format: bool) { + let mock_layer = MockLayerBuilder::default() + .copy_interceptor(Arc::new(|from, _, _args| { + if from.contains(".puffin") { + Some(Err(MockError::new(ErrorKind::Unexpected, "mock err"))) + } else { + None + } + })) + .build() + .unwrap(); + let mut env = TestEnv::new().await.with_mock_layer(mock_layer); + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + // Creates a source region and adds some data + let source_region_id = RegionId::new(1, 1); + let mut request = CreateRequestBuilder::new().build(); + request + .column_metadatas + .iter_mut() + .find(|c| c.column_schema.name == "tag_0") + .unwrap() + .column_schema + .set_inverted_index(true); + + let column_schemas = rows_schema(&request); + engine + .handle_request(source_region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + let rows = Rows { + schema: column_schemas, + rows: build_rows(0, 42), + }; + put_rows(&engine, source_region_id, rows).await; + engine + .handle_request( + source_region_id, + RegionRequest::Flush(RegionFlushRequest { + row_group_size: None, + }), + ) + .await + .unwrap(); + let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display()); + assert_file_num_in_dir(&source_region_dir, 1); + assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1); + let source_region_files = collect_filename_in_dir(&source_region_dir); + let source_region_index_files = + collect_filename_in_dir(&format!("{}/index", source_region_dir)); + + // Creates a target region and enters staging mode + let target_region_id = RegionId::new(1, 2); + engine + .handle_request(target_region_id, RegionRequest::Create(request)) + .await + .unwrap(); + let err = engine + .copy_region_from( + target_region_id, + CopyRegionFromRequest { + source_region_id, + parallelism: 1, + }, + ) + .await + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::StorageUnavailable); + + // Check target region directory is empty + let target_region_dir = format!("{}/data/test/1_0000000002", env.data_home().display()); + assert_file_num_in_dir(&target_region_dir, 0); + assert!(!fs::exists(format!("{}/index", target_region_dir)).unwrap()); + + // Check source region directory is not affected + let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display()); + assert_file_num_in_dir(&source_region_dir, 1); + assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1); + + assert_eq!( + source_region_files, + collect_filename_in_dir(&source_region_dir) + ); + assert_eq!( + source_region_index_files, + collect_filename_in_dir(&format!("{}/index", source_region_dir)) + ); +} + +fn assert_file_num_in_dir(dir: &str, expected_num: usize) { + let files = fs::read_dir(dir) + .unwrap() + .collect::, _>>() + .unwrap() + .into_iter() + .filter(|f| f.metadata().unwrap().is_file()) + .collect::>(); + assert_eq!( + files.len(), + expected_num, + "The number of files in the directory should be {}, got: {:?}", + expected_num, + files + ); +} + +fn collect_filename_in_dir(dir: &str) -> Vec { + let mut 
files = fs::read_dir(dir) + .unwrap() + .collect::, _>>() + .unwrap() + .into_iter() + .filter(|f| f.metadata().unwrap().is_file()) + .map(|f| { + f.path() + .to_string_lossy() + .rsplit("/") + .last() + .unwrap() + .to_string() + }) + .collect::>(); + files.sort_unstable(); + + files +} + +#[tokio::test] +async fn test_engine_copy_region_invalid_args() { + common_telemetry::init_default_ut_logging(); + test_engine_copy_region_invalid_args_with_format(false).await; + test_engine_copy_region_invalid_args_with_format(true).await; +} + +async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) { + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + let err = engine + .copy_region_from( + region_id, + CopyRegionFromRequest { + source_region_id: RegionId::new(2, 1), + parallelism: 1, + }, + ) + .await + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); + let err = engine + .copy_region_from( + region_id, + CopyRegionFromRequest { + source_region_id: RegionId::new(1, 1), + parallelism: 1, + }, + ) + .await + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::InvalidArguments); +} + +#[tokio::test] +async fn test_engine_copy_region_unexpected_state() { + common_telemetry::init_default_ut_logging(); + test_engine_copy_region_unexpected_state_with_format(false).await; + test_engine_copy_region_unexpected_state_with_format(true).await; +} + +async fn test_engine_copy_region_unexpected_state_with_format(flat_format: bool) { + let mut env = TestEnv::new().await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + engine + .set_region_role(region_id, RegionRole::Follower) + .unwrap(); + + let err = engine + .copy_region_from( + region_id, + CopyRegionFromRequest { + source_region_id: RegionId::new(1, 2), + parallelism: 1, + }, + ) + .await + .unwrap_err(); + assert_matches!( + err.as_any().downcast_ref::().unwrap(), + Error::RegionState { .. } + ) +} diff --git a/src/mito2/src/engine/flush_test.rs b/src/mito2/src/engine/flush_test.rs index 1224aac116..78bae2b461 100644 --- a/src/mito2/src/engine/flush_test.rs +++ b/src/mito2/src/engine/flush_test.rs @@ -515,6 +515,7 @@ async fn test_flush_workers() { } async fn test_flush_workers_with_format(flat_format: bool) { + common_telemetry::init_default_ut_logging(); let mut env = TestEnv::new().await; let write_buffer_manager = Arc::new(MockWriteBufferManager::default()); let listener = Arc::new(FlushListener::default()); @@ -574,7 +575,7 @@ async fn test_flush_workers_with_format(flat_format: bool) { put_rows(&engine, region_id0, rows).await; // Waits until flush is finished. 
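The flush test change immediately below waits until the listener has seen three successful flushes instead of two. A compact sketch of that wait-until-counter pattern, with a hypothetical CountListener (atomic counter plus tokio Notify) standing in for the real flush listener:

use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use tokio::sync::Notify;

/// Hypothetical listener: a success counter plus a wakeup handle.
struct CountListener {
    count: AtomicUsize,
    notify: Notify,
}

impl CountListener {
    fn new() -> Self {
        Self {
            count: AtomicUsize::new(0),
            notify: Notify::new(),
        }
    }

    fn on_success(&self) {
        self.count.fetch_add(1, Ordering::SeqCst);
        // notify_one stores a permit when nobody is waiting, so a wakeup is never lost.
        self.notify.notify_one();
    }

    fn success_count(&self) -> usize {
        self.count.load(Ordering::SeqCst)
    }

    async fn wait(&self) {
        self.notify.notified().await;
    }
}

#[tokio::main]
async fn main() {
    let listener = Arc::new(CountListener::new());
    let background = listener.clone();
    tokio::spawn(async move {
        for _ in 0..3 {
            background.on_success(); // e.g. one call per finished flush
        }
    });

    // Same shape as `while listener.success_count() < 3 { listener.wait().await }`.
    while listener.success_count() < 3 {
        listener.wait().await;
    }
    assert!(listener.success_count() >= 3);
}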
- while listener.success_count() < 2 { + while listener.success_count() < 3 { listener.wait().await; } diff --git a/src/mito2/src/engine/index_build_test.rs b/src/mito2/src/engine/index_build_test.rs index 9b71aa2bb3..8de0aec041 100644 --- a/src/mito2/src/engine/index_build_test.rs +++ b/src/mito2/src/engine/index_build_test.rs @@ -19,7 +19,9 @@ use std::sync::Arc; use api::v1::Rows; use store_api::region_engine::RegionEngine; -use store_api::region_request::{AlterKind, RegionAlterRequest, RegionRequest, SetIndexOption}; +use store_api::region_request::{ + AlterKind, RegionAlterRequest, RegionBuildIndexRequest, RegionRequest, SetIndexOption, +}; use store_api::storage::{RegionId, ScanRequest}; use crate::config::{IndexBuildMode, MitoConfig, Mode}; @@ -53,10 +55,10 @@ async fn num_of_index_files(engine: &MitoEngine, scanner: &Scanner, region_id: R return 0; } let mut index_files_count: usize = 0; - for region_file_id in scanner.file_ids() { + for region_index_id in scanner.index_ids() { let index_path = location::index_file_path( access_layer.table_dir(), - region_file_id, + region_index_id, access_layer.path_type(), ); if access_layer @@ -71,11 +73,9 @@ async fn num_of_index_files(engine: &MitoEngine, scanner: &Scanner, region_id: R index_files_count } -#[allow(dead_code)] fn assert_listener_counts( listener: &IndexBuildListener, expected_begin_count: usize, - expected_success_count: usize, ) { assert_eq!(listener.begin_count(), expected_begin_count); @@ -160,6 +160,8 @@ async fn test_index_build_type_flush() { #[tokio::test] async fn test_index_build_type_compact() { + common_telemetry::init_default_ut_logging(); + let mut env = TestEnv::with_prefix("test_index_build_type_compact_").await; let listener = Arc::new(IndexBuildListener::default()); let engine = env @@ -299,3 +301,171 @@ async fn test_index_build_type_schema_change() { assert_eq!(scanner.num_files(), 1); assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); } + +#[tokio::test] +async fn test_index_build_type_manual_basic() { + let mut env = TestEnv::with_prefix("test_index_build_type_manual_").await; + let listener = Arc::new(IndexBuildListener::default()); + let engine = env + .create_engine_with( + async_build_mode_config(false), // Disable index file creation on flush. + None, + Some(listener.clone()), + None, + ) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + // Create a region with index. + let request = CreateRequestBuilder::new().build_with_index(); + let table_dir = request.table_dir.clone(); + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + // Flush and make sure there is no index file (because create_on_flush is disabled). + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + // Index build task is triggered on flush, but not finished. + assert_listener_counts(&listener, 1, 0); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 0); + + // Trigger manual index build task and make sure index file is built without flush or compaction. 
+ let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + listener.wait_finish(1).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_listener_counts(&listener, 2, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + // Test idempotency: Second manual index build request on the same file. + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + // Should still be 2 begin and 1 finish - no new task should be created for already indexed file. + assert_listener_counts(&listener, 2, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + // Test idempotency again: Third manual index build request to further verify. + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_listener_counts(&listener, 2, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); +} + +#[tokio::test] +async fn test_index_build_type_manual_consistency() { + let mut env = TestEnv::with_prefix("test_index_build_type_manual_consistency_").await; + let listener = Arc::new(IndexBuildListener::default()); + let engine = env + .create_engine_with( + async_build_mode_config(true), + None, + Some(listener.clone()), + None, + ) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + // Create a region with index. + let create_request = CreateRequestBuilder::new().build_with_index(); + let table_dir = create_request.table_dir.clone(); + let column_schemas = rows_schema(&create_request); + engine + .handle_request(region_id, RegionRequest::Create(create_request.clone())) + .await + .unwrap(); + assert_listener_counts(&listener, 0, 0); + + // Flush and make sure index file exists. + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + listener.wait_finish(1).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + assert_listener_counts(&listener, 1, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + // Check index build task for consistent file will be skipped. + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + // Reopen the region to ensure the task wasn't skipped due to insufficient time. + reopen_region(&engine, region_id, table_dir.clone(), true, HashMap::new()).await; + let scanner = engine + .scanner(region_id, ScanRequest::default()) + .await + .unwrap(); + // Because the file is consistent, no new index build task is triggered. 
+ assert_listener_counts(&listener, 1, 1); + assert_eq!(scanner.num_files(), 1); + assert_eq!(num_of_index_files(&engine, &scanner, region_id).await, 1); + + let mut altered_metadata = create_request.column_metadatas.clone(); + // Set index for field_0. + altered_metadata[1].column_schema.set_inverted_index(true); + let sync_columns_request = RegionAlterRequest { + kind: AlterKind::SyncColumns { + column_metadatas: altered_metadata, + }, + }; + // Use SyncColumns to avoid triggering SchemaChange index build. + engine + .handle_request(region_id, RegionRequest::Alter(sync_columns_request)) + .await + .unwrap(); + reopen_region(&engine, region_id, table_dir, true, HashMap::new()).await; + // SyncColumns won't trigger index build. + assert_listener_counts(&listener, 1, 1); + + let request = RegionRequest::BuildIndex(RegionBuildIndexRequest {}); + engine.handle_request(region_id, request).await.unwrap(); + listener.wait_finish(2).await; // previous 1 + new 1 + // Because the file is inconsistent, new index build task is triggered. + assert_listener_counts(&listener, 2, 2); +} diff --git a/src/mito2/src/engine/listener.rs b/src/mito2/src/engine/listener.rs index ebc20ac280..277c9a4050 100644 --- a/src/mito2/src/engine/listener.rs +++ b/src/mito2/src/engine/listener.rs @@ -74,6 +74,9 @@ pub trait EventListener: Send + Sync { /// Notifies the listener that region starts to send a region change result to worker. async fn on_notify_region_change_result_begin(&self, _region_id: RegionId) {} + /// Notifies the listener that region starts to send a enter staging result to worker. + async fn on_enter_staging_result_begin(&self, _region_id: RegionId) {} + /// Notifies the listener that the index build task is executed successfully. async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {} @@ -307,6 +310,37 @@ impl EventListener for NotifyRegionChangeResultListener { region_id ); self.notify.notified().await; + info!( + "Continue to sending region change result for region {}", + region_id + ); + } +} + +#[derive(Default)] +pub struct NotifyEnterStagingResultListener { + notify: Notify, +} + +impl NotifyEnterStagingResultListener { + /// Continue to sending enter staging result. + pub fn wake_notify(&self) { + self.notify.notify_one(); + } +} + +#[async_trait] +impl EventListener for NotifyEnterStagingResultListener { + async fn on_enter_staging_result_begin(&self, region_id: RegionId) { + info!( + "Wait on notify to start notify enter staging result for region {}", + region_id + ); + self.notify.notified().await; + info!( + "Continue to sending enter staging result for region {}", + region_id + ); } } diff --git a/src/mito2/src/engine/puffin_index.rs b/src/mito2/src/engine/puffin_index.rs index 05529db59b..281b619bc5 100644 --- a/src/mito2/src/engine/puffin_index.rs +++ b/src/mito2/src/engine/puffin_index.rs @@ -32,7 +32,7 @@ use crate::cache::index::bloom_filter_index::{ BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader, Tag, }; use crate::cache::index::inverted_index::{CachedInvertedIndexBlobReader, InvertedIndexCacheRef}; -use crate::sst::file::RegionFileId; +use crate::sst::file::RegionIndexId; use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE as BLOOM_BLOB_TYPE; use crate::sst::index::fulltext_index::{ INDEX_BLOB_TYPE_BLOOM as FULLTEXT_BLOOM_BLOB_TYPE, @@ -66,14 +66,14 @@ pub(crate) struct IndexEntryContext<'a> { /// Collect index metadata entries present in the SST puffin file. 
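The puffin index code below switches from addressing index files by RegionFileId to RegionIndexId, which additionally carries an index_version (see RegionIndexId::new(RegionFileId::new(..), index_version) earlier in this diff and the region_index_id.version passed into the cached readers). The field layouts in this sketch are guesses for illustration only, but they show why a versioned key is useful: a rebuilt index gets a distinct identity even though the SST file id is unchanged.

use std::collections::HashMap;

/// Guessed stand-in for mito2's RegionFileId (region id plus SST file id).
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct RegionFileId {
    region_id: u64,
    file_id: u128,
}

/// Guessed stand-in for RegionIndexId: the SST file id plus the version of the
/// index built for that file.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct RegionIndexId {
    file: RegionFileId,
    version: u64,
}

impl RegionIndexId {
    fn new(file: RegionFileId, version: u64) -> Self {
        Self { file, version }
    }

    fn file_id(&self) -> u128 {
        self.file.file_id
    }
}

fn main() {
    let file = RegionFileId { region_id: 1, file_id: 42 };
    let v0 = RegionIndexId::new(file, 0);
    let v1 = RegionIndexId::new(file, 1);

    // A cache keyed by the versioned id keeps entries for a rebuilt index separate
    // from entries for the stale one, even though the SST file id did not change.
    let mut cache: HashMap<RegionIndexId, &'static str> = HashMap::new();
    cache.insert(v0, "old index metadata");
    cache.insert(v1, "rebuilt index metadata");
    assert_eq!(v0.file_id(), v1.file_id());
    assert_ne!(cache[&v0], cache[&v1]);
}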
pub(crate) async fn collect_index_entries_from_puffin( manager: SstPuffinManager, - region_file_id: RegionFileId, + region_index_id: RegionIndexId, context: IndexEntryContext<'_>, bloom_filter_cache: Option, inverted_index_cache: Option, ) -> Vec { let mut entries = Vec::new(); - let reader = match manager.reader(®ion_file_id).await { + let reader = match manager.reader(®ion_index_id).await { Ok(reader) => reader, Err(err) => { warn!( @@ -104,7 +104,7 @@ pub(crate) async fn collect_index_entries_from_puffin( Some(BlobIndexTypeTargetKey::BloomFilter(target_key)) => { let bloom_meta = try_read_bloom_meta( &reader, - region_file_id, + region_index_id, blob.blob_type.as_str(), target_key, bloom_filter_cache.as_ref(), @@ -130,7 +130,7 @@ pub(crate) async fn collect_index_entries_from_puffin( Some(BlobIndexTypeTargetKey::FulltextBloom(target_key)) => { let bloom_meta = try_read_bloom_meta( &reader, - region_file_id, + region_index_id, blob.blob_type.as_str(), target_key, bloom_filter_cache.as_ref(), @@ -172,7 +172,7 @@ pub(crate) async fn collect_index_entries_from_puffin( Some(BlobIndexTypeTargetKey::Inverted) => { let mut inverted_entries = collect_inverted_entries( &reader, - region_file_id, + region_index_id, inverted_index_cache.as_ref(), &context, ) @@ -188,12 +188,12 @@ pub(crate) async fn collect_index_entries_from_puffin( async fn collect_inverted_entries( reader: &SstPuffinReader, - region_file_id: RegionFileId, + region_index_id: RegionIndexId, cache: Option<&InvertedIndexCacheRef>, context: &IndexEntryContext<'_>, ) -> Vec { // Read the inverted index blob and surface its per-column metadata entries. - let file_id = region_file_id.file_id(); + let file_id = region_index_id.file_id(); let guard = match reader.blob(INVERTED_BLOB_TYPE).await { Ok(guard) => guard, @@ -229,11 +229,12 @@ async fn collect_inverted_entries( let metas = if let (Some(cache), Some(blob_size)) = (cache, blob_size) { let reader = CachedInvertedIndexBlobReader::new( file_id, + region_index_id.version, blob_size, InvertedIndexBlobReader::new(blob_reader), cache.clone(), ); - match reader.metadata().await { + match reader.metadata(None).await { Ok(metas) => metas, Err(err) => { warn!( @@ -247,7 +248,7 @@ async fn collect_inverted_entries( } } else { let reader = InvertedIndexBlobReader::new(blob_reader); - match reader.metadata().await { + match reader.metadata(None).await { Ok(metas) => metas, Err(err) => { warn!( @@ -289,7 +290,7 @@ fn build_inverted_entries( async fn try_read_bloom_meta( reader: &SstPuffinReader, - region_file_id: RegionFileId, + region_index_id: RegionIndexId, blob_type: &str, target_key: &str, cache: Option<&BloomFilterIndexCacheRef>, @@ -311,17 +312,18 @@ async fn try_read_bloom_meta( let result = match (cache, column_id, blob_size) { (Some(cache), Some(column_id), Some(blob_size)) => { CachedBloomFilterIndexBlobReader::new( - region_file_id.file_id(), + region_index_id.file_id(), + region_index_id.version, column_id, tag, blob_size, bloom_reader, cache.clone(), ) - .metadata() + .metadata(None) .await } - _ => bloom_reader.metadata().await, + _ => bloom_reader.metadata(None).await, }; match result { diff --git a/src/mito2/src/engine/remap_manifests_test.rs b/src/mito2/src/engine/remap_manifests_test.rs new file mode 100644 index 0000000000..bd38e87e2a --- /dev/null +++ b/src/mito2/src/engine/remap_manifests_test.rs @@ -0,0 +1,239 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::assert_matches::assert_matches; + +use api::v1::Rows; +use datatypes::value::Value; +use partition::expr::{PartitionExpr, col}; +use store_api::region_engine::{RegionEngine, RemapManifestsRequest, SettableRegionRoleState}; +use store_api::region_request::{RegionFlushRequest, RegionRequest}; +use store_api::storage::RegionId; + +use crate::config::MitoConfig; +use crate::error::Error; +use crate::manifest::action::RegionManifest; +use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema}; + +#[tokio::test] +async fn test_remap_manifests_invalid_partition_expr() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_invalid_partition_expr_with_format(false).await; + test_remap_manifests_invalid_partition_expr_with_format(true).await; +} + +async fn test_remap_manifests_invalid_partition_expr_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("invalid-partition-expr").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let err = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id], + region_mapping: [(region_id, vec![region_id])].into_iter().collect(), + new_partition_exprs: [(region_id, "invalid expr".to_string())] + .into_iter() + .collect(), + }) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::InvalidPartitionExpr { .. } + ) +} + +#[tokio::test] +async fn test_remap_manifests_invalid_region_state() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_invalid_region_state_with_format(false).await; + test_remap_manifests_invalid_region_state_with_format(true).await; +} + +fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr { + col(col_name) + .gt_eq(Value::Int64(start)) + .and(col(col_name).lt(Value::Int64(end))) +} + +async fn test_remap_manifests_invalid_region_state_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("invalid-region-state").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let err = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id], + region_mapping: [(region_id, vec![region_id])].into_iter().collect(), + new_partition_exprs: [(region_id, range_expr("x", 0, 100).as_json_str().unwrap())] + .into_iter() + .collect(), + }) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::RegionState { .. 
} + ) +} + +#[tokio::test] +async fn test_remap_manifests_invalid_input_regions() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_invalid_input_regions_with_format(false).await; + test_remap_manifests_invalid_input_regions_with_format(true).await; +} + +async fn test_remap_manifests_invalid_input_regions_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("invalid-input-regions").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + engine + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .await + .unwrap(); + let err = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id, RegionId::new(2, 1)], + region_mapping: [(region_id, vec![region_id])].into_iter().collect(), + new_partition_exprs: [(region_id, range_expr("x", 0, 100).as_json_str().unwrap())] + .into_iter() + .collect(), + }) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::InvalidRequest { .. } + ) +} + +#[tokio::test] +async fn test_remap_manifests_success() { + common_telemetry::init_default_ut_logging(); + test_remap_manifests_success_with_format(false).await; + test_remap_manifests_success_with_format(true).await; +} + +async fn test_remap_manifests_success_with_format(flat_format: bool) { + let mut env = TestEnv::with_prefix("engine-stop").await; + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new() + .partition_expr_json(Some(range_expr("tag_0", 0, 100).as_json_str().unwrap())) + .build(); + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let new_region_id_1 = RegionId::new(1, 2); + let new_region_id_2 = RegionId::new(1, 3); + + // Generate some data + for i in 0..3 { + let rows_data = Rows { + schema: column_schemas.clone(), + rows: build_rows(i * 10, (i + 1) * 10), + }; + put_rows(&engine, region_id, rows_data).await; + engine + .handle_request( + region_id, + RegionRequest::Flush(RegionFlushRequest { + row_group_size: None, + }), + ) + .await + .unwrap(); + } + + engine + .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .await + .unwrap(); + + let result = engine + .remap_manifests(RemapManifestsRequest { + region_id, + input_regions: vec![region_id], + region_mapping: [(region_id, vec![new_region_id_1, new_region_id_2])] + .into_iter() + .collect(), + new_partition_exprs: [ + ( + new_region_id_1, + range_expr("tag_0", 0, 50).as_json_str().unwrap(), + ), + ( + new_region_id_2, + range_expr("tag_0", 50, 100).as_json_str().unwrap(), + ), + ] + .into_iter() + .collect(), + }) + .await + .unwrap(); + assert_eq!(result.new_manifests.len(), 2); + let new_manifest_1 = + serde_json::from_str::(&result.new_manifests[&new_region_id_1]).unwrap(); + let new_manifest_2 = + serde_json::from_str::(&result.new_manifests[&new_region_id_2]).unwrap(); + assert_eq!(new_manifest_1.files.len(), 3); + assert_eq!(new_manifest_2.files.len(), 3); +} diff --git a/src/mito2/src/engine/staging_test.rs 
b/src/mito2/src/engine/staging_test.rs index 6d802a5d9d..91816a4f9f 100644 --- a/src/mito2/src/engine/staging_test.rs +++ b/src/mito2/src/engine/staging_test.rs @@ -14,17 +14,30 @@ //! Integration tests for staging state functionality. +use std::assert_matches::assert_matches; use std::fs; +use std::sync::Arc; +use std::time::Duration; use api::v1::Rows; +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; use common_recordbatch::RecordBatches; +use object_store::Buffer; +use object_store::layers::mock::{ + Entry, Error as MockError, ErrorKind, List, Lister, Metadata, MockLayerBuilder, + Result as MockResult, Write, Writer, +}; use store_api::region_engine::{RegionEngine, SettableRegionRoleState}; use store_api::region_request::{ - RegionAlterRequest, RegionFlushRequest, RegionRequest, RegionTruncateRequest, + EnterStagingRequest, RegionAlterRequest, RegionFlushRequest, RegionRequest, + RegionTruncateRequest, }; use store_api::storage::{RegionId, ScanRequest}; use crate::config::MitoConfig; +use crate::engine::listener::NotifyEnterStagingResultListener; +use crate::error::Error; use crate::region::{RegionLeaderState, RegionRoleState}; use crate::request::WorkerRequest; use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema}; @@ -214,6 +227,8 @@ async fn test_staging_state_validation_patterns() { ); } +const PARTITION_EXPR: &str = "partition_expr"; + #[tokio::test] async fn test_staging_manifest_directory() { test_staging_manifest_directory_with_format(false).await; @@ -221,6 +236,7 @@ async fn test_staging_manifest_directory() { } async fn test_staging_manifest_directory_with_format(flat_format: bool) { + common_telemetry::init_default_ut_logging(); let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { @@ -255,9 +271,57 @@ async fn test_staging_manifest_directory_with_format(flat_format: bool) { // Now test staging mode manifest creation // Set region to staging mode using the engine API engine - .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) .await .unwrap(); + let region = engine.get_region(region_id).unwrap(); + let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone(); + assert_eq!(staging_partition_expr.unwrap(), PARTITION_EXPR); + { + let manager = region.manifest_ctx.manifest_manager.read().await; + assert_eq!( + manager + .staging_manifest() + .unwrap() + .metadata + .partition_expr + .as_deref() + .unwrap(), + PARTITION_EXPR + ); + assert!(manager.manifest().metadata.partition_expr.is_none()); + } + + // Should be ok to enter staging mode again with the same partition expr + engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) + .await + .unwrap(); + + // Should throw error if try to enter staging mode again with a different partition expr + let err = engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: "".to_string(), + }), + ) + .await + .unwrap_err(); + assert_matches!( + err.into_inner().as_any().downcast_ref::().unwrap(), + Error::StagingPartitionExprMismatch { .. 
} + ); // Put some data and flush in staging mode let rows_data = Rows { @@ -312,6 +376,7 @@ async fn test_staging_exit_success_with_manifests() { } async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) { + common_telemetry::init_default_ut_logging(); let mut env = TestEnv::new().await; let engine = env .create_engine(MitoConfig { @@ -330,16 +395,28 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) .await .unwrap(); + // Add some data and flush in staging mode to generate staging manifests + let rows_data = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows_data).await; + // Enter staging mode engine - .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader) + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) .await .unwrap(); // Add some data and flush in staging mode to generate staging manifests let rows_data = Rows { schema: column_schemas.clone(), - rows: build_rows(0, 5), + rows: build_rows(3, 8), }; put_rows(&engine, region_id, rows_data).await; @@ -357,7 +434,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) // Add more data and flush again to generate multiple staging manifests let rows_data2 = Rows { schema: column_schemas.clone(), - rows: build_rows(5, 10), + rows: build_rows(8, 10), }; put_rows(&engine, region_id, rows_data2).await; @@ -382,8 +459,11 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) .unwrap(); assert_eq!( staging_files_before.len(), - 2, - "Staging manifest directory should contain two files before exit" + // Two files for flush operation + // One file for entering staging mode + 3, + "Staging manifest directory should contain 3 files before exit, got: {:?}", + staging_files_before ); // Count normal manifest files before exit @@ -394,8 +474,11 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) .unwrap(); let normal_count_before = normal_files_before.len(); assert_eq!( - normal_count_before, 1, - "Normal manifest directory should initially contain one file" + // One file for table creation + // One file for flush operation + normal_count_before, + 2, + "Normal manifest directory should initially contain 2 files" ); // Try read data before exiting staging, SST files should be invisible @@ -403,8 +486,8 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let scanner = engine.scanner(region_id, request).await.unwrap(); assert_eq!( scanner.num_files(), - 0, - "No SST files should be scanned before exit" + 1, + "1 SST files should be scanned before exit" ); assert_eq!( scanner.num_memtables(), @@ -415,14 +498,20 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let batches = RecordBatches::try_collect(stream).await.unwrap(); let total_rows: usize = batches.iter().map(|rb| rb.num_rows()).sum(); assert_eq!( - total_rows, 0, - "No data should be readable before exit staging mode" + total_rows, 3, + "3 rows should be readable before exit staging mode" ); // Inspect SSTs from manifest let sst_entries = engine.all_ssts_from_manifest().await; - assert_eq!(sst_entries.len(), 2); - assert!(sst_entries.iter().all(|e| !e.visible)); + assert_eq!( + sst_entries.len(), + 3, + "sst entries should be 3, got: {:?}", + sst_entries + ); + assert_eq!(sst_entries.iter().filter(|e| 
e.visible).count(), 1); + assert_eq!(sst_entries.iter().filter(|e| !e.visible).count(), 2); // Exit staging mode successfully engine @@ -470,7 +559,7 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) let scanner = engine.scanner(region_id, request).await.unwrap(); assert_eq!( scanner.num_files(), - 2, + 3, "SST files should be scanned after exit" ); @@ -482,6 +571,209 @@ async fn test_staging_exit_success_with_manifests_with_format(flat_format: bool) // Inspect SSTs from manifest let sst_entries = engine.all_ssts_from_manifest().await; - assert_eq!(sst_entries.len(), 2); + assert_eq!(sst_entries.len(), 3); assert!(sst_entries.iter().all(|e| e.visible)); } + +#[tokio::test(flavor = "multi_thread")] +async fn test_write_stall_on_enter_staging() { + test_write_stall_on_enter_staging_with_format(false).await; + test_write_stall_on_enter_staging_with_format(true).await; +} + +async fn test_write_stall_on_enter_staging_with_format(flat_format: bool) { + let mut env = TestEnv::new().await; + let listener = Arc::new(NotifyEnterStagingResultListener::default()); + let engine = env + .create_engine_with( + MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }, + None, + Some(listener.clone()), + None, + ) + .await; + + let region_id = RegionId::new(1, 1); + let request = CreateRequestBuilder::new().build(); + + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let engine_cloned = engine.clone(); + let alter_job = tokio::spawn(async move { + engine_cloned + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) + .await + .unwrap(); + }); + // Make sure the loop is handling the alter request. + tokio::time::sleep(Duration::from_millis(100)).await; + + let column_schemas_cloned = column_schemas.clone(); + let engine_cloned = engine.clone(); + let put_job = tokio::spawn(async move { + let rows = Rows { + schema: column_schemas_cloned, + rows: build_rows(0, 3), + }; + put_rows(&engine_cloned, region_id, rows).await; + }); + // Make sure the loop is handling the put request. 
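Stepping back to the enter-staging behaviour exercised a little earlier in this staging test: entering staging twice with the same partition expression succeeds, while a different expression is rejected with StagingPartitionExprMismatch. A toy model of that guard, with hypothetical types (the real check lives inside the region worker):

#[derive(Debug, PartialEq)]
enum EnterStagingError {
    PartitionExprMismatch { current: String, requested: String },
}

#[derive(Default)]
struct StagingState {
    partition_expr: Option<String>,
}

impl StagingState {
    /// Entering staging is idempotent for the same expression and an error otherwise.
    fn enter_staging(&mut self, requested: &str) -> Result<(), EnterStagingError> {
        if let Some(current) = self.partition_expr.clone() {
            if current == requested {
                Ok(())
            } else {
                Err(EnterStagingError::PartitionExprMismatch {
                    current,
                    requested: requested.to_string(),
                })
            }
        } else {
            self.partition_expr = Some(requested.to_string());
            Ok(())
        }
    }
}

fn main() {
    let mut state = StagingState::default();
    assert!(state.enter_staging("partition_expr").is_ok());
    // Same expression again: accepted as a no-op.
    assert!(state.enter_staging("partition_expr").is_ok());
    // Different expression: rejected, mirroring StagingPartitionExprMismatch.
    assert!(state.enter_staging("").is_err());
}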
+ tokio::time::sleep(Duration::from_millis(100)).await; + + listener.wake_notify(); + alter_job.await.unwrap(); + put_job.await.unwrap(); + + let expected = "\ ++-------+---------+---------------------+ +| tag_0 | field_0 | ts | ++-------+---------+---------------------+ +| 0 | 0.0 | 1970-01-01T00:00:00 | +| 1 | 1.0 | 1970-01-01T00:00:01 | +| 2 | 2.0 | 1970-01-01T00:00:02 | ++-------+---------+---------------------+"; + let request = ScanRequest::default(); + let scanner = engine.scanner(region_id, request).await.unwrap(); + let stream = scanner.scan().await.unwrap(); + let batches = RecordBatches::try_collect(stream).await.unwrap(); + assert_eq!(expected, batches.pretty_print().unwrap()); +} + +#[tokio::test] +async fn test_enter_staging_clean_staging_manifest_error() { + common_telemetry::init_default_ut_logging(); + test_enter_staging_clean_staging_manifest_error_with_format(false).await; + test_enter_staging_clean_staging_manifest_error_with_format(true).await; +} + +struct MockLister { + path: String, + inner: Lister, +} + +impl List for MockLister { + async fn next(&mut self) -> MockResult> { + if self.path.contains("staging") { + return Err(MockError::new(ErrorKind::Unexpected, "mock error")); + } + self.inner.next().await + } +} + +struct MockWriter { + path: String, + inner: Writer, +} + +impl Write for MockWriter { + async fn write(&mut self, bs: Buffer) -> MockResult<()> { + self.inner.write(bs).await + } + + async fn close(&mut self) -> MockResult { + if self.path.contains("staging") { + return Err(MockError::new(ErrorKind::Unexpected, "mock error")); + } + self.inner.close().await + } + + async fn abort(&mut self) -> MockResult<()> { + self.inner.abort().await + } +} + +async fn test_enter_staging_error(env: &mut TestEnv, flat_format: bool) { + let engine = env + .create_engine(MitoConfig { + default_experimental_flat_format: flat_format, + ..Default::default() + }) + .await; + let region_id = RegionId::new(1024, 0); + let request = CreateRequestBuilder::new().build(); + engine + .handle_request(region_id, RegionRequest::Create(request)) + .await + .unwrap(); + + let err = engine + .handle_request( + region_id, + RegionRequest::EnterStaging(EnterStagingRequest { + partition_expr: PARTITION_EXPR.to_string(), + }), + ) + .await + .unwrap_err(); + assert_eq!(err.status_code(), StatusCode::StorageUnavailable); + let region = engine.get_region(region_id).unwrap(); + assert!( + region + .manifest_ctx + .manifest_manager + .read() + .await + .staging_manifest() + .is_none() + ); + let state = region.state(); + assert_eq!(state, RegionRoleState::Leader(RegionLeaderState::Writable)); +} + +async fn test_enter_staging_clean_staging_manifest_error_with_format(flat_format: bool) { + let mock_layer = MockLayerBuilder::default() + .lister_factory(Arc::new(|path, _args, lister| { + Box::new(MockLister { + path: path.to_string(), + inner: lister, + }) + })) + .build() + .unwrap(); + let mut env = TestEnv::new().await.with_mock_layer(mock_layer); + test_enter_staging_error(&mut env, flat_format).await; +} + +#[tokio::test] +async fn test_enter_staging_save_staging_manifest_error() { + common_telemetry::init_default_ut_logging(); + test_enter_staging_save_staging_manifest_error_with_format(false).await; + test_enter_staging_save_staging_manifest_error_with_format(true).await; +} + +async fn test_enter_staging_save_staging_manifest_error_with_format(flat_format: bool) { + let mock_layer = MockLayerBuilder::default() + .writer_factory(Arc::new(|path, _args, lister| { + Box::new(MockWriter { + 
path: path.to_string(), + inner: lister, + }) + })) + .build() + .unwrap(); + let mut env = TestEnv::new().await.with_mock_layer(mock_layer); + test_enter_staging_error(&mut env, flat_format).await; +} diff --git a/src/mito2/src/engine/sync_test.rs b/src/mito2/src/engine/sync_test.rs index 5d6d5802f2..6b98d4ba0f 100644 --- a/src/mito2/src/engine/sync_test.rs +++ b/src/mito2/src/engine/sync_test.rs @@ -151,7 +151,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { scan_check(&follower_engine, region_id, expected, 0, 0).await; // Returns error since the max manifest is 1 - let manifest_info = RegionManifestInfo::mito(2, 0); + let manifest_info = RegionManifestInfo::mito(2, 0, 0); let err = follower_engine .sync_region(region_id, manifest_info) .await @@ -159,7 +159,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) { let err = err.as_any().downcast_ref::().unwrap(); assert_matches!(err, Error::InstallManifestTo { .. }); - let manifest_info = RegionManifestInfo::mito(1, 0); + let manifest_info = RegionManifestInfo::mito(1, 0, 0); follower_engine .sync_region(region_id, manifest_info) .await @@ -264,7 +264,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) { scan_check(&follower_engine, region_id, expected, 0, 0).await; // Sync the region from the leader engine to the follower engine - let manifest_info = RegionManifestInfo::mito(2, 0); + let manifest_info = RegionManifestInfo::mito(2, 0, 0); follower_engine .sync_region(region_id, manifest_info) .await diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 2a6fc855bc..cda2c75403 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -19,6 +19,7 @@ use common_datasource::compression::CompressionType; use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; +use common_memory_manager; use common_runtime::JoinError; use common_time::Timestamp; use common_time::timestamp::TimeUnit; @@ -104,6 +105,15 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to serialize manifest, region_id: {}", region_id))] + SerializeManifest { + region_id: RegionId, + #[snafu(source)] + error: serde_json::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Invalid scan index, start: {}, end: {}", start, end))] InvalidScanIndex { start: ManifestVersion, @@ -232,6 +242,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Manifest missing for region {}", region_id))] + MissingManifest { + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("File consistency check failed for file {}: {}", file_id, reason))] InconsistentFile { file_id: FileId, @@ -254,6 +271,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to fetch manifests"))] + FetchManifests { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Partition expression missing for region {}", region_id))] MissingPartitionExpr { region_id: RegionId, @@ -1018,6 +1042,28 @@ pub enum Error { #[snafu(display("Manual compaction is override by following operations."))] ManualCompactionOverride {}, + #[snafu(display( + "Compaction memory limit exceeded for region {region_id}: required {required_bytes} bytes, limit {limit_bytes} bytes (policy: {policy})", + ))] + CompactionMemoryExhausted { + region_id: RegionId, + required_bytes: u64, + limit_bytes: u64, + policy: String, + #[snafu(implicit)] + location: Location, + }, + + 
#[snafu(display("Failed to acquire memory for region {region_id} (policy: {policy})"))] + MemoryAcquireFailed { + region_id: RegionId, + policy: String, + #[snafu(source)] + source: common_memory_manager::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Incompatible WAL provider change. This is typically caused by changing WAL provider in database config file without completely cleaning existing files. Global provider: {}, region provider: {}", global, @@ -1127,6 +1173,30 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display( + "Staging partition expr mismatch, manifest: {:?}, request: {}", + manifest_expr, + request_expr + ))] + StagingPartitionExprMismatch { + manifest_expr: Option, + request_expr: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Invalid source and target region, source: {}, target: {}", + source_region_id, + target_region_id + ))] + InvalidSourceAndTargetRegion { + source_region_id: RegionId, + target_region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -1172,7 +1242,9 @@ impl ErrorExt for Error { | FilesLost { .. } | InstallManifestTo { .. } | Unexpected { .. } - | SerializeColumnMetadata { .. } => StatusCode::Unexpected, + | SerializeColumnMetadata { .. } + | SerializeManifest { .. } + | StagingPartitionExprMismatch { .. } => StatusCode::Unexpected, RegionNotFound { .. } => StatusCode::RegionNotFound, ObjectStoreNotFound { .. } @@ -1190,9 +1262,11 @@ impl ErrorExt for Error { | DurationOutOfRange { .. } | MissingOldManifest { .. } | MissingNewManifest { .. } + | MissingManifest { .. } | NoOldManifests { .. } | MissingPartitionExpr { .. } - | SerializePartitionExpr { .. } => StatusCode::InvalidArguments, + | SerializePartitionExpr { .. } + | InvalidSourceAndTargetRegion { .. } => StatusCode::InvalidArguments, RegionMetadataNotFound { .. } | Join { .. } @@ -1211,6 +1285,8 @@ impl ErrorExt for Error { | Metadata { .. } | MitoManifestInfo { .. } => StatusCode::Internal, + FetchManifests { source, .. } => source.status_code(), + OpenRegion { source, .. } => source.status_code(), WriteParquet { .. } => StatusCode::StorageUnavailable, @@ -1283,6 +1359,10 @@ impl ErrorExt for Error { ManualCompactionOverride {} => StatusCode::Cancelled, + CompactionMemoryExhausted { .. } => StatusCode::RuntimeResourcesExhausted, + + MemoryAcquireFailed { source, .. } => source.status_code(), + IncompatibleWalProviderChange { .. } => StatusCode::InvalidArguments, ScanSeries { source, .. } => source.status_code(), diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 819a227e4b..058c5272c2 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; use std::time::Instant; -use common_telemetry::{debug, error, info, trace}; +use common_telemetry::{debug, error, info}; use datatypes::arrow::datatypes::SchemaRef; use either::Either; use partition::expr::PartitionExpr; @@ -89,6 +89,12 @@ pub trait WriteBufferManager: Send + Sync + std::fmt::Debug { /// Returns the total memory used by memtables. fn memory_usage(&self) -> usize; + + /// Returns the mutable memtable memory limit. + /// + /// The write buffer manager should flush memtables when the mutable memory usage + /// exceeds this limit. 
+ fn flush_limit(&self) -> usize; } pub type WriteBufferManagerRef = Arc; @@ -145,7 +151,7 @@ impl WriteBufferManagerImpl { impl WriteBufferManager for WriteBufferManagerImpl { fn should_flush_engine(&self) -> bool { let mutable_memtable_memory_usage = self.memory_active.load(Ordering::Relaxed); - if mutable_memtable_memory_usage > self.mutable_limit { + if mutable_memtable_memory_usage >= self.mutable_limit { debug!( "Engine should flush (over mutable limit), mutable_usage: {}, memory_usage: {}, mutable_limit: {}, global_limit: {}", mutable_memtable_memory_usage, @@ -157,23 +163,8 @@ impl WriteBufferManager for WriteBufferManagerImpl { } let memory_usage = self.memory_used.load(Ordering::Relaxed); - // If the memory exceeds the buffer size, we trigger more aggressive - // flush. But if already more than half memory is being flushed, - // triggering more flush may not help. We will hold it instead. if memory_usage >= self.global_write_buffer_size { - if mutable_memtable_memory_usage >= self.global_write_buffer_size / 2 { - debug!( - "Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \ - mutable_usage: {}.", - memory_usage, self.global_write_buffer_size, mutable_memtable_memory_usage - ); - return true; - } else { - trace!( - "Engine won't flush, memory_usage: {}, global_write_buffer_size: {}, mutable_usage: {}.", - memory_usage, self.global_write_buffer_size, mutable_memtable_memory_usage - ); - } + return true; } false @@ -205,10 +196,14 @@ impl WriteBufferManager for WriteBufferManagerImpl { fn memory_usage(&self) -> usize { self.memory_used.load(Ordering::Relaxed) } + + fn flush_limit(&self) -> usize { + self.mutable_limit + } } /// Reason of a flush task. -#[derive(Debug, IntoStaticStr)] +#[derive(Debug, IntoStaticStr, Clone, Copy, PartialEq, Eq)] pub enum FlushReason { /// Other reasons. Others, @@ -222,6 +217,8 @@ pub enum FlushReason { Periodically, /// Flush memtable during downgrading state. Downgrading, + /// Enter staging mode. + EnterStaging, } impl FlushReason { @@ -253,6 +250,8 @@ pub(crate) struct RegionFlushTask { pub(crate) index_options: IndexOptions, /// Semaphore to control flush concurrency. pub(crate) flush_semaphore: Arc, + /// Whether the region is in staging mode. + pub(crate) is_staging: bool, } impl RegionFlushTask { @@ -316,6 +315,7 @@ impl RegionFlushTask { _timer: timer, edit, memtables_to_remove, + is_staging: self.is_staging, }; WorkerRequest::Background { region_id: self.region_id, @@ -398,7 +398,10 @@ impl RegionFlushTask { flushed_sequence: Some(version_data.committed_sequence), committed_sequence: None, }; - info!("Applying {edit:?} to region {}", self.region_id); + info!( + "Applying {edit:?} to region {}, is_staging: {}", + self.region_id, self.is_staging + ); let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())); @@ -417,11 +420,12 @@ impl RegionFlushTask { // add a cleanup job to remove them later. 
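// With this change the engine flushes as soon as mutable memtable usage
// reaches the mutable limit, or whenever total tracked memory reaches the
// global write buffer size; the old "hold the flush if more than half of
// memory is already being flushed" branch is gone. A simplified,
// self-contained mirror of that decision; the struct and field names below
// are illustrative, not the real WriteBufferManagerImpl:

use std::sync::atomic::{AtomicUsize, Ordering};

struct BufferWatermarks {
    mutable_limit: usize,
    global_write_buffer_size: usize,
    mutable_usage: AtomicUsize,
    total_usage: AtomicUsize,
}

impl BufferWatermarks {
    fn should_flush(&self) -> bool {
        let mutable = self.mutable_usage.load(Ordering::Relaxed);
        if mutable >= self.mutable_limit {
            return true;
        }
        self.total_usage.load(Ordering::Relaxed) >= self.global_write_buffer_size
    }

    // Mirrors the new `flush_limit()` accessor: callers can read the mutable
    // limit instead of hard-coding their own threshold.
    fn flush_limit(&self) -> usize {
        self.mutable_limit
    }
}

fn main() {
    let w = BufferWatermarks {
        mutable_limit: 500,
        global_write_buffer_size: 1000,
        mutable_usage: AtomicUsize::new(450),
        total_usage: AtomicUsize::new(1100),
    };
    // Total usage is over the global limit, so a flush is requested even
    // though mutable usage (450) is still below flush_limit() (500).
    assert!(w.should_flush());
    assert_eq!(w.flush_limit(), 500);
}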
let version = self .manifest_ctx - .update_manifest(expected_state, action_list) + .update_manifest(expected_state, action_list, self.is_staging) .await?; info!( - "Successfully update manifest version to {version}, region: {}, reason: {}", + "Successfully update manifest version to {version}, region: {}, is_staging: {}, reason: {}", self.region_id, + self.is_staging, self.reason.as_str() ); @@ -636,9 +640,11 @@ impl RegionFlushTask { time_range: sst_info.time_range, level: 0, file_size: sst_info.file_size, + max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size, available_indexes: sst_info.index_metadata.build_available_indexes(), + indexes: sst_info.index_metadata.build_indexes(), index_file_size: sst_info.index_metadata.file_size, - index_file_id: None, + index_version: 0, num_rows: sst_info.num_rows as u64, num_row_groups: sst_info.num_row_groups, sequence: NonZeroU64::new(max_sequence), @@ -725,11 +731,13 @@ async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) -> // dedup according to merge mode match options.merge_mode.unwrap_or(MergeMode::LastRow) { MergeMode::LastRow => { - Box::new(DedupReader::new(merge_reader, LastRow::new(false))) as _ - } - MergeMode::LastNonNull => { - Box::new(DedupReader::new(merge_reader, LastNonNull::new(false))) as _ + Box::new(DedupReader::new(merge_reader, LastRow::new(false), None)) as _ } + MergeMode::LastNonNull => Box::new(DedupReader::new( + merge_reader, + LastNonNull::new(false), + None, + )) as _, } }; Source::Reader(maybe_dedup) @@ -766,7 +774,12 @@ fn memtable_flat_sources( let iter = only_range.build_record_batch_iter(None)?; // Dedup according to append mode and merge mode. // Even single range may have duplicate rows. - let iter = maybe_dedup_one(options, field_column_start, iter); + let iter = maybe_dedup_one( + options.append_mode, + options.merge_mode(), + field_column_start, + iter, + ); flat_sources.sources.push(FlatSource::Iter(iter)); }; } else { @@ -834,17 +847,18 @@ fn merge_and_dedup( Ok(maybe_dedup) } -fn maybe_dedup_one( - options: &RegionOptions, +pub fn maybe_dedup_one( + append_mode: bool, + merge_mode: MergeMode, field_column_start: usize, input_iter: BoxedRecordBatchIterator, ) -> BoxedRecordBatchIterator { - if options.append_mode { + if append_mode { // No dedup in append mode input_iter } else { // Dedup according to merge mode. - match options.merge_mode() { + match merge_mode { MergeMode::LastRow => { Box::new(FlatDedupIterator::new(input_iter, FlatLastRow::new(false))) } @@ -878,6 +892,31 @@ impl FlushScheduler { self.region_status.contains_key(®ion_id) } + fn schedule_flush_task( + &mut self, + version_control: &VersionControlRef, + task: RegionFlushTask, + ) -> Result<()> { + let region_id = task.region_id; + + // If current region doesn't have flush status, we can flush the region directly. + if let Err(e) = version_control.freeze_mutable() { + error!(e; "Failed to freeze the mutable memtable for region {}", region_id); + + return Err(e); + } + // Submit a flush job. + let job = task.into_flush_job(version_control); + if let Err(e) = self.scheduler.schedule(job) { + // If scheduler returns error, senders in the job will be dropped and waiters + // can get recv errors. + error!(e; "Failed to schedule flush job for region {}", region_id); + + return Err(e); + } + Ok(()) + } + /// Schedules a flush `task` for specific `region`. 
pub(crate) fn schedule_flush( &mut self, @@ -900,46 +939,21 @@ impl FlushScheduler { .with_label_values(&[task.reason.as_str()]) .inc(); + // If current region has flush status, merge the task. + if let Some(flush_status) = self.region_status.get_mut(®ion_id) { + // Checks whether we can flush the region now. + debug!("Merging flush task for region {}", region_id); + flush_status.merge_task(task); + return Ok(()); + } + + self.schedule_flush_task(version_control, task)?; + // Add this region to status map. - let flush_status = self - .region_status - .entry(region_id) - .or_insert_with(|| FlushStatus::new(region_id, version_control.clone())); - // Checks whether we can flush the region now. - if flush_status.flushing { - // There is already a flush job running. - flush_status.merge_task(task); - return Ok(()); - } - - // TODO(yingwen): We can merge with pending and execute directly. - // If there are pending tasks, then we should push it to pending list. - if flush_status.pending_task.is_some() { - flush_status.merge_task(task); - return Ok(()); - } - - // Now we can flush the region directly. - if let Err(e) = version_control.freeze_mutable() { - error!(e; "Failed to freeze the mutable memtable for region {}", region_id); - - // Remove from region status if we can't freeze the mutable memtable. - self.region_status.remove(®ion_id); - return Err(e); - } - // Submit a flush job. - let job = task.into_flush_job(version_control); - if let Err(e) = self.scheduler.schedule(job) { - // If scheduler returns error, senders in the job will be dropped and waiters - // can get recv errors. - error!(e; "Failed to schedule flush job for region {}", region_id); - - // Remove from region status if we can't submit the task. - self.region_status.remove(®ion_id); - return Err(e); - } - - flush_status.flushing = true; + let _ = self.region_status.insert( + region_id, + FlushStatus::new(region_id, version_control.clone()), + ); Ok(()) } @@ -956,48 +970,56 @@ impl FlushScheduler { Vec, )> { let flush_status = self.region_status.get_mut(®ion_id)?; - - // This region doesn't have running flush job. - flush_status.flushing = false; - - let pending_requests = if flush_status.pending_task.is_none() { + // If region doesn't have any pending flush task, we need to remove it from the status. + if flush_status.pending_task.is_none() { // The region doesn't have any pending flush task. // Safety: The flush status must exist. + debug!( + "Region {} doesn't have any pending flush task, removing it from the status", + region_id + ); let flush_status = self.region_status.remove(®ion_id).unwrap(); - Some(( + return Some(( flush_status.pending_ddls, flush_status.pending_writes, flush_status.pending_bulk_writes, - )) - } else { - let version_data = flush_status.version_control.current(); - if version_data.version.memtables.is_empty() { - // The region has nothing to flush, we also need to remove it from the status. - // Safety: The pending task is not None. - let task = flush_status.pending_task.take().unwrap(); - // The region has nothing to flush. We can notify pending task. - task.on_success(); - // `schedule_next_flush()` may pick up the same region to flush, so we must remove - // it from the status to avoid leaking pending requests. - // Safety: The flush status must exist. 
- let flush_status = self.region_status.remove(®ion_id).unwrap(); - Some(( - flush_status.pending_ddls, - flush_status.pending_writes, - flush_status.pending_bulk_writes, - )) - } else { - // We can flush the region again, keep it in the region status. - None - } - }; - - // Schedule next flush job. - if let Err(e) = self.schedule_next_flush() { - error!(e; "Flush of region {} is successful, but failed to schedule next flush", region_id); + )); } - pending_requests + // If region has pending task, but has nothing to flush, we need to remove it from the status. + let version_data = flush_status.version_control.current(); + if version_data.version.memtables.is_empty() { + // The region has nothing to flush, we also need to remove it from the status. + // Safety: The pending task is not None. + let task = flush_status.pending_task.take().unwrap(); + // The region has nothing to flush. We can notify pending task. + task.on_success(); + debug!( + "Region {} has nothing to flush, removing it from the status", + region_id + ); + // Safety: The flush status must exist. + let flush_status = self.region_status.remove(®ion_id).unwrap(); + return Some(( + flush_status.pending_ddls, + flush_status.pending_writes, + flush_status.pending_bulk_writes, + )); + } + + // If region has pending task and has something to flush, we need to schedule it. + debug!("Scheduling pending flush task for region {}", region_id); + // Safety: The flush status must exist. + let task = flush_status.pending_task.take().unwrap(); + let version_control = flush_status.version_control.clone(); + if let Err(err) = self.schedule_flush_task(&version_control, task) { + error!( + err; + "Flush succeeded for region {region_id}, but failed to schedule next flush for it." + ); + } + // We can flush the region again, keep it in the region status. + None } /// Notifies the scheduler that the flush job is failed. @@ -1013,11 +1035,6 @@ impl FlushScheduler { // Fast fail: cancels all pending tasks and sends error to their waiters. flush_status.on_failure(err); - - // Still tries to schedule a new flush. - if let Err(e) = self.schedule_next_flush() { - error!(e; "Failed to schedule next flush after region {} flush is failed", region_id); - } } /// Notifies the scheduler that the region is dropped. @@ -1088,30 +1105,6 @@ impl FlushScheduler { .map(|status| !status.pending_ddls.is_empty()) .unwrap_or(false) } - - /// Schedules a new flush task when the scheduler can submit next task. - pub(crate) fn schedule_next_flush(&mut self) -> Result<()> { - debug_assert!( - self.region_status - .values() - .all(|status| status.flushing || status.pending_task.is_some()) - ); - - // Get the first region from status map. - let Some(flush_status) = self - .region_status - .values_mut() - .find(|status| status.pending_task.is_some()) - else { - return Ok(()); - }; - debug_assert!(!flush_status.flushing); - let task = flush_status.pending_task.take().unwrap(); - let region_id = flush_status.region_id; - let version_control = flush_status.version_control.clone(); - - self.schedule_flush(region_id, &version_control, task) - } } impl Drop for FlushScheduler { @@ -1131,11 +1124,6 @@ struct FlushStatus { region_id: RegionId, /// Version control of the region. version_control: VersionControlRef, - /// There is a flush task running. - /// - /// It is possible that a region is not flushing but has pending task if the scheduler - /// doesn't schedules this region. - flushing: bool, /// Task waiting for next flush. pending_task: Option, /// Pending ddl requests. 
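// The scheduler no longer tracks a `flushing` flag or a global
// `schedule_next_flush`: a region's presence in `region_status` now means a
// flush is in flight, a second request is merged into its pending task, and
// on success the pending task (if any, and if there is still data to flush)
// is scheduled for the same region right away. A minimal sketch of that
// per-region state machine; the types below are stand-ins, not the real
// FlushScheduler:

use std::collections::HashMap;

#[derive(Default)]
struct Status {
    pending: Option<&'static str>,
}

#[derive(Default)]
struct Scheduler {
    running: HashMap<u64, Status>,
    submitted: Vec<&'static str>,
}

impl Scheduler {
    fn schedule(&mut self, region: u64, task: &'static str) {
        if let Some(status) = self.running.get_mut(&region) {
            // A flush is already in flight: remember (merge) the new task.
            status.pending = Some(task);
            return;
        }
        self.submitted.push(task);
        self.running.insert(region, Status::default());
    }

    fn on_success(&mut self, region: u64, has_data: bool) {
        let Some(status) = self.running.get_mut(&region) else { return };
        match status.pending.take() {
            // Something is pending and there is data: flush again, keep status.
            Some(task) if has_data => self.submitted.push(task),
            // Nothing pending (or nothing left to flush): drop the status so
            // queued DDL/write requests can be handed back to the worker.
            _ => {
                self.running.remove(&region);
            }
        }
    }
}

fn main() {
    let mut s = Scheduler::default();
    s.schedule(1, "flush#1");
    s.schedule(1, "flush#2"); // merged as the pending task
    assert_eq!(s.submitted, vec!["flush#1"]);
    s.on_success(1, true);
    assert_eq!(s.submitted, vec!["flush#1", "flush#2"]);
}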
@@ -1151,7 +1139,6 @@ impl FlushStatus { FlushStatus { region_id, version_control, - flushing: false, pending_task: None, pending_ddls: Vec::new(), pending_writes: Vec::new(), @@ -1243,10 +1230,12 @@ mod tests { // Global usage is still 1100. manager.schedule_free_mem(200); assert!(manager.should_flush_engine()); + assert!(manager.should_stall()); - // More than global limit, but mutable (1100-200-450=450) is not enough (< 500). + // More than global limit, mutable (1100-200-450=450) is less than mutable limit (< 500). manager.schedule_free_mem(450); - assert!(!manager.should_flush_engine()); + assert!(manager.should_flush_engine()); + assert!(manager.should_stall()); // Now mutable is enough. manager.reserve_mem(50); @@ -1291,6 +1280,7 @@ mod tests { .await, index_options: IndexOptions::default(), flush_semaphore: Arc::new(Semaphore::new(2)), + is_staging: false, }; task.push_sender(OptionOutputTx::from(output_tx)); scheduler @@ -1333,6 +1323,7 @@ mod tests { manifest_ctx: manifest_ctx.clone(), index_options: IndexOptions::default(), flush_semaphore: Arc::new(Semaphore::new(2)), + is_staging: false, }) .collect(); // Schedule first task. @@ -1491,4 +1482,92 @@ mod tests { assert_eq!(2, total_rows, "append_mode should preserve duplicates"); } } + + #[tokio::test] + async fn test_schedule_pending_request_on_flush_success() { + common_telemetry::init_default_ut_logging(); + let job_scheduler = Arc::new(VecScheduler::default()); + let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone()); + let (tx, _rx) = mpsc::channel(4); + let mut scheduler = env.mock_flush_scheduler(); + let mut builder = VersionControlBuilder::new(); + // Overwrites the empty memtable builder. + builder.set_memtable_builder(Arc::new(TimeSeriesMemtableBuilder::default())); + let version_control = Arc::new(builder.build()); + // Writes data to the memtable so it is not empty. + let version_data = version_control.current(); + write_rows_to_version(&version_data.version, "host0", 0, 10); + let manifest_ctx = env + .mock_manifest_context(version_data.version.metadata.clone()) + .await; + // Creates 2 tasks. + let mut tasks: Vec<_> = (0..2) + .map(|_| RegionFlushTask { + region_id: builder.region_id(), + reason: FlushReason::Others, + senders: Vec::new(), + request_sender: tx.clone(), + access_layer: env.access_layer.clone(), + listener: WorkerListener::default(), + engine_config: Arc::new(MitoConfig::default()), + row_group_size: None, + cache_manager: Arc::new(CacheManager::default()), + manifest_ctx: manifest_ctx.clone(), + index_options: IndexOptions::default(), + flush_semaphore: Arc::new(Semaphore::new(2)), + is_staging: false, + }) + .collect(); + // Schedule first task. + let task = tasks.pop().unwrap(); + scheduler + .schedule_flush(builder.region_id(), &version_control, task) + .unwrap(); + // Should schedule 1 flush. + assert_eq!(1, scheduler.region_status.len()); + assert_eq!(1, job_scheduler.num_jobs()); + // Schedule second task. + let task = tasks.pop().unwrap(); + scheduler + .schedule_flush(builder.region_id(), &version_control, task) + .unwrap(); + assert!( + scheduler + .region_status + .get(&builder.region_id()) + .unwrap() + .pending_task + .is_some() + ); + + // Check the new version. + let version_data = version_control.current(); + assert_eq!(0, version_data.version.memtables.immutables()[0].id()); + // Assumes the flush job is finished. 
+ version_control.apply_edit( + Some(RegionEdit { + files_to_add: Vec::new(), + files_to_remove: Vec::new(), + timestamp_ms: None, + compaction_time_window: None, + flushed_entry_id: None, + flushed_sequence: None, + committed_sequence: None, + }), + &[0], + builder.file_purger(), + ); + write_rows_to_version(&version_data.version, "host1", 0, 10); + scheduler.on_flush_success(builder.region_id()); + assert_eq!(2, job_scheduler.num_jobs()); + // The pending task is cleared. + assert!( + scheduler + .region_status + .get(&builder.region_id()) + .unwrap() + .pending_task + .is_none() + ); + } } diff --git a/src/mito2/src/gc.rs b/src/mito2/src/gc.rs index 822fd6820d..d4c02b0b0a 100644 --- a/src/mito2/src/gc.rs +++ b/src/mito2/src/gc.rs @@ -21,7 +21,7 @@ //! `unknown files`: files that are not recorded in the manifest, usually due to saved checkpoint which remove actions before the checkpoint. //! -use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::sync::Arc; use std::time::Duration; @@ -30,7 +30,7 @@ use common_telemetry::{debug, error, info, warn}; use common_time::Timestamp; use object_store::{Entry, Lister}; use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt as _, ensure}; +use snafu::ResultExt as _; use store_api::storage::{FileId, FileRefsManifest, GcReport, RegionId}; use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use tokio_stream::StreamExt; @@ -39,15 +39,16 @@ use crate::access_layer::AccessLayerRef; use crate::cache::CacheManagerRef; use crate::config::MitoConfig; use crate::error::{ - DurationOutOfRangeSnafu, EmptyRegionDirSnafu, JoinSnafu, OpenDalSnafu, RegionNotFoundSnafu, - Result, TooManyGcJobsSnafu, UnexpectedSnafu, + DurationOutOfRangeSnafu, JoinSnafu, OpenDalSnafu, Result, TooManyGcJobsSnafu, UnexpectedSnafu, }; -use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; -use crate::manifest::storage::manifest_compress_type; -use crate::metrics::GC_DEL_FILE_CNT; -use crate::region::opener::new_manifest_dir; +use crate::manifest::action::RegionManifest; +use crate::metrics::GC_DELETE_FILE_CNT; +use crate::region::{MitoRegionRef, RegionRoleState}; use crate::sst::file::delete_files; -use crate::sst::location::{self, region_dir_from_table_dir}; +use crate::sst::location::{self}; + +#[cfg(test)] +mod worker_test; /// Limit the amount of concurrent GC jobs on the datanode pub struct GcLimiter { @@ -95,16 +96,18 @@ impl GcLimiter { } #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default)] pub struct GcConfig { /// Whether GC is enabled. pub enable: bool, /// Lingering time before deleting files. /// Should be long enough to allow long running queries to finish. + /// If set to None, then unused files will be deleted immediately. /// /// TODO(discord9): long running queries should actively write tmp manifest files /// to prevent deletion of files they are using. #[serde(with = "humantime_serde")] - pub lingering_time: Duration, + pub lingering_time: Option, /// Lingering time before deleting unknown files(files with undetermine expel time). /// expel time is the time when the file is considered as removed, as in removed from the manifest. 
/// This should only occur rarely, as manifest keep tracks in `removed_files` field @@ -124,10 +127,10 @@ impl Default for GcConfig { fn default() -> Self { Self { enable: false, - // expect long running queries to be finished within a reasonable time - lingering_time: Duration::from_secs(60 * 5), - // 6 hours, for unknown expel time, which is when this file get removed from manifest, it should rarely happen, can keep it longer - unknown_file_lingering_time: Duration::from_secs(60 * 60 * 6), + // expect long running queries to be finished(or at least be able to notify it's using a deleted file) within a reasonable time + lingering_time: Some(Duration::from_secs(60)), + // 1 hours, for unknown expel time, which is when this file get removed from manifest, it should rarely happen, can keep it longer + unknown_file_lingering_time: Duration::from_secs(60 * 60), max_concurrent_lister_per_gc_job: 32, max_concurrent_gc_job: 4, } @@ -137,10 +140,9 @@ impl Default for GcConfig { pub struct LocalGcWorker { pub(crate) access_layer: AccessLayerRef, pub(crate) cache_manager: Option, - pub(crate) manifest_mgrs: HashMap, + pub(crate) regions: BTreeMap, /// Lingering time before deleting files. pub(crate) opt: GcConfig, - pub(crate) manifest_open_config: ManifestOpenConfig, /// Tmp ref files manifest, used to determine which files are still in use by ongoing queries. /// /// Also contains manifest versions of regions when the tmp ref files are generated. @@ -186,81 +188,29 @@ impl LocalGcWorker { pub async fn try_new( access_layer: AccessLayerRef, cache_manager: Option, - regions_to_gc: BTreeSet, + regions_to_gc: BTreeMap, opt: GcConfig, - manifest_open_config: ManifestOpenConfig, file_ref_manifest: FileRefsManifest, limiter: &GcLimiterRef, full_file_listing: bool, ) -> Result { - let table_id = regions_to_gc - .first() - .context(UnexpectedSnafu { - reason: "Expect at least one region, found none", - })? - .table_id(); let permit = limiter.permit()?; - let mut zelf = Self { + + Ok(Self { access_layer, cache_manager, - manifest_mgrs: HashMap::new(), + regions: regions_to_gc, opt, - manifest_open_config, file_ref_manifest, _permit: permit, full_file_listing, - }; - - // dedup just in case - for region_id in regions_to_gc { - ensure!( - region_id.table_id() == table_id, - UnexpectedSnafu { - reason: format!( - "All regions should belong to the same table, found region {} and table {}", - region_id, table_id - ), - } - ); - let mgr = zelf.open_mgr_for(region_id).await?; - zelf.manifest_mgrs.insert(region_id, mgr); - } - - Ok(zelf) + }) } /// Get tmp ref files for all current regions - /// - /// Outdated regions are added to `outdated_regions` set - pub async fn read_tmp_ref_files( - &self, - outdated_regions: &mut HashSet, - ) -> Result>> { - for (region_id, region_mgr) in &self.manifest_mgrs { - let current_version = region_mgr.manifest().manifest_version; - if ¤t_version - > self - .file_ref_manifest - .manifest_version - .get(region_id) - .with_context(|| UnexpectedSnafu { - reason: format!( - "Region {} not found in tmp ref manifest version map", - region_id - ), - })? 
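// `lingering_time` is now optional: `Some(d)` keeps just-removed files around
// for `d` so in-flight queries can still read them, while `None` lets the GC
// worker delete unused files immediately (which the GC tests below rely on),
// and `#[serde(default)]` lets any omitted field fall back to its Default
// value. A self-contained sketch of the retention decision; the type below is
// illustrative, not the real GcConfig:

use std::time::{Duration, SystemTime};

struct RetentionPolicy {
    lingering_time: Option<Duration>,
}

impl RetentionPolicy {
    // Returns true if a file removed at `removed_at` may already be deleted.
    fn can_delete(&self, removed_at: SystemTime, now: SystemTime) -> bool {
        match self.lingering_time {
            // No lingering window: delete as soon as the file is unused.
            None => true,
            Some(linger) => now
                .duration_since(removed_at)
                .map(|age| age >= linger)
                .unwrap_or(false),
        }
    }
}

fn main() {
    let now = SystemTime::now();
    let removed_at = now - Duration::from_secs(30);

    let immediate = RetentionPolicy { lingering_time: None };
    let one_minute = RetentionPolicy {
        lingering_time: Some(Duration::from_secs(60)),
    };

    assert!(immediate.can_delete(removed_at, now));
    assert!(!one_minute.can_delete(removed_at, now)); // still inside the linger window
}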
- { - outdated_regions.insert(*region_id); - } - } - // TODO(discord9): verify manifest version before reading tmp ref files - + pub async fn read_tmp_ref_files(&self) -> Result>> { let mut tmp_ref_files = HashMap::new(); for (region_id, file_refs) in &self.file_ref_manifest.file_refs { - if outdated_regions.contains(region_id) { - // skip outdated regions - continue; - } tmp_ref_files .entry(*region_id) .or_insert_with(HashSet::new) @@ -279,26 +229,38 @@ impl LocalGcWorker { info!("LocalGcWorker started"); let now = std::time::Instant::now(); - let mut outdated_regions = HashSet::new(); let mut deleted_files = HashMap::new(); - let tmp_ref_files = self.read_tmp_ref_files(&mut outdated_regions).await?; - for region_id in self.manifest_mgrs.keys() { - debug!("Doing gc for region {}", region_id); + let tmp_ref_files = self.read_tmp_ref_files().await?; + for (region_id, region) in &self.regions { + let per_region_time = std::time::Instant::now(); + if region.manifest_ctx.current_state() == RegionRoleState::Follower { + return UnexpectedSnafu { + reason: format!( + "Region {} is in Follower state, should not run GC on follower regions", + region_id + ), + } + .fail(); + } let tmp_ref_files = tmp_ref_files .get(region_id) .cloned() .unwrap_or_else(HashSet::new); - let files = self.do_region_gc(*region_id, &tmp_ref_files).await?; + let files = self.do_region_gc(region.clone(), &tmp_ref_files).await?; deleted_files.insert(*region_id, files); - debug!("Gc for region {} finished", region_id); + debug!( + "GC for region {} took {} secs.", + region_id, + per_region_time.elapsed().as_secs_f32() + ); } info!( "LocalGcWorker finished after {} secs.", - now.elapsed().as_secs() + now.elapsed().as_secs_f32() ); let report = GcReport { deleted_files, - need_retry_regions: outdated_regions.into_iter().collect(), + need_retry_regions: HashSet::new(), }; Ok(report) } @@ -319,62 +281,64 @@ impl LocalGcWorker { /// to avoid deleting files that are still needed. pub async fn do_region_gc( &self, - region_id: RegionId, + region: MitoRegionRef, tmp_ref_files: &HashSet, ) -> Result> { + let region_id = region.region_id(); + debug!("Doing gc for region {}", region_id); - let manifest = self - .manifest_mgrs - .get(®ion_id) - .context(RegionNotFoundSnafu { region_id })? - .manifest(); + // do the time consuming listing only when full_file_listing is true + // and do it first to make sure we have the latest manifest etc. + let all_entries = if self.full_file_listing { + self.list_from_object_store(®ion).await? 
+ } else { + vec![] + }; + + let manifest = region.manifest_ctx.manifest().await; let region_id = manifest.metadata.region_id; let current_files = &manifest.files; - let recently_removed_files = self.get_removed_files_expel_times(region_id).await?; + let recently_removed_files = self.get_removed_files_expel_times(&manifest).await?; if recently_removed_files.is_empty() { // no files to remove, skip debug!("No recently removed files to gc for region {}", region_id); } - debug!( - "Found {} recently removed files sets for region {}", - recently_removed_files.len(), - region_id - ); + let removed_file_cnt = recently_removed_files + .values() + .map(|s| s.len()) + .sum::(); - let concurrency = (current_files.len() / Self::CONCURRENCY_LIST_PER_FILES) - .max(1) - .min(self.opt.max_concurrent_lister_per_gc_job); - - let in_used = current_files + let in_used: HashSet = current_files .keys() .cloned() .chain(tmp_ref_files.clone().into_iter()) .collect(); let unused_files = self - .list_to_be_deleted_files(region_id, in_used, recently_removed_files, concurrency) + .list_to_be_deleted_files(region_id, &in_used, recently_removed_files, all_entries) .await?; - let unused_len = unused_files.len(); + let unused_file_cnt = unused_files.len(); debug!( - "Found {} unused files to delete for region {}", - unused_len, region_id + "gc: for region {region_id}: In manifest files: {}, Tmp ref file cnt: {}, In-used files: {}, recently removed files: {}, Unused files to delete: {} ", + current_files.len(), + tmp_ref_files.len(), + in_used.len(), + removed_file_cnt, + unused_files.len() ); - let file_pairs: Vec<(FileId, FileId)> = unused_files - .iter() - .filter_map(|file_id| { - current_files - .get(file_id) - .map(|meta| (meta.file_id().file_id(), meta.index_file_id().file_id())) - }) - .collect(); + // TODO(discord9): for now, ignore async index file as it's design is not stable, need to be improved once + // index file design is stable + let file_pairs: Vec<(FileId, u64)> = + unused_files.iter().map(|file_id| (*file_id, 0)).collect(); + // TODO(discord9): gc worker need another major refactor to support versioned index files - info!( + debug!( "Found {} unused index files to delete for region {}", file_pairs.len(), region_id @@ -384,13 +348,16 @@ impl LocalGcWorker { debug!( "Successfully deleted {} unused files for region {}", - unused_len, region_id + unused_file_cnt, region_id ); + // TODO(discord9): update region manifest about deleted files + self.update_manifest_removed_files(®ion, unused_files.clone()) + .await?; Ok(unused_files) } - async fn delete_files(&self, region_id: RegionId, file_ids: &[(FileId, FileId)]) -> Result<()> { + async fn delete_files(&self, region_id: RegionId, file_ids: &[(FileId, u64)]) -> Result<()> { delete_files( region_id, file_ids, @@ -401,40 +368,32 @@ impl LocalGcWorker { .await?; // FIXME(discord9): if files are already deleted before calling delete_files, the metric will be inaccurate, no clean way to fix it now - GC_DEL_FILE_CNT.add(file_ids.len() as i64); + GC_DELETE_FILE_CNT.add(file_ids.len() as i64); Ok(()) } - /// Get the manifest manager for the region. 
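// Per region, the GC worker now treats a file as "in use" if it is either in
// the current manifest or pinned by the temporary file-reference manifest,
// and only candidates outside that set (recently removed files, plus orphan
// objects in full-listing mode) are deleted; the deleted ids are then cleared
// from the manifest's `removed_files`. A minimal sketch of that set
// arithmetic, with u64 standing in for FileId:

use std::collections::HashSet;

fn unused_files(
    manifest_files: &HashSet<u64>,
    tmp_refs: &HashSet<u64>,
    recently_removed: &HashSet<u64>,
) -> Vec<u64> {
    let in_used: HashSet<u64> = manifest_files.union(tmp_refs).copied().collect();
    recently_removed
        .iter()
        .filter(|id| !in_used.contains(id))
        .copied()
        .collect()
}

fn main() {
    let manifest: HashSet<u64> = [1, 2].into();
    let refs: HashSet<u64> = [3].into(); // pinned by an ongoing query
    let removed: HashSet<u64> = [3, 4].into();
    let mut to_delete = unused_files(&manifest, &refs, &removed);
    to_delete.sort();
    assert_eq!(to_delete, vec![4]); // file 3 survives because it is still referenced
}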
- async fn open_mgr_for(&self, region_id: RegionId) -> Result { - let table_dir = self.access_layer.table_dir(); - let path_type = self.access_layer.path_type(); - let mito_config = &self.manifest_open_config; + /// Update region manifest for clear the actually deleted files + async fn update_manifest_removed_files( + &self, + region: &MitoRegionRef, + deleted_files: Vec, + ) -> Result<()> { + let deleted_file_cnt = deleted_files.len(); + debug!( + "Trying to update manifest for {deleted_file_cnt} removed files for region {}", + region.region_id() + ); - let region_manifest_options = RegionManifestOptions { - manifest_dir: new_manifest_dir(®ion_dir_from_table_dir( - table_dir, region_id, path_type, - )), - object_store: self.access_layer.object_store().clone(), - compress_type: manifest_compress_type(mito_config.compress_manifest), - checkpoint_distance: mito_config.manifest_checkpoint_distance, - remove_file_options: RemoveFileOptions { - keep_count: mito_config.experimental_manifest_keep_removed_file_count, - keep_ttl: mito_config.experimental_manifest_keep_removed_file_ttl, - }, - }; + let mut manager = region.manifest_ctx.manifest_manager.write().await; + let cnt = deleted_files.len(); + manager.clear_deleted_files(deleted_files); + debug!( + "Updated region_id={} region manifest to clear {cnt} deleted files", + region.region_id(), + ); - RegionManifestManager::open( - region_manifest_options, - Default::default(), - Default::default(), - ) - .await? - .context(EmptyRegionDirSnafu { - region_id, - region_dir: ®ion_dir_from_table_dir(table_dir, region_id, path_type), - }) + Ok(()) } /// Get all the removed files in delta manifest files and their expel times. @@ -443,14 +402,8 @@ impl LocalGcWorker { /// pub async fn get_removed_files_expel_times( &self, - region_id: RegionId, + region_manifest: &Arc, ) -> Result>> { - let region_manifest = self - .manifest_mgrs - .get(®ion_id) - .context(RegionNotFoundSnafu { region_id })? - .manifest(); - let mut ret = BTreeMap::new(); for files in ®ion_manifest.removed_files.removed_files { let expel_time = Timestamp::new_millisecond(files.removed_at); @@ -493,6 +446,32 @@ impl LocalGcWorker { Ok(listers) } + /// List all files in the region directory. + /// Returns a vector of all file entries found. + /// This might take a long time if there are many files in the region directory. + async fn list_from_object_store(&self, region: &MitoRegionRef) -> Result> { + let start = tokio::time::Instant::now(); + let region_id = region.region_id(); + let manifest = region.manifest_ctx.manifest().await; + let current_files = &manifest.files; + let concurrency = (current_files.len() / Self::CONCURRENCY_LIST_PER_FILES) + .max(1) + .min(self.opt.max_concurrent_lister_per_gc_job); + + let listers = self.partition_region_files(region_id, concurrency).await?; + let lister_cnt = listers.len(); + + // Step 2: Concurrently list all files in the region directory + let all_entries = self.list_region_files_concurrent(listers).await?; + let cnt = all_entries.len(); + info!( + "gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}.", + start.elapsed().as_secs_f64(), + region_id + ); + Ok(all_entries) + } + /// Concurrently list all files in the region directory using the provided listers. /// Returns a vector of all file entries found across all partitions. 
async fn list_region_files_concurrent( @@ -621,18 +600,22 @@ impl LocalGcWorker { pub async fn list_to_be_deleted_files( &self, region_id: RegionId, - in_used: HashSet, + in_used: &HashSet, recently_removed_files: BTreeMap>, - concurrency: usize, + all_entries: Vec, ) -> Result> { - let start = tokio::time::Instant::now(); let now = chrono::Utc::now(); - let may_linger_until = now - - chrono::Duration::from_std(self.opt.lingering_time).with_context(|_| { - DurationOutOfRangeSnafu { - input: self.opt.lingering_time, - } - })?; + let may_linger_until = self + .opt + .lingering_time + .map(|lingering_time| { + chrono::Duration::from_std(lingering_time) + .with_context(|_| DurationOutOfRangeSnafu { + input: lingering_time, + }) + .map(|t| now - t) + }) + .transpose()?; let unknown_file_may_linger_until = now - chrono::Duration::from_std(self.opt.unknown_file_lingering_time).with_context( @@ -642,9 +625,15 @@ impl LocalGcWorker { )?; // files that may linger, which means they are not in use but may still be kept for a while - let threshold = Timestamp::new_millisecond(may_linger_until.timestamp_millis()); + let threshold = + may_linger_until.map(|until| Timestamp::new_millisecond(until.timestamp_millis())); let mut recently_removed_files = recently_removed_files; - let may_linger_files = recently_removed_files.split_off(&threshold); + let may_linger_files = match threshold { + Some(threshold) => recently_removed_files.split_off(&threshold), + None => BTreeMap::new(), + }; + debug!("may_linger_files: {:?}", may_linger_files); + let may_linger_filenames = may_linger_files.values().flatten().collect::>(); let eligible_for_removal = recently_removed_files @@ -669,8 +658,7 @@ impl LocalGcWorker { .collect(); info!( - "gc: fast mode (no full listing) cost {} secs for region {}, found {} files to delete from manifest", - start.elapsed().as_secs_f64(), + "gc: fast mode (no full listing) for region {}, found {} files to delete from manifest", region_id, files_to_delete.len() ); @@ -678,15 +666,7 @@ impl LocalGcWorker { return Ok(files_to_delete); } - // Full file listing mode: perform expensive list operations to find orphan files - // Step 1: Create partitioned listers for concurrent processing - let listers = self.partition_region_files(region_id, concurrency).await?; - let lister_cnt = listers.len(); - - // Step 2: Concurrently list all files in the region directory - let all_entries = self.list_region_files_concurrent(listers).await?; - - let cnt = all_entries.len(); + // Full file listing mode: get the full list of files from object store // Step 3: Filter files to determine which ones can be deleted let (all_unused_files_ready_for_delete, all_in_exist_linger_files) = self @@ -698,12 +678,6 @@ impl LocalGcWorker { unknown_file_may_linger_until, ); - info!( - "gc: full listing mode cost {} secs using {lister_cnt} lister for {cnt} files in region {}, found {} unused files to delete", - start.elapsed().as_secs_f64(), - region_id, - all_unused_files_ready_for_delete.len() - ); debug!("All in exist linger files: {:?}", all_in_exist_linger_files); Ok(all_unused_files_ready_for_delete) diff --git a/src/mito2/src/gc/worker_test.rs b/src/mito2/src/gc/worker_test.rs new file mode 100644 index 0000000000..6e3f5288c0 --- /dev/null +++ b/src/mito2/src/gc/worker_test.rs @@ -0,0 +1,401 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::sync::Arc; + +use api::v1::Rows; +use common_telemetry::init_default_ut_logging; +use store_api::region_engine::RegionEngine as _; +use store_api::region_request::{RegionCompactRequest, RegionRequest}; +use store_api::storage::{FileRefsManifest, RegionId}; + +use crate::config::MitoConfig; +use crate::engine::MitoEngine; +use crate::engine::compaction_test::{delete_and_flush, put_and_flush}; +use crate::gc::{GcConfig, LocalGcWorker}; +use crate::region::MitoRegionRef; +use crate::test_util::{ + CreateRequestBuilder, TestEnv, build_rows, flush_region, put_rows, rows_schema, +}; + +async fn create_gc_worker( + mito_engine: &MitoEngine, + regions: BTreeMap, + file_ref_manifest: &FileRefsManifest, + full_file_listing: bool, +) -> LocalGcWorker { + let access_layer = regions.first_key_value().unwrap().1.access_layer.clone(); + let cache_manager = mito_engine.cache_manager(); + + LocalGcWorker::try_new( + access_layer, + Some(cache_manager), + regions, + mito_engine.mito_config().gc.clone(), + file_ref_manifest.clone(), + &mito_engine.gc_limiter(), + full_file_listing, + ) + .await + .unwrap() +} + +/// Test insert/flush then truncate can allow gc worker to delete files +#[tokio::test] +async fn test_gc_worker_basic_truncate() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + flush_region(&engine, region_id, None).await; + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + + let to_be_deleted_file_id = *manifest.files.iter().next().unwrap().0; + + assert_eq!(manifest.files.len(), 1); + + engine + .handle_request( + region.region_id, + RegionRequest::Truncate(store_api::region_request::RegionTruncateRequest::All), + ) + .await + .unwrap(); + + let manifest = region.manifest_ctx.manifest().await; + assert!( + manifest.removed_files.removed_files[0] + .file_ids + .contains(&to_be_deleted_file_id) + && manifest.removed_files.removed_files[0].file_ids.len() == 1 + && manifest.files.is_empty(), + "Manifest after truncate: {:?}", + manifest + ); + let version = 
manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: Default::default(), + manifest_version: [(region_id, version)].into(), + }; + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + assert_eq!( + report.deleted_files.get(®ion_id).unwrap(), + &vec![to_be_deleted_file_id], + ); + assert!(report.need_retry_regions.is_empty()); + + let manifest = region.manifest_ctx.manifest().await; + assert!(manifest.removed_files.removed_files.is_empty() && manifest.files.is_empty()); +} + +/// Truncate with file refs should not delete files +#[tokio::test] +async fn test_gc_worker_truncate_with_ref() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + let rows = Rows { + schema: column_schemas.clone(), + rows: build_rows(0, 3), + }; + put_rows(&engine, region_id, rows).await; + + flush_region(&engine, region_id, None).await; + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + + assert_eq!(manifest.files.len(), 1); + + let to_be_deleted_file_id = *manifest.files.iter().next().unwrap().0; + + engine + .handle_request( + region.region_id, + RegionRequest::Truncate(store_api::region_request::RegionTruncateRequest::All), + ) + .await + .unwrap(); + + let manifest = region.manifest_ctx.manifest().await; + assert!( + manifest.removed_files.removed_files[0] + .file_ids + .contains(&to_be_deleted_file_id) + && manifest.removed_files.removed_files[0].file_ids.len() == 1 + && manifest.files.is_empty(), + "Manifest after truncate: {:?}", + manifest + ); + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: [(region_id, HashSet::from([to_be_deleted_file_id]))].into(), + manifest_version: [(region_id, version)].into(), + }; + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + assert!(report.deleted_files.get(®ion_id).unwrap().is_empty()); + assert!(report.need_retry_regions.is_empty()); + + let manifest = region.manifest_ctx.manifest().await; + assert!( + manifest.removed_files.removed_files[0].file_ids.len() == 1 && manifest.files.is_empty(), + "Manifest: {:?}", + manifest + ); +} + +/// Test insert/flush then compact can allow gc worker to delete files +#[tokio::test] +async fn test_gc_worker_basic_compact() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = 
Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + put_and_flush(&engine, region_id, &column_schemas, 0..10).await; + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + put_and_flush(&engine, region_id, &column_schemas, 20..30).await; + delete_and_flush(&engine, region_id, &column_schemas, 15..30).await; + put_and_flush(&engine, region_id, &column_schemas, 15..25).await; + + let result = engine + .handle_request( + region_id, + RegionRequest::Compact(RegionCompactRequest::default()), + ) + .await + .unwrap(); + assert_eq!(result.affected_rows, 0); + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + assert_eq!(manifest.removed_files.removed_files[0].file_ids.len(), 3); + + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: Default::default(), + manifest_version: [(region_id, version)].into(), + }; + + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + + assert_eq!(report.deleted_files.get(®ion_id).unwrap().len(), 3,); + assert!(report.need_retry_regions.is_empty()); +} + +/// Compact with file refs should not delete files +#[tokio::test] +async fn test_gc_worker_compact_with_ref() { + init_default_ut_logging(); + + let mut env = TestEnv::new().await; + env.log_store = Some(env.create_log_store().await); + // use in memory object store for gc test, so it will use `ObjectStoreFilePurger` + env.object_store_manager = Some(Arc::new(env.create_in_memory_object_store_manager())); + + let engine = env + .new_mito_engine(MitoConfig { + gc: GcConfig { + enable: true, + // for faster delete file + lingering_time: None, + ..Default::default() + }, + ..Default::default() + }) + .await; + + let region_id = RegionId::new(1, 1); + env.get_schema_metadata_manager() + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + env.get_kv_backend(), + ) + .await; + + let request = CreateRequestBuilder::new().build(); + + let column_schemas = rows_schema(&request); + engine + .handle_request(region_id, RegionRequest::Create(request.clone())) + .await + .unwrap(); + + put_and_flush(&engine, region_id, &column_schemas, 0..10).await; + put_and_flush(&engine, region_id, &column_schemas, 10..20).await; + put_and_flush(&engine, region_id, &column_schemas, 20..30).await; + delete_and_flush(&engine, region_id, &column_schemas, 15..30).await; + put_and_flush(&engine, region_id, &column_schemas, 15..25).await; + + let result = engine + .handle_request( + region_id, + 
RegionRequest::Compact(RegionCompactRequest::default()), + ) + .await + .unwrap(); + assert_eq!(result.affected_rows, 0); + + let region = engine.get_region(region_id).unwrap(); + let manifest = region.manifest_ctx.manifest().await; + assert_eq!(manifest.removed_files.removed_files[0].file_ids.len(), 3); + + let version = manifest.manifest_version; + + let regions = BTreeMap::from([(region_id, region.clone())]); + let file_ref_manifest = FileRefsManifest { + file_refs: HashMap::from([( + region_id, + manifest.removed_files.removed_files[0] + .file_ids + .iter() + .cloned() + .collect(), + )]), + manifest_version: [(region_id, version)].into(), + }; + + let gc_worker = create_gc_worker(&engine, regions, &file_ref_manifest, true).await; + let report = gc_worker.run().await.unwrap(); + + assert_eq!(report.deleted_files.get(®ion_id).unwrap().len(), 0); + assert!(report.need_retry_regions.is_empty()); +} diff --git a/src/mito2/src/manifest/action.rs b/src/mito2/src/manifest/action.rs index af09e6c861..dedb228e25 100644 --- a/src/mito2/src/manifest/action.rs +++ b/src/mito2/src/manifest/action.rs @@ -25,10 +25,9 @@ use store_api::metadata::RegionMetadataRef; use store_api::storage::{FileId, RegionId, SequenceNumber}; use strum::Display; -use crate::error::{ - DurationOutOfRangeSnafu, RegionMetadataNotFoundSnafu, Result, SerdeJsonSnafu, Utf8Snafu, -}; +use crate::error::{RegionMetadataNotFoundSnafu, Result, SerdeJsonSnafu, Utf8Snafu}; use crate::manifest::manager::RemoveFileOptions; +use crate::region::ManifestStats; use crate::sst::FormatType; use crate::sst::file::FileMeta; use crate::wal::EntryId; @@ -236,13 +235,13 @@ impl RegionManifestBuilder { self.flushed_entry_id = truncated_entry_id; self.flushed_sequence = truncated_sequence; self.truncated_entry_id = Some(truncated_entry_id); - self.files.clear(); self.removed_files.add_removed_files( self.files.values().map(|meta| meta.file_id).collect(), truncate .timestamp_ms .unwrap_or_else(|| Utc::now().timestamp_millis()), ); + self.files.clear(); } TruncateKind::Partial { files_to_remove } => { self.removed_files.add_removed_files( @@ -294,6 +293,29 @@ pub struct RemovedFilesRecord { pub removed_files: Vec, } +impl RemovedFilesRecord { + /// Clear the actually deleted files from the list of removed files + pub fn clear_deleted_files(&mut self, deleted_files: Vec) { + let deleted_file_set: HashSet<_> = HashSet::from_iter(deleted_files); + for files in self.removed_files.iter_mut() { + files.file_ids.retain(|fid| !deleted_file_set.contains(fid)); + } + + self.removed_files.retain(|fs| !fs.file_ids.is_empty()); + } + + pub fn update_file_removed_cnt_to_stats(&self, stats: &ManifestStats) { + let cnt = self + .removed_files + .iter() + .map(|r| r.file_ids.len() as u64) + .sum(); + stats + .file_removed_cnt + .store(cnt, std::sync::atomic::Ordering::Relaxed); + } +} + #[derive(Serialize, Deserialize, Clone, Debug, Default, PartialEq, Eq)] pub struct RemovedFiles { /// The timestamp is the time when @@ -306,6 +328,9 @@ pub struct RemovedFiles { impl RemovedFilesRecord { /// Add a record of removed files with the current timestamp. 
pub fn add_removed_files(&mut self, file_ids: HashSet, at: i64) { + if file_ids.is_empty() { + return; + } self.removed_files.push(RemovedFiles { removed_at: at, file_ids, @@ -313,35 +338,13 @@ impl RemovedFilesRecord { } pub fn evict_old_removed_files(&mut self, opt: &RemoveFileOptions) -> Result<()> { - let total_removed_files: usize = self.removed_files.iter().map(|s| s.file_ids.len()).sum(); - if opt.keep_count > 0 && total_removed_files <= opt.keep_count { + if !opt.enable_gc { + // If GC is not enabled, always keep removed files empty. + self.removed_files.clear(); return Ok(()); } - let mut cur_file_cnt = total_removed_files; - - let can_evict_until = chrono::Utc::now() - - chrono::Duration::from_std(opt.keep_ttl).context(DurationOutOfRangeSnafu { - input: opt.keep_ttl, - })?; - - self.removed_files.sort_unstable_by_key(|f| f.removed_at); - let updated = std::mem::take(&mut self.removed_files) - .into_iter() - .filter_map(|f| { - if f.removed_at < can_evict_until.timestamp_millis() - && (opt.keep_count == 0 || cur_file_cnt >= opt.keep_count) - { - // can evict all files - // TODO(discord9): maybe only evict to below keep_count? Maybe not, or the update might be too frequent. - cur_file_cnt -= f.file_ids.len(); - None - } else { - Some(f) - } - }) - .collect(); - self.removed_files = updated; + // if GC is enabled, rely on gc worker to delete files, and evict removed files based on options. Ok(()) } diff --git a/src/mito2/src/manifest/checkpointer.rs b/src/mito2/src/manifest/checkpointer.rs index 3f3164ad93..1da03dda21 100644 --- a/src/mito2/src/manifest/checkpointer.rs +++ b/src/mito2/src/manifest/checkpointer.rs @@ -25,7 +25,6 @@ use crate::manifest::action::{RegionCheckpoint, RegionManifest}; use crate::manifest::manager::RegionManifestOptions; use crate::manifest::storage::ManifestObjectStore; use crate::metrics::MANIFEST_OP_ELAPSED; -use crate::region::{RegionLeaderState, RegionRoleState}; /// [`Checkpointer`] is responsible for doing checkpoint for a region, in an asynchronous way. #[derive(Debug)] @@ -129,26 +128,15 @@ impl Checkpointer { manifest.removed_files.evict_old_removed_files(opt)?; + // TODO(discord9): consider also check object store to clear removed files that are already deleted? How costly it is? + Ok(manifest) } /// Check if it's needed to do checkpoint for the region by the checkpoint distance. /// If needed, and there's no currently running checkpoint task, it will start a new checkpoint /// task running in the background. 
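// The removed-files bookkeeping is now driven by whether GC is enabled: when
// disabled, the record is simply cleared on every manifest update; when
// enabled, entries are only trimmed once the GC worker reports which files
// were actually deleted, and empty batches are never recorded in the first
// place. A self-contained sketch of that bookkeeping, with u64 standing in
// for FileId and simplified names:

use std::collections::HashSet;

struct RemovedBatch {
    removed_at: i64,
    file_ids: HashSet<u64>,
}

#[derive(Default)]
struct RemovedRecord {
    batches: Vec<RemovedBatch>,
}

impl RemovedRecord {
    fn add(&mut self, file_ids: HashSet<u64>, removed_at: i64) {
        if file_ids.is_empty() {
            return; // never record empty batches
        }
        self.batches.push(RemovedBatch { removed_at, file_ids });
    }

    // Drop ids the GC worker has confirmed as deleted, then prune empty batches.
    fn clear_deleted(&mut self, deleted: &HashSet<u64>) {
        for batch in &mut self.batches {
            batch.file_ids.retain(|id| !deleted.contains(id));
        }
        self.batches.retain(|b| !b.file_ids.is_empty());
    }

    fn evict(&mut self, gc_enabled: bool) {
        if !gc_enabled {
            // Without a GC worker nobody consumes this list, so keep it empty.
            self.batches.clear();
        }
    }
}

fn main() {
    let mut record = RemovedRecord::default();
    record.add(HashSet::new(), 1); // ignored
    record.add([10, 11].into(), 2);
    record.clear_deleted(&[10].into());
    assert_eq!(record.batches.len(), 1);
    assert!(record.batches[0].file_ids.contains(&11));
    assert_eq!(record.batches[0].removed_at, 2);
    record.evict(false);
    assert!(record.batches.is_empty());
}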
- pub(crate) fn maybe_do_checkpoint( - &self, - manifest: &RegionManifest, - region_state: RegionRoleState, - ) { - // Skip checkpoint if region is in staging state - if region_state == RegionRoleState::Leader(RegionLeaderState::Staging) { - info!( - "Skipping checkpoint for region {} in staging mode, manifest version: {}", - manifest.metadata.region_id, manifest.manifest_version - ); - return; - } - + pub(crate) fn maybe_do_checkpoint(&self, manifest: &RegionManifest) { if self.manifest_options.checkpoint_distance == 0 { return; } diff --git a/src/mito2/src/manifest/manager.rs b/src/mito2/src/manifest/manager.rs index b65d9c840d..043a1293d9 100644 --- a/src/mito2/src/manifest/manager.rs +++ b/src/mito2/src/manifest/manager.rs @@ -21,8 +21,11 @@ use futures::TryStreamExt; use object_store::ObjectStore; use snafu::{OptionExt, ResultExt, ensure}; use store_api::metadata::RegionMetadataRef; +use store_api::storage::FileId; use store_api::{MAX_VERSION, MIN_VERSION, ManifestVersion}; +use crate::cache::manifest_cache::ManifestCache; +use crate::config::MitoConfig; use crate::error::{ self, InstallManifestToSnafu, NoCheckpointSnafu, NoManifestsSnafu, RegionStoppedSnafu, Result, }; @@ -32,10 +35,11 @@ use crate::manifest::action::{ }; use crate::manifest::checkpointer::Checkpointer; use crate::manifest::storage::{ - ManifestObjectStore, file_version, is_checkpoint_file, is_delta_file, + ManifestObjectStore, file_version, is_checkpoint_file, is_delta_file, manifest_compress_type, + manifest_dir, }; use crate::metrics::MANIFEST_OP_ELAPSED; -use crate::region::{RegionLeaderState, RegionRoleState}; +use crate::region::{ManifestStats, RegionLeaderState, RegionRoleState}; use crate::sst::FormatType; /// Options for [RegionManifestManager]. @@ -49,27 +53,34 @@ pub struct RegionManifestOptions { /// Set to 0 to disable checkpoint. pub checkpoint_distance: u64, pub remove_file_options: RemoveFileOptions, + /// Optional cache for manifest files. + pub manifest_cache: Option, +} + +impl RegionManifestOptions { + /// Creates a new [RegionManifestOptions] with the given region directory, object store, and configuration. + pub fn new(config: &MitoConfig, region_dir: &str, object_store: &ObjectStore) -> Self { + RegionManifestOptions { + manifest_dir: manifest_dir(region_dir), + object_store: object_store.clone(), + // We don't allow users to set the compression algorithm as we use it as a file suffix. + // Currently, the manifest storage doesn't have good support for changing compression algorithms. + compress_type: manifest_compress_type(config.compress_manifest), + checkpoint_distance: config.manifest_checkpoint_distance, + remove_file_options: RemoveFileOptions { + enable_gc: config.gc.enable, + }, + manifest_cache: None, + } + } } /// Options for updating `removed_files` field in [RegionManifest]. #[derive(Debug, Clone)] +#[cfg_attr(any(test, feature = "test"), derive(Default))] pub struct RemoveFileOptions { - /// Number of removed files to keep in manifest's `removed_files` field before also - /// remove them from `removed_files`. Only remove files when both `keep_count` and `keep_duration` is reached. - pub keep_count: usize, - /// Duration to keep removed files in manifest's `removed_files` field before also - /// remove them from `removed_files`. Only remove files when both `keep_count` and `keep_duration` is reached. 
- pub keep_ttl: std::time::Duration, -} - -#[cfg(any(test, feature = "test"))] -impl Default for RemoveFileOptions { - fn default() -> Self { - Self { - keep_count: 256, - keep_ttl: std::time::Duration::from_secs(3600), - } - } + /// Whether GC is enabled. If not, the removed files should always be empty when persisting manifest. + pub enable_gc: bool, } // rewrite note: @@ -144,6 +155,11 @@ pub struct RegionManifestManager { last_version: Arc, checkpointer: Checkpointer, manifest: Arc, + // Staging manifest is used to store the manifest of the staging region before it becomes available. + // It is initially inherited from the previous manifest(i.e., `self.manifest`). + // When the staging manifest becomes available, it will be used to construct the new manifest. + staging_manifest: Option>, + stats: ManifestStats, stopped: bool, } @@ -153,17 +169,18 @@ impl RegionManifestManager { metadata: RegionMetadataRef, flushed_entry_id: u64, options: RegionManifestOptions, - total_manifest_size: Arc, - manifest_version: Arc, sst_format: FormatType, + stats: &ManifestStats, ) -> Result { // construct storage let mut store = ManifestObjectStore::new( &options.manifest_dir, options.object_store.clone(), options.compress_type, - total_manifest_size, + stats.total_manifest_size.clone(), + options.manifest_cache.clone(), ); + let manifest_version = stats.manifest_version.clone(); info!( "Creating region manifest in {} with metadata {:?}, flushed_entry_id: {}", @@ -213,11 +230,16 @@ impl RegionManifestManager { let checkpointer = Checkpointer::new(region_id, options, store.clone(), MIN_VERSION); manifest_version.store(version, Ordering::Relaxed); + manifest + .removed_files + .update_file_removed_cnt_to_stats(stats); Ok(Self { store, last_version: manifest_version, checkpointer, manifest: Arc::new(manifest), + staging_manifest: None, + stats: stats.clone(), stopped: false, }) } @@ -227,8 +249,7 @@ impl RegionManifestManager { /// Returns `Ok(None)` if no such manifest. pub async fn open( options: RegionManifestOptions, - total_manifest_size: Arc, - manifest_version: Arc, + stats: &ManifestStats, ) -> Result> { let _t = MANIFEST_OP_ELAPSED .with_label_values(&["open"]) @@ -239,8 +260,10 @@ impl RegionManifestManager { &options.manifest_dir, options.object_store.clone(), options.compress_type, - total_manifest_size, + stats.total_manifest_size.clone(), + options.manifest_cache.clone(), ); + let manifest_version = stats.manifest_version.clone(); // recover from storage // construct manifest builder @@ -314,11 +337,17 @@ impl RegionManifestManager { last_checkpoint_version, ); manifest_version.store(version, Ordering::Relaxed); + manifest + .removed_files + .update_file_removed_cnt_to_stats(stats); Ok(Some(Self { store, last_version: manifest_version, checkpointer, manifest: Arc::new(manifest), + // TODO(weny): open the staging manifest if exists. 
+ staging_manifest: None, + stats: stats.clone(), stopped: false, })) } @@ -442,6 +471,9 @@ impl RegionManifestManager { ); let version = self.last_version(); + new_manifest + .removed_files + .update_file_removed_cnt_to_stats(&self.stats); self.manifest = Arc::new(new_manifest); let last_version = self.set_version(self.manifest.manifest_version); info!( @@ -469,6 +501,9 @@ impl RegionManifestManager { let builder = RegionManifestBuilder::with_checkpoint(checkpoint.checkpoint); let manifest = builder.try_build()?; let last_version = self.set_version(manifest.manifest_version); + manifest + .removed_files + .update_file_removed_cnt_to_stats(&self.stats); self.manifest = Arc::new(manifest); info!( "Installed region manifest from checkpoint: {}, region: {}", @@ -482,7 +517,7 @@ impl RegionManifestManager { pub async fn update( &mut self, action_list: RegionMetaActionList, - region_state: RegionRoleState, + is_staging: bool, ) -> Result { let _t = MANIFEST_OP_ELAPSED .with_label_values(&["update"]) @@ -496,13 +531,19 @@ impl RegionManifestManager { ); let version = self.increase_version(); - let is_staging = region_state == RegionRoleState::Leader(RegionLeaderState::Staging); self.store .save(version, &action_list.encode()?, is_staging) .await?; + // For a staging region, the manifest is initially inherited from the previous manifest(i.e., `self.manifest`). + // When the staging manifest becomes available, it will be used to construct the new manifest. let mut manifest_builder = - RegionManifestBuilder::with_checkpoint(Some(self.manifest.as_ref().clone())); + if is_staging && let Some(staging_manifest) = self.staging_manifest.as_ref() { + RegionManifestBuilder::with_checkpoint(Some(staging_manifest.as_ref().clone())) + } else { + RegionManifestBuilder::with_checkpoint(Some(self.manifest.as_ref().clone())) + }; + for action in action_list.actions { match action { RegionMetaAction::Change(action) => { @@ -522,23 +563,52 @@ impl RegionManifestManager { } } } - let new_manifest = manifest_builder.try_build()?; - let updated_manifest = self - .checkpointer - .update_manifest_removed_files(new_manifest)?; - self.manifest = Arc::new(updated_manifest); - self.checkpointer - .maybe_do_checkpoint(self.manifest.as_ref(), region_state); + if is_staging { + let new_manifest = manifest_builder.try_build()?; + self.staging_manifest = Some(Arc::new(new_manifest)); + + info!( + "Skipping checkpoint for region {} in staging mode, manifest version: {}", + self.manifest.metadata.region_id, self.manifest.manifest_version + ); + } else { + let new_manifest = manifest_builder.try_build()?; + new_manifest + .removed_files + .update_file_removed_cnt_to_stats(&self.stats); + let updated_manifest = self + .checkpointer + .update_manifest_removed_files(new_manifest)?; + self.manifest = Arc::new(updated_manifest); + self.checkpointer + .maybe_do_checkpoint(self.manifest.as_ref()); + } Ok(version) } + /// Clear deleted files from manifest's `removed_files` field without update version. Notice if datanode exit before checkpoint then new manifest by open region may still contain these deleted files, which is acceptable for gc process. + pub fn clear_deleted_files(&mut self, deleted_files: Vec) { + let mut manifest = (*self.manifest()).clone(); + manifest.removed_files.clear_deleted_files(deleted_files); + self.set_manifest(Arc::new(manifest)); + } + + pub(crate) fn set_manifest(&mut self, manifest: Arc) { + self.manifest = manifest; + } + /// Retrieves the current [RegionManifest]. 
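
A minimal sketch of the staging-aware update flow introduced here, with the manifest reduced to a bare version number and `Arc` used as in the manager; persistence and checkpointing are omitted:

use std::sync::Arc;

struct Manifest {
    version: u64,
}

struct Manager {
    manifest: Arc<Manifest>,
    staging_manifest: Option<Arc<Manifest>>,
}

impl Manager {
    fn update(&mut self, is_staging: bool) -> u64 {
        // Staging updates build on the staging manifest when one exists,
        // otherwise they inherit the committed manifest.
        let base = if is_staging {
            self.staging_manifest.as_ref().unwrap_or(&self.manifest)
        } else {
            &self.manifest
        };
        let new_manifest = Manifest { version: base.version + 1 };
        let version = new_manifest.version;
        if is_staging {
            // The staged result is kept aside and no checkpoint is triggered.
            self.staging_manifest = Some(Arc::new(new_manifest));
        } else {
            // The committed result replaces the manifest and may checkpoint.
            self.manifest = Arc::new(new_manifest);
        }
        version
    }
}

fn main() {
    let mut mgr = Manager {
        manifest: Arc::new(Manifest { version: 1 }),
        staging_manifest: None,
    };
    mgr.update(true);
    assert_eq!(mgr.manifest.version, 1); // committed manifest untouched by staging writes
    mgr.update(false);
    assert_eq!(mgr.manifest.version, 2);
}
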
pub fn manifest(&self) -> Arc { self.manifest.clone() } + /// Retrieves the current [RegionManifest]. + pub fn staging_manifest(&self) -> Option> { + self.staging_manifest.clone() + } + /// Returns total manifest size. pub fn manifest_usage(&self) -> u64 { self.store.total_manifest_size() @@ -675,6 +745,22 @@ impl RegionManifestManager { Ok(Some(RegionMetaActionList::new(merged_actions))) } + + /// Unsets the staging manifest. + pub(crate) fn unset_staging_manifest(&mut self) { + self.staging_manifest = None; + } + + /// Clear all staging manifests. + pub(crate) async fn clear_staging_manifest_and_dir(&mut self) -> Result<()> { + self.staging_manifest = None; + self.store.clear_staging_manifests().await?; + info!( + "Cleared all staging manifests for region {}", + self.manifest.metadata.region_id + ); + Ok(()) + } } #[cfg(test)] @@ -801,13 +887,7 @@ mod test { sst_format: FormatType::PrimaryKey, })); - let current_version = manager - .update( - action_list, - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + let current_version = manager.update(action_list, false).await.unwrap(); assert_eq!(current_version, 1); manager.validate_manifest(&new_metadata, 1); @@ -870,13 +950,7 @@ mod test { sst_format: FormatType::PrimaryKey, })); - let current_version = manager - .update( - action_list, - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + let current_version = manager.update(action_list, false).await.unwrap(); assert_eq!(current_version, 1); manager.validate_manifest(&new_metadata, 1); @@ -897,7 +971,7 @@ mod test { flushed_sequence: None, committed_sequence: None, })]), - RegionRoleState::Leader(RegionLeaderState::Writable), + false, ) .await .unwrap(); @@ -923,6 +997,6 @@ mod test { // get manifest size again let manifest_size = manager.manifest_usage(); - assert_eq!(manifest_size, 1764); + assert_eq!(manifest_size, 1378); } } diff --git a/src/mito2/src/manifest/storage.rs b/src/mito2/src/manifest/storage.rs index 48a30af16e..7472a4dc28 100644 --- a/src/mito2/src/manifest/storage.rs +++ b/src/mito2/src/manifest/storage.rs @@ -24,6 +24,7 @@ use crc32fast::Hasher; use futures::TryStreamExt; use futures::future::try_join_all; use lazy_static::lazy_static; +use object_store::util::join_dir; use object_store::{Entry, ErrorKind, Lister, ObjectStore, util}; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -32,6 +33,7 @@ use store_api::ManifestVersion; use store_api::storage::RegionId; use tokio::sync::Semaphore; +use crate::cache::manifest_cache::ManifestCache; use crate::error::{ ChecksumMismatchSnafu, CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu, OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu, @@ -49,6 +51,11 @@ const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed; const FETCH_MANIFEST_PARALLELISM: usize = 16; +/// Returns the directory to the manifest files. +pub fn manifest_dir(region_dir: &str) -> String { + join_dir(region_dir, "manifest") +} + /// Returns the [CompressionType] according to whether to compress manifest files. pub const fn manifest_compress_type(compress: bool) -> CompressionType { if compress { @@ -138,6 +145,8 @@ pub struct ManifestObjectStore { /// Stores the size of each manifest file. manifest_size_map: Arc>>, total_manifest_size: Arc, + /// Optional manifest cache for local caching. 
+ manifest_cache: Option, } impl ManifestObjectStore { @@ -146,7 +155,10 @@ impl ManifestObjectStore { object_store: ObjectStore, compress_type: CompressionType, total_manifest_size: Arc, + manifest_cache: Option, ) -> Self { + common_telemetry::info!("Create manifest store, cache: {}", manifest_cache.is_some()); + let path = util::normalize_dir(path); let staging_path = { // Convert "region_dir/manifest/" to "region_dir/staging/manifest/" @@ -160,6 +172,7 @@ impl ManifestObjectStore { staging_path, manifest_size_map: Arc::new(RwLock::new(HashMap::new())), total_manifest_size, + manifest_cache, } } @@ -285,9 +298,11 @@ impl ManifestObjectStore { } /// Common implementation for fetching manifests from entries in parallel. + /// If `is_staging` is true, cache is skipped. async fn fetch_manifests_from_entries( &self, entries: Vec<(ManifestVersion, Entry)>, + is_staging: bool, ) -> Result)>> { if entries.is_empty() { return Ok(vec![]); @@ -300,6 +315,13 @@ impl ManifestObjectStore { // Safety: semaphore must exist. let _permit = semaphore.acquire().await.unwrap(); + let cache_key = entry.path(); + // Try to get from cache first + if let Some(data) = self.get_from_cache(cache_key, is_staging).await { + return Ok((*v, data)); + } + + // Fetch from remote object store let compress_type = file_compress_type(entry.name()); let bytes = self .object_store @@ -313,6 +335,11 @@ impl ManifestObjectStore { compress_type, path: entry.path(), })?; + + // Add to cache + self.put_to_cache(cache_key.to_string(), &data, is_staging) + .await; + Ok((*v, data)) }); @@ -329,7 +356,7 @@ impl ManifestObjectStore { end_version: ManifestVersion, ) -> Result)>> { let manifests = self.scan(start_version, end_version).await?; - self.fetch_manifests_from_entries(manifests).await + self.fetch_manifests_from_entries(manifests, false).await } /// Delete manifest files that version < end. @@ -399,6 +426,11 @@ impl ManifestObjectStore { ret, self.path, end, checkpoint_version, paths, ); + // Remove from cache first + for (entry, _, _) in &del_entries { + self.remove_from_cache(entry.path()).await; + } + self.object_store .delete_iter(paths) .await @@ -434,11 +466,10 @@ impl ManifestObjectStore { path: &path, })?; let delta_size = data.len(); - self.object_store - .write(&path, data) - .await - .context(OpenDalSnafu)?; + + self.write_and_put_cache(&path, data, is_staging).await?; self.set_delta_file_size(version, delta_size as u64); + Ok(()) } @@ -459,10 +490,8 @@ impl ManifestObjectStore { })?; let checkpoint_size = data.len(); let checksum = checkpoint_checksum(bytes); - self.object_store - .write(&path, data) - .await - .context(OpenDalSnafu)?; + + self.write_and_put_cache(&path, data, false).await?; self.set_checkpoint_file_size(version, checkpoint_size as u64); // Because last checkpoint file only contain size and version, which is tiny, so we don't compress it. @@ -495,60 +524,80 @@ impl ManifestObjectStore { ) -> Result)>> { let version = metadata.version; let path = self.checkpoint_file_path(version); + + // Try to get from cache first + if let Some(data) = self.get_from_cache(&path, false).await { + verify_checksum(&data, metadata.checksum)?; + return Ok(Some((version, data))); + } + // Due to backward compatibility, it is possible that the user's checkpoint not compressed, // so if we don't find file by compressed type. fall back to checkpoint not compressed find again. 
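
A minimal sketch of the read-through pattern used when fetching manifest files: consult the optional local cache first, fall back to the object store, then populate the cache, with staging reads bypassing the cache entirely. `Store`, `Cache`, and `read_remote` are hypothetical stand-ins for ManifestObjectStore, ManifestCache, and the read-plus-decompress step:

use std::collections::HashMap;

#[derive(Default)]
struct Cache {
    files: HashMap<String, Vec<u8>>,
}

struct Store {
    cache: Option<Cache>,
}

impl Store {
    fn fetch(&mut self, path: &str, is_staging: bool) -> Vec<u8> {
        // Staging manifests are short-lived, so the cache is bypassed for them.
        if !is_staging {
            if let Some(data) = self.cache.as_ref().and_then(|c| c.files.get(path)) {
                return data.clone();
            }
        }
        // Cache miss (or staging): read from the remote store, then cache it.
        let data = self.read_remote(path);
        if !is_staging {
            if let Some(cache) = self.cache.as_mut() {
                cache.files.insert(path.to_string(), data.clone());
            }
        }
        data
    }

    fn read_remote(&self, _path: &str) -> Vec<u8> {
        // Placeholder for the object-store read plus decompression.
        Vec::new()
    }
}

fn main() {
    let mut store = Store { cache: Some(Cache::default()) };
    let _ = store.fetch("region/manifest/00000000000000000001.json", false);
}
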
- let checkpoint_data = - match self.object_store.read(&path).await { - Ok(checkpoint) => { - let checkpoint_size = checkpoint.len(); - let decompress_data = self.compress_type.decode(checkpoint).await.context( - DecompressObjectSnafu { + let checkpoint_data = match self.object_store.read(&path).await { + Ok(checkpoint) => { + let checkpoint_size = checkpoint.len(); + let decompress_data = + self.compress_type + .decode(checkpoint) + .await + .with_context(|_| DecompressObjectSnafu { compress_type: self.compress_type, - path, - }, - )?; - verify_checksum(&decompress_data, metadata.checksum)?; - // set the checkpoint size - self.set_checkpoint_file_size(version, checkpoint_size as u64); - Ok(Some(decompress_data)) - } - Err(e) => { - if e.kind() == ErrorKind::NotFound { - if self.compress_type != FALL_BACK_COMPRESS_TYPE { - let fall_back_path = gen_path( - &self.path, - &checkpoint_file(version), - FALL_BACK_COMPRESS_TYPE, - ); - debug!( - "Failed to load checkpoint from path: {}, fall back to path: {}", - path, fall_back_path - ); - match self.object_store.read(&fall_back_path).await { - Ok(checkpoint) => { - let checkpoint_size = checkpoint.len(); - let decompress_data = FALL_BACK_COMPRESS_TYPE - .decode(checkpoint) - .await - .context(DecompressObjectSnafu { - compress_type: FALL_BACK_COMPRESS_TYPE, - path, - })?; - verify_checksum(&decompress_data, metadata.checksum)?; - self.set_checkpoint_file_size(version, checkpoint_size as u64); - Ok(Some(decompress_data)) - } - Err(e) if e.kind() == ErrorKind::NotFound => Ok(None), - Err(e) => Err(e).context(OpenDalSnafu), + path: path.clone(), + })?; + verify_checksum(&decompress_data, metadata.checksum)?; + // set the checkpoint size + self.set_checkpoint_file_size(version, checkpoint_size as u64); + // Add to cache + self.put_to_cache(path, &decompress_data, false).await; + Ok(Some(decompress_data)) + } + Err(e) => { + if e.kind() == ErrorKind::NotFound { + if self.compress_type != FALL_BACK_COMPRESS_TYPE { + let fall_back_path = gen_path( + &self.path, + &checkpoint_file(version), + FALL_BACK_COMPRESS_TYPE, + ); + debug!( + "Failed to load checkpoint from path: {}, fall back to path: {}", + path, fall_back_path + ); + + // Try to get fallback from cache first + if let Some(data) = self.get_from_cache(&fall_back_path, false).await { + verify_checksum(&data, metadata.checksum)?; + return Ok(Some((version, data))); + } + + match self.object_store.read(&fall_back_path).await { + Ok(checkpoint) => { + let checkpoint_size = checkpoint.len(); + let decompress_data = FALL_BACK_COMPRESS_TYPE + .decode(checkpoint) + .await + .with_context(|_| DecompressObjectSnafu { + compress_type: FALL_BACK_COMPRESS_TYPE, + path: fall_back_path.clone(), + })?; + verify_checksum(&decompress_data, metadata.checksum)?; + self.set_checkpoint_file_size(version, checkpoint_size as u64); + // Add fallback to cache + self.put_to_cache(fall_back_path, &decompress_data, false) + .await; + Ok(Some(decompress_data)) } - } else { - Ok(None) + Err(e) if e.kind() == ErrorKind::NotFound => Ok(None), + Err(e) => Err(e).context(OpenDalSnafu), } } else { - Err(e).context(OpenDalSnafu) + Ok(None) } + } else { + Err(e).context(OpenDalSnafu) } - }?; + } + }?; Ok(checkpoint_data.map(|data| (version, data))) } @@ -556,8 +605,10 @@ impl ManifestObjectStore { /// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any pub async fn load_last_checkpoint(&mut self) -> Result)>> { let last_checkpoint_path = self.last_checkpoint_path(); + + 
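
A minimal sketch of the checkpoint lookup order after this change: cache first, then the compressed checkpoint object, then the uncompressed fallback kept for backward compatibility. Checksum verification, decompression, and cache population are collapsed into the closures, which are hypothetical stand-ins:

fn load_checkpoint(
    cache_get: impl Fn(&str) -> Option<Vec<u8>>,
    remote_get: impl Fn(&str) -> Option<Vec<u8>>,
    compressed_path: &str,
    fallback_path: &str,
) -> Option<Vec<u8>> {
    if let Some(data) = cache_get(compressed_path) {
        return Some(data); // checksum is verified before returning in the real code
    }
    if let Some(data) = remote_get(compressed_path) {
        return Some(data); // decompressed, verified, then written to the cache
    }
    // Older deployments may have written the checkpoint uncompressed.
    cache_get(fallback_path).or_else(|| remote_get(fallback_path))
}

fn main() {
    let hit = load_checkpoint(|_| None, |p| (p == "old").then(|| vec![1u8]), "new", "old");
    assert_eq!(hit, Some(vec![1u8]));
}
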
// Fetch from remote object store without cache let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await { - Ok(data) => data, + Ok(data) => data.to_vec(), Err(e) if e.kind() == ErrorKind::NotFound => { return Ok(None); } @@ -566,7 +617,7 @@ impl ManifestObjectStore { } }; - let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data.to_vec())?; + let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?; debug!( "Load checkpoint in path: {}, metadata: {:?}", @@ -696,7 +747,8 @@ impl ManifestObjectStore { let mut sorted_entries = manifest_entries; Self::sort_manifests(&mut sorted_entries); - self.fetch_manifests_from_entries(sorted_entries).await + self.fetch_manifests_from_entries(sorted_entries, true) + .await } /// Clear all staging manifest files. @@ -713,6 +765,63 @@ impl ManifestObjectStore { Ok(()) } + + /// Gets a manifest file from cache. + /// Returns the file data if found in cache, None otherwise. + /// If `is_staging` is true, always returns None. + async fn get_from_cache(&self, key: &str, is_staging: bool) -> Option> { + if is_staging { + return None; + } + let cache = self.manifest_cache.as_ref()?; + cache.get_file(key).await + } + + /// Puts a manifest file into cache. + /// If `is_staging` is true, does nothing. + async fn put_to_cache(&self, key: String, data: &[u8], is_staging: bool) { + if is_staging { + return; + } + let Some(cache) = &self.manifest_cache else { + return; + }; + + cache.put_file(key, data.to_vec()).await; + } + + /// Writes data to object store and puts it into cache. + /// If `is_staging` is true, cache is skipped. + async fn write_and_put_cache(&self, path: &str, data: Vec, is_staging: bool) -> Result<()> { + // Clone data for cache before writing, only if cache is enabled and not staging + let cache_data = if !is_staging && self.manifest_cache.is_some() { + Some(data.clone()) + } else { + None + }; + + // Write to object store + self.object_store + .write(path, data) + .await + .context(OpenDalSnafu)?; + + // Put to cache if we cloned the data + if let Some(data) = cache_data { + self.put_to_cache(path.to_string(), &data, is_staging).await; + } + + Ok(()) + } + + /// Removes a manifest file from cache. 
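
A minimal sketch of the write-through helper: the payload is cloned for the cache only when a cache is configured and the write is not a staging write, so the common uncached path avoids the extra copy. `Writer` and `write_remote` are stand-ins for ManifestObjectStore and the ObjectStore write:

use std::collections::HashMap;

struct Writer {
    cache: Option<HashMap<String, Vec<u8>>>,
}

impl Writer {
    fn write_and_put_cache(&mut self, path: &str, data: Vec<u8>, is_staging: bool) {
        // Clone for the cache only when the copy will actually be used.
        let cache_copy = if !is_staging && self.cache.is_some() {
            Some(data.clone())
        } else {
            None
        };

        self.write_remote(path, data);

        if let (Some(copy), Some(cache)) = (cache_copy, self.cache.as_mut()) {
            cache.insert(path.to_string(), copy);
        }
    }

    fn write_remote(&self, _path: &str, _data: Vec<u8>) {
        // Placeholder for the ObjectStore write.
    }
}

fn main() {
    let mut writer = Writer { cache: Some(HashMap::new()) };
    writer.write_and_put_cache("manifest/00000000000000000002.json", vec![1, 2, 3], false);
    assert_eq!(writer.cache.as_ref().unwrap().len(), 1);
}
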
+ async fn remove_from_cache(&self, key: &str) { + let Some(cache) = &self.manifest_cache else { + return; + }; + + cache.remove(key).await; + } } #[derive(Serialize, Deserialize, Debug)] @@ -756,6 +865,7 @@ mod tests { object_store, CompressionType::Uncompressed, Default::default(), + None, ) } diff --git a/src/mito2/src/manifest/tests/checkpoint.rs b/src/mito2/src/manifest/tests/checkpoint.rs index 71391457bb..c584ae3276 100644 --- a/src/mito2/src/manifest/tests/checkpoint.rs +++ b/src/mito2/src/manifest/tests/checkpoint.rs @@ -27,7 +27,6 @@ use crate::manifest::action::{ use crate::manifest::manager::RegionManifestManager; use crate::manifest::storage::CheckpointMetadata; use crate::manifest::tests::utils::basic_region_metadata; -use crate::region::{RegionLeaderState, RegionRoleState}; use crate::sst::file::FileMeta; use crate::test_util::TestEnv; @@ -87,13 +86,7 @@ async fn manager_without_checkpoint() { // apply 10 actions for _ in 0..10 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); } // no checkpoint @@ -138,13 +131,7 @@ async fn manager_with_checkpoint_distance_1() { // apply 10 actions for _ in 0..10 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); while manager.checkpointer().is_doing_checkpoint() { tokio::time::sleep(Duration::from_millis(10)).await; @@ -205,13 +192,7 @@ async fn test_corrupted_data_causing_checksum_error() { // Apply actions for _ in 0..10 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); } // Wait for the checkpoint to finish. 
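
A minimal sketch of the delete ordering used for manifest files: cached copies are dropped before the remote objects are deleted, so the cache never serves a manifest the object store has already removed. Types are stand-ins; the real code deletes remote paths in a batch:

use std::collections::HashMap;

struct Store {
    cache: Option<HashMap<String, Vec<u8>>>,
    remote: HashMap<String, Vec<u8>>,
}

impl Store {
    fn delete_manifests(&mut self, paths: &[String]) {
        // Drop cached copies first...
        if let Some(cache) = self.cache.as_mut() {
            for path in paths {
                cache.remove(path);
            }
        }
        // ...then delete the remote objects.
        for path in paths {
            self.remote.remove(path);
        }
    }
}

fn main() {
    let mut store = Store { cache: Some(HashMap::new()), remote: HashMap::new() };
    store.remote.insert("manifest/00000000000000000001.json".to_string(), vec![0u8]);
    store.delete_manifests(&["manifest/00000000000000000001.json".to_string()]);
    assert!(store.remote.is_empty());
}
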
@@ -263,9 +244,11 @@ async fn checkpoint_with_different_compression_types() { time_range: (0.into(), 10000000.into()), level: 0, file_size: 1024000, + max_row_group_uncompressed_size: 1024000, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, sequence: None, @@ -301,10 +284,7 @@ async fn generate_checkpoint_with_compression_types( let (_env, mut manager) = build_manager(1, compress_type).await; for action in actions { - manager - .update(action, RegionRoleState::Leader(RegionLeaderState::Writable)) - .await - .unwrap(); + manager.update(action, false).await.unwrap(); while manager.checkpointer().is_doing_checkpoint() { tokio::time::sleep(Duration::from_millis(10)).await; @@ -330,9 +310,11 @@ fn generate_action_lists(num: usize) -> (Vec, Vec) time_range: (0.into(), 10000000.into()), level: 0, file_size: 1024000, + max_row_group_uncompressed_size: 1024000, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, sequence: None, @@ -359,10 +341,7 @@ async fn manifest_install_manifest_to() { let (env, mut manager) = build_manager(0, CompressionType::Uncompressed).await; let (files, actions) = generate_action_lists(10); for action in actions { - manager - .update(action, RegionRoleState::Leader(RegionLeaderState::Writable)) - .await - .unwrap(); + manager.update(action, false).await.unwrap(); } // Nothing to install @@ -400,10 +379,7 @@ async fn manifest_install_manifest_to_with_checkpoint() { let (env, mut manager) = build_manager(3, CompressionType::Uncompressed).await; let (files, actions) = generate_action_lists(10); for action in actions { - manager - .update(action, RegionRoleState::Leader(RegionLeaderState::Writable)) - .await - .unwrap(); + manager.update(action, false).await.unwrap(); while manager.checkpointer().is_doing_checkpoint() { tokio::time::sleep(Duration::from_millis(10)).await; @@ -475,13 +451,7 @@ async fn test_checkpoint_bypass_in_staging_mode() { // Apply actions in staging mode - checkpoint should be bypassed for _ in 0..15 { - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Staging), - ) - .await - .unwrap(); + manager.update(nop_action(), true).await.unwrap(); } assert!(!manager.checkpointer().is_doing_checkpoint()); @@ -496,13 +466,7 @@ async fn test_checkpoint_bypass_in_staging_mode() { ); // Now switch to normal mode and apply one more action - manager - .update( - nop_action(), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) - .await - .unwrap(); + manager.update(nop_action(), false).await.unwrap(); // Wait for potential checkpoint while manager.checkpointer().is_doing_checkpoint() { diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs index ea3875ac7a..c9ff2c0a98 100644 --- a/src/mito2/src/memtable.rs +++ b/src/mito2/src/memtable.rs @@ -55,8 +55,10 @@ pub mod time_partition; pub mod time_series; pub(crate) mod version; -#[cfg(any(test, feature = "test"))] -pub use bulk::part::BulkPart; +pub use bulk::part::{ + BulkPart, BulkPartEncoder, BulkPartMeta, UnorderedPart, record_batch_estimated_size, + sort_primary_key_record_batch, +}; #[cfg(any(test, feature = "test"))] pub use time_partition::filter_record_batch; diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs index 2c26410ca6..beae618520 100644 --- a/src/mito2/src/memtable/bulk.rs +++ 
b/src/mito2/src/memtable/bulk.rs @@ -668,10 +668,10 @@ impl BulkMemtable { } /// Iterator builder for bulk range -struct BulkRangeIterBuilder { - part: BulkPart, - context: Arc, - sequence: Option, +pub struct BulkRangeIterBuilder { + pub part: BulkPart, + pub context: Arc, + pub sequence: Option, } impl IterBuilder for BulkRangeIterBuilder { @@ -1188,7 +1188,6 @@ impl MemtableBuilder for BulkMemtableBuilder { #[cfg(test)] mod tests { - use mito_codec::row_converter::build_primary_key_codec; use super::*; diff --git a/src/mito2/src/memtable/bulk/part.rs b/src/mito2/src/memtable/bulk/part.rs index 9d6577f1d4..e79d1d83b8 100644 --- a/src/mito2/src/memtable/bulk/part.rs +++ b/src/mito2/src/memtable/bulk/part.rs @@ -14,11 +14,11 @@ //! Bulk part encoder/decoder. -use std::collections::VecDeque; +use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::Arc; use std::time::{Duration, Instant}; -use api::helper::{ColumnDataTypeWrapper, value_to_grpc_value}; +use api::helper::{ColumnDataTypeWrapper, to_grpc_value}; use api::v1::bulk_wal_entry::Body; use api::v1::{ArrowIpc, BulkWalEntry, Mutation, OpType, bulk_wal_entry}; use bytes::Bytes; @@ -34,7 +34,9 @@ use datatypes::arrow::array::{ UInt64Array, UInt64Builder, }; use datatypes::arrow::compute::{SortColumn, SortOptions, TakeOptions}; -use datatypes::arrow::datatypes::{SchemaRef, UInt32Type}; +use datatypes::arrow::datatypes::{ + DataType as ArrowDataType, Field, Schema, SchemaRef, UInt32Type, +}; use datatypes::arrow_array::BinaryArray; use datatypes::data_type::DataType; use datatypes::prelude::{MutableVector, ScalarVectorBuilder, Vector}; @@ -51,14 +53,15 @@ use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::WriterProperties; use snafu::{OptionExt, ResultExt, Snafu}; use store_api::codec::PrimaryKeyEncoding; -use store_api::metadata::{RegionMetadata, RegionMetadataRef}; +use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME; -use store_api::storage::{FileId, SequenceNumber, SequenceRange}; +use store_api::storage::{FileId, RegionId, SequenceNumber, SequenceRange}; use table::predicate::Predicate; use crate::error::{ - self, ColumnNotFoundSnafu, ComputeArrowSnafu, DataTypeMismatchSnafu, EncodeMemtableSnafu, - EncodeSnafu, InvalidMetadataSnafu, NewRecordBatchSnafu, Result, + self, ColumnNotFoundSnafu, ComputeArrowSnafu, ConvertColumnDataTypeSnafu, CreateDefaultSnafu, + DataTypeMismatchSnafu, EncodeMemtableSnafu, EncodeSnafu, InvalidMetadataSnafu, + InvalidRequestSnafu, NewRecordBatchSnafu, Result, UnexpectedSnafu, }; use crate::memtable::bulk::context::BulkIterContextRef; use crate::memtable::bulk::part_reader::EncodedBulkPartIter; @@ -167,6 +170,86 @@ impl BulkPart { } } + /// Fills missing columns in the BulkPart batch with default values. + /// + /// This function checks if the batch schema matches the region metadata schema, + /// and if there are missing columns, it fills them with default values (or null + /// for nullable columns). 
+ /// + /// # Arguments + /// + /// * `region_metadata` - The region metadata containing the expected schema + pub fn fill_missing_columns(&mut self, region_metadata: &RegionMetadata) -> Result<()> { + // Builds a map of existing columns in the batch + let batch_schema = self.batch.schema(); + let batch_columns: HashSet<_> = batch_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + + // Finds columns that need to be filled + let mut columns_to_fill = Vec::new(); + for column_meta in ®ion_metadata.column_metadatas { + // TODO(yingwen): Returns error if it is impure default after we support filling + // bulk insert request in the frontend + if !batch_columns.contains(column_meta.column_schema.name.as_str()) { + columns_to_fill.push(column_meta); + } + } + + if columns_to_fill.is_empty() { + return Ok(()); + } + + let num_rows = self.batch.num_rows(); + + let mut new_columns = Vec::new(); + let mut new_fields = Vec::new(); + + // First, adds all existing columns + new_fields.extend(batch_schema.fields().iter().cloned()); + new_columns.extend_from_slice(self.batch.columns()); + + let region_id = region_metadata.region_id; + // Then adds the missing columns with default values + for column_meta in columns_to_fill { + let default_vector = column_meta + .column_schema + .create_default_vector(num_rows) + .context(CreateDefaultSnafu { + region_id, + column: &column_meta.column_schema.name, + })? + .with_context(|| InvalidRequestSnafu { + region_id, + reason: format!( + "column {} does not have default value", + column_meta.column_schema.name + ), + })?; + let arrow_array = default_vector.to_arrow_array(); + column_meta.column_schema.data_type.as_arrow_type(); + + new_fields.push(Arc::new(Field::new( + column_meta.column_schema.name.clone(), + column_meta.column_schema.data_type.as_arrow_type(), + column_meta.column_schema.is_nullable(), + ))); + new_columns.push(arrow_array); + } + + // Create a new schema and batch with the filled columns + let new_schema = Arc::new(Schema::new(new_fields)); + let new_batch = + RecordBatch::try_new(new_schema, new_columns).context(NewRecordBatchSnafu)?; + + // Update the batch + self.batch = new_batch; + + Ok(()) + } + /// Converts [BulkPart] to [Mutation] for fallback `write_bulk` implementation. pub(crate) fn to_mutation(&self, region_metadata: &RegionMetadataRef) -> Result { let vectors = region_metadata @@ -185,7 +268,7 @@ impl BulkPart { let values = (0..self.batch.num_columns()) .map(|col_idx| { if let Some(v) = &vectors[col_idx] { - value_to_grpc_value(v.get(row_idx)) + to_grpc_value(v.get(row_idx)) } else { api::v1::Value { value_data: None } } @@ -381,7 +464,7 @@ impl UnorderedPart { } /// More accurate estimation of the size of a record batch. -pub(crate) fn record_batch_estimated_size(batch: &RecordBatch) -> usize { +pub fn record_batch_estimated_size(batch: &RecordBatch) -> usize { batch .columns() .iter() @@ -632,7 +715,7 @@ fn new_primary_key_column_builders( } /// Sorts the record batch with primary key format. 
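
A minimal sketch of the column-filling idea behind `fill_missing_columns`, using plain arrow-rs types: when a column expected by the schema is absent from the batch, append a defaulted array and rebuild the batch with the extended schema. The real code derives defaults from the region metadata and reports proper errors; the `fill_missing_i64` helper and its fixed Int64 default are assumptions for illustration:

use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array, TimestampMillisecondArray};
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
use arrow::record_batch::RecordBatch;

fn fill_missing_i64(batch: &RecordBatch, name: &str, default: i64) -> RecordBatch {
    if batch.schema().column_with_name(name).is_some() {
        return batch.clone(); // nothing to fill
    }
    let num_rows = batch.num_rows();

    // Extend the schema and the column list with the defaulted column.
    let mut fields = batch.schema().fields().to_vec();
    fields.push(Arc::new(Field::new(name, DataType::Int64, true)));

    let mut columns: Vec<ArrayRef> = batch.columns().to_vec();
    columns.push(Arc::new(Int64Array::from(vec![default; num_rows])));

    RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).unwrap()
}

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new(
        "ts",
        DataType::Timestamp(TimeUnit::Millisecond, None),
        false,
    )]));
    let ts: ArrayRef = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000]));
    let batch = RecordBatch::try_new(schema, vec![ts]).unwrap();

    let filled = fill_missing_i64(&batch, "v0", 0);
    assert_eq!(filled.num_columns(), 2);
}
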
-fn sort_primary_key_record_batch(batch: &RecordBatch) -> Result { +pub fn sort_primary_key_record_batch(batch: &RecordBatch) -> Result { let total_columns = batch.num_columns(); let sort_columns = vec![ // Primary key column (ascending) @@ -667,6 +750,196 @@ fn sort_primary_key_record_batch(batch: &RecordBatch) -> Result { datatypes::arrow::compute::take_record_batch(batch, &indices).context(ComputeArrowSnafu) } +/// Converts a `BulkPart` that is unordered and without encoded primary keys into a `BulkPart` +/// with the same format as produced by [BulkPartConverter]. +/// +/// This function takes a `BulkPart` where: +/// - For dense encoding: Primary key columns may be stored as individual columns +/// - For sparse encoding: The `__primary_key` column should already be present with encoded keys +/// - The batch may not be sorted +/// +/// And produces a `BulkPart` where: +/// - Primary key columns are optionally stored (depending on `store_primary_key_columns` and encoding) +/// - An encoded `__primary_key` dictionary column is present +/// - The batch is sorted by (primary_key, timestamp, sequence desc) +/// +/// # Arguments +/// +/// * `part` - The input `BulkPart` to convert +/// * `region_metadata` - Region metadata containing schema information +/// * `primary_key_codec` - Codec for encoding primary keys +/// * `schema` - Target schema for the output batch +/// * `store_primary_key_columns` - If true and encoding is not sparse, stores individual primary key columns +/// +/// # Returns +/// +/// Returns `None` if the input part has no rows, otherwise returns a new `BulkPart` with +/// encoded primary keys and sorted data. +pub fn convert_bulk_part( + part: BulkPart, + region_metadata: &RegionMetadataRef, + primary_key_codec: Arc, + schema: SchemaRef, + store_primary_key_columns: bool, +) -> Result> { + if part.num_rows() == 0 { + return Ok(None); + } + + let num_rows = part.num_rows(); + let is_sparse = region_metadata.primary_key_encoding == PrimaryKeyEncoding::Sparse; + + // Builds a column name-to-index map for efficient lookups + let input_schema = part.batch.schema(); + let column_indices: HashMap<&str, usize> = input_schema + .fields() + .iter() + .enumerate() + .map(|(idx, field)| (field.name().as_str(), idx)) + .collect(); + + // Determines the structure of the input batch by looking up columns by name + let mut output_columns = Vec::new(); + + // Extracts primary key columns if we need to encode them (dense encoding) + let pk_array = if is_sparse { + // For sparse encoding, the input should already have the __primary_key column + // We need to find it in the input batch + None + } else { + // For dense encoding, extract and encode primary key columns by name + let pk_vectors: Result> = region_metadata + .primary_key_columns() + .map(|col_meta| { + let col_idx = column_indices + .get(col_meta.column_schema.name.as_str()) + .context(ColumnNotFoundSnafu { + column: &col_meta.column_schema.name, + })?; + let col = part.batch.column(*col_idx); + Helper::try_into_vector(col).context(error::ComputeVectorSnafu) + }) + .collect(); + let pk_vectors = pk_vectors?; + + let mut key_array_builder = PrimaryKeyArrayBuilder::new(); + let mut encode_buf = Vec::new(); + + for row_idx in 0..num_rows { + encode_buf.clear(); + + // Collects primary key values with column IDs for this row + let pk_values_with_ids: Vec<_> = region_metadata + .primary_key + .iter() + .zip(pk_vectors.iter()) + .map(|(col_id, vector)| (*col_id, vector.get_ref(row_idx))) + .collect(); + + // Encodes the primary key + 
primary_key_codec + .encode_value_refs(&pk_values_with_ids, &mut encode_buf) + .context(EncodeSnafu)?; + + key_array_builder + .append(&encode_buf) + .context(ComputeArrowSnafu)?; + } + + Some(key_array_builder.finish()) + }; + + // Adds primary key columns if storing them (only for dense encoding) + if store_primary_key_columns && !is_sparse { + for col_meta in region_metadata.primary_key_columns() { + let col_idx = column_indices + .get(col_meta.column_schema.name.as_str()) + .context(ColumnNotFoundSnafu { + column: &col_meta.column_schema.name, + })?; + let col = part.batch.column(*col_idx); + + // Converts to dictionary if needed for string types + let col = if col_meta.column_schema.data_type.is_string() { + let target_type = ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt32), + Box::new(ArrowDataType::Utf8), + ); + arrow::compute::cast(col, &target_type).context(ComputeArrowSnafu)? + } else { + col.clone() + }; + output_columns.push(col); + } + } + + // Adds field columns + for col_meta in region_metadata.field_columns() { + let col_idx = column_indices + .get(col_meta.column_schema.name.as_str()) + .context(ColumnNotFoundSnafu { + column: &col_meta.column_schema.name, + })?; + output_columns.push(part.batch.column(*col_idx).clone()); + } + + // Adds timestamp column + let new_timestamp_index = output_columns.len(); + let ts_col_idx = column_indices + .get( + region_metadata + .time_index_column() + .column_schema + .name + .as_str(), + ) + .context(ColumnNotFoundSnafu { + column: ®ion_metadata.time_index_column().column_schema.name, + })?; + output_columns.push(part.batch.column(*ts_col_idx).clone()); + + // Adds encoded primary key dictionary column + let pk_dictionary = if let Some(pk_dict_array) = pk_array { + Arc::new(pk_dict_array) as ArrayRef + } else { + let pk_col_idx = + column_indices + .get(PRIMARY_KEY_COLUMN_NAME) + .context(ColumnNotFoundSnafu { + column: PRIMARY_KEY_COLUMN_NAME, + })?; + let col = part.batch.column(*pk_col_idx); + + // Casts to dictionary type if needed + let target_type = ArrowDataType::Dictionary( + Box::new(ArrowDataType::UInt32), + Box::new(ArrowDataType::Binary), + ); + arrow::compute::cast(col, &target_type).context(ComputeArrowSnafu)? 
+ }; + output_columns.push(pk_dictionary); + + let sequence_array = UInt64Array::from(vec![part.sequence; num_rows]); + output_columns.push(Arc::new(sequence_array) as ArrayRef); + + let op_type_array = UInt8Array::from(vec![OpType::Put as u8; num_rows]); + output_columns.push(Arc::new(op_type_array) as ArrayRef); + + let batch = RecordBatch::try_new(schema, output_columns).context(NewRecordBatchSnafu)?; + + // Sorts the batch by (primary_key, timestamp, sequence desc) + let sorted_batch = sort_primary_key_record_batch(&batch)?; + + Ok(Some(BulkPart { + batch: sorted_batch, + max_timestamp: part.max_timestamp, + min_timestamp: part.min_timestamp, + sequence: part.sequence, + timestamp_index: new_timestamp_index, + raw_data: None, + })) +} + #[derive(Debug, Clone)] pub struct EncodedBulkPart { data: Bytes, @@ -701,6 +974,19 @@ impl EncodedBulkPart { /// Returns a `SstInfo` instance with information derived from this bulk part's metadata pub(crate) fn to_sst_info(&self, file_id: FileId) -> SstInfo { let unit = self.metadata.region_metadata.time_index_type().unit(); + let max_row_group_uncompressed_size: u64 = self + .metadata + .parquet_metadata + .row_groups() + .iter() + .map(|rg| { + rg.columns() + .iter() + .map(|c| c.uncompressed_size() as u64) + .sum::() + }) + .max() + .unwrap_or(0); SstInfo { file_id, time_range: ( @@ -708,6 +994,7 @@ impl EncodedBulkPart { Timestamp::new(self.metadata.max_timestamp, unit), ), file_size: self.data.len() as u64, + max_row_group_uncompressed_size, num_rows: self.metadata.num_rows, num_row_groups: self.metadata.parquet_metadata.num_row_groups() as u64, file_metadata: Some(self.metadata.parquet_metadata.clone()), @@ -924,340 +1211,24 @@ impl BulkPartEncoder { } } -/// Converts mutations to record batches. -fn mutations_to_record_batch( - mutations: &[Mutation], - metadata: &RegionMetadataRef, - pk_encoder: &DensePrimaryKeyCodec, - dedup: bool, -) -> Result> { - let total_rows: usize = mutations - .iter() - .map(|m| m.rows.as_ref().map(|r| r.rows.len()).unwrap_or(0)) - .sum(); - - if total_rows == 0 { - return Ok(None); - } - - let mut pk_builder = BinaryBuilder::with_capacity(total_rows, 0); - - let mut ts_vector: Box = metadata - .time_index_column() - .column_schema - .data_type - .create_mutable_vector(total_rows); - let mut sequence_builder = UInt64Builder::with_capacity(total_rows); - let mut op_type_builder = UInt8Builder::with_capacity(total_rows); - - let mut field_builders: Vec> = metadata - .field_columns() - .map(|f| f.column_schema.data_type.create_mutable_vector(total_rows)) - .collect(); - - let mut pk_buffer = vec![]; - for m in mutations { - let Some(key_values) = KeyValuesRef::new(metadata, m) else { - continue; - }; - - for row in key_values.iter() { - pk_buffer.clear(); - pk_encoder - .encode_to_vec(row.primary_keys(), &mut pk_buffer) - .context(EncodeSnafu)?; - pk_builder.append_value(pk_buffer.as_bytes()); - ts_vector.push_value_ref(&row.timestamp()); - sequence_builder.append_value(row.sequence()); - op_type_builder.append_value(row.op_type() as u8); - for (builder, field) in field_builders.iter_mut().zip(row.fields()) { - builder.push_value_ref(&field); - } - } - } - - let arrow_schema = to_sst_arrow_schema(metadata); - // safety: timestamp column must be valid, and values must not be None. 
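
A minimal sketch of how the new `max_row_group_uncompressed_size` statistic is computed: per row group, sum the uncompressed sizes of its column chunks, then take the maximum across row groups (0 when there are none). Row groups are modeled as plain vectors of per-column sizes instead of parquet metadata:

fn max_row_group_uncompressed_size(row_groups: &[Vec<u64>]) -> u64 {
    row_groups
        .iter()
        .map(|columns| columns.iter().sum::<u64>())
        .max()
        .unwrap_or(0)
}

fn main() {
    // Two row groups: 10 + 20 = 30 and 5 + 50 = 55, so the maximum is 55.
    assert_eq!(max_row_group_uncompressed_size(&[vec![10, 20], vec![5, 50]]), 55);
    assert_eq!(max_row_group_uncompressed_size(&[]), 0);
}
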
- let timestamp_unit = metadata - .time_index_column() - .column_schema - .data_type - .as_timestamp() - .unwrap() - .unit(); - let sorter = ArraysSorter { - encoded_primary_keys: pk_builder.finish(), - timestamp_unit, - timestamp: ts_vector.to_vector().to_arrow_array(), - sequence: sequence_builder.finish(), - op_type: op_type_builder.finish(), - fields: field_builders - .iter_mut() - .map(|f| f.to_vector().to_arrow_array()), - dedup, - arrow_schema, - }; - - sorter.sort().map(Some) -} - -struct ArraysSorter { - encoded_primary_keys: BinaryArray, - timestamp_unit: TimeUnit, - timestamp: ArrayRef, - sequence: UInt64Array, - op_type: UInt8Array, - fields: I, - dedup: bool, - arrow_schema: SchemaRef, -} - -impl ArraysSorter -where - I: Iterator, -{ - /// Converts arrays to record batch. - fn sort(self) -> Result<(RecordBatch, i64, i64)> { - debug_assert!(!self.timestamp.is_empty()); - debug_assert!(self.timestamp.len() == self.sequence.len()); - debug_assert!(self.timestamp.len() == self.op_type.len()); - debug_assert!(self.timestamp.len() == self.encoded_primary_keys.len()); - - let timestamp_iter = timestamp_array_to_iter(self.timestamp_unit, &self.timestamp); - let (mut min_timestamp, mut max_timestamp) = (i64::MAX, i64::MIN); - let mut to_sort = self - .encoded_primary_keys - .iter() - .zip(timestamp_iter) - .zip(self.sequence.iter()) - .map(|((pk, timestamp), sequence)| { - max_timestamp = max_timestamp.max(*timestamp); - min_timestamp = min_timestamp.min(*timestamp); - (pk, timestamp, sequence) - }) - .enumerate() - .collect::>(); - - to_sort.sort_unstable_by(|(_, (l_pk, l_ts, l_seq)), (_, (r_pk, r_ts, r_seq))| { - l_pk.cmp(r_pk) - .then(l_ts.cmp(r_ts)) - .then(l_seq.cmp(r_seq).reverse()) - }); - - if self.dedup { - // Dedup by timestamps while ignore sequence. - to_sort.dedup_by(|(_, (l_pk, l_ts, _)), (_, (r_pk, r_ts, _))| { - l_pk == r_pk && l_ts == r_ts - }); - } - - let indices = UInt32Array::from_iter_values(to_sort.iter().map(|v| v.0 as u32)); - - let pk_dictionary = Arc::new(binary_array_to_dictionary( - // safety: pk must be BinaryArray - arrow::compute::take( - &self.encoded_primary_keys, - &indices, - Some(TakeOptions { - check_bounds: false, - }), - ) - .context(ComputeArrowSnafu)? - .as_any() - .downcast_ref::() - .unwrap(), - )?) as ArrayRef; - - let mut arrays = Vec::with_capacity(self.arrow_schema.fields.len()); - for arr in self.fields { - arrays.push( - arrow::compute::take( - &arr, - &indices, - Some(TakeOptions { - check_bounds: false, - }), - ) - .context(ComputeArrowSnafu)?, - ); - } - - let timestamp = arrow::compute::take( - &self.timestamp, - &indices, - Some(TakeOptions { - check_bounds: false, - }), - ) - .context(ComputeArrowSnafu)?; - - arrays.push(timestamp); - arrays.push(pk_dictionary); - arrays.push( - arrow::compute::take( - &self.sequence, - &indices, - Some(TakeOptions { - check_bounds: false, - }), - ) - .context(ComputeArrowSnafu)?, - ); - - arrays.push( - arrow::compute::take( - &self.op_type, - &indices, - Some(TakeOptions { - check_bounds: false, - }), - ) - .context(ComputeArrowSnafu)?, - ); - - let batch = RecordBatch::try_new(self.arrow_schema, arrays).context(NewRecordBatchSnafu)?; - Ok((batch, min_timestamp, max_timestamp)) - } -} - -/// Converts timestamp array to an iter of i64 values. -fn timestamp_array_to_iter( - timestamp_unit: TimeUnit, - timestamp: &ArrayRef, -) -> impl Iterator { - match timestamp_unit { - // safety: timestamp column must be valid. 
- TimeUnit::Second => timestamp - .as_any() - .downcast_ref::() - .unwrap() - .values() - .iter(), - TimeUnit::Millisecond => timestamp - .as_any() - .downcast_ref::() - .unwrap() - .values() - .iter(), - TimeUnit::Microsecond => timestamp - .as_any() - .downcast_ref::() - .unwrap() - .values() - .iter(), - TimeUnit::Nanosecond => timestamp - .as_any() - .downcast_ref::() - .unwrap() - .values() - .iter(), - } -} - -/// Converts a **sorted** [BinaryArray] to [DictionaryArray]. -fn binary_array_to_dictionary(input: &BinaryArray) -> Result { - if input.is_empty() { - return Ok(DictionaryArray::new( - UInt32Array::from(Vec::::new()), - Arc::new(BinaryArray::from_vec(vec![])) as ArrayRef, - )); - } - let mut keys = Vec::with_capacity(16); - let mut values = BinaryBuilder::new(); - let mut prev: usize = 0; - keys.push(prev as u32); - values.append_value(input.value(prev)); - - for current_bytes in input.iter().skip(1) { - // safety: encoded pk must present. - let current_bytes = current_bytes.unwrap(); - let prev_bytes = input.value(prev); - if current_bytes != prev_bytes { - values.append_value(current_bytes); - prev += 1; - } - keys.push(prev as u32); - } - - Ok(DictionaryArray::new( - UInt32Array::from(keys), - Arc::new(values.finish()) as ArrayRef, - )) -} - #[cfg(test)] mod tests { - use std::collections::VecDeque; - - use api::v1::{Row, WriteHint}; + use api::v1::{Row, SemanticType, WriteHint}; use datafusion_common::ScalarValue; use datatypes::arrow::array::Float64Array; - use datatypes::prelude::{ConcreteDataType, ScalarVector, Value}; - use datatypes::vectors::{Float64Vector, TimestampMillisecondVector}; + use datatypes::prelude::{ConcreteDataType, Value}; + use datatypes::schema::ColumnSchema; + use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder}; + use store_api::storage::RegionId; use store_api::storage::consts::ReservedColumnId; use super::*; use crate::memtable::bulk::context::BulkIterContext; - use crate::sst::parquet::format::{PrimaryKeyReadFormat, ReadFormat}; use crate::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema}; use crate::test_util::memtable_util::{ build_key_values_with_ts_seq_values, metadata_for_test, region_metadata_to_row_schema, }; - fn check_binary_array_to_dictionary( - input: &[&[u8]], - expected_keys: &[u32], - expected_values: &[&[u8]], - ) { - let input = BinaryArray::from_iter_values(input.iter()); - let array = binary_array_to_dictionary(&input).unwrap(); - assert_eq!( - &expected_keys, - &array.keys().iter().map(|v| v.unwrap()).collect::>() - ); - assert_eq!( - expected_values, - &array - .values() - .as_any() - .downcast_ref::() - .unwrap() - .iter() - .map(|v| v.unwrap()) - .collect::>() - ); - } - - #[test] - fn test_binary_array_to_dictionary() { - check_binary_array_to_dictionary(&[], &[], &[]); - - check_binary_array_to_dictionary(&["a".as_bytes()], &[0], &["a".as_bytes()]); - - check_binary_array_to_dictionary( - &["a".as_bytes(), "a".as_bytes()], - &[0, 0], - &["a".as_bytes()], - ); - - check_binary_array_to_dictionary( - &["a".as_bytes(), "a".as_bytes(), "b".as_bytes()], - &[0, 0, 1], - &["a".as_bytes(), "b".as_bytes()], - ); - - check_binary_array_to_dictionary( - &[ - "a".as_bytes(), - "a".as_bytes(), - "b".as_bytes(), - "c".as_bytes(), - ], - &[0, 0, 1, 2], - &["a".as_bytes(), "b".as_bytes(), "c".as_bytes()], - ); - } - struct MutationInput<'a> { k0: &'a str, k1: u32, @@ -1273,232 +1244,6 @@ mod tests { v1: &'a [Option], } - fn check_mutations_to_record_batches( - input: &[MutationInput], - expected: &[BatchOutput], - 
expected_timestamp: (i64, i64), - dedup: bool, - ) { - let metadata = metadata_for_test(); - let mutations = input - .iter() - .map(|m| { - build_key_values_with_ts_seq_values( - &metadata, - m.k0.to_string(), - m.k1, - m.timestamps.iter().copied(), - m.v1.iter().copied(), - m.sequence, - ) - .mutation - }) - .collect::>(); - let total_rows: usize = mutations - .iter() - .flat_map(|m| m.rows.iter()) - .map(|r| r.rows.len()) - .sum(); - - let pk_encoder = DensePrimaryKeyCodec::new(&metadata); - - let (batch, _, _) = mutations_to_record_batch(&mutations, &metadata, &pk_encoder, dedup) - .unwrap() - .unwrap(); - let read_format = PrimaryKeyReadFormat::new_with_all_columns(metadata.clone()); - let mut batches = VecDeque::new(); - read_format - .convert_record_batch(&batch, None, &mut batches) - .unwrap(); - if !dedup { - assert_eq!( - total_rows, - batches.iter().map(|b| { b.num_rows() }).sum::() - ); - } - let batch_values = batches - .into_iter() - .map(|b| { - let pk_values = pk_encoder.decode(b.primary_key()).unwrap().into_dense(); - let timestamps = b - .timestamps() - .as_any() - .downcast_ref::() - .unwrap() - .iter_data() - .map(|v| v.unwrap().0.value()) - .collect::>(); - let float_values = b.fields()[1] - .data - .as_any() - .downcast_ref::() - .unwrap() - .iter_data() - .collect::>(); - - (pk_values, timestamps, float_values) - }) - .collect::>(); - assert_eq!(expected.len(), batch_values.len()); - - for idx in 0..expected.len() { - assert_eq!(expected[idx].pk_values, &batch_values[idx].0); - assert_eq!(expected[idx].timestamps, &batch_values[idx].1); - assert_eq!(expected[idx].v1, &batch_values[idx].2); - } - } - - #[test] - fn test_mutations_to_record_batch() { - check_mutations_to_record_batches( - &[MutationInput { - k0: "a", - k1: 0, - timestamps: &[0], - v1: &[Some(0.1)], - sequence: 0, - }], - &[BatchOutput { - pk_values: &[Value::String("a".into()), Value::UInt32(0)], - timestamps: &[0], - v1: &[Some(0.1)], - }], - (0, 0), - true, - ); - - check_mutations_to_record_batches( - &[ - MutationInput { - k0: "a", - k1: 0, - timestamps: &[0], - v1: &[Some(0.1)], - sequence: 0, - }, - MutationInput { - k0: "b", - k1: 0, - timestamps: &[0], - v1: &[Some(0.0)], - sequence: 0, - }, - MutationInput { - k0: "a", - k1: 0, - timestamps: &[1], - v1: &[Some(0.2)], - sequence: 1, - }, - MutationInput { - k0: "a", - k1: 1, - timestamps: &[1], - v1: &[Some(0.3)], - sequence: 2, - }, - ], - &[ - BatchOutput { - pk_values: &[Value::String("a".into()), Value::UInt32(0)], - timestamps: &[0, 1], - v1: &[Some(0.1), Some(0.2)], - }, - BatchOutput { - pk_values: &[Value::String("a".into()), Value::UInt32(1)], - timestamps: &[1], - v1: &[Some(0.3)], - }, - BatchOutput { - pk_values: &[Value::String("b".into()), Value::UInt32(0)], - timestamps: &[0], - v1: &[Some(0.0)], - }, - ], - (0, 1), - true, - ); - - check_mutations_to_record_batches( - &[ - MutationInput { - k0: "a", - k1: 0, - timestamps: &[0], - v1: &[Some(0.1)], - sequence: 0, - }, - MutationInput { - k0: "b", - k1: 0, - timestamps: &[0], - v1: &[Some(0.0)], - sequence: 0, - }, - MutationInput { - k0: "a", - k1: 0, - timestamps: &[0], - v1: &[Some(0.2)], - sequence: 1, - }, - ], - &[ - BatchOutput { - pk_values: &[Value::String("a".into()), Value::UInt32(0)], - timestamps: &[0], - v1: &[Some(0.2)], - }, - BatchOutput { - pk_values: &[Value::String("b".into()), Value::UInt32(0)], - timestamps: &[0], - v1: &[Some(0.0)], - }, - ], - (0, 0), - true, - ); - check_mutations_to_record_batches( - &[ - MutationInput { - k0: "a", - k1: 0, - timestamps: 
&[0], - v1: &[Some(0.1)], - sequence: 0, - }, - MutationInput { - k0: "b", - k1: 0, - timestamps: &[0], - v1: &[Some(0.0)], - sequence: 0, - }, - MutationInput { - k0: "a", - k1: 0, - timestamps: &[0], - v1: &[Some(0.2)], - sequence: 1, - }, - ], - &[ - BatchOutput { - pk_values: &[Value::String("a".into()), Value::UInt32(0)], - timestamps: &[0, 0], - v1: &[Some(0.2), Some(0.1)], - }, - BatchOutput { - pk_values: &[Value::String("b".into()), Value::UInt32(0)], - timestamps: &[0], - v1: &[Some(0.0)], - }, - ], - (0, 0), - false, - ); - } - fn encode(input: &[MutationInput]) -> EncodedBulkPart { let metadata = metadata_for_test(); let kvs = input @@ -2166,4 +1911,379 @@ mod tests { ); } } + + #[test] + fn test_convert_bulk_part_empty() { + let metadata = metadata_for_test(); + let schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create empty batch + let empty_batch = RecordBatch::new_empty(schema.clone()); + let empty_part = BulkPart { + batch: empty_batch, + max_timestamp: 0, + min_timestamp: 0, + sequence: 0, + timestamp_index: 0, + raw_data: None, + }; + + let result = + convert_bulk_part(empty_part, &metadata, primary_key_codec, schema, true).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_convert_bulk_part_dense_with_pk_columns() { + let metadata = metadata_for_test(); + let primary_key_codec = build_primary_key_codec(&metadata); + + let k0_array = Arc::new(arrow::array::StringArray::from(vec![ + "key1", "key2", "key1", + ])); + let k1_array = Arc::new(arrow::array::UInt32Array::from(vec![1, 2, 1])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![100, 200, 300])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![1.0, 2.0, 3.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000, 1500])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("k0", ArrowDataType::Utf8, false), + Field::new("k1", ArrowDataType::UInt32, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = RecordBatch::try_new( + input_schema, + vec![k0_array, k1_array, v0_array, v1_array, ts_array], + ) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 2000, + min_timestamp: 1000, + sequence: 5, + timestamp_index: 4, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + let result = convert_bulk_part( + part, + &metadata, + primary_key_codec, + output_schema, + true, // store primary key columns + ) + .unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 3); + assert_eq!(converted.max_timestamp, 2000); + assert_eq!(converted.min_timestamp, 1000); + assert_eq!(converted.sequence, 5); + + let schema = converted.batch.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + field_names, + vec![ + "k0", + "k1", + "v0", + "v1", + "ts", + "__primary_key", + "__sequence", + "__op_type" + ] + ); + + let k0_col = converted.batch.column_by_name("k0").unwrap(); + assert!(matches!( + k0_col.data_type(), + ArrowDataType::Dictionary(_, _) + )); + + let pk_col = 
converted.batch.column_by_name("__primary_key").unwrap(); + let dict_array = pk_col + .as_any() + .downcast_ref::>() + .unwrap(); + let keys = dict_array.keys(); + + assert_eq!(keys.len(), 3); + } + + #[test] + fn test_convert_bulk_part_dense_without_pk_columns() { + let metadata = metadata_for_test(); + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create input batch with primary key columns (k0, k1) + let k0_array = Arc::new(arrow::array::StringArray::from(vec!["key1", "key2"])); + let k1_array = Arc::new(arrow::array::UInt32Array::from(vec![1, 2])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![100, 200])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![1.0, 2.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("k0", ArrowDataType::Utf8, false), + Field::new("k1", ArrowDataType::UInt32, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = RecordBatch::try_new( + input_schema, + vec![k0_array, k1_array, v0_array, v1_array, ts_array], + ) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 2000, + min_timestamp: 1000, + sequence: 3, + timestamp_index: 4, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions { + raw_pk_columns: false, + string_pk_use_dict: true, + }, + ); + + let result = convert_bulk_part( + part, + &metadata, + primary_key_codec, + output_schema, + false, // don't store primary key columns + ) + .unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 2); + assert_eq!(converted.max_timestamp, 2000); + assert_eq!(converted.min_timestamp, 1000); + assert_eq!(converted.sequence, 3); + + // Verify schema does NOT include individual primary key columns + let schema = converted.batch.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + field_names, + vec!["v0", "v1", "ts", "__primary_key", "__sequence", "__op_type"] + ); + + // Verify __primary_key column is present and is a dictionary + let pk_col = converted.batch.column_by_name("__primary_key").unwrap(); + assert!(matches!( + pk_col.data_type(), + ArrowDataType::Dictionary(_, _) + )); + } + + #[test] + fn test_convert_bulk_part_sparse_encoding() { + let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456)); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("k0", ConcreteDataType::string_datatype(), false), + semantic_type: SemanticType::Tag, + column_id: 0, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("k1", ConcreteDataType::string_datatype(), false), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("v0", ConcreteDataType::int64_datatype(), true), + semantic_type: SemanticType::Field, + column_id: 3, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("v1", ConcreteDataType::float64_datatype(), true), + 
semantic_type: SemanticType::Field, + column_id: 4, + }) + .primary_key(vec![0, 1]) + .primary_key_encoding(PrimaryKeyEncoding::Sparse); + let metadata = Arc::new(builder.build().unwrap()); + + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create input batch with __primary_key column (sparse encoding) + let pk_array = Arc::new(arrow::array::BinaryArray::from(vec![ + b"encoded_key_1".as_slice(), + b"encoded_key_2".as_slice(), + ])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![100, 200])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![1.0, 2.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("__primary_key", ArrowDataType::Binary, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = + RecordBatch::try_new(input_schema, vec![pk_array, v0_array, v1_array, ts_array]) + .unwrap(); + + let part = BulkPart { + batch: input_batch, + max_timestamp: 2000, + min_timestamp: 1000, + sequence: 7, + timestamp_index: 3, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + let result = convert_bulk_part( + part, + &metadata, + primary_key_codec, + output_schema, + true, // store_primary_key_columns (ignored for sparse) + ) + .unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 2); + assert_eq!(converted.max_timestamp, 2000); + assert_eq!(converted.min_timestamp, 1000); + assert_eq!(converted.sequence, 7); + + // Verify schema does NOT include individual primary key columns (sparse encoding) + let schema = converted.batch.schema(); + let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!( + field_names, + vec!["v0", "v1", "ts", "__primary_key", "__sequence", "__op_type"] + ); + + // Verify __primary_key is dictionary encoded + let pk_col = converted.batch.column_by_name("__primary_key").unwrap(); + assert!(matches!( + pk_col.data_type(), + ArrowDataType::Dictionary(_, _) + )); + } + + #[test] + fn test_convert_bulk_part_sorting_with_multiple_series() { + let metadata = metadata_for_test(); + let primary_key_codec = build_primary_key_codec(&metadata); + + // Create unsorted batch with multiple series and timestamps + let k0_array = Arc::new(arrow::array::StringArray::from(vec![ + "series_b", "series_a", "series_b", "series_a", + ])); + let k1_array = Arc::new(arrow::array::UInt32Array::from(vec![2, 1, 2, 1])); + let v0_array = Arc::new(arrow::array::Int64Array::from(vec![200, 100, 400, 300])); + let v1_array = Arc::new(arrow::array::Float64Array::from(vec![2.0, 1.0, 4.0, 3.0])); + let ts_array = Arc::new(TimestampMillisecondArray::from(vec![ + 2000, 1000, 4000, 3000, + ])); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new("k0", ArrowDataType::Utf8, false), + Field::new("k1", ArrowDataType::UInt32, false), + Field::new("v0", ArrowDataType::Int64, true), + Field::new("v1", ArrowDataType::Float64, true), + Field::new( + "ts", + ArrowDataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None), + false, + ), + ])); + + let input_batch = RecordBatch::try_new( + input_schema, + vec![k0_array, k1_array, v0_array, v1_array, ts_array], + ) + .unwrap(); + + let part = 
BulkPart { + batch: input_batch, + max_timestamp: 4000, + min_timestamp: 1000, + sequence: 10, + timestamp_index: 4, + raw_data: None, + }; + + let output_schema = to_flat_sst_arrow_schema( + &metadata, + &FlatSchemaOptions::from_encoding(metadata.primary_key_encoding), + ); + + let result = + convert_bulk_part(part, &metadata, primary_key_codec, output_schema, true).unwrap(); + + let converted = result.unwrap(); + + assert_eq!(converted.num_rows(), 4); + + // Verify data is sorted by (primary_key, timestamp, sequence desc) + let ts_col = converted.batch.column(converted.timestamp_index); + let ts_array = ts_col + .as_any() + .downcast_ref::() + .unwrap(); + + // After sorting by (pk, ts), we should have: + // series_a,1: ts=1000, 3000 + // series_b,2: ts=2000, 4000 + let timestamps: Vec = ts_array.values().to_vec(); + assert_eq!(timestamps, vec![1000, 3000, 2000, 4000]); + } } diff --git a/src/mito2/src/memtable/partition_tree/dict.rs b/src/mito2/src/memtable/partition_tree/dict.rs index 62adda62bb..77cc835ea0 100644 --- a/src/mito2/src/memtable/partition_tree/dict.rs +++ b/src/mito2/src/memtable/partition_tree/dict.rs @@ -103,7 +103,7 @@ impl KeyDictBuilder { self.key_bytes_in_index += full_primary_key.len() + sparse_key_len; // Adds key size of index to the metrics. - MEMTABLE_DICT_BYTES.add(self.key_bytes_in_index as i64); + MEMTABLE_DICT_BYTES.add((full_primary_key.len() + sparse_key_len) as i64); pk_index } diff --git a/src/mito2/src/memtable/simple_bulk_memtable.rs b/src/mito2/src/memtable/simple_bulk_memtable.rs index 4e0b9ac525..4c3f31c2b8 100644 --- a/src/mito2/src/memtable/simple_bulk_memtable.rs +++ b/src/mito2/src/memtable/simple_bulk_memtable.rs @@ -627,7 +627,7 @@ mod tests { .await .unwrap(); - let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false)); + let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false), None); let mut num_rows = 0; while let Some(b) = reader.next_batch().await.unwrap() { num_rows += b.num_rows(); @@ -659,7 +659,7 @@ mod tests { .await .unwrap(); - let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false)); + let mut reader = DedupReader::new(reader, read::dedup::LastRow::new(false), None); let mut num_rows = 0; while let Some(b) = reader.next_batch().await.unwrap() { num_rows += b.num_rows(); diff --git a/src/mito2/src/memtable/time_partition.rs b/src/mito2/src/memtable/time_partition.rs index 9131de32a5..6f11c813cb 100644 --- a/src/mito2/src/memtable/time_partition.rs +++ b/src/mito2/src/memtable/time_partition.rs @@ -261,7 +261,7 @@ impl TimePartitions { converter.append_key_values(kvs)?; let part = converter.convert()?; - return self.write_bulk(part); + return self.write_bulk_inner(part); } // Get all parts. @@ -291,7 +291,31 @@ impl TimePartitions { self.write_multi_parts(kvs, &parts) } + /// Writes a bulk part. pub fn write_bulk(&self, part: BulkPart) -> Result<()> { + // Convert the bulk part if bulk_schema is Some + let part = if let Some(bulk_schema) = &self.bulk_schema { + let converted = crate::memtable::bulk::part::convert_bulk_part( + part, + &self.metadata, + self.primary_key_codec.clone(), + bulk_schema.clone(), + // Always store primary keys for bulk mode. + true, + )?; + match converted { + Some(p) => p, + None => return Ok(()), + } + } else { + part + }; + + self.write_bulk_inner(part) + } + + /// Writes a bulk part without converting. 
+ fn write_bulk_inner(&self, part: BulkPart) -> Result<()> { let time_type = self .metadata .time_index_column() diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index a4b2c570e7..be5f4945fd 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -157,6 +157,35 @@ lazy_static! { "greptime_mito_inflight_compaction_count", "inflight compaction count", ).unwrap(); + + /// Bytes reserved by compaction memory manager. + pub static ref COMPACTION_MEMORY_IN_USE: IntGauge = + register_int_gauge!( + "greptime_mito_compaction_memory_in_use_bytes", + "bytes currently reserved for compaction tasks", + ) + .unwrap(); + /// Configured compaction memory limit. + pub static ref COMPACTION_MEMORY_LIMIT: IntGauge = + register_int_gauge!( + "greptime_mito_compaction_memory_limit_bytes", + "maximum bytes allowed for compaction tasks", + ) + .unwrap(); + /// Wait time to obtain compaction memory. + pub static ref COMPACTION_MEMORY_WAIT: Histogram = register_histogram!( + "greptime_mito_compaction_memory_wait_seconds", + "time waiting for compaction memory", + // 0.01s ~ ~10s + exponential_buckets(0.01, 2.0, 10).unwrap(), + ).unwrap(); + /// Counter of rejected compaction memory allocations. + pub static ref COMPACTION_MEMORY_REJECTED: IntCounterVec = + register_int_counter_vec!( + "greptime_mito_compaction_memory_rejected_total", + "number of compaction tasks rejected due to memory limit", + &[TYPE_LABEL] + ).unwrap(); } // Query metrics. @@ -474,7 +503,7 @@ lazy_static! { .unwrap(); /// Counter for the number of files deleted by the GC worker. - pub static ref GC_DEL_FILE_CNT: IntGauge = + pub static ref GC_DELETE_FILE_CNT: IntGauge = register_int_gauge!( "greptime_mito_gc_delete_file_count", "mito gc deleted file count", diff --git a/src/mito2/src/read/compat.rs b/src/mito2/src/read/compat.rs index 8bc24a4953..8a69b1856f 100644 --- a/src/mito2/src/read/compat.rs +++ b/src/mito2/src/read/compat.rs @@ -18,6 +18,7 @@ use std::collections::HashMap; use std::sync::Arc; use api::v1::SemanticType; +use common_recordbatch::recordbatch::align_json_array; use datatypes::arrow::array::{ Array, ArrayRef, BinaryArray, BinaryBuilder, DictionaryArray, UInt32Array, }; @@ -27,7 +28,7 @@ use datatypes::arrow::record_batch::RecordBatch; use datatypes::data_type::ConcreteDataType; use datatypes::prelude::DataType; use datatypes::value::Value; -use datatypes::vectors::VectorRef; +use datatypes::vectors::{Helper, VectorRef}; use mito_codec::row_converter::{ CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec, build_primary_key_codec_with_fields, @@ -38,8 +39,9 @@ use store_api::metadata::{RegionMetadata, RegionMetadataRef}; use store_api::storage::ColumnId; use crate::error::{ - CompatReaderSnafu, ComputeArrowSnafu, CreateDefaultSnafu, DecodeSnafu, EncodeSnafu, - NewRecordBatchSnafu, Result, UnexpectedSnafu, UnsupportedOperationSnafu, + CastVectorSnafu, CompatReaderSnafu, ComputeArrowSnafu, ConvertVectorSnafu, CreateDefaultSnafu, + DecodeSnafu, EncodeSnafu, NewRecordBatchSnafu, RecordBatchSnafu, Result, UnexpectedSnafu, + UnsupportedOperationSnafu, }; use crate::read::flat_projection::{FlatProjectionMapper, flat_projected_columns}; use crate::read::projection::{PrimaryKeyProjectionMapper, ProjectionMapper}; @@ -150,7 +152,7 @@ impl PrimaryKeyCompatBatch { batch = compat_pk.compat(batch)?; } if let Some(compat_fields) = &self.compat_fields { - batch = compat_fields.compat(batch); + batch = compat_fields.compat(batch)?; } Ok(batch) @@ -351,11 +353,13 @@ impl FlatCompatBatch { let 
old_column = batch.column(*pos); if let Some(ty) = cast_type { - // Safety: We ensure type can be converted and the new batch should be valid. - // Tips: `safe` must be true in `CastOptions`, which will replace the specific value with null when it cannot be converted. - let casted = + let casted = if let Some(json_type) = ty.as_json() { + align_json_array(old_column, &json_type.as_arrow_type()) + .context(RecordBatchSnafu)? + } else { datatypes::arrow::compute::cast(old_column, &ty.as_arrow_type()) - .context(ComputeArrowSnafu)?; + .context(ComputeArrowSnafu)? + }; Ok(casted) } else { Ok(old_column.clone()) @@ -452,8 +456,7 @@ struct CompatFields { impl CompatFields { /// Make fields of the `batch` compatible. - #[must_use] - fn compat(&self, batch: Batch) -> Batch { + fn compat(&self, batch: Batch) -> Result { debug_assert_eq!(self.actual_fields.len(), batch.fields().len()); debug_assert!( self.actual_fields @@ -463,24 +466,32 @@ impl CompatFields { ); let len = batch.num_rows(); - let fields = self - .index_or_defaults + self.index_or_defaults .iter() .map(|index_or_default| match index_or_default { IndexOrDefault::Index { pos, cast_type } => { let old_column = &batch.fields()[*pos]; let data = if let Some(ty) = cast_type { - // Safety: We ensure type can be converted and the new batch should be valid. - // Tips: `safe` must be true in `CastOptions`, which will replace the specific value with null when it cannot be converted. - old_column.data.cast(ty).unwrap() + if let Some(json_type) = ty.as_json() { + let json_array = old_column.data.to_arrow_array(); + let json_array = + align_json_array(&json_array, &json_type.as_arrow_type()) + .context(RecordBatchSnafu)?; + Helper::try_into_vector(&json_array).context(ConvertVectorSnafu)? + } else { + old_column.data.cast(ty).with_context(|_| CastVectorSnafu { + from: old_column.data.data_type(), + to: ty.clone(), + })? + } } else { old_column.data.clone() }; - BatchColumn { + Ok(BatchColumn { column_id: old_column.column_id, data, - } + }) } IndexOrDefault::DefaultValue { column_id, @@ -488,16 +499,14 @@ impl CompatFields { semantic_type: _, } => { let data = default_vector.replicate(&[len]); - BatchColumn { + Ok(BatchColumn { column_id: *column_id, data, - } + }) } }) - .collect(); - - // Safety: We ensure all columns have the same length and the new batch should be valid. - batch.with_fields(fields).unwrap() + .collect::>>() + .and_then(|fields| batch.with_fields(fields)) } } diff --git a/src/mito2/src/read/dedup.rs b/src/mito2/src/read/dedup.rs index c3db629f84..5c881459b2 100644 --- a/src/mito2/src/read/dedup.rs +++ b/src/mito2/src/read/dedup.rs @@ -14,6 +14,10 @@ //! Utilities to remove duplicate rows from a sorted batch. +use std::fmt; +use std::sync::Arc; +use std::time::{Duration, Instant}; + use api::v1::OpType; use async_trait::async_trait; use common_telemetry::debug; @@ -27,21 +31,34 @@ use crate::error::Result; use crate::metrics::MERGE_FILTER_ROWS_TOTAL; use crate::read::{Batch, BatchColumn, BatchReader}; +/// Trait for reporting dedup metrics. +pub trait DedupMetricsReport: Send + Sync { + /// Reports and resets the metrics. + fn report(&self, metrics: &mut DedupMetrics); +} + /// A reader that dedup sorted batches from a source based on the /// dedup strategy. pub struct DedupReader { source: R, strategy: S, metrics: DedupMetrics, + /// Optional metrics reporter. + metrics_reporter: Option>, } impl DedupReader { /// Creates a new dedup reader. 
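The `CompatFields::compat` change above drops the `unwrap()`-based casts in favor of propagating cast failures. A minimal, std-only sketch of that error-propagation pattern follows; `Column`, `CastError`, `cast_column`, and `compat` are illustrative stand-ins, not the mito2 types.

```rust
// Sketch: per-column conversions return Result and the whole column set is
// collected into a single Result, so the first failure aborts the rebuild.
#[derive(Debug)]
struct CastError(String);

#[derive(Debug)]
struct Column {
    name: String,
    values: Vec<i64>,
}

fn cast_column(col: &Column) -> Result<Column, CastError> {
    // Pretend casting fails for empty columns.
    if col.values.is_empty() {
        return Err(CastError(format!("cannot cast empty column {}", col.name)));
    }
    Ok(Column {
        name: col.name.clone(),
        values: col.values.iter().map(|v| v * 2).collect(),
    })
}

fn compat(columns: &[Column]) -> Result<Vec<Column>, CastError> {
    // `collect` on an iterator of Results stops at the first error,
    // mirroring how the fallible compat() now propagates cast failures.
    columns.iter().map(cast_column).collect()
}

fn main() {
    let cols = vec![Column { name: "v0".into(), values: vec![1, 2, 3] }];
    println!("{:?}", compat(&cols));
}
```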
- pub fn new(source: R, strategy: S) -> Self { + pub fn new( + source: R, + strategy: S, + metrics_reporter: Option>, + ) -> Self { Self { source, strategy, metrics: DedupMetrics::default(), + metrics_reporter, } } } @@ -51,11 +68,14 @@ impl DedupReader { async fn fetch_next_batch(&mut self) -> Result> { while let Some(batch) = self.source.next_batch().await? { if let Some(batch) = self.strategy.push_batch(batch, &mut self.metrics)? { + self.metrics.maybe_report(&self.metrics_reporter); return Ok(Some(batch)); } } - self.strategy.finish(&mut self.metrics) + let result = self.strategy.finish(&mut self.metrics)?; + self.metrics.maybe_report(&self.metrics_reporter); + Ok(result) } } @@ -76,6 +96,11 @@ impl Drop for DedupReader { MERGE_FILTER_ROWS_TOTAL .with_label_values(&["delete"]) .inc_by(self.metrics.num_unselected_rows as u64); + + // Report any remaining metrics. + if let Some(reporter) = &self.metrics_reporter { + reporter.report(&mut self.metrics); + } } } @@ -138,6 +163,8 @@ impl DedupStrategy for LastRow { mut batch: Batch, metrics: &mut DedupMetrics, ) -> Result> { + let start = Instant::now(); + if batch.is_empty() { return Ok(None); } @@ -160,6 +187,7 @@ impl DedupStrategy for LastRow { if batch.num_rows() == 1 { // We don't need to update `prev_batch` because they have the same // key and timestamp. + metrics.dedup_cost += start.elapsed(); return Ok(None); } // Skips the first row. @@ -189,6 +217,8 @@ impl DedupStrategy for LastRow { filter_deleted_from_batch(&mut batch, metrics)?; } + metrics.dedup_cost += start.elapsed(); + // The batch can become empty if all rows are deleted. if batch.is_empty() { Ok(None) @@ -215,12 +245,58 @@ fn filter_deleted_from_batch(batch: &mut Batch, metrics: &mut DedupMetrics) -> R } /// Metrics for deduplication. -#[derive(Debug, Default)] +#[derive(Default)] pub struct DedupMetrics { /// Number of rows removed during deduplication. pub(crate) num_unselected_rows: usize, /// Number of deleted rows. pub(crate) num_deleted_rows: usize, + /// Time spent on deduplication. + pub(crate) dedup_cost: Duration, +} + +impl fmt::Debug for DedupMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Skip output if dedup_cost is zero + if self.dedup_cost.is_zero() { + return write!(f, "{{}}"); + } + + write!(f, r#"{{"dedup_cost":"{:?}""#, self.dedup_cost)?; + + if self.num_unselected_rows > 0 { + write!(f, r#", "num_unselected_rows":{}"#, self.num_unselected_rows)?; + } + if self.num_deleted_rows > 0 { + write!(f, r#", "num_deleted_rows":{}"#, self.num_deleted_rows)?; + } + + write!(f, "}}") + } +} + +impl DedupMetrics { + /// Merges metrics from another DedupMetrics instance. + pub(crate) fn merge(&mut self, other: &DedupMetrics) { + let DedupMetrics { + num_unselected_rows, + num_deleted_rows, + dedup_cost, + } = other; + + self.num_unselected_rows += *num_unselected_rows; + self.num_deleted_rows += *num_deleted_rows; + self.dedup_cost += *dedup_cost; + } + + /// Reports the metrics if dedup_cost exceeds 10ms and resets them. + pub(crate) fn maybe_report(&mut self, reporter: &Option>) { + if self.dedup_cost.as_millis() > 10 + && let Some(r) = reporter + { + r.report(self); + } + } } /// Buffer to store fields in the last row to merge. 
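The `DedupMetrics` additions above combine three pieces: merging partial metrics, a reporter trait, and a `maybe_report` that only fires once the accumulated cost crosses 10ms. A self-contained sketch of that pattern is below; the 10ms threshold is taken from the diff, while `Metrics`, `MetricsReport`, and `Aggregator` are simplified stand-ins rather than the actual mito2 types.

```rust
// Sketch: local metrics accumulate cost; maybe_report() hands them to a shared
// reporter once the cost exceeds the threshold, and the reporter resets them.
use std::sync::{Arc, Mutex};
use std::time::Duration;

trait MetricsReport: Send + Sync {
    /// Reports and resets the metrics.
    fn report(&self, metrics: &mut Metrics);
}

#[derive(Default, Debug)]
struct Metrics {
    num_unselected_rows: usize,
    dedup_cost: Duration,
}

impl Metrics {
    fn merge(&mut self, other: &Metrics) {
        self.num_unselected_rows += other.num_unselected_rows;
        self.dedup_cost += other.dedup_cost;
    }

    fn maybe_report(&mut self, reporter: &Option<Arc<dyn MetricsReport>>) {
        if self.dedup_cost.as_millis() > 10 {
            if let Some(r) = reporter {
                r.report(self);
            }
        }
    }
}

/// Aggregates per-reader metrics into a shared total and resets the local copy.
struct Aggregator(Mutex<Metrics>);

impl MetricsReport for Aggregator {
    fn report(&self, metrics: &mut Metrics) {
        self.0.lock().unwrap().merge(metrics);
        *metrics = Metrics::default();
    }
}

fn main() {
    let aggregator = Arc::new(Aggregator(Mutex::new(Metrics::default())));
    let reporter: Option<Arc<dyn MetricsReport>> = Some(aggregator.clone());
    let mut local = Metrics {
        num_unselected_rows: 42,
        dedup_cost: Duration::from_millis(15),
    };
    local.maybe_report(&reporter);
    assert_eq!(local.num_unselected_rows, 0); // reset after reporting
    assert_eq!(aggregator.0.lock().unwrap().num_unselected_rows, 42);
}
```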
@@ -427,6 +503,8 @@ impl LastNonNull { impl DedupStrategy for LastNonNull { fn push_batch(&mut self, batch: Batch, metrics: &mut DedupMetrics) -> Result> { + let start = Instant::now(); + if batch.is_empty() { return Ok(None); } @@ -444,6 +522,7 @@ impl DedupStrategy for LastNonNull { // Next key is different. let buffer = std::mem::replace(buffer, batch); let merged = self.last_fields.merge_last_non_null(buffer, metrics)?; + metrics.dedup_cost += start.elapsed(); return Ok(merged); } @@ -451,6 +530,7 @@ impl DedupStrategy for LastNonNull { // The next batch has a different timestamp. let buffer = std::mem::replace(buffer, batch); let merged = self.last_fields.merge_last_non_null(buffer, metrics)?; + metrics.dedup_cost += start.elapsed(); return Ok(merged); } @@ -460,6 +540,7 @@ impl DedupStrategy for LastNonNull { // We assumes each batch doesn't contain duplicate rows so we only need to check the first row. if batch.num_rows() == 1 { self.last_fields.push_first_row(&batch); + metrics.dedup_cost += start.elapsed(); return Ok(None); } @@ -472,10 +553,14 @@ impl DedupStrategy for LastNonNull { let buffer = std::mem::replace(buffer, batch); let merged = self.last_fields.merge_last_non_null(buffer, metrics)?; + metrics.dedup_cost += start.elapsed(); + Ok(merged) } fn finish(&mut self, metrics: &mut DedupMetrics) -> Result> { + let start = Instant::now(); + let Some(buffer) = self.buffer.take() else { return Ok(None); }; @@ -485,6 +570,8 @@ impl DedupStrategy for LastNonNull { let merged = self.last_fields.merge_last_non_null(buffer, metrics)?; + metrics.dedup_cost += start.elapsed(); + Ok(merged) } } @@ -614,14 +701,14 @@ mod tests { // Test last row. let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastRow::new(true)); + let mut reader = DedupReader::new(reader, LastRow::new(true), None); check_reader_result(&mut reader, &input).await; assert_eq!(0, reader.metrics().num_unselected_rows); assert_eq!(0, reader.metrics().num_deleted_rows); // Test last non-null. let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true)); + let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); check_reader_result(&mut reader, &input).await; assert_eq!(0, reader.metrics().num_unselected_rows); assert_eq!(0, reader.metrics().num_deleted_rows); @@ -662,7 +749,7 @@ mod tests { ]; // Filter deleted. let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastRow::new(true)); + let mut reader = DedupReader::new(reader, LastRow::new(true), None); check_reader_result( &mut reader, &[ @@ -684,7 +771,7 @@ mod tests { // Does not filter deleted. let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastRow::new(false)); + let mut reader = DedupReader::new(reader, LastRow::new(false), None); check_reader_result( &mut reader, &[ @@ -801,7 +888,7 @@ mod tests { // Filter deleted. let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true)); + let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); check_reader_result( &mut reader, &[ @@ -835,7 +922,7 @@ mod tests { // Does not filter deleted. 
let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(false)); + let mut reader = DedupReader::new(reader, LastNonNull::new(false), None); check_reader_result( &mut reader, &[ @@ -885,7 +972,7 @@ mod tests { )]; let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true)); + let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); check_reader_result( &mut reader, &[new_batch_multi_fields( @@ -901,7 +988,7 @@ mod tests { assert_eq!(1, reader.metrics().num_deleted_rows); let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(false)); + let mut reader = DedupReader::new(reader, LastNonNull::new(false), None); check_reader_result(&mut reader, &input).await; assert_eq!(0, reader.metrics().num_unselected_rows); assert_eq!(0, reader.metrics().num_deleted_rows); @@ -928,7 +1015,7 @@ mod tests { ]; let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true)); + let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); check_reader_result( &mut reader, &[ @@ -962,7 +1049,7 @@ mod tests { ]; let reader = VecBatchReader::new(&input); - let mut reader = DedupReader::new(reader, LastNonNull::new(true)); + let mut reader = DedupReader::new(reader, LastNonNull::new(true), None); check_reader_result( &mut reader, &[ diff --git a/src/mito2/src/read/flat_dedup.rs b/src/mito2/src/read/flat_dedup.rs index 62484f9c12..3f8a7ae507 100644 --- a/src/mito2/src/read/flat_dedup.rs +++ b/src/mito2/src/read/flat_dedup.rs @@ -15,9 +15,12 @@ //! Dedup implementation for flat format. use std::ops::Range; +use std::sync::Arc; +use std::time::Instant; use api::v1::OpType; use async_stream::try_stream; +use common_telemetry::debug; use datatypes::arrow::array::{ Array, ArrayRef, BinaryArray, BooleanArray, BooleanBufferBuilder, UInt8Array, UInt64Array, make_comparator, @@ -36,7 +39,8 @@ use snafu::ResultExt; use crate::error::{ComputeArrowSnafu, NewRecordBatchSnafu, Result}; use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice; -use crate::read::dedup::DedupMetrics; +use crate::metrics::MERGE_FILTER_ROWS_TOTAL; +use crate::read::dedup::{DedupMetrics, DedupMetricsReport}; use crate::sst::parquet::flat_format::{ op_type_column_index, primary_key_column_index, time_index_column_index, }; @@ -88,15 +92,22 @@ pub struct FlatDedupReader { stream: I, strategy: S, metrics: DedupMetrics, + /// Optional metrics reporter. + metrics_reporter: Option>, } impl FlatDedupReader { - /// Creates a new dedup iterator. - pub fn new(stream: I, strategy: S) -> Self { + /// Creates a new dedup reader. + pub fn new( + stream: I, + strategy: S, + metrics_reporter: Option>, + ) -> Self { Self { stream, strategy, metrics: DedupMetrics::default(), + metrics_reporter, } } } @@ -108,11 +119,14 @@ impl> + Unpin, S: RecordBatchDedupStrategy> async fn fetch_next_batch(&mut self) -> Result> { while let Some(batch) = self.stream.try_next().await? { if let Some(batch) = self.strategy.push_batch(batch, &mut self.metrics)? { + self.metrics.maybe_report(&self.metrics_reporter); return Ok(Some(batch)); } } - self.strategy.finish(&mut self.metrics) + let result = self.strategy.finish(&mut self.metrics)?; + self.metrics.maybe_report(&self.metrics_reporter); + Ok(result) } /// Converts the reader into a stream. 
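Both dedup readers now flush leftover metrics in `Drop`, so partially consumed scans are still accounted for. The sketch below shows the "flush on drop" idea with simplified stand-in types (`Reader`, `Report`, `SharedTotal`); it is not the mito2 reader API.

```rust
// Sketch: whatever the reader still holds locally when it is dropped is pushed
// to the shared reporter, even if the caller never drained the stream.
use std::sync::{Arc, Mutex};

trait Report: Send + Sync {
    fn report(&self, rows_filtered: &mut usize);
}

/// Accumulates totals from many readers; resets each local counter after reading it.
struct SharedTotal(Mutex<usize>);

impl Report for SharedTotal {
    fn report(&self, rows_filtered: &mut usize) {
        *self.0.lock().unwrap() += *rows_filtered;
        *rows_filtered = 0;
    }
}

struct Reader {
    rows_filtered: usize,
    reporter: Option<Arc<dyn Report>>,
}

impl Drop for Reader {
    fn drop(&mut self) {
        // Flush whatever is left, even if the scan stopped early.
        if let Some(reporter) = &self.reporter {
            reporter.report(&mut self.rows_filtered);
        }
    }
}

fn main() {
    let total = Arc::new(SharedTotal(Mutex::new(0)));
    {
        let reporter: Option<Arc<dyn Report>> = Some(total.clone());
        let _reader = Reader { rows_filtered: 7, reporter };
        // `_reader` is dropped at the end of this scope without an explicit finish call.
    }
    assert_eq!(*total.0.lock().unwrap(), 7);
}
```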
@@ -125,6 +139,24 @@ impl> + Unpin, S: RecordBatchDedupStrategy> } } +impl Drop for FlatDedupReader { + fn drop(&mut self) { + debug!("Flat dedup reader finished, metrics: {:?}", self.metrics); + + MERGE_FILTER_ROWS_TOTAL + .with_label_values(&["dedup"]) + .inc_by(self.metrics.num_unselected_rows as u64); + MERGE_FILTER_ROWS_TOTAL + .with_label_values(&["delete"]) + .inc_by(self.metrics.num_deleted_rows as u64); + + // Report any remaining metrics. + if let Some(reporter) = &self.metrics_reporter { + reporter.report(&mut self.metrics); + } + } +} + /// Strategy to remove duplicate rows from sorted record batches. pub trait RecordBatchDedupStrategy: Send { /// Pushes a batch to the dedup strategy. @@ -214,6 +246,8 @@ impl RecordBatchDedupStrategy for FlatLastRow { batch: RecordBatch, metrics: &mut DedupMetrics, ) -> Result> { + let start = Instant::now(); + if batch.num_rows() == 0 { return Ok(None); } @@ -235,6 +269,7 @@ impl RecordBatchDedupStrategy for FlatLastRow { // The batch after dedup is empty. // We don't need to update `prev_batch` because they have the same // key and timestamp. + metrics.dedup_cost += start.elapsed(); return Ok(None); }; @@ -246,7 +281,11 @@ impl RecordBatchDedupStrategy for FlatLastRow { self.prev_batch = Some(batch_last_row); // Filters deleted rows at last. - maybe_filter_deleted(batch, self.filter_deleted, metrics) + let result = maybe_filter_deleted(batch, self.filter_deleted, metrics); + + metrics.dedup_cost += start.elapsed(); + + result } fn finish(&mut self, _metrics: &mut DedupMetrics) -> Result> { @@ -275,6 +314,8 @@ impl RecordBatchDedupStrategy for FlatLastNonNull { batch: RecordBatch, metrics: &mut DedupMetrics, ) -> Result> { + let start = Instant::now(); + if batch.num_rows() == 0 { return Ok(None); } @@ -290,6 +331,7 @@ impl RecordBatchDedupStrategy for FlatLastNonNull { self.buffer = BatchLastRow::try_new(record_batch); self.contains_delete = contains_delete; + metrics.dedup_cost += start.elapsed(); return Ok(None); }; @@ -305,7 +347,9 @@ impl RecordBatchDedupStrategy for FlatLastNonNull { self.buffer = BatchLastRow::try_new(record_batch); self.contains_delete = contains_delete; - return maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics); + let result = maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics); + metrics.dedup_cost += start.elapsed(); + return result; } // The next batch has duplicated rows. 
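These hunks thread the same timing pattern through the dedup strategies: take an `Instant` at the top of `push_batch` and fold the elapsed time into the accumulated dedup cost before returning. The toy strategy below dedups a sorted slice of timestamps just to show the bookkeeping; the real strategies operate on `Batch`/`RecordBatch` data.

```rust
// Sketch: measure the whole push_batch call and account for it on every
// return path by computing the result first and adding elapsed time once.
use std::time::{Duration, Instant};

#[derive(Default, Debug)]
struct DedupMetrics {
    dedup_cost: Duration,
}

fn push_batch(batch: &[i64], metrics: &mut DedupMetrics) -> Option<Vec<i64>> {
    let start = Instant::now();
    let result = if batch.is_empty() {
        None
    } else {
        // Keep only the first occurrence of each timestamp (input is sorted).
        let mut out: Vec<i64> = Vec::with_capacity(batch.len());
        for &ts in batch {
            if out.last() != Some(&ts) {
                out.push(ts);
            }
        }
        Some(out)
    };
    metrics.dedup_cost += start.elapsed();
    result
}

fn main() {
    let mut metrics = DedupMetrics::default();
    let deduped = push_batch(&[1, 1, 2, 3, 3], &mut metrics).unwrap();
    assert_eq!(deduped, vec![1, 2, 3]);
    println!("{:?}", metrics);
}
```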
@@ -332,6 +376,8 @@ impl RecordBatchDedupStrategy for FlatLastNonNull { self.buffer = BatchLastRow::try_new(record_batch); self.contains_delete = contains_delete; + metrics.dedup_cost += start.elapsed(); + Ok(output) } @@ -340,7 +386,13 @@ impl RecordBatchDedupStrategy for FlatLastNonNull { return Ok(None); }; - maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics) + let start = Instant::now(); + + let result = maybe_filter_deleted(buffer.last_batch, self.filter_deleted, metrics); + + metrics.dedup_cost += start.elapsed(); + + result } } diff --git a/src/mito2/src/read/flat_merge.rs b/src/mito2/src/read/flat_merge.rs index 890334f91c..90df227ae9 100644 --- a/src/mito2/src/read/flat_merge.rs +++ b/src/mito2/src/read/flat_merge.rs @@ -15,8 +15,10 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; use std::sync::Arc; +use std::time::Instant; use async_stream::try_stream; +use common_telemetry::debug; use datatypes::arrow::array::{Int64Array, UInt64Array}; use datatypes::arrow::compute::interleave; use datatypes::arrow::datatypes::SchemaRef; @@ -29,7 +31,9 @@ use store_api::storage::SequenceNumber; use crate::error::{ComputeArrowSnafu, Result}; use crate::memtable::BoxedRecordBatchIterator; +use crate::metrics::READ_STAGE_ELAPSED; use crate::read::BoxedRecordBatchStream; +use crate::read::merge::{MergeMetrics, MergeMetricsReport}; use crate::sst::parquet::flat_format::{ primary_key_column_index, sequence_column_index, time_index_column_index, }; @@ -462,12 +466,14 @@ impl FlatMergeIterator { let algo = MergeAlgo::new(nodes); - Ok(Self { + let iter = Self { algo, in_progress, output_batch: None, batch_size, - }) + }; + + Ok(iter) } /// Fetches next sorted batch. @@ -484,12 +490,7 @@ impl FlatMergeIterator { } } - if let Some(batch) = self.output_batch.take() { - Ok(Some(batch)) - } else { - // No more batches. - Ok(None) - } + Ok(self.output_batch.take()) } /// Fetches a batch from the hottest node. @@ -562,6 +563,10 @@ pub struct FlatMergeReader { /// This is not a hard limit, the iterator may return smaller batches to avoid concatenating /// rows. batch_size: usize, + /// Local metrics. + metrics: MergeMetrics, + /// Optional metrics reporter. + metrics_reporter: Option>, } impl FlatMergeReader { @@ -570,7 +575,10 @@ impl FlatMergeReader { schema: SchemaRef, iters: Vec, batch_size: usize, + metrics_reporter: Option>, ) -> Result { + let start = Instant::now(); + let metrics = MergeMetrics::default(); let mut in_progress = BatchBuilder::new(schema, iters.len(), batch_size); let mut nodes = Vec::with_capacity(iters.len()); // Initialize nodes and the buffer. @@ -588,16 +596,24 @@ impl FlatMergeReader { let algo = MergeAlgo::new(nodes); - Ok(Self { + let mut reader = Self { algo, in_progress, output_batch: None, batch_size, - }) + metrics, + metrics_reporter, + }; + let elapsed = start.elapsed(); + reader.metrics.init_cost += elapsed; + reader.metrics.scan_cost += elapsed; + + Ok(reader) } /// Fetches next sorted batch. pub async fn next_batch(&mut self) -> Result> { + let start = Instant::now(); while self.algo.has_rows() && self.output_batch.is_none() { if self.algo.can_fetch_batch() && !self.in_progress.is_empty() { // Only one batch in the hot heap, but we have pending rows, output the pending rows first. 
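The `FlatMergeReader` changes that follow split the accounting into an overall `scan_cost`, a `fetch_cost` covering only the time spent pulling data from sources, and counters for whether the merge advanced by whole batches or row by row. The two-way merge below is a toy stand-in for the heap-based merge, meant only to show how the three measurements relate.

```rust
// Sketch: scan_cost wraps the whole call, fetch_cost only the source access,
// and two counters distinguish row-by-row steps from whole-batch steps.
use std::time::{Duration, Instant};

#[derive(Default, Debug)]
struct MergeMetrics {
    scan_cost: Duration,
    fetch_cost: Duration,
    num_fetch_by_batches: usize,
    num_fetch_by_rows: usize,
}

fn merge_sorted(a: &[i64], b: &[i64], metrics: &mut MergeMetrics) -> Vec<i64> {
    let start = Instant::now();
    let mut out = Vec::with_capacity(a.len() + b.len());
    let (mut i, mut j) = (0, 0);
    while i < a.len() && j < b.len() {
        // Advance one row at a time while both sources still have data.
        let fetch_start = Instant::now();
        let next = if a[i] <= b[j] {
            let v = a[i];
            i += 1;
            v
        } else {
            let v = b[j];
            j += 1;
            v
        };
        metrics.fetch_cost += fetch_start.elapsed();
        metrics.num_fetch_by_rows += 1;
        out.push(next);
    }
    // Only one source is left: take the remainder as a whole batch.
    let fetch_start = Instant::now();
    out.extend_from_slice(&a[i..]);
    out.extend_from_slice(&b[j..]);
    metrics.fetch_cost += fetch_start.elapsed();
    metrics.num_fetch_by_batches += 1;
    metrics.scan_cost += start.elapsed();
    out
}

fn main() {
    let mut metrics = MergeMetrics::default();
    let merged = merge_sorted(&[1, 3, 5], &[2, 4], &mut metrics);
    assert_eq!(merged, vec![1, 2, 3, 4, 5]);
    println!("{:?}", metrics);
}
```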
@@ -605,15 +621,21 @@ impl FlatMergeReader { debug_assert!(self.output_batch.is_some()); } else if self.algo.can_fetch_batch() { self.fetch_batch_from_hottest().await?; + self.metrics.num_fetch_by_batches += 1; } else { self.fetch_row_from_hottest().await?; + self.metrics.num_fetch_by_rows += 1; } } if let Some(batch) = self.output_batch.take() { + self.metrics.scan_cost += start.elapsed(); + self.metrics.maybe_report(&self.metrics_reporter); Ok(Some(batch)) } else { // No more batches. + self.metrics.scan_cost += start.elapsed(); + self.metrics.maybe_report(&self.metrics_reporter); Ok(None) } } @@ -634,7 +656,9 @@ impl FlatMergeReader { // Safety: next_batch() ensures the heap is not empty. let mut hottest = self.algo.pop_hot().unwrap(); debug_assert!(!hottest.current_cursor().is_finished()); + let start = Instant::now(); let next = hottest.advance_batch().await?; + self.metrics.fetch_cost += start.elapsed(); // The node is the heap is not empty, so it must have existing rows in the builder. let batch = self .in_progress @@ -658,8 +682,12 @@ impl FlatMergeReader { } } + let start = Instant::now(); if let Some(next) = hottest.advance_row().await? { + self.metrics.fetch_cost += start.elapsed(); self.in_progress.push_batch(hottest.node_index, next); + } else { + self.metrics.fetch_cost += start.elapsed(); } self.algo.reheap(hottest); @@ -675,6 +703,24 @@ impl FlatMergeReader { } } +impl Drop for FlatMergeReader { + fn drop(&mut self) { + debug!("Flat merge reader finished, metrics: {:?}", self.metrics); + + READ_STAGE_ELAPSED + .with_label_values(&["flat_merge"]) + .observe(self.metrics.scan_cost.as_secs_f64()); + READ_STAGE_ELAPSED + .with_label_values(&["flat_merge_fetch"]) + .observe(self.metrics.fetch_cost.as_secs_f64()); + + // Report any remaining metrics. + if let Some(reporter) = &self.metrics_reporter { + reporter.report(&mut self.metrics); + } + } +} + /// A sync node in the merge iterator. struct GenericNode { /// Index of the node. diff --git a/src/mito2/src/read/merge.rs b/src/mito2/src/read/merge.rs index f9afbe66fd..0470e4b01a 100644 --- a/src/mito2/src/read/merge.rs +++ b/src/mito2/src/read/merge.rs @@ -16,8 +16,9 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; -use std::mem; +use std::sync::Arc; use std::time::{Duration, Instant}; +use std::{fmt, mem}; use async_trait::async_trait; use common_telemetry::debug; @@ -27,6 +28,12 @@ use crate::memtable::BoxedBatchIterator; use crate::metrics::READ_STAGE_ELAPSED; use crate::read::{Batch, BatchReader, BoxedBatchReader, Source}; +/// Trait for reporting merge metrics. +pub trait MergeMetricsReport: Send + Sync { + /// Reports and resets the metrics. + fn report(&self, metrics: &mut MergeMetrics); +} + /// Reader to merge sorted batches. /// /// The merge reader merges [Batch]es from multiple sources that yield sorted batches. @@ -51,7 +58,9 @@ pub struct MergeReader { /// Batch to output. output_batch: Option, /// Local metrics. - metrics: Metrics, + metrics: MergeMetrics, + /// Optional metrics reporter. + metrics_reporter: Option>, } #[async_trait] @@ -72,11 +81,12 @@ impl BatchReader for MergeReader { if let Some(batch) = self.output_batch.take() { self.metrics.scan_cost += start.elapsed(); - self.metrics.num_output_rows += batch.num_rows(); + self.metrics.maybe_report(&self.metrics_reporter); Ok(Some(batch)) } else { // Nothing fetched. 
self.metrics.scan_cost += start.elapsed(); + self.metrics.maybe_report(&self.metrics_reporter); Ok(None) } } @@ -92,14 +102,22 @@ impl Drop for MergeReader { READ_STAGE_ELAPSED .with_label_values(&["merge_fetch"]) .observe(self.metrics.fetch_cost.as_secs_f64()); + + // Report any remaining metrics. + if let Some(reporter) = &self.metrics_reporter { + reporter.report(&mut self.metrics); + } } } impl MergeReader { /// Creates and initializes a new [MergeReader]. - pub async fn new(sources: Vec) -> Result { + pub async fn new( + sources: Vec, + metrics_reporter: Option>, + ) -> Result { let start = Instant::now(); - let mut metrics = Metrics::default(); + let mut metrics = MergeMetrics::default(); let mut cold = BinaryHeap::with_capacity(sources.len()); let hot = BinaryHeap::with_capacity(sources.len()); @@ -116,11 +134,14 @@ impl MergeReader { cold, output_batch: None, metrics, + metrics_reporter, }; // Initializes the reader. reader.refill_hot(); - reader.metrics.scan_cost += start.elapsed(); + let elapsed = start.elapsed(); + reader.metrics.init_cost += elapsed; + reader.metrics.scan_cost += elapsed; Ok(reader) } @@ -250,6 +271,8 @@ pub struct MergeReaderBuilder { /// /// All source must yield batches with the same schema. sources: Vec, + /// Optional metrics reporter. + metrics_reporter: Option>, } impl MergeReaderBuilder { @@ -260,7 +283,10 @@ impl MergeReaderBuilder { /// Creates a builder from sources. pub fn from_sources(sources: Vec) -> MergeReaderBuilder { - MergeReaderBuilder { sources } + MergeReaderBuilder { + sources, + metrics_reporter: None, + } } /// Pushes a batch reader to sources. @@ -275,28 +301,94 @@ impl MergeReaderBuilder { self } + /// Sets the metrics reporter. + pub fn with_metrics_reporter( + &mut self, + reporter: Option>, + ) -> &mut Self { + self.metrics_reporter = reporter; + self + } + /// Builds and initializes the reader, then resets the builder. pub async fn build(&mut self) -> Result { let sources = mem::take(&mut self.sources); - MergeReader::new(sources).await + let metrics_reporter = self.metrics_reporter.take(); + MergeReader::new(sources, metrics_reporter).await } } /// Metrics for the merge reader. -#[derive(Debug, Default)] -struct Metrics { +#[derive(Default)] +pub struct MergeMetrics { + /// Cost to initialize the reader. + pub(crate) init_cost: Duration, /// Total scan cost of the reader. - scan_cost: Duration, + pub(crate) scan_cost: Duration, /// Number of times to fetch batches. - num_fetch_by_batches: usize, + pub(crate) num_fetch_by_batches: usize, /// Number of times to fetch rows. - num_fetch_by_rows: usize, - /// Number of input rows. - num_input_rows: usize, - /// Number of output rows. - num_output_rows: usize, + pub(crate) num_fetch_by_rows: usize, /// Cost to fetch batches from sources. 
- fetch_cost: Duration, + pub(crate) fetch_cost: Duration, +} + +impl fmt::Debug for MergeMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Skip output if scan_cost is zero + if self.scan_cost.is_zero() { + return write!(f, "{{}}"); + } + + write!(f, r#"{{"scan_cost":"{:?}""#, self.scan_cost)?; + + if !self.init_cost.is_zero() { + write!(f, r#", "init_cost":"{:?}""#, self.init_cost)?; + } + if self.num_fetch_by_batches > 0 { + write!( + f, + r#", "num_fetch_by_batches":{}"#, + self.num_fetch_by_batches + )?; + } + if self.num_fetch_by_rows > 0 { + write!(f, r#", "num_fetch_by_rows":{}"#, self.num_fetch_by_rows)?; + } + if !self.fetch_cost.is_zero() { + write!(f, r#", "fetch_cost":"{:?}""#, self.fetch_cost)?; + } + + write!(f, "}}") + } +} + +impl MergeMetrics { + /// Merges metrics from another MergeMetrics instance. + pub(crate) fn merge(&mut self, other: &MergeMetrics) { + let MergeMetrics { + init_cost, + scan_cost, + num_fetch_by_batches, + num_fetch_by_rows, + fetch_cost, + } = other; + + self.init_cost += *init_cost; + self.scan_cost += *scan_cost; + self.num_fetch_by_batches += *num_fetch_by_batches; + self.num_fetch_by_rows += *num_fetch_by_rows; + self.fetch_cost += *fetch_cost; + } + + /// Reports the metrics if scan_cost exceeds 10ms and resets them. + pub(crate) fn maybe_report(&mut self, reporter: &Option>) { + if self.scan_cost.as_millis() > 10 + && let Some(r) = reporter + { + r.report(self); + } + } } /// A `Node` represent an individual input data source to be merged. @@ -313,12 +405,11 @@ impl Node { /// Initialize a node. /// /// It tries to fetch one batch from the `source`. - async fn new(mut source: Source, metrics: &mut Metrics) -> Result { + async fn new(mut source: Source, metrics: &mut MergeMetrics) -> Result { // Ensures batch is not empty. let start = Instant::now(); let current_batch = source.next_batch().await?.map(CompareFirst); metrics.fetch_cost += start.elapsed(); - metrics.num_input_rows += current_batch.as_ref().map(|b| b.0.num_rows()).unwrap_or(0); Ok(Node { source, @@ -352,17 +443,12 @@ impl Node { /// /// # Panics /// Panics if the node has reached EOF. - async fn fetch_batch(&mut self, metrics: &mut Metrics) -> Result { + async fn fetch_batch(&mut self, metrics: &mut MergeMetrics) -> Result { let current = self.current_batch.take().unwrap(); let start = Instant::now(); // Ensures batch is not empty. self.current_batch = self.source.next_batch().await?.map(CompareFirst); metrics.fetch_cost += start.elapsed(); - metrics.num_input_rows += self - .current_batch - .as_ref() - .map(|b| b.0.num_rows()) - .unwrap_or(0); Ok(current.0) } @@ -390,7 +476,7 @@ impl Node { /// /// # Panics /// Panics if the node is EOF. 
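The hand-written `Debug` impls for `MergeMetrics` and `DedupMetrics` above render a compact JSON-like object and skip fields that are still zero, keeping verbose explain output short. A stripped-down, self-contained version of that style:

```rust
// Sketch: emit "{}" when nothing was measured, otherwise only the non-zero fields.
use std::fmt;
use std::time::Duration;

#[derive(Default)]
struct Metrics {
    scan_cost: Duration,
    num_fetch_by_rows: usize,
}

impl fmt::Debug for Metrics {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Skip output entirely if nothing was measured.
        if self.scan_cost.is_zero() {
            return write!(f, "{{}}");
        }
        write!(f, r#"{{"scan_cost":"{:?}""#, self.scan_cost)?;
        if self.num_fetch_by_rows > 0 {
            write!(f, r#", "num_fetch_by_rows":{}"#, self.num_fetch_by_rows)?;
        }
        write!(f, "}}")
    }
}

fn main() {
    println!("{:?}", Metrics::default()); // {}
    let m = Metrics {
        scan_cost: Duration::from_millis(12),
        num_fetch_by_rows: 3,
    };
    println!("{:?}", m); // {"scan_cost":"12ms", "num_fetch_by_rows":3}
}
```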
- async fn skip_rows(&mut self, num_to_skip: usize, metrics: &mut Metrics) -> Result<()> { + async fn skip_rows(&mut self, num_to_skip: usize, metrics: &mut MergeMetrics) -> Result<()> { let batch = self.current_batch(); debug_assert!(batch.num_rows() >= num_to_skip); @@ -547,9 +633,6 @@ mod tests { ], ) .await; - - assert_eq!(8, reader.metrics.num_input_rows); - assert_eq!(8, reader.metrics.num_output_rows); } #[tokio::test] @@ -666,9 +749,6 @@ mod tests { ], ) .await; - - assert_eq!(11, reader.metrics.num_input_rows); - assert_eq!(11, reader.metrics.num_output_rows); } #[tokio::test] diff --git a/src/mito2/src/read/projection.rs b/src/mito2/src/read/projection.rs index d7171e7f60..e4a3af5831 100644 --- a/src/mito2/src/read/projection.rs +++ b/src/mito2/src/read/projection.rs @@ -84,6 +84,14 @@ impl ProjectionMapper { } } + /// Returns true if the projection includes any tag columns. + pub(crate) fn has_tags(&self) -> bool { + match self { + ProjectionMapper::PrimaryKey(m) => m.has_tags(), + ProjectionMapper::Flat(_) => false, + } + } + /// Returns ids of projected columns that we need to read /// from memtables and SSTs. pub(crate) fn column_ids(&self) -> &[ColumnId] { @@ -257,6 +265,11 @@ impl PrimaryKeyProjectionMapper { &self.metadata } + /// Returns true if the projection includes any tag columns. + pub(crate) fn has_tags(&self) -> bool { + self.has_tags + } + /// Returns ids of projected columns that we need to read /// from memtables and SSTs. pub(crate) fn column_ids(&self) -> &[ColumnId] { diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index babdd43b0b..8191dbcb7a 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -135,6 +135,14 @@ impl Scanner { } } + pub(crate) fn index_ids(&self) -> Vec { + match self { + Scanner::Seq(seq_scan) => seq_scan.input().index_ids(), + Scanner::Unordered(unordered_scan) => unordered_scan.input().index_ids(), + Scanner::Series(series_scan) => series_scan.input().index_ids(), + } + } + /// Sets the target partitions for the scanner. It can controls the parallelism of the scanner. pub(crate) fn set_target_partitions(&mut self, target_partitions: usize) { use store_api::region_engine::{PrepareRequest, RegionScanner}; @@ -958,6 +966,7 @@ impl ScanInput { ) -> Result { let predicate = self.predicate_for_file(file); let filter_mode = pre_filter_mode(self.append_mode, self.merge_mode); + let decode_pk_values = !self.compaction && self.mapper.has_tags(); let res = self .access_layer .read_sst(file.clone()) @@ -971,6 +980,7 @@ impl ScanInput { .flat_format(self.flat_format) .compaction(self.compaction) .pre_filter_mode(filter_mode) + .decode_primary_key_values(decode_pk_values) .build_reader_input(reader_metrics) .await; let (mut file_range_ctx, selection) = match res { @@ -1127,6 +1137,12 @@ impl ScanInput { self.files.len() } + /// Gets the file handle from a row group index. 
+ pub(crate) fn file_from_index(&self, index: RowGroupIndex) -> &FileHandle { + let file_index = index.index - self.num_memtables(); + &self.files[file_index] + } + pub fn region_metadata(&self) -> &RegionMetadataRef { self.mapper.metadata() } @@ -1160,6 +1176,10 @@ impl ScanInput { pub(crate) fn file_ids(&self) -> Vec { self.files.iter().map(|file| file.file_id()).collect() } + + pub(crate) fn index_ids(&self) -> Vec { + self.files.iter().map(|file| file.index_id()).collect() + } } fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode { diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs index 674a4fab4b..73df1e8fd8 100644 --- a/src/mito2/src/read/scan_util.rs +++ b/src/mito2/src/read/scan_util.rs @@ -14,13 +14,17 @@ //! Utilities for scanners. +use std::collections::{BinaryHeap, HashMap, VecDeque}; use std::fmt; +use std::pin::Pin; use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use async_stream::try_stream; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time}; use datatypes::arrow::record_batch::RecordBatch; +use datatypes::timestamp::timestamp_array_to_primitive; use futures::Stream; use prometheus::IntGauge; use smallvec::SmallVec; @@ -33,12 +37,71 @@ use crate::metrics::{ IN_PROGRESS_SCAN, PRECISE_FILTER_ROWS_TOTAL, READ_BATCHES_RETURN, READ_ROW_GROUPS_TOTAL, READ_ROWS_IN_ROW_GROUP_TOTAL, READ_ROWS_RETURN, READ_STAGE_ELAPSED, }; -use crate::read::range::{RangeBuilderList, RowGroupIndex}; +use crate::read::dedup::{DedupMetrics, DedupMetricsReport}; +use crate::read::merge::{MergeMetrics, MergeMetricsReport}; +use crate::read::range::{RangeBuilderList, RangeMeta, RowGroupIndex}; use crate::read::scan_region::StreamContext; use crate::read::{Batch, BoxedBatchStream, BoxedRecordBatchStream, ScannerMetrics, Source}; -use crate::sst::file::FileTimeRange; +use crate::sst::file::{FileTimeRange, RegionFileId}; +use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplyMetrics; +use crate::sst::index::fulltext_index::applier::FulltextIndexApplyMetrics; +use crate::sst::index::inverted_index::applier::InvertedIndexApplyMetrics; +use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE; use crate::sst::parquet::file_range::FileRange; -use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics}; +use crate::sst::parquet::flat_format::time_index_column_index; +use crate::sst::parquet::reader::{MetadataCacheMetrics, ReaderFilterMetrics, ReaderMetrics}; +use crate::sst::parquet::row_group::ParquetFetchMetrics; + +/// Per-file scan metrics. +#[derive(Default, Clone)] +pub struct FileScanMetrics { + /// Number of ranges (row groups) read from this file. + pub num_ranges: usize, + /// Number of rows read from this file. + pub num_rows: usize, + /// Time spent building file ranges/parts (file-level preparation). + pub build_part_cost: Duration, + /// Time spent building readers for this file (accumulated across all ranges). + pub build_reader_cost: Duration, + /// Time spent scanning this file (accumulated across all ranges). 
+ pub scan_cost: Duration, +} + +impl fmt::Debug for FileScanMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{{\"build_part_cost\":\"{:?}\"", self.build_part_cost)?; + + if self.num_ranges > 0 { + write!(f, ", \"num_ranges\":{}", self.num_ranges)?; + } + if self.num_rows > 0 { + write!(f, ", \"num_rows\":{}", self.num_rows)?; + } + if !self.build_reader_cost.is_zero() { + write!( + f, + ", \"build_reader_cost\":\"{:?}\"", + self.build_reader_cost + )?; + } + if !self.scan_cost.is_zero() { + write!(f, ", \"scan_cost\":\"{:?}\"", self.scan_cost)?; + } + + write!(f, "}}") + } +} + +impl FileScanMetrics { + /// Merges another FileMetrics into this one. + pub(crate) fn merge_from(&mut self, other: &FileScanMetrics) { + self.num_ranges += other.num_ranges; + self.num_rows += other.num_rows; + self.build_part_cost += other.build_part_cost; + self.build_reader_cost += other.build_reader_cost; + self.scan_cost += other.scan_cost; + } +} /// Verbose scan metrics for a partition. #[derive(Default)] @@ -75,6 +138,8 @@ pub(crate) struct ScanMetricsSet { // SST related metrics: /// Duration to build file ranges. build_parts_cost: Duration, + /// Duration to scan SST files. + sst_scan_cost: Duration, /// Number of row groups before filtering. rg_total: usize, /// Number of row groups filtered by fulltext index. @@ -118,8 +183,56 @@ pub(crate) struct ScanMetricsSet { /// Duration of the series distributor to yield. distributor_yield_cost: Duration, + /// Merge metrics. + merge_metrics: MergeMetrics, + /// Dedup metrics. + dedup_metrics: DedupMetrics, + /// The stream reached EOF stream_eof: bool, + + // Optional verbose metrics: + /// Inverted index apply metrics. + inverted_index_apply_metrics: Option, + /// Bloom filter index apply metrics. + bloom_filter_apply_metrics: Option, + /// Fulltext index apply metrics. + fulltext_index_apply_metrics: Option, + /// Parquet fetch metrics. + fetch_metrics: Option, + /// Metadata cache metrics. + metadata_cache_metrics: Option, + /// Per-file scan metrics, only populated when explain_verbose is true. + per_file_metrics: Option>, +} + +/// Wrapper for file metrics that compares by total cost in reverse order. +/// This allows using BinaryHeap as a min-heap for efficient top-K selection. 
+struct CompareCostReverse<'a> { + total_cost: Duration, + file_id: RegionFileId, + metrics: &'a FileScanMetrics, +} + +impl Ord for CompareCostReverse<'_> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Reverse comparison: smaller costs are "greater" + other.total_cost.cmp(&self.total_cost) + } +} + +impl PartialOrd for CompareCostReverse<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for CompareCostReverse<'_> {} + +impl PartialEq for CompareCostReverse<'_> { + fn eq(&self, other: &Self) -> bool { + self.total_cost == other.total_cost + } } impl fmt::Debug for ScanMetricsSet { @@ -135,6 +248,7 @@ impl fmt::Debug for ScanMetricsSet { num_mem_ranges, num_file_ranges, build_parts_cost, + sst_scan_cost, rg_total, rg_fulltext_filtered, rg_inverted_filtered, @@ -155,11 +269,19 @@ impl fmt::Debug for ScanMetricsSet { num_distributor_batches, distributor_scan_cost, distributor_yield_cost, + merge_metrics, + dedup_metrics, stream_eof, mem_scan_cost, mem_rows, mem_batches, mem_series, + inverted_index_apply_metrics, + bloom_filter_apply_metrics, + fulltext_index_apply_metrics, + fetch_metrics, + metadata_cache_metrics, + per_file_metrics, } = self; // Write core metrics @@ -175,6 +297,7 @@ impl fmt::Debug for ScanMetricsSet { \"num_mem_ranges\":{num_mem_ranges}, \ \"num_file_ranges\":{num_file_ranges}, \ \"build_parts_cost\":\"{build_parts_cost:?}\", \ + \"sst_scan_cost\":\"{sst_scan_cost:?}\", \ \"rg_total\":{rg_total}, \ \"rows_before_filter\":{rows_before_filter}, \ \"num_sst_record_batches\":{num_sst_record_batches}, \ @@ -249,6 +372,89 @@ impl fmt::Debug for ScanMetricsSet { write!(f, ", \"mem_scan_cost\":\"{mem_scan_cost:?}\"")?; } + // Write optional verbose metrics if they are not empty + if let Some(metrics) = inverted_index_apply_metrics + && !metrics.is_empty() + { + write!(f, ", \"inverted_index_apply_metrics\":{:?}", metrics)?; + } + if let Some(metrics) = bloom_filter_apply_metrics + && !metrics.is_empty() + { + write!(f, ", \"bloom_filter_apply_metrics\":{:?}", metrics)?; + } + if let Some(metrics) = fulltext_index_apply_metrics + && !metrics.is_empty() + { + write!(f, ", \"fulltext_index_apply_metrics\":{:?}", metrics)?; + } + if let Some(metrics) = fetch_metrics + && !metrics.is_empty() + { + write!(f, ", \"fetch_metrics\":{:?}", metrics)?; + } + if let Some(metrics) = metadata_cache_metrics + && !metrics.is_empty() + { + write!(f, ", \"metadata_cache_metrics\":{:?}", metrics)?; + } + + // Write merge metrics if not empty + if !merge_metrics.scan_cost.is_zero() { + write!(f, ", \"merge_metrics\":{:?}", merge_metrics)?; + } + + // Write dedup metrics if not empty + if !dedup_metrics.dedup_cost.is_zero() { + write!(f, ", \"dedup_metrics\":{:?}", dedup_metrics)?; + } + + // Write top file metrics if present and non-empty + if let Some(file_metrics) = per_file_metrics + && !file_metrics.is_empty() + { + // Use min-heap (BinaryHeap with reverse comparison) to keep only top 10 + let mut heap = BinaryHeap::new(); + for (file_id, metrics) in file_metrics.iter() { + let total_cost = + metrics.build_part_cost + metrics.build_reader_cost + metrics.scan_cost; + + if heap.len() < 10 { + // Haven't reached 10 yet, just push + heap.push(CompareCostReverse { + total_cost, + file_id: *file_id, + metrics, + }); + } else if let Some(min_entry) = heap.peek() { + // If current cost is higher than the minimum in our top-10, replace it + if total_cost > min_entry.total_cost { + heap.pop(); + heap.push(CompareCostReverse { + total_cost, + 
file_id: *file_id, + metrics, + }); + } + } + } + + let top_files = heap.into_sorted_vec(); + write!(f, ", \"top_file_metrics\": {{")?; + for (i, item) in top_files.iter().enumerate() { + let CompareCostReverse { + total_cost: _, + file_id, + metrics, + } = item; + if i > 0 { + write!(f, ", ")?; + } + write!(f, "\"{}\": {:?}", file_id, metrics)?; + } + write!(f, "}}")?; + } + write!(f, ", \"stream_eof\":{stream_eof}}}") } } @@ -298,14 +504,20 @@ impl ScanMetricsSet { rows_inverted_filtered, rows_bloom_filtered, rows_precise_filtered, + inverted_index_apply_metrics, + bloom_filter_apply_metrics, + fulltext_index_apply_metrics, }, num_record_batches, num_batches, num_rows, - scan_cost: _, + scan_cost, + metadata_cache_metrics, + fetch_metrics, } = other; self.build_parts_cost += *build_cost; + self.sst_scan_cost += *scan_cost; self.rg_total += *rg_total; self.rg_fulltext_filtered += *rg_fulltext_filtered; @@ -322,6 +534,42 @@ impl ScanMetricsSet { self.num_sst_record_batches += *num_record_batches; self.num_sst_batches += *num_batches; self.num_sst_rows += *num_rows; + + // Merge optional verbose metrics + if let Some(metrics) = inverted_index_apply_metrics { + self.inverted_index_apply_metrics + .get_or_insert_with(InvertedIndexApplyMetrics::default) + .merge_from(metrics); + } + if let Some(metrics) = bloom_filter_apply_metrics { + self.bloom_filter_apply_metrics + .get_or_insert_with(BloomFilterIndexApplyMetrics::default) + .merge_from(metrics); + } + if let Some(metrics) = fulltext_index_apply_metrics { + self.fulltext_index_apply_metrics + .get_or_insert_with(FulltextIndexApplyMetrics::default) + .merge_from(metrics); + } + if let Some(metrics) = fetch_metrics { + self.fetch_metrics + .get_or_insert_with(ParquetFetchMetrics::default) + .merge_from(metrics); + } + self.metadata_cache_metrics + .get_or_insert_with(MetadataCacheMetrics::default) + .merge_from(metadata_cache_metrics); + } + + /// Merges per-file metrics. + fn merge_per_file_metrics(&mut self, other: &HashMap) { + let self_file_metrics = self.per_file_metrics.get_or_insert_with(HashMap::new); + for (file_id, metrics) in other { + self_file_metrics + .entry(*file_id) + .or_default() + .merge_from(metrics); + } } /// Sets distributor metrics. @@ -442,6 +690,28 @@ impl PartitionMetricsInner { } } +impl MergeMetricsReport for PartitionMetricsInner { + fn report(&self, metrics: &mut MergeMetrics) { + let mut scan_metrics = self.metrics.lock().unwrap(); + // Merge the metrics into scan_metrics + scan_metrics.merge_metrics.merge(metrics); + + // Reset the input metrics + *metrics = MergeMetrics::default(); + } +} + +impl DedupMetricsReport for PartitionMetricsInner { + fn report(&self, metrics: &mut DedupMetrics) { + let mut scan_metrics = self.metrics.lock().unwrap(); + // Merge the metrics into scan_metrics + scan_metrics.dedup_metrics.merge(metrics); + + // Reset the input metrics + *metrics = DedupMetrics::default(); + } +} + impl Drop for PartitionMetricsInner { fn drop(&mut self) { self.on_finish(false); @@ -592,11 +862,20 @@ impl PartitionMetrics { } /// Merges [ReaderMetrics] and `build_reader_cost`. 
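The `CompareCostReverse` wrapper above turns `BinaryHeap` into a bounded min-heap so only the ten most expensive files are kept for the `top_file_metrics` output. The sketch below shows the same top-K technique using `std::cmp::Reverse`; the production code hand-rolls the reversed `Ord` because its entries also carry a reference to the metrics struct, which is not itself orderable. The function and file names here are illustrative.

```rust
// Sketch: keep a bounded min-heap keyed by cost, popping the cheapest entry
// whenever the heap grows past K, so the K most expensive files survive.
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::time::Duration;

fn top_k_by_cost<'a>(files: &[(&'a str, Duration)], k: usize) -> Vec<(&'a str, Duration)> {
    let mut heap = BinaryHeap::with_capacity(k + 1);
    for &(name, cost) in files {
        heap.push(Reverse((cost, name)));
        if heap.len() > k {
            // Drop the cheapest entry so the heap always holds the K most expensive files.
            heap.pop();
        }
    }
    // Ascending order of `Reverse` is descending order of cost: most expensive first.
    heap.into_sorted_vec()
        .into_iter()
        .map(|Reverse((cost, name))| (name, cost))
        .collect()
}

fn main() {
    let files = [
        ("a.parquet", Duration::from_millis(5)),
        ("b.parquet", Duration::from_millis(40)),
        ("c.parquet", Duration::from_millis(12)),
    ];
    let top = top_k_by_cost(&files, 2);
    assert_eq!(top[0].0, "b.parquet");
    assert_eq!(top[1].0, "c.parquet");
}
```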
- pub fn merge_reader_metrics(&self, metrics: &ReaderMetrics) { + pub fn merge_reader_metrics( + &self, + metrics: &ReaderMetrics, + per_file_metrics: Option<&HashMap>, + ) { self.0.build_parts_cost.add_duration(metrics.build_cost); let mut metrics_set = self.0.metrics.lock().unwrap(); metrics_set.merge_reader_metrics(metrics); + + // Merge per-file metrics if provided + if let Some(file_metrics) = per_file_metrics { + metrics_set.merge_per_file_metrics(file_metrics); + } } /// Finishes the query. @@ -609,6 +888,21 @@ impl PartitionMetrics { let mut metrics_set = self.0.metrics.lock().unwrap(); metrics_set.set_distributor_metrics(metrics); } + + /// Returns whether verbose explain is enabled. + pub(crate) fn explain_verbose(&self) -> bool { + self.0.explain_verbose + } + + /// Returns a MergeMetricsReport trait object for reporting merge metrics. + pub(crate) fn merge_metrics_reporter(&self) -> Arc { + self.0.clone() + } + + /// Returns a DedupMetricsReport trait object for reporting dedup metrics. + pub(crate) fn dedup_metrics_reporter(&self) -> Arc { + self.0.clone() + } } impl fmt::Debug for PartitionMetrics { @@ -697,6 +991,86 @@ pub(crate) fn scan_flat_mem_ranges( } } +/// Files with row count greater than this threshold can contribute to the estimation. +const SPLIT_ROW_THRESHOLD: u64 = DEFAULT_ROW_GROUP_SIZE as u64; +/// Number of series threshold for splitting batches. +const NUM_SERIES_THRESHOLD: u64 = 10240; +/// Minimum batch size after splitting. The batch size is less than 60 because a series may only have +/// 60 samples per hour. +const BATCH_SIZE_THRESHOLD: u64 = 50; + +/// Returns true if splitting flat record batches may improve merge performance. +pub(crate) fn should_split_flat_batches_for_merge( + stream_ctx: &Arc, + range_meta: &RangeMeta, +) -> bool { + // Number of files to split and scan. + let mut num_files_to_split = 0; + let mut num_mem_rows = 0; + let mut num_mem_series = 0; + // Checks each file range, returns early if any range is not splittable. + // For mem ranges, we collect the total number of rows and series because the number of rows in a + // mem range may be too small. + for index in &range_meta.row_group_indices { + if stream_ctx.is_mem_range_index(*index) { + let memtable = &stream_ctx.input.memtables[index.index]; + // Is mem range + let stats = memtable.stats(); + num_mem_rows += stats.num_rows(); + num_mem_series += stats.series_count(); + } else if stream_ctx.is_file_range_index(*index) { + // This is a file range. + let file_index = index.index - stream_ctx.input.num_memtables(); + let file = &stream_ctx.input.files[file_index]; + if file.meta_ref().num_rows < SPLIT_ROW_THRESHOLD || file.meta_ref().num_series == 0 { + // If the file doesn't have enough rows, or the number of series is unavailable, skips it. + continue; + } + debug_assert!(file.meta_ref().num_rows > 0); + if !can_split_series(file.meta_ref().num_rows, file.meta_ref().num_series) { + // We can't split batches in a file. + return false; + } else { + num_files_to_split += 1; + } + } + // Skips non-file and non-mem ranges. + } + + if num_files_to_split > 0 { + // We mainly consider file ranges because they have enough data for sampling. + true + } else if num_mem_series > 0 && num_mem_rows > 0 { + // If we don't have files to scan, we check whether to split by the memtable. 
+ can_split_series(num_mem_rows as u64, num_mem_series as u64) + } else { + false + } +} + +fn can_split_series(num_rows: u64, num_series: u64) -> bool { + assert!(num_series > 0); + assert!(num_rows > 0); + + // It doesn't have too many series or it will have enough rows for each batch. + num_series < NUM_SERIES_THRESHOLD || num_rows / num_series >= BATCH_SIZE_THRESHOLD +} + +/// Creates a new [ReaderFilterMetrics] with optional apply metrics initialized +/// based on the `explain_verbose` flag. +fn new_filter_metrics(explain_verbose: bool) -> ReaderFilterMetrics { + if explain_verbose { + ReaderFilterMetrics { + inverted_index_apply_metrics: Some(InvertedIndexApplyMetrics::default()), + bloom_filter_apply_metrics: Some(BloomFilterIndexApplyMetrics::default()), + fulltext_index_apply_metrics: Some(FulltextIndexApplyMetrics::default()), + ..Default::default() + } + } else { + ReaderFilterMetrics::default() + } +} + /// Scans file ranges at `index`. pub(crate) async fn scan_file_ranges( stream_ctx: Arc, @@ -705,18 +1079,40 @@ pub(crate) async fn scan_file_ranges( read_type: &'static str, range_builder: Arc, ) -> Result>> { - let mut reader_metrics = ReaderMetrics::default(); + let mut reader_metrics = ReaderMetrics { + filter_metrics: new_filter_metrics(part_metrics.explain_verbose()), + ..Default::default() + }; let ranges = range_builder .build_file_ranges(&stream_ctx.input, index, &mut reader_metrics) .await?; part_metrics.inc_num_file_ranges(ranges.len()); - part_metrics.merge_reader_metrics(&reader_metrics); + part_metrics.merge_reader_metrics(&reader_metrics, None); + + // Creates initial per-file metrics with build_part_cost. + let init_per_file_metrics = if part_metrics.explain_verbose() { + let file = stream_ctx.input.file_from_index(index); + let file_id = file.file_id(); + + let mut map = HashMap::new(); + map.insert( + file_id, + FileScanMetrics { + build_part_cost: reader_metrics.build_cost, + ..Default::default() + }, + ); + Some(map) + } else { + None + }; Ok(build_file_range_scan_stream( stream_ctx, part_metrics, read_type, ranges, + init_per_file_metrics, )) } @@ -728,18 +1124,40 @@ pub(crate) async fn scan_flat_file_ranges( read_type: &'static str, range_builder: Arc, ) -> Result>> { - let mut reader_metrics = ReaderMetrics::default(); + let mut reader_metrics = ReaderMetrics { + filter_metrics: new_filter_metrics(part_metrics.explain_verbose()), + ..Default::default() + }; let ranges = range_builder .build_file_ranges(&stream_ctx.input, index, &mut reader_metrics) .await?; part_metrics.inc_num_file_ranges(ranges.len()); - part_metrics.merge_reader_metrics(&reader_metrics); + part_metrics.merge_reader_metrics(&reader_metrics, None); + + // Creates initial per-file metrics with build_part_cost. + let init_per_file_metrics = if part_metrics.explain_verbose() { + let file = stream_ctx.input.file_from_index(index); + let file_id = file.file_id(); + + let mut map = HashMap::new(); + map.insert( + file_id, + FileScanMetrics { + build_part_cost: reader_metrics.build_cost, + ..Default::default() + }, + ); + Some(map) + } else { + None + }; Ok(build_flat_file_range_scan_stream( stream_ctx, part_metrics, read_type, ranges, + init_per_file_metrics, )) } @@ -749,12 +1167,21 @@ pub fn build_file_range_scan_stream( part_metrics: PartitionMetrics, read_type: &'static str, ranges: SmallVec<[FileRange; 2]>, + mut per_file_metrics: Option>, ) -> impl Stream> { try_stream! 
{ - let reader_metrics = &mut ReaderMetrics::default(); + let fetch_metrics = if part_metrics.explain_verbose() { + Some(Arc::new(ParquetFetchMetrics::default())) + } else { + None + }; + let reader_metrics = &mut ReaderMetrics { + fetch_metrics: fetch_metrics.clone(), + ..Default::default() + }; for range in ranges { let build_reader_start = Instant::now(); - let reader = range.reader(stream_ctx.input.series_row_selector).await?; + let reader = range.reader(stream_ctx.input.series_row_selector, fetch_metrics.as_deref()).await?; let build_cost = build_reader_start.elapsed(); part_metrics.inc_build_reader_cost(build_cost); let compat_batch = range.compat_batch(); @@ -767,6 +1194,20 @@ pub fn build_file_range_scan_stream( } if let Source::PruneReader(reader) = source { let prune_metrics = reader.metrics(); + + // Update per-file metrics if tracking is enabled + if let Some(file_metrics_map) = per_file_metrics.as_mut() { + let file_id = range.file_handle().file_id(); + let file_metrics = file_metrics_map + .entry(file_id) + .or_insert_with(FileScanMetrics::default); + + file_metrics.num_ranges += 1; + file_metrics.num_rows += prune_metrics.num_rows; + file_metrics.build_reader_cost += build_cost; + file_metrics.scan_cost += prune_metrics.scan_cost; + } + reader_metrics.merge_from(&prune_metrics); } } @@ -774,7 +1215,7 @@ pub fn build_file_range_scan_stream( // Reports metrics. reader_metrics.observe_rows(read_type); reader_metrics.filter_metrics.observe(); - part_metrics.merge_reader_metrics(reader_metrics); + part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref()); } } @@ -784,12 +1225,21 @@ pub fn build_flat_file_range_scan_stream( part_metrics: PartitionMetrics, read_type: &'static str, ranges: SmallVec<[FileRange; 2]>, + mut per_file_metrics: Option>, ) -> impl Stream> { try_stream! { - let reader_metrics = &mut ReaderMetrics::default(); + let fetch_metrics = if part_metrics.explain_verbose() { + Some(Arc::new(ParquetFetchMetrics::default())) + } else { + None + }; + let reader_metrics = &mut ReaderMetrics { + fetch_metrics: fetch_metrics.clone(), + ..Default::default() + }; for range in ranges { let build_reader_start = Instant::now(); - let mut reader = range.flat_reader().await?; + let mut reader = range.flat_reader(fetch_metrics.as_deref()).await?; let build_cost = build_reader_start.elapsed(); part_metrics.inc_build_reader_cost(build_cost); @@ -811,13 +1261,27 @@ pub fn build_flat_file_range_scan_stream( } let prune_metrics = reader.metrics(); + + // Update per-file metrics if tracking is enabled + if let Some(file_metrics_map) = per_file_metrics.as_mut() { + let file_id = range.file_handle().file_id(); + let file_metrics = file_metrics_map + .entry(file_id) + .or_insert_with(FileScanMetrics::default); + + file_metrics.num_ranges += 1; + file_metrics.num_rows += prune_metrics.num_rows; + file_metrics.build_reader_cost += build_cost; + file_metrics.scan_cost += prune_metrics.scan_cost; + } + reader_metrics.merge_from(&prune_metrics); } // Reports metrics. reader_metrics.observe_rows(read_type); reader_metrics.filter_metrics.observe(); - part_metrics.merge_reader_metrics(reader_metrics); + part_metrics.merge_reader_metrics(reader_metrics, per_file_metrics.as_ref()); } } @@ -876,3 +1340,83 @@ pub(crate) async fn maybe_scan_flat_other_ranges( } .fail() } + +/// A stream wrapper that splits record batches from an inner stream. +pub(crate) struct SplitRecordBatchStream { + /// The inner stream that yields record batches. + inner: S, + /// Buffer for split batches. 
+ batches: VecDeque, +} + +impl SplitRecordBatchStream { + /// Creates a new splitting stream wrapper. + pub(crate) fn new(inner: S) -> Self { + Self { + inner, + batches: VecDeque::new(), + } + } +} + +impl Stream for SplitRecordBatchStream +where + S: Stream> + Unpin, +{ + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + // First, check if we have buffered split batches + if let Some(batch) = self.batches.pop_front() { + return Poll::Ready(Some(Ok(batch))); + } + + // Poll the inner stream for the next batch + let record_batch = match futures::ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(batch)) => batch, + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => return Poll::Ready(None), + }; + + // Split the batch and buffer the results + split_record_batch(record_batch, &mut self.batches); + // Continue the loop to return the first split batch + } + } +} + +/// Splits the batch by timestamps. +/// +/// # Panics +/// Panics if the timestamp array is invalid. +pub(crate) fn split_record_batch(record_batch: RecordBatch, batches: &mut VecDeque) { + let batch_rows = record_batch.num_rows(); + if batch_rows == 0 { + return; + } + if batch_rows < 2 { + batches.push_back(record_batch); + return; + } + + let time_index_pos = time_index_column_index(record_batch.num_columns()); + let timestamps = record_batch.column(time_index_pos); + let (ts_values, _unit) = timestamp_array_to_primitive(timestamps).unwrap(); + let mut offsets = Vec::with_capacity(16); + offsets.push(0); + let values = ts_values.values(); + for (i, &value) in values.iter().take(batch_rows - 1).enumerate() { + if value > values[i + 1] { + offsets.push(i + 1); + } + } + offsets.push(values.len()); + + // Splits the batch by offsets. + for (i, &start) in offsets[..offsets.len() - 1].iter().enumerate() { + let end = offsets[i + 1]; + let rows_in_batch = end - start; + batches.push_back(record_batch.slice(start, rows_in_batch)); + } +} diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index c90ea89b90..41f6dc7772 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -44,8 +44,9 @@ use crate::read::merge::MergeReaderBuilder; use crate::read::range::{RangeBuilderList, RangeMeta}; use crate::read::scan_region::{ScanInput, StreamContext}; use crate::read::scan_util::{ - PartitionMetrics, PartitionMetricsList, scan_file_ranges, scan_flat_file_ranges, - scan_flat_mem_ranges, scan_mem_ranges, + PartitionMetrics, PartitionMetricsList, SplitRecordBatchStream, scan_file_ranges, + scan_flat_file_ranges, scan_flat_mem_ranges, scan_mem_ranges, + should_split_flat_batches_for_merge, }; use crate::read::stream::{ConvertBatchStream, ScanBatch, ScanBatchStream}; use crate::read::{ @@ -177,6 +178,7 @@ impl SeqScan { part_metrics, range_builder_list.clone(), &mut sources, + None, ) .await?; } @@ -187,7 +189,7 @@ impl SeqScan { partition_ranges.len(), sources.len() ); - Self::build_reader_from_sources(stream_ctx, sources, None).await + Self::build_reader_from_sources(stream_ctx, sources, None, None).await } /// Builds a merge reader that reads all flat ranges. 
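As an aside, the `split_record_batch` helper added above derives its split points purely from the time index column: whenever a timestamp is greater than its successor, a new series must have started. A minimal standalone sketch of that offset computation (a plain `i64` slice instead of an Arrow timestamp array; illustrative only):

/// Simplified version of the offset computation in `split_record_batch`:
/// a new slice starts wherever the timestamp value decreases.
fn split_offsets(ts: &[i64]) -> Vec<usize> {
    let mut offsets = vec![0];
    for (i, pair) in ts.windows(2).enumerate() {
        if pair[0] > pair[1] {
            offsets.push(i + 1);
        }
    }
    offsets.push(ts.len());
    offsets
}

fn main() {
    // Two series back to back: timestamps [1, 2, 3] then [1, 2].
    let offsets = split_offsets(&[1, 2, 3, 1, 2]);
    assert_eq!(offsets, vec![0, 3, 5]);
    // Each adjacent pair (start, end) becomes one `record_batch.slice(start, end - start)`.
}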
@@ -210,6 +212,7 @@ impl SeqScan { part_metrics, range_builder_list.clone(), &mut sources, + None, ) .await?; } @@ -220,7 +223,7 @@ impl SeqScan { partition_ranges.len(), sources.len() ); - Self::build_flat_reader_from_sources(stream_ctx, sources, None).await + Self::build_flat_reader_from_sources(stream_ctx, sources, None, None).await } /// Builds a reader to read sources. If `semaphore` is provided, reads sources in parallel @@ -230,6 +233,7 @@ impl SeqScan { stream_ctx: &StreamContext, mut sources: Vec, semaphore: Option>, + part_metrics: Option<&PartitionMetrics>, ) -> Result { if let Some(semaphore) = semaphore.as_ref() { // Read sources in parallel. @@ -241,18 +245,24 @@ impl SeqScan { } let mut builder = MergeReaderBuilder::from_sources(sources); + if let Some(metrics) = part_metrics { + builder.with_metrics_reporter(Some(metrics.merge_metrics_reporter())); + } let reader = builder.build().await?; let dedup = !stream_ctx.input.append_mode; + let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter()); let reader = if dedup { match stream_ctx.input.merge_mode { MergeMode::LastRow => Box::new(DedupReader::new( reader, LastRow::new(stream_ctx.input.filter_deleted), + dedup_metrics_reporter, )) as _, MergeMode::LastNonNull => Box::new(DedupReader::new( reader, LastNonNull::new(stream_ctx.input.filter_deleted), + dedup_metrics_reporter, )) as _, } } else { @@ -274,6 +284,7 @@ impl SeqScan { stream_ctx: &StreamContext, mut sources: Vec, semaphore: Option>, + part_metrics: Option<&PartitionMetrics>, ) -> Result { if let Some(semaphore) = semaphore.as_ref() { // Read sources in parallel. @@ -287,15 +298,20 @@ impl SeqScan { let mapper = stream_ctx.input.mapper.as_flat().unwrap(); let schema = mapper.input_arrow_schema(stream_ctx.input.compaction); - let reader = FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE).await?; + let metrics_reporter = part_metrics.map(|m| m.merge_metrics_reporter()); + let reader = + FlatMergeReader::new(schema, sources, DEFAULT_READ_BATCH_SIZE, metrics_reporter) + .await?; let dedup = !stream_ctx.input.append_mode; + let dedup_metrics_reporter = part_metrics.map(|m| m.dedup_metrics_reporter()); let reader = if dedup { match stream_ctx.input.merge_mode { MergeMode::LastRow => Box::pin( FlatDedupReader::new( reader.into_stream().boxed(), FlatLastRow::new(stream_ctx.input.filter_deleted), + dedup_metrics_reporter, ) .into_stream(), ) as _, @@ -306,6 +322,7 @@ impl SeqScan { mapper.field_column_start(), stream_ctx.input.filter_deleted, ), + dedup_metrics_reporter, ) .into_stream(), ) as _, @@ -378,6 +395,7 @@ impl SeqScan { let partition_ranges = self.properties.partitions[partition].clone(); let compaction = self.stream_ctx.input.compaction; let distinguish_range = self.properties.distinguish_partition_range; + let file_scan_semaphore = if compaction { None } else { semaphore.clone() }; let stream = try_stream! 
{ part_metrics.on_first_poll(); @@ -399,12 +417,13 @@ impl SeqScan { &part_metrics, range_builder_list.clone(), &mut sources, + file_scan_semaphore.clone(), ).await?; let mut metrics = ScannerMetrics::default(); let mut fetch_start = Instant::now(); let mut reader = - Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone()) + Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics)) .await?; #[cfg(debug_assertions)] let mut checker = crate::read::BatchChecker::default() @@ -475,6 +494,7 @@ impl SeqScan { let semaphore = self.new_semaphore(); let partition_ranges = self.properties.partitions[partition].clone(); let compaction = self.stream_ctx.input.compaction; + let file_scan_semaphore = if compaction { None } else { semaphore.clone() }; let stream = try_stream! { part_metrics.on_first_poll(); @@ -493,12 +513,13 @@ impl SeqScan { &part_metrics, range_builder_list.clone(), &mut sources, + file_scan_semaphore.clone(), ).await?; let mut metrics = ScannerMetrics::default(); let mut fetch_start = Instant::now(); let mut reader = - Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone()) + Self::build_flat_reader_from_sources(&stream_ctx, sources, semaphore.clone(), Some(&part_metrics)) .await?; while let Some(record_batch) = reader.try_next().await? { @@ -602,6 +623,10 @@ impl SeqScan { } impl RegionScanner for SeqScan { + fn name(&self) -> &str { + "SeqScan" + } + fn properties(&self) -> &ScannerProperties { &self.properties } @@ -682,6 +707,7 @@ pub(crate) async fn build_sources( part_metrics: &PartitionMetrics, range_builder_list: Arc, sources: &mut Vec, + semaphore: Option>, ) -> Result<()> { // Gets range meta. let range_meta = &stream_ctx.ranges[part_range.identifier]; @@ -699,35 +725,78 @@ pub(crate) async fn build_sources( } } - sources.reserve(range_meta.row_group_indices.len()); - for index in &range_meta.row_group_indices { - let stream = if stream_ctx.is_mem_range_index(*index) { + let read_type = if compaction { + "compaction" + } else { + "seq_scan_files" + }; + let num_indices = range_meta.row_group_indices.len(); + if num_indices == 0 { + return Ok(()); + } + + sources.reserve(num_indices); + let mut ordered_sources = Vec::with_capacity(num_indices); + ordered_sources.resize_with(num_indices, || None); + let mut file_scan_tasks = Vec::new(); + + for (position, index) in range_meta.row_group_indices.iter().enumerate() { + if stream_ctx.is_mem_range_index(*index) { let stream = scan_mem_ranges( stream_ctx.clone(), part_metrics.clone(), *index, range_meta.time_range, ); - Box::pin(stream) as _ + ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); } else if stream_ctx.is_file_range_index(*index) { - let read_type = if compaction { - "compaction" + if let Some(semaphore_ref) = semaphore.as_ref() { + // run in parallel, controlled by semaphore + let stream_ctx = stream_ctx.clone(); + let part_metrics = part_metrics.clone(); + let range_builder_list = range_builder_list.clone(); + let semaphore = Arc::clone(semaphore_ref); + let row_group_index = *index; + file_scan_tasks.push(async move { + let _permit = semaphore.acquire().await.unwrap(); + let stream = scan_file_ranges( + stream_ctx, + part_metrics, + row_group_index, + read_type, + range_builder_list, + ) + .await?; + Ok((position, Source::Stream(Box::pin(stream) as _))) + }); } else { - "seq_scan_files" - }; - let stream = scan_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - read_type, - range_builder_list.clone(), 
- ) - .await?; - Box::pin(stream) as _ + // no semaphore, run sequentially + let stream = scan_file_ranges( + stream_ctx.clone(), + part_metrics.clone(), + *index, + read_type, + range_builder_list.clone(), + ) + .await?; + ordered_sources[position] = Some(Source::Stream(Box::pin(stream) as _)); + } } else { - scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await? - }; - sources.push(Source::Stream(stream)); + let stream = + scan_util::maybe_scan_other_ranges(stream_ctx, *index, part_metrics).await?; + ordered_sources[position] = Some(Source::Stream(stream)); + } + } + + if !file_scan_tasks.is_empty() { + let results = futures::future::try_join_all(file_scan_tasks).await?; + for (position, source) in results { + ordered_sources[position] = Some(source); + } + } + + for source in ordered_sources.into_iter().flatten() { + sources.push(source); } Ok(()) } @@ -740,6 +809,7 @@ pub(crate) async fn build_flat_sources( part_metrics: &PartitionMetrics, range_builder_list: Arc, sources: &mut Vec, + semaphore: Option>, ) -> Result<()> { // Gets range meta. let range_meta = &stream_ctx.ranges[part_range.identifier]; @@ -757,31 +827,89 @@ pub(crate) async fn build_flat_sources( } } - sources.reserve(range_meta.row_group_indices.len()); - for index in &range_meta.row_group_indices { - let stream = if stream_ctx.is_mem_range_index(*index) { - let stream = scan_flat_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index); - Box::pin(stream) as _ - } else if stream_ctx.is_file_range_index(*index) { - let read_type = if compaction { - "compaction" - } else { - "seq_scan_files" - }; - let stream = scan_flat_file_ranges( - stream_ctx.clone(), - part_metrics.clone(), - *index, - read_type, - range_builder_list.clone(), - ) - .await?; - Box::pin(stream) as _ - } else { - scan_util::maybe_scan_flat_other_ranges(stream_ctx, *index, part_metrics).await? 
- }; - sources.push(stream); + let read_type = if compaction { + "compaction" + } else { + "seq_scan_files" + }; + let num_indices = range_meta.row_group_indices.len(); + if num_indices == 0 { + return Ok(()); } + + let should_split = should_split_flat_batches_for_merge(stream_ctx, range_meta); + sources.reserve(num_indices); + let mut ordered_sources = Vec::with_capacity(num_indices); + ordered_sources.resize_with(num_indices, || None); + let mut file_scan_tasks = Vec::new(); + + for (position, index) in range_meta.row_group_indices.iter().enumerate() { + if stream_ctx.is_mem_range_index(*index) { + let stream = scan_flat_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index); + ordered_sources[position] = Some(Box::pin(stream) as _); + } else if stream_ctx.is_file_range_index(*index) { + if let Some(semaphore_ref) = semaphore.as_ref() { + // run in parallel, controlled by semaphore + let stream_ctx = stream_ctx.clone(); + let part_metrics = part_metrics.clone(); + let range_builder_list = range_builder_list.clone(); + let semaphore = Arc::clone(semaphore_ref); + let row_group_index = *index; + file_scan_tasks.push(async move { + let _permit = semaphore.acquire().await.unwrap(); + let stream = scan_flat_file_ranges( + stream_ctx, + part_metrics, + row_group_index, + read_type, + range_builder_list, + ) + .await?; + Ok((position, Box::pin(stream) as _)) + }); + } else { + // no semaphore, run sequentially + let stream = scan_flat_file_ranges( + stream_ctx.clone(), + part_metrics.clone(), + *index, + read_type, + range_builder_list.clone(), + ) + .await?; + ordered_sources[position] = Some(Box::pin(stream) as _); + } + } else { + let stream = + scan_util::maybe_scan_flat_other_ranges(stream_ctx, *index, part_metrics).await?; + ordered_sources[position] = Some(stream); + } + } + + if !file_scan_tasks.is_empty() { + let results = futures::future::try_join_all(file_scan_tasks).await?; + for (position, stream) in results { + ordered_sources[position] = Some(stream); + } + } + + for stream in ordered_sources.into_iter().flatten() { + if should_split { + sources.push(Box::pin(SplitRecordBatchStream::new(stream))); + } else { + sources.push(stream); + } + } + + if should_split { + common_telemetry::debug!( + "Splitting record batches, region: {}, sources: {}, part_range: {:?}", + stream_ctx.input.region_metadata().region_id, + sources.len(), + part_range, + ); + } + Ok(()) } diff --git a/src/mito2/src/read/series_scan.rs b/src/mito2/src/read/series_scan.rs index a99e3c46bb..c485348806 100644 --- a/src/mito2/src/read/series_scan.rs +++ b/src/mito2/src/read/series_scan.rs @@ -284,6 +284,10 @@ fn new_channel_list(num_partitions: usize) -> (SenderList, ReceiverList) { } impl RegionScanner for SeriesScan { + fn name(&self) -> &str { + "SeriesScan" + } + fn properties(&self) -> &ScannerProperties { &self.properties } @@ -423,6 +427,7 @@ impl SeriesDistributor { &part_metrics, range_builder_list.clone(), &mut sources, + self.semaphore.clone(), ) .await?; } @@ -433,6 +438,7 @@ impl SeriesDistributor { &self.stream_ctx, sources, self.semaphore.clone(), + Some(&part_metrics), ) .await?; let mut metrics = SeriesDistributorMetrics::default(); @@ -507,15 +513,20 @@ impl SeriesDistributor { &part_metrics, range_builder_list.clone(), &mut sources, + self.semaphore.clone(), ) .await?; } } // Builds a reader that merge sources from all parts. 
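The reworked `build_sources` / `build_flat_sources` above acquire a semaphore permit per file scan and write each result back into `ordered_sources` by position, so source order is preserved even though the scans run concurrently. A minimal, self-contained sketch of that pattern (illustrative only; assumes `tokio` with the `sync`, `rt-multi-thread`, and `macros` features plus the `futures` crate, with a dummy `u64` workload standing in for `scan_file_ranges`):

use std::sync::Arc;

use futures::future::try_join_all;
use tokio::sync::Semaphore;

/// Runs one async job per input with at most `parallelism` jobs in flight and
/// restores results to their original positions.
async fn scan_all(inputs: Vec<u64>, parallelism: usize) -> Result<Vec<u64>, &'static str> {
    let semaphore = Arc::new(Semaphore::new(parallelism));
    let num_inputs = inputs.len();
    let mut tasks = Vec::with_capacity(num_inputs);
    for (position, input) in inputs.into_iter().enumerate() {
        let semaphore = Arc::clone(&semaphore);
        tasks.push(async move {
            // Limits concurrency; the permit is released when this future completes.
            let _permit = semaphore.acquire().await.unwrap();
            // A real caller would await the file-range scan here instead.
            Ok::<_, &'static str>((position, input * 2))
        });
    }

    let mut ordered = vec![None; num_inputs];
    for (position, value) in try_join_all(tasks).await? {
        ordered[position] = Some(value);
    }
    Ok(ordered.into_iter().flatten().collect())
}

#[tokio::main]
async fn main() {
    assert_eq!(scan_all(vec![1, 2, 3], 2).await.unwrap(), vec![2, 4, 6]);
}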
- let mut reader = - SeqScan::build_reader_from_sources(&self.stream_ctx, sources, self.semaphore.clone()) - .await?; + let mut reader = SeqScan::build_reader_from_sources( + &self.stream_ctx, + sources, + self.semaphore.clone(), + Some(&part_metrics), + ) + .await?; let mut metrics = SeriesDistributorMetrics::default(); let mut fetch_start = Instant::now(); diff --git a/src/mito2/src/read/stream.rs b/src/mito2/src/read/stream.rs index b51f9f5fcc..e621c77e36 100644 --- a/src/mito2/src/read/stream.rs +++ b/src/mito2/src/read/stream.rs @@ -109,7 +109,10 @@ impl ConvertBatchStream { compute::concat_batches(output_schema.arrow_schema(), &self.buffer) .context(ArrowComputeSnafu)?; - RecordBatch::try_from_df_record_batch(output_schema, record_batch) + Ok(RecordBatch::from_df_record_batch( + output_schema, + record_batch, + )) } ScanBatch::RecordBatch(df_record_batch) => { // Safety: Only flat format returns this batch. diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index 8dbfcf07ec..c0a48f60da 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -399,6 +399,10 @@ impl UnorderedScan { } impl RegionScanner for UnorderedScan { + fn name(&self) -> &str { + "UnorderedScan" + } + fn properties(&self) -> &ScannerProperties { &self.properties } diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index 76ff739351..e83a08ba74 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -17,12 +17,13 @@ pub mod catchup; pub mod opener; pub mod options; +pub mod utils; pub(crate) mod version; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; use common_telemetry::{error, info, warn}; use crossbeam_utils::atomic::AtomicCell; @@ -34,9 +35,11 @@ use store_api::metadata::RegionMetadataRef; use store_api::region_engine::{ RegionManifestInfo, RegionRole, RegionStatistic, SettableRegionRoleState, }; +use store_api::region_request::PathType; use store_api::sst_entry::ManifestSstEntry; -use store_api::storage::{RegionId, SequenceNumber}; +use store_api::storage::{FileId, RegionId, SequenceNumber}; use tokio::sync::RwLockWriteGuard; +pub use utils::*; use crate::access_layer::AccessLayerRef; use crate::error::{ @@ -49,6 +52,7 @@ use crate::manifest::action::{ use crate::manifest::manager::RegionManifestManager; use crate::region::version::{VersionControlRef, VersionRef}; use crate::request::{OnFailure, OptionOutputTx}; +use crate::sst::file::FileMeta; use crate::sst::file_purger::FilePurgerRef; use crate::sst::location::{index_file_path, sst_file_path}; use crate::time_provider::TimeProviderRef; @@ -77,6 +81,8 @@ pub enum RegionLeaderState { Writable, /// The region is in staging mode - writable but no checkpoint/compaction. Staging, + /// The region is entering staging mode. - write requests will be stalled. + EnteringStaging, /// The region is altering. Altering, /// The region is dropping. @@ -138,6 +144,14 @@ pub struct MitoRegion { pub(crate) topic_latest_entry_id: AtomicU64, /// The total bytes written to the region. pub(crate) written_bytes: Arc, + /// The partition expression of the region in staging mode. + /// + /// During the staging mode, the region metadata in [`VersionControlRef`] is not updated, + /// so we need to store the partition expression separately. + /// TODO(weny): + /// 1. Reload the staging partition expr during region open. + /// 2. 
Rejects requests with mismatching partition expr. + pub(crate) staging_partition_expr: Mutex>, /// manifest stats stats: ManifestStats, } @@ -205,6 +219,11 @@ impl MitoRegion { self.access_layer.table_dir() } + /// Returns the path type of the region. + pub(crate) fn path_type(&self) -> PathType { + self.access_layer.path_type() + } + /// Returns whether the region is writable. pub(crate) fn is_writable(&self) -> bool { matches!( @@ -326,11 +345,19 @@ impl MitoRegion { ) } + /// Sets the entering staging state. + pub(crate) fn set_entering_staging(&self) -> Result<()> { + self.compare_exchange_state( + RegionLeaderState::Writable, + RegionRoleState::Leader(RegionLeaderState::EnteringStaging), + ) + } + /// Exits the staging state back to writable. /// /// You should call this method in the worker loop. /// Transitions from Staging to Writable state. - fn exit_staging(&self) -> Result<()> { + pub fn exit_staging(&self) -> Result<()> { self.compare_exchange_state( RegionLeaderState::Staging, RegionRoleState::Leader(RegionLeaderState::Writable), @@ -457,10 +484,7 @@ impl MitoRegion { sst_format: current_version.options.sst_format.unwrap_or_default(), }); let result = manager - .update( - RegionMetaActionList::with_action(action), - RegionRoleState::Leader(RegionLeaderState::Writable), - ) + .update(RegionMetaActionList::with_action(action), false) .await; match result { @@ -492,6 +516,16 @@ impl MitoRegion { } } + /// Switches the region state to `RegionRoleState::Leader(RegionLeaderState::Staging)` if the current state is `expect`. + /// Otherwise, logs an error. + pub(crate) fn switch_state_to_staging(&self, expect: RegionLeaderState) { + if let Err(e) = + self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Staging)) + { + error!(e; "failed to switch region state to staging, expect state is {:?}", expect); + } + } + /// Returns the region statistic. 
pub(crate) fn region_statistic(&self) -> RegionStatistic { let version = self.version(); @@ -507,6 +541,7 @@ impl MitoRegion { let num_rows = version.ssts.num_rows() + version.memtables.num_rows(); let num_files = version.ssts.num_files(); let manifest_version = self.stats.manifest_version(); + let file_removed_cnt = self.stats.file_removed_cnt(); let topic_latest_entry_id = self.topic_latest_entry_id.load(Ordering::Relaxed); let written_bytes = self.written_bytes.load(Ordering::Relaxed); @@ -522,6 +557,7 @@ impl MitoRegion { manifest: RegionManifestInfo::Mito { manifest_version, flushed_entry_id, + file_removed_cnt, }, data_topic_latest_entry_id: topic_latest_entry_id, metadata_topic_latest_entry_id: topic_latest_entry_id, @@ -573,25 +609,33 @@ impl MitoRegion { .flat_map(|level| level.files().map(|file| file.file_id().file_id())) .collect::>(); - self.manifest_ctx - .manifest() + let manifest_files = self.manifest_ctx.manifest().await.files.clone(); + let staging_files = self + .manifest_ctx + .staging_manifest() .await - .files + .map(|m| m.files.clone()) + .unwrap_or_default(); + let files = manifest_files + .into_iter() + .chain(staging_files.into_iter()) + .collect::>(); + + files .values() .map(|meta| { let region_id = self.region_id; let origin_region_id = meta.region_id; - let (index_file_id, index_file_path, index_file_size) = if meta.index_file_size > 0 + let (index_version, index_file_path, index_file_size) = if meta.index_file_size > 0 { - let index_file_path = - index_file_path(table_dir, meta.index_file_id(), path_type); + let index_file_path = index_file_path(table_dir, meta.index_id(), path_type); ( - Some(meta.index_file_id().file_id().to_string()), + meta.index_version, Some(index_file_path), Some(meta.index_file_size), ) } else { - (None, None, None) + (0, None, None) }; let visible = visible_ssts.contains(&meta.file_id); ManifestSstEntry { @@ -602,7 +646,7 @@ impl MitoRegion { region_group: region_id.region_group(), region_sequence: region_id.region_sequence(), file_id: meta.file_id.to_string(), - index_file_id, + index_version, level: meta.level, file_path: sst_file_path(table_dir, meta.file_id(), path_type), file_size: meta.file_size, @@ -622,6 +666,16 @@ impl MitoRegion { .collect() } + /// Returns the file metas of the region by file ids. + pub async fn file_metas(&self, file_ids: &[FileId]) -> Vec> { + let manifest_files = self.manifest_ctx.manifest().await.files.clone(); + + file_ids + .iter() + .map(|file_id| manifest_files.get(file_id).cloned()) + .collect::>() + } + /// Exit staging mode successfully by merging all staged manifests and making them visible. pub(crate) async fn exit_staging_on_success( &self, @@ -652,9 +706,8 @@ impl MitoRegion { }; // Submit merged actions using the manifest manager's update method - // Pass the target state (Writable) so it saves to normal directory, not staging - let target_state = RegionRoleState::Leader(RegionLeaderState::Writable); - let new_version = manager.update(merged_actions.clone(), target_state).await?; + // Pass the `false` so it saves to normal directory, not staging + let new_version = manager.update(merged_actions.clone(), false).await?; info!( "Successfully submitted merged staged manifests for region {}, new version: {}", @@ -729,6 +782,7 @@ impl ManifestContext { &self, expect_state: RegionLeaderState, action_list: RegionMetaActionList, + is_staging: bool, ) -> Result { // Acquires the write lock of the manifest manager. 
let mut manager = self.manifest_manager.write().await; @@ -804,7 +858,7 @@ impl ManifestContext { } // Now we can update the manifest. - let version = manager.update(action_list, current_state).await.inspect_err( + let version = manager.update(action_list, is_staging).await.inspect_err( |e| error!(e; "Failed to update manifest, region_id: {}", manifest.metadata.region_id), )?; @@ -911,9 +965,17 @@ impl ManifestContext { } } + /// Returns the normal manifest of the region. pub(crate) async fn manifest(&self) -> Arc { self.manifest_manager.read().await.manifest() } + + /// Returns the staging manifest of the region. + pub(crate) async fn staging_manifest( + &self, + ) -> Option> { + self.manifest_manager.read().await.staging_manifest() + } } pub(crate) type ManifestContextRef = Arc; @@ -1033,6 +1095,24 @@ impl RegionMap { Ok(region) } + /// Gets staging region by region id. + /// + /// Returns error if the region does not exist or is not in staging state. + pub(crate) fn staging_region(&self, region_id: RegionId) -> Result { + let region = self + .get_region(region_id) + .context(RegionNotFoundSnafu { region_id })?; + ensure!( + region.is_staging(), + RegionStateSnafu { + region_id, + state: region.state(), + expect: RegionRoleState::Leader(RegionLeaderState::Staging), + } + ); + Ok(region) + } + /// Gets flushable region by region id. /// /// Returns error if the region does not exist or is not operable. @@ -1171,9 +1251,10 @@ pub(crate) type CatchupRegionsRef = Arc; /// Manifest stats. #[derive(Default, Debug, Clone)] -pub(crate) struct ManifestStats { - total_manifest_size: Arc, - manifest_version: Arc, +pub struct ManifestStats { + pub(crate) total_manifest_size: Arc, + pub(crate) manifest_version: Arc, + pub(crate) file_removed_cnt: Arc, } impl ManifestStats { @@ -1184,12 +1265,16 @@ impl ManifestStats { fn manifest_version(&self) -> u64 { self.manifest_version.load(Ordering::Relaxed) } + + fn file_removed_cnt(&self) -> u64 { + self.file_removed_cnt.load(Ordering::Relaxed) + } } #[cfg(test)] mod tests { - use std::sync::Arc; use std::sync::atomic::AtomicU64; + use std::sync::{Arc, Mutex}; use common_datasource::compression::CompressionType; use common_test_util::temp_dir::create_temp_dir; @@ -1288,10 +1373,10 @@ mod tests { compress_type: CompressionType::Uncompressed, checkpoint_distance: 10, remove_file_options: Default::default(), + manifest_cache: None, }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(); @@ -1355,10 +1440,10 @@ mod tests { compress_type: CompressionType::Uncompressed, checkpoint_distance: 10, remove_file_options: Default::default(), + manifest_cache: None, }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(); @@ -1381,6 +1466,7 @@ mod tests { topic_latest_entry_id: Default::default(), written_bytes: Arc::new(AtomicU64::new(0)), stats: ManifestStats::default(), + staging_partition_expr: Mutex::new(None), }; // Test initial state diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 06e603d613..60abdbd29b 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -16,8 +16,8 @@ use std::any::TypeId; use std::collections::HashMap; -use std::sync::Arc; use std::sync::atomic::{AtomicI64, AtomicU64}; +use std::sync::{Arc, Mutex}; use std::time::Instant; use common_telemetry::{debug, error, info, warn}; @@ -28,7 +28,7 @@ use log_store::kafka::log_store::KafkaLogStore; use 
log_store::noop::log_store::NoopLogStore; use log_store::raft_engine::log_store::RaftEngineLogStore; use object_store::manager::ObjectStoreManagerRef; -use object_store::util::{join_dir, normalize_dir}; +use object_store::util::normalize_dir; use snafu::{OptionExt, ResultExt, ensure}; use store_api::logstore::LogStore; use store_api::logstore::provider::Provider; @@ -41,7 +41,7 @@ use store_api::storage::{ColumnId, RegionId}; use crate::access_layer::AccessLayer; use crate::cache::CacheManagerRef; -use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey}; +use crate::cache::file_cache::{FileCache, FileType, IndexKey}; use crate::config::MitoConfig; use crate::error; use crate::error::{ @@ -49,8 +49,7 @@ use crate::error::{ Result, StaleLogEntrySnafu, }; use crate::manifest::action::RegionManifest; -use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions, RemoveFileOptions}; -use crate::manifest::storage::manifest_compress_type; +use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; use crate::memtable::MemtableBuilderProvider; use crate::memtable::bulk::part::BulkPart; use crate::memtable::time_partition::{TimePartitions, TimePartitionsRef}; @@ -64,8 +63,8 @@ use crate::region_write_ctx::RegionWriteCtx; use crate::request::OptionOutputTx; use crate::schedule::scheduler::SchedulerRef; use crate::sst::FormatType; -use crate::sst::file::RegionFileId; -use crate::sst::file_purger::{FilePurgerRef, create_local_file_purger}; +use crate::sst::file::{RegionFileId, RegionIndexId}; +use crate::sst::file_purger::{FilePurgerRef, create_file_purger}; use crate::sst::file_ref::FileReferenceManagerRef; use crate::sst::index::intermediate::IntermediateManager; use crate::sst::index::puffin_manager::PuffinManagerFactory; @@ -271,17 +270,22 @@ impl RegionOpener { FormatType::PrimaryKey }; // Create a manifest manager for this region and writes regions to the manifest file. - let region_manifest_options = - Self::manifest_options(config, &options, ®ion_dir, &self.object_store_manager)?; + let mut region_manifest_options = + RegionManifestOptions::new(config, ®ion_dir, &object_store); + // Set manifest cache if available + region_manifest_options.manifest_cache = self + .cache_manager + .as_ref() + .and_then(|cm| cm.write_cache()) + .and_then(|wc| wc.manifest_cache()); // For remote WAL, we need to set flushed_entry_id to current topic's latest entry id. let flushed_entry_id = provider.initial_flushed_entry_id::(wal.store()); let manifest_manager = RegionManifestManager::new( metadata.clone(), flushed_entry_id, region_manifest_options, - self.stats.total_manifest_size.clone(), - self.stats.manifest_version.clone(), sst_format, + &self.stats, ) .await?; @@ -322,7 +326,8 @@ impl RegionOpener { manifest_manager, RegionRoleState::Leader(RegionLeaderState::Writable), )), - file_purger: create_local_file_purger( + file_purger: create_file_purger( + config.gc.enable, self.purge_scheduler, access_layer, self.cache_manager, @@ -335,6 +340,7 @@ impl RegionOpener { topic_latest_entry_id: AtomicU64::new(0), written_bytes: Arc::new(AtomicU64::new(0)), stats: self.stats, + staging_partition_expr: Mutex::new(None), })) } @@ -351,7 +357,7 @@ impl RegionOpener { let region = self .maybe_open(config, wal) .await? 
- .context(EmptyRegionDirSnafu { + .with_context(|| EmptyRegionDirSnafu { region_id, region_dir: ®ion_dir, })?; @@ -406,19 +412,17 @@ impl RegionOpener { ) -> Result> { let now = Instant::now(); let mut region_options = self.options.as_ref().unwrap().clone(); - - let region_manifest_options = Self::manifest_options( - config, - ®ion_options, - &self.region_dir(), - &self.object_store_manager, - )?; - let Some(manifest_manager) = RegionManifestManager::open( - region_manifest_options, - self.stats.total_manifest_size.clone(), - self.stats.manifest_version.clone(), - ) - .await? + let object_storage = get_object_store(®ion_options.storage, &self.object_store_manager)?; + let mut region_manifest_options = + RegionManifestOptions::new(config, &self.region_dir(), &object_storage); + // Set manifest cache if available + region_manifest_options.manifest_cache = self + .cache_manager + .as_ref() + .and_then(|cm| cm.write_cache()) + .and_then(|wc| wc.manifest_cache()); + let Some(manifest_manager) = + RegionManifestManager::open(region_manifest_options, &self.stats).await? else { return Ok(None); }; @@ -459,7 +463,8 @@ impl RegionOpener { self.puffin_manager_factory.clone(), self.intermediate_manager.clone(), )); - let file_purger = create_local_file_purger( + let file_purger = create_file_purger( + config.gc.enable, self.purge_scheduler.clone(), access_layer.clone(), self.cache_manager.clone(), @@ -571,6 +576,8 @@ impl RegionOpener { topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id), written_bytes: Arc::new(AtomicU64::new(0)), stats: self.stats.clone(), + // TODO(weny): reload the staging partition expr from the manifest. + staging_partition_expr: Mutex::new(None), }; let region = Arc::new(region); @@ -579,28 +586,6 @@ impl RegionOpener { Ok(Some(region)) } - - /// Returns a new manifest options. - fn manifest_options( - config: &MitoConfig, - options: &RegionOptions, - region_dir: &str, - object_store_manager: &ObjectStoreManagerRef, - ) -> Result { - let object_store = get_object_store(&options.storage, object_store_manager)?; - Ok(RegionManifestOptions { - manifest_dir: new_manifest_dir(region_dir), - object_store, - // We don't allow users to set the compression algorithm as we use it as a file suffix. - // Currently, the manifest storage doesn't have good support for changing compression algorithms. - compress_type: manifest_compress_type(config.compress_manifest), - checkpoint_distance: config.manifest_checkpoint_distance, - remove_file_options: RemoveFileOptions { - keep_count: config.experimental_manifest_keep_removed_file_count, - keep_ttl: config.experimental_manifest_keep_removed_file_ttl, - }, - }) - } } /// Creates a version builder from a region manifest. @@ -651,58 +636,6 @@ pub fn get_object_store( } } -/// A loader for loading metadata from a region dir. -pub struct RegionMetadataLoader { - config: Arc, - object_store_manager: ObjectStoreManagerRef, -} - -impl RegionMetadataLoader { - /// Creates a new `RegionOpenerBuilder`. - pub fn new(config: Arc, object_store_manager: ObjectStoreManagerRef) -> Self { - Self { - config, - object_store_manager, - } - } - - /// Loads the metadata of the region from the region dir. - pub async fn load( - &self, - region_dir: &str, - region_options: &RegionOptions, - ) -> Result> { - let manifest = self.load_manifest(region_dir, region_options).await?; - Ok(manifest.map(|m| m.metadata.clone())) - } - - /// Loads the manifest of the region from the region dir. 
- pub async fn load_manifest( - &self, - region_dir: &str, - region_options: &RegionOptions, - ) -> Result>> { - let region_manifest_options = RegionOpener::manifest_options( - &self.config, - region_options, - region_dir, - &self.object_store_manager, - )?; - let Some(manifest_manager) = RegionManifestManager::open( - region_manifest_options, - Arc::new(AtomicU64::new(0)), - Arc::new(AtomicU64::new(0)), - ) - .await? - else { - return Ok(None); - }; - - let manifest = manifest_manager.manifest(); - Ok(Some(manifest)) - } -} - /// Checks whether the recovered region has the same schema as region to create. pub(crate) fn check_recovered_region( recovered: &RegionMetadata, @@ -856,11 +789,6 @@ where Ok(last_entry_id) } -/// Returns the directory to the manifest files. -pub(crate) fn new_manifest_dir(region_dir: &str) -> String { - join_dir(region_dir, "manifest") -} - /// A task to load and fill the region file cache. pub(crate) struct RegionLoadCacheTask { region: MitoRegionRef, @@ -872,14 +800,14 @@ impl RegionLoadCacheTask { } /// Fills the file cache with index files from the region. - pub(crate) async fn fill_cache(&self, file_cache: FileCacheRef) { + pub(crate) async fn fill_cache(&self, file_cache: &FileCache) { let region_id = self.region.region_id; let table_dir = self.region.access_layer.table_dir(); let path_type = self.region.access_layer.path_type(); let object_store = self.region.access_layer.object_store(); let version_control = &self.region.version_control; - // Collects IndexKeys and file sizes for files that need to be downloaded + // Collects IndexKeys, file sizes, and max timestamps for files that need to be downloaded let mut files_to_download = Vec::new(); let mut files_already_cached = 0; @@ -891,12 +819,16 @@ impl RegionLoadCacheTask { if file_meta.exists_index() { let puffin_key = IndexKey::new( file_meta.region_id, - file_meta.index_file_id().file_id(), - FileType::Puffin, + file_meta.file_id, + FileType::Puffin(file_meta.index_version), ); if !file_cache.contains_key(&puffin_key) { - files_to_download.push((puffin_key, file_meta.index_file_size)); + files_to_download.push(( + puffin_key, + file_meta.index_file_size, + file_meta.time_range.1, // max timestamp + )); } else { files_already_cached += 1; } @@ -906,6 +838,10 @@ impl RegionLoadCacheTask { // Releases the Version after the scope to avoid holding the memtables and file handles // for a long time. 
} + + // Sorts files by max timestamp in descending order to loads latest files first + files_to_download.sort_by(|a, b| b.2.cmp(&a.2)); + let total_files = files_to_download.len() as i64; info!( @@ -918,7 +854,7 @@ impl RegionLoadCacheTask { let mut files_downloaded = 0; let mut files_skipped = 0; - for (puffin_key, file_size) in files_to_download { + for (puffin_key, file_size, max_timestamp) in files_to_download { let current_size = file_cache.puffin_cache_size(); let capacity = file_cache.puffin_cache_capacity(); let region_state = self.region.state(); @@ -933,20 +869,26 @@ impl RegionLoadCacheTask { // Checks if adding this file would exceed capacity if current_size + file_size > capacity { info!( - "Stopping index cache preload due to capacity limit, region: {}, file_id: {}, current_size: {}, file_size: {}, capacity: {}", - region_id, puffin_key.file_id, current_size, file_size, capacity + "Stopping index cache preload due to capacity limit, region: {}, file_id: {}, current_size: {}, file_size: {}, capacity: {}, file_timestamp: {:?}", + region_id, puffin_key.file_id, current_size, file_size, capacity, max_timestamp ); files_skipped = (total_files - files_downloaded) as usize; CACHE_FILL_PENDING_FILES.sub(total_files - files_downloaded); break; } - let index_remote_path = location::index_file_path( - table_dir, + let index_version = if let FileType::Puffin(version) = puffin_key.file_type { + version + } else { + unreachable!("`files_to_download` should only contains Puffin files"); + }; + let index_id = RegionIndexId::new( RegionFileId::new(puffin_key.region_id, puffin_key.file_id), - path_type, + index_version, ); + let index_remote_path = location::index_file_path(table_dir, index_id, path_type); + match file_cache .download(puffin_key, &index_remote_path, object_store, file_size) .await @@ -1004,6 +946,7 @@ fn can_load_cache(state: RegionRoleState) -> bool { RegionRoleState::Leader(RegionLeaderState::Writable) | RegionRoleState::Leader(RegionLeaderState::Staging) | RegionRoleState::Leader(RegionLeaderState::Altering) + | RegionRoleState::Leader(RegionLeaderState::EnteringStaging) | RegionRoleState::Leader(RegionLeaderState::Editing) | RegionRoleState::Follower => true, // The region will be closed soon if it is downgrading. diff --git a/src/mito2/src/region/utils.rs b/src/mito2/src/region/utils.rs new file mode 100644 index 0000000000..25c084ef7a --- /dev/null +++ b/src/mito2/src/region/utils.rs @@ -0,0 +1,345 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
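The cache-preload change above collects index files together with their max timestamps, sorts them newest-first, and stops downloading once the puffin cache capacity would be exceeded. A simplified sketch of that selection policy (plain `(size, max_ts)` tuples instead of the real `IndexKey`/file-size/timestamp types, and a fixed capacity instead of querying the live cache size; illustrative only):

/// Picks which index files to preload: newest first, stopping before the
/// cache capacity would be exceeded.
fn select_files_to_preload(mut candidates: Vec<(u64, i64)>, capacity: u64) -> Vec<(u64, i64)> {
    // Sort by max timestamp in descending order so the latest files win.
    candidates.sort_by(|a, b| b.1.cmp(&a.1));

    let mut current_size = 0;
    let mut selected = Vec::new();
    for (size, max_ts) in candidates {
        if current_size + size > capacity {
            // The real task logs the capacity limit and stops here.
            break;
        }
        current_size += size;
        selected.push((size, max_ts));
    }
    selected
}

fn main() {
    // Three files: sizes 40/30/50 with max timestamps 10/30/20, capacity 80.
    let picked = select_files_to_preload(vec![(40, 10), (30, 30), (50, 20)], 80);
    // Newest first: (30, ts 30) then (50, ts 20); adding (40, ts 10) would exceed 80.
    assert_eq!(picked, vec![(30, 30), (50, 20)]);
}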
+ +use std::sync::Arc; +use std::time::Instant; + +use common_base::readable_size::ReadableSize; +use common_telemetry::{debug, error, info}; +use futures::future::try_join_all; +use object_store::manager::ObjectStoreManagerRef; +use snafu::{ResultExt, ensure}; +use store_api::metadata::RegionMetadataRef; +use store_api::region_request::PathType; +use store_api::storage::{FileId, IndexVersion, RegionId}; + +use crate::access_layer::AccessLayerRef; +use crate::config::MitoConfig; +use crate::error::{self, InvalidSourceAndTargetRegionSnafu, Result}; +use crate::manifest::action::RegionManifest; +use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions}; +use crate::region::opener::get_object_store; +use crate::region::options::RegionOptions; +use crate::sst::file::{RegionFileId, RegionIndexId}; +use crate::sst::location; + +/// A loader for loading metadata from a region dir. +#[derive(Debug, Clone)] +pub struct RegionMetadataLoader { + config: Arc, + object_store_manager: ObjectStoreManagerRef, +} + +impl RegionMetadataLoader { + /// Creates a new `RegionMetadataLoader`. + pub fn new(config: Arc, object_store_manager: ObjectStoreManagerRef) -> Self { + Self { + config, + object_store_manager, + } + } + + /// Loads the metadata of the region from the region dir. + pub async fn load( + &self, + region_dir: &str, + region_options: &RegionOptions, + ) -> Result> { + let manifest = self + .load_manifest(region_dir, ®ion_options.storage) + .await?; + Ok(manifest.map(|m| m.metadata.clone())) + } + + /// Loads the manifest of the region from the region dir. + pub async fn load_manifest( + &self, + region_dir: &str, + storage: &Option, + ) -> Result>> { + let object_store = get_object_store(storage, &self.object_store_manager)?; + let region_manifest_options = + RegionManifestOptions::new(&self.config, region_dir, &object_store); + let Some(manifest_manager) = + RegionManifestManager::open(region_manifest_options, &Default::default()).await? + else { + return Ok(None); + }; + + let manifest = manifest_manager.manifest(); + Ok(Some(manifest)) + } +} + +/// A copier for copying files from a region to another region. +#[derive(Debug, Clone)] +pub struct RegionFileCopier { + access_layer: AccessLayerRef, +} + +/// A descriptor for a file. +#[derive(Debug, Clone, Copy)] +pub enum FileDescriptor { + /// An index file. + Index { + file_id: FileId, + version: IndexVersion, + size: u64, + }, + /// A data file. + Data { file_id: FileId, size: u64 }, +} + +impl FileDescriptor { + pub fn size(&self) -> u64 { + match self { + FileDescriptor::Index { size, .. } => *size, + FileDescriptor::Data { size, .. } => *size, + } + } +} + +/// Builds the source and target file paths for a given file descriptor. +/// +/// # Arguments +/// +/// * `source_region_id`: The ID of the source region. +/// * `target_region_id`: The ID of the target region. +/// * `file_id`: The ID of the file. +/// +/// # Returns +/// +/// A tuple containing the source and target file paths. +fn build_copy_file_paths( + source_region_id: RegionId, + target_region_id: RegionId, + file_descriptor: FileDescriptor, + table_dir: &str, + path_type: PathType, +) -> (String, String) { + match file_descriptor { + FileDescriptor::Index { + file_id, version, .. 
+ } => ( + location::index_file_path( + table_dir, + RegionIndexId::new(RegionFileId::new(source_region_id, file_id), version), + path_type, + ), + location::index_file_path( + table_dir, + RegionIndexId::new(RegionFileId::new(target_region_id, file_id), version), + path_type, + ), + ), + FileDescriptor::Data { file_id, .. } => ( + location::sst_file_path( + table_dir, + RegionFileId::new(source_region_id, file_id), + path_type, + ), + location::sst_file_path( + table_dir, + RegionFileId::new(target_region_id, file_id), + path_type, + ), + ), + } +} + +fn build_delete_file_path( + target_region_id: RegionId, + file_descriptor: FileDescriptor, + table_dir: &str, + path_type: PathType, +) -> String { + match file_descriptor { + FileDescriptor::Index { + file_id, version, .. + } => location::index_file_path( + table_dir, + RegionIndexId::new(RegionFileId::new(target_region_id, file_id), version), + path_type, + ), + FileDescriptor::Data { file_id, .. } => location::sst_file_path( + table_dir, + RegionFileId::new(target_region_id, file_id), + path_type, + ), + } +} + +impl RegionFileCopier { + pub fn new(access_layer: AccessLayerRef) -> Self { + Self { access_layer } + } + + /// Copies files from a source region to a target region. + /// + /// # Arguments + /// + /// * `source_region_id`: The ID of the source region. + /// * `target_region_id`: The ID of the target region. + /// * `file_ids`: The IDs of the files to copy. + pub async fn copy_files( + &self, + source_region_id: RegionId, + target_region_id: RegionId, + file_ids: Vec, + parallelism: usize, + ) -> Result<()> { + ensure!( + source_region_id.table_id() == target_region_id.table_id(), + InvalidSourceAndTargetRegionSnafu { + source_region_id, + target_region_id, + }, + ); + let table_dir = self.access_layer.table_dir(); + let path_type = self.access_layer.path_type(); + let object_store = self.access_layer.object_store(); + + info!( + "Copying {} files from region {} to region {}", + file_ids.len(), + source_region_id, + target_region_id + ); + debug!( + "Copying files: {:?} from region {} to region {}", + file_ids, source_region_id, target_region_id + ); + let mut tasks = Vec::with_capacity(parallelism); + for skip in 0..parallelism { + let target_file_ids = file_ids.iter().skip(skip).step_by(parallelism).copied(); + let object_store = object_store.clone(); + tasks.push(async move { + for file_desc in target_file_ids { + let (source_path, target_path) = build_copy_file_paths( + source_region_id, + target_region_id, + file_desc, + table_dir, + path_type, + ); + let now = Instant::now(); + object_store + .copy(&source_path, &target_path) + .await + .inspect_err( + |e| error!(e; "Failed to copy file {} to {}", source_path, target_path), + ) + .context(error::OpenDalSnafu)?; + let file_size = ReadableSize(file_desc.size()); + info!( + "Copied file {} to {}, file size: {}, elapsed: {:?}", + source_path, + target_path, + file_size, + now.elapsed(), + ); + } + + Ok(()) + }); + } + + if let Err(err) = try_join_all(tasks).await { + error!(err; "Failed to copy files from region {} to region {}", source_region_id, target_region_id); + self.clean_target_region(target_region_id, file_ids).await; + return Err(err); + } + + Ok(()) + } + + /// Cleans the copied files from the target region. 
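// Worked example of the `skip(skip).step_by(parallelism)` striping in `copy_files` above
// (illustrative indices only): with parallelism = 3 and files [f0, f1, f2, f3, f4, f5, f6, f7],
// task 0 copies f0, f3, f6; task 1 copies f1, f4, f7; task 2 copies f2, f5.
// Every file is copied exactly once and the per-task work stays roughly balanced.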
+ async fn clean_target_region(&self, target_region_id: RegionId, file_ids: Vec) { + let table_dir = self.access_layer.table_dir(); + let path_type = self.access_layer.path_type(); + let object_store = self.access_layer.object_store(); + let delete_file_path = file_ids + .into_iter() + .map(|file_descriptor| { + build_delete_file_path(target_region_id, file_descriptor, table_dir, path_type) + }) + .collect::>(); + debug!( + "Deleting files: {:?} after failed to copy files to target region {}", + delete_file_path, target_region_id + ); + if let Err(err) = object_store.delete_iter(delete_file_path).await { + error!(err; "Failed to delete files from region {}", target_region_id); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_build_copy_file_paths() { + common_telemetry::init_default_ut_logging(); + let file_id = FileId::random(); + let source_region_id = RegionId::new(1, 1); + let target_region_id = RegionId::new(1, 2); + let file_descriptor = FileDescriptor::Data { file_id, size: 100 }; + let table_dir = "/table_dir"; + let path_type = PathType::Bare; + let (source_path, target_path) = build_copy_file_paths( + source_region_id, + target_region_id, + file_descriptor, + table_dir, + path_type, + ); + assert_eq!( + source_path, + format!("/table_dir/1_0000000001/{}.parquet", file_id) + ); + assert_eq!( + target_path, + format!("/table_dir/1_0000000002/{}.parquet", file_id) + ); + + let version = 1; + let file_descriptor = FileDescriptor::Index { + file_id, + version, + size: 100, + }; + let (source_path, target_path) = build_copy_file_paths( + source_region_id, + target_region_id, + file_descriptor, + table_dir, + path_type, + ); + assert_eq!( + source_path, + format!( + "/table_dir/1_0000000001/index/{}.{}.puffin", + file_id, version + ) + ); + assert_eq!( + target_path, + format!( + "/table_dir/1_0000000002/index/{}.{}.puffin", + file_id, version + ) + ); + } +} diff --git a/src/mito2/src/remap_manifest.rs b/src/mito2/src/remap_manifest.rs index cafb62f191..59920ad945 100644 --- a/src/mito2/src/remap_manifest.rs +++ b/src/mito2/src/remap_manifest.rs @@ -425,9 +425,11 @@ mod tests { time_range: FileTimeRange::default(), level: 0, file_size: 1024, + max_row_group_uncompressed_size: 1024, available_indexes: SmallVec::new(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 100, num_row_groups: 1, sequence: NonZeroU64::new(1), diff --git a/src/mito2/src/request.rs b/src/mito2/src/request.rs index 65a1fff9ef..4f4aaeb4bc 100644 --- a/src/mito2/src/request.rs +++ b/src/mito2/src/request.rs @@ -20,12 +20,12 @@ use std::time::Instant; use api::helper::{ ColumnDataTypeWrapper, is_column_type_value_eq, is_semantic_type_eq, proto_value_type, - to_proto_value, }; use api::v1::column_def::options_from_column_schema; use api::v1::{ColumnDataType, ColumnSchema, OpType, Rows, SemanticType, Value, WriteHint}; use common_telemetry::info; use datatypes::prelude::DataType; +use partition::expr::PartitionExpr; use prometheus::HistogramTimer; use prost::Message; use smallvec::SmallVec; @@ -33,20 +33,24 @@ use snafu::{OptionExt, ResultExt, ensure}; use store_api::ManifestVersion; use store_api::codec::{PrimaryKeyEncoding, infer_primary_key_encoding_from_hint}; use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef}; -use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState}; +use store_api::region_engine::{ + MitoCopyRegionFromResponse, SetRegionRoleStateResponse, 
SettableRegionRoleState, +}; use store_api::region_request::{ - AffectedRows, RegionAlterRequest, RegionBuildIndexRequest, RegionBulkInsertsRequest, - RegionCatchupRequest, RegionCloseRequest, RegionCompactRequest, RegionCreateRequest, - RegionFlushRequest, RegionOpenRequest, RegionRequest, RegionTruncateRequest, + AffectedRows, EnterStagingRequest, RegionAlterRequest, RegionBuildIndexRequest, + RegionBulkInsertsRequest, RegionCatchupRequest, RegionCloseRequest, RegionCompactRequest, + RegionCreateRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest, + RegionTruncateRequest, }; use store_api::storage::{FileId, RegionId}; use tokio::sync::oneshot::{self, Receiver, Sender}; use crate::error::{ CompactRegionSnafu, ConvertColumnDataTypeSnafu, CreateDefaultSnafu, Error, FillDefaultSnafu, - FlushRegionSnafu, InvalidRequestSnafu, Result, UnexpectedSnafu, + FlushRegionSnafu, InvalidPartitionExprSnafu, InvalidRequestSnafu, MissingPartitionExprSnafu, + Result, UnexpectedSnafu, }; -use crate::manifest::action::{RegionEdit, TruncateKind}; +use crate::manifest::action::{RegionEdit, RegionManifest, TruncateKind}; use crate::memtable::MemtableId; use crate::memtable::bulk::part::BulkPart; use crate::metrics::COMPACTION_ELAPSED_TOTAL; @@ -412,7 +416,7 @@ impl WriteRequest { }; // Convert default value into proto's value. - Ok(to_proto_value(default_value)) + Ok(api::helper::to_grpc_value(default_value)) } } @@ -600,6 +604,12 @@ pub(crate) enum WorkerRequest { request: RegionBulkInsertsRequest, sender: OptionOutputTx, }, + + /// Remap manifests request. + RemapManifests(RemapManifestsRequest), + + /// Copy region from request. + CopyRegionFrom(CopyRegionFromRequest), } impl WorkerRequest { @@ -721,6 +731,11 @@ impl WorkerRequest { sender: sender.into(), request: DdlRequest::Catchup((v, None)), }), + RegionRequest::EnterStaging(v) => WorkerRequest::Ddl(SenderDdlRequest { + region_id, + sender: sender.into(), + request: DdlRequest::EnterStaging(v), + }), RegionRequest::BulkInserts(region_bulk_inserts_request) => WorkerRequest::BulkInserts { metadata: region_metadata, sender: sender.into(), @@ -761,6 +776,66 @@ impl WorkerRequest { receiver, ) } + + /// Converts [RemapManifestsRequest] from a [RemapManifestsRequest](store_api::region_engine::RemapManifestsRequest). + /// + /// # Errors + /// + /// Returns an error if the partition expression is invalid or missing. + /// Returns an error if the new partition expressions are not found for some regions. + #[allow(clippy::type_complexity)] + pub(crate) fn try_from_remap_manifests_request( + store_api::region_engine::RemapManifestsRequest { + region_id, + input_regions, + region_mapping, + new_partition_exprs, + }: store_api::region_engine::RemapManifestsRequest, + ) -> Result<( + WorkerRequest, + Receiver>>, + )> { + let (sender, receiver) = oneshot::channel(); + let new_partition_exprs = new_partition_exprs + .into_iter() + .map(|(k, v)| { + Ok(( + k, + PartitionExpr::from_json_str(&v) + .context(InvalidPartitionExprSnafu { expr: v })? + .context(MissingPartitionExprSnafu { region_id: k })?, + )) + }) + .collect::>>()?; + + let request = RemapManifestsRequest { + region_id, + input_regions, + region_mapping, + new_partition_exprs, + sender, + }; + + Ok((WorkerRequest::RemapManifests(request), receiver)) + } + + /// Converts [CopyRegionFromRequest] from a [CopyRegionFromRequest](store_api::region_engine::CopyRegionFromRequest). 
+ pub(crate) fn try_from_copy_region_from_request( + region_id: RegionId, + store_api::region_engine::CopyRegionFromRequest { + source_region_id, + parallelism, + }: store_api::region_engine::CopyRegionFromRequest, + ) -> Result<(WorkerRequest, Receiver>)> { + let (sender, receiver) = oneshot::channel(); + let request = CopyRegionFromRequest { + region_id, + source_region_id, + parallelism, + sender, + }; + Ok((WorkerRequest::CopyRegionFrom(request), receiver)) + } } /// DDL request to a region. @@ -776,6 +851,7 @@ pub(crate) enum DdlRequest { BuildIndex(RegionBuildIndexRequest), Truncate(RegionTruncateRequest), Catchup((RegionCatchupRequest, Option)), + EnterStaging(EnterStagingRequest), } /// Sender and Ddl request. @@ -812,6 +888,10 @@ pub(crate) enum BackgroundNotify { RegionChange(RegionChangeResult), /// Region edit result. RegionEdit(RegionEditResult), + /// Enter staging result. + EnterStaging(EnterStagingResult), + /// Copy region result. + CopyRegionFromFinished(CopyRegionFromFinished), } /// Notifies a flush job is finished. @@ -829,6 +909,8 @@ pub(crate) struct FlushFinished { pub(crate) edit: RegionEdit, /// Memtables to remove. pub(crate) memtables_to_remove: SmallVec<[MemtableId; 2]>, + /// Whether the region is in staging mode. + pub(crate) is_staging: bool, } impl FlushFinished { @@ -953,6 +1035,29 @@ pub(crate) struct RegionChangeResult { pub(crate) new_options: Option, } +/// Notifies the region the result of entering staging. +#[derive(Debug)] +pub(crate) struct EnterStagingResult { + /// Region id. + pub(crate) region_id: RegionId, + /// The new partition expression to apply. + pub(crate) partition_expr: String, + /// Result sender. + pub(crate) sender: OptionOutputTx, + /// Result from the manifest manager. + pub(crate) result: Result<()>, +} + +#[derive(Debug)] +pub(crate) struct CopyRegionFromFinished { + /// Region id. + pub(crate) region_id: RegionId, + /// Region edit to apply. + pub(crate) edit: RegionEdit, + /// Result sender. + pub(crate) sender: Sender>, +} + /// Request to edit a region directly. #[derive(Debug)] pub(crate) struct RegionEditRequest { @@ -993,6 +1098,32 @@ pub(crate) struct RegionSyncRequest { pub(crate) sender: Sender>, } +#[derive(Debug)] +pub(crate) struct RemapManifestsRequest { + /// The [`RegionId`] of a staging region used to obtain table directory and storage configuration for the remap operation. + pub(crate) region_id: RegionId, + /// Regions to remap manifests from. + pub(crate) input_regions: Vec, + /// For each old region, which new regions should receive its files + pub(crate) region_mapping: HashMap>, + /// New partition expressions for the new regions. + pub(crate) new_partition_exprs: HashMap, + /// Result sender. + pub(crate) sender: Sender>>, +} + +#[derive(Debug)] +pub(crate) struct CopyRegionFromRequest { + /// The [`RegionId`] of the target region. + pub(crate) region_id: RegionId, + /// The [`RegionId`] of the source region. + pub(crate) source_region_id: RegionId, + /// The parallelism of the copy operation. + pub(crate) parallelism: usize, + /// Result sender. + pub(crate) sender: Sender>, +} + #[cfg(test)] mod tests { use api::v1::value::ValueData; diff --git a/src/mito2/src/row_converter.rs b/src/mito2/src/row_converter.rs deleted file mode 100644 index 2bafc49ca3..0000000000 --- a/src/mito2/src/row_converter.rs +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod dense; -mod sparse; -use std::fmt::Debug; -use std::sync::Arc; - -use common_recordbatch::filter::SimpleFilterEvaluator; -use datatypes::value::{Value, ValueRef}; -pub use dense::{DensePrimaryKeyCodec, SortField}; -use mito_codec::key_values::KeyValue; -pub use sparse::{SparsePrimaryKeyCodec, SparseValues, COLUMN_ID_ENCODE_SIZE}; -use store_api::codec::PrimaryKeyEncoding; -use store_api::metadata::{RegionMetadata, RegionMetadataRef}; -use store_api::storage::ColumnId; - -use crate::error::Result; - -/// Row value encoder/decoder. -pub trait PrimaryKeyCodecExt { - /// Encodes rows to bytes. - /// # Note - /// Ensure the length of row iterator matches the length of fields. - fn encode<'a, I>(&self, row: I) -> Result> - where - I: Iterator>, - { - let mut buffer = Vec::new(); - self.encode_to_vec(row, &mut buffer)?; - Ok(buffer) - } - - /// Encodes rows to specific vec. - /// # Note - /// Ensure the length of row iterator matches the length of fields. - fn encode_to_vec<'a, I>(&self, row: I, buffer: &mut Vec) -> Result<()> - where - I: Iterator>; -} - -pub trait PrimaryKeyFilter: Send + Sync { - /// Returns true if the primary key matches the filter. - fn matches(&mut self, pk: &[u8]) -> bool; -} - -/// Composite values decoded from primary key bytes. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum CompositeValues { - Dense(Vec<(ColumnId, Value)>), - Sparse(SparseValues), -} - -impl CompositeValues { - /// Extends the composite values with the given values. - pub fn extend(&mut self, values: &[(ColumnId, Value)]) { - match self { - CompositeValues::Dense(dense_values) => { - for (column_id, value) in values { - dense_values.push((*column_id, value.clone())); - } - } - CompositeValues::Sparse(sprase_value) => { - for (column_id, value) in values { - sprase_value.insert(*column_id, value.clone()); - } - } - } - } -} - -#[cfg(test)] -impl CompositeValues { - pub fn into_sparse(self) -> SparseValues { - match self { - CompositeValues::Sparse(v) => v, - _ => panic!("CompositeValues is not sparse"), - } - } - - pub fn into_dense(self) -> Vec { - match self { - CompositeValues::Dense(v) => v.into_iter().map(|(_, v)| v).collect(), - _ => panic!("CompositeValues is not dense"), - } - } -} - -pub trait PrimaryKeyCodec: Send + Sync + Debug { - /// Encodes a key value to bytes. - fn encode_key_value(&self, key_value: &KeyValue, buffer: &mut Vec) -> Result<()>; - - /// Encodes values to bytes. - fn encode_values(&self, values: &[(ColumnId, Value)], buffer: &mut Vec) -> Result<()>; - - /// Encodes values to bytes. - fn encode_value_refs( - &self, - values: &[(ColumnId, ValueRef)], - buffer: &mut Vec, - ) -> Result<()>; - - /// Returns the number of fields in the primary key. - fn num_fields(&self) -> Option; - - /// Returns a primary key filter factory. - fn primary_key_filter( - &self, - metadata: &RegionMetadataRef, - filters: Arc>, - ) -> Box; - - /// Returns the estimated size of the primary key. - fn estimated_size(&self) -> Option { - None - } - - /// Returns the encoding type of the primary key. 
- fn encoding(&self) -> PrimaryKeyEncoding; - - /// Decodes the primary key from the given bytes. - /// - /// Returns a [`CompositeValues`] that follows the primary key ordering. - fn decode(&self, bytes: &[u8]) -> Result; - - /// Decode the leftmost value from bytes. - fn decode_leftmost(&self, bytes: &[u8]) -> Result>; -} - -/// Builds a primary key codec from region metadata. -pub fn build_primary_key_codec(region_metadata: &RegionMetadata) -> Arc { - let fields = region_metadata.primary_key_columns().map(|col| { - ( - col.column_id, - SortField::new(col.column_schema.data_type.clone()), - ) - }); - build_primary_key_codec_with_fields(region_metadata.primary_key_encoding, fields) -} - -/// Builds a primary key codec from region metadata. -pub fn build_primary_key_codec_with_fields( - encoding: PrimaryKeyEncoding, - fields: impl Iterator, -) -> Arc { - match encoding { - PrimaryKeyEncoding::Dense => Arc::new(DensePrimaryKeyCodec::with_fields(fields.collect())), - PrimaryKeyEncoding::Sparse => { - Arc::new(SparsePrimaryKeyCodec::with_fields(fields.collect())) - } - } -} diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index f3f51bdc08..78e4c563b1 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -121,7 +121,7 @@ impl FlatSchemaOptions { /// /// The schema is: /// ```text -/// primary key columns, field columns, time index, __prmary_key, __sequence, __op_type +/// primary key columns, field columns, time index, __primary_key, __sequence, __op_type /// ``` /// /// # Panics diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index 70c4f5a016..94209d7b0c 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -21,13 +21,14 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use common_base::readable_size::ReadableSize; -use common_telemetry::{error, info}; +use common_telemetry::{debug, error}; use common_time::Timestamp; use partition::expr::PartitionExpr; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; +use store_api::metadata::ColumnMetadata; use store_api::region_request::PathType; -use store_api::storage::{FileId, RegionId}; +use store_api::storage::{ColumnId, FileId, IndexVersion, RegionId}; use crate::access_layer::AccessLayerRef; use crate::cache::CacheManagerRef; @@ -79,6 +80,8 @@ where pub type Level = u8; /// Maximum level of SSTs. pub const MAX_LEVEL: Level = 2; +/// Type to store index types for a column. +pub type IndexTypes = SmallVec<[IndexType; 4]>; /// Cross-region file id. /// @@ -114,6 +117,41 @@ impl fmt::Display for RegionFileId { } } +/// Unique identifier for an index file, combining the SST file ID and the index version. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct RegionIndexId { + pub file_id: RegionFileId, + pub version: IndexVersion, +} + +impl RegionIndexId { + pub fn new(file_id: RegionFileId, version: IndexVersion) -> Self { + Self { file_id, version } + } + + pub fn region_id(&self) -> RegionId { + self.file_id.region_id + } + + pub fn file_id(&self) -> FileId { + self.file_id.file_id + } +} + +impl fmt::Display for RegionIndexId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.version == 0 { + write!(f, "{}/{}", self.file_id.region_id, self.file_id.file_id) + } else { + write!( + f, + "{}/{}.{}", + self.file_id.region_id, self.file_id.file_id, self.version + ) + } + } +} + /// Time range (min and max timestamps) of a SST file. /// Both min and max are inclusive. 
pub type FileTimeRange = (Timestamp, Timestamp); @@ -142,16 +180,26 @@ pub struct FileMeta { pub level: Level, /// Size of the file. pub file_size: u64, + /// Maximum uncompressed row group size of the file. 0 means unknown. + pub max_row_group_uncompressed_size: u64, /// Available indexes of the file. - pub available_indexes: SmallVec<[IndexType; 4]>, + pub available_indexes: IndexTypes, + /// Created indexes of the file for each column. + /// + /// This is essentially a more granular, column-level version of `available_indexes`, + /// primarily used for manual index building in the asynchronous index construction mode. + /// + /// For backward compatibility, older `FileMeta` versions might only contain `available_indexes`. + /// In such cases, we cannot deduce specific column index information from `available_indexes` alone. + /// Therefore, defaulting this `indexes` field to an empty list during deserialization is a + /// reasonable and necessary step to ensure column information consistency. + pub indexes: Vec, /// Size of the index file. pub index_file_size: u64, - /// File ID of the index file. - /// - /// When this field is None, it means the index file id is the same as the file id. - /// Only meaningful when index_file_size > 0. - /// Used for rebuilding index files. - pub index_file_id: Option, + /// Version of the index file. + /// Used to generate the index file name: "{file_id}.{index_version}.puffin". + /// Default is 0 (which maps to "{file_id}.puffin" for compatibility). + pub index_version: u64, /// Number of rows in the file. /// /// For historical reasons, this field might be missing in old files. Thus @@ -202,10 +250,15 @@ impl Debug for FileMeta { ) }) .field("level", &self.level) - .field("file_size", &ReadableSize(self.file_size)); + .field("file_size", &ReadableSize(self.file_size)) + .field( + "max_row_group_uncompressed_size", + &ReadableSize(self.max_row_group_uncompressed_size), + ); if !self.available_indexes.is_empty() { debug_struct .field("available_indexes", &self.available_indexes) + .field("indexes", &self.indexes) .field("index_file_size", &ReadableSize(self.index_file_size)); } debug_struct @@ -236,11 +289,34 @@ pub enum IndexType { BloomFilterIndex, } +/// Metadata of indexes created for a specific column in an SST file. +/// +/// This structure tracks which index types have been successfully created for a column. +/// It provides more granular, column-level index information compared to the file-level +/// `available_indexes` field in [`FileMeta`]. +/// +/// This is primarily used for: +/// - Manual index building in asynchronous index construction mode +/// - Verifying index consistency between files and region metadata +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +#[serde(default)] +pub struct ColumnIndexMetadata { + /// The column ID this index metadata applies to. + pub column_id: ColumnId, + /// List of index types that have been successfully created for this column. + pub created_indexes: IndexTypes, +} + impl FileMeta { pub fn exists_index(&self) -> bool { !self.available_indexes.is_empty() } + /// Whether the index file is up-to-date comparing to another file meta. 
+ pub fn is_index_up_to_date(&self, other: &FileMeta) -> bool { + self.exists_index() && other.exists_index() && self.index_version >= other.index_version + } + /// Returns true if the file has an inverted index pub fn inverted_index_available(&self) -> bool { self.available_indexes.contains(&IndexType::InvertedIndex) @@ -261,19 +337,48 @@ impl FileMeta { self.index_file_size } + /// Check whether the file index is consistent with the given region metadata. + pub fn is_index_consistent_with_region(&self, metadata: &[ColumnMetadata]) -> bool { + let id_to_indexes = self + .indexes + .iter() + .map(|index| (index.column_id, index.created_indexes.clone())) + .collect::>(); + for column in metadata { + if !column.column_schema.is_indexed() { + continue; + } + if let Some(indexes) = id_to_indexes.get(&column.column_id) { + if column.column_schema.is_inverted_indexed() + && !indexes.contains(&IndexType::InvertedIndex) + { + return false; + } + if column.column_schema.is_fulltext_indexed() + && !indexes.contains(&IndexType::FulltextIndex) + { + return false; + } + if column.column_schema.is_skipping_indexed() + && !indexes.contains(&IndexType::BloomFilterIndex) + { + return false; + } + } else { + return false; + } + } + true + } + /// Returns the cross-region file id. pub fn file_id(&self) -> RegionFileId { RegionFileId::new(self.region_id, self.file_id) } - /// Returns the cross-region index file id. - /// If the index file id is not set, returns the file id. - pub fn index_file_id(&self) -> RegionFileId { - if let Some(index_file_id) = self.index_file_id { - RegionFileId::new(self.region_id, index_file_id) - } else { - self.file_id() - } + /// Returns the RegionIndexId for this file. + pub fn index_id(&self) -> RegionIndexId { + RegionIndexId::new(self.file_id(), self.index_version) } } @@ -310,14 +415,9 @@ impl FileHandle { RegionFileId::new(self.inner.meta.region_id, self.inner.meta.file_id) } - /// Returns the cross-region index file id. - /// If the index file id is not set, returns the file id. - pub fn index_file_id(&self) -> RegionFileId { - if let Some(index_file_id) = self.inner.meta.index_file_id { - RegionFileId::new(self.inner.meta.region_id, index_file_id) - } else { - self.file_id() - } + /// Returns the RegionIndexId for this file. + pub fn index_id(&self) -> RegionIndexId { + RegionIndexId::new(self.file_id(), self.inner.meta.index_version) } /// Returns the complete file path of the file. @@ -343,6 +443,16 @@ impl FileHandle { self.inner.compacting.store(compacting, Ordering::Relaxed); } + pub fn index_outdated(&self) -> bool { + self.inner.index_outdated.load(Ordering::Relaxed) + } + + pub fn set_index_outdated(&self, index_outdated: bool) { + self.inner + .index_outdated + .store(index_outdated, Ordering::Relaxed); + } + /// Returns a reference to the [FileMeta]. 
pub fn meta_ref(&self) -> &FileMeta { &self.inner.meta @@ -380,32 +490,43 @@ struct FileHandleInner { meta: FileMeta, compacting: AtomicBool, deleted: AtomicBool, + index_outdated: AtomicBool, file_purger: FilePurgerRef, } impl Drop for FileHandleInner { fn drop(&mut self) { - self.file_purger - .remove_file(self.meta.clone(), self.deleted.load(Ordering::Relaxed)); + self.file_purger.remove_file( + self.meta.clone(), + self.deleted.load(Ordering::Acquire), + self.index_outdated.load(Ordering::Acquire), + ); } } impl FileHandleInner { + /// There should only be one `FileHandleInner` for each file on a datanode fn new(meta: FileMeta, file_purger: FilePurgerRef) -> FileHandleInner { file_purger.new_file(&meta); FileHandleInner { meta, compacting: AtomicBool::new(false), deleted: AtomicBool::new(false), + index_outdated: AtomicBool::new(false), file_purger, } } } -/// Delete +/// Delete files for a region. +/// - `region_id`: Region id. +/// - `file_ids`: List of (file id, index version) tuples to delete. +/// - `delete_index`: Whether to delete the index file from the cache. +/// - `access_layer`: Access layer to delete files. +/// - `cache_manager`: Cache manager to remove files from cache. pub async fn delete_files( region_id: RegionId, - file_ids: &[(FileId, FileId)], + file_ids: &[(FileId, u64)], delete_index: bool, access_layer: &AccessLayerRef, cache_manager: &Option, @@ -418,12 +539,12 @@ pub async fn delete_files( } let mut deleted_files = Vec::with_capacity(file_ids.len()); - for (file_id, index_file_id) in file_ids { + for (file_id, index_version) in file_ids { let region_file_id = RegionFileId::new(region_id, *file_id); match access_layer .delete_sst( - &RegionFileId::new(region_id, *file_id), - &RegionFileId::new(region_id, *index_file_id), + ®ion_file_id, + &RegionIndexId::new(region_file_id, *index_version), ) .await { @@ -436,45 +557,95 @@ pub async fn delete_files( } } - info!( + debug!( "Deleted {} files for region {}: {:?}", deleted_files.len(), region_id, deleted_files ); - for (file_id, index_file_id) in file_ids { - if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) { - // Removes index file from the cache. - if delete_index { - write_cache - .remove(IndexKey::new(region_id, *index_file_id, FileType::Puffin)) - .await; - } + for (file_id, index_version) in file_ids { + purge_index_cache_stager( + region_id, + delete_index, + access_layer, + cache_manager, + *file_id, + *index_version, + ) + .await; + } + Ok(()) +} - // Remove the SST file from the cache. +pub async fn delete_index( + region_index_id: RegionIndexId, + access_layer: &AccessLayerRef, + cache_manager: &Option, +) -> crate::error::Result<()> { + access_layer.delete_index(region_index_id).await?; + + purge_index_cache_stager( + region_index_id.region_id(), + true, + access_layer, + cache_manager, + region_index_id.file_id(), + region_index_id.version, + ) + .await; + + Ok(()) +} + +async fn purge_index_cache_stager( + region_id: RegionId, + delete_index: bool, + access_layer: &AccessLayerRef, + cache_manager: &Option, + file_id: FileId, + index_version: u64, +) { + if let Some(write_cache) = cache_manager.as_ref().and_then(|cache| cache.write_cache()) { + // Removes index file from the cache. + if delete_index { write_cache - .remove(IndexKey::new(region_id, *file_id, FileType::Parquet)) + .remove(IndexKey::new( + region_id, + file_id, + FileType::Puffin(index_version), + )) .await; } - // Purges index content in the stager. 
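Illustrative sketch (not part of the diff): replacing the separate index_file_id with an index_version changes how index file names are derived and how rebuilds avoid overwriting old index files. The sketch below restates the naming rule documented on FileMeta/RegionIndexId above ("{file_id}.{index_version}.puffin", with version 0 mapping to the legacy "{file_id}.puffin") and the version-bump rule used by IndexBuildTask later in this diff; index_file_name and next_index_version are hypothetical helpers, the real logic lives in location::index_file_path and IndexBuildTask.

// Hypothetical helpers mirroring the documented naming and versioning rules.
fn index_file_name(file_id: &str, index_version: u64) -> String {
    if index_version == 0 {
        // Version 0 keeps the legacy "{file_id}.puffin" name for compatibility.
        format!("{file_id}.puffin")
    } else {
        format!("{file_id}.{index_version}.puffin")
    }
}

// When rebuilding an index, bump the version if an index file already exists
// so the new file never overwrites the old one.
fn next_index_version(current_version: u64, index_file_size: u64) -> u64 {
    if index_file_size > 0 {
        current_version + 1
    } else {
        0
    }
}

fn main() {
    assert_eq!(index_file_name("bc5896ec", 0), "bc5896ec.puffin");
    assert_eq!(index_file_name("bc5896ec", 3), "bc5896ec.3.puffin");
    assert_eq!(next_index_version(0, 4096), 1);
    assert_eq!(next_index_version(0, 0), 0);
}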
- if let Err(e) = access_layer - .puffin_manager_factory() - .purge_stager(RegionFileId::new(region_id, *index_file_id)) - .await - { - error!(e; "Failed to purge stager with index file, file_id: {}, region: {}", - index_file_id, region_id); - } + // Remove the SST file from the cache. + write_cache + .remove(IndexKey::new(region_id, file_id, FileType::Parquet)) + .await; + } + + // Purges index content in the stager. + if let Err(e) = access_layer + .puffin_manager_factory() + .purge_stager(RegionIndexId::new( + RegionFileId::new(region_id, file_id), + index_version, + )) + .await + { + error!(e; "Failed to purge stager with index file, file_id: {}, index_version: {}, region: {}", + file_id, index_version, region_id); } - Ok(()) } #[cfg(test)] mod tests { use std::str::FromStr; + use datatypes::prelude::ConcreteDataType; + use datatypes::schema::{ + ColumnSchema, FulltextAnalyzer, FulltextBackend, FulltextOptions, SkippingIndexOptions, + }; use datatypes::value::Value; use partition::expr::{PartitionExpr, col}; @@ -487,9 +658,14 @@ mod tests { time_range: FileTimeRange::default(), level, file_size: 0, + max_row_group_uncompressed_size: 0, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, sequence: None, @@ -510,7 +686,7 @@ mod tests { fn test_deserialize_from_string() { let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55\",\ \"time_range\":[{\"value\":0,\"unit\":\"Millisecond\"},{\"value\":0,\"unit\":\"Millisecond\"}],\ - \"available_indexes\":[\"InvertedIndex\"],\"level\":0}"; + \"available_indexes\":[\"InvertedIndex\"],\"indexes\":[{\"column_id\": 0, \"created_indexes\": [\"InvertedIndex\"]}],\"level\":0}"; let file_meta = create_file_meta( FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(), 0, @@ -534,9 +710,14 @@ mod tests { time_range: FileTimeRange::default(), level: 0, file_size: 0, + max_row_group_uncompressed_size: 0, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, sequence: None, @@ -653,4 +834,147 @@ mod tests { let file_meta_empty: FileMeta = serde_json::from_str(json_with_empty_expr).unwrap(); assert!(file_meta_empty.partition_expr.is_none()); } + + #[test] + fn test_file_meta_indexes_backward_compatibility() { + // Old FileMeta format without the 'indexes' field + let json_old_file_meta = r#"{ + "region_id": 0, + "file_id": "bc5896ec-e4d8-4017-a80d-f2de73188d55", + "time_range": [ + {"value": 0, "unit": "Millisecond"}, + {"value": 0, "unit": "Millisecond"} + ], + "available_indexes": ["InvertedIndex"], + "level": 0, + "file_size": 0, + "index_file_size": 0, + "num_rows": 0, + "num_row_groups": 0 + }"#; + + let deserialized_file_meta: FileMeta = serde_json::from_str(json_old_file_meta).unwrap(); + + // Verify backward compatibility: indexes field should default to empty vec + assert_eq!(deserialized_file_meta.indexes, vec![]); + + let expected_indexes: IndexTypes = SmallVec::from_iter([IndexType::InvertedIndex]); + assert_eq!(deserialized_file_meta.available_indexes, expected_indexes); + + assert_eq!( + deserialized_file_meta.file_id, + 
FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap() + ); + } + #[test] + fn test_is_index_consistent_with_region() { + fn new_column_meta( + id: ColumnId, + name: &str, + inverted: bool, + fulltext: bool, + skipping: bool, + ) -> ColumnMetadata { + let mut column_schema = + ColumnSchema::new(name, ConcreteDataType::string_datatype(), true); + if inverted { + column_schema = column_schema.with_inverted_index(true); + } + if fulltext { + column_schema = column_schema + .with_fulltext_options(FulltextOptions::new_unchecked( + true, + FulltextAnalyzer::English, + false, + FulltextBackend::Bloom, + 1000, + 0.01, + )) + .unwrap(); + } + if skipping { + column_schema = column_schema + .with_skipping_options(SkippingIndexOptions::new_unchecked( + 1024, + 0.01, + datatypes::schema::SkippingIndexType::BloomFilter, + )) + .unwrap(); + } + + ColumnMetadata { + column_schema, + semantic_type: api::v1::SemanticType::Tag, + column_id: id, + } + } + + // Case 1: Perfect match. File has exactly the required indexes. + let mut file_meta = FileMeta { + indexes: vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], + ..Default::default() + }; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; + assert!(file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 2: Superset match. File has more indexes than required. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([ + IndexType::InvertedIndex, + IndexType::BloomFilterIndex, + ]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; + assert!(file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 3: Missing index type. File has the column but lacks the required index type. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", true, true, false)]; // Requires fulltext too + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 4: Missing column. Region requires an index on a column not in the file's index list. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 2, // File only has index for column 2 + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; // Requires index on column 1 + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 5: No indexes required by region. Should always be consistent. + file_meta.indexes = vec![ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }]; + let region_meta = vec![new_column_meta(1, "tag1", false, false, false)]; // No index required + assert!(file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 6: Empty file indexes. Region requires an index. + file_meta.indexes = vec![]; + let region_meta = vec![new_column_meta(1, "tag1", true, false, false)]; + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + + // Case 7: Multiple columns, one is inconsistent. 
+ file_meta.indexes = vec![ + ColumnIndexMetadata { + column_id: 1, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }, + ColumnIndexMetadata { + column_id: 2, // Column 2 is missing the required BloomFilterIndex + created_indexes: SmallVec::from_iter([IndexType::FulltextIndex]), + }, + ]; + let region_meta = vec![ + new_column_meta(1, "tag1", true, false, false), + new_column_meta(2, "tag2", false, true, true), // Requires Fulltext and BloomFilter + ]; + assert!(!file_meta.is_index_consistent_with_region(®ion_meta)); + } } diff --git a/src/mito2/src/sst/file_purger.rs b/src/mito2/src/sst/file_purger.rs index 64e83c1a54..5b23f3e069 100644 --- a/src/mito2/src/sst/file_purger.rs +++ b/src/mito2/src/sst/file_purger.rs @@ -21,7 +21,7 @@ use crate::access_layer::AccessLayerRef; use crate::cache::CacheManagerRef; use crate::error::Result; use crate::schedule::scheduler::SchedulerRef; -use crate::sst::file::{FileMeta, delete_files}; +use crate::sst::file::{FileMeta, delete_files, delete_index}; use crate::sst::file_ref::FileReferenceManagerRef; /// A worker to delete files in background. @@ -29,7 +29,8 @@ pub trait FilePurger: Send + Sync + fmt::Debug { /// Send a request to remove the file. /// If `is_delete` is true, the file will be deleted from the storage. /// Otherwise, only the reference will be removed. - fn remove_file(&self, file_meta: FileMeta, is_delete: bool); + /// If `index_outdated` is true, the index file will be deleted regardless of `is_delete`. + fn remove_file(&self, file_meta: FileMeta, is_delete: bool, index_outdated: bool); /// Notify the purger of a new file created. /// This is useful for object store based storage, where we need to track the file references @@ -46,7 +47,7 @@ pub type FilePurgerRef = Arc; pub struct NoopFilePurger; impl FilePurger for NoopFilePurger { - fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool) { + fn remove_file(&self, _file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) { // noop } } @@ -80,15 +81,16 @@ pub fn is_local_fs(sst_layer: &AccessLayerRef) -> bool { /// only manages the file references without deleting the actual files. 
/// pub fn create_file_purger( + gc_enabled: bool, scheduler: SchedulerRef, sst_layer: AccessLayerRef, cache_manager: Option, file_ref_manager: FileReferenceManagerRef, ) -> FilePurgerRef { - if is_local_fs(&sst_layer) { - Arc::new(LocalFilePurger::new(scheduler, sst_layer, cache_manager)) - } else { + if gc_enabled && !is_local_fs(&sst_layer) { Arc::new(ObjectStoreFilePurger { file_ref_manager }) + } else { + Arc::new(LocalFilePurger::new(scheduler, sst_layer, cache_manager)) } } @@ -128,7 +130,7 @@ impl LocalFilePurger { if let Err(e) = self.scheduler.schedule(Box::pin(async move { if let Err(e) = delete_files( file_meta.region_id, - &[(file_meta.file_id, file_meta.index_file_id().file_id())], + &[(file_meta.file_id, file_meta.index_id().version)], file_meta.exists_index(), &sst_layer, &cache_manager, @@ -141,12 +143,27 @@ impl LocalFilePurger { error!(e; "Failed to schedule the file purge request"); } } + + fn delete_index(&self, file_meta: FileMeta) { + let sst_layer = self.sst_layer.clone(); + let cache_manager = self.cache_manager.clone(); + if let Err(e) = self.scheduler.schedule(Box::pin(async move { + let index_id = file_meta.index_id(); + if let Err(e) = delete_index(index_id, &sst_layer, &cache_manager).await { + error!(e; "Failed to delete index for file {:?} from storage", file_meta); + } + })) { + error!(e; "Failed to schedule the index purge request"); + } + } } impl FilePurger for LocalFilePurger { - fn remove_file(&self, file_meta: FileMeta, is_delete: bool) { + fn remove_file(&self, file_meta: FileMeta, is_delete: bool, index_outdated: bool) { if is_delete { self.delete_file(file_meta); + } else if index_outdated { + self.delete_index(file_meta); } } } @@ -157,7 +174,7 @@ pub struct ObjectStoreFilePurger { } impl FilePurger for ObjectStoreFilePurger { - fn remove_file(&self, file_meta: FileMeta, _is_delete: bool) { + fn remove_file(&self, file_meta: FileMeta, _is_delete: bool, _index_outdated: bool) { // if not on local file system, instead inform the global file purger to remove the file reference. // notice that no matter whether the file is deleted or not, we need to remove the reference // because the file is no longer in use nonetheless. 
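Illustrative sketch (not part of the diff): a compact reading of the purger changes above. create_file_purger now selects a purger from the new gc_enabled flag plus the storage backend, and LocalFilePurger::remove_file distinguishes full deletion from deleting only an outdated index when a FileHandle is dropped. The standalone mirror below uses hypothetical stand-ins (PurgerKind, choose_purger, local_remove_action) and only restates the decision logic shown in the hunks above.

#[derive(Debug, PartialEq)]
enum PurgerKind {
    // Deletes SST and index files directly (local FS, or when GC is disabled).
    Local,
    // Only drops file references; a separate GC process reclaims objects later.
    ObjectStore,
}

fn choose_purger(gc_enabled: bool, is_local_fs: bool) -> PurgerKind {
    if gc_enabled && !is_local_fs {
        PurgerKind::ObjectStore
    } else {
        PurgerKind::Local
    }
}

// What the local purger does when a FileHandle is dropped.
fn local_remove_action(is_delete: bool, index_outdated: bool) -> &'static str {
    if is_delete {
        "delete SST and index files"
    } else if index_outdated {
        "delete only the stale index file"
    } else {
        "keep both files"
    }
}

fn main() {
    assert_eq!(choose_purger(true, false), PurgerKind::ObjectStore);
    assert_eq!(choose_purger(false, false), PurgerKind::Local);
    assert_eq!(choose_purger(true, true), PurgerKind::Local);
    assert_eq!(local_remove_action(false, true), "delete only the stale index file");
}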
@@ -184,7 +201,10 @@ mod tests { use super::*; use crate::access_layer::AccessLayer; use crate::schedule::scheduler::{LocalScheduler, Scheduler}; - use crate::sst::file::{FileHandle, FileMeta, FileTimeRange, IndexType, RegionFileId}; + use crate::sst::file::{ + ColumnIndexMetadata, FileHandle, FileMeta, FileTimeRange, IndexType, RegionFileId, + RegionIndexId, + }; use crate::sst::index::intermediate::IntermediateManager; use crate::sst::index::puffin_manager::PuffinManagerFactory; use crate::sst::location; @@ -231,9 +251,11 @@ mod tests { time_range: FileTimeRange::default(), level: 0, file_size: 4096, + max_row_group_uncompressed_size: 4096, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, sequence: None, @@ -259,6 +281,7 @@ mod tests { let dir_path = dir.path().display().to_string(); let builder = Fs::default().root(&dir_path); let sst_file_id = RegionFileId::new(RegionId::new(0, 0), FileId::random()); + let index_file_id = RegionIndexId::new(sst_file_id, 0); let sst_dir = "table1"; let index_aux_path = dir.path().join("index_aux"); @@ -281,7 +304,7 @@ mod tests { let path = location::sst_file_path(sst_dir, sst_file_id, layer.path_type()); object_store.write(&path, vec![0; 4096]).await.unwrap(); - let index_path = location::index_file_path(sst_dir, sst_file_id, layer.path_type()); + let index_path = location::index_file_path(sst_dir, index_file_id, layer.path_type()); object_store .write(&index_path, vec![0; 4096]) .await @@ -299,9 +322,14 @@ mod tests { time_range: FileTimeRange::default(), level: 0, file_size: 4096, + max_row_group_uncompressed_size: 4096, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 4096, - index_file_id: None, + index_version: 0, num_rows: 1024, num_row_groups: 1, sequence: NonZeroU64::new(4096), diff --git a/src/mito2/src/sst/file_ref.rs b/src/mito2/src/sst/file_ref.rs index 8f750ebf2a..9e69bd42cf 100644 --- a/src/mito2/src/sst/file_ref.rs +++ b/src/mito2/src/sst/file_ref.rs @@ -80,56 +80,47 @@ impl FileReferenceManager { Some(ref_file_set) } - /// Gets all ref files for the given table id, excluding those already in region manifest. - /// - /// It's safe if manifest version became outdated when gc worker is called, as gc worker will check the changes between those two versions and act accordingly to make sure to get the real truly tmp ref file sets at the time of old manifest version. - /// - /// TODO(discord9): Since query will only possible refer to files in latest manifest when it's started, the only true risks is files removed from manifest between old version(when reading refs) and new version(at gc worker), so in case of having outdated manifest version, gc worker should make sure not to delete those files(Until next gc round which will use the latest manifest version and handle those files normally). - /// or perhaps using a two-phase commit style process where it proposes a set of files for deletion and then verifies no new references have appeared before committing the delete. - /// - /// gc worker could do this: - /// 1. 
if can get the files that got removed from old manifest to new manifest, then shouldn't delete those files even if they are not in tmp ref file, other files can be normally handled(deleted if not in use, otherwise keep) - /// and report back allow next gc round to handle those files with newer tmp ref file sets. - /// 2. if can't get the files that got removed from old manifest to new manifest(possible if just did a checkpoint), - /// then can do nothing as can't sure whether a file is truly unused or just tmp ref file sets haven't report it, so need to report back and try next gc round to handle those files with newer tmp ref file sets. - /// - #[allow(unused)] - pub(crate) async fn get_snapshot_of_unmanifested_refs( + /// Gets all ref files for the given regions, meaning all open FileHandles for those regions + /// and from related regions' manifests. + pub(crate) async fn get_snapshot_of_file_refs( &self, - regions: Vec, + query_regions: Vec, + related_regions: Vec<(MitoRegionRef, Vec)>, ) -> Result { let mut ref_files = HashMap::new(); - for region_id in regions.iter().map(|r| r.region_id()) { + // get from in memory file handles + for region_id in query_regions.iter().map(|r| r.region_id()) { if let Some(files) = self.ref_file_set(region_id) { - ref_files.insert(region_id, files); + ref_files.insert(region_id, files.into_iter().map(|f| f.file_id).collect()); } } - let mut in_manifest_files = HashSet::new(); let mut manifest_version = HashMap::new(); - for r in ®ions { + for r in &query_regions { let manifest = r.manifest_ctx.manifest().await; - let files = manifest.files.keys().cloned().collect::>(); - in_manifest_files.extend(files); manifest_version.insert(r.region_id(), manifest.manifest_version); } - let ref_files_excluding_in_manifest = ref_files - .iter() - .map(|(r, f)| { - ( - *r, - f.iter() - .filter_map(|f| { - (!in_manifest_files.contains(&f.file_id)).then_some(f.file_id) - }) - .collect::>(), - ) - }) - .collect(); + // get file refs from related regions' manifests + for (related_region, queries) in &related_regions { + let queries = queries.iter().cloned().collect::>(); + let manifest = related_region.manifest_ctx.manifest().await; + for meta in manifest.files.values() { + if queries.contains(&meta.region_id) { + ref_files + .entry(meta.region_id) + .or_insert_with(HashSet::new) + .insert(meta.file_id); + } + } + // not sure if related region's manifest version is needed, but record it for now. + manifest_version.insert(related_region.region_id(), manifest.manifest_version); + } + + // simply return all ref files, no manifest version filtering for now. Ok(FileRefsManifest { - file_refs: ref_files_excluding_in_manifest, + file_refs: ref_files, manifest_version, }) } @@ -166,7 +157,7 @@ impl FileReferenceManager { /// If the reference count reaches zero, the file reference will be removed from the manager. 
pub fn remove_file(&self, file_meta: &FileMeta) { let region_id = file_meta.region_id; - let file_ref = FileRef::new(file_meta.region_id, file_meta.file_id); + let file_ref = FileRef::new(region_id, file_meta.file_id); let mut remove_table_entry = false; let mut remove_file_ref = false; @@ -217,7 +208,7 @@ mod tests { use store_api::storage::{FileId, RegionId}; use super::*; - use crate::sst::file::{FileMeta, FileTimeRange, IndexType, RegionFileId}; + use crate::sst::file::{ColumnIndexMetadata, FileMeta, FileTimeRange, IndexType, RegionFileId}; #[tokio::test] async fn test_file_ref_mgr() { @@ -233,9 +224,14 @@ mod tests { time_range: FileTimeRange::default(), level: 0, file_size: 4096, + max_row_group_uncompressed_size: 4096, available_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + indexes: vec![ColumnIndexMetadata { + column_id: 0, + created_indexes: SmallVec::from_iter([IndexType::InvertedIndex]), + }], index_file_size: 4096, - index_file_id: None, + index_version: 0, num_rows: 1024, num_row_groups: 1, sequence: NonZeroU64::new(4096), diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index c51a3893e6..9b624062bf 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -32,6 +32,7 @@ use datatypes::arrow::array::BinaryArray; use datatypes::arrow::record_batch::RecordBatch; use mito_codec::index::IndexValuesCodec; use mito_codec::row_converter::CompositeValues; +use object_store::ObjectStore; use puffin_manager::SstPuffinManager; use smallvec::{SmallVec, smallvec}; use snafu::{OptionExt, ResultExt}; @@ -42,7 +43,7 @@ use strum::IntoStaticStr; use tokio::sync::mpsc::Sender; use crate::access_layer::{AccessLayerRef, FilePathProvider, OperationType, RegionFilePathFactory}; -use crate::cache::file_cache::{FileType, IndexKey}; +use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey}; use crate::cache::write_cache::{UploadTracker, WriteCacheRef}; use crate::config::{BloomFilterConfig, FulltextIndexConfig, InvertedIndexConfig}; use crate::error::{ @@ -60,7 +61,9 @@ use crate::request::{ WorkerRequestWithTime, }; use crate::schedule::scheduler::{Job, SchedulerRef}; -use crate::sst::file::{FileHandle, FileMeta, IndexType, RegionFileId}; +use crate::sst::file::{ + ColumnIndexMetadata, FileHandle, FileMeta, IndexType, IndexTypes, RegionFileId, RegionIndexId, +}; use crate::sst::file_purger::FilePurgerRef; use crate::sst::index::fulltext_index::creator::FulltextIndexer; use crate::sst::index::intermediate::IntermediateManager; @@ -74,11 +77,37 @@ pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index"; pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index"; pub(crate) const TYPE_BLOOM_FILTER_INDEX: &str = "bloom_filter_index"; +/// Triggers background download of an index file to the local cache. +pub(crate) fn trigger_index_background_download( + file_cache: Option<&FileCacheRef>, + file_id: &RegionIndexId, + file_size_hint: Option, + path_factory: &RegionFilePathFactory, + object_store: &ObjectStore, +) { + if let (Some(file_cache), Some(file_size)) = (file_cache, file_size_hint) { + let index_key = IndexKey::new( + file_id.region_id(), + file_id.file_id(), + FileType::Puffin(file_id.version), + ); + let remote_path = path_factory.build_index_file_path(file_id.file_id); + file_cache.maybe_download_background( + index_key, + remote_path, + object_store.clone(), + file_size, + ); + } +} + /// Output of the index creation. #[derive(Debug, Clone, Default)] pub struct IndexOutput { /// Size of the file. 
pub file_size: u64, + /// Index version. + pub version: u64, /// Inverted index output. pub inverted_index: InvertedIndexOutput, /// Fulltext index output. @@ -101,6 +130,35 @@ impl IndexOutput { } indexes } + + pub fn build_indexes(&self) -> Vec { + let mut map: HashMap = HashMap::new(); + + if self.inverted_index.is_available() { + for &col in &self.inverted_index.columns { + map.entry(col).or_default().push(IndexType::InvertedIndex); + } + } + if self.fulltext_index.is_available() { + for &col in &self.fulltext_index.columns { + map.entry(col).or_default().push(IndexType::FulltextIndex); + } + } + if self.bloom_filter.is_available() { + for &col in &self.bloom_filter.columns { + map.entry(col) + .or_default() + .push(IndexType::BloomFilterIndex); + } + } + + map.into_iter() + .map(|(column_id, created_indexes)| ColumnIndexMetadata { + column_id, + created_indexes, + }) + .collect::>() + } } /// Base output of the index creation. @@ -132,7 +190,9 @@ pub type BloomFilterOutput = IndexBaseOutput; pub struct Indexer { file_id: FileId, region_id: RegionId, + index_version: u64, puffin_manager: Option, + write_cache_enabled: bool, inverted_indexer: Option, last_mem_inverted_index: usize, fulltext_indexer: Option, @@ -205,7 +265,7 @@ impl Indexer { #[async_trait::async_trait] pub trait IndexerBuilder { /// Builds indexer of given file id to [index_file_path]. - async fn build(&self, file_id: FileId) -> Indexer; + async fn build(&self, file_id: FileId, index_version: u64) -> Indexer; } #[derive(Clone)] pub(crate) struct IndexerBuilderImpl { @@ -213,6 +273,7 @@ pub(crate) struct IndexerBuilderImpl { pub(crate) metadata: RegionMetadataRef, pub(crate) row_group_size: usize, pub(crate) puffin_manager: SstPuffinManager, + pub(crate) write_cache_enabled: bool, pub(crate) intermediate_manager: IntermediateManager, pub(crate) index_options: IndexOptions, pub(crate) inverted_index_config: InvertedIndexConfig, @@ -223,10 +284,12 @@ pub(crate) struct IndexerBuilderImpl { #[async_trait::async_trait] impl IndexerBuilder for IndexerBuilderImpl { /// Sanity check for arguments and create a new [Indexer] if arguments are valid. - async fn build(&self, file_id: FileId) -> Indexer { + async fn build(&self, file_id: FileId, index_version: u64) -> Indexer { let mut indexer = Indexer { file_id, region_id: self.metadata.region_id, + index_version, + write_cache_enabled: self.write_cache_enabled, ..Default::default() }; @@ -416,7 +479,7 @@ impl IndexerBuilderImpl { } /// Type of an index build task. -#[derive(Debug, Clone, IntoStaticStr)] +#[derive(Debug, Clone, IntoStaticStr, PartialEq)] pub enum IndexBuildType { /// Build index when schema change. SchemaChange, @@ -465,6 +528,8 @@ pub type ResultMpscSender = Sender>; #[derive(Clone)] pub struct IndexBuildTask { + /// The SST file handle to build index for. + pub file: FileHandle, /// The file meta to build index for. pub file_meta: FileMeta, pub reason: IndexBuildType, @@ -580,13 +645,20 @@ impl IndexBuildTask { &mut self, version_control: VersionControlRef, ) -> Result { - let index_file_id = if self.file_meta.index_file_size > 0 { - // Generate new file ID if index file exists to avoid overwrite. - FileId::random() + // Determine the new index version + let new_index_version = if self.file_meta.index_file_size > 0 { + // Increment version if index file exists to avoid overwrite. 
+ self.file_meta.index_version + 1 } else { - self.file_meta.file_id + 0 // Default version for new index files }; - let mut indexer = self.indexer_builder.build(index_file_id).await; + + // Use the same file_id but with new version for index file + let index_file_id = self.file_meta.file_id; + let mut indexer = self + .indexer_builder + .build(index_file_id, new_index_version) + .await; // Check SST file existence before building index to avoid failure of parquet reader. if !self.check_sst_file_exists(&version_control).await { @@ -606,18 +678,15 @@ impl IndexBuildTask { let mut parquet_reader = self .access_layer - .read_sst(FileHandle::new( - self.file_meta.clone(), - self.file_purger.clone(), - )) + .read_sst(self.file.clone()) // use the latest file handle instead of creating a new one .build() .await?; // TODO(SNC123): optimize index batch loop { match parquet_reader.next_batch().await { - Ok(Some(batch)) => { - indexer.update(&mut batch.clone()).await; + Ok(Some(mut batch)) => { + indexer.update(&mut batch).await; } Ok(None) => break, Err(e) => { @@ -646,10 +715,10 @@ impl IndexBuildTask { } // Upload index file if write cache is enabled. - self.maybe_upload_index_file(index_output.clone(), index_file_id) + self.maybe_upload_index_file(index_output.clone(), index_file_id, new_index_version) .await?; - let worker_request = match self.update_manifest(index_output, index_file_id).await { + let worker_request = match self.update_manifest(index_output, new_index_version).await { Ok(edit) => { let index_build_finished = IndexBuildFinished { region_id: self.file_meta.region_id, @@ -681,6 +750,7 @@ impl IndexBuildTask { &self, output: IndexOutput, index_file_id: FileId, + index_version: u64, ) -> Result<()> { if let Some(write_cache) = &self.write_cache { let file_id = self.file_meta.file_id; @@ -688,12 +758,14 @@ impl IndexBuildTask { let remote_store = self.access_layer.object_store(); let mut upload_tracker = UploadTracker::new(region_id); let mut err = None; - let puffin_key = IndexKey::new(region_id, index_file_id, FileType::Puffin); + let puffin_key = + IndexKey::new(region_id, index_file_id, FileType::Puffin(output.version)); + let index_id = RegionIndexId::new(RegionFileId::new(region_id, file_id), index_version); let puffin_path = RegionFilePathFactory::new( self.access_layer.table_dir().to_string(), self.access_layer.path_type(), ) - .build_index_file_path(RegionFileId::new(region_id, file_id)); + .build_index_file_path_with_version(index_id); if let Err(e) = write_cache .upload(puffin_key, &puffin_path, remote_store) .await @@ -725,11 +797,12 @@ impl IndexBuildTask { async fn update_manifest( &mut self, output: IndexOutput, - index_file_id: FileId, + new_index_version: u64, ) -> Result { self.file_meta.available_indexes = output.build_available_indexes(); + self.file_meta.indexes = output.build_indexes(); self.file_meta.index_file_size = output.file_size; - self.file_meta.index_file_id = Some(index_file_id); + self.file_meta.index_version = new_index_version; let edit = RegionEdit { files_to_add: vec![self.file_meta.clone()], files_to_remove: vec![], @@ -744,6 +817,7 @@ impl IndexBuildTask { .update_manifest( RegionLeaderState::Writable, RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())), + false, ) .await?; info!( @@ -1130,6 +1204,10 @@ mod tests { unreachable!() } + fn build_index_file_path_with_version(&self, _index_id: RegionIndexId) -> String { + unreachable!() + } + fn build_sst_file_path(&self, _file_id: RegionFileId) -> String { unreachable!() } @@ 
-1203,6 +1281,7 @@ mod tests { metadata, row_group_size: 1024, puffin_manager, + write_cache_enabled: false, intermediate_manager: intm_manager, index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), @@ -1227,13 +1306,14 @@ mod tests { metadata, row_group_size: 1024, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager, index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), fulltext_index_config: FulltextIndexConfig::default(), bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_some()); @@ -1257,6 +1337,7 @@ mod tests { metadata: metadata.clone(), row_group_size: 1024, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager.clone(), index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig { @@ -1266,7 +1347,7 @@ mod tests { fulltext_index_config: FulltextIndexConfig::default(), bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_none()); @@ -1278,6 +1359,7 @@ mod tests { metadata: metadata.clone(), row_group_size: 1024, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager.clone(), index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), @@ -1287,7 +1369,7 @@ mod tests { }, bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_some()); @@ -1299,6 +1381,7 @@ mod tests { metadata, row_group_size: 1024, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager, index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), @@ -1308,7 +1391,7 @@ mod tests { ..Default::default() }, } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_some()); @@ -1332,13 +1415,14 @@ mod tests { metadata: metadata.clone(), row_group_size: 1024, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager.clone(), index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), fulltext_index_config: FulltextIndexConfig::default(), bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_none()); @@ -1355,13 +1439,14 @@ mod tests { metadata: metadata.clone(), row_group_size: 1024, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager.clone(), index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), fulltext_index_config: FulltextIndexConfig::default(), bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_some()); @@ -1378,13 +1463,14 @@ mod tests { metadata: metadata.clone(), row_group_size: 1024, puffin_manager: 
factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager, index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), fulltext_index_config: FulltextIndexConfig::default(), bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_some()); @@ -1408,13 +1494,14 @@ mod tests { metadata, row_group_size: 0, puffin_manager: factory.build(mock_object_store(), NoopPathProvider), + write_cache_enabled: false, intermediate_manager: intm_manager, index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), fulltext_index_config: FulltextIndexConfig::default(), bloom_filter_index_config: BloomFilterConfig::default(), } - .build(FileId::random()) + .build(FileId::random(), 0) .await; assert!(indexer.inverted_indexer.is_none()); @@ -1435,14 +1522,19 @@ mod tests { let region_id = metadata.region_id; let indexer_builder = mock_indexer_builder(metadata, &env).await; + let file_meta = FileMeta { + region_id, + file_id: FileId::random(), + file_size: 100, + ..Default::default() + }; + + let file = FileHandle::new(file_meta.clone(), file_purger.clone()); + // Create mock task. let task = IndexBuildTask { - file_meta: FileMeta { - region_id, - file_id: FileId::random(), - file_size: 100, - ..Default::default() - }, + file, + file_meta, reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), listener: WorkerListener::default(), @@ -1482,6 +1574,7 @@ mod tests { region_id, file_id: sst_info.file_id, file_size: sst_info.file_size, + max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size, index_file_size: sst_info.index_metadata.file_size, num_rows: sst_info.num_rows as u64, num_row_groups: sst_info.num_row_groups, @@ -1492,10 +1585,13 @@ mod tests { mock_version_control(metadata.clone(), file_purger.clone(), files).await; let indexer_builder = mock_indexer_builder(metadata.clone(), &env).await; + let file = FileHandle::new(file_meta.clone(), file_purger.clone()); + // Create mock task. let (tx, mut rx) = mpsc::channel(4); let (result_tx, mut result_rx) = mpsc::channel::>(4); let task = IndexBuildTask { + file, file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), @@ -1553,6 +1649,7 @@ mod tests { region_id, file_id: sst_info.file_id, file_size: sst_info.file_size, + max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size, index_file_size: sst_info.index_metadata.file_size, num_rows: sst_info.num_rows as u64, num_row_groups: sst_info.num_row_groups, @@ -1563,10 +1660,13 @@ mod tests { mock_version_control(metadata.clone(), file_purger.clone(), files).await; let indexer_builder = mock_indexer_builder(metadata.clone(), &env).await; + let file = FileHandle::new(file_meta.clone(), file_purger.clone()); + // Create mock task. 
let (tx, _rx) = mpsc::channel(4); let (result_tx, mut result_rx) = mpsc::channel::>(4); let task = IndexBuildTask { + file, file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), @@ -1586,7 +1686,7 @@ mod tests { let puffin_path = location::index_file_path( env.access_layer.table_dir(), - RegionFileId::new(region_id, file_meta.file_id), + RegionIndexId::new(RegionFileId::new(region_id, file_meta.file_id), 0), env.access_layer.path_type(), ); @@ -1653,6 +1753,7 @@ mod tests { region_id, file_id: sst_info.file_id, file_size: sst_info.file_size, + max_row_group_uncompressed_size: sst_info.max_row_group_uncompressed_size, index_file_size: sst_info.index_metadata.file_size, num_rows: sst_info.num_rows as u64, num_row_groups: sst_info.num_row_groups, @@ -1663,10 +1764,13 @@ mod tests { mock_version_control(metadata.clone(), file_purger.clone(), files).await; let indexer_builder = mock_indexer_builder(metadata.clone(), &env).await; + let file = FileHandle::new(file_meta.clone(), file_purger.clone()); + // Create mock task. let (tx, mut rx) = mpsc::channel(4); let (result_tx, mut result_rx) = mpsc::channel::>(4); let task = IndexBuildTask { + file, file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), @@ -1715,8 +1819,10 @@ mod tests { ReadableSize::mb(10), None, None, + true, // enable_background_worker factory, intm_manager, + ReadableSize::mb(10), ) .await .unwrap(), @@ -1727,6 +1833,7 @@ mod tests { metadata: metadata.clone(), row_group_size: 1024, puffin_manager: write_cache.build_puffin_manager().clone(), + write_cache_enabled: true, intermediate_manager: write_cache.intermediate_manager().clone(), index_options: IndexOptions::default(), inverted_index_config: InvertedIndexConfig::default(), @@ -1748,10 +1855,13 @@ mod tests { let version_control = mock_version_control(metadata.clone(), file_purger.clone(), files).await; + let file = FileHandle::new(file_meta.clone(), file_purger.clone()); + // Create mock task. let (tx, mut _rx) = mpsc::channel(4); let (result_tx, mut result_rx) = mpsc::channel::>(4); let task = IndexBuildTask { + file, file_meta: file_meta.clone(), reason: IndexBuildType::Flush, access_layer: env.access_layer.clone(), @@ -1778,7 +1888,11 @@ mod tests { } // The write cache should contain the uploaded index file. 
- let index_key = IndexKey::new(region_id, file_meta.file_id, FileType::Puffin); + let index_key = IndexKey::new( + region_id, + file_meta.file_id, + FileType::Puffin(sst_info.index_metadata.version), + ); assert!(write_cache.file_cache().contains_key(&index_key)); } @@ -1795,13 +1909,18 @@ mod tests { let (tx, _rx) = mpsc::channel(4); let (result_tx, _result_rx) = mpsc::channel::>(4); + let file_meta = FileMeta { + region_id, + file_id, + file_size: 100, + ..Default::default() + }; + + let file = FileHandle::new(file_meta.clone(), file_purger.clone()); + IndexBuildTask { - file_meta: FileMeta { - region_id, - file_id, - file_size: 100, - ..Default::default() - }, + file, + file_meta, reason, access_layer: env.access_layer.clone(), listener: WorkerListener::default(), diff --git a/src/mito2/src/sst/index/bloom_filter/applier.rs b/src/mito2/src/sst/index/bloom_filter/applier.rs index 3fa387c8dc..547e67b66d 100644 --- a/src/mito2/src/sst/index/bloom_filter/applier.rs +++ b/src/mito2/src/sst/index/bloom_filter/applier.rs @@ -17,11 +17,14 @@ mod builder; use std::collections::BTreeMap; use std::ops::Range; use std::sync::Arc; +use std::time::Instant; use common_base::range_read::RangeReader; use common_telemetry::warn; use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate}; -use index::bloom_filter::reader::{BloomFilterReader, BloomFilterReaderImpl}; +use index::bloom_filter::reader::{ + BloomFilterReadMetrics, BloomFilterReader, BloomFilterReaderImpl, +}; use index::target::IndexTarget; use object_store::ObjectStore; use puffin::puffin_manager::cache::PuffinMetadataCacheRef; @@ -41,11 +44,67 @@ use crate::error::{ Result, }; use crate::metrics::INDEX_APPLY_ELAPSED; -use crate::sst::file::RegionFileId; -use crate::sst::index::TYPE_BLOOM_FILTER_INDEX; +use crate::sst::file::RegionIndexId; use crate::sst::index::bloom_filter::INDEX_BLOB_TYPE; pub use crate::sst::index::bloom_filter::applier::builder::BloomFilterIndexApplierBuilder; use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory}; +use crate::sst::index::{TYPE_BLOOM_FILTER_INDEX, trigger_index_background_download}; + +/// Metrics for tracking bloom filter index apply operations. +#[derive(Default, Clone)] +pub struct BloomFilterIndexApplyMetrics { + /// Total time spent applying the index. + pub apply_elapsed: std::time::Duration, + /// Number of blob cache misses. + pub blob_cache_miss: usize, + /// Total size of blobs read (in bytes). + pub blob_read_bytes: u64, + /// Metrics for bloom filter read operations. + pub read_metrics: BloomFilterReadMetrics, +} + +impl std::fmt::Debug for BloomFilterIndexApplyMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + apply_elapsed, + blob_cache_miss, + blob_read_bytes, + read_metrics, + } = self; + + if self.is_empty() { + return write!(f, "{{}}"); + } + write!(f, "{{")?; + + write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?; + + if *blob_cache_miss > 0 { + write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?; + } + if *blob_read_bytes > 0 { + write!(f, ", \"blob_read_bytes\":{}", blob_read_bytes)?; + } + write!(f, ", \"read_metrics\":{:?}", read_metrics)?; + + write!(f, "}}") + } +} + +impl BloomFilterIndexApplyMetrics { + /// Returns true if the metrics are empty (contain no meaningful data). + pub fn is_empty(&self) -> bool { + self.apply_elapsed.is_zero() + } + + /// Merges another metrics into this one. 
+ pub fn merge_from(&mut self, other: &Self) { + self.apply_elapsed += other.apply_elapsed; + self.blob_cache_miss += other.blob_cache_miss; + self.blob_read_bytes += other.blob_read_bytes; + self.read_metrics.merge_from(&other.read_metrics); + } +} pub(crate) type BloomFilterIndexApplierRef = Arc; @@ -133,15 +192,20 @@ impl BloomFilterIndexApplier { /// /// Row group id existing in the returned result means that the row group is searched. /// Empty ranges means that the row group is searched but no rows are found. + /// + /// # Arguments + /// * `file_id` - The region file ID to apply predicates to + /// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads + /// * `row_groups` - Iterator of row group lengths and whether to search in the row group + /// * `metrics` - Optional mutable reference to collect metrics on demand pub async fn apply( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, row_groups: impl Iterator, + mut metrics: Option<&mut BloomFilterIndexApplyMetrics>, ) -> Result>)>> { - let _timer = INDEX_APPLY_ELAPSED - .with_label_values(&[TYPE_BLOOM_FILTER_INDEX]) - .start_timer(); + let apply_start = Instant::now(); // Calculates row groups' ranges based on start of the file. let mut input = Vec::with_capacity(row_groups.size_hint().0); @@ -163,7 +227,7 @@ impl BloomFilterIndexApplier { for (column_id, predicates) in self.predicates.iter() { let blob = match self - .blob_reader(file_id, *column_id, file_size_hint) + .blob_reader(file_id, *column_id, file_size_hint, metrics.as_deref_mut()) .await? { Some(blob) => blob, @@ -173,20 +237,24 @@ impl BloomFilterIndexApplier { // Create appropriate reader based on whether we have caching enabled if let Some(bloom_filter_cache) = &self.bloom_filter_index_cache { let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length; + if let Some(m) = &mut metrics { + m.blob_read_bytes += blob_size; + } let reader = CachedBloomFilterIndexBlobReader::new( file_id.file_id(), + file_id.version, *column_id, Tag::Skipping, blob_size, BloomFilterReaderImpl::new(blob), bloom_filter_cache.clone(), ); - self.apply_predicates(reader, predicates, &mut output) + self.apply_predicates(reader, predicates, &mut output, metrics.as_deref_mut()) .await .context(ApplyBloomFilterIndexSnafu)?; } else { let reader = BloomFilterReaderImpl::new(blob); - self.apply_predicates(reader, predicates, &mut output) + self.apply_predicates(reader, predicates, &mut output, metrics.as_deref_mut()) .await .context(ApplyBloomFilterIndexSnafu)?; } @@ -201,6 +269,16 @@ impl BloomFilterIndexApplier { } } + // Record elapsed time to histogram and collect metrics if requested + let elapsed = apply_start.elapsed(); + INDEX_APPLY_ELAPSED + .with_label_values(&[TYPE_BLOOM_FILTER_INDEX]) + .observe(elapsed.as_secs_f64()); + + if let Some(m) = metrics { + m.apply_elapsed += elapsed; + } + Ok(output) } @@ -209,9 +287,10 @@ impl BloomFilterIndexApplier { /// Returus `None` if the column does not have an index. 
async fn blob_reader( &self, - file_id: RegionFileId, + file_id: RegionIndexId, column_id: ColumnId, file_size_hint: Option, + metrics: Option<&mut BloomFilterIndexApplyMetrics>, ) -> Result> { let reader = match self .cached_blob_reader(file_id, column_id, file_size_hint) @@ -219,6 +298,9 @@ impl BloomFilterIndexApplier { { Ok(Some(puffin_reader)) => puffin_reader, other => { + if let Some(m) = metrics { + m.blob_cache_miss += 1; + } if let Err(err) = other { // Blob not found means no index for this column if is_blob_not_found(&err) { @@ -247,7 +329,7 @@ impl BloomFilterIndexApplier { /// Creates a blob reader from the cached index file async fn cached_blob_reader( &self, - file_id: RegionFileId, + file_id: RegionIndexId, column_id: ColumnId, file_size_hint: Option, ) -> Result> { @@ -255,7 +337,11 @@ impl BloomFilterIndexApplier { return Ok(None); }; - let index_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin); + let index_key = IndexKey::new( + file_id.region_id(), + file_id.file_id(), + FileType::Puffin(file_id.version), + ); if file_cache.get(index_key).await.is_none() { return Ok(None); }; @@ -288,16 +374,24 @@ impl BloomFilterIndexApplier { /// Creates a blob reader from the remote index file async fn remote_blob_reader( &self, - file_id: RegionFileId, + file_id: RegionIndexId, column_id: ColumnId, file_size_hint: Option, ) -> Result { + let path_factory = RegionFilePathFactory::new(self.table_dir.clone(), self.path_type); + + // Trigger background download if file cache and file size are available + trigger_index_background_download( + self.file_cache.as_ref(), + &file_id, + file_size_hint, + &path_factory, + &self.object_store, + ); + let puffin_manager = self .puffin_manager_factory - .build( - self.object_store.clone(), - RegionFilePathFactory::new(self.table_dir.clone(), self.path_type), - ) + .build(self.object_store.clone(), path_factory) .with_puffin_metadata_cache(self.puffin_metadata_cache.clone()); let blob_name = Self::column_blob_name(column_id); @@ -320,6 +414,7 @@ impl BloomFilterIndexApplier { reader: R, predicates: &[InListPredicate], output: &mut [(usize, Vec>)], + mut metrics: Option<&mut BloomFilterIndexApplyMetrics>, ) -> std::result::Result<(), index::bloom_filter::error::Error> { let mut applier = BloomFilterApplier::new(Box::new(reader)).await?; @@ -329,7 +424,10 @@ impl BloomFilterIndexApplier { continue; } - *row_group_output = applier.search(predicates, row_group_output).await?; + let read_metrics = metrics.as_deref_mut().map(|m| &mut m.read_metrics); + *row_group_output = applier + .search(predicates, row_group_output, read_metrics) + .await?; } Ok(()) @@ -361,6 +459,7 @@ mod tests { use store_api::storage::FileId; use super::*; + use crate::sst::file::RegionFileId; use crate::sst::index::bloom_filter::creator::BloomFilterIndexer; use crate::sst::index::bloom_filter::creator::tests::{ mock_object_store, mock_region_metadata, new_batch, new_intm_mgr, @@ -372,7 +471,7 @@ mod tests { object_store: ObjectStore, metadata: &RegionMetadata, puffin_manager_factory: PuffinManagerFactory, - file_id: RegionFileId, + file_id: RegionIndexId, ) -> impl Fn(&[Expr], Vec<(usize, bool)>) -> BoxFuture<'static, Vec<(usize, Vec>)>> + use<'_> { move |exprs, row_groups| { @@ -393,7 +492,7 @@ mod tests { let applier = builder.build(&exprs).unwrap().unwrap(); applier - .apply(file_id, None, row_groups.into_iter()) + .apply(file_id, None, row_groups.into_iter(), None) .await .unwrap() .into_iter() @@ -429,6 +528,7 @@ mod tests { let intm_mgr = 
new_intm_mgr(d.path().to_string_lossy()).await; let memory_usage_threshold = Some(1024); let file_id = RegionFileId::new(region_metadata.region_id, FileId::random()); + let file_id = RegionIndexId::new(file_id, 0); let table_dir = "table_dir".to_string(); let mut indexer = BloomFilterIndexer::new( diff --git a/src/mito2/src/sst/index/bloom_filter/creator.rs b/src/mito2/src/sst/index/bloom_filter/creator.rs index 0d16a21d7c..a7283f9191 100644 --- a/src/mito2/src/sst/index/bloom_filter/creator.rs +++ b/src/mito2/src/sst/index/bloom_filter/creator.rs @@ -481,7 +481,7 @@ pub(crate) mod tests { use super::*; use crate::access_layer::FilePathProvider; use crate::read::BatchColumn; - use crate::sst::file::RegionFileId; + use crate::sst::file::{RegionFileId, RegionIndexId}; use crate::sst::index::puffin_manager::PuffinManagerFactory; pub fn mock_object_store() -> ObjectStore { @@ -499,6 +499,10 @@ pub(crate) mod tests { file_id.file_id().to_string() } + fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String { + index_id.file_id.file_id().to_string() + } + fn build_sst_file_path(&self, file_id: RegionFileId) -> String { file_id.file_id().to_string() } @@ -621,6 +625,7 @@ pub(crate) mod tests { let puffin_manager = factory.build(object_store, TestPathProvider); let file_id = RegionFileId::new(region_metadata.region_id, file_id); + let file_id = RegionIndexId::new(file_id, 0); let mut puffin_writer = puffin_manager.writer(&file_id).await.unwrap(); let (row_count, byte_count) = indexer.finish(&mut puffin_writer).await.unwrap(); assert_eq!(row_count, 20); @@ -637,17 +642,17 @@ pub(crate) mod tests { .unwrap(); let reader = blob_guard.reader().await.unwrap(); let bloom_filter = BloomFilterReaderImpl::new(reader); - let metadata = bloom_filter.metadata().await.unwrap(); + let metadata = bloom_filter.metadata(None).await.unwrap(); assert_eq!(metadata.segment_count, 10); for i in 0..5 { let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[i] as usize]; - let bf = bloom_filter.bloom_filter(loc).await.unwrap(); + let bf = bloom_filter.bloom_filter(loc, None).await.unwrap(); assert!(bf.contains(b"tag1")); } for i in 5..10 { let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[i] as usize]; - let bf = bloom_filter.bloom_filter(loc).await.unwrap(); + let bf = bloom_filter.bloom_filter(loc, None).await.unwrap(); assert!(bf.contains(b"tag2")); } } @@ -662,13 +667,13 @@ pub(crate) mod tests { .unwrap(); let reader = blob_guard.reader().await.unwrap(); let bloom_filter = BloomFilterReaderImpl::new(reader); - let metadata = bloom_filter.metadata().await.unwrap(); + let metadata = bloom_filter.metadata(None).await.unwrap(); assert_eq!(metadata.segment_count, 5); for i in 0u64..20 { let idx = i as usize / 4; let loc = &metadata.bloom_filter_locs[metadata.segment_loc_indices[idx] as usize]; - let bf = bloom_filter.bloom_filter(loc).await.unwrap(); + let bf = bloom_filter.bloom_filter(loc, None).await.unwrap(); let mut buf = vec![]; IndexValueCodec::encode_nonnull_value(ValueRef::UInt64(i), &sort_field, &mut buf) .unwrap(); diff --git a/src/mito2/src/sst/index/fulltext_index/applier.rs b/src/mito2/src/sst/index/fulltext_index/applier.rs index 6b68fc348d..54a8e11e89 100644 --- a/src/mito2/src/sst/index/fulltext_index/applier.rs +++ b/src/mito2/src/sst/index/fulltext_index/applier.rs @@ -16,11 +16,12 @@ use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::iter; use std::ops::Range; use std::sync::Arc; +use std::time::Instant; use 
common_base::range_read::RangeReader; use common_telemetry::warn; use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate}; -use index::bloom_filter::reader::BloomFilterReaderImpl; +use index::bloom_filter::reader::{BloomFilterReadMetrics, BloomFilterReaderImpl}; use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher}; use index::fulltext_index::tokenizer::{ChineseTokenizer, EnglishTokenizer, Tokenizer}; use index::fulltext_index::{Analyzer, Config}; @@ -43,16 +44,105 @@ use crate::error::{ PuffinReadBlobSnafu, Result, }; use crate::metrics::INDEX_APPLY_ELAPSED; -use crate::sst::file::RegionFileId; -use crate::sst::index::TYPE_FULLTEXT_INDEX; +use crate::sst::file::RegionIndexId; use crate::sst::index::fulltext_index::applier::builder::{FulltextRequest, FulltextTerm}; use crate::sst::index::fulltext_index::{INDEX_BLOB_TYPE_BLOOM, INDEX_BLOB_TYPE_TANTIVY}; use crate::sst::index::puffin_manager::{ PuffinManagerFactory, SstPuffinBlob, SstPuffinDir, SstPuffinReader, }; +use crate::sst::index::{TYPE_FULLTEXT_INDEX, trigger_index_background_download}; pub mod builder; +/// Metrics for tracking fulltext index apply operations. +#[derive(Default, Clone)] +pub struct FulltextIndexApplyMetrics { + /// Total time spent applying the index. + pub apply_elapsed: std::time::Duration, + /// Number of blob cache misses. + pub blob_cache_miss: usize, + /// Number of directory cache hits. + pub dir_cache_hit: usize, + /// Number of directory cache misses. + pub dir_cache_miss: usize, + /// Elapsed time to initialize directory data. + pub dir_init_elapsed: std::time::Duration, + /// Metrics for bloom filter reads. + pub bloom_filter_read_metrics: BloomFilterReadMetrics, +} + +impl std::fmt::Debug for FulltextIndexApplyMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + apply_elapsed, + blob_cache_miss, + dir_cache_hit, + dir_cache_miss, + dir_init_elapsed, + bloom_filter_read_metrics, + } = self; + + if self.is_empty() { + return write!(f, "{{}}"); + } + write!(f, "{{")?; + + write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?; + + if *blob_cache_miss > 0 { + write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?; + } + if *dir_cache_hit > 0 { + write!(f, ", \"dir_cache_hit\":{}", dir_cache_hit)?; + } + if *dir_cache_miss > 0 { + write!(f, ", \"dir_cache_miss\":{}", dir_cache_miss)?; + } + if !dir_init_elapsed.is_zero() { + write!(f, ", \"dir_init_elapsed\":\"{:?}\"", dir_init_elapsed)?; + } + write!( + f, + ", \"bloom_filter_read_metrics\":{:?}", + bloom_filter_read_metrics + )?; + + write!(f, "}}") + } +} + +impl FulltextIndexApplyMetrics { + /// Returns true if the metrics are empty (contain no meaningful data). + pub fn is_empty(&self) -> bool { + self.apply_elapsed.is_zero() + } + + /// Collects metrics from a directory read operation. + pub fn collect_dir_metrics( + &mut self, + elapsed: std::time::Duration, + dir_metrics: puffin::puffin_manager::DirMetrics, + ) { + self.dir_init_elapsed += elapsed; + if dir_metrics.cache_hit { + self.dir_cache_hit += 1; + } else { + self.dir_cache_miss += 1; + } + } + + /// Merges another metrics into this one. 
+ pub fn merge_from(&mut self, other: &Self) { + self.apply_elapsed += other.apply_elapsed; + self.blob_cache_miss += other.blob_cache_miss; + self.dir_cache_hit += other.dir_cache_hit; + self.dir_cache_miss += other.dir_cache_miss; + self.dir_init_elapsed += other.dir_init_elapsed; + self.bloom_filter_read_metrics + .merge_from(&other.bloom_filter_read_metrics); + } +} + /// `FulltextIndexApplier` is responsible for applying fulltext index to the provided SST files pub struct FulltextIndexApplier { /// Requests to be applied. @@ -124,14 +214,18 @@ impl FulltextIndexApplier { impl FulltextIndexApplier { /// Applies fine-grained fulltext index to the specified SST file. /// Returns the row ids that match the queries. + /// + /// # Arguments + /// * `file_id` - The region file ID to apply predicates to + /// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads + /// * `metrics` - Optional mutable reference to collect metrics on demand pub async fn apply_fine( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, + mut metrics: Option<&mut FulltextIndexApplyMetrics>, ) -> Result>> { - let timer = INDEX_APPLY_ELAPSED - .with_label_values(&[TYPE_FULLTEXT_INDEX]) - .start_timer(); + let apply_start = Instant::now(); let mut row_ids: Option> = None; for (column_id, request) in self.requests.iter() { @@ -140,7 +234,13 @@ impl FulltextIndexApplier { } let Some(result) = self - .apply_fine_one_column(file_size_hint, file_id, *column_id, request) + .apply_fine_one_column( + file_size_hint, + file_id, + *column_id, + request, + metrics.as_deref_mut(), + ) .await? else { continue; @@ -159,18 +259,26 @@ impl FulltextIndexApplier { } } - if row_ids.is_none() { - timer.stop_and_discard(); + // Record elapsed time to histogram and collect metrics if requested + let elapsed = apply_start.elapsed(); + INDEX_APPLY_ELAPSED + .with_label_values(&[TYPE_FULLTEXT_INDEX]) + .observe(elapsed.as_secs_f64()); + + if let Some(m) = metrics { + m.apply_elapsed += elapsed; } + Ok(row_ids) } async fn apply_fine_one_column( &self, file_size_hint: Option, - file_id: RegionFileId, + file_id: RegionIndexId, column_id: ColumnId, request: &FulltextRequest, + metrics: Option<&mut FulltextIndexApplyMetrics>, ) -> Result>> { let blob_key = format!( "{INDEX_BLOB_TYPE_TANTIVY}-{}", @@ -178,7 +286,7 @@ impl FulltextIndexApplier { ); let dir = self .index_source - .dir(file_id, &blob_key, file_size_hint) + .dir(file_id, &blob_key, file_size_hint, metrics) .await?; let dir = match &dir { @@ -240,15 +348,20 @@ impl FulltextIndexApplier { /// /// Row group id existing in the returned result means that the row group is searched. /// Empty ranges means that the row group is searched but no rows are found. 
+ /// + /// # Arguments + /// * `file_id` - The region file ID to apply predicates to + /// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads + /// * `row_groups` - Iterator of row group lengths and whether to search in the row group + /// * `metrics` - Optional mutable reference to collect metrics on demand pub async fn apply_coarse( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, row_groups: impl Iterator, + mut metrics: Option<&mut FulltextIndexApplyMetrics>, ) -> Result>)>>> { - let timer = INDEX_APPLY_ELAPSED - .with_label_values(&[TYPE_FULLTEXT_INDEX]) - .start_timer(); + let apply_start = Instant::now(); let (input, mut output) = Self::init_coarse_output(row_groups); let mut applied = false; @@ -266,26 +379,38 @@ impl FulltextIndexApplier { *column_id, &request.terms, &mut output, + metrics.as_deref_mut(), ) .await?; } if !applied { - timer.stop_and_discard(); return Ok(None); } Self::adjust_coarse_output(input, &mut output); + + // Record elapsed time to histogram and collect metrics if requested + let elapsed = apply_start.elapsed(); + INDEX_APPLY_ELAPSED + .with_label_values(&[TYPE_FULLTEXT_INDEX]) + .observe(elapsed.as_secs_f64()); + + if let Some(m) = metrics { + m.apply_elapsed += elapsed; + } + Ok(Some(output)) } async fn apply_coarse_one_column( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, column_id: ColumnId, terms: &[FulltextTerm], output: &mut [(usize, Vec>)], + mut metrics: Option<&mut FulltextIndexApplyMetrics>, ) -> Result { let blob_key = format!( "{INDEX_BLOB_TYPE_BLOOM}-{}", @@ -293,7 +418,7 @@ impl FulltextIndexApplier { ); let Some(reader) = self .index_source - .blob(file_id, &blob_key, file_size_hint) + .blob(file_id, &blob_key, file_size_hint, metrics.as_deref_mut()) .await? else { return Ok(false); @@ -315,6 +440,7 @@ impl FulltextIndexApplier { .content_length; let reader = CachedBloomFilterIndexBlobReader::new( file_id.file_id(), + file_id.version, column_id, Tag::Fulltext, blob_size, @@ -336,7 +462,13 @@ impl FulltextIndexApplier { } *row_group_output = applier - .search(&predicates, row_group_output) + .search( + &predicates, + row_group_output, + metrics + .as_deref_mut() + .map(|m| &mut m.bloom_filter_read_metrics), + ) .await .context(ApplyBloomFilterIndexSnafu)?; } @@ -480,11 +612,18 @@ impl IndexSource { /// Returns `None` if the blob is not found. async fn blob( &self, - file_id: RegionFileId, + file_id: RegionIndexId, key: &str, file_size_hint: Option, + metrics: Option<&mut FulltextIndexApplyMetrics>, ) -> Result>> { let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?; + + // Track cache miss if fallbacked to remote + if fallbacked && let Some(m) = metrics { + m.blob_cache_miss += 1; + } + let res = reader.blob(key).await; match res { Ok(blob) => Ok(Some(blob)), @@ -511,14 +650,28 @@ impl IndexSource { /// Returns `None` if the directory is not found. 
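A similar hedged sketch for the fulltext side, showing `apply_fine` with the new metrics argument; `applier`, `index_id`, and `index_file_size` are assumed bindings and error handling is elided:

```rust
let mut metrics = FulltextIndexApplyMetrics::default();
let row_ids = applier
    .apply_fine(index_id, Some(index_file_size), Some(&mut metrics))
    .await?;
// dir_cache_hit / dir_cache_miss and dir_init_elapsed describe the Tantivy directory access,
// while blob_cache_miss counts fallbacks from the local file cache to the remote store.
common_telemetry::debug!(
    "fulltext index matched {:?} rows, metrics: {:?}",
    row_ids.as_ref().map(|ids| ids.len()),
    metrics
);
```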
async fn dir( &self, - file_id: RegionFileId, + file_id: RegionIndexId, key: &str, file_size_hint: Option, + mut metrics: Option<&mut FulltextIndexApplyMetrics>, ) -> Result>> { let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?; + + // Track cache miss if fallbacked to remote + if fallbacked && let Some(m) = &mut metrics { + m.blob_cache_miss += 1; + } + + let start = metrics.as_ref().map(|_| Instant::now()); let res = reader.dir(key).await; match res { - Ok(dir) => Ok(Some(dir)), + Ok((dir, dir_metrics)) => { + if let Some(m) = metrics { + // Safety: start is Some when metrics is Some + m.collect_dir_metrics(start.unwrap().elapsed(), dir_metrics); + } + Ok(Some(dir)) + } Err(err) if err.is_blob_not_found() => Ok(None), Err(err) => { if fallbacked { @@ -526,9 +679,16 @@ impl IndexSource { } else { warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file."); let reader = self.build_remote(file_id, file_size_hint).await?; + let start = metrics.as_ref().map(|_| Instant::now()); let res = reader.dir(key).await; match res { - Ok(dir) => Ok(Some(dir)), + Ok((dir, dir_metrics)) => { + if let Some(m) = metrics { + // Safety: start is Some when metrics is Some + m.collect_dir_metrics(start.unwrap().elapsed(), dir_metrics); + } + Ok(Some(dir)) + } Err(err) if err.is_blob_not_found() => Ok(None), Err(err) => Err(err).context(PuffinReadBlobSnafu), } @@ -540,7 +700,7 @@ impl IndexSource { /// Return reader and whether it is fallbacked to remote store. async fn ensure_reader( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, ) -> Result<(SstPuffinReader, bool)> { match self.build_local_cache(file_id, file_size_hint).await { @@ -552,14 +712,18 @@ impl IndexSource { async fn build_local_cache( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, ) -> Result> { let Some(file_cache) = &self.file_cache else { return Ok(None); }; - let index_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin); + let index_key = IndexKey::new( + file_id.region_id(), + file_id.file_id(), + FileType::Puffin(file_id.version), + ); if file_cache.get(index_key).await.is_none() { return Ok(None); }; @@ -581,15 +745,23 @@ impl IndexSource { async fn build_remote( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, ) -> Result { + let path_factory = RegionFilePathFactory::new(self.table_dir.clone(), self.path_type); + + // Trigger background download if file cache and file size are available + trigger_index_background_download( + self.file_cache.as_ref(), + &file_id, + file_size_hint, + &path_factory, + &self.remote_store, + ); + let puffin_manager = self .puffin_manager_factory - .build( - self.remote_store.clone(), - RegionFilePathFactory::new(self.table_dir.clone(), self.path_type), - ) + .build(self.remote_store.clone(), path_factory) .with_puffin_metadata_cache(self.puffin_metadata_cache.clone()); let reader = puffin_manager diff --git a/src/mito2/src/sst/index/fulltext_index/creator.rs b/src/mito2/src/sst/index/fulltext_index/creator.rs index 2efa154ec4..58c3f1a9bc 100644 --- a/src/mito2/src/sst/index/fulltext_index/creator.rs +++ b/src/mito2/src/sst/index/fulltext_index/creator.rs @@ -481,7 +481,7 @@ mod tests { use super::*; use crate::access_layer::RegionFilePathFactory; use crate::read::{Batch, BatchColumn}; - use crate::sst::file::RegionFileId; + use crate::sst::file::{RegionFileId, RegionIndexId}; use 
crate::sst::index::fulltext_index::applier::FulltextIndexApplier; use crate::sst::index::fulltext_index::applier::builder::{ FulltextQuery, FulltextRequest, FulltextTerm, @@ -672,7 +672,8 @@ mod tests { RegionFilePathFactory::new(table_dir.clone(), PathType::Bare), ); let region_file_id = RegionFileId::new(region_metadata.region_id, sst_file_id); - let mut writer = puffin_manager.writer(®ion_file_id).await.unwrap(); + let index_id = RegionIndexId::new(region_file_id, 0); + let mut writer = puffin_manager.writer(&index_id).await.unwrap(); let _ = indexer.finish(&mut writer).await.unwrap(); writer.finish().await.unwrap(); @@ -724,14 +725,14 @@ mod tests { async move { match backend { FulltextBackend::Tantivy => { - applier.apply_fine(region_file_id, None).await.unwrap() + applier.apply_fine(index_id, None, None).await.unwrap() } FulltextBackend::Bloom => { let coarse_mask = coarse_mask.unwrap_or_default(); let row_groups = (0..coarse_mask.len()).map(|i| (1, coarse_mask[i])); // row group id == row id let resp = applier - .apply_coarse(region_file_id, None, row_groups) + .apply_coarse(index_id, None, row_groups, None) .await .unwrap(); resp.map(|r| { diff --git a/src/mito2/src/sst/index/indexer/abort.rs b/src/mito2/src/sst/index/indexer/abort.rs index b285b4891d..9a95554f22 100644 --- a/src/mito2/src/sst/index/indexer/abort.rs +++ b/src/mito2/src/sst/index/indexer/abort.rs @@ -14,6 +14,8 @@ use common_telemetry::warn; +use crate::access_layer::TempFileCleaner; +use crate::sst::file::{RegionFileId, RegionIndexId}; use crate::sst::index::Indexer; impl Indexer { @@ -22,6 +24,9 @@ impl Indexer { self.do_abort_fulltext_index().await; self.do_abort_bloom_filter().await; self.do_prune_intm_sst_dir().await; + if self.write_cache_enabled { + self.do_abort_clean_fs_temp_dir().await; + } self.puffin_manager = None; } @@ -87,4 +92,18 @@ impl Indexer { ); } } + + async fn do_abort_clean_fs_temp_dir(&mut self) { + let Some(puffin_manager) = &self.puffin_manager else { + return; + }; + let fs_accessor = puffin_manager.file_accessor(); + + let fs_handle = RegionIndexId::new( + RegionFileId::new(self.region_id, self.file_id), + self.index_version, + ) + .to_string(); + TempFileCleaner::clean_atomic_dir_files(fs_accessor.store().store(), &[&fs_handle]).await; + } } diff --git a/src/mito2/src/sst/index/indexer/finish.rs b/src/mito2/src/sst/index/indexer/finish.rs index 632b0a68d1..4f620dfe42 100644 --- a/src/mito2/src/sst/index/indexer/finish.rs +++ b/src/mito2/src/sst/index/indexer/finish.rs @@ -16,7 +16,7 @@ use common_telemetry::{debug, warn}; use puffin::puffin_manager::{PuffinManager, PuffinWriter}; use store_api::storage::ColumnId; -use crate::sst::file::RegionFileId; +use crate::sst::file::{RegionFileId, RegionIndexId}; use crate::sst::index::puffin_manager::SstPuffinWriter; use crate::sst::index::statistics::{ByteCount, RowCount}; use crate::sst::index::{ @@ -56,14 +56,18 @@ impl Indexer { self.do_prune_intm_sst_dir().await; output.file_size = self.do_finish_puffin_writer(writer).await; + output.version = self.index_version; output } async fn build_puffin_writer(&mut self) -> Option { - let puffin_manager = self.puffin_manager.take()?; + let puffin_manager = self.puffin_manager.clone()?; let err = match puffin_manager - .writer(&RegionFileId::new(self.region_id, self.file_id)) + .writer(&RegionIndexId::new( + RegionFileId::new(self.region_id, self.file_id), + self.index_version, + )) .await { Ok(writer) => return Some(writer), diff --git a/src/mito2/src/sst/index/inverted_index/applier.rs 
b/src/mito2/src/sst/index/inverted_index/applier.rs index 350880cc9f..e4a0f25398 100644 --- a/src/mito2/src/sst/index/inverted_index/applier.rs +++ b/src/mito2/src/sst/index/inverted_index/applier.rs @@ -16,10 +16,11 @@ pub mod builder; use std::collections::BTreeMap; use std::sync::Arc; +use std::time::Instant; use common_base::range_read::RangeReader; use common_telemetry::warn; -use index::inverted_index::format::reader::InvertedIndexBlobReader; +use index::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReadMetrics}; use index::inverted_index::search::index_apply::{ ApplyOutput, IndexApplier, IndexNotFoundStrategy, SearchContext, }; @@ -39,10 +40,71 @@ use crate::error::{ ApplyInvertedIndexSnafu, MetadataSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu, Result, }; use crate::metrics::{INDEX_APPLY_ELAPSED, INDEX_APPLY_MEMORY_USAGE}; -use crate::sst::file::RegionFileId; -use crate::sst::index::TYPE_INVERTED_INDEX; +use crate::sst::file::RegionIndexId; use crate::sst::index::inverted_index::INDEX_BLOB_TYPE; use crate::sst::index::puffin_manager::{BlobReader, PuffinManagerFactory}; +use crate::sst::index::{TYPE_INVERTED_INDEX, trigger_index_background_download}; + +/// Metrics for tracking inverted index apply operations. +#[derive(Default, Clone)] +pub struct InvertedIndexApplyMetrics { + /// Total time spent applying the index. + pub apply_elapsed: std::time::Duration, + /// Number of blob cache misses (0 or 1). + pub blob_cache_miss: usize, + /// Total size of blobs read (in bytes). + pub blob_read_bytes: u64, + /// Metrics for inverted index reads. + pub inverted_index_read_metrics: InvertedIndexReadMetrics, +} + +impl std::fmt::Debug for InvertedIndexApplyMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + apply_elapsed, + blob_cache_miss, + blob_read_bytes, + inverted_index_read_metrics, + } = self; + + if self.is_empty() { + return write!(f, "{{}}"); + } + write!(f, "{{")?; + + write!(f, "\"apply_elapsed\":\"{:?}\"", apply_elapsed)?; + + if *blob_cache_miss > 0 { + write!(f, ", \"blob_cache_miss\":{}", blob_cache_miss)?; + } + if *blob_read_bytes > 0 { + write!(f, ", \"blob_read_bytes\":{}", blob_read_bytes)?; + } + write!( + f, + ", \"inverted_index_read_metrics\":{:?}", + inverted_index_read_metrics + )?; + + write!(f, "}}") + } +} + +impl InvertedIndexApplyMetrics { + /// Returns true if the metrics are empty (contain no meaningful data). + pub fn is_empty(&self) -> bool { + self.apply_elapsed.is_zero() + } + + /// Merges another metrics into this one. + pub fn merge_from(&mut self, other: &Self) { + self.apply_elapsed += other.apply_elapsed; + self.blob_cache_miss += other.blob_cache_miss; + self.blob_read_bytes += other.blob_read_bytes; + self.inverted_index_read_metrics + .merge_from(&other.inverted_index_read_metrics); + } +} /// `InvertedIndexApplier` is responsible for applying predicates to the provided SST files /// and returning the relevant row group ids for further scan. @@ -124,24 +186,30 @@ impl InvertedIndexApplier { self } - /// Applies predicates to the provided SST file id and returns the relevant row group ids + /// Applies predicates to the provided SST file id and returns the relevant row group ids. 
+ /// + /// # Arguments + /// * `file_id` - The region file ID to apply predicates to + /// * `file_size_hint` - Optional hint for file size to avoid extra metadata reads + /// * `metrics` - Optional mutable reference to collect metrics on demand pub async fn apply( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, + mut metrics: Option<&mut InvertedIndexApplyMetrics>, ) -> Result { - let _timer = INDEX_APPLY_ELAPSED - .with_label_values(&[TYPE_INVERTED_INDEX]) - .start_timer(); + let start = Instant::now(); let context = SearchContext { // Encountering a non-existing column indicates that it doesn't match predicates. index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty, }; + let mut cache_miss = 0; let blob = match self.cached_blob_reader(file_id, file_size_hint).await { Ok(Some(puffin_reader)) => puffin_reader, other => { + cache_miss += 1; if let Err(err) = other { warn!(err; "An unexpected error occurred while reading the cached index file. Fallback to remote index file.") } @@ -149,38 +217,70 @@ impl InvertedIndexApplier { } }; - if let Some(index_cache) = &self.inverted_index_cache { - let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length; + let blob_size = blob.metadata().await.context(MetadataSnafu)?.content_length; + + let result = if let Some(index_cache) = &self.inverted_index_cache { let mut index_reader = CachedInvertedIndexBlobReader::new( file_id.file_id(), + file_id.version, blob_size, InvertedIndexBlobReader::new(blob), index_cache.clone(), ); self.index_applier - .apply(context, &mut index_reader) + .apply( + context, + &mut index_reader, + metrics + .as_deref_mut() + .map(|m| &mut m.inverted_index_read_metrics), + ) .await .context(ApplyInvertedIndexSnafu) } else { let mut index_reader = InvertedIndexBlobReader::new(blob); self.index_applier - .apply(context, &mut index_reader) + .apply( + context, + &mut index_reader, + metrics + .as_deref_mut() + .map(|m| &mut m.inverted_index_read_metrics), + ) .await .context(ApplyInvertedIndexSnafu) + }; + + // Record elapsed time to histogram and collect metrics if requested + let elapsed = start.elapsed(); + INDEX_APPLY_ELAPSED + .with_label_values(&[TYPE_INVERTED_INDEX]) + .observe(elapsed.as_secs_f64()); + + if let Some(metrics) = metrics { + metrics.apply_elapsed = elapsed; + metrics.blob_cache_miss = cache_miss; + metrics.blob_read_bytes = blob_size; } + + result } /// Creates a blob reader from the cached index file. async fn cached_blob_reader( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, ) -> Result> { let Some(file_cache) = &self.file_cache else { return Ok(None); }; - let index_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin); + let index_key = IndexKey::new( + file_id.region_id(), + file_id.file_id(), + FileType::Puffin(file_id.version), + ); if file_cache.get(index_key).await.is_none() { return Ok(None); }; @@ -208,15 +308,23 @@ impl InvertedIndexApplier { /// Creates a blob reader from the remote index file. 
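And the inverted-index counterpart, sketched for two files whose per-file metrics are folded into one summary; `applier`, `index_id_a`, and `index_id_b` are assumed bindings:

```rust
let mut total = InvertedIndexApplyMetrics::default();
for index_id in [index_id_a, index_id_b] {
    let mut per_file = InvertedIndexApplyMetrics::default();
    let _output = applier.apply(index_id, None, Some(&mut per_file)).await?;
    total.merge_from(&per_file);
}
// `total` now aggregates apply_elapsed, blob_cache_miss, blob_read_bytes and the
// inverted index read metrics across both files.
```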
async fn remote_blob_reader( &self, - file_id: RegionFileId, + file_id: RegionIndexId, file_size_hint: Option, ) -> Result { + let path_factory = RegionFilePathFactory::new(self.table_dir.clone(), self.path_type); + + // Trigger background download if file cache and file size are available + trigger_index_background_download( + self.file_cache.as_ref(), + &file_id, + file_size_hint, + &path_factory, + &self.store, + ); + let puffin_manager = self .puffin_manager_factory - .build( - self.store.clone(), - RegionFilePathFactory::new(self.table_dir.clone(), self.path_type), - ) + .build(self.store.clone(), path_factory) .with_puffin_metadata_cache(self.puffin_metadata_cache.clone()); puffin_manager @@ -254,6 +362,7 @@ mod tests { use store_api::storage::FileId; use super::*; + use crate::sst::index::RegionFileId; #[tokio::test] async fn test_index_applier_apply_basic() { @@ -261,13 +370,14 @@ mod tests { PuffinManagerFactory::new_for_test_async("test_index_applier_apply_basic_").await; let object_store = ObjectStore::new(Memory::default()).unwrap().finish(); let file_id = RegionFileId::new(0.into(), FileId::random()); + let index_id = RegionIndexId::new(file_id, 0); let table_dir = "table_dir".to_string(); let puffin_manager = puffin_manager_factory.build( object_store.clone(), RegionFilePathFactory::new(table_dir.clone(), PathType::Bare), ); - let mut writer = puffin_manager.writer(&file_id).await.unwrap(); + let mut writer = puffin_manager.writer(&index_id).await.unwrap(); writer .put_blob( INDEX_BLOB_TYPE, @@ -281,7 +391,7 @@ mod tests { let mut mock_index_applier = MockIndexApplier::new(); mock_index_applier.expect_memory_usage().returning(|| 100); - mock_index_applier.expect_apply().returning(|_, _| { + mock_index_applier.expect_apply().returning(|_, _, _| { Ok(ApplyOutput { matched_segment_ids: Bitmap::new_bitvec(), total_row_count: 100, @@ -297,7 +407,7 @@ mod tests { puffin_manager_factory, Default::default(), ); - let output = sst_index_applier.apply(file_id, None).await.unwrap(); + let output = sst_index_applier.apply(index_id, None, None).await.unwrap(); assert_eq!( output, ApplyOutput { @@ -315,13 +425,14 @@ mod tests { .await; let object_store = ObjectStore::new(Memory::default()).unwrap().finish(); let file_id = RegionFileId::new(0.into(), FileId::random()); + let index_id = RegionIndexId::new(file_id, 0); let table_dir = "table_dir".to_string(); let puffin_manager = puffin_manager_factory.build( object_store.clone(), RegionFilePathFactory::new(table_dir.clone(), PathType::Bare), ); - let mut writer = puffin_manager.writer(&file_id).await.unwrap(); + let mut writer = puffin_manager.writer(&index_id).await.unwrap(); writer .put_blob( "invalid_blob_type", @@ -345,7 +456,7 @@ mod tests { puffin_manager_factory, Default::default(), ); - let res = sst_index_applier.apply(file_id, None).await; + let res = sst_index_applier.apply(index_id, None, None).await; assert!(format!("{:?}", res.unwrap_err()).contains("Blob not found")); } } diff --git a/src/mito2/src/sst/index/inverted_index/creator.rs b/src/mito2/src/sst/index/inverted_index/creator.rs index f31cfaf1dc..386ee11b9b 100644 --- a/src/mito2/src/sst/index/inverted_index/creator.rs +++ b/src/mito2/src/sst/index/inverted_index/creator.rs @@ -466,7 +466,7 @@ mod tests { use crate::cache::index::inverted_index::InvertedIndexCache; use crate::metrics::CACHE_BYTES; use crate::read::BatchColumn; - use crate::sst::file::RegionFileId; + use crate::sst::file::{RegionFileId, RegionIndexId}; use 
crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder; use crate::sst::index::puffin_manager::PuffinManagerFactory; @@ -591,7 +591,8 @@ mod tests { ); let sst_file_id = RegionFileId::new(region_metadata.region_id, sst_file_id); - let mut writer = puffin_manager.writer(&sst_file_id).await.unwrap(); + let index_id = RegionIndexId::new(sst_file_id, 0); + let mut writer = puffin_manager.writer(&index_id).await.unwrap(); let (row_count, _) = creator.finish(&mut writer).await.unwrap(); assert_eq!(row_count, rows.len() * segment_row_count); writer.finish().await.unwrap(); @@ -615,7 +616,7 @@ mod tests { .unwrap(); Box::pin(async move { applier - .apply(sst_file_id, None) + .apply(index_id, None, None) .await .unwrap() .matched_segment_ids diff --git a/src/mito2/src/sst/index/puffin_manager.rs b/src/mito2/src/sst/index/puffin_manager.rs index 3f8d3f8819..edff8aab58 100644 --- a/src/mito2/src/sst/index/puffin_manager.rs +++ b/src/mito2/src/sst/index/puffin_manager.rs @@ -32,14 +32,14 @@ use crate::metrics::{ INDEX_PUFFIN_FLUSH_OP_TOTAL, INDEX_PUFFIN_READ_BYTES_TOTAL, INDEX_PUFFIN_READ_OP_TOTAL, INDEX_PUFFIN_WRITE_BYTES_TOTAL, INDEX_PUFFIN_WRITE_OP_TOTAL, StagerMetrics, }; -use crate::sst::file::RegionFileId; +use crate::sst::file::RegionIndexId; use crate::sst::index::store::{self, InstrumentedStore}; type InstrumentedRangeReader = store::InstrumentedRangeReader<'static>; type InstrumentedAsyncWrite = store::InstrumentedAsyncWrite<'static, FuturesAsyncWriter>; pub(crate) type SstPuffinManager = - FsPuffinManager>, ObjectStorePuffinFileAccessor>; + FsPuffinManager>, ObjectStorePuffinFileAccessor>; pub(crate) type SstPuffinReader = ::Reader; pub(crate) type SstPuffinWriter = ::Writer; pub(crate) type SstPuffinBlob = ::Blob; @@ -52,7 +52,7 @@ const STAGING_DIR: &str = "staging"; #[derive(Clone)] pub struct PuffinManagerFactory { /// The stager used by the puffin manager. - stager: Arc>, + stager: Arc>, /// The size of the write buffer used to create object store. 
write_buffer_size: Option, @@ -92,7 +92,7 @@ impl PuffinManagerFactory { SstPuffinManager::new(self.stager.clone(), puffin_file_accessor) } - pub(crate) async fn purge_stager(&self, file_id: RegionFileId) -> Result<()> { + pub(crate) async fn purge_stager(&self, file_id: RegionIndexId) -> Result<()> { self.stager .purge(&file_id) .await @@ -136,16 +136,22 @@ impl ObjectStorePuffinFileAccessor { path_provider, } } + + pub fn store(&self) -> &InstrumentedStore { + &self.object_store + } } #[async_trait] impl PuffinFileAccessor for ObjectStorePuffinFileAccessor { type Reader = InstrumentedRangeReader; type Writer = InstrumentedAsyncWrite; - type FileHandle = RegionFileId; + type FileHandle = RegionIndexId; - async fn reader(&self, handle: &RegionFileId) -> PuffinResult { - let file_path = self.path_provider.build_index_file_path(*handle); + async fn reader(&self, handle: &RegionIndexId) -> PuffinResult { + let file_path = self + .path_provider + .build_index_file_path_with_version(*handle); self.object_store .range_reader( &file_path, @@ -157,8 +163,10 @@ impl PuffinFileAccessor for ObjectStorePuffinFileAccessor { .context(puffin_error::ExternalSnafu) } - async fn writer(&self, handle: &RegionFileId) -> PuffinResult { - let file_path = self.path_provider.build_index_file_path(*handle); + async fn writer(&self, handle: &RegionIndexId) -> PuffinResult { + let file_path = self + .path_provider + .build_index_file_path_with_version(*handle); self.object_store .writer( &file_path, @@ -184,7 +192,7 @@ mod tests { use store_api::storage::FileId; use super::*; - use crate::sst::file::RegionFileId; + use crate::sst::file::{RegionFileId, RegionIndexId}; struct TestFilePathProvider; @@ -193,6 +201,10 @@ mod tests { file_id.file_id().to_string() } + fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String { + index_id.file_id.file_id().to_string() + } + fn build_sst_file_path(&self, file_id: RegionFileId) -> String { file_id.file_id().to_string() } @@ -206,7 +218,7 @@ mod tests { let object_store = ObjectStore::new(Memory::default()).unwrap().finish(); let manager = factory.build(object_store, TestFilePathProvider); - let file_id = RegionFileId::new(0.into(), FileId::random()); + let file_id = RegionIndexId::new(RegionFileId::new(0.into(), FileId::random()), 0); let blob_key = "blob-key"; let dir_key = "dir-key"; let raw_data = b"hello world!"; @@ -245,7 +257,7 @@ mod tests { let bs = blob_reader.read(0..meta.content_length).await.unwrap(); assert_eq!(&*bs, raw_data); - let dir_guard = reader.dir(dir_key).await.unwrap(); + let (dir_guard, _metrics) = reader.dir(dir_key).await.unwrap(); let file = dir_guard.path().join("hello"); let data = tokio::fs::read(file).await.unwrap(); assert_eq!(data, raw_data); diff --git a/src/mito2/src/sst/index/store.rs b/src/mito2/src/sst/index/store.rs index f37fdc5c7a..1662c6d876 100644 --- a/src/mito2/src/sst/index/store.rs +++ b/src/mito2/src/sst/index/store.rs @@ -49,6 +49,10 @@ impl InstrumentedStore { } } + pub fn store(&self) -> &ObjectStore { + &self.object_store + } + /// Set the size of the write buffer. 
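The accessor changes above all hinge on the same key shape: a `RegionIndexId` identifies both the SST and the index version, and cache lookups use `FileType::Puffin(version)`. A condensed sketch of that pattern, with `region_id`, `file_id`, and `file_cache` assumed:

```rust
// Version 1 of the index never collides with the original (version 0) puffin file
// in the write-through file cache, because the version is part of the cache key.
let index_id = RegionIndexId::new(RegionFileId::new(region_id, file_id), 1);
let index_key = IndexKey::new(
    index_id.region_id(),
    index_id.file_id(),
    FileType::Puffin(index_id.version),
);
let cached_locally = file_cache.get(index_key).await.is_some();
```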
pub fn with_write_buffer_size(mut self, write_buffer_size: Option) -> Self { self.write_buffer_size = write_buffer_size.filter(|&size| size > 0); diff --git a/src/mito2/src/sst/location.rs b/src/mito2/src/sst/location.rs index 8b9fa9f88a..f3d9e1bdeb 100644 --- a/src/mito2/src/sst/location.rs +++ b/src/mito2/src/sst/location.rs @@ -20,7 +20,7 @@ use store_api::region_request::PathType; use store_api::storage::{FileId, RegionId}; use crate::error::UnexpectedSnafu; -use crate::sst::file::RegionFileId; +use crate::sst::file::{RegionFileId, RegionIndexId}; /// Generate region dir from table_dir, region_id and path_type pub fn region_dir_from_table_dir( @@ -46,14 +46,68 @@ pub fn sst_file_path(table_dir: &str, region_file_id: RegionFileId, path_type: P ) } -pub fn index_file_path( +pub fn index_file_path(table_dir: &str, index_id: RegionIndexId, path_type: PathType) -> String { + let region_dir = region_dir_from_table_dir(table_dir, index_id.file_id.region_id(), path_type); + let index_dir = util::join_dir(®ion_dir, "index"); + + let filename = if index_id.version == 0 { + format!("{}.puffin", index_id.file_id.file_id()) + } else { + format!("{}.{}.puffin", index_id.file_id.file_id(), index_id.version) + }; + + util::join_path(&index_dir, &filename) +} + +/// Legacy function for backward compatibility - creates index file path using RegionFileId with version 0 +pub fn index_file_path_legacy( table_dir: &str, region_file_id: RegionFileId, path_type: PathType, ) -> String { - let region_dir = region_dir_from_table_dir(table_dir, region_file_id.region_id(), path_type); - let index_dir = util::join_dir(®ion_dir, "index"); - util::join_path(&index_dir, &format!("{}.puffin", region_file_id.file_id())) + let index_id = RegionIndexId::new(region_file_id, 0); + index_file_path(table_dir, index_id, path_type) +} + +/// Parse file ID and version from index filename +pub fn parse_index_file_info(filepath: &str) -> crate::error::Result<(FileId, u64)> { + let filename = filepath.rsplit('/').next().context(UnexpectedSnafu { + reason: format!("invalid file path: {}", filepath), + })?; + let parts: Vec<&str> = filename.split('.').collect(); + + if parts.len() == 2 && parts[1] == "puffin" { + // Legacy format: {file_id}.puffin (version 0) + let file_id = parts[0]; + FileId::parse_str(file_id).map(|id| (id, 0)).map_err(|e| { + UnexpectedSnafu { + reason: format!("invalid file id: {}, err: {}", file_id, e), + } + .build() + }) + } else if parts.len() == 3 && parts[2] == "puffin" { + // New format: {file_id}.{version}.puffin + let file_id = parts[0]; + let version = parts[1].parse::().map_err(|_| { + UnexpectedSnafu { + reason: format!("invalid version in file name: {}", filename), + } + .build() + })?; + FileId::parse_str(file_id) + .map(|id| (id, version)) + .map_err(|e| { + UnexpectedSnafu { + reason: format!("invalid file id: {}, err: {}", file_id, e), + } + .build() + }) + } else { + UnexpectedSnafu { + reason: format!("invalid index file name: {}", filename), + } + .fail() + } } /// Get RegionFileId from sst or index filename @@ -111,17 +165,59 @@ mod tests { fn test_index_file_path() { let file_id = FileId::random(); let region_file_id = RegionFileId::new(RegionId::new(1, 2), file_id); + let index_id = RegionIndexId::new(region_file_id, 0); assert_eq!( - index_file_path("table_dir", region_file_id, PathType::Bare), + index_file_path("table_dir", index_id, PathType::Bare), format!("table_dir/1_0000000002/index/{}.puffin", file_id) ); assert_eq!( - index_file_path("table_dir", region_file_id, PathType::Data), + 
index_file_path("table_dir", index_id, PathType::Data), format!("table_dir/1_0000000002/data/index/{}.puffin", file_id) ); assert_eq!( - index_file_path("table_dir", region_file_id, PathType::Metadata), + index_file_path("table_dir", index_id, PathType::Metadata), format!("table_dir/1_0000000002/metadata/index/{}.puffin", file_id) ); } + + #[test] + fn test_index_file_path_versioned() { + let file_id = FileId::random(); + let region_file_id = RegionFileId::new(RegionId::new(1, 2), file_id); + let index_id_v1 = RegionIndexId::new(region_file_id, 1); + let index_id_v2 = RegionIndexId::new(region_file_id, 2); + + assert_eq!( + index_file_path("table_dir", index_id_v1, PathType::Bare), + format!("table_dir/1_0000000002/index/{}.1.puffin", file_id) + ); + assert_eq!( + index_file_path("table_dir", index_id_v2, PathType::Bare), + format!("table_dir/1_0000000002/index/{}.2.puffin", file_id) + ); + } + + #[test] + fn test_parse_index_file_info() { + // Test legacy format + let file_id = FileId::random(); + let result = + parse_index_file_info(&format!("table_dir/1_0000000002/index/{file_id}.puffin")) + .unwrap(); + assert_eq!(result.0.to_string(), file_id.to_string()); + assert_eq!(result.1, 0); + + // Test versioned format + let result = + parse_index_file_info(&format!("table_dir/1_0000000002/index/{file_id}.1.puffin")) + .unwrap(); + assert_eq!(result.0.to_string(), file_id.to_string()); + assert_eq!(result.1, 1); + + let result = + parse_index_file_info(&format!("table_dir/1_0000000002/index/{file_id}.42.puffin")) + .unwrap(); + assert_eq!(result.0.to_string(), file_id.to_string()); + assert_eq!(result.1, 42); + } } diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 95ba0b28b3..21187bedd3 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -76,6 +76,8 @@ pub struct SstInfo { pub time_range: FileTimeRange, /// File size in bytes. pub file_size: u64, + /// Maximum uncompressed row group size in bytes. 0 if unknown. + pub max_row_group_uncompressed_size: u64, /// Number of rows. 
pub num_rows: usize, /// Number of row groups @@ -117,7 +119,7 @@ mod tests { use crate::config::IndexConfig; use crate::read::{BatchBuilder, BatchReader, FlatSource}; use crate::region::options::{IndexOptions, InvertedIndexOptions}; - use crate::sst::file::{FileHandle, FileMeta, RegionFileId}; + use crate::sst::file::{FileHandle, FileMeta, RegionFileId, RegionIndexId}; use crate::sst::file_purger::NoopFilePurger; use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplierBuilder; use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder; @@ -144,7 +146,11 @@ mod tests { impl FilePathProvider for FixedPathProvider { fn build_index_file_path(&self, _file_id: RegionFileId) -> String { - location::index_file_path(FILE_DIR, self.region_file_id, PathType::Bare) + location::index_file_path_legacy(FILE_DIR, self.region_file_id, PathType::Bare) + } + + fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String { + location::index_file_path(FILE_DIR, index_id, PathType::Bare) } fn build_sst_file_path(&self, _file_id: RegionFileId) -> String { @@ -156,7 +162,7 @@ mod tests { #[async_trait::async_trait] impl IndexerBuilder for NoopIndexBuilder { - async fn build(&self, _file_id: FileId) -> Indexer { + async fn build(&self, _file_id: FileId, _index_version: u64) -> Indexer { Indexer::default() } } @@ -711,6 +717,7 @@ mod tests { metadata: metadata.clone(), row_group_size, puffin_manager, + write_cache_enabled: false, intermediate_manager, index_options: IndexOptions { inverted_index: InvertedIndexOptions { @@ -766,9 +773,11 @@ mod tests { time_range: info.time_range, level: 0, file_size: info.file_size, + max_row_group_uncompressed_size: info.max_row_group_uncompressed_size, available_indexes: info.index_metadata.build_available_indexes(), + indexes: info.index_metadata.build_indexes(), index_file_size: info.index_metadata.file_size, - index_file_id: None, + index_version: 0, num_row_groups: info.num_row_groups, num_rows: info.num_rows as u64, sequence: None, @@ -1089,6 +1098,7 @@ mod tests { metadata: metadata.clone(), row_group_size, puffin_manager, + write_cache_enabled: false, intermediate_manager, index_options: IndexOptions { inverted_index: InvertedIndexOptions { diff --git a/src/mito2/src/sst/parquet/file_range.rs b/src/mito2/src/sst/parquet/file_range.rs index 689a8de599..46cd53e6ea 100644 --- a/src/mito2/src/sst/parquet/file_range.rs +++ b/src/mito2/src/sst/parquet/file_range.rs @@ -45,6 +45,7 @@ use crate::sst::parquet::format::ReadFormat; use crate::sst::parquet::reader::{ FlatRowGroupReader, MaybeFilter, RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext, }; +use crate::sst::parquet::row_group::ParquetFetchMetrics; /// Checks if a row group contains delete operations by examining the min value of op_type column. /// @@ -117,11 +118,16 @@ impl FileRange { pub(crate) async fn reader( &self, selector: Option, + fetch_metrics: Option<&ParquetFetchMetrics>, ) -> Result { let parquet_reader = self .context .reader_builder - .build(self.row_group_idx, self.row_selection.clone()) + .build( + self.row_group_idx, + self.row_selection.clone(), + fetch_metrics, + ) .await?; let use_last_row_reader = if selector @@ -168,11 +174,18 @@ impl FileRange { } /// Creates a flat reader that returns RecordBatch. 
- pub(crate) async fn flat_reader(&self) -> Result { + pub(crate) async fn flat_reader( + &self, + fetch_metrics: Option<&ParquetFetchMetrics>, + ) -> Result { let parquet_reader = self .context .reader_builder - .build(self.row_group_idx, self.row_selection.clone()) + .build( + self.row_group_idx, + self.row_selection.clone(), + fetch_metrics, + ) .await?; // Compute skip_fields once for this row group diff --git a/src/mito2/src/sst/parquet/format.rs b/src/mito2/src/sst/parquet/format.rs index f6b9c4272b..4d66292696 100644 --- a/src/mito2/src/sst/parquet/format.rs +++ b/src/mito2/src/sst/parquet/format.rs @@ -40,7 +40,10 @@ use datatypes::arrow::datatypes::{SchemaRef, UInt32Type}; use datatypes::arrow::record_batch::RecordBatch; use datatypes::prelude::DataType; use datatypes::vectors::{Helper, Vector}; -use mito_codec::row_converter::{SortField, build_primary_key_codec_with_fields}; +use mito_codec::row_converter::{ + CompositeValues, PrimaryKeyCodec, SortField, build_primary_key_codec, + build_primary_key_codec_with_fields, +}; use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; use parquet::file::statistics::Statistics; use snafu::{OptionExt, ResultExt, ensure}; @@ -48,7 +51,8 @@ use store_api::metadata::{ColumnMetadata, RegionMetadataRef}; use store_api::storage::{ColumnId, SequenceNumber}; use crate::error::{ - ConvertVectorSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu, NewRecordBatchSnafu, Result, + ConvertVectorSnafu, DecodeSnafu, InvalidBatchSnafu, InvalidRecordBatchSnafu, + NewRecordBatchSnafu, Result, }; use crate::read::{Batch, BatchBuilder, BatchColumn}; use crate::sst::file::{FileMeta, FileTimeRange}; @@ -386,6 +390,13 @@ impl ReadFormat { } } + /// Enables or disables eager decoding of primary key values into batches. + pub(crate) fn set_decode_primary_key_values(&mut self, decode: bool) { + if let ReadFormat::PrimaryKey(format) = self { + format.set_decode_primary_key_values(decode); + } + } + /// Creates a sequence array to override. pub(crate) fn new_override_sequence_array(&self, length: usize) -> Option { match self { @@ -411,6 +422,8 @@ pub struct PrimaryKeyReadFormat { field_id_to_projected_index: HashMap, /// Sequence number to override the sequence read from the SST. override_sequence: Option, + /// Codec used to decode primary key values if eager decoding is enabled. + primary_key_codec: Option>, } impl PrimaryKeyReadFormat { @@ -439,6 +452,7 @@ impl PrimaryKeyReadFormat { projection_indices: format_projection.projection_indices, field_id_to_projected_index: format_projection.column_id_to_projected_index, override_sequence: None, + primary_key_codec: None, } } @@ -447,6 +461,15 @@ impl PrimaryKeyReadFormat { self.override_sequence = sequence; } + /// Enables or disables eager decoding of primary key values into batches. + pub(crate) fn set_decode_primary_key_values(&mut self, decode: bool) { + self.primary_key_codec = if decode { + Some(build_primary_key_codec(&self.metadata)) + } else { + None + }; + } + /// Gets the arrow schema of the SST file. 
/// /// This schema is computed from the region metadata but should be the same @@ -561,7 +584,12 @@ impl PrimaryKeyReadFormat { }); } - let batch = builder.build()?; + let mut batch = builder.build()?; + if let Some(codec) = &self.primary_key_codec { + let pk_values: CompositeValues = + codec.decode(batch.primary_key()).context(DecodeSnafu)?; + batch.set_pk_values(pk_values); + } batches.push_back(batch); } diff --git a/src/mito2/src/sst/parquet/metadata.rs b/src/mito2/src/sst/parquet/metadata.rs index 2cf1ecfda8..05c7aac462 100644 --- a/src/mito2/src/sst/parquet/metadata.rs +++ b/src/mito2/src/sst/parquet/metadata.rs @@ -12,17 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::result::Result as StdResult; + +use bytes::Bytes; +use futures::FutureExt; +use futures::future::BoxFuture; use object_store::ObjectStore; -use parquet::file::FOOTER_SIZE; +use parquet::arrow::async_reader::MetadataFetch; +use parquet::errors::{ParquetError, Result as ParquetResult}; use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; -use snafu::ResultExt; +use snafu::{IntoError as _, ResultExt}; use crate::error::{self, Result}; /// The estimated size of the footer and metadata need to read from the end of parquet file. const DEFAULT_PREFETCH_SIZE: u64 = 64 * 1024; -/// Load the metadata of parquet file in an async way. pub(crate) struct MetadataLoader<'a> { // An object store that supports async read object_store: ObjectStore, @@ -46,111 +51,7 @@ impl<'a> MetadataLoader<'a> { } } - /// Async load the metadata of parquet file. - /// - /// Read [DEFAULT_PREFETCH_SIZE] from the end of parquet file at first, if File Metadata is in the - /// read range, decode it and return [ParquetMetaData], otherwise, read again to get the rest of the metadata. - /// - /// Parquet File Format: - /// ```text - /// ┌───────────────────────────────────┐ - /// |4-byte magic number "PAR1" | - /// |───────────────────────────────────| - /// |Column 1 Chunk 1 + Column Metadata | - /// |Column 2 Chunk 1 + Column Metadata | - /// |... | - /// |Column N Chunk M + Column Metadata | - /// |───────────────────────────────────| - /// |File Metadata | - /// |───────────────────────────────────| - /// |4-byte length of file metadata | - /// |4-byte magic number "PAR1" | - /// └───────────────────────────────────┘ - /// ``` - /// - /// Refer to https://github.com/apache/arrow-rs/blob/093a10e46203be1a0e94ae117854701bf58d4c79/parquet/src/arrow/async_reader/metadata.rs#L55-L106 - pub async fn load(&self) -> Result { - let object_store = &self.object_store; - let path = self.file_path; - let file_size = self.get_file_size().await?; - - if file_size < FOOTER_SIZE as u64 { - return error::InvalidParquetSnafu { - file: path, - reason: "file size is smaller than footer size", - } - .fail(); - } - - // Prefetch bytes for metadata from the end and process the footer - let buffer_start = file_size.saturating_sub(DEFAULT_PREFETCH_SIZE); - let buffer = object_store - .read_with(path) - .range(buffer_start..file_size) - .await - .context(error::OpenDalSnafu)? 
- .to_vec(); - let buffer_len = buffer.len(); - - let mut footer = [0; 8]; - footer.copy_from_slice(&buffer[buffer_len - FOOTER_SIZE..]); - - let footer_tail = ParquetMetaDataReader::decode_footer_tail(&footer).map_err(|e| { - error::InvalidParquetSnafu { - file: path, - reason: format!("failed to decode footer, {e}"), - } - .build() - })?; - let metadata_len = footer_tail.metadata_length() as u64; - - if file_size - (FOOTER_SIZE as u64) < metadata_len { - return error::InvalidParquetSnafu { - file: path, - reason: format!( - "the sum of Metadata length {} and Footer size {} is larger than file size {}", - metadata_len, FOOTER_SIZE, file_size - ), - } - .fail(); - } - - if (metadata_len as usize) <= buffer_len - FOOTER_SIZE { - // The whole metadata is in the first read - let metadata_start = buffer_len - metadata_len as usize - FOOTER_SIZE; - let metadata = ParquetMetaDataReader::decode_metadata( - &buffer[metadata_start..buffer_len - FOOTER_SIZE], - ) - .map_err(|e| { - error::InvalidParquetSnafu { - file: path, - reason: format!("failed to decode metadata, {e}"), - } - .build() - })?; - Ok(metadata) - } else { - // The metadata is out of buffer, need to make a second read - let metadata_start = file_size - metadata_len - FOOTER_SIZE as u64; - let data = object_store - .read_with(path) - .range(metadata_start..(file_size - FOOTER_SIZE as u64)) - .await - .context(error::OpenDalSnafu)? - .to_vec(); - - let metadata = ParquetMetaDataReader::decode_metadata(&data).map_err(|e| { - error::InvalidParquetSnafu { - file: path, - reason: format!("failed to decode metadata, {e}"), - } - .build() - })?; - Ok(metadata) - } - } - - /// Get the size of parquet file. + /// Get the size of parquet file. If file_size is 0, stat the object store to get the size. async fn get_file_size(&self) -> Result { let file_size = match self.file_size { 0 => self @@ -163,4 +64,55 @@ impl<'a> MetadataLoader<'a> { }; Ok(file_size) } + + pub async fn load(&self) -> Result { + let path = self.file_path; + let file_size = self.get_file_size().await?; + let reader = + ParquetMetaDataReader::new().with_prefetch_hint(Some(DEFAULT_PREFETCH_SIZE as usize)); + + let fetch = ObjectStoreFetch { + object_store: &self.object_store, + file_path: self.file_path, + }; + + reader + .load_and_finish(fetch, file_size) + .await + .map_err(|e| match unbox_external_error(e) { + Ok(os_err) => error::OpenDalSnafu {}.into_error(os_err), + Err(parquet_err) => error::ReadParquetSnafu { path }.into_error(parquet_err), + }) + } +} + +/// Unpack ParquetError to get object_store::Error if possible. 
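A small sketch of driving the rewritten loader against an in-memory store; it assumes a parquet object has already been written at `path`, and that `Memory` comes from `object_store::services` as in the tests elsewhere in this patch:

```rust
use object_store::ObjectStore;
use object_store::services::Memory;

let store = ObjectStore::new(Memory::default()).unwrap().finish();
// A file_size of 0 makes the loader stat the object to discover the real size.
let loader = MetadataLoader::new(store.clone(), path, 0);
let metadata = loader.load().await?;
assert!(metadata.num_row_groups() >= 1);
```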
+fn unbox_external_error(e: ParquetError) -> StdResult { + match e { + ParquetError::External(boxed_err) => match boxed_err.downcast::() { + Ok(os_err) => Ok(*os_err), + Err(parquet_error) => Err(ParquetError::External(parquet_error)), + }, + other => Err(other), + } +} + +struct ObjectStoreFetch<'a> { + object_store: &'a ObjectStore, + file_path: &'a str, +} + +impl MetadataFetch for ObjectStoreFetch<'_> { + fn fetch(&mut self, range: std::ops::Range) -> BoxFuture<'_, ParquetResult> { + async move { + let data = self + .object_store + .read_with(self.file_path) + .range(range) + .await + .map_err(|e| ParquetError::External(Box::new(e)))?; + Ok(data.to_bytes()) + } + .boxed() + } } diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 2c77145e5b..739e9a09a0 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -52,15 +52,21 @@ use crate::metrics::{ use crate::read::prune::{PruneReader, Source}; use crate::read::{Batch, BatchReader}; use crate::sst::file::FileHandle; -use crate::sst::index::bloom_filter::applier::BloomFilterIndexApplierRef; -use crate::sst::index::fulltext_index::applier::FulltextIndexApplierRef; -use crate::sst::index::inverted_index::applier::InvertedIndexApplierRef; +use crate::sst::index::bloom_filter::applier::{ + BloomFilterIndexApplierRef, BloomFilterIndexApplyMetrics, +}; +use crate::sst::index::fulltext_index::applier::{ + FulltextIndexApplierRef, FulltextIndexApplyMetrics, +}; +use crate::sst::index::inverted_index::applier::{ + InvertedIndexApplierRef, InvertedIndexApplyMetrics, +}; use crate::sst::parquet::file_range::{ FileRangeContext, FileRangeContextRef, PreFilterMode, row_group_contains_delete, }; use crate::sst::parquet::format::{ReadFormat, need_override_sequence}; use crate::sst::parquet::metadata::MetadataLoader; -use crate::sst::parquet::row_group::InMemoryRowGroup; +use crate::sst::parquet::row_group::{InMemoryRowGroup, ParquetFetchMetrics}; use crate::sst::parquet::row_selection::RowGroupSelection; use crate::sst::parquet::stats::RowGroupPruningStats; use crate::sst::parquet::{DEFAULT_READ_BATCH_SIZE, PARQUET_METADATA_KEY}; @@ -121,6 +127,8 @@ pub struct ParquetReaderBuilder { compaction: bool, /// Mode to pre-filter columns. pre_filter_mode: PreFilterMode, + /// Whether to decode primary key values eagerly when reading primary key format SSTs. + decode_primary_key_values: bool, } impl ParquetReaderBuilder { @@ -146,6 +154,7 @@ impl ParquetReaderBuilder { flat_format: false, compaction: false, pre_filter_mode: PreFilterMode::All, + decode_primary_key_values: false, } } @@ -230,6 +239,13 @@ impl ParquetReaderBuilder { self } + /// Decodes primary key values eagerly when reading primary key format SSTs. + #[must_use] + pub(crate) fn decode_primary_key_values(mut self, decode: bool) -> Self { + self.decode_primary_key_values = decode; + self + } + /// Builds a [ParquetReader]. /// /// This needs to perform IO operation. @@ -253,7 +269,9 @@ impl ParquetReaderBuilder { let file_size = self.file_handle.meta_ref().file_size; // Loads parquet metadata of the file. - let parquet_meta = self.read_parquet_metadata(&file_path, file_size).await?; + let (parquet_meta, cache_miss) = self + .read_parquet_metadata(&file_path, file_size, &mut metrics.metadata_cache_metrics) + .await?; // Decodes region metadata. let key_value_meta = parquet_meta.file_metadata().key_value_metadata(); // Gets the metadata stored in the SST. @@ -284,6 +302,9 @@ impl ParquetReaderBuilder { self.compaction, )? 
}; + if self.decode_primary_key_values { + read_format.set_decode_primary_key_values(true); + } if need_override_sequence(&parquet_meta) { read_format .set_override_sequence(self.file_handle.meta_ref().sequence.map(|x| x.get())); @@ -305,6 +326,22 @@ impl ParquetReaderBuilder { .row_groups_to_read(&read_format, &parquet_meta, &mut metrics.filter_metrics) .await; + // Trigger background download if metadata had a cache miss and selection is not empty + if cache_miss && !selection.is_empty() { + use crate::cache::file_cache::{FileType, IndexKey}; + let index_key = IndexKey::new( + self.file_handle.region_id(), + self.file_handle.file_id().file_id(), + FileType::Parquet, + ); + self.cache_strategy.maybe_download_background( + index_key, + file_path.clone(), + self.object_store.clone(), + file_size, + ); + } + let reader_builder = RowGroupReaderBuilder { file_handle: self.file_handle.clone(), file_path, @@ -374,30 +411,40 @@ impl ParquetReaderBuilder { } /// Reads parquet metadata of specific file. + /// Returns (metadata, cache_miss_flag). async fn read_parquet_metadata( &self, file_path: &str, file_size: u64, - ) -> Result> { + cache_metrics: &mut MetadataCacheMetrics, + ) -> Result<(Arc, bool)> { + let start = Instant::now(); let _t = READ_STAGE_ELAPSED .with_label_values(&["read_parquet_metadata"]) .start_timer(); let file_id = self.file_handle.file_id(); - // Tries to get from global cache. - if let Some(metadata) = self.cache_strategy.get_parquet_meta_data(file_id).await { - return Ok(metadata); + // Tries to get from cache with metrics tracking. + if let Some(metadata) = self + .cache_strategy + .get_parquet_meta_data(file_id, cache_metrics) + .await + { + cache_metrics.metadata_load_cost += start.elapsed(); + return Ok((metadata, false)); } // Cache miss, load metadata directly. let metadata_loader = MetadataLoader::new(self.object_store.clone(), file_path, file_size); let metadata = metadata_loader.load().await?; + let metadata = Arc::new(metadata); // Cache the metadata. self.cache_strategy .put_parquet_meta_data(file_id, metadata.clone()); - Ok(metadata) + cache_metrics.metadata_load_cost += start.elapsed(); + Ok((metadata, true)) } /// Computes row groups to read, along with their respective row selections. @@ -527,7 +574,11 @@ impl ParquetReaderBuilder { // Slow path: apply the index from the file. let file_size_hint = self.file_handle.meta_ref().index_file_size(); let apply_res = index_applier - .apply_fine(self.file_handle.file_id(), Some(file_size_hint)) + .apply_fine( + self.file_handle.index_id(), + Some(file_size_hint), + metrics.fulltext_index_apply_metrics.as_mut(), + ) .await; let selection = match apply_res { Ok(Some(res)) => { @@ -595,13 +646,17 @@ impl ParquetReaderBuilder { // Slow path: apply the index from the file. 
let file_size_hint = self.file_handle.meta_ref().index_file_size(); let apply_res = index_applier - .apply(self.file_handle.file_id(), Some(file_size_hint)) + .apply( + self.file_handle.index_id(), + Some(file_size_hint), + metrics.inverted_index_apply_metrics.as_mut(), + ) .await; let selection = match apply_res { - Ok(output) => RowGroupSelection::from_inverted_index_apply_output( + Ok(apply_output) => RowGroupSelection::from_inverted_index_apply_output( row_group_size, num_row_groups, - output, + apply_output, ), Err(err) => { handle_index_error!(err, self.file_handle, INDEX_TYPE_INVERTED); @@ -670,7 +725,12 @@ impl ParquetReaderBuilder { ) }); let apply_res = index_applier - .apply(self.file_handle.file_id(), Some(file_size_hint), rgs) + .apply( + self.file_handle.index_id(), + Some(file_size_hint), + rgs, + metrics.bloom_filter_apply_metrics.as_mut(), + ) .await; let mut selection = match apply_res { Ok(apply_output) => { @@ -748,7 +808,12 @@ impl ParquetReaderBuilder { ) }); let apply_res = index_applier - .apply_coarse(self.file_handle.file_id(), Some(file_size_hint), rgs) + .apply_coarse( + self.file_handle.index_id(), + Some(file_size_hint), + rgs, + metrics.fulltext_index_apply_metrics.as_mut(), + ) .await; let mut selection = match apply_res { Ok(Some(apply_output)) => { @@ -892,7 +957,7 @@ fn all_required_row_groups_searched( } /// Metrics of filtering rows groups and rows. -#[derive(Debug, Default, Clone, Copy)] +#[derive(Debug, Default, Clone)] pub(crate) struct ReaderFilterMetrics { /// Number of row groups before filtering. pub(crate) rg_total: usize, @@ -915,6 +980,13 @@ pub(crate) struct ReaderFilterMetrics { pub(crate) rows_bloom_filtered: usize, /// Number of rows filtered by precise filter. pub(crate) rows_precise_filtered: usize, + + /// Optional metrics for inverted index applier. + pub(crate) inverted_index_apply_metrics: Option, + /// Optional metrics for bloom filter index applier. + pub(crate) bloom_filter_apply_metrics: Option, + /// Optional metrics for fulltext index applier. + pub(crate) fulltext_index_apply_metrics: Option, } impl ReaderFilterMetrics { @@ -931,6 +1003,23 @@ impl ReaderFilterMetrics { self.rows_inverted_filtered += other.rows_inverted_filtered; self.rows_bloom_filtered += other.rows_bloom_filtered; self.rows_precise_filtered += other.rows_precise_filtered; + + // Merge optional applier metrics + if let Some(other_metrics) = &other.inverted_index_apply_metrics { + self.inverted_index_apply_metrics + .get_or_insert_with(Default::default) + .merge_from(other_metrics); + } + if let Some(other_metrics) = &other.bloom_filter_apply_metrics { + self.bloom_filter_apply_metrics + .get_or_insert_with(Default::default) + .merge_from(other_metrics); + } + if let Some(other_metrics) = &other.fulltext_index_apply_metrics { + self.fulltext_index_apply_metrics + .get_or_insert_with(Default::default) + .merge_from(other_metrics); + } } /// Reports metrics. @@ -987,6 +1076,64 @@ impl ReaderFilterMetrics { } } +/// Metrics for parquet metadata cache operations. +#[derive(Default, Clone, Copy)] +pub(crate) struct MetadataCacheMetrics { + /// Number of memory cache hits for parquet metadata. + pub(crate) mem_cache_hit: usize, + /// Number of file cache hits for parquet metadata. + pub(crate) file_cache_hit: usize, + /// Number of cache misses for parquet metadata. + pub(crate) cache_miss: usize, + /// Duration to load parquet metadata. 
+ pub(crate) metadata_load_cost: Duration, +} + +impl std::fmt::Debug for MetadataCacheMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let Self { + mem_cache_hit, + file_cache_hit, + cache_miss, + metadata_load_cost, + } = self; + + if self.is_empty() { + return write!(f, "{{}}"); + } + write!(f, "{{")?; + + write!(f, "\"metadata_load_cost\":\"{:?}\"", metadata_load_cost)?; + + if *mem_cache_hit > 0 { + write!(f, ", \"mem_cache_hit\":{}", mem_cache_hit)?; + } + if *file_cache_hit > 0 { + write!(f, ", \"file_cache_hit\":{}", file_cache_hit)?; + } + if *cache_miss > 0 { + write!(f, ", \"cache_miss\":{}", cache_miss)?; + } + + write!(f, "}}") + } +} + +impl MetadataCacheMetrics { + /// Returns true if the metrics are empty (contain no meaningful data). + pub(crate) fn is_empty(&self) -> bool { + self.metadata_load_cost.is_zero() + } + + /// Adds `other` metrics to this metrics. + pub(crate) fn merge_from(&mut self, other: &MetadataCacheMetrics) { + self.mem_cache_hit += other.mem_cache_hit; + self.file_cache_hit += other.file_cache_hit; + self.cache_miss += other.cache_miss; + self.metadata_load_cost += other.metadata_load_cost; + } +} + /// Parquet reader metrics. #[derive(Debug, Default, Clone)] pub struct ReaderMetrics { @@ -1002,6 +1149,10 @@ pub struct ReaderMetrics { pub(crate) num_batches: usize, /// Number of rows read. pub(crate) num_rows: usize, + /// Metrics for parquet metadata cache. + pub(crate) metadata_cache_metrics: MetadataCacheMetrics, + /// Optional metrics for page/row group fetch operations. + pub(crate) fetch_metrics: Option>, } impl ReaderMetrics { @@ -1013,6 +1164,15 @@ impl ReaderMetrics { self.num_record_batches += other.num_record_batches; self.num_batches += other.num_batches; self.num_rows += other.num_rows; + self.metadata_cache_metrics + .merge_from(&other.metadata_cache_metrics); + if let Some(other_fetch) = &other.fetch_metrics { + if let Some(self_fetch) = &self.fetch_metrics { + self_fetch.merge_from(other_fetch); + } else { + self.fetch_metrics = Some(other_fetch.clone()); + } + } } /// Reports total rows. @@ -1067,7 +1227,10 @@ impl RowGroupReaderBuilder { &self, row_group_idx: usize, row_selection: Option, + fetch_metrics: Option<&ParquetFetchMetrics>, ) -> Result { + let fetch_start = Instant::now(); + let mut row_group = InMemoryRowGroup::create( self.file_handle.region_id(), self.file_handle.file_id().file_id(), @@ -1079,12 +1242,17 @@ impl RowGroupReaderBuilder { ); // Fetches data into memory. row_group - .fetch(&self.projection, row_selection.as_ref()) + .fetch(&self.projection, row_selection.as_ref(), fetch_metrics) .await .context(ReadParquetSnafu { path: &self.file_path, })?; + // Record total fetch elapsed time. + if let Some(metrics) = fetch_metrics { + metrics.data.lock().unwrap().total_fetch_elapsed += fetch_start.elapsed(); + } + // Builds the parquet reader. // Now the row selection is None. ParquetRecordBatchReader::try_new_with_row_groups( @@ -1228,6 +1396,8 @@ pub struct ParquetReader { selection: RowGroupSelection, /// Reader of current row group. reader_state: ReaderState, + /// Metrics for tracking row group fetch operations. + fetch_metrics: ParquetFetchMetrics, } #[async_trait] @@ -1247,7 +1417,11 @@ impl BatchReader for ParquetReader { let parquet_reader = self .context .reader_builder() - .build(row_group_idx, Some(row_selection)) + .build( + row_group_idx, + Some(row_selection), + Some(&self.fetch_metrics), + ) .await?; // Resets the parquet reader. 
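The fetch metrics introduced above are shared across row-group reads through what is presumably an Option<Arc<ParquetFetchMetrics>> (the generic parameters did not survive this rendering); because the counters sit behind a Mutex, both recording and merging go through &self. Below is a minimal standalone sketch of that interior-mutability pattern, using illustrative names rather than the repository's types:

use std::sync::{Arc, Mutex};
use std::time::Duration;

#[derive(Default, Debug, Clone)]
struct FetchCounters {
    cache_miss: usize,
    page_size_needed: u64,
    total_fetch_elapsed: Duration,
}

#[derive(Default)]
struct FetchMetrics {
    // Interior mutability: readers record through a shared `&self`.
    data: Mutex<FetchCounters>,
}

impl FetchMetrics {
    // Accumulates another handle's counters into this one.
    fn merge_from(&self, other: &FetchMetrics) {
        let other = other.data.lock().unwrap().clone();
        let mut data = self.data.lock().unwrap();
        data.cache_miss += other.cache_miss;
        data.page_size_needed += other.page_size_needed;
        data.total_fetch_elapsed += other.total_fetch_elapsed;
    }
}

fn main() {
    let shared = Arc::new(FetchMetrics::default());
    {
        // One row-group fetch records a miss and the bytes it actually needed.
        let mut data = shared.data.lock().unwrap();
        data.cache_miss += 1;
        data.page_size_needed += 4096;
        data.total_fetch_elapsed += Duration::from_millis(3);
    }
    // Roll the shared handle up into a per-query total.
    let totals = FetchMetrics::default();
    totals.merge_from(&shared);
    println!("{:?}", totals.data.lock().unwrap());
}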
@@ -1303,11 +1477,12 @@ impl ParquetReader { context: FileRangeContextRef, mut selection: RowGroupSelection, ) -> Result { + let fetch_metrics = ParquetFetchMetrics::default(); // No more items in current row group, reads next row group. let reader_state = if let Some((row_group_idx, row_selection)) = selection.pop_first() { let parquet_reader = context .reader_builder() - .build(row_group_idx, Some(row_selection)) + .build(row_group_idx, Some(row_selection), Some(&fetch_metrics)) .await?; // Compute skip_fields once for this row group let skip_fields = context.should_skip_fields(row_group_idx); @@ -1324,6 +1499,7 @@ impl ParquetReader { context, selection, reader_state, + fetch_metrics, }) } diff --git a/src/mito2/src/sst/parquet/row_group.rs b/src/mito2/src/sst/parquet/row_group.rs index d10526057d..b8baf7960f 100644 --- a/src/mito2/src/sst/parquet/row_group.rs +++ b/src/mito2/src/sst/parquet/row_group.rs @@ -35,6 +35,175 @@ use crate::cache::{CacheStrategy, PageKey, PageValue}; use crate::metrics::{READ_STAGE_ELAPSED, READ_STAGE_FETCH_PAGES}; use crate::sst::parquet::helper::{MERGE_GAP, fetch_byte_ranges}; +/// Inner data for ParquetFetchMetrics. +#[derive(Default, Debug, Clone)] +pub struct ParquetFetchMetricsData { + /// Number of page cache hits. + pub page_cache_hit: usize, + /// Number of write cache hits. + pub write_cache_hit: usize, + /// Number of cache misses. + pub cache_miss: usize, + /// Number of pages to fetch from mem cache. + pub pages_to_fetch_mem: usize, + /// Total size in bytes of pages to fetch from mem cache. + pub page_size_to_fetch_mem: u64, + /// Number of pages to fetch from write cache. + pub pages_to_fetch_write_cache: usize, + /// Total size in bytes of pages to fetch from write cache. + pub page_size_to_fetch_write_cache: u64, + /// Number of pages to fetch from store. + pub pages_to_fetch_store: usize, + /// Total size in bytes of pages to fetch from store. + pub page_size_to_fetch_store: u64, + /// Total size in bytes of pages actually returned. + pub page_size_needed: u64, + /// Elapsed time fetching from write cache. + pub write_cache_fetch_elapsed: std::time::Duration, + /// Elapsed time fetching from object store. + pub store_fetch_elapsed: std::time::Duration, + /// Total elapsed time for fetching row groups. + pub total_fetch_elapsed: std::time::Duration, +} + +impl ParquetFetchMetricsData { + /// Returns true if the metrics are empty (contain no meaningful data). + fn is_empty(&self) -> bool { + self.total_fetch_elapsed.is_zero() + } +} + +/// Metrics for tracking page/row group fetch operations. 
+#[derive(Default)] +pub struct ParquetFetchMetrics { + pub data: std::sync::Mutex, +} + +impl std::fmt::Debug for ParquetFetchMetrics { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let data = self.data.lock().unwrap(); + if data.is_empty() { + return write!(f, "{{}}"); + } + + let ParquetFetchMetricsData { + page_cache_hit, + write_cache_hit, + cache_miss, + pages_to_fetch_mem, + page_size_to_fetch_mem, + pages_to_fetch_write_cache, + page_size_to_fetch_write_cache, + pages_to_fetch_store, + page_size_to_fetch_store, + page_size_needed, + write_cache_fetch_elapsed, + store_fetch_elapsed, + total_fetch_elapsed, + } = *data; + + write!(f, "{{")?; + + write!(f, "\"total_fetch_elapsed\":\"{:?}\"", total_fetch_elapsed)?; + + if page_cache_hit > 0 { + write!(f, ", \"page_cache_hit\":{}", page_cache_hit)?; + } + if write_cache_hit > 0 { + write!(f, ", \"write_cache_hit\":{}", write_cache_hit)?; + } + if cache_miss > 0 { + write!(f, ", \"cache_miss\":{}", cache_miss)?; + } + if pages_to_fetch_mem > 0 { + write!(f, ", \"pages_to_fetch_mem\":{}", pages_to_fetch_mem)?; + } + if page_size_to_fetch_mem > 0 { + write!(f, ", \"page_size_to_fetch_mem\":{}", page_size_to_fetch_mem)?; + } + if pages_to_fetch_write_cache > 0 { + write!( + f, + ", \"pages_to_fetch_write_cache\":{}", + pages_to_fetch_write_cache + )?; + } + if page_size_to_fetch_write_cache > 0 { + write!( + f, + ", \"page_size_to_fetch_write_cache\":{}", + page_size_to_fetch_write_cache + )?; + } + if pages_to_fetch_store > 0 { + write!(f, ", \"pages_to_fetch_store\":{}", pages_to_fetch_store)?; + } + if page_size_to_fetch_store > 0 { + write!( + f, + ", \"page_size_to_fetch_store\":{}", + page_size_to_fetch_store + )?; + } + if page_size_needed > 0 { + write!(f, ", \"page_size_needed\":{}", page_size_needed)?; + } + if !write_cache_fetch_elapsed.is_zero() { + write!( + f, + ", \"write_cache_fetch_elapsed\":\"{:?}\"", + write_cache_fetch_elapsed + )?; + } + if !store_fetch_elapsed.is_zero() { + write!(f, ", \"store_fetch_elapsed\":\"{:?}\"", store_fetch_elapsed)?; + } + + write!(f, "}}") + } +} + +impl ParquetFetchMetrics { + /// Returns true if the metrics are empty (contain no meaningful data). + pub fn is_empty(&self) -> bool { + self.data.lock().unwrap().is_empty() + } + + /// Merges metrics from another [ParquetFetchMetrics]. 
+ pub fn merge_from(&self, other: &ParquetFetchMetrics) { + let ParquetFetchMetricsData { + page_cache_hit, + write_cache_hit, + cache_miss, + pages_to_fetch_mem, + page_size_to_fetch_mem, + pages_to_fetch_write_cache, + page_size_to_fetch_write_cache, + pages_to_fetch_store, + page_size_to_fetch_store, + page_size_needed, + write_cache_fetch_elapsed, + store_fetch_elapsed, + total_fetch_elapsed, + } = *other.data.lock().unwrap(); + + let mut data = self.data.lock().unwrap(); + data.page_cache_hit += page_cache_hit; + data.write_cache_hit += write_cache_hit; + data.cache_miss += cache_miss; + data.pages_to_fetch_mem += pages_to_fetch_mem; + data.page_size_to_fetch_mem += page_size_to_fetch_mem; + data.pages_to_fetch_write_cache += pages_to_fetch_write_cache; + data.page_size_to_fetch_write_cache += page_size_to_fetch_write_cache; + data.pages_to_fetch_store += pages_to_fetch_store; + data.page_size_to_fetch_store += page_size_to_fetch_store; + data.page_size_needed += page_size_needed; + data.write_cache_fetch_elapsed += write_cache_fetch_elapsed; + data.store_fetch_elapsed += store_fetch_elapsed; + data.total_fetch_elapsed += total_fetch_elapsed; + } +} + pub(crate) struct RowGroupBase<'a> { metadata: &'a RowGroupMetaData, pub(crate) offset_index: Option<&'a [OffsetIndexMetaData]>, @@ -244,13 +413,14 @@ impl<'a> InMemoryRowGroup<'a> { &mut self, projection: &ProjectionMask, selection: Option<&RowSelection>, + metrics: Option<&ParquetFetchMetrics>, ) -> Result<()> { if let Some((selection, offset_index)) = selection.zip(self.base.offset_index) { let (fetch_ranges, page_start_offsets) = self.base .calc_sparse_read_ranges(projection, offset_index, selection); - let chunk_data = self.fetch_bytes(&fetch_ranges).await?; + let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?; // Assign sparse chunk data to base. self.base .assign_sparse_chunk(projection, chunk_data, page_start_offsets); @@ -268,7 +438,7 @@ impl<'a> InMemoryRowGroup<'a> { } // Fetch data with ranges - let chunk_data = self.fetch_bytes(&fetch_ranges).await?; + let chunk_data = self.fetch_bytes(&fetch_ranges, metrics).await?; // Assigns fetched data to base. self.base.assign_dense_chunk(projection, chunk_data); @@ -279,31 +449,74 @@ impl<'a> InMemoryRowGroup<'a> { /// Try to fetch data from the memory cache or the WriteCache, /// if not in WriteCache, fetch data from object store directly. - async fn fetch_bytes(&self, ranges: &[Range]) -> Result> { + async fn fetch_bytes( + &self, + ranges: &[Range], + metrics: Option<&ParquetFetchMetrics>, + ) -> Result> { // Now fetch page timer includes the whole time to read pages. let _timer = READ_STAGE_FETCH_PAGES.start_timer(); + let page_key = PageKey::new(self.file_id, self.row_group_idx, ranges.to_vec()); if let Some(pages) = self.cache_strategy.get_pages(&page_key) { + if let Some(metrics) = metrics { + let total_size: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.page_cache_hit += 1; + metrics_data.pages_to_fetch_mem += ranges.len(); + metrics_data.page_size_to_fetch_mem += total_size; + metrics_data.page_size_needed += total_size; + } return Ok(pages.compressed.clone()); } + // Calculate total range size for metrics. 
+ let (total_range_size, unaligned_size) = compute_total_range_size(ranges); + let key = IndexKey::new(self.region_id, self.file_id, FileType::Parquet); - let pages = match self.fetch_ranges_from_write_cache(key, ranges).await { - Some(data) => data, + let fetch_write_cache_start = metrics.map(|_| std::time::Instant::now()); + let write_cache_result = self.fetch_ranges_from_write_cache(key, ranges).await; + let pages = match write_cache_result { + Some(data) => { + if let Some(metrics) = metrics { + let elapsed = fetch_write_cache_start + .map(|start| start.elapsed()) + .unwrap_or_default(); + let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.write_cache_fetch_elapsed += elapsed; + metrics_data.write_cache_hit += 1; + metrics_data.pages_to_fetch_write_cache += ranges.len(); + metrics_data.page_size_to_fetch_write_cache += unaligned_size; + metrics_data.page_size_needed += range_size_needed; + } + data + } None => { // Fetch data from object store. let _timer = READ_STAGE_ELAPSED .with_label_values(&["cache_miss_read"]) .start_timer(); - fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges) + let start = metrics.map(|_| std::time::Instant::now()); + let data = fetch_byte_ranges(self.file_path, self.object_store.clone(), ranges) .await - .map_err(|e| ParquetError::External(Box::new(e)))? + .map_err(|e| ParquetError::External(Box::new(e)))?; + if let Some(metrics) = metrics { + let elapsed = start.map(|start| start.elapsed()).unwrap_or_default(); + let range_size_needed: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + let mut metrics_data = metrics.data.lock().unwrap(); + metrics_data.store_fetch_elapsed += elapsed; + metrics_data.cache_miss += 1; + metrics_data.pages_to_fetch_store += ranges.len(); + metrics_data.page_size_to_fetch_store += unaligned_size; + metrics_data.page_size_needed += range_size_needed; + } + data } }; // Put pages back to the cache. - let total_range_size = compute_total_range_size(ranges); let page_value = PageValue::new(pages.clone(), total_range_size); self.cache_strategy .put_pages(page_key, Arc::new(page_value)); @@ -326,17 +539,21 @@ impl<'a> InMemoryRowGroup<'a> { } /// Computes the max possible buffer size to read the given `ranges`. 
+/// Returns (aligned_size, unaligned_size) where: +/// - aligned_size: total size aligned to pooled buffer size +/// - unaligned_size: actual total size without alignment // See https://github.com/apache/opendal/blob/v0.54.0/core/src/types/read/reader.rs#L166-L192 -fn compute_total_range_size(ranges: &[Range]) -> u64 { +fn compute_total_range_size(ranges: &[Range]) -> (u64, u64) { if ranges.is_empty() { - return 0; + return (0, 0); } let gap = MERGE_GAP as u64; let mut sorted_ranges = ranges.to_vec(); sorted_ranges.sort_unstable_by(|a, b| a.start.cmp(&b.start)); - let mut total_size = 0; + let mut total_size_aligned = 0; + let mut total_size_unaligned = 0; let mut cur = sorted_ranges[0].clone(); for range in sorted_ranges.into_iter().skip(1) { @@ -345,15 +562,19 @@ fn compute_total_range_size(ranges: &[Range]) -> u64 { cur.end = cur.end.max(range.end); } else { // No overlap and the gap is too large, add current range to total and start a new one - total_size += align_to_pooled_buf_size(cur.end - cur.start); + let range_size = cur.end - cur.start; + total_size_aligned += align_to_pooled_buf_size(range_size); + total_size_unaligned += range_size; cur = range; } } // Add the last range - total_size += align_to_pooled_buf_size(cur.end - cur.start); + let range_size = cur.end - cur.start; + total_size_aligned += align_to_pooled_buf_size(range_size); + total_size_unaligned += range_size; - total_size + (total_size_aligned, total_size_unaligned) } /// Aligns the given size to the multiple of the pooled buffer size. diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index 5247e2eec8..8c03a51368 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -153,7 +153,7 @@ where metrics: &'a mut Metrics, ) -> ParquetWriter<'a, F, I, P> { let init_file = FileId::random(); - let indexer = indexer_builder.build(init_file).await; + let indexer = indexer_builder.build(init_file, 0).await; ParquetWriter { path_provider, @@ -213,11 +213,23 @@ where // convert FileMetaData to ParquetMetaData let parquet_metadata = parse_parquet_metadata(file_meta)?; + let max_row_group_uncompressed_size: u64 = parquet_metadata + .row_groups() + .iter() + .map(|rg| { + rg.columns() + .iter() + .map(|c| c.uncompressed_size() as u64) + .sum::() + }) + .max() + .unwrap_or(0); let num_series = stats.series_estimator.finish(); ssts.push(SstInfo { file_id: self.current_file, time_range, file_size, + max_row_group_uncompressed_size, num_rows: stats.num_rows, num_row_groups: parquet_metadata.num_row_groups() as u64, file_metadata: Some(Arc::new(parquet_metadata)), @@ -482,7 +494,7 @@ where .context(WriteParquetSnafu)?; self.writer = Some(arrow_writer); - let indexer = self.indexer_builder.build(self.current_file).await; + let indexer = self.indexer_builder.build(self.current_file, 0).await; self.current_indexer = Some(indexer); // safety: self.writer is assigned above diff --git a/src/mito2/src/sst/version.rs b/src/mito2/src/sst/version.rs index 6cae6ce83d..a9e71eb5d9 100644 --- a/src/mito2/src/sst/version.rs +++ b/src/mito2/src/sst/version.rs @@ -57,9 +57,28 @@ impl SstVersion { ) { for file in files_to_add { let level = file.level; + let new_index_version = file.index_version; + // If the file already exists, then we should only replace the handle when the index is outdated. 
self.levels[level as usize] .files - .insert(file.file_id, FileHandle::new(file, file_purger.clone())); + .entry(file.file_id) + .and_modify(|f| { + if *f.meta_ref() == file || f.meta_ref().is_index_up_to_date(&file) { + // same file meta or current file handle's index is up-to-date, skip adding + if f.index_id().version > new_index_version { + // what does it mean for us to see older index version? + common_telemetry::warn!( + "Adding file with older index version, existing: {:?}, new: {:?}, ignoring new file", + f.meta_ref(), + file + ); + } + } else { + // include case like old file have no index or index is outdated + *f = FileHandle::new(file.clone(), file_purger.clone()); + } + }) + .or_insert_with(|| FileHandle::new(file.clone(), file_purger.clone())); } } diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index baaa7fe343..f60f04d514 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -39,7 +39,7 @@ use common_meta::cache::{new_schema_cache, new_table_schema_cache}; use common_meta::key::{SchemaMetadataManager, SchemaMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; use common_meta::kv_backend::memory::MemoryKvBackend; -use common_telemetry::warn; +use common_telemetry::{debug, warn}; use common_test_util::temp_dir::{TempDir, create_temp_dir}; use common_wal::options::{KafkaWalOptions, WAL_OPTIONS_KEY, WalOptions}; use datatypes::arrow::array::{TimestampMillisecondArray, UInt8Array, UInt64Array}; @@ -50,6 +50,7 @@ use log_store::raft_engine::log_store::RaftEngineLogStore; use log_store::test_util::log_store_util; use moka::future::CacheBuilder; use object_store::ObjectStore; +use object_store::layers::mock::MockLayer; use object_store::manager::{ObjectStoreManager, ObjectStoreManagerRef}; use object_store::services::Fs; use rskafka::client::partition::{Compression, UnknownTopicHandling}; @@ -221,13 +222,14 @@ pub struct TestEnv { data_home: TempDir, intermediate_manager: IntermediateManager, puffin_manager: PuffinManagerFactory, - log_store: Option, + pub(crate) log_store: Option, log_store_factory: LogStoreFactory, - object_store_manager: Option, + pub(crate) object_store_manager: Option, schema_metadata_manager: SchemaMetadataManagerRef, file_ref_manager: FileReferenceManagerRef, kv_backend: KvBackendRef, partition_expr_fetcher: PartitionExprFetcherRef, + object_store_mock_layer: Option, } impl TestEnv { @@ -264,6 +266,7 @@ impl TestEnv { file_ref_manager: Arc::new(FileReferenceManager::new(None)), kv_backend, partition_expr_fetcher: noop_partition_expr_fetcher(), + object_store_mock_layer: None, } } @@ -273,6 +276,12 @@ impl TestEnv { self } + /// Sets the original `object_store_mock_layer`. + pub fn with_mock_layer(mut self, mock_layer: MockLayer) -> TestEnv { + self.object_store_mock_layer = Some(mock_layer); + self + } + pub fn get_object_store(&self) -> Option { self.object_store_manager .as_ref() @@ -287,7 +296,7 @@ impl TestEnv { self.object_store_manager.clone() } - async fn new_mito_engine(&self, config: MitoConfig) -> MitoEngine { + pub(crate) async fn new_mito_engine(&self, config: MitoConfig) -> MitoEngine { async fn create( zelf: &TestEnv, config: MitoConfig, @@ -541,37 +550,53 @@ impl TestEnv { /// Returns the log store and object store manager. 
async fn create_log_and_object_store_manager(&self) -> (LogStoreImpl, ObjectStoreManager) { + let log_store = self.create_log_store().await; + let object_store_manager = self.create_object_store_manager(); + + (log_store, object_store_manager) + } + + pub(crate) async fn create_log_store(&self) -> LogStoreImpl { let data_home = self.data_home.path(); let wal_path = data_home.join("wal"); - let object_store_manager = self.create_object_store_manager(); match &self.log_store_factory { LogStoreFactory::RaftEngine(factory) => { let log_store = factory.create_log_store(wal_path).await; - ( - LogStoreImpl::RaftEngine(Arc::new(log_store)), - object_store_manager, - ) + + LogStoreImpl::RaftEngine(Arc::new(log_store)) } LogStoreFactory::Kafka(factory) => { let log_store = factory.create_log_store().await; - ( - LogStoreImpl::Kafka(Arc::new(log_store)), - object_store_manager, - ) + LogStoreImpl::Kafka(Arc::new(log_store)) } } } - fn create_object_store_manager(&self) -> ObjectStoreManager { + pub(crate) fn create_object_store_manager(&self) -> ObjectStoreManager { let data_home = self.data_home.path(); let data_path = data_home.join("data").as_path().display().to_string(); let builder = Fs::default().root(&data_path); - let object_store = ObjectStore::new(builder).unwrap().finish(); + + let object_store = if let Some(mock_layer) = self.object_store_mock_layer.as_ref() { + debug!("create object store with mock layer"); + ObjectStore::new(builder) + .unwrap() + .layer(mock_layer.clone()) + .finish() + } else { + ObjectStore::new(builder).unwrap().finish() + }; ObjectStoreManager::new("default", object_store) } + pub(crate) fn create_in_memory_object_store_manager(&self) -> ObjectStoreManager { + let builder = object_store::services::Memory::default(); + let object_store = ObjectStore::new(builder).unwrap().finish(); + ObjectStoreManager::new("memory", object_store) + } + /// If `initial_metadata` is `Some`, creates a new manifest. If `initial_metadata` /// is `None`, opens an existing manifest and returns `None` if no such manifest. pub async fn create_manifest_manager( @@ -601,6 +626,7 @@ impl TestEnv { compress_type, checkpoint_distance, remove_file_options: Default::default(), + manifest_cache: None, }; if let Some(metadata) = initial_metadata { @@ -608,14 +634,13 @@ impl TestEnv { metadata, 0, manifest_opts, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .map(Some) } else { - RegionManifestManager::open(manifest_opts, Default::default(), Default::default()).await + RegionManifestManager::open(manifest_opts, &Default::default()).await } } @@ -630,8 +655,10 @@ impl TestEnv { capacity, None, None, + true, // enable_background_worker self.puffin_manager.clone(), self.intermediate_manager.clone(), + None, // manifest_cache ) .await .unwrap(); @@ -650,8 +677,10 @@ impl TestEnv { capacity, None, None, + true, // enable_background_worker self.puffin_manager.clone(), self.intermediate_manager.clone(), + ReadableSize::mb(0), // manifest_cache_capacity ) .await .unwrap(); @@ -996,9 +1025,15 @@ pub struct MockWriteBufferManager { should_stall: AtomicBool, memory_used: AtomicUsize, memory_active: AtomicUsize, + flush_limit: usize, } impl MockWriteBufferManager { + /// Set flush limit. + pub fn set_flush_limit(&mut self, flush_limit: usize) { + self.flush_limit = flush_limit; + } + /// Set whether to flush the engine. 
pub fn set_should_flush(&self, value: bool) { self.should_flush.store(value, Ordering::Relaxed); @@ -1040,6 +1075,10 @@ impl WriteBufferManager for MockWriteBufferManager { fn memory_usage(&self) -> usize { self.memory_used.load(Ordering::Relaxed) } + + fn flush_limit(&self) -> usize { + self.flush_limit + } } pub fn column_metadata_to_column_schema(metadata: &ColumnMetadata) -> api::v1::ColumnSchema { diff --git a/src/mito2/src/test_util/scheduler_util.rs b/src/mito2/src/test_util/scheduler_util.rs index 8e5b8b9434..9f91a51747 100644 --- a/src/mito2/src/test_util/scheduler_util.rs +++ b/src/mito2/src/test_util/scheduler_util.rs @@ -18,6 +18,7 @@ use std::sync::{Arc, Mutex}; use common_base::Plugins; use common_datasource::compression::CompressionType; +use common_memory_manager::OnExhaustedPolicy; use common_test_util::temp_dir::{TempDir, create_temp_dir}; use object_store::ObjectStore; use object_store::services::Fs; @@ -28,6 +29,7 @@ use tokio::sync::mpsc::Sender; use crate::access_layer::{AccessLayer, AccessLayerRef}; use crate::cache::CacheManager; use crate::compaction::CompactionScheduler; +use crate::compaction::memory_manager::{CompactionMemoryManager, new_compaction_memory_manager}; use crate::config::MitoConfig; use crate::error::Result; use crate::flush::FlushScheduler; @@ -100,6 +102,8 @@ impl SchedulerEnv { Arc::new(MitoConfig::default()), WorkerListener::default(), Plugins::new(), + Arc::new(new_compaction_memory_manager(0)), + OnExhaustedPolicy::default(), ) } @@ -132,10 +136,10 @@ impl SchedulerEnv { compress_type: CompressionType::Uncompressed, checkpoint_distance: 10, remove_file_options: Default::default(), + manifest_cache: None, }, - Default::default(), - Default::default(), FormatType::PrimaryKey, + &Default::default(), ) .await .unwrap(), diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index dc75a1e08c..9f7b6a0658 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs @@ -123,9 +123,11 @@ pub fn sst_file_handle_with_file_id(file_id: FileId, start_ms: i64, end_ms: i64) ), level: 0, file_size: 0, + max_row_group_uncompressed_size: 0, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, num_series: 0, diff --git a/src/mito2/src/test_util/version_util.rs b/src/mito2/src/test_util/version_util.rs index 53b28478c9..7a407ece4f 100644 --- a/src/mito2/src/test_util/version_util.rs +++ b/src/mito2/src/test_util/version_util.rs @@ -101,9 +101,11 @@ impl VersionControlBuilder { ), level: 0, file_size: 0, // We don't care file size. + max_row_group_uncompressed_size: 0, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, num_series: 0, @@ -191,9 +193,11 @@ pub(crate) fn apply_edit( ), level: 0, file_size: 0, // We don't care file size. 
+ max_row_group_uncompressed_size: 0, available_indexes: Default::default(), + indexes: Default::default(), index_file_size: 0, - index_file_id: None, + index_version: 0, num_rows: 0, num_row_groups: 0, num_series: 0, diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs index 60ac23af33..5b0e5728a5 100644 --- a/src/mito2/src/worker.rs +++ b/src/mito2/src/worker.rs @@ -19,12 +19,15 @@ mod handle_bulk_insert; mod handle_catchup; mod handle_close; mod handle_compaction; +mod handle_copy_region; mod handle_create; mod handle_drop; +mod handle_enter_staging; mod handle_flush; mod handle_manifest; mod handle_open; mod handle_rebuild_index; +mod handle_remap; mod handle_truncate; mod handle_write; @@ -38,6 +41,7 @@ use common_base::Plugins; use common_error::ext::BoxedError; use common_meta::key::SchemaMetadataManagerRef; use common_runtime::JoinHandle; +use common_stat::get_total_memory_bytes; use common_telemetry::{error, info, warn}; use futures::future::try_join_all; use object_store::manager::ObjectStoreManagerRef; @@ -55,6 +59,7 @@ use tokio::sync::{Mutex, Semaphore, mpsc, oneshot, watch}; use crate::cache::write_cache::{WriteCache, WriteCacheRef}; use crate::cache::{CacheManager, CacheManagerRef}; use crate::compaction::CompactionScheduler; +use crate::compaction::memory_manager::{CompactionMemoryManager, new_compaction_memory_manager}; use crate::config::MitoConfig; use crate::error::{self, CreateDirSnafu, JoinSnafu, Result, WorkerStoppedSnafu}; use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef}; @@ -202,6 +207,17 @@ impl WorkerGroup { .build(), ); let time_provider = Arc::new(StdTimeProvider); + let total_memory = get_total_memory_bytes(); + let total_memory = if total_memory > 0 { + total_memory as u64 + } else { + 0 + }; + let compaction_limit_bytes = config + .experimental_compaction_memory_limit + .resolve(total_memory); + let compaction_memory_manager = + Arc::new(new_compaction_memory_manager(compaction_limit_bytes)); let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job)); let workers = (0..config.num_workers) @@ -218,6 +234,7 @@ impl WorkerGroup { purge_scheduler: purge_scheduler.clone(), listener: WorkerListener::default(), cache_manager: cache_manager.clone(), + compaction_memory_manager: compaction_memory_manager.clone(), puffin_manager_factory: puffin_manager_factory.clone(), intermediate_manager: intermediate_manager.clone(), time_provider: time_provider.clone(), @@ -378,6 +395,17 @@ impl WorkerGroup { .write_cache(write_cache) .build(), ); + let total_memory = get_total_memory_bytes(); + let total_memory = if total_memory > 0 { + total_memory as u64 + } else { + 0 + }; + let compaction_limit_bytes = config + .experimental_compaction_memory_limit + .resolve(total_memory); + let compaction_memory_manager = + Arc::new(new_compaction_memory_manager(compaction_limit_bytes)); let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job)); let workers = (0..config.num_workers) .map(|id| { @@ -393,6 +421,7 @@ impl WorkerGroup { purge_scheduler: purge_scheduler.clone(), listener: WorkerListener::new(listener.clone()), cache_manager: cache_manager.clone(), + compaction_memory_manager: compaction_memory_manager.clone(), puffin_manager_factory: puffin_manager_factory.clone(), intermediate_manager: intermediate_manager.clone(), time_provider: time_provider.clone(), @@ -451,8 +480,10 @@ pub async fn write_cache_from_config( config.write_cache_size, config.write_cache_ttl, Some(config.index_cache_percent), + 
config.enable_refill_cache_on_read, puffin_manager_factory, intermediate_manager, + config.manifest_cache_size, ) .await?; Ok(Some(Arc::new(cache))) @@ -477,6 +508,7 @@ struct WorkerStarter { purge_scheduler: SchedulerRef, listener: WorkerListener, cache_manager: CacheManagerRef, + compaction_memory_manager: Arc, puffin_manager_factory: PuffinManagerFactory, intermediate_manager: IntermediateManager, time_provider: TimeProviderRef, @@ -529,9 +561,11 @@ impl WorkerStarter { self.compact_job_pool, sender.clone(), self.cache_manager.clone(), - self.config, + self.config.clone(), self.listener.clone(), self.plugins.clone(), + self.compaction_memory_manager.clone(), + self.config.experimental_compaction_on_exhausted, ), stalled_requests: StalledRequests::default(), listener: self.listener, @@ -1002,6 +1036,12 @@ impl RegionWorkerLoop { ); } } + WorkerRequest::RemapManifests(req) => { + self.handle_remap_manifests_request(req); + } + WorkerRequest::CopyRegionFrom(req) => { + self.handle_copy_region_from_request(req); + } } } @@ -1035,8 +1075,7 @@ impl RegionWorkerLoop { continue; } DdlRequest::Flush(req) => { - self.handle_flush_request(ddl.region_id, req, ddl.sender) - .await; + self.handle_flush_request(ddl.region_id, req, ddl.sender); continue; } DdlRequest::Compact(req) => { @@ -1059,6 +1098,15 @@ impl RegionWorkerLoop { .await; continue; } + DdlRequest::EnterStaging(req) => { + self.handle_enter_staging_request( + ddl.region_id, + req.partition_expr, + ddl.sender, + ) + .await; + continue; + } }; ddl.sender.send(res); @@ -1107,7 +1155,11 @@ impl RegionWorkerLoop { BackgroundNotify::RegionChange(req) => { self.handle_manifest_region_change_result(req).await } + BackgroundNotify::EnterStaging(req) => self.handle_enter_staging_result(req).await, BackgroundNotify::RegionEdit(req) => self.handle_region_edit_result(req).await, + BackgroundNotify::CopyRegionFromFinished(req) => { + self.handle_copy_region_from_finished(req) + } } } @@ -1268,6 +1320,13 @@ impl WorkerListener { } } + pub(crate) async fn on_enter_staging_result_begin(&self, _region_id: RegionId) { + #[cfg(any(test, feature = "test"))] + if let Some(listener) = &self.listener { + listener.on_enter_staging_result_begin(_region_id).await; + } + } + pub(crate) async fn on_index_build_finish(&self, _region_file_id: RegionFileId) { #[cfg(any(test, feature = "test"))] if let Some(listener) = &self.listener { diff --git a/src/mito2/src/worker/handle_alter.rs b/src/mito2/src/worker/handle_alter.rs index 39a1fa665a..37ca5de2ba 100644 --- a/src/mito2/src/worker/handle_alter.rs +++ b/src/mito2/src/worker/handle_alter.rs @@ -22,6 +22,7 @@ use common_telemetry::info; use common_telemetry::tracing::warn; use humantime_serde::re::humantime; use snafu::{ResultExt, ensure}; +use store_api::logstore::LogStore; use store_api::metadata::{ InvalidSetRegionOptionRequestSnafu, MetadataError, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, @@ -41,7 +42,7 @@ use crate::request::{DdlRequest, OptionOutputTx, SenderDdlRequest}; use crate::sst::FormatType; use crate::worker::RegionWorkerLoop; -impl RegionWorkerLoop { +impl RegionWorkerLoop { pub(crate) async fn handle_alter_request( &mut self, region_id: RegionId, @@ -113,7 +114,13 @@ impl RegionWorkerLoop { info!("Flush region: {} before alteration", region_id); // Try to submit a flush task. 
- let task = self.new_flush_task(®ion, FlushReason::Alter, None, self.config.clone()); + let task = self.new_flush_task( + ®ion, + FlushReason::Alter, + None, + self.config.clone(), + region.is_staging(), + ); if let Err(e) = self.flush_scheduler .schedule_flush(region.region_id, ®ion.version_control, task) diff --git a/src/mito2/src/worker/handle_copy_region.rs b/src/mito2/src/worker/handle_copy_region.rs new file mode 100644 index 0000000000..e929013fc6 --- /dev/null +++ b/src/mito2/src/worker/handle_copy_region.rs @@ -0,0 +1,245 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_telemetry::{debug, error, info}; +use snafu::OptionExt; +use store_api::region_engine::MitoCopyRegionFromResponse; +use store_api::storage::{FileId, RegionId}; + +use crate::error::{InvalidRequestSnafu, MissingManifestSnafu, Result}; +use crate::manifest::action::{RegionEdit, RegionMetaAction, RegionMetaActionList}; +use crate::region::{FileDescriptor, MitoRegionRef, RegionFileCopier, RegionMetadataLoader}; +use crate::request::{ + BackgroundNotify, CopyRegionFromFinished, CopyRegionFromRequest, WorkerRequest, +}; +use crate::sst::location::region_dir_from_table_dir; +use crate::worker::{RegionWorkerLoop, WorkerRequestWithTime}; + +impl RegionWorkerLoop { + pub(crate) fn handle_copy_region_from_request(&mut self, request: CopyRegionFromRequest) { + let region_id = request.region_id; + let source_region_id = request.source_region_id; + let sender = request.sender; + let region = match self.regions.writable_region(region_id) { + Ok(region) => region, + Err(e) => { + let _ = sender.send(Err(e)); + return; + } + }; + + let same_table = source_region_id.table_id() == region_id.table_id(); + if !same_table { + let _ = sender.send( + InvalidRequestSnafu { + region_id, + reason: format!("Source and target regions must be from the same table, source_region_id: {source_region_id}, target_region_id: {region_id}"), + } + .fail(), + ); + return; + } + if source_region_id == region_id { + let _ = sender.send( + InvalidRequestSnafu { + region_id, + reason: format!("Source and target regions must be different, source_region_id: {source_region_id}, target_region_id: {region_id}"), + } + .fail(), + ); + return; + } + + let region_metadata_loader = + RegionMetadataLoader::new(self.config.clone(), self.object_store_manager.clone()); + let worker_sender = self.sender.clone(); + + common_runtime::spawn_global(async move { + let (region_edit, file_ids) = match Self::copy_region_from( + ®ion, + region_metadata_loader, + source_region_id, + region_id, + request.parallelism.max(1), + ) + .await + { + Ok(region_files) => region_files, + Err(e) => { + let _ = sender.send(Err(e)); + return; + } + }; + + match region_edit { + Some(region_edit) => { + if let Err(e) = worker_sender + .send(WorkerRequestWithTime::new(WorkerRequest::Background { + region_id, + notify: BackgroundNotify::CopyRegionFromFinished( + CopyRegionFromFinished { + region_id, + edit: region_edit, + sender, + }, + ), + 
})) + .await + { + error!(e; "Failed to send copy region from finished notification to worker, region_id: {}", region_id); + } + } + None => { + let _ = sender.send(Ok(MitoCopyRegionFromResponse { + copied_file_ids: file_ids, + })); + } + } + }); + } + + pub(crate) fn handle_copy_region_from_finished(&mut self, request: CopyRegionFromFinished) { + let region_id = request.region_id; + let sender = request.sender; + let region = match self.regions.writable_region(region_id) { + Ok(region) => region, + Err(e) => { + let _ = sender.send(Err(e)); + return; + } + }; + + let copied_file_ids = request + .edit + .files_to_add + .iter() + .map(|file_meta| file_meta.file_id) + .collect(); + + region + .version_control + .apply_edit(Some(request.edit), &[], region.file_purger.clone()); + + let _ = sender.send(Ok(MitoCopyRegionFromResponse { copied_file_ids })); + } + + /// Returns the region edit and the file ids that were copied from the source region to the target region. + /// + /// If no need to copy files, returns (None, file_ids). + async fn copy_region_from( + region: &MitoRegionRef, + region_metadata_loader: RegionMetadataLoader, + source_region_id: RegionId, + target_region_id: RegionId, + parallelism: usize, + ) -> Result<(Option, Vec)> { + let table_dir = region.table_dir(); + let path_type = region.path_type(); + let region_dir = region_dir_from_table_dir(table_dir, source_region_id, path_type); + info!( + "Loading source region manifest from region dir: {region_dir}, target region: {target_region_id}" + ); + let source_region_manifest = region_metadata_loader + .load_manifest(®ion_dir, ®ion.version().options.storage) + .await? + .context(MissingManifestSnafu { + region_id: source_region_id, + })?; + let mut files_to_copy = vec![]; + let target_region_manifest = region.manifest_ctx.manifest().await; + let file_ids = source_region_manifest + .files + .keys() + .cloned() + .collect::>(); + debug!( + "source region files: {:?}, source region id: {}", + source_region_manifest.files, source_region_id + ); + for (file_id, file_meta) in &source_region_manifest.files { + if !target_region_manifest.files.contains_key(file_id) { + let mut new_file_meta = file_meta.clone(); + new_file_meta.region_id = target_region_id; + files_to_copy.push(new_file_meta); + } + } + if files_to_copy.is_empty() { + return Ok((None, file_ids)); + } + + let file_descriptors = files_to_copy + .iter() + .flat_map(|file_meta| { + if file_meta.exists_index() { + let region_index_id = file_meta.index_id(); + let file_id = region_index_id.file_id.file_id(); + let version = region_index_id.version; + let file_size = file_meta.file_size; + let index_file_size = file_meta.index_file_size(); + vec![ + FileDescriptor::Data { + file_id: file_meta.file_id, + size: file_size, + }, + FileDescriptor::Index { + file_id, + version, + size: index_file_size, + }, + ] + } else { + let file_size = file_meta.file_size; + vec![FileDescriptor::Data { + file_id: file_meta.file_id, + size: file_size, + }] + } + }) + .collect(); + debug!("File descriptors to copy: {:?}", file_descriptors); + let copier = RegionFileCopier::new(region.access_layer()); + // TODO(weny): ensure the target region is empty. 
+ copier + .copy_files( + source_region_id, + target_region_id, + file_descriptors, + parallelism, + ) + .await?; + let edit = RegionEdit { + files_to_add: files_to_copy, + files_to_remove: vec![], + timestamp_ms: Some(chrono::Utc::now().timestamp_millis()), + compaction_time_window: None, + flushed_entry_id: None, + flushed_sequence: None, + committed_sequence: None, + }; + let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit.clone())); + info!("Applying {edit:?} to region {target_region_id}, reason: CopyRegionFrom"); + let version = region + .manifest_ctx + .manifest_manager + .write() + .await + .update(action_list, false) + .await?; + info!( + "Successfully update manifest version to {version}, region: {target_region_id}, reason: CopyRegionFrom" + ); + + Ok((Some(edit), file_ids)) + } +} diff --git a/src/mito2/src/worker/handle_drop.rs b/src/mito2/src/worker/handle_drop.rs index 84337bd9d0..9d36507407 100644 --- a/src/mito2/src/worker/handle_drop.rs +++ b/src/mito2/src/worker/handle_drop.rs @@ -51,6 +51,7 @@ where // Writes dropping marker // We rarely drop a region so we still operate in the worker loop. let region_dir = region.access_layer.build_region_dir(region_id); + let table_dir = region.access_layer.table_dir().to_string(); let marker_path = join_path(®ion_dir, DROPPING_MARKER_FILE); region .access_layer @@ -102,13 +103,14 @@ where let dropping_regions = self.dropping_regions.clone(); let listener = self.listener.clone(); let intm_manager = self.intermediate_manager.clone(); + let cache_manager = self.cache_manager.clone(); common_runtime::spawn_global(async move { let gc_duration = listener .on_later_drop_begin(region_id) .unwrap_or(Duration::from_secs(GC_TASK_INTERVAL_SEC)); let removed = later_drop_task( region_id, - region_dir, + region_dir.clone(), object_store, dropping_regions, gc_duration, @@ -117,6 +119,16 @@ where if let Err(err) = intm_manager.prune_region_dir(®ion_id).await { warn!(err; "Failed to prune intermediate region directory, region_id: {}", region_id); } + + // Clean manifest cache for the region + if let Some(write_cache) = cache_manager.write_cache() + && let Some(manifest_cache) = write_cache.manifest_cache() + { + // We pass the table dir so we can remove the table dir in manifest cache + // when the last region in the same host is dropped. + manifest_cache.clean_manifests(&table_dir).await; + } + listener.on_later_drop_end(region_id, removed); }); diff --git a/src/mito2/src/worker/handle_enter_staging.rs b/src/mito2/src/worker/handle_enter_staging.rs new file mode 100644 index 0000000000..6dee72525e --- /dev/null +++ b/src/mito2/src/worker/handle_enter_staging.rs @@ -0,0 +1,249 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
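The enter-staging handler added below follows the same flush-then-park pattern as the alter path: if the region still holds memtable data, it schedules a flush and queues the DDL as a pending request instead of applying it immediately. A rough standalone sketch of that control flow, with made-up types rather than the worker's real ones (the staging-mode and partition-expr checks are omitted):

use std::collections::VecDeque;

enum DdlRequest {
    EnterStaging { partition_expr: String },
}

struct Worker {
    memtable_bytes: usize,
    pending_ddls: VecDeque<DdlRequest>,
}

impl Worker {
    // A non-empty memtable forces a flush first; the DDL is parked until
    // that flush finishes and is replayed afterwards.
    fn handle_enter_staging(&mut self, partition_expr: String) {
        if self.memtable_bytes > 0 {
            self.schedule_flush();
            self.pending_ddls
                .push_back(DdlRequest::EnterStaging { partition_expr });
            return;
        }
        self.apply_enter_staging(&partition_expr);
    }

    // Stand-ins for the real flush scheduler and manifest update.
    fn schedule_flush(&mut self) {
        println!("flush scheduled for {} bytes", self.memtable_bytes);
    }

    fn apply_enter_staging(&mut self, partition_expr: &str) {
        println!("entering staging with partition expr: {partition_expr}");
    }
}

fn main() {
    let mut worker = Worker {
        memtable_bytes: 1024,
        pending_ddls: VecDeque::new(),
    };
    worker.handle_enter_staging("region_id < 100".to_string());
    assert_eq!(worker.pending_ddls.len(), 1);
}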
+ +use std::sync::Arc; +use std::time::Instant; + +use common_telemetry::{error, info, warn}; +use store_api::logstore::LogStore; +use store_api::region_request::EnterStagingRequest; +use store_api::storage::RegionId; + +use crate::error::{RegionNotFoundSnafu, Result, StagingPartitionExprMismatchSnafu}; +use crate::flush::FlushReason; +use crate::manifest::action::{RegionChange, RegionMetaAction, RegionMetaActionList}; +use crate::region::{MitoRegionRef, RegionLeaderState}; +use crate::request::{ + BackgroundNotify, DdlRequest, EnterStagingResult, OptionOutputTx, SenderDdlRequest, + WorkerRequest, WorkerRequestWithTime, +}; +use crate::worker::RegionWorkerLoop; + +impl RegionWorkerLoop { + pub(crate) async fn handle_enter_staging_request( + &mut self, + region_id: RegionId, + partition_expr: String, + mut sender: OptionOutputTx, + ) { + let Some(region) = self.regions.writable_region_or(region_id, &mut sender) else { + return; + }; + + // If the region is already in staging mode, verify the partition expr matches. + if region.is_staging() { + let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone(); + // If the partition expr mismatch, return error. + if staging_partition_expr.as_ref() != Some(&partition_expr) { + sender.send(Err(StagingPartitionExprMismatchSnafu { + manifest_expr: staging_partition_expr, + request_expr: partition_expr, + } + .build())); + return; + } + + // If the partition expr matches, return success. + sender.send(Ok(0)); + return; + } + + let version = region.version(); + if !version.memtables.is_empty() { + // If memtable is not empty, we can't enter staging directly and need to flush + // all memtables first. + info!("Flush region: {} before entering staging", region_id); + debug_assert!(!region.is_staging()); + let task = self.new_flush_task( + ®ion, + FlushReason::EnterStaging, + None, + self.config.clone(), + region.is_staging(), + ); + if let Err(e) = + self.flush_scheduler + .schedule_flush(region.region_id, ®ion.version_control, task) + { + // Unable to flush the region, send error to waiter. + sender.send(Err(e)); + return; + } + + // Safety: We have requested flush. + self.flush_scheduler + .add_ddl_request_to_pending(SenderDdlRequest { + region_id, + sender, + request: DdlRequest::EnterStaging(EnterStagingRequest { partition_expr }), + }); + + return; + } + + self.handle_enter_staging(region, partition_expr, sender); + } + + async fn enter_staging(region: &MitoRegionRef, partition_expr: String) -> Result<()> { + let now = Instant::now(); + // First step: clear all staging manifest files. + { + let mut manager = region.manifest_ctx.manifest_manager.write().await; + manager + .clear_staging_manifest_and_dir() + .await + .inspect_err(|e| { + error!( + e; + "Failed to clear staging manifest files for region {}", + region.region_id + ); + })?; + + info!( + "Cleared all staging manifest files for region {}, elapsed: {:?}", + region.region_id, + now.elapsed(), + ); + } + + // Second step: write new staging manifest. 
+ let mut new_meta = (*region.metadata()).clone(); + new_meta.partition_expr = Some(partition_expr.clone()); + let sst_format = region.version().options.sst_format.unwrap_or_default(); + let change = RegionChange { + metadata: Arc::new(new_meta), + sst_format, + }; + let action_list = RegionMetaActionList::with_action(RegionMetaAction::Change(change)); + region + .manifest_ctx + .update_manifest(RegionLeaderState::EnteringStaging, action_list, true) + .await?; + + Ok(()) + } + + fn handle_enter_staging( + &self, + region: MitoRegionRef, + partition_expr: String, + sender: OptionOutputTx, + ) { + if let Err(e) = region.set_entering_staging() { + sender.send(Err(e)); + return; + } + + let listener = self.listener.clone(); + let request_sender = self.sender.clone(); + common_runtime::spawn_global(async move { + let now = Instant::now(); + let result = Self::enter_staging(®ion, partition_expr.clone()).await; + match result { + Ok(_) => { + info!( + "Created staging manifest for region {}, elapsed: {:?}", + region.region_id, + now.elapsed(), + ); + } + Err(ref e) => { + // Unset the staging manifest + region + .manifest_ctx + .manifest_manager + .write() + .await + .unset_staging_manifest(); + error!( + "Failed to create staging manifest for region {}: {:?}, elapsed: {:?}", + region.region_id, + e, + now.elapsed(), + ); + } + } + + let notify = WorkerRequest::Background { + region_id: region.region_id, + notify: BackgroundNotify::EnterStaging(EnterStagingResult { + region_id: region.region_id, + sender, + result, + partition_expr, + }), + }; + listener + .on_enter_staging_result_begin(region.region_id) + .await; + + if let Err(res) = request_sender + .send(WorkerRequestWithTime::new(notify)) + .await + { + warn!( + "Failed to send enter staging result back to the worker, region_id: {}, res: {:?}", + region.region_id, res + ); + } + }); + } + + /// Handles enter staging result. + pub(crate) async fn handle_enter_staging_result( + &mut self, + enter_staging_result: EnterStagingResult, + ) { + let region = match self.regions.get_region(enter_staging_result.region_id) { + Some(region) => region, + None => { + self.reject_region_stalled_requests(&enter_staging_result.region_id); + enter_staging_result.sender.send( + RegionNotFoundSnafu { + region_id: enter_staging_result.region_id, + } + .fail(), + ); + return; + } + }; + + if enter_staging_result.result.is_ok() { + info!( + "Updating region {} staging partition expr to {}", + region.region_id, enter_staging_result.partition_expr + ); + Self::update_region_staging_partition_expr( + ®ion, + enter_staging_result.partition_expr, + ); + region.switch_state_to_staging(RegionLeaderState::EnteringStaging); + } else { + region.switch_state_to_writable(RegionLeaderState::EnteringStaging); + } + enter_staging_result + .sender + .send(enter_staging_result.result.map(|_| 0)); + // Handles the stalled requests. 
+ self.handle_region_stalled_requests(&enter_staging_result.region_id) + .await; + } + + fn update_region_staging_partition_expr(region: &MitoRegionRef, partition_expr: String) { + let mut staging_partition_expr = region.staging_partition_expr.lock().unwrap(); + debug_assert!(staging_partition_expr.is_none()); + *staging_partition_expr = Some(partition_expr); + } +} diff --git a/src/mito2/src/worker/handle_flush.rs b/src/mito2/src/worker/handle_flush.rs index 04dbb4ae78..a838fd91d8 100644 --- a/src/mito2/src/worker/handle_flush.rs +++ b/src/mito2/src/worker/handle_flush.rs @@ -30,16 +30,26 @@ use crate::request::{BuildIndexRequest, FlushFailed, FlushFinished, OnFailure, O use crate::sst::index::IndexBuildType; use crate::worker::RegionWorkerLoop; -impl RegionWorkerLoop { +impl RegionWorkerLoop { /// On region flush job failed. pub(crate) async fn handle_flush_failed(&mut self, region_id: RegionId, request: FlushFailed) { self.flush_scheduler.on_flush_failed(region_id, request.err); + debug!( + "Flush failed for region {}, handling stalled requests", + region_id + ); + // Maybe flush worker again. + self.maybe_flush_worker(); + + // Handle stalled requests. + self.handle_stalled_requests().await; } /// Checks whether the engine reaches flush threshold. If so, finds regions in this /// worker to flush. pub(crate) fn maybe_flush_worker(&mut self) { if !self.write_buffer_manager.should_flush_engine() { + debug!("No need to flush worker"); // No need to flush worker. return; } @@ -56,9 +66,7 @@ impl RegionWorkerLoop { let regions = self.regions.list_regions(); let now = self.time_provider.current_time_millis(); let min_last_flush_time = now - self.config.auto_flush_interval.as_millis() as i64; - let mut max_mutable_size = 0; - // Region with max mutable memtable size. - let mut max_mem_region = None; + let mut pending_regions = vec![]; for region in ®ions { if self.flush_scheduler.is_flush_requested(region.region_id) || !region.is_writable() { @@ -67,34 +75,68 @@ impl RegionWorkerLoop { } let version = region.version(); - let region_mutable_size = version.memtables.mutable_usage(); - // Tracks region with max mutable memtable size. - if region_mutable_size > max_mutable_size { - max_mem_region = Some(region); - max_mutable_size = region_mutable_size; - } + let region_memtable_size = + version.memtables.mutable_usage() + version.memtables.immutables_usage(); if region.last_flush_millis() < min_last_flush_time { // If flush time of this region is earlier than `min_last_flush_time`, we can flush this region. - let task = - self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone()); + let task = self.new_flush_task( + region, + FlushReason::EngineFull, + None, + self.config.clone(), + region.is_staging(), + ); self.flush_scheduler.schedule_flush( region.region_id, ®ion.version_control, task, )?; + } else if region_memtable_size > 0 { + // We should only consider regions with memtable size > 0 to flush. + pending_regions.push((region, region_memtable_size)); } } + pending_regions.sort_unstable_by_key(|(_, size)| std::cmp::Reverse(*size)); + // The flush target is the mutable memtable limit (half of the global buffer). + // When memory is full, we aggressively flush regions until usage drops below this target, + // not just below the full limit. + let target_memory_usage = self.write_buffer_manager.flush_limit(); + let mut memory_usage = self.write_buffer_manager.memory_usage(); - // Flush memtable with max mutable memtable. 
- // TODO(yingwen): Maybe flush more tables to reduce write buffer size. - if let Some(region) = max_mem_region - && !self.flush_scheduler.is_flush_requested(region.region_id) + #[cfg(test)] { - let task = - self.new_flush_task(region, FlushReason::EngineFull, None, self.config.clone()); + debug!( + "Flushing regions on engine full, target memory usage: {}, memory usage: {}, pending regions: {:?}", + target_memory_usage, + memory_usage, + pending_regions + .iter() + .map(|(region, mem_size)| (region.region_id, mem_size)) + .collect::>() + ); + } + // Iterate over pending regions in descending order of their memory size and schedule flush tasks + // for each region until the overall memory usage drops below the flush limit. + for (region, region_mem_size) in pending_regions.into_iter() { + // Make sure the first region is always flushed. + if memory_usage < target_memory_usage { + // Stop flushing regions if memory usage is already below the flush limit + break; + } + let task = self.new_flush_task( + region, + FlushReason::EngineFull, + None, + self.config.clone(), + region.is_staging(), + ); + debug!("Scheduling flush task for region {}", region.region_id); + // Schedule a flush task for the current region self.flush_scheduler .schedule_flush(region.region_id, ®ion.version_control, task)?; + // Reduce memory usage by the region's size, ensuring it doesn't go negative + memory_usage = memory_usage.saturating_sub(region_mem_size); } Ok(()) @@ -107,6 +149,7 @@ impl RegionWorkerLoop { reason: FlushReason, row_group_size: Option, engine_config: Arc, + is_staging: bool, ) -> RegionFlushTask { RegionFlushTask { region_id: region.region_id, @@ -121,13 +164,14 @@ impl RegionWorkerLoop { manifest_ctx: region.manifest_ctx.clone(), index_options: region.version().options.index_options.clone(), flush_semaphore: self.flush_semaphore.clone(), + is_staging, } } } impl RegionWorkerLoop { /// Handles manual flush request. - pub(crate) async fn handle_flush_request( + pub(crate) fn handle_flush_request( &mut self, region_id: RegionId, request: RegionFlushRequest, @@ -147,8 +191,13 @@ impl RegionWorkerLoop { FlushReason::Manual }; - let mut task = - self.new_flush_task(®ion, reason, request.row_group_size, self.config.clone()); + let mut task = self.new_flush_task( + ®ion, + reason, + request.row_group_size, + self.config.clone(), + region.is_staging(), + ); task.push_sender(sender); if let Err(e) = self.flush_scheduler @@ -178,6 +227,7 @@ impl RegionWorkerLoop { FlushReason::Periodically, None, self.config.clone(), + region.is_staging(), ); self.flush_scheduler.schedule_flush( region.region_id, @@ -208,11 +258,8 @@ impl RegionWorkerLoop { } }; - // Check if region is currently in staging mode - let is_staging = region.manifest_ctx.current_state() - == crate::region::RegionRoleState::Leader(crate::region::RegionLeaderState::Staging); - - if is_staging { + if request.is_staging { + // Skip the region metadata update. info!( "Skipping region metadata update for region {} in staging mode", region_id @@ -276,6 +323,9 @@ impl RegionWorkerLoop { .await; } + // Maybe flush worker again. + self.maybe_flush_worker(); + // Handle stalled requests. 
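
The flush-on-engine-full hunk above replaces the old "flush only the region with the largest mutable memtable" policy with a greedy pass: sort the pending regions by memtable size, then schedule flushes until the estimated usage drops below the flush limit. A dependency-free sketch of that selection step; region ids and byte counts are stand-ins, and the real code schedules FlushReason::EngineFull tasks instead of returning ids:

    fn select_regions_to_flush(
        mut pending: Vec<(u64 /* region id */, usize /* memtable bytes */)>,
        mut memory_usage: usize,
        target_memory_usage: usize,
    ) -> Vec<u64> {
        // Largest memtables first.
        pending.sort_unstable_by_key(|(_, size)| std::cmp::Reverse(*size));
        let mut to_flush = Vec::new();
        for (region_id, size) in pending {
            if memory_usage < target_memory_usage {
                // Already below the flush target, stop scheduling.
                break;
            }
            to_flush.push(region_id);
            memory_usage = memory_usage.saturating_sub(size);
        }
        to_flush
    }

    fn main() {
        // 90 bytes in use with a target of 30: flush the two largest regions (2 and 3).
        let picked = select_regions_to_flush(vec![(1, 10), (2, 50), (3, 30)], 90, 30);
        assert_eq!(picked, vec![2, 3]);
        println!("regions to flush: {picked:?}");
    }
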
self.handle_stalled_requests().await; diff --git a/src/mito2/src/worker/handle_manifest.rs b/src/mito2/src/worker/handle_manifest.rs index c91c7adc6b..a1abde753b 100644 --- a/src/mito2/src/worker/handle_manifest.rs +++ b/src/mito2/src/worker/handle_manifest.rs @@ -346,6 +346,7 @@ impl RegionWorkerLoop { let request_sender = self.sender.clone(); let manifest_ctx = region.manifest_ctx.clone(); + let is_staging = region.is_staging(); // Updates manifest in background. common_runtime::spawn_global(async move { @@ -354,7 +355,7 @@ impl RegionWorkerLoop { RegionMetaActionList::with_action(RegionMetaAction::Truncate(truncate.clone())); let result = manifest_ctx - .update_manifest(RegionLeaderState::Truncating, action_list) + .update_manifest(RegionLeaderState::Truncating, action_list, is_staging) .await .map(|_| ()); @@ -391,6 +392,7 @@ impl RegionWorkerLoop { } let listener = self.listener.clone(); let request_sender = self.sender.clone(); + let is_staging = region.is_staging(); // Now the region is in altering state. common_runtime::spawn_global(async move { let new_meta = change.metadata.clone(); @@ -398,7 +400,7 @@ impl RegionWorkerLoop { let result = region .manifest_ctx - .update_manifest(RegionLeaderState::Altering, action_list) + .update_manifest(RegionLeaderState::Altering, action_list, is_staging) .await .map(|_| ()); let notify = WorkerRequest::Background { @@ -463,6 +465,7 @@ async fn edit_region( listener: WorkerListener, ) -> Result<()> { let region_id = region.region_id; + let is_staging = region.is_staging(); if let Some(write_cache) = cache_manager.write_cache() { for file_meta in &edit.files_to_add { let write_cache = write_cache.clone(); @@ -478,12 +481,12 @@ async fn edit_region( let index_file_index_key = IndexKey::new( region_id, - file_meta.index_file_id().file_id(), - FileType::Puffin, + file_meta.index_id().file_id.file_id(), + FileType::Puffin(file_meta.index_version), ); let index_remote_path = location::index_file_path( layer.table_dir(), - file_meta.file_id(), + file_meta.index_id(), layer.path_type(), ); @@ -532,7 +535,7 @@ async fn edit_region( let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit)); region .manifest_ctx - .update_manifest(RegionLeaderState::Editing, action_list) + .update_manifest(RegionLeaderState::Editing, action_list, is_staging) .await .map(|_| ()) } diff --git a/src/mito2/src/worker/handle_rebuild_index.rs b/src/mito2/src/worker/handle_rebuild_index.rs index 6e7e96d7f2..ed2390d853 100644 --- a/src/mito2/src/worker/handle_rebuild_index.rs +++ b/src/mito2/src/worker/handle_rebuild_index.rs @@ -28,7 +28,7 @@ use crate::region::MitoRegionRef; use crate::request::{ BuildIndexRequest, IndexBuildFailed, IndexBuildFinished, IndexBuildStopped, OptionOutputTx, }; -use crate::sst::file::{FileHandle, RegionFileId}; +use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId}; use crate::sst::index::{ IndexBuildOutcome, IndexBuildTask, IndexBuildType, IndexerBuilderImpl, ResultMpscSender, }; @@ -68,9 +68,11 @@ impl RegionWorkerLoop { row_group_size: WriteOptions::default().row_group_size, intermediate_manager, puffin_manager, + write_cache_enabled: self.cache_manager.write_cache().is_some(), }); IndexBuildTask { + file: file.clone(), file_meta: file.meta_ref().clone(), reason: build_type, access_layer: access_layer.clone(), @@ -85,7 +87,6 @@ impl RegionWorkerLoop { } /// Handles manual build index requests. - /// TODO(SNC123): Support admin function of manual index building later. 
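
In the handle_manifest.rs changes above, the staging flag is sampled with region.is_staging() before the future is spawned, so the background task works on a snapshot instead of re-querying the region later. A small illustration of that capture-before-spawn pattern, assuming a tokio runtime; the atomic flag stands in for the region state:

    use std::sync::Arc;
    use std::sync::atomic::{AtomicBool, Ordering};

    #[tokio::main]
    async fn main() {
        let staging_flag = Arc::new(AtomicBool::new(true)); // stand-in for the region's state
        let is_staging = staging_flag.load(Ordering::Relaxed); // snapshot before spawning
        let handle = tokio::spawn(async move {
            // The spawned task only sees the captured snapshot, mirroring how
            // update_manifest(state, action_list, is_staging) receives the flag.
            is_staging
        });
        assert!(handle.await.unwrap());
    }
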
pub(crate) async fn handle_build_index_request( &mut self, region_id: RegionId, @@ -126,10 +127,16 @@ impl RegionWorkerLoop { .collect(); let build_tasks = if request.file_metas.is_empty() { - // NOTE: Currently, rebuilding the index will reconstruct the index for all - // files in the region, which is a simplified approach and is not yet available for - // production use; further optimization is required. - all_files.values().cloned().collect::>() + // If no specific files are provided, find files whose index is inconsistent with the region metadata. + all_files + .values() + .filter(|file| { + !file + .meta_ref() + .is_index_consistent_with_region(&version.metadata.column_metadatas) + }) + .cloned() + .collect::>() } else { request .file_metas @@ -211,7 +218,8 @@ impl RegionWorkerLoop { let cache_strategy = CacheStrategy::EnableAll(self.cache_manager.clone()); for file_meta in &request.edit.files_to_add { let region_file_id = RegionFileId::new(region_id, file_meta.file_id); - cache_strategy.evict_puffin_cache(region_file_id).await; + let index_id = RegionIndexId::new(region_file_id, file_meta.index_version); + cache_strategy.evict_puffin_cache(index_id).await; } region.version_control.apply_edit( diff --git a/src/mito2/src/worker/handle_remap.rs b/src/mito2/src/worker/handle_remap.rs new file mode 100644 index 0000000000..5e94221f7d --- /dev/null +++ b/src/mito2/src/worker/handle_remap.rs @@ -0,0 +1,123 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
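
The handle_rebuild_index.rs hunk above narrows the default rebuild set: when the request names no files, only files whose index is inconsistent with the region's current column metadata are rebuilt, rather than every file in the region. A stand-in sketch of that filter; FileMeta and the column-id comparison are simplifications of is_index_consistent_with_region:

    #[derive(Clone)]
    struct FileMeta {
        file_id: u64,
        indexed_columns: Vec<u32>, // hypothetical: column ids covered by the file's index
    }

    fn files_needing_index_rebuild(all_files: &[FileMeta], current_columns: &[u32]) -> Vec<FileMeta> {
        all_files
            .iter()
            .filter(|f| f.indexed_columns != current_columns) // "inconsistent with region metadata"
            .cloned()
            .collect()
    }

    fn main() {
        let files = vec![
            FileMeta { file_id: 1, indexed_columns: vec![1, 2] },
            FileMeta { file_id: 2, indexed_columns: vec![1, 2, 3] },
        ];
        let stale = files_needing_index_rebuild(&files, &[1, 2, 3]);
        assert_eq!(stale.len(), 1);
        assert_eq!(stale[0].file_id, 1);
    }
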
+ +use std::collections::HashMap; +use std::time::Instant; + +use common_error::ext::BoxedError; +use common_telemetry::info; +use futures::future::try_join_all; +use partition::expr::PartitionExpr; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::RegionId; + +use crate::error::{FetchManifestsSnafu, InvalidRequestSnafu, MissingManifestSnafu, Result}; +use crate::manifest::action::RegionManifest; +use crate::region::{MitoRegionRef, RegionMetadataLoader}; +use crate::remap_manifest::RemapManifest; +use crate::request::RemapManifestsRequest; +use crate::sst::location::region_dir_from_table_dir; +use crate::worker::RegionWorkerLoop; + +impl RegionWorkerLoop { + pub(crate) fn handle_remap_manifests_request(&mut self, request: RemapManifestsRequest) { + let region_id = request.region_id; + let sender = request.sender; + let region = match self.regions.staging_region(region_id) { + Ok(region) => region, + Err(e) => { + let _ = sender.send(Err(e)); + return; + } + }; + + let same_table = request + .input_regions + .iter() + .map(|r| r.table_id()) + .all(|t| t == region_id.table_id()); + + if !same_table { + let _ = sender.send( + InvalidRequestSnafu { + region_id, + reason: "Input regions must be from the same table", + } + .fail(), + ); + return; + } + + let region_metadata_loader = + RegionMetadataLoader::new(self.config.clone(), self.object_store_manager.clone()); + common_runtime::spawn_global(async move { + let result = Self::fetch_and_remap_manifests( + region, + region_metadata_loader, + request.input_regions, + request.new_partition_exprs, + request.region_mapping, + ) + .await; + + let _ = sender.send(result); + }); + } + + async fn fetch_and_remap_manifests( + region: MitoRegionRef, + region_metadata_loader: RegionMetadataLoader, + input_regions: Vec, + new_partition_exprs: HashMap, + region_mapping: HashMap>, + ) -> Result> { + let mut tasks = Vec::with_capacity(input_regions.len()); + let region_options = region.version().options.clone(); + let table_dir = region.table_dir(); + let path_type = region.path_type(); + let now = Instant::now(); + for input_region in &input_regions { + let region_dir = region_dir_from_table_dir(table_dir, *input_region, path_type); + let storage = region_options.storage.clone(); + let moved_region_metadata_loader = region_metadata_loader.clone(); + tasks.push(async move { + moved_region_metadata_loader + .load_manifest(®ion_dir, &storage) + .await + }); + } + + let results = try_join_all(tasks) + .await + .map_err(BoxedError::new) + .context(FetchManifestsSnafu)?; + let manifests = results + .into_iter() + .zip(input_regions) + .map(|(manifest_res, region_id)| { + let manifest = manifest_res.context(MissingManifestSnafu { region_id })?; + Ok((region_id, (*manifest).clone())) + }) + .collect::>>()?; + let mut mapper = RemapManifest::new(manifests, new_partition_exprs, region_mapping); + let remap_result = mapper.remap_manifests()?; + info!( + "Remap manifests cost: {:?}, region: {}", + now.elapsed(), + region.region_id + ); + + Ok(remap_result.new_manifests) + } +} diff --git a/src/mito2/src/worker/handle_write.rs b/src/mito2/src/worker/handle_write.rs index e86aa67630..c338eef88f 100644 --- a/src/mito2/src/worker/handle_write.rs +++ b/src/mito2/src/worker/handle_write.rs @@ -241,6 +241,12 @@ impl RegionWorkerLoop { // No such region. 
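
handle_remap.rs above loads one manifest per input region and awaits them together with futures::future::try_join_all, so a single failure aborts the whole remap. A minimal sketch of that fan-out, assuming the futures and tokio crates; load_manifest and the String error are stand-ins for RegionMetadataLoader::load_manifest and the crate's error type:

    use futures::future::try_join_all;

    async fn load_manifest(region_id: u64) -> Result<String, String> {
        // Stand-in for loading a region manifest from object storage.
        Ok(format!("manifest-of-{region_id}"))
    }

    async fn fetch_all(region_ids: &[u64]) -> Result<Vec<(u64, String)>, String> {
        // One future per input region; the first error aborts the whole batch.
        let tasks = region_ids.iter().map(|id| async move {
            let manifest = load_manifest(*id).await?;
            Ok::<_, String>((*id, manifest))
        });
        try_join_all(tasks).await
    }

    #[tokio::main]
    async fn main() {
        let manifests = fetch_all(&[1, 2, 3]).await.unwrap();
        assert_eq!(manifests.len(), 3);
    }
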
continue; }; + #[cfg(test)] + debug!( + "Handling write request for region {}, state: {:?}", + region_id, + region.state() + ); match region.state() { RegionRoleState::Leader(RegionLeaderState::Writable) | RegionRoleState::Leader(RegionLeaderState::Staging) => { @@ -263,6 +269,16 @@ impl RegionWorkerLoop { self.stalled_requests.push(sender_req); continue; } + RegionRoleState::Leader(RegionLeaderState::EnteringStaging) => { + debug!( + "Region {} is entering staging, add request to pending writes", + region.region_id + ); + self.stalling_count.add(1); + WRITE_STALL_TOTAL.inc(); + self.stalled_requests.push(sender_req); + continue; + } state => { // The region is not writable. sender_req.sender.send( @@ -388,17 +404,14 @@ impl RegionWorkerLoop { let need_fill_missing_columns = region_ctx.version().metadata.schema_version != bulk_req.region_metadata.schema_version; - // Only fill missing columns if primary key is dense encoded. - if need_fill_missing_columns { - // todo(hl): support filling default columns - bulk_req.sender.send( - InvalidRequestSnafu { - region_id, - reason: "Schema mismatch", - } - .fail(), - ); - return; + // Fill missing columns if needed + if need_fill_missing_columns + && let Err(e) = bulk_req + .request + .fill_missing_columns(®ion_ctx.version().metadata) + { + bulk_req.sender.send(Err(e)); + continue; } // Collect requests by region. diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 8a1a877567..2ef251d04d 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [features] services-memory = ["opendal/services-memory"] +testing = ["derive_builder"] [dependencies] bytes.workspace = true @@ -16,10 +17,10 @@ common-base.workspace = true common-error.workspace = true common-macro.workspace = true common-telemetry.workspace = true +derive_builder = { workspace = true, optional = true } futures.workspace = true humantime-serde.workspace = true lazy_static.workspace = true -md5 = "0.7" moka = { workspace = true, features = ["future"] } opendal = { version = "0.54", features = [ "layers-tracing", diff --git a/src/object-store/src/layers.rs b/src/object-store/src/layers.rs index 7b111927e2..cc9fa9f4df 100644 --- a/src/object-store/src/layers.rs +++ b/src/object-store/src/layers.rs @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -mod lru_cache; +#[cfg(feature = "testing")] +pub mod mock; -pub use lru_cache::*; pub use opendal::layers::*; pub use prometheus::build_prometheus_metrics_layer; diff --git a/src/object-store/src/layers/lru_cache.rs b/src/object-store/src/layers/lru_cache.rs deleted file mode 100644 index 967efe80fe..0000000000 --- a/src/object-store/src/layers/lru_cache.rs +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
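
The handle_write.rs change above adds an EnteringStaging arm: writes that arrive during the transition are stalled (queued) rather than rejected, and are replayed after the enter-staging result is handled. A toy model of that decision, with String requests standing in for SenderWriteRequest:

    enum State {
        Writable,
        EnteringStaging,
    }

    struct Worker {
        stalled: Vec<String>, // queued write requests (stand-in for SenderWriteRequest)
    }

    impl Worker {
        fn handle_write(&mut self, state: &State, req: String) -> Option<String> {
            match state {
                State::Writable => Some(req), // process immediately
                State::EnteringStaging => {
                    self.stalled.push(req); // stall until the staging transition finishes
                    None
                }
            }
        }
    }

    fn main() {
        let mut w = Worker { stalled: Vec::new() };
        assert!(w.handle_write(&State::Writable, "put a=1".into()).is_some());
        assert!(w.handle_write(&State::EnteringStaging, "put b=2".into()).is_none());
        assert_eq!(w.stalled.len(), 1);
    }
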
- -use std::sync::Arc; - -use opendal::Result; -use opendal::raw::oio::Reader; -use opendal::raw::{ - Access, Layer, LayeredAccess, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead, RpWrite, -}; -mod read_cache; -use std::time::Instant; - -use common_telemetry::{error, info}; -use read_cache::ReadCache; - -use crate::layers::lru_cache::read_cache::CacheAwareDeleter; - -/// An opendal layer with local LRU file cache supporting. -pub struct LruCacheLayer { - // The read cache - read_cache: ReadCache, -} - -impl Clone for LruCacheLayer { - fn clone(&self) -> Self { - Self { - read_cache: self.read_cache.clone(), - } - } -} - -impl LruCacheLayer { - /// Create a [`LruCacheLayer`] with local file cache and capacity in bytes. - pub fn new(file_cache: Arc, capacity: usize) -> Result { - let read_cache = ReadCache::new(file_cache, capacity); - Ok(Self { read_cache }) - } - - /// Recovers cache - pub async fn recover_cache(&self, sync: bool) { - let now = Instant::now(); - let moved_read_cache = self.read_cache.clone(); - let handle = tokio::spawn(async move { - match moved_read_cache.recover_cache().await { - Ok((entries, bytes)) => info!( - "Recovered {} entries and total size {} in bytes for LruCacheLayer, cost: {:?}", - entries, - bytes, - now.elapsed() - ), - Err(err) => error!(err; "Failed to recover file cache."), - } - }); - if sync { - let _ = handle.await; - } - } - - /// Returns true when the local cache contains the specific file - pub async fn contains_file(&self, path: &str) -> bool { - self.read_cache.contains_file(path).await - } - - /// Returns the read cache statistics info `(EntryCount, SizeInBytes)`. - pub async fn read_cache_stat(&self) -> (u64, u64) { - self.read_cache.cache_stat().await - } -} - -impl Layer for LruCacheLayer { - type LayeredAccess = LruCacheAccess; - - fn layer(&self, inner: I) -> Self::LayeredAccess { - LruCacheAccess { - inner, - read_cache: self.read_cache.clone(), - } - } -} - -#[derive(Debug)] -pub struct LruCacheAccess { - inner: I, - read_cache: ReadCache, -} - -impl LayeredAccess for LruCacheAccess { - type Inner = I; - type Reader = Reader; - type Writer = I::Writer; - type Lister = I::Lister; - type Deleter = CacheAwareDeleter; - - fn inner(&self) -> &Self::Inner { - &self.inner - } - - async fn read(&self, path: &str, args: OpRead) -> Result<(RpRead, Self::Reader)> { - self.read_cache - .read_from_cache(&self.inner, path, args) - .await - } - - async fn write(&self, path: &str, args: OpWrite) -> Result<(RpWrite, Self::Writer)> { - let result = self.inner.write(path, args).await; - - self.read_cache.invalidate_entries_with_prefix(path); - - result - } - - async fn delete(&self) -> Result<(RpDelete, Self::Deleter)> { - self.inner - .delete() - .await - .map(|(rp, deleter)| (rp, CacheAwareDeleter::new(self.read_cache.clone(), deleter))) - } - - async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> { - self.inner.list(path, args).await - } -} diff --git a/src/object-store/src/layers/lru_cache/read_cache.rs b/src/object-store/src/layers/lru_cache/read_cache.rs deleted file mode 100644 index 721ee74483..0000000000 --- a/src/object-store/src/layers/lru_cache/read_cache.rs +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use common_telemetry::{debug, trace}; -use futures::{FutureExt, TryStreamExt}; -use moka::future::Cache; -use moka::notification::ListenerFuture; -use moka::policy::EvictionPolicy; -use opendal::raw::oio::{Read, Reader, Write}; -use opendal::raw::{Access, OpDelete, OpRead, OpStat, OpWrite, RpRead, oio}; -use opendal::{Error as OpendalError, ErrorKind, OperatorBuilder, Result}; - -use crate::metrics::{ - OBJECT_STORE_LRU_CACHE_BYTES, OBJECT_STORE_LRU_CACHE_ENTRIES, OBJECT_STORE_LRU_CACHE_HIT, - OBJECT_STORE_LRU_CACHE_MISS, OBJECT_STORE_READ_ERROR, -}; - -const RECOVER_CACHE_LIST_CONCURRENT: usize = 8; -/// Subdirectory of cached files for read. -/// -/// This must contain three layers, corresponding to [`build_prometheus_metrics_layer`](object_store::layers::build_prometheus_metrics_layer). -const READ_CACHE_DIR: &str = "cache/object/read"; - -/// Cache value for read file -#[derive(Debug, Clone, PartialEq, Eq, Copy)] -enum ReadResult { - // Read success with size - Success(u32), - // File not found - NotFound, -} - -impl ReadResult { - fn size_bytes(&self) -> u32 { - match self { - ReadResult::NotFound => 0, - ReadResult::Success(size) => *size, - } - } -} - -/// Returns true when the path of the file can be cached. -fn can_cache(path: &str) -> bool { - // TODO(dennis): find a better way - !path.ends_with("_last_checkpoint") -} - -/// Generate a unique cache key for the read path and range. -fn read_cache_key(path: &str, args: &OpRead) -> String { - format!( - "{READ_CACHE_DIR}/{:x}.cache-{}", - md5::compute(path), - args.range().to_header() - ) -} - -fn read_cache_root() -> String { - format!("/{READ_CACHE_DIR}/") -} - -fn read_cache_key_prefix(path: &str) -> String { - format!("{READ_CACHE_DIR}/{:x}", md5::compute(path)) -} - -/// Local read cache for files in object storage -#[derive(Debug)] -pub(crate) struct ReadCache { - /// Local file cache backend - file_cache: Arc, - /// Local memory cache to track local cache files - mem_cache: Cache, -} - -impl Clone for ReadCache { - fn clone(&self) -> Self { - Self { - file_cache: self.file_cache.clone(), - mem_cache: self.mem_cache.clone(), - } - } -} - -impl ReadCache { - /// Create a [`ReadCache`] with capacity in bytes. - pub(crate) fn new(file_cache: Arc, capacity: usize) -> Self { - let file_cache_cloned = OperatorBuilder::new(file_cache.clone()).finish(); - let eviction_listener = - move |read_key: Arc, read_result: ReadResult, cause| -> ListenerFuture { - // Delete the file from local file cache when it's purged from mem_cache. 
- OBJECT_STORE_LRU_CACHE_ENTRIES.dec(); - let file_cache_cloned = file_cache_cloned.clone(); - - async move { - if let ReadResult::Success(size) = read_result { - OBJECT_STORE_LRU_CACHE_BYTES.sub(size as i64); - - let result = file_cache_cloned.delete(&read_key).await; - debug!( - "Deleted local cache file `{}`, result: {:?}, cause: {:?}.", - read_key, result, cause - ); - } - } - .boxed() - }; - - Self { - file_cache, - mem_cache: Cache::builder() - .max_capacity(capacity as u64) - .eviction_policy(EvictionPolicy::lru()) - .weigher(|_key, value: &ReadResult| -> u32 { - // TODO(dennis): add key's length to weight? - value.size_bytes() - }) - .async_eviction_listener(eviction_listener) - .support_invalidation_closures() - .build(), - } - } - - /// Returns the cache's entry count and total approximate entry size in bytes. - pub(crate) async fn cache_stat(&self) -> (u64, u64) { - self.mem_cache.run_pending_tasks().await; - - (self.mem_cache.entry_count(), self.mem_cache.weighted_size()) - } - - /// Invalidate all cache items belong to the specific path. - pub(crate) fn invalidate_entries_with_prefix(&self, path: &str) { - let prefix = read_cache_key_prefix(path); - // Safety: always ok when building cache with `support_invalidation_closures`. - self.mem_cache - .invalidate_entries_if(move |k: &String, &_v| k.starts_with(&prefix)) - .ok(); - } - - /// Recover existing cache items from `file_cache` to `mem_cache`. - /// Return entry count and total approximate entry size in bytes. - pub(crate) async fn recover_cache(&self) -> Result<(u64, u64)> { - let op = OperatorBuilder::new(self.file_cache.clone()).finish(); - let cloned_op = op.clone(); - let root = read_cache_root(); - let mut entries = op - .lister_with(&root) - .await? - .map_ok(|entry| async { - let (path, mut meta) = entry.into_parts(); - - // TODO(dennis): Use a better API, see https://github.com/apache/opendal/issues/6522 - if meta.content_length() == 0 { - meta = cloned_op.stat(&path).await?; - } - - Ok((path, meta)) - }) - .try_buffer_unordered(RECOVER_CACHE_LIST_CONCURRENT) - .try_collect::>() - .await?; - - while let Some((read_key, metadata)) = entries.pop() { - if !metadata.is_file() { - continue; - } - - let size = metadata.content_length(); - OBJECT_STORE_LRU_CACHE_ENTRIES.inc(); - OBJECT_STORE_LRU_CACHE_BYTES.add(size as i64); - - self.mem_cache - .insert(read_key.clone(), ReadResult::Success(size as u32)) - .await; - } - - Ok(self.cache_stat().await) - } - - /// Returns true when the read cache contains the specific file. - pub(crate) async fn contains_file(&self, path: &str) -> bool { - self.mem_cache.run_pending_tasks().await; - self.mem_cache.contains_key(path) - && self.file_cache.stat(path, OpStat::default()).await.is_ok() - } - - /// Read from a specific path using the OpRead operation. - /// It will attempt to retrieve the data from the local cache. - /// If the data is not found in the local cache, - /// it will fall back to retrieving it from remote object storage - /// and cache the result locally. 
- pub(crate) async fn read_from_cache( - &self, - inner: &I, - path: &str, - args: OpRead, - ) -> Result<(RpRead, Reader)> - where - I: Access, - { - if !can_cache(path) { - return inner.read(path, args).await.map(to_output_reader); - } - - let read_key = read_cache_key(path, &args); - - let read_result = self - .mem_cache - .try_get_with( - read_key.clone(), - self.read_remote(inner, &read_key, path, args.clone()), - ) - .await - .map_err(|e| OpendalError::new(e.kind(), e.to_string()))?; - - match read_result { - ReadResult::Success(_) => { - // There is a concurrent issue here, the local cache may be purged - // while reading, we have to fall back to remote read - match self.file_cache.read(&read_key, OpRead::default()).await { - Ok(ret) => { - OBJECT_STORE_LRU_CACHE_HIT - .with_label_values(&["success"]) - .inc(); - Ok(to_output_reader(ret)) - } - Err(_) => { - OBJECT_STORE_LRU_CACHE_MISS.inc(); - inner.read(path, args).await.map(to_output_reader) - } - } - } - ReadResult::NotFound => { - OBJECT_STORE_LRU_CACHE_HIT - .with_label_values(&["not_found"]) - .inc(); - - Err(OpendalError::new( - ErrorKind::NotFound, - format!("File not found: {path}"), - )) - } - } - } - - async fn try_write_cache(&self, mut reader: I::Reader, read_key: &str) -> Result - where - I: Access, - { - let (_, mut writer) = self.file_cache.write(read_key, OpWrite::new()).await?; - let mut total = 0; - loop { - let bytes = reader.read().await?; - if bytes.is_empty() { - break; - } - - total += bytes.len(); - writer.write(bytes).await?; - } - // Call `close` to ensure data is written. - writer.close().await?; - Ok(total) - } - - /// Read the file from remote storage. If success, write the content into local cache. - async fn read_remote( - &self, - inner: &I, - read_key: &str, - path: &str, - args: OpRead, - ) -> Result - where - I: Access, - { - OBJECT_STORE_LRU_CACHE_MISS.inc(); - - let (_, reader) = inner.read(path, args).await?; - let result = self.try_write_cache::(reader, read_key).await; - - trace!( - "Read cache miss for key '{}' and fetch file '{}' from object store", - read_key, path, - ); - - match result { - Ok(read_bytes) => { - OBJECT_STORE_LRU_CACHE_ENTRIES.inc(); - OBJECT_STORE_LRU_CACHE_BYTES.add(read_bytes as i64); - - Ok(ReadResult::Success(read_bytes as u32)) - } - - Err(e) if e.kind() == ErrorKind::NotFound => { - OBJECT_STORE_READ_ERROR - .with_label_values(&[e.kind().to_string().as_str()]) - .inc(); - OBJECT_STORE_LRU_CACHE_ENTRIES.inc(); - - Ok(ReadResult::NotFound) - } - - Err(e) => { - OBJECT_STORE_READ_ERROR - .with_label_values(&[e.kind().to_string().as_str()]) - .inc(); - Err(e) - } - } - } -} - -pub struct CacheAwareDeleter { - cache: ReadCache, - deleter: D, -} - -impl CacheAwareDeleter { - pub(crate) fn new(cache: ReadCache, deleter: D) -> Self { - Self { cache, deleter } - } -} - -impl oio::Delete for CacheAwareDeleter { - fn delete(&mut self, path: &str, args: OpDelete) -> Result<()> { - self.cache.invalidate_entries_with_prefix(path); - self.deleter.delete(path, args)?; - Ok(()) - } - - async fn flush(&mut self) -> Result { - self.deleter.flush().await - } -} - -fn to_output_reader(input: (RpRead, R)) -> (RpRead, Reader) { - (input.0, Box::new(input.1)) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_can_cache() { - assert!(can_cache("test")); - assert!(can_cache("a/b/c.parquet")); - assert!(can_cache("1.json")); - assert!(can_cache("100.checkpoint")); - assert!(can_cache("test/last_checkpoint")); - assert!(!can_cache("test/__last_checkpoint")); - 
assert!(!can_cache("a/b/c/__last_checkpoint")); - } -} diff --git a/src/object-store/src/layers/mock.rs b/src/object-store/src/layers/mock.rs new file mode 100644 index 0000000000..e55af3bfe0 --- /dev/null +++ b/src/object-store/src/layers/mock.rs @@ -0,0 +1,236 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; +use std::sync::Arc; + +use derive_builder::Builder; +pub use oio::*; +pub use opendal::raw::{ + Access, Layer, LayeredAccess, OpDelete, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead, + RpWrite, oio, +}; +use opendal::raw::{OpCopy, RpCopy}; +pub use opendal::{Buffer, Error, ErrorKind, Metadata, Result}; + +pub type MockWriterFactory = Arc oio::Writer + Send + Sync>; +pub type MockReaderFactory = Arc oio::Reader + Send + Sync>; +pub type MockListerFactory = Arc oio::Lister + Send + Sync>; +pub type MockDeleterFactory = Arc oio::Deleter + Send + Sync>; +pub type CopyInterceptor = Arc Option> + Send + Sync>; + +#[derive(Builder)] +pub struct MockLayer { + #[builder(setter(strip_option), default)] + writer_factory: Option, + #[builder(setter(strip_option), default)] + reader_factory: Option, + #[builder(setter(strip_option), default)] + lister_factory: Option, + #[builder(setter(strip_option), default)] + deleter_factory: Option, + #[builder(setter(strip_option), default)] + copy_interceptor: Option, +} + +impl Clone for MockLayer { + fn clone(&self) -> Self { + Self { + writer_factory: self.writer_factory.clone(), + reader_factory: self.reader_factory.clone(), + lister_factory: self.lister_factory.clone(), + deleter_factory: self.deleter_factory.clone(), + copy_interceptor: self.copy_interceptor.clone(), + } + } +} + +impl Layer for MockLayer { + type LayeredAccess = MockAccessor; + + fn layer(&self, inner: A) -> Self::LayeredAccess { + MockAccessor { + inner, + writer_factory: self.writer_factory.clone(), + reader_factory: self.reader_factory.clone(), + lister_factory: self.lister_factory.clone(), + deleter_factory: self.deleter_factory.clone(), + copy_interceptor: self.copy_interceptor.clone(), + } + } +} + +pub struct MockAccessor { + inner: A, + writer_factory: Option, + reader_factory: Option, + lister_factory: Option, + deleter_factory: Option, + copy_interceptor: Option, +} + +impl Debug for MockAccessor { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MockAccessor") + .field("inner", &self.inner) + .finish() + } +} + +pub struct MockReader { + inner: oio::Reader, +} + +impl oio::Read for MockReader { + async fn read(&mut self) -> Result { + self.inner.read().await + } +} + +pub struct MockWriter { + inner: oio::Writer, +} + +impl oio::Write for MockWriter { + async fn write(&mut self, bs: Buffer) -> Result<()> { + self.inner.write(bs).await + } + + async fn close(&mut self) -> Result { + self.inner.close().await + } + + async fn abort(&mut self) -> Result<()> { + self.inner.abort().await + } +} + +pub struct MockLister { + inner: oio::Lister, +} + +impl oio::List for 
MockLister { + async fn next(&mut self) -> Result> { + self.inner.next().await + } +} + +pub struct MockDeleter { + inner: oio::Deleter, +} + +impl oio::Delete for MockDeleter { + fn delete(&mut self, path: &str, args: OpDelete) -> Result<()> { + self.inner.delete(path, args) + } + + async fn flush(&mut self) -> Result { + self.inner.flush().await + } +} + +impl LayeredAccess for MockAccessor { + type Inner = A; + type Reader = MockReader; + type Writer = MockWriter; + type Lister = MockLister; + type Deleter = MockDeleter; + + fn inner(&self) -> &Self::Inner { + &self.inner + } + + async fn read(&self, path: &str, args: OpRead) -> Result<(RpRead, Self::Reader)> { + if let Some(reader_factory) = self.reader_factory.as_ref() { + let (rp_read, reader) = self.inner.read(path, args.clone()).await?; + let reader = reader_factory(path, args, Box::new(reader)); + Ok((rp_read, MockReader { inner: reader })) + } else { + self.inner.read(path, args).await.map(|(rp_read, reader)| { + ( + rp_read, + MockReader { + inner: Box::new(reader), + }, + ) + }) + } + } + + async fn write(&self, path: &str, args: OpWrite) -> Result<(RpWrite, Self::Writer)> { + if let Some(writer_factory) = self.writer_factory.as_ref() { + let (rp_write, writer) = self.inner.write(path, args.clone()).await?; + let writer = writer_factory(path, args, Box::new(writer)); + Ok((rp_write, MockWriter { inner: writer })) + } else { + self.inner + .write(path, args) + .await + .map(|(rp_write, writer)| { + ( + rp_write, + MockWriter { + inner: Box::new(writer), + }, + ) + }) + } + } + + async fn delete(&self) -> Result<(RpDelete, Self::Deleter)> { + if let Some(deleter_factory) = self.deleter_factory.as_ref() { + let (rp_delete, deleter) = self.inner.delete().await?; + let deleter = deleter_factory(Box::new(deleter)); + Ok((rp_delete, MockDeleter { inner: deleter })) + } else { + self.inner.delete().await.map(|(rp_delete, deleter)| { + ( + rp_delete, + MockDeleter { + inner: Box::new(deleter), + }, + ) + }) + } + } + + async fn list(&self, path: &str, args: OpList) -> Result<(RpList, Self::Lister)> { + if let Some(lister_factory) = self.lister_factory.as_ref() { + let (rp_list, lister) = self.inner.list(path, args.clone()).await?; + let lister = lister_factory(path, args, Box::new(lister)); + Ok((rp_list, MockLister { inner: lister })) + } else { + self.inner.list(path, args).await.map(|(rp_list, lister)| { + ( + rp_list, + MockLister { + inner: Box::new(lister), + }, + ) + }) + } + } + + async fn copy(&self, from: &str, to: &str, args: OpCopy) -> Result { + let Some(copy_interceptor) = self.copy_interceptor.as_ref() else { + return self.inner.copy(from, to, args).await; + }; + + let Some(result) = copy_interceptor(from, to, args.clone()) else { + return self.inner.copy(from, to, args).await; + }; + + result + } +} diff --git a/src/object-store/src/metrics.rs b/src/object-store/src/metrics.rs index 9ab3b7df1c..28aee460f6 100644 --- a/src/object-store/src/metrics.rs +++ b/src/object-store/src/metrics.rs @@ -13,38 +13,3 @@ // limitations under the License. //! object-store metrics - -/// Cache hit counter, no matter what the cache result is. -use lazy_static::lazy_static; -use prometheus::*; - -lazy_static! { - /// Cache hit counter, no matter what the cache result is. 
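
The new object-store mock.rs above follows an interception pattern: each hook (reader, writer, lister, deleter, copy) is an optional Arc'd closure, and when a hook is absent the call falls straight through to the inner accessor. A much-reduced sketch of that idea with a single write hook; MockStore and the String error are stand-ins for the real opendal-based types:

    use std::sync::Arc;

    type WriteHook = Arc<dyn Fn(&str, &[u8]) -> Result<(), String> + Send + Sync>;

    #[derive(Clone, Default)]
    struct MockStore {
        write_hook: Option<WriteHook>,
    }

    impl MockStore {
        fn write(&self, path: &str, data: &[u8]) -> Result<(), String> {
            if let Some(hook) = &self.write_hook {
                // Tests inject failures or record calls here.
                hook(path, data)?;
            }
            // Stand-in for delegating to the wrapped inner accessor.
            Ok(())
        }
    }

    fn main() {
        let failing = MockStore {
            write_hook: Some(Arc::new(|path: &str, _: &[u8]| -> Result<(), String> {
                Err(format!("injected failure for {path}"))
            })),
        };
        assert!(failing.write("a/b.parquet", b"x").is_err());
        assert!(MockStore::default().write("a/b.parquet", b"x").is_ok());
    }
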
- pub static ref OBJECT_STORE_LRU_CACHE_HIT: IntCounterVec = register_int_counter_vec!( - "greptime_object_store_lru_cache_hit", - "object store lru cache hit", - &["result"] - ) - .unwrap(); - /// Cache miss counter - pub static ref OBJECT_STORE_LRU_CACHE_MISS: IntCounter = - register_int_counter!("greptime_object_store_lru_cache_miss", "object store lru cache miss") - .unwrap(); - /// Object store read error counter - pub static ref OBJECT_STORE_READ_ERROR: IntCounterVec = register_int_counter_vec!( - "greptime_object_store_read_errors", - "object store read errors", - &["kind"] - ) - .unwrap(); - - /// Cache entry number - pub static ref OBJECT_STORE_LRU_CACHE_ENTRIES: IntGauge = - register_int_gauge!("greptime_object_store_lru_cache_entries", "object store lru cache entries") - .unwrap(); - - /// Cache size in bytes - pub static ref OBJECT_STORE_LRU_CACHE_BYTES: IntGauge = - register_int_gauge!("greptime_object_store_lru_cache_bytes", "object store lru cache bytes") - .unwrap(); -} diff --git a/src/object-store/tests/object_store_test.rs b/src/object-store/tests/object_store_test.rs index 58fecb0f93..4a8730111c 100644 --- a/src/object-store/tests/object_store_test.rs +++ b/src/object-store/tests/object_store_test.rs @@ -13,22 +13,15 @@ // limitations under the License. use std::env; -use std::sync::Arc; use anyhow::Result; use common_telemetry::info; use common_test_util::temp_dir::create_temp_dir; -use object_store::layers::LruCacheLayer; +use object_store::ObjectStore; use object_store::services::{Fs, S3}; use object_store::test_util::TempFolder; -use object_store::{ObjectStore, ObjectStoreBuilder}; -use opendal::raw::oio::{List, Read}; -use opendal::raw::{Access, OpList, OpRead}; +use opendal::EntryMode; use opendal::services::{Azblob, Gcs, Oss}; -use opendal::{EntryMode, OperatorBuilder}; - -/// Duplicate of the constant in `src/layers/lru_cache/read_cache.rs` -const READ_CACHE_DIR: &str = "cache/object/read"; async fn test_object_crud(store: &ObjectStore) -> Result<()> { // Create object handler. 
@@ -231,249 +224,3 @@ async fn test_gcs_backend() -> Result<()> { } Ok(()) } - -#[tokio::test] -async fn test_file_backend_with_lru_cache() -> Result<()> { - common_telemetry::init_default_ut_logging(); - - let data_dir = create_temp_dir("test_file_backend_with_lru_cache"); - let tmp_dir = create_temp_dir("test_file_backend_with_lru_cache"); - let builder = Fs::default() - .root(&data_dir.path().to_string_lossy()) - .atomic_write_dir(&tmp_dir.path().to_string_lossy()); - - let store = builder.build().unwrap(); - - let cache_dir = create_temp_dir("test_file_backend_with_lru_cache"); - let cache_layer = { - let builder = Fs::default() - .root(&cache_dir.path().to_string_lossy()) - .atomic_write_dir(&cache_dir.path().to_string_lossy()); - let file_cache = Arc::new(builder.build().unwrap()); - - let cache_layer = LruCacheLayer::new(file_cache, 32).unwrap(); - cache_layer.recover_cache(true).await; - cache_layer - }; - - let store = OperatorBuilder::new(store) - .layer(cache_layer.clone()) - .finish(); - - test_object_crud(&store).await?; - test_object_list(&store).await?; - - assert_eq!(cache_layer.read_cache_stat().await, (0, 0)); - - Ok(()) -} - -async fn assert_lru_cache(cache_layer: &LruCacheLayer, file_names: &[&str]) { - for file_name in file_names { - let file_path = format!("{READ_CACHE_DIR}/{file_name}"); - assert!(cache_layer.contains_file(&file_path).await, "{file_path:?}"); - } -} - -async fn assert_cache_files( - store: &C, - file_names: &[&str], - file_contents: &[&str], -) -> Result<()> { - let (_, mut lister) = store.list("/", OpList::default()).await?; - let mut objects = vec![]; - while let Some(e) = lister.next().await? { - if e.mode() == EntryMode::FILE { - objects.push(e); - } - } - - // compare the cache file with the expected cache file; ignore orders - for o in objects { - let position = file_names.iter().position(|&x| x == o.path()); - assert!(position.is_some(), "file not found: {}", o.path()); - - let position = position.unwrap(); - let (_, mut r) = store.read(o.path(), OpRead::default()).await.unwrap(); - let bs = r.read_all().await.unwrap(); - assert_eq!( - file_contents[position], - String::from_utf8(bs.to_vec())?, - "file content not match: {}", - o.path() - ); - } - - Ok(()) -} - -#[tokio::test] -async fn test_object_store_cache_policy() -> Result<()> { - common_telemetry::init_default_ut_logging(); - // create file storage - let root_dir = create_temp_dir("test_object_store_cache_policy"); - let store = OperatorBuilder::new( - Fs::default() - .root(&root_dir.path().to_string_lossy()) - .atomic_write_dir(&root_dir.path().to_string_lossy()) - .build() - .unwrap(), - ) - .finish(); - - // create file cache layer - let cache_dir = create_temp_dir("test_object_store_cache_policy_cache"); - let atomic_temp_dir = create_temp_dir("test_object_store_cache_policy_cache_tmp"); - let builder = Fs::default() - .root(&cache_dir.path().to_string_lossy()) - .atomic_write_dir(&atomic_temp_dir.path().to_string_lossy()); - let file_cache = Arc::new(builder.build().unwrap()); - let cache_store = file_cache.clone(); - - // create operator for cache dir to verify cache file - let cache_layer = LruCacheLayer::new(cache_store.clone(), 38).unwrap(); - cache_layer.recover_cache(true).await; - let store = store.layer(cache_layer.clone()); - - // create several object handler. 
- // write data into object; - let p1 = "test_file1"; - let p2 = "test_file2"; - store.write(p1, "Hello, object1!").await.unwrap(); - store.write(p2, "Hello, object2!").await.unwrap(); - - // Try to read p1 and p2 - let _ = store.read_with(p1).range(0..).await?; - let _ = store.read(p1).await?; - let _ = store.read_with(p2).range(0..).await?; - let _ = store.read_with(p2).range(7..).await?; - let _ = store.read(p2).await?; - - assert_eq!(cache_layer.read_cache_stat().await, (3, 38)); - assert_cache_files( - &cache_store, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=0-", - "ecfe0dce85de452eb0a325158e7bfb75.cache-bytes=7-", - "ecfe0dce85de452eb0a325158e7bfb75.cache-bytes=0-", - ], - &["Hello, object1!", "object2!", "Hello, object2!"], - ) - .await?; - assert_lru_cache( - &cache_layer, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=0-", - "ecfe0dce85de452eb0a325158e7bfb75.cache-bytes=7-", - "ecfe0dce85de452eb0a325158e7bfb75.cache-bytes=0-", - ], - ) - .await; - - // Delete p2 file - store.delete(p2).await.unwrap(); - - assert_eq!(cache_layer.read_cache_stat().await, (1, 15)); - assert_cache_files( - &cache_store, - &["6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=0-"], - &["Hello, object1!"], - ) - .await?; - assert_lru_cache( - &cache_layer, - &["6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=0-"], - ) - .await; - - // Read the deleted file without a deterministic range size requires an extra `stat.` - // Therefore, it won't go into the cache. - assert!(store.read(p2).await.is_err()); - - let p3 = "test_file3"; - store.write(p3, "Hello, object3!").await.unwrap(); - - // Try to read p3 - let _ = store.read(p3).await.unwrap(); - let _ = store.read_with(p3).range(0..5).await.unwrap(); - - assert_eq!(cache_layer.read_cache_stat().await, (3, 35)); - - // However, The real open file happens after the reader is created. - // The reader will throw an error during the reading - // instead of returning `NotFound` during the reader creation. - // The entry count is 4, because we have the p2 `NotFound` cache. - assert!(store.read_with(p2).range(0..4).await.is_err()); - assert_eq!(cache_layer.read_cache_stat().await, (3, 35)); - - assert_cache_files( - &cache_store, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-4", - ], - &["Hello, object1!", "Hello, object3!", "Hello"], - ) - .await?; - assert_lru_cache( - &cache_layer, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-4", - ], - ) - .await; - - // try to read p1, p2, p3 - let _ = store.read(p3).await.unwrap(); - let _ = store.read_with(p3).range(0..5).await.unwrap(); - assert!(store.read(p2).await.is_err()); - // Read p1 with range `1..` , the existing p1 with range `0..` must be evicted. 
- let _ = store.read_with(p1).range(1..15).await.unwrap(); - assert_eq!(cache_layer.read_cache_stat().await, (3, 34)); - assert_cache_files( - &cache_store, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=1-14", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-4", - ], - &["ello, object1!", "Hello, object3!", "Hello"], - ) - .await?; - assert_lru_cache( - &cache_layer, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=1-14", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-4", - ], - ) - .await; - - let metric_text = common_telemetry::dump_metrics().unwrap(); - - assert!(metric_text.contains("object_store_lru_cache_hit")); - assert!(metric_text.contains("object_store_lru_cache_miss")); - - drop(cache_layer); - // Test recover - let cache_layer = LruCacheLayer::new(cache_store, 38).unwrap(); - cache_layer.recover_cache(true).await; - - // The p2 `NotFound` cache will not be recovered - assert_eq!(cache_layer.read_cache_stat().await, (3, 34)); - assert_lru_cache( - &cache_layer, - &[ - "6d29752bdc6e4d5ba5483b96615d6c48.cache-bytes=1-14", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-", - "a8b1dc21e24bb55974e3e68acc77ed52.cache-bytes=0-4", - ], - ) - .await; - - Ok(()) -} diff --git a/src/operator/Cargo.toml b/src/operator/Cargo.toml index d883c15689..82ddb12e20 100644 --- a/src/operator/Cargo.toml +++ b/src/operator/Cargo.toml @@ -36,6 +36,7 @@ common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true common-sql.workspace = true +common-stat.workspace = true common-telemetry.workspace = true common-time.workspace = true datafusion.workspace = true @@ -46,6 +47,7 @@ file-engine.workspace = true futures.workspace = true futures-util.workspace = true humantime.workspace = true +itertools.workspace = true jsonb.workspace = true lazy_static.workspace = true meta-client.workspace = true diff --git a/src/operator/src/bulk_insert.rs b/src/operator/src/bulk_insert.rs index 15b92958b4..cfc427e19c 100644 --- a/src/operator/src/bulk_insert.rs +++ b/src/operator/src/bulk_insert.rs @@ -22,9 +22,10 @@ use api::v1::region::{ }; use arrow::array::Array; use arrow::record_batch::RecordBatch; +use bytes::Bytes; use common_base::AffectedRows; use common_grpc::FlightData; -use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage}; +use common_grpc::flight::{FlightEncoder, FlightMessage}; use common_telemetry::error; use common_telemetry::tracing_context::TracingContext; use snafu::{OptionExt, ResultExt, ensure}; @@ -40,32 +41,20 @@ impl Inserter { pub async fn handle_bulk_insert( &self, table: TableRef, - decoder: &mut FlightDecoder, - data: FlightData, + raw_flight_data: FlightData, + record_batch: RecordBatch, + schema_bytes: Bytes, ) -> error::Result { let table_info = table.table_info(); let table_id = table_info.table_id(); let db_name = table_info.get_db_string(); - let decode_timer = metrics::HANDLE_BULK_INSERT_ELAPSED - .with_label_values(&["decode_request"]) - .start_timer(); - let body_size = data.data_body.len(); - // Build region server requests - let message = decoder - .try_decode(&data) - .context(error::DecodeFlightDataSnafu)? 
- .context(error::NotSupportedSnafu { - feat: "bulk insert RecordBatch with dictionary arrays", - })?; - let FlightMessage::RecordBatch(record_batch) = message else { - return Ok(0); - }; - decode_timer.observe_duration(); if record_batch.num_rows() == 0 { return Ok(0); } + let body_size = raw_flight_data.data_body.len(); + // TODO(yingwen): Fill record batch impure default values. Note that we should override `raw_flight_data` if we have to fill defaults. // notify flownode to update dirty timestamps if flow is configured. self.maybe_update_flow_dirty_window(table_info.clone(), record_batch.clone()); @@ -74,8 +63,6 @@ impl Inserter { .with_label_values(&["raw"]) .observe(record_batch.num_rows() as f64); - // safety: when reach here schema must be present. - let schema_bytes = decoder.schema_bytes().unwrap(); let partition_timer = metrics::HANDLE_BULK_INSERT_ELAPSED .with_label_values(&["partition"]) .start_timer(); @@ -105,6 +92,7 @@ impl Inserter { .find_region_leader(region_id) .await .context(error::FindRegionLeaderSnafu)?; + let request = RegionRequest { header: Some(RegionRequestHeader { tracing_context: TracingContext::from_current_span().to_w3c(), @@ -113,9 +101,9 @@ impl Inserter { body: Some(region_request::Body::BulkInsert(BulkInsertRequest { region_id: region_id.as_u64(), body: Some(bulk_insert_request::Body::ArrowIpc(ArrowIpc { - schema: schema_bytes, - data_header: data.data_header, - payload: data.data_body, + schema: schema_bytes.clone(), + data_header: raw_flight_data.data_header, + payload: raw_flight_data.data_body, })), })), }; @@ -157,8 +145,6 @@ impl Inserter { let mut handles = Vec::with_capacity(mask_per_datanode.len()); - // raw daya header and payload bytes. - let mut raw_data_bytes = None; for (peer, masks) in mask_per_datanode { for (region_id, mask) in masks { if mask.select_none() { @@ -169,13 +155,10 @@ impl Inserter { let node_manager = self.node_manager.clone(); let peer = peer.clone(); let raw_header_and_data = if mask.select_all() { - Some( - raw_data_bytes - .get_or_insert_with(|| { - (data.data_header.clone(), data.data_body.clone()) - }) - .clone(), - ) + Some(( + raw_flight_data.data_header.clone(), + raw_flight_data.data_body.clone(), + )) } else { None }; diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index 2ba71444e7..68576db582 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -578,7 +578,7 @@ pub enum Error { #[snafu(implicit)] location: Location, #[snafu(source)] - error: datafusion::error::DataFusionError, + error: common_datasource::error::Error, }, #[snafu(display( diff --git a/src/operator/src/expr_helper.rs b/src/operator/src/expr_helper.rs index 3fa9a0ae1f..4b7e0946cd 100644 --- a/src/operator/src/expr_helper.rs +++ b/src/operator/src/expr_helper.rs @@ -762,7 +762,8 @@ pub(crate) fn to_alter_table_expr( target_type, } => { let target_type = - sql_data_type_to_concrete_data_type(&target_type).context(ParseSqlSnafu)?; + sql_data_type_to_concrete_data_type(&target_type, &Default::default()) + .context(ParseSqlSnafu)?; let (target_type, target_type_extension) = ColumnDataTypeWrapper::try_from(target_type) .map(|w| w.to_parts()) .context(ColumnDataTypeSnafu)?; diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index c323677f43..201d5d99f4 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -353,10 +353,11 @@ impl Inserter { &self, insert: &Insert, ctx: &QueryContextRef, + statement_executor: &StatementExecutor, ) -> Result { let (inserts, table_info) = 
StatementToRegion::new(self.catalog_manager.as_ref(), &self.partition_manager, ctx) - .convert(insert, ctx) + .convert(insert, ctx, statement_executor) .await?; let table_infos = diff --git a/src/operator/src/req_convert/common.rs b/src/operator/src/req_convert/common.rs index 37529d55c6..63226ef8e4 100644 --- a/src/operator/src/req_convert/common.rs +++ b/src/operator/src/req_convert/common.rs @@ -223,7 +223,15 @@ fn push_column_to_rows(column: Column, rows: &mut [Row]) -> Result<()> { } } - )* }} + )* _ => { + return InvalidInsertRequestSnafu { + reason: format!( + "Column '{}' with type {:?} is not supported in row inserts.", + column.column_name, column_type + ), + } + .fail(); + } }} } push_column_values_match_types!( diff --git a/src/operator/src/req_convert/insert/fill_impure_default.rs b/src/operator/src/req_convert/insert/fill_impure_default.rs index 0de49611d9..0e39bc7241 100644 --- a/src/operator/src/req_convert/insert/fill_impure_default.rs +++ b/src/operator/src/req_convert/insert/fill_impure_default.rs @@ -36,6 +36,7 @@ pub fn find_all_impure_columns(table_info: &TableInfo) -> Vec { .collect() } +// TODO(yingwen): Support Bulk insert request. /// Fill impure default values in the request pub struct ImpureDefaultFiller { impure_columns: HashMap, @@ -62,7 +63,7 @@ impl ImpureDefaultFiller { column.default_constraint() ), })?; - let grpc_default_value = api::helper::to_proto_value(default_value); + let grpc_default_value = api::helper::to_grpc_value(default_value); let def = column_schemas_to_defs(vec![column], &pk_names)?.swap_remove(0); let grpc_column_schema = api::v1::ColumnSchema { column_name: def.name, diff --git a/src/operator/src/req_convert/insert/stmt_to_region.rs b/src/operator/src/req_convert/insert/stmt_to_region.rs index aca31b289a..ef4e7cac8e 100644 --- a/src/operator/src/req_convert/insert/stmt_to_region.rs +++ b/src/operator/src/req_convert/insert/stmt_to_region.rs @@ -12,13 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. 
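
The req_convert/common.rs hunk above closes the row-insert conversion macro with a catch-all arm, so an unsupported column type now produces a descriptive InvalidInsertRequest error instead of being silently skipped. A condensed illustration with stand-in types; the error message mirrors the one added above:

    #[derive(Debug)]
    enum ColumnType {
        Int64,
        Float64,
        Decimal256,
    }

    fn push_value(column_name: &str, column_type: &ColumnType) -> Result<(), String> {
        match column_type {
            // Supported conversions elided; the real macro expands one arm per type.
            ColumnType::Int64 | ColumnType::Float64 => Ok(()),
            other => Err(format!(
                "Column '{}' with type {:?} is not supported in row inserts.",
                column_name, other
            )),
        }
    }

    fn main() {
        assert!(push_value("count", &ColumnType::Int64).is_ok());
        assert!(push_value("value", &ColumnType::Float64).is_ok());
        assert!(push_value("amount", &ColumnType::Decimal256).is_err());
    }
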
-use api::helper::{ColumnDataTypeWrapper, value_to_grpc_value}; +use std::cell::LazyCell; +use std::collections::HashMap; + +use api::helper::{ColumnDataTypeWrapper, to_grpc_value}; +use api::v1::alter_table_expr::Kind; use api::v1::column_def::options_from_column_schema; use api::v1::region::InsertRequests as RegionInsertRequests; -use api::v1::{ColumnSchema as GrpcColumnSchema, Row, Rows, Value as GrpcValue}; +use api::v1::{ + AlterTableExpr, ColumnSchema as GrpcColumnSchema, ModifyColumnType, ModifyColumnTypes, Row, + Rows, +}; use catalog::CatalogManager; +use common_telemetry::info; use common_time::Timezone; +use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, SchemaRef}; +use datatypes::types::JsonType; +use datatypes::value::Value; use partition::manager::PartitionRuleManager; use session::context::{QueryContext, QueryContextRef}; use snafu::{OptionExt, ResultExt, ensure}; @@ -30,12 +41,13 @@ use table::metadata::TableInfoRef; use crate::error::{ CatalogSnafu, ColumnDataTypeSnafu, ColumnDefaultValueSnafu, ColumnNoneDefaultValueSnafu, - ColumnNotFoundSnafu, InvalidSqlSnafu, MissingInsertBodySnafu, ParseSqlSnafu, Result, - SchemaReadOnlySnafu, TableNotFoundSnafu, + ColumnNotFoundSnafu, InvalidInsertRequestSnafu, InvalidSqlSnafu, MissingInsertBodySnafu, + ParseSqlSnafu, Result, SchemaReadOnlySnafu, TableNotFoundSnafu, }; use crate::insert::InstantAndNormalInsertRequests; use crate::req_convert::common::partitioner::Partitioner; use crate::req_convert::insert::semantic_type; +use crate::statement::StatementExecutor; const DEFAULT_PLACEHOLDER_VALUE: &str = "default"; @@ -62,12 +74,12 @@ impl<'a> StatementToRegion<'a> { &self, stmt: &Insert, query_ctx: &QueryContextRef, + statement_executor: &StatementExecutor, ) -> Result<(InstantAndNormalInsertRequests, TableInfoRef)> { let name = stmt.table_name().context(ParseSqlSnafu)?; let (catalog, schema, table_name) = self.get_full_name(name)?; - let table = self.get_table(&catalog, &schema, &table_name).await?; + let mut table = self.get_table(&catalog, &schema, &table_name).await?; let table_schema = table.schema(); - let table_info = table.table_info(); ensure!( !common_catalog::consts::is_readonly_schema(&schema), @@ -94,7 +106,6 @@ impl<'a> StatementToRegion<'a> { Ok(()) })?; - let mut schema = Vec::with_capacity(column_count); let mut rows = vec![ Row { values: Vec::with_capacity(column_count) @@ -102,17 +113,57 @@ impl<'a> StatementToRegion<'a> { row_count ]; - for (i, column_name) in column_names.into_iter().enumerate() { - let column_schema = table_schema - .column_schema_by_name(column_name) - .with_context(|| ColumnNotFoundSnafu { - msg: format!("Column {} not found in table {}", column_name, &table_name), - })?; + fn find_insert_columns<'a>( + table: &'a TableRef, + column_names: &[&String], + ) -> Result> { + let schema = table.schema_ref(); + column_names + .iter() + .map(|name| { + schema + .column_schema_by_name(name) + .context(ColumnNotFoundSnafu { msg: *name }) + }) + .collect::>>() + } + let mut insert_columns = find_insert_columns(&table, &column_names)?; + let converter = SqlRowConverter::new(&insert_columns, query_ctx); + + // Convert the SQL values to GreptimeDB values, and merge a "largest" JSON types of all + // values on the way by `JsonColumnTypeUpdater`. 
+ let mut updater = JsonColumnTypeUpdater::new(statement_executor, query_ctx); + let value_rows = converter.convert(&mut updater, &sql_rows)?; + + // If the JSON values have a "larger" json type than the one in the table schema, modify + // the column's json type first, by executing an "alter table" DDL. + if updater + .maybe_update_column_type(&catalog, &schema, &table_name, &insert_columns) + .await? + { + // Update with the latest schema, if changed. + table = self.get_table(&catalog, &schema, &table_name).await?; + insert_columns = find_insert_columns(&table, &column_names)?; + } + + // Finally convert GreptimeDB values to GRPC values, ready to do insertion on Datanode. + for (i, row) in value_rows.into_iter().enumerate() { + for value in row { + let grpc_value = to_grpc_value(value); + rows[i].values.push(grpc_value); + } + } + + let table_info = table.table_info(); + let mut schema = Vec::with_capacity(column_count); + for column_schema in insert_columns { let (datatype, datatype_extension) = ColumnDataTypeWrapper::try_from(column_schema.data_type.clone()) .context(ColumnDataTypeSnafu)? .to_parts(); + + let column_name = &column_schema.name; let semantic_type = semantic_type(&table_info, column_name)?; let grpc_column_schema = GrpcColumnSchema { @@ -123,16 +174,6 @@ impl<'a> StatementToRegion<'a> { options: options_from_column_schema(column_schema), }; schema.push(grpc_column_schema); - - for (sql_row, grpc_row) in sql_rows.iter().zip(rows.iter_mut()) { - let value = sql_value_to_grpc_value( - column_schema, - &sql_row[i], - Some(&query_ctx.timezone()), - query_ctx.auto_string_to_numeric(), - )?; - grpc_row.values.push(value); - } } let requests = Partitioner::new(self.partition_manager) @@ -194,6 +235,147 @@ impl<'a> StatementToRegion<'a> { } } +struct SqlRowConverter<'a, 'b> { + insert_columns: &'a [&'a ColumnSchema], + query_context: &'b QueryContextRef, +} + +impl<'a, 'b> SqlRowConverter<'a, 'b> { + fn new(insert_columns: &'a [&'a ColumnSchema], query_context: &'b QueryContextRef) -> Self { + Self { + insert_columns, + query_context, + } + } + + fn convert( + &self, + updater: &mut JsonColumnTypeUpdater<'_, 'a>, + sql_rows: &[Vec], + ) -> Result>> { + let timezone = Some(&self.query_context.timezone()); + let auto_string_to_numeric = self.query_context.auto_string_to_numeric(); + + let mut value_rows = Vec::with_capacity(sql_rows.len()); + for sql_row in sql_rows { + let mut value_row = Vec::with_capacity(self.insert_columns.len()); + + for (insert_column, sql_value) in self.insert_columns.iter().zip(sql_row) { + let value = + sql_value_to_value(insert_column, sql_value, timezone, auto_string_to_numeric)?; + + updater.merge_types(insert_column, &value)?; + + value_row.push(value); + } + value_rows.push(value_row); + } + Ok(value_rows) + } +} + +struct JsonColumnTypeUpdater<'a, 'b> { + statement_executor: &'a StatementExecutor, + query_context: &'a QueryContextRef, + merged_value_types: LazyCell>, +} + +impl<'a, 'b> JsonColumnTypeUpdater<'a, 'b> { + fn new(statement_executor: &'a StatementExecutor, query_context: &'a QueryContextRef) -> Self { + Self { + statement_executor, + query_context, + merged_value_types: LazyCell::new(Default::default), + } + } + + fn merge_types(&mut self, column_schema: &'b ColumnSchema, value: &Value) -> Result<()> { + if !matches!(value, Value::Json(_)) { + return Ok(()); + } + + if let ConcreteDataType::Json(value_type) = value.data_type() { + let merged_type = self + .merged_value_types + .entry(&column_schema.name) + .or_insert_with(|| 
value_type.clone()); + + if !merged_type.is_include(&value_type) { + merged_type.merge(&value_type).map_err(|e| { + InvalidInsertRequestSnafu { + reason: format!(r#"cannot merge "{value_type}" into "{merged_type}": {e}"#), + } + .build() + })?; + } + } + Ok(()) + } + + async fn maybe_update_column_type( + self, + catalog: &str, + schema: &str, + table: &str, + insert_columns: &[&ColumnSchema], + ) -> Result { + let mut has_update = false; + for (column_name, merged_type) in self.merged_value_types.iter() { + let Some(column_type) = insert_columns + .iter() + .find_map(|x| (&x.name == column_name).then(|| x.data_type.as_json())) + .flatten() + else { + continue; + }; + if column_type.is_include(merged_type) { + continue; + } + + let new_column_type = { + let mut x = column_type.clone(); + x.merge(merged_type) + .map_err(|e| { + InvalidInsertRequestSnafu { + reason: format!( + r#"cannot merge "{merged_type}" into "{column_type}": {e}"# + ), + } + .build() + }) + .map(|()| x) + }?; + info!( + "updating table {}.{}.{} column {} json type: {} => {}", + catalog, schema, table, column_name, column_type, new_column_type, + ); + + let (target_type, target_type_extension) = + ColumnDataTypeWrapper::try_from(ConcreteDataType::Json(new_column_type)) + .context(ColumnDataTypeSnafu)? + .into_parts(); + let alter_expr = AlterTableExpr { + catalog_name: catalog.to_string(), + schema_name: schema.to_string(), + table_name: table.to_string(), + kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes { + modify_column_types: vec![ModifyColumnType { + column_name: column_name.to_string(), + target_type: target_type as i32, + target_type_extension, + }], + })), + }; + self.statement_executor + .alter_table_inner(alter_expr, self.query_context.clone()) + .await?; + + has_update = true; + } + Ok(has_update) + } +} + fn column_names<'a>(stmt: &'a Insert, table_schema: &'a SchemaRef) -> Vec<&'a String> { if !stmt.columns().is_empty() { stmt.columns() @@ -209,12 +391,12 @@ fn column_names<'a>(stmt: &'a Insert, table_schema: &'a SchemaRef) -> Vec<&'a St /// Converts SQL value to gRPC value according to the column schema. /// If `auto_string_to_numeric` is true, tries to cast the string value to numeric values, /// and fills the default value if the cast fails. -fn sql_value_to_grpc_value( +fn sql_value_to_value( column_schema: &ColumnSchema, sql_val: &SqlValue, timezone: Option<&Timezone>, auto_string_to_numeric: bool, -) -> Result { +) -> Result { let column = &column_schema.name; let value = if replace_default(sql_val) { let default_value = column_schema @@ -237,9 +419,25 @@ fn sql_value_to_grpc_value( ) .context(crate::error::SqlCommonSnafu)? }; + validate(&value)?; + Ok(value) +} - let grpc_value = value_to_grpc_value(value); - Ok(grpc_value) +fn validate(value: &Value) -> Result<()> { + match value { + Value::Json(value) => { + // Json object will be stored as Arrow struct in parquet, and it has the restriction: + // "Parquet does not support writing empty structs". 
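            // For example (hypothetical table `t` with JSON column `j`), the statement below
            // is rejected by this check instead of failing later in the Parquet writer:
            //
            //   INSERT INTO t (ts, j) VALUES (1, '{}');
            //   -- error: empty json object is not supported, consider adding a dummy field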
+ ensure!( + !value.is_empty_object(), + InvalidInsertRequestSnafu { + reason: "empty json object is not supported, consider adding a dummy field" + } + ); + Ok(()) + } + _ => Ok(()), + } } fn replace_default(sql_val: &SqlValue) -> bool { diff --git a/src/operator/src/request.rs b/src/operator/src/request.rs index 1bca461842..a5ed045313 100644 --- a/src/operator/src/request.rs +++ b/src/operator/src/request.rs @@ -15,19 +15,19 @@ use std::sync::Arc; use api::v1::region::region_request::Body as RegionRequestBody; -use api::v1::region::{CompactRequest, FlushRequest, RegionRequestHeader}; +use api::v1::region::{BuildIndexRequest, CompactRequest, FlushRequest, RegionRequestHeader}; use catalog::CatalogManagerRef; use common_catalog::build_db_string; use common_meta::node_manager::{AffectedRows, NodeManagerRef}; use common_meta::peer::Peer; use common_telemetry::tracing_context::TracingContext; -use common_telemetry::{error, info}; +use common_telemetry::{debug, error, info}; use futures_util::future; use partition::manager::{PartitionInfo, PartitionRuleManagerRef}; use session::context::QueryContextRef; use snafu::prelude::*; use store_api::storage::RegionId; -use table::requests::{CompactTableRequest, FlushTableRequest}; +use table::requests::{BuildIndexTableRequest, CompactTableRequest, FlushTableRequest}; use crate::error::{ CatalogSnafu, FindRegionLeaderSnafu, FindTablePartitionRuleSnafu, JoinTaskSnafu, @@ -90,6 +90,43 @@ impl Requester { .await } + /// Handle the request to build index for table. + pub async fn handle_table_build_index( + &self, + request: BuildIndexTableRequest, + ctx: QueryContextRef, + ) -> Result { + let partitions = self + .get_table_partitions( + &request.catalog_name, + &request.schema_name, + &request.table_name, + ) + .await?; + + let requests = partitions + .into_iter() + .map(|partition| { + RegionRequestBody::BuildIndex(BuildIndexRequest { + region_id: partition.id.into(), + }) + }) + .collect(); + + info!( + "Handle table manual build index for table {}", + request.table_name + ); + debug!("Request details: {:?}", request); + + self.do_request( + requests, + Some(build_db_string(&request.catalog_name, &request.schema_name)), + &ctx, + ) + .await + } + /// Handle the request to compact table. pub async fn handle_table_compaction( &self, @@ -201,6 +238,7 @@ impl Requester { let region_id = match req { RegionRequestBody::Flush(req) => req.region_id, RegionRequestBody::Compact(req) => req.region_id, + RegionRequestBody::BuildIndex(req) => req.region_id, _ => { error!("Unsupported region request: {:?}", req); return UnsupportedRegionRequestSnafu {}.fail(); diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index 5dd39681b6..9c1b6a749e 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -13,6 +13,7 @@ // limitations under the License. 
mod admin; +mod comment; mod copy_database; mod copy_query_to; mod copy_table_from; @@ -46,12 +47,13 @@ use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; use common_meta::procedure_executor::ProcedureExecutorRef; use common_query::Output; -use common_telemetry::tracing; +use common_telemetry::{debug, tracing, warn}; use common_time::Timestamp; use common_time::range::TimestampRange; use datafusion_expr::LogicalPlan; use datatypes::prelude::ConcreteDataType; use humantime::format_duration; +use itertools::Itertools; use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef}; use query::QueryEngineRef; use query::parser::QueryStatement; @@ -87,6 +89,22 @@ use crate::insert::InserterRef; use crate::statement::copy_database::{COPY_DATABASE_TIME_END_KEY, COPY_DATABASE_TIME_START_KEY}; use crate::statement::set::set_allow_query_fallback; +/// A configurator that customizes or enhances a [`StatementExecutor`]. +#[async_trait::async_trait] +pub trait StatementExecutorConfigurator: Send + Sync { + async fn configure( + &self, + executor: StatementExecutor, + ctx: ExecutorConfigureContext, + ) -> std::result::Result; +} + +pub type StatementExecutorConfiguratorRef = Arc; + +pub struct ExecutorConfigureContext { + pub kv_backend: KvBackendRef, +} + #[derive(Clone)] pub struct StatementExecutor { catalog_manager: CatalogManagerRef, @@ -105,15 +123,6 @@ pub struct StatementExecutor { pub type StatementExecutorRef = Arc; -/// Trait for creating [`TriggerQuerier`] instance. -#[cfg(feature = "enterprise")] -pub trait TriggerQuerierFactory: Send + Sync { - fn create(&self, kv_backend: KvBackendRef) -> TriggerQuerierRef; -} - -#[cfg(feature = "enterprise")] -pub type TriggerQuerierFactoryRef = Arc; - /// Trait for querying trigger info, such as `SHOW CREATE TRIGGER` etc. #[cfg(feature = "enterprise")] #[async_trait::async_trait] @@ -420,6 +429,7 @@ impl StatementExecutor { Statement::ShowCreateTrigger(show) => self.show_create_trigger(show, query_ctx).await, Statement::SetVariables(set_var) => self.set_variables(set_var, query_ctx), Statement::ShowVariables(show_variable) => self.show_variable(show_variable, query_ctx), + Statement::Comment(stmt) => self.comment(stmt, query_ctx).await, Statement::ShowColumns(show_columns) => { self.show_columns(show_columns, query_ctx).await } @@ -452,6 +462,13 @@ impl StatementExecutor { fn set_variables(&self, set_var: SetVariables, query_ctx: QueryContextRef) -> Result { let var_name = set_var.variable.to_string().to_uppercase(); + debug!( + "Trying to set {}={} for session: {} ", + var_name, + set_var.value.iter().map(|e| e.to_string()).join(", "), + query_ctx.conn_info() + ); + match var_name.as_str() { "READ_PREFERENCE" => set_read_preference(set_var.value, query_ctx)?, @@ -473,6 +490,11 @@ impl StatementExecutor { "@@SESSION.MAX_EXECUTION_TIME" | "MAX_EXECUTION_TIME" => match query_ctx.channel() { Channel::Mysql => set_query_timeout(set_var.value, query_ctx)?, Channel::Postgres => { + warn!( + "Unsupported set variable {} for channel {:?}", + var_name, + query_ctx.channel() + ); query_ctx.set_warning(format!("Unsupported set variable {}", var_name)) } _ => { @@ -482,16 +504,23 @@ impl StatementExecutor { .fail(); } }, - "STATEMENT_TIMEOUT" => { - if query_ctx.channel() == Channel::Postgres { - set_query_timeout(set_var.value, query_ctx)? 
- } else { + "STATEMENT_TIMEOUT" => match query_ctx.channel() { + Channel::Postgres => set_query_timeout(set_var.value, query_ctx)?, + Channel::Mysql => { + warn!( + "Unsupported set variable {} for channel {:?}", + var_name, + query_ctx.channel() + ); + query_ctx.set_warning(format!("Unsupported set variable {}", var_name)); + } + _ => { return NotSupportedSnafu { feat: format!("Unsupported set variable {}", var_name), } .fail(); } - } + }, "SEARCH_PATH" => { if query_ctx.channel() == Channel::Postgres { set_search_path(set_var.value, query_ctx)? @@ -503,14 +532,16 @@ impl StatementExecutor { } } _ => { - // for postgres, we give unknown SET statements a warning with - // success, this is prevent the SET call becoming a blocker - // of connection establishment - // - if query_ctx.channel() == Channel::Postgres { - query_ctx.set_warning(format!("Unsupported set variable {}", var_name)); - } else if query_ctx.channel() == Channel::Mysql && var_name.starts_with("@@") { - // Just ignore `SET @@` commands for MySQL + if query_ctx.channel() == Channel::Postgres || query_ctx.channel() == Channel::Mysql + { + // For unknown SET statements, we give a warning with success. + // This prevents the SET call from becoming a blocker of MySQL/Postgres clients' + // connection establishment. + warn!( + "Unsupported set variable {} for channel {:?}", + var_name, + query_ctx.channel() + ); query_ctx.set_warning(format!("Unsupported set variable {}", var_name)); } else { return NotSupportedSnafu { diff --git a/src/operator/src/statement/comment.rs b/src/operator/src/statement/comment.rs new file mode 100644 index 0000000000..d82d059ad9 --- /dev/null +++ b/src/operator/src/statement/comment.rs @@ -0,0 +1,176 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::CommentOnExpr; +use common_error::ext::BoxedError; +use common_meta::procedure_executor::ExecutorContext; +use common_meta::rpc::ddl::{CommentObjectType, CommentOnTask, DdlTask, SubmitDdlTaskRequest}; +use common_query::Output; +use session::context::QueryContextRef; +use session::table_name::table_idents_to_full_name; +use snafu::ResultExt; +use sql::ast::ObjectNamePartExt; +use sql::statements::comment::{Comment, CommentObject}; + +use crate::error::{ExecuteDdlSnafu, ExternalSnafu, InvalidSqlSnafu, Result}; +use crate::statement::StatementExecutor; + +impl StatementExecutor { + /// Adds a comment to a database object (table, column, or flow). + /// + /// # Arguments + /// + /// * `stmt`: A `Comment` struct containing the object to comment on and the comment text. + /// * `query_ctx`: A `QueryContextRef` providing contextual information for the query. + /// + /// # Returns + /// + /// A `Result` containing the `Output` of the operation, or an error if the operation fails. 
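    // Statements expected to route here, assuming the parser follows the conventional
    // `COMMENT ON ... IS '...'` syntax (object names below are hypothetical):
    //
    //   COMMENT ON TABLE my_table IS 'measurement data';
    //   COMMENT ON COLUMN my_table.host IS 'host name tag';
    //   COMMENT ON FLOW my_flow IS 'downsampling flow';
    //
    // Each variant is converted into a `CommentOnTask` and submitted through the DDL
    // procedure executor.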
+ pub async fn comment(&self, stmt: Comment, query_ctx: QueryContextRef) -> Result { + let comment_on_task = self.create_comment_on_task_from_stmt(stmt, &query_ctx)?; + + let request = SubmitDdlTaskRequest { + task: DdlTask::new_comment_on(comment_on_task), + query_context: query_ctx, + }; + + self.procedure_executor + .submit_ddl_task(&ExecutorContext::default(), request) + .await + .context(ExecuteDdlSnafu) + .map(|_| Output::new_with_affected_rows(0)) + } + + pub async fn comment_by_expr( + &self, + expr: CommentOnExpr, + query_ctx: QueryContextRef, + ) -> Result { + let comment_on_task = self.create_comment_on_task_from_expr(expr)?; + + let request = SubmitDdlTaskRequest { + task: DdlTask::new_comment_on(comment_on_task), + query_context: query_ctx, + }; + + self.procedure_executor + .submit_ddl_task(&ExecutorContext::default(), request) + .await + .context(ExecuteDdlSnafu) + .map(|_| Output::new_with_affected_rows(0)) + } + + fn create_comment_on_task_from_expr(&self, expr: CommentOnExpr) -> Result { + let object_type = match expr.object_type { + 0 => CommentObjectType::Table, + 1 => CommentObjectType::Column, + 2 => CommentObjectType::Flow, + _ => { + return InvalidSqlSnafu { + err_msg: format!( + "Invalid CommentObjectType value: {}. Valid values are: 0 (Table), 1 (Column), 2 (Flow)", + expr.object_type + ), + } + .fail(); + } + }; + + Ok(CommentOnTask { + catalog_name: expr.catalog_name, + schema_name: expr.schema_name, + object_type, + object_name: expr.object_name, + column_name: if expr.column_name.is_empty() { + None + } else { + Some(expr.column_name) + }, + object_id: None, + comment: if expr.comment.is_empty() { + None + } else { + Some(expr.comment) + }, + }) + } + + fn create_comment_on_task_from_stmt( + &self, + stmt: Comment, + query_ctx: &QueryContextRef, + ) -> Result { + match stmt.object { + CommentObject::Table(table) => { + let (catalog_name, schema_name, table_name) = + table_idents_to_full_name(&table, query_ctx) + .map_err(BoxedError::new) + .context(ExternalSnafu)?; + + Ok(CommentOnTask { + catalog_name, + schema_name, + object_type: CommentObjectType::Table, + object_name: table_name, + column_name: None, + object_id: None, + comment: stmt.comment, + }) + } + CommentObject::Column { table, column } => { + let (catalog_name, schema_name, table_name) = + table_idents_to_full_name(&table, query_ctx) + .map_err(BoxedError::new) + .context(ExternalSnafu)?; + + Ok(CommentOnTask { + catalog_name, + schema_name, + object_type: CommentObjectType::Column, + object_name: table_name, + column_name: Some(column.value), + object_id: None, + comment: stmt.comment, + }) + } + CommentObject::Flow(flow_name) => { + let (catalog_name, flow_name_str) = match &flow_name.0[..] { + [flow] => ( + query_ctx.current_catalog().to_string(), + flow.to_string_unquoted(), + ), + [catalog, flow] => (catalog.to_string_unquoted(), flow.to_string_unquoted()), + _ => { + return InvalidSqlSnafu { + err_msg: format!( + "expect flow name to be . 
or , actual: {flow_name}" + ), + } + .fail(); + } + }; + + Ok(CommentOnTask { + catalog_name, + schema_name: String::new(), // Flow doesn't use schema + object_type: CommentObjectType::Flow, + object_name: flow_name_str, + column_name: None, + object_id: None, + comment: stmt.comment, + }) + } + } + } +} diff --git a/src/operator/src/statement/copy_database.rs b/src/operator/src/statement/copy_database.rs index c7cf0b47b0..cd8eeb6d79 100644 --- a/src/operator/src/statement/copy_database.rs +++ b/src/operator/src/statement/copy_database.rs @@ -12,14 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; +use std::sync::Arc; use client::{Output, OutputData, OutputMeta}; +use common_catalog::format_full_table_name; use common_datasource::file_format::Format; use common_datasource::lister::{Lister, Source}; use common_datasource::object_store::build_backend; +use common_stat::get_total_cpu_cores; use common_telemetry::{debug, error, info, tracing}; +use futures::future::try_join_all; use object_store::Entry; use regex::Regex; use session::context::QueryContextRef; @@ -27,6 +32,7 @@ use snafu::{OptionExt, ResultExt, ensure}; use store_api::metric_engine_consts::{LOGICAL_TABLE_METADATA_KEY, METRIC_ENGINE_NAME}; use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest}; use table::table_reference::TableReference; +use tokio::sync::Semaphore; use crate::error; use crate::error::{CatalogSnafu, InvalidCopyDatabasePathSnafu}; @@ -35,6 +41,16 @@ use crate::statement::StatementExecutor; pub(crate) const COPY_DATABASE_TIME_START_KEY: &str = "start_time"; pub(crate) const COPY_DATABASE_TIME_END_KEY: &str = "end_time"; pub(crate) const CONTINUE_ON_ERROR_KEY: &str = "continue_on_error"; +pub(crate) const PARALLELISM_KEY: &str = "parallelism"; + +/// Get parallelism from options, default to total CPU cores. +fn parse_parallelism_from_option_map(options: &HashMap) -> usize { + options + .get(PARALLELISM_KEY) + .and_then(|v| v.parse::().ok()) + .unwrap_or_else(get_total_cpu_cores) + .max(1) +} impl StatementExecutor { #[tracing::instrument(skip_all)] @@ -51,22 +67,26 @@ impl StatementExecutor { } ); + let parallelism = parse_parallelism_from_option_map(&req.with); info!( - "Copy database {}.{} to dir: {}, time: {:?}", - req.catalog_name, req.schema_name, req.location, req.time_range + "Copy database {}.{} to dir: {}, time: {:?}, parallelism: {}", + req.catalog_name, req.schema_name, req.location, req.time_range, parallelism ); let table_names = self .catalog_manager .table_names(&req.catalog_name, &req.schema_name, Some(&ctx)) .await .context(CatalogSnafu)?; + let num_tables = table_names.len(); let suffix = Format::try_from(&req.with) .context(error::ParseFileFormatSnafu)? 
.suffix(); - let mut exported_rows = 0; - for table_name in table_names { + let mut tasks = Vec::with_capacity(num_tables); + let semaphore = Arc::new(Semaphore::new(parallelism)); + + for (i, table_name) in table_names.into_iter().enumerate() { let table = self .get_table(&TableReference { catalog: &req.catalog_name, @@ -89,33 +109,40 @@ impl StatementExecutor { { continue; } + + let semaphore_moved = semaphore.clone(); let mut table_file = req.location.clone(); table_file.push_str(&table_name); table_file.push_str(suffix); - info!( - "Copy table: {}.{}.{} to {}", - req.catalog_name, req.schema_name, table_name, table_file - ); + let table_no = i + 1; + let moved_ctx = ctx.clone(); + let full_table_name = + format_full_table_name(&req.catalog_name, &req.schema_name, &table_name); + let copy_table_req = CopyTableRequest { + catalog_name: req.catalog_name.clone(), + schema_name: req.schema_name.clone(), + table_name, + location: table_file.clone(), + with: req.with.clone(), + connection: req.connection.clone(), + pattern: None, + direction: CopyDirection::Export, + timestamp_range: req.time_range, + limit: None, + }; - let exported = self - .copy_table_to( - CopyTableRequest { - catalog_name: req.catalog_name.clone(), - schema_name: req.schema_name.clone(), - table_name, - location: table_file, - with: req.with.clone(), - connection: req.connection.clone(), - pattern: None, - direction: CopyDirection::Export, - timestamp_range: req.time_range, - limit: None, - }, - ctx.clone(), - ) - .await?; - exported_rows += exported; + tasks.push(async move { + let _permit = semaphore_moved.acquire().await.unwrap(); + info!( + "Copy table({}/{}): {} to {}", + table_no, num_tables, full_table_name, table_file + ); + self.copy_table_to(copy_table_req, moved_ctx).await + }); } + + let results = try_join_all(tasks).await?; + let exported_rows = results.into_iter().sum(); Ok(Output::new_with_affected_rows(exported_rows)) } @@ -134,9 +161,10 @@ impl StatementExecutor { } ); + let parallelism = parse_parallelism_from_option_map(&req.with); info!( - "Copy database {}.{} from dir: {}, time: {:?}", - req.catalog_name, req.schema_name, req.location, req.time_range + "Copy database {}.{} from dir: {}, time: {:?}, parallelism: {}", + req.catalog_name, req.schema_name, req.location, req.time_range, parallelism ); let suffix = Format::try_from(&req.with) .context(error::ParseFileFormatSnafu)? 
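Both copy directions now run per-table tasks concurrently, bounded by a semaphore sized from the new `parallelism` option, which defaults to the host's CPU core count and is clamped to at least 1. An illustrative use, with a hypothetical database and path, and assuming the option is passed through the statement's `WITH` clause like the existing `format` option:

    COPY DATABASE public TO '/tmp/export/' WITH (format = 'parquet', parallelism = 8);
    COPY DATABASE public FROM '/tmp/export/' WITH (format = 'parquet', parallelism = 4);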
@@ -150,8 +178,8 @@ impl StatementExecutor { .and_then(|v| bool::from_str(v).ok()) .unwrap_or(false); - let mut rows_inserted = 0; - let mut insert_cost = 0; + let mut tasks = Vec::with_capacity(entries.len()); + let semaphore = Arc::new(Semaphore::new(parallelism)); for e in entries { let table_name = match parse_file_name_to_copy(&e) { @@ -165,6 +193,7 @@ impl StatementExecutor { } } }; + let req = CopyTableRequest { catalog_name: req.catalog_name.clone(), schema_name: req.schema_name.clone(), @@ -177,23 +206,36 @@ impl StatementExecutor { timestamp_range: None, limit: None, }; - debug!("Copy table, arg: {:?}", req); - match self.copy_table_from(req, ctx.clone()).await { - Ok(o) => { - let (rows, cost) = o.extract_rows_and_cost(); - rows_inserted += rows; - insert_cost += cost; - } - Err(err) => { - if continue_on_error { - error!(err; "Failed to import file to table: {}", table_name); - continue; - } else { - return Err(err); + let moved_ctx = ctx.clone(); + let moved_table_name = table_name.clone(); + let moved_semaphore = semaphore.clone(); + tasks.push(async move { + let _permit = moved_semaphore.acquire().await.unwrap(); + debug!("Copy table, arg: {:?}", req); + match self.copy_table_from(req, moved_ctx).await { + Ok(o) => { + let (rows, cost) = o.extract_rows_and_cost(); + Ok((rows, cost)) + } + Err(err) => { + if continue_on_error { + error!(err; "Failed to import file to table: {}", moved_table_name); + Ok((0, 0)) + } else { + Err(err) + } } } - } + }); } + + let results = try_join_all(tasks).await?; + let (rows_inserted, insert_cost) = results + .into_iter() + .fold((0, 0), |(acc_rows, acc_cost), (rows, cost)| { + (acc_rows + rows, acc_cost + cost) + }); + Ok(Output::new( OutputData::AffectedRows(rows_inserted), OutputMeta::new_with_cost(insert_cost), @@ -229,15 +271,18 @@ async fn list_files_to_copy(req: &CopyDatabaseRequest, suffix: &str) -> error::R #[cfg(test)] mod tests { - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; + use common_stat::get_total_cpu_cores; use object_store::ObjectStore; use object_store::services::Fs; use object_store::util::normalize_dir; use path_slash::PathExt; use table::requests::CopyDatabaseRequest; - use crate::statement::copy_database::{list_files_to_copy, parse_file_name_to_copy}; + use crate::statement::copy_database::{ + list_files_to_copy, parse_file_name_to_copy, parse_parallelism_from_option_map, + }; #[tokio::test] async fn test_list_files_and_parse_table_name() { @@ -276,4 +321,16 @@ mod tests { listed ); } + + #[test] + fn test_parse_parallelism_from_option_map() { + let options = HashMap::new(); + assert_eq!( + parse_parallelism_from_option_map(&options), + get_total_cpu_cores() + ); + + let options = HashMap::from([("parallelism".to_string(), "0".to_string())]); + assert_eq!(parse_parallelism_from_option_map(&options), 1); + } } diff --git a/src/operator/src/statement/copy_table_from.rs b/src/operator/src/statement/copy_table_from.rs index da120ff1bf..35cfdc7830 100644 --- a/src/operator/src/statement/copy_table_from.rs +++ b/src/operator/src/statement/copy_table_from.rs @@ -20,8 +20,9 @@ use std::sync::Arc; use client::{Output, OutputData, OutputMeta}; use common_base::readable_size::ReadableSize; use common_datasource::file_format::csv::CsvFormat; +use common_datasource::file_format::json::JsonFormat; use common_datasource::file_format::orc::{ReaderAdapter, infer_orc_schema, new_orc_stream_reader}; -use common_datasource::file_format::{FileFormat, Format}; +use common_datasource::file_format::{FileFormat, 
Format, file_to_stream}; use common_datasource::lister::{Lister, Source}; use common_datasource::object_store::{FS_SCHEMA, build_backend, parse_url}; use common_datasource::util::find_dir_and_filename; @@ -29,14 +30,9 @@ use common_query::{OutputCost, OutputRows}; use common_recordbatch::DfSendableRecordBatchStream; use common_recordbatch::adapter::RecordBatchStreamTypeAdapter; use common_telemetry::{debug, tracing}; -use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{ - CsvSource, FileGroup, FileScanConfigBuilder, FileSource, FileStream, JsonSource, -}; +use datafusion::datasource::physical_plan::{CsvSource, FileSource, JsonSource}; use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder; use datafusion::parquet::arrow::arrow_reader::ArrowReaderMetadata; -use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_expr::Expr; use datatypes::arrow::compute::can_cast_types; use datatypes::arrow::datatypes::{DataType as ArrowDataType, Schema, SchemaRef}; @@ -55,6 +51,7 @@ use crate::statement::StatementExecutor; const DEFAULT_BATCH_SIZE: usize = 8192; const DEFAULT_READ_BUFFER: usize = 256 * 1024; + enum FileMetadata { Parquet { schema: SchemaRef, @@ -67,6 +64,7 @@ enum FileMetadata { }, Json { schema: SchemaRef, + format: JsonFormat, path: String, }, Csv { @@ -147,6 +145,7 @@ impl StatementExecutor { .await .context(error::InferSchemaSnafu { path: &path })?, ), + format, path, }), Format::Parquet(_) => { @@ -195,33 +194,6 @@ impl StatementExecutor { } } - async fn build_file_stream( - &self, - store: &ObjectStore, - filename: &str, - file_schema: SchemaRef, - file_source: Arc, - projection: Option>, - ) -> Result { - let config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - file_schema, - file_source.clone(), - ) - .with_file_group(FileGroup::new(vec![PartitionedFile::new(filename, 0)])) - .with_projection(projection) - .build(); - - let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone())); - let file_opener = file_source - .with_projection(&config) - .create_file_opener(store, &config, 0); - let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new()) - .context(error::BuildFileStreamSnafu)?; - - Ok(Box::pin(stream)) - } - async fn build_read_stream( &self, compat_schema: SchemaRef, @@ -245,16 +217,16 @@ impl StatementExecutor { let csv_source = CsvSource::new(format.has_header, format.delimiter, b'"') .with_schema(schema.clone()) .with_batch_size(DEFAULT_BATCH_SIZE); - - let stream = self - .build_file_stream( - object_store, - path, - schema.clone(), - csv_source, - Some(projection), - ) - .await?; + let stream = file_to_stream( + object_store, + path, + schema.clone(), + csv_source, + Some(projection), + format.compression_type, + ) + .await + .context(error::BuildFileStreamSnafu)?; Ok(Box::pin( // The projection is already applied in the CSV reader when we created the stream, @@ -264,7 +236,11 @@ impl StatementExecutor { .context(error::PhysicalExprSnafu)?, )) } - FileMetadata::Json { path, schema } => { + FileMetadata::Json { + path, + format, + schema, + } => { let output_schema = Arc::new( compat_schema .project(&projection) @@ -274,16 +250,16 @@ impl StatementExecutor { let json_source = JsonSource::new() .with_schema(schema.clone()) .with_batch_size(DEFAULT_BATCH_SIZE); - - let stream = self - .build_file_stream( - object_store, - path, - schema.clone(), - json_source, - 
Some(projection), - ) - .await?; + let stream = file_to_stream( + object_store, + path, + schema.clone(), + json_source, + Some(projection), + format.compression_type, + ) + .await + .context(error::BuildFileStreamSnafu)?; Ok(Box::pin( // The projection is already applied in the JSON reader when we created the stream, diff --git a/src/operator/src/statement/copy_table_to.rs b/src/operator/src/statement/copy_table_to.rs index d542f8acbb..3e982373c4 100644 --- a/src/operator/src/statement/copy_table_to.rs +++ b/src/operator/src/statement/copy_table_to.rs @@ -76,12 +76,13 @@ impl StatementExecutor { ) .await .context(error::WriteStreamToFileSnafu { path }), - Format::Json(_) => stream_to_json( + Format::Json(format) => stream_to_json( Box::pin(DfRecordBatchStreamAdapter::new(stream)), object_store, path, threshold, WRITE_CONCURRENCY, + format, ) .await .context(error::WriteStreamToFileSnafu { path }), diff --git a/src/operator/src/statement/dml.rs b/src/operator/src/statement/dml.rs index 827bfd8b66..41169398ab 100644 --- a/src/operator/src/statement/dml.rs +++ b/src/operator/src/statement/dml.rs @@ -28,7 +28,7 @@ impl StatementExecutor { if insert.can_extract_values() { // Fast path: plain insert ("insert with literal values") is executed directly self.inserter - .handle_statement_insert(insert.as_ref(), &query_ctx) + .handle_statement_insert(insert.as_ref(), &query_ctx, self) .await } else { // Slow path: insert with subquery. Execute using query engine. diff --git a/src/operator/src/statement/show.rs b/src/operator/src/statement/show.rs index 11c34fb2ff..08c0b4661a 100644 --- a/src/operator/src/statement/show.rs +++ b/src/operator/src/statement/show.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::Arc; + use common_error::ext::BoxedError; use common_meta::key::schema_name::SchemaNameKey; use common_query::Output; @@ -120,7 +122,30 @@ impl StatementExecutor { table: TableRef, query_ctx: QueryContextRef, ) -> Result { - let table_info = table.table_info(); + let mut table_info = table.table_info(); + let partition_column_names: Vec<_> = + table_info.meta.partition_column_names().cloned().collect(); + + if let Some(latest) = self + .table_metadata_manager + .table_info_manager() + .get(table_info.table_id()) + .await + .context(TableMetadataManagerSnafu)? 
+ { + let mut latest_info = TableInfo::try_from(latest.into_inner().table_info) + .context(error::CreateTableInfoSnafu)?; + + if !partition_column_names.is_empty() { + latest_info.meta.partition_key_indices = partition_column_names + .iter() + .filter_map(|name| latest_info.meta.schema.column_index_by_name(name.as_str())) + .collect(); + } + + table_info = Arc::new(latest_info); + } + if table_info.table_type != TableType::Base { return error::ShowCreateTableBaseOnlySnafu { table_name: table_name.to_string(), @@ -150,7 +175,7 @@ impl StatementExecutor { let partitions = create_partitions_stmt(&table_info, partitions)?; - query::sql::show_create_table(table, schema_options, partitions, query_ctx) + query::sql::show_create_table(table_info, schema_options, partitions, query_ctx) .context(ExecuteStatementSnafu) } diff --git a/src/operator/src/table.rs b/src/operator/src/table.rs index 52c37bb401..13ed57200c 100644 --- a/src/operator/src/table.rs +++ b/src/operator/src/table.rs @@ -23,8 +23,8 @@ use session::context::QueryContextRef; use snafu::ResultExt; use store_api::storage::RegionId; use table::requests::{ - CompactTableRequest, DeleteRequest as TableDeleteRequest, FlushTableRequest, - InsertRequest as TableInsertRequest, + BuildIndexTableRequest, CompactTableRequest, DeleteRequest as TableDeleteRequest, + FlushTableRequest, InsertRequest as TableInsertRequest, }; use crate::delete::DeleterRef; @@ -97,6 +97,18 @@ impl TableMutationHandler for TableMutationOperator { .context(query_error::TableMutationSnafu) } + async fn build_index( + &self, + request: BuildIndexTableRequest, + ctx: QueryContextRef, + ) -> QueryResult { + self.requester + .handle_table_build_index(request, ctx) + .await + .map_err(BoxedError::new) + .context(query_error::TableMutationSnafu) + } + async fn flush_region( &self, region_id: RegionId, diff --git a/src/partition/src/collider.rs b/src/partition/src/collider.rs index 1bd5000f9d..c426e84575 100644 --- a/src/partition/src/collider.rs +++ b/src/partition/src/collider.rs @@ -173,6 +173,9 @@ impl<'a> Collider<'a> { for (column, mut column_values) in values { column_values.sort_unstable(); column_values.dedup(); // Remove duplicates + + // allowed because we have carefully implemented `Hash` to eliminate the mutable + #[allow(clippy::mutable_key_type)] let mut value_map = HashMap::with_capacity(column_values.len()); let mut start_value = ZERO; for value in column_values { diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 74b136e873..d27b087ba3 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -33,7 +33,7 @@ fn processor_mut( .exec_mut(v, pipeline_ctx, schema_info)? 
.into_transformed() .expect("expect transformed result "); - result.push(r.0); + result.extend(r.into_iter().map(|v| v.0)); } Ok(result) diff --git a/src/pipeline/src/error.rs b/src/pipeline/src/error.rs index 73ffb711a1..651f1cd4a9 100644 --- a/src/pipeline/src/error.rs +++ b/src/pipeline/src/error.rs @@ -19,6 +19,7 @@ use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use datatypes::timestamp::TimestampNanosecond; use snafu::{Location, Snafu}; +use vrl::value::Kind; #[derive(Snafu)] #[snafu(visibility(pub))] @@ -676,8 +677,12 @@ pub enum Error { location: Location, }, - #[snafu(display("Vrl script should return `.` in the end"))] + #[snafu(display( + "Vrl script should return object or array in the end, got `{:?}`", + result_kind + ))] VrlReturnValue { + result_kind: Kind, #[snafu(implicit)] location: Location, }, @@ -695,6 +700,25 @@ pub enum Error { location: Location, }, + #[snafu(display( + "Array element at index {index} must be an object for one-to-many transformation, got {actual_type}" + ))] + ArrayElementMustBeObject { + index: usize, + actual_type: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to transform array element at index {index}: {source}"))] + TransformArrayElement { + index: usize, + #[snafu(source)] + source: Box, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to build DataFusion logical plan"))] BuildDfLogicalPlan { #[snafu(source)] @@ -792,7 +816,10 @@ impl ErrorExt for Error { | InvalidPipelineVersion { .. } | InvalidCustomTimeIndex { .. } | TimeIndexMustBeNonNull { .. } => StatusCode::InvalidArguments, - MultiPipelineWithDiffSchema { .. } | ValueMustBeMap { .. } => StatusCode::IllegalState, + MultiPipelineWithDiffSchema { .. } + | ValueMustBeMap { .. } + | ArrayElementMustBeObject { .. } => StatusCode::IllegalState, + TransformArrayElement { source, .. } => source.status_code(), BuildDfLogicalPlan { .. } | RecordBatchLenNotMatch { .. } => StatusCode::Internal, ExecuteInternalStatement { source, .. } => source.status_code(), DataFrame { source, .. 
} => source.status_code(), diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 2fe2a7ba53..dd4d540376 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -19,6 +19,8 @@ pub mod processor; pub mod transform; pub mod value; +use std::collections::HashMap; + use api::v1::Row; use common_time::timestamp::TimeUnit; use itertools::Itertools; @@ -30,13 +32,17 @@ use yaml_rust::{Yaml, YamlLoader}; use crate::dispatcher::{Dispatcher, Rule}; use crate::error::{ - AutoTransformOneTimestampSnafu, Error, IntermediateKeyIndexSnafu, InvalidVersionNumberSnafu, - Result, YamlLoadSnafu, YamlParseSnafu, + ArrayElementMustBeObjectSnafu, AutoTransformOneTimestampSnafu, Error, + IntermediateKeyIndexSnafu, InvalidVersionNumberSnafu, Result, TransformArrayElementSnafu, + YamlLoadSnafu, YamlParseSnafu, }; use crate::etl::processor::ProcessorKind; -use crate::etl::transform::transformer::greptime::values_to_row; +use crate::etl::transform::transformer::greptime::{RowWithTableSuffix, values_to_rows}; use crate::tablesuffix::TableSuffixTemplate; -use crate::{ContextOpt, GreptimeTransformer, IdentityTimeIndex, PipelineContext, SchemaInfo}; +use crate::{ + ContextOpt, GreptimeTransformer, IdentityTimeIndex, PipelineContext, SchemaInfo, + unwrap_or_continue_if_err, +}; const DESCRIPTION: &str = "description"; const DOC_VERSION: &str = "version"; @@ -230,21 +236,51 @@ pub enum PipelineExecOutput { Filtered, } +/// Output from a successful pipeline transformation. +/// +/// Rows are grouped by their ContextOpt, with each row having its own optional +/// table_suffix for routing to different tables when using one-to-many expansion. +/// This enables true per-row configuration options where different rows can have +/// different database settings (TTL, merge mode, etc.). #[derive(Debug)] pub struct TransformedOutput { - pub opt: ContextOpt, - pub row: Row, - pub table_suffix: Option, + /// Rows grouped by their ContextOpt, each with optional table suffix + pub rows_by_context: HashMap>, } impl PipelineExecOutput { // Note: This is a test only function, do not use it in production. - pub fn into_transformed(self) -> Option<(Row, Option)> { - if let Self::Transformed(TransformedOutput { - row, table_suffix, .. 
- }) = self - { - Some((row, table_suffix)) + pub fn into_transformed(self) -> Option> { + if let Self::Transformed(TransformedOutput { rows_by_context }) = self { + // For backward compatibility, merge all rows with a default ContextOpt + Some(rows_by_context.into_values().flatten().collect()) + } else { + None + } + } + + // New method for accessing the HashMap structure directly + pub fn into_transformed_hashmap(self) -> Option>> { + if let Self::Transformed(TransformedOutput { rows_by_context }) = self { + Some(rows_by_context) + } else { + None + } + } + + // Backward compatibility helper that returns first ContextOpt with all its rows + // or merges all rows with default ContextOpt for multi-context scenarios + pub fn into_legacy_format(self) -> Option<(ContextOpt, Vec)> { + if let Self::Transformed(TransformedOutput { rows_by_context }) = self { + if rows_by_context.len() == 1 { + let (opt, rows) = rows_by_context.into_iter().next().unwrap(); + Some((opt, rows)) + } else { + // Multiple contexts: merge all rows with default ContextOpt for test compatibility + let all_rows: Vec = + rows_by_context.into_values().flatten().collect(); + Some((ContextOpt::default(), all_rows)) + } } else { None } @@ -285,45 +321,43 @@ impl Pipeline { return Ok(PipelineExecOutput::DispatchedTo(rule.into(), val)); } - // extract the options first - // this might be a breaking change, for table_suffix is now right after the processors - let mut opt = ContextOpt::from_pipeline_map_to_opt(&mut val)?; - let table_suffix = opt.resolve_table_suffix(self.tablesuffix.as_ref(), &val); + let mut val = if val.is_array() { + val + } else { + VrlValue::Array(vec![val]) + }; - let row = match self.transformer() { + let rows_by_context = match self.transformer() { TransformerMode::GreptimeTransformer(greptime_transformer) => { - let values = greptime_transformer.transform_mut(&mut val, self.is_v1())?; - if self.is_v1() { - // v1 dont combine with auto-transform - // so return immediately - return Ok(PipelineExecOutput::Transformed(TransformedOutput { - opt, - row: Row { values }, - table_suffix, - })); - } - // continue v2 process, and set the rest fields with auto-transform - // if transformer presents, then ts has been set - values_to_row(schema_info, val, pipeline_ctx, Some(values), false)? + transform_array_elements_by_ctx( + // SAFETY: by line 326, val must be an array + val.as_array_mut().unwrap(), + greptime_transformer, + self.is_v1(), + schema_info, + pipeline_ctx, + self.tablesuffix.as_ref(), + )? } TransformerMode::AutoTransform(ts_name, time_unit) => { - // infer ts from the context - // we've check that only one timestamp should exist - - // Create pipeline context with the found timestamp let def = crate::PipelineDefinition::GreptimeIdentityPipeline(Some( IdentityTimeIndex::Epoch(ts_name.clone(), *time_unit, false), )); let n_ctx = PipelineContext::new(&def, pipeline_ctx.pipeline_param, pipeline_ctx.channel); - values_to_row(schema_info, val, &n_ctx, None, true)? + values_to_rows( + schema_info, + val, + &n_ctx, + None, + true, + self.tablesuffix.as_ref(), + )? } }; Ok(PipelineExecOutput::Transformed(TransformedOutput { - opt, - row, - table_suffix, + rows_by_context, })) } @@ -350,6 +384,65 @@ impl Pipeline { } } +/// Transforms an array of VRL values into rows grouped by their ContextOpt. +/// Each element can have its own ContextOpt for per-row configuration. 
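// Sketch of the resulting grouping, assuming a VRL processor expanded one input event into
// three rows where two share the same per-row hints (field names and values are shown
// schematically and are hypothetical):
//
//   rows_by_context = {
//       ContextOpt { /* e.g. ttl "1h" */ .. } => [(row_critical, Some("_cpu"))],
//       ContextOpt { /* e.g. ttl "7d" */ .. } => [(row_info_a, None), (row_info_b, None)],
//   }
//
// Rows that resolve to the same `ContextOpt` are grouped under one set of query-context
// options when insert requests are built, while each row's optional table suffix still
// controls which table it targets.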
+fn transform_array_elements_by_ctx( + arr: &mut [VrlValue], + transformer: &GreptimeTransformer, + is_v1: bool, + schema_info: &mut SchemaInfo, + pipeline_ctx: &PipelineContext<'_>, + tablesuffix_template: Option<&TableSuffixTemplate>, +) -> Result>> { + let skip_error = pipeline_ctx.pipeline_param.skip_error(); + let mut rows_by_context = HashMap::new(); + + for (index, element) in arr.iter_mut().enumerate() { + if !element.is_object() { + unwrap_or_continue_if_err!( + ArrayElementMustBeObjectSnafu { + index, + actual_type: element.kind_str().to_string(), + } + .fail(), + skip_error + ); + } + + let values = + unwrap_or_continue_if_err!(transformer.transform_mut(element, is_v1), skip_error); + if is_v1 { + // v1 mode: just use transformer output directly + let mut opt = unwrap_or_continue_if_err!( + ContextOpt::from_pipeline_map_to_opt(element), + skip_error + ); + let table_suffix = opt.resolve_table_suffix(tablesuffix_template, element); + rows_by_context + .entry(opt) + .or_insert_with(Vec::new) + .push((Row { values }, table_suffix)); + } else { + // v2 mode: combine with auto-transform for remaining fields + let element_rows_map = values_to_rows( + schema_info, + element.clone(), + pipeline_ctx, + Some(values), + false, + tablesuffix_template, + ) + .map_err(Box::new) + .context(TransformArrayElementSnafu { index })?; + for (k, v) in element_rows_map { + rows_by_context.entry(k).or_default().extend(v); + } + } + } + + Ok(rows_by_context) +} + pub(crate) fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result { intermediate_keys .iter() @@ -361,7 +454,7 @@ pub(crate) fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str /// The schema_info cannot be used in auto-transform ts-infer mode for lacking the ts schema. /// /// Usage: -/// ```rust +/// ```ignore /// let (pipeline, schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); /// let pipeline_ctx = PipelineContext::new(&pipeline_def, &pipeline_param, Channel::Unknown); /// ``` @@ -382,6 +475,7 @@ macro_rules! 
setup_pipeline { (pipeline, schema_info, pipeline_def, pipeline_param) }}; } + #[cfg(test)] mod tests { use std::collections::BTreeMap; @@ -433,15 +527,16 @@ transform: ); let payload = input_value.into(); - let result = pipeline + let mut result = pipeline .exec_mut(payload, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .unwrap(); - assert_eq!(result.0.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.0.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.0.values[2].value_data { + let (row, _table_suffix) = result.swap_remove(0); + assert_eq!(row.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(row.values[1].value_data, Some(ValueData::U32Value(2))); + match &row.values[2].value_data { Some(ValueData::TimestampNanosecondValue(v)) => { assert_ne!(v, &0); } @@ -504,7 +599,7 @@ transform: .into_transformed() .unwrap(); - assert_eq!(schema_info.schema.len(), result.0.values.len()); + assert_eq!(schema_info.schema.len(), result[0].0.values.len()); let test = [ ( ColumnDataType::String as i32, @@ -545,7 +640,7 @@ transform: let schema = pipeline.schemas().unwrap(); for i in 0..schema.len() { let schema = &schema[i]; - let value = &result.0.values[i]; + let value = &result[0].0.values[i]; assert_eq!(schema.datatype, test[i].0); assert_eq!(value.value_data, test[i].1); } @@ -595,9 +690,15 @@ transform: .unwrap() .into_transformed() .unwrap(); - assert_eq!(result.0.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.0.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.0.values[2].value_data { + assert_eq!( + result[0].0.values[0].value_data, + Some(ValueData::U32Value(1)) + ); + assert_eq!( + result[0].0.values[1].value_data, + Some(ValueData::U32Value(2)) + ); + match &result[0].0.values[2].value_data { Some(ValueData::TimestampNanosecondValue(v)) => { assert_ne!(v, &0); } @@ -644,14 +745,14 @@ transform: let schema = pipeline.schemas().unwrap().clone(); let result = input_value.into(); - let row = pipeline + let rows_with_suffix = pipeline .exec_mut(result, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .unwrap(); let output = Rows { schema, - rows: vec![row.0], + rows: rows_with_suffix.into_iter().map(|(r, _)| r).collect(), }; let schemas = output.schema; @@ -804,4 +905,566 @@ transform: let r: Result = parse(&Content::Yaml(bad_yaml3)); assert!(r.is_err()); } + + /// Test one-to-many VRL pipeline expansion. + /// A VRL processor can return an array, which results in multiple output rows. 
+ #[test] + fn test_one_to_many_vrl_expansion() { + let pipeline_yaml = r#" +processors: + - epoch: + field: timestamp + resolution: ms + - vrl: + source: | + events = del(.events) + base_host = del(.host) + base_ts = del(.timestamp) + map_values(array!(events)) -> |event| { + { + "host": base_host, + "event_type": event.type, + "event_value": event.value, + "timestamp": base_ts + } + } + +transform: + - field: host + type: string + - field: event_type + type: string + - field: event_value + type: int32 + - field: timestamp + type: timestamp, ms + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + // Input with 3 events + let input_value: serde_json::Value = serde_json::from_str( + r#"{ + "host": "server1", + "timestamp": 1716668197217, + "events": [ + {"type": "cpu", "value": 80}, + {"type": "memory", "value": 60}, + {"type": "disk", "value": 45} + ] + }"#, + ) + .unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap() + .into_transformed() + .unwrap(); + + // Should produce 3 rows from 1 input + assert_eq!(result.len(), 3); + + // Verify each row has correct structure + for (row, _table_suffix) in &result { + assert_eq!(row.values.len(), 4); // host, event_type, event_value, timestamp + // First value should be "server1" + assert_eq!( + row.values[0].value_data, + Some(ValueData::StringValue("server1".to_string())) + ); + // Last value should be the timestamp + assert_eq!( + row.values[3].value_data, + Some(ValueData::TimestampMillisecondValue(1716668197217)) + ); + } + + // Verify event types + let event_types: Vec<_> = result + .iter() + .map(|(r, _)| match &r.values[1].value_data { + Some(ValueData::StringValue(s)) => s.clone(), + _ => panic!("expected string"), + }) + .collect(); + assert!(event_types.contains(&"cpu".to_string())); + assert!(event_types.contains(&"memory".to_string())); + assert!(event_types.contains(&"disk".to_string())); + } + + /// Test that single object output still works (backward compatibility) + #[test] + fn test_single_object_output_unchanged() { + let pipeline_yaml = r#" +processors: + - epoch: + field: ts + resolution: ms + - vrl: + source: | + .processed = true + . 
+ +transform: + - field: name + type: string + - field: processed + type: boolean + - field: ts + type: timestamp, ms + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let input_value: serde_json::Value = serde_json::from_str( + r#"{ + "name": "test", + "ts": 1716668197217 + }"#, + ) + .unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap() + .into_transformed() + .unwrap(); + + // Should produce exactly 1 row + assert_eq!(result.len(), 1); + assert_eq!( + result[0].0.values[0].value_data, + Some(ValueData::StringValue("test".to_string())) + ); + assert_eq!( + result[0].0.values[1].value_data, + Some(ValueData::BoolValue(true)) + ); + } + + /// Test that empty array produces zero rows + #[test] + fn test_empty_array_produces_zero_rows() { + let pipeline_yaml = r#" +processors: + - vrl: + source: | + .events + +transform: + - field: value + type: int32 + - field: greptime_timestamp + type: timestamp, ns + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let input_value: serde_json::Value = serde_json::from_str(r#"{"events": []}"#).unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap() + .into_transformed() + .unwrap(); + + // Empty array should produce zero rows + assert_eq!(result.len(), 0); + } + + /// Test that array elements must be objects + #[test] + fn test_array_element_must_be_object() { + let pipeline_yaml = r#" +processors: + - vrl: + source: | + .items + +transform: + - field: value + type: int32 + - field: greptime_timestamp + type: timestamp, ns + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + // Array with non-object elements should fail + let input_value: serde_json::Value = + serde_json::from_str(r#"{"items": [1, 2, 3]}"#).unwrap(); + + let payload = input_value.into(); + let result = pipeline.exec_mut(payload, &pipeline_ctx, &mut schema_info); + + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("must be an object"), + "Expected error about non-object element, got: {}", + err_msg + ); + } + + /// Test one-to-many with table suffix from VRL hint + #[test] + fn test_one_to_many_with_table_suffix_hint() { + let pipeline_yaml = r#" +processors: + - epoch: + field: ts + resolution: ms + - vrl: + source: | + .greptime_table_suffix = "_" + string!(.category) + . 
+ +transform: + - field: name + type: string + - field: category + type: string + - field: ts + type: timestamp, ms + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let input_value: serde_json::Value = serde_json::from_str( + r#"{ + "name": "test", + "category": "metrics", + "ts": 1716668197217 + }"#, + ) + .unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap() + .into_transformed() + .unwrap(); + + // Should have table suffix extracted per row + assert_eq!(result.len(), 1); + assert_eq!(result[0].1, Some("_metrics".to_string())); + } + + /// Test one-to-many with per-row table suffix + #[test] + fn test_one_to_many_per_row_table_suffix() { + let pipeline_yaml = r#" +processors: + - epoch: + field: timestamp + resolution: ms + - vrl: + source: | + events = del(.events) + base_ts = del(.timestamp) + + map_values(array!(events)) -> |event| { + suffix = "_" + string!(event.category) + { + "name": event.name, + "value": event.value, + "timestamp": base_ts, + "greptime_table_suffix": suffix + } + } + +transform: + - field: name + type: string + - field: value + type: int32 + - field: timestamp + type: timestamp, ms + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + // Input with events that should go to different tables + let input_value: serde_json::Value = serde_json::from_str( + r#"{ + "timestamp": 1716668197217, + "events": [ + {"name": "cpu_usage", "value": 80, "category": "cpu"}, + {"name": "mem_usage", "value": 60, "category": "memory"}, + {"name": "cpu_temp", "value": 45, "category": "cpu"} + ] + }"#, + ) + .unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap() + .into_transformed() + .unwrap(); + + // Should produce 3 rows + assert_eq!(result.len(), 3); + + // Collect table suffixes + let table_suffixes: Vec<_> = result.iter().map(|(_, suffix)| suffix.clone()).collect(); + + // Should have different table suffixes per row + assert!(table_suffixes.contains(&Some("_cpu".to_string()))); + assert!(table_suffixes.contains(&Some("_memory".to_string()))); + + // Count rows per table suffix + let cpu_count = table_suffixes + .iter() + .filter(|s| *s == &Some("_cpu".to_string())) + .count(); + let memory_count = table_suffixes + .iter() + .filter(|s| *s == &Some("_memory".to_string())) + .count(); + assert_eq!(cpu_count, 2); + assert_eq!(memory_count, 1); + } + + /// Test that one-to-many mapping preserves per-row ContextOpt in HashMap + #[test] + fn test_one_to_many_hashmap_contextopt_preservation() { + let pipeline_yaml = r#" +processors: + - epoch: + field: timestamp + resolution: ms + - vrl: + source: | + events = del(.events) + base_ts = del(.timestamp) + + map_values(array!(events)) -> |event| { + # Set different TTL values per event type + ttl = if event.type == "critical" { + "1h" + } else if event.type == "warning" { + "24h" + } else { + "7d" + } + + { + "host": del(.host), + "event_type": event.type, + 
"event_value": event.value, + "timestamp": base_ts, + "greptime_ttl": ttl + } + } + +transform: + - field: host + type: string + - field: event_type + type: string + - field: event_value + type: int32 + - field: timestamp + type: timestamp, ms + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + // Input with events that should have different ContextOpt values + let input_value: serde_json::Value = serde_json::from_str( + r#"{ + "host": "server1", + "timestamp": 1716668197217, + "events": [ + {"type": "critical", "value": 100}, + {"type": "warning", "value": 50}, + {"type": "info", "value": 25} + ] + }"#, + ) + .unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap(); + + // Extract the HashMap structure + let rows_by_context = result.into_transformed_hashmap().unwrap(); + + // Should have 3 different ContextOpt groups due to different TTL values + assert_eq!(rows_by_context.len(), 3); + + // Verify each ContextOpt group has exactly 1 row and different configurations + let mut context_opts = Vec::new(); + for (opt, rows) in &rows_by_context { + assert_eq!(rows.len(), 1); // Each group should have exactly 1 row + context_opts.push(opt.clone()); + } + + // ContextOpts should be different due to different TTL values + assert_ne!(context_opts[0], context_opts[1]); + assert_ne!(context_opts[1], context_opts[2]); + assert_ne!(context_opts[0], context_opts[2]); + + // Verify the rows are correctly structured + for rows in rows_by_context.values() { + for (row, _table_suffix) in rows { + assert_eq!(row.values.len(), 4); // host, event_type, event_value, timestamp + } + } + } + + /// Test that single object input still works with HashMap structure + #[test] + fn test_single_object_hashmap_compatibility() { + let pipeline_yaml = r#" +processors: + - epoch: + field: ts + resolution: ms + - vrl: + source: | + .processed = true + . 
+ +transform: + - field: name + type: string + - field: processed + type: boolean + - field: ts + type: timestamp, ms + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let input_value: serde_json::Value = serde_json::from_str( + r#"{ + "name": "test", + "ts": 1716668197217 + }"#, + ) + .unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap(); + + // Extract the HashMap structure + let rows_by_context = result.into_transformed_hashmap().unwrap(); + + // Single object should produce exactly 1 ContextOpt group + assert_eq!(rows_by_context.len(), 1); + + let (_opt, rows) = rows_by_context.into_iter().next().unwrap(); + assert_eq!(rows.len(), 1); + + // Verify the row structure + let (row, _table_suffix) = &rows[0]; + assert_eq!(row.values.len(), 3); // name, processed, timestamp + } + + /// Test that empty arrays work correctly with HashMap structure + #[test] + fn test_empty_array_hashmap() { + let pipeline_yaml = r#" +processors: + - vrl: + source: | + .events + +transform: + - field: value + type: int32 + - field: greptime_timestamp + type: timestamp, ns + index: time +"#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let input_value: serde_json::Value = serde_json::from_str(r#"{"events": []}"#).unwrap(); + + let payload = input_value.into(); + let result = pipeline + .exec_mut(payload, &pipeline_ctx, &mut schema_info) + .unwrap(); + + // Extract the HashMap structure + let rows_by_context = result.into_transformed_hashmap().unwrap(); + + // Empty array should produce empty HashMap + assert_eq!(rows_by_context.len(), 0); + } } diff --git a/src/pipeline/src/etl/ctx_req.rs b/src/pipeline/src/etl/ctx_req.rs index f8fc7c11f2..23873cfdf1 100644 --- a/src/pipeline/src/etl/ctx_req.rs +++ b/src/pipeline/src/etl/ctx_req.rs @@ -57,7 +57,7 @@ const PIPELINE_HINT_PREFIX: &str = "greptime_"; /// /// The options are set in the format of hint keys. See [`PIPELINE_HINT_KEYS`]. /// It's is used as the key in [`ContextReq`] for grouping the row insert requests. 
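// Note on the `Clone` derive added below: with one-to-many expansion, rows produced from a
// single input may carry different per-row options, so `ContextOpt` now serves as a grouping
// key that is cloned on demand (see `ContextReq::add_row`, which takes `&ContextOpt` and
// clones it only when creating a new entry).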
-#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)] pub struct ContextOpt { // table options, that need to be set in the query context before making row insert requests auto_create_table: Option, @@ -192,8 +192,15 @@ impl ContextReq { Self { req: req_map } } - pub fn add_row(&mut self, opt: ContextOpt, req: RowInsertRequest) { - self.req.entry(opt).or_default().push(req); + pub fn add_row(&mut self, opt: &ContextOpt, req: RowInsertRequest) { + match self.req.get_mut(opt) { + None => { + self.req.insert(opt.clone(), vec![req]); + } + Some(e) => { + e.push(req); + } + } } pub fn add_rows(&mut self, opt: ContextOpt, reqs: impl IntoIterator) { diff --git a/src/pipeline/src/etl/processor/vrl_processor.rs b/src/pipeline/src/etl/processor/vrl_processor.rs index e84f0b3e4c..20258a0427 100644 --- a/src/pipeline/src/etl/processor/vrl_processor.rs +++ b/src/pipeline/src/etl/processor/vrl_processor.rs @@ -15,7 +15,7 @@ use std::collections::BTreeMap; use chrono_tz::Tz; -use snafu::OptionExt; +use snafu::{OptionExt, ensure}; use vrl::compiler::runtime::Runtime; use vrl::compiler::{Program, TargetValue, compile}; use vrl::diagnostic::Formatter; @@ -53,9 +53,15 @@ impl VrlProcessor { // check if the return value is have regex let result_def = program.final_type_info().result; let kind = result_def.kind(); - if !kind.is_object() { - return VrlReturnValueSnafu.fail(); - } + // Check if the return type could possibly be an object or array. + // We use contains_* methods since VRL type inference may return + // a Kind that represents multiple possible types. + ensure!( + kind.contains_object() || kind.contains_array(), + VrlReturnValueSnafu { + result_kind: kind.clone(), + } + ); check_regex_output(kind)?; Ok(Self { source, program }) @@ -111,13 +117,7 @@ impl crate::etl::processor::Processor for VrlProcessor { } fn exec_mut(&self, val: VrlValue) -> Result { - let val = self.resolve(val)?; - - if let VrlValue::Object(_) = val { - Ok(val) - } else { - VrlRegexValueSnafu.fail() - } + self.resolve(val) } } diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 6774842ef1..85494b24dc 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -37,8 +37,8 @@ use vrl::prelude::{Bytes, VrlValueConvert}; use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu, Result, - TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu, + ArrayElementMustBeObjectSnafu, IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu, + Result, TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu, TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, ValueMustBeMapSnafu, }; use crate::etl::PipelineDocVersion; @@ -50,6 +50,9 @@ use crate::{PipelineContext, truthy, unwrap_or_continue_if_err}; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; +/// Row with potentially designated table suffix. 
+pub type RowWithTableSuffix = (Row, Option); + /// fields not in the columns will be discarded /// to prevent automatic column creation in GreptimeDB #[derive(Debug, Clone)] @@ -363,6 +366,73 @@ fn calc_ts(p_ctx: &PipelineContext, values: &VrlValue) -> Result, + row: Option>, + need_calc_ts: bool, + tablesuffix_template: Option<&crate::tablesuffix::TableSuffixTemplate>, +) -> Result>> { + let skip_error = pipeline_ctx.pipeline_param.skip_error(); + let VrlValue::Array(arr) = values else { + // Single object: extract ContextOpt and table_suffix + let mut result = std::collections::HashMap::new(); + + let mut opt = match ContextOpt::from_pipeline_map_to_opt(&mut values) { + Ok(r) => r, + Err(e) => return if skip_error { Ok(result) } else { Err(e) }, + }; + + let table_suffix = opt.resolve_table_suffix(tablesuffix_template, &values); + let row = match values_to_row(schema_info, values, pipeline_ctx, row, need_calc_ts) { + Ok(r) => r, + Err(e) => return if skip_error { Ok(result) } else { Err(e) }, + }; + result.insert(opt, vec![(row, table_suffix)]); + return Ok(result); + }; + + let mut rows_by_context: std::collections::HashMap> = + std::collections::HashMap::new(); + for (index, mut value) in arr.into_iter().enumerate() { + if !value.is_object() { + unwrap_or_continue_if_err!( + ArrayElementMustBeObjectSnafu { + index, + actual_type: value.kind_str().to_string(), + } + .fail(), + skip_error + ); + } + + // Extract ContextOpt and table_suffix for this element + let mut opt = unwrap_or_continue_if_err!( + ContextOpt::from_pipeline_map_to_opt(&mut value), + skip_error + ); + let table_suffix = opt.resolve_table_suffix(tablesuffix_template, &value); + let transformed_row = unwrap_or_continue_if_err!( + values_to_row(schema_info, value, pipeline_ctx, row.clone(), need_calc_ts), + skip_error + ); + rows_by_context + .entry(opt) + .or_default() + .push((transformed_row, table_suffix)); + } + Ok(rows_by_context) +} + /// `need_calc_ts` happens in two cases: /// 1. full greptime_identity /// 2. 
auto-transform without transformer @@ -992,4 +1062,139 @@ mod tests { assert_eq!(flattened_object, expected); } } + + use ahash::HashMap as AHashMap; + #[test] + fn test_values_to_rows_skip_error_handling() { + let table_suffix_template: Option = None; + + // Case 1: skip_error=true, mixed valid/invalid elements + { + let schema_info = &mut SchemaInfo::default(); + let input_array = vec![ + // Valid object + serde_json::json!({"name": "Alice", "age": 25}).into(), + // Invalid element (string) + VrlValue::Bytes("invalid_string".into()), + // Valid object + serde_json::json!({"name": "Bob", "age": 30}).into(), + // Invalid element (number) + VrlValue::Integer(42), + // Valid object + serde_json::json!({"name": "Charlie", "age": 35}).into(), + ]; + + let params = GreptimePipelineParams::from_map(AHashMap::from_iter([( + "skip_error".to_string(), + "true".to_string(), + )])); + + let pipeline_ctx = PipelineContext::new( + &PipelineDefinition::GreptimeIdentityPipeline(None), + ¶ms, + Channel::Unknown, + ); + + let result = values_to_rows( + schema_info, + VrlValue::Array(input_array), + &pipeline_ctx, + None, + true, + table_suffix_template.as_ref(), + ); + + // Should succeed and only process valid objects + assert!(result.is_ok()); + let rows_by_context = result.unwrap(); + // Count total rows across all ContextOpt groups + let total_rows: usize = rows_by_context.values().map(|v| v.len()).sum(); + assert_eq!(total_rows, 3); // Only 3 valid objects + } + + // Case 2: skip_error=false, invalid elements present + { + let schema_info = &mut SchemaInfo::default(); + let input_array = vec![ + serde_json::json!({"name": "Alice", "age": 25}).into(), + VrlValue::Bytes("invalid_string".into()), // This should cause error + ]; + + let params = GreptimePipelineParams::default(); // skip_error = false + + let pipeline_ctx = PipelineContext::new( + &PipelineDefinition::GreptimeIdentityPipeline(None), + ¶ms, + Channel::Unknown, + ); + + let result = values_to_rows( + schema_info, + VrlValue::Array(input_array), + &pipeline_ctx, + None, + true, + table_suffix_template.as_ref(), + ); + + // Should fail with ArrayElementMustBeObject error + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("Array element at index 1 must be an object for one-to-many transformation, got string")); + } + } + + /// Test that values_to_rows correctly groups rows by per-element ContextOpt + #[test] + fn test_values_to_rows_per_element_context_opt() { + let table_suffix_template: Option = None; + let schema_info = &mut SchemaInfo::default(); + + // Create array with elements having different TTL values (ContextOpt) + let input_array = vec![ + serde_json::json!({"name": "Alice", "greptime_ttl": "1h"}).into(), + serde_json::json!({"name": "Bob", "greptime_ttl": "1h"}).into(), + serde_json::json!({"name": "Charlie", "greptime_ttl": "24h"}).into(), + ]; + + let params = GreptimePipelineParams::default(); + let pipeline_ctx = PipelineContext::new( + &PipelineDefinition::GreptimeIdentityPipeline(None), + ¶ms, + Channel::Unknown, + ); + + let result = values_to_rows( + schema_info, + VrlValue::Array(input_array), + &pipeline_ctx, + None, + true, + table_suffix_template.as_ref(), + ); + + assert!(result.is_ok()); + let rows_by_context = result.unwrap(); + + // Should have 2 different ContextOpt groups (1h TTL and 24h TTL) + assert_eq!(rows_by_context.len(), 2); + + // Count rows per group + let total_rows: usize = rows_by_context.values().map(|v| v.len()).sum(); + assert_eq!(total_rows, 
3); + + // Verify that rows are correctly grouped by TTL + let mut ttl_1h_count = 0; + let mut ttl_24h_count = 0; + for rows in rows_by_context.values() { + // ContextOpt doesn't expose ttl directly, but we can count by group size + if rows.len() == 2 { + ttl_1h_count = rows.len(); + } else if rows.len() == 1 { + ttl_24h_count = rows.len(); + } + } + assert_eq!(ttl_1h_count, 2); // Alice and Bob with 1h TTL + assert_eq!(ttl_24h_count, 1); // Charlie with 24h TTL + } } diff --git a/src/pipeline/src/manager/table.rs b/src/pipeline/src/manager/table.rs index ad9a8c4ac5..5369478e9a 100644 --- a/src/pipeline/src/manager/table.rs +++ b/src/pipeline/src/manager/table.rs @@ -19,6 +19,8 @@ use api::v1::{ ColumnDataType, ColumnDef, ColumnSchema as PbColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows, SemanticType, }; +use arrow::array::{Array, AsArray}; +use arrow::datatypes::TimestampNanosecondType; use common_query::OutputData; use common_recordbatch::util as record_util; use common_telemetry::{debug, info}; @@ -27,14 +29,11 @@ use datafusion::datasource::DefaultTableSource; use datafusion::logical_expr::col; use datafusion_common::TableReference; use datafusion_expr::{DmlStatement, LogicalPlan}; -use datatypes::prelude::ScalarVector; use datatypes::timestamp::TimestampNanosecond; -use datatypes::vectors::{StringVector, TimestampNanosecondVector, Vector}; use itertools::Itertools; use operator::insert::InserterRef; use operator::statement::StatementExecutorRef; use query::QueryEngineRef; -use query::dataframe::DataFrame; use session::context::{QueryContextBuilder, QueryContextRef}; use snafu::{OptionExt, ResultExt, ensure}; use table::TableRef; @@ -413,7 +412,6 @@ impl PipelineTable { .query_engine .read_table(self.table.clone()) .context(DataFrameSnafu)?; - let DataFrame::DataFusion(dataframe) = dataframe; let dataframe = dataframe .filter(prepare_dataframe_conditions(name, version)) @@ -474,7 +472,6 @@ impl PipelineTable { .query_engine .read_table(self.table.clone()) .context(DataFrameSnafu)?; - let DataFrame::DataFusion(dataframe) = dataframe; // select all pipelines with name and version let dataframe = dataframe @@ -527,8 +524,7 @@ impl PipelineTable { for r in records { let pipeline_content_column = r.column(0); let pipeline_content = pipeline_content_column - .as_any() - .downcast_ref::() + .as_string_opt::() .with_context(|| CastTypeSnafu { msg: format!( "can't downcast {:?} array into string vector", @@ -537,20 +533,19 @@ impl PipelineTable { })?; let pipeline_schema_column = r.column(1); - let pipeline_schema = pipeline_schema_column - .as_any() - .downcast_ref::() - .with_context(|| CastTypeSnafu { - msg: format!( - "can't downcast {:?} array into string vector", - pipeline_schema_column.data_type() - ), - })?; + let pipeline_schema = + pipeline_schema_column + .as_string_opt::() + .with_context(|| CastTypeSnafu { + msg: format!( + "expecting pipeline schema column of type string, actual: {}", + pipeline_schema_column.data_type() + ), + })?; let pipeline_created_at_column = r.column(2); let pipeline_created_at = pipeline_created_at_column - .as_any() - .downcast_ref::() + .as_primitive_opt::() .with_context(|| CastTypeSnafu { msg: format!( "can't downcast {:?} array into scalar vector", @@ -572,9 +567,9 @@ impl PipelineTable { let len = pipeline_content.len(); for i in 0..len { re.push(( - pipeline_content.get_data(i).unwrap().to_string(), - pipeline_schema.get_data(i).unwrap().to_string(), - pipeline_created_at.get_data(i).unwrap(), + pipeline_content.value(i).to_string(), + 
pipeline_schema.value(i).to_string(), + TimestampNanosecond::new(pipeline_created_at.value(i)), )); } } diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index 09ea340235..b102bede02 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -35,21 +35,25 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { match input_value { VrlValue::Array(array) => { for value in array { - let row = pipeline + let rows_with_suffix = pipeline .exec_mut(value, &pipeline_ctx, &mut schema_info) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); - rows.push(row.0); + for (r, _) in rows_with_suffix { + rows.push(r); + } } } VrlValue::Object(_) => { - let row = pipeline + let rows_with_suffix = pipeline .exec_mut(input_value, &pipeline_ctx, &mut schema_info) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); - rows.push(row.0); + for (r, _) in rows_with_suffix { + rows.push(r); + } } _ => { panic!("invalid input value"); diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index ca94dbe3f0..300b7431b6 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -427,7 +427,7 @@ transform: ); let stats = input_value.into(); - let row = pipeline + let rows_with_suffix = pipeline .exec_mut(stats, &pipeline_ctx, &mut schema_info) .expect("failed to exec pipeline") .into_transformed() @@ -435,7 +435,7 @@ transform: let output = Rows { schema: pipeline.schemas().unwrap().clone(), - rows: vec![row.0], + rows: rows_with_suffix.into_iter().map(|(r, _)| r).collect(), }; assert_eq!(output.rows.len(), 1); @@ -501,13 +501,13 @@ transform: ); let status = input_value.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); + let (row, _) = rows_with_suffix.swap_remove(0); let r = row - .0 .values .into_iter() .map(|v| v.value_data.unwrap()) @@ -616,15 +616,16 @@ transform: ); let status = input_value.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); + let (row, _) = rows_with_suffix.swap_remove(0); let r = row - .0 .values + .clone() .into_iter() .map(|v| v.value_data.unwrap()) .collect::>(); @@ -688,13 +689,13 @@ transform: ); let status = input_value.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); + let (row, _) = rows_with_suffix.swap_remove(0); let r = row - .0 .values .into_iter() .map(|v| v.value_data.unwrap()) @@ -734,14 +735,14 @@ transform: ); let status = input_value.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); + let (row, _) = rows_with_suffix.swap_remove(0); let r = row - .0 .values .into_iter() .map(|v| v.value_data.unwrap()) @@ -799,14 +800,14 @@ transform: ); let status = input_value.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); + let (row, _) = rows_with_suffix.swap_remove(0); let mut r = row - .0 .values .into_iter() .map(|v| 
v.value_data.unwrap()) @@ -846,13 +847,14 @@ transform: ); let status = input_value.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); - row.0.values.into_iter().for_each(|v| { + let (row, _) = rows_with_suffix.swap_remove(0); + row.values.into_iter().for_each(|v| { if let ValueData::TimestampNanosecondValue(v) = v.value_data.unwrap() { let now = chrono::Utc::now().timestamp_nanos_opt().unwrap(); assert!(now - v < 5_000_000); @@ -923,13 +925,13 @@ transform: assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); let status = input_value2.into(); - let row = pipeline + let mut rows_with_suffix = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() .into_transformed() .expect("expect transformed result "); + let (row, _) = rows_with_suffix.swap_remove(0); let r = row - .0 .values .into_iter() .map(|v| v.value_data.unwrap()) @@ -988,8 +990,8 @@ table_suffix: _${logger} .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap(); - let (row, table_name) = exec_re.into_transformed().unwrap(); - let values = row.values; + let mut rows_with_suffix = exec_re.into_transformed().unwrap(); + let (row, table_suffix) = rows_with_suffix.swap_remove(0); let expected_values = vec![ Value { value_data: Some(ValueData::StringValue("hello world".into())), @@ -998,6 +1000,234 @@ table_suffix: _${logger} value_data: Some(ValueData::TimestampNanosecondValue(1716668197217000000)), }, ]; - assert_eq!(expected_values, values); - assert_eq!(table_name, Some("_http".to_string())); + assert_eq!(expected_values, row.values); + assert_eq!(table_suffix, Some("_http".to_string())); +} + +/// Test one-to-many pipeline expansion using VRL processor that returns an array +#[test] +fn test_one_to_many_pipeline() { + // Input: single log entry with a list of events + let input_value = serde_json::json!({ + "request_id": "req-123", + "events": [ + {"type": "click", "value": 100}, + {"type": "scroll", "value": 200}, + {"type": "submit", "value": 300} + ] + }); + + // VRL processor that expands events into separate rows using map + let pipeline_yaml = r#" +processors: + - vrl: + source: | + events = del(.events) + request_id = del(.request_id) + map_values(array!(events)) -> |event| { + { + "request_id": request_id, + "event_type": event.type, + "event_value": event.value + } + } + +transform: + - field: request_id + type: string + - field: event_type + type: string + - field: event_value + type: uint64 +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let status = input_value.into(); + let rows_with_suffix = pipeline + .exec_mut(status, &pipeline_ctx, &mut schema_info) + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result"); + + // Should produce 3 rows from the single input + assert_eq!(rows_with_suffix.len(), 3); + + // Row 0: click event + assert_eq!( + rows_with_suffix[0].0.values[0].value_data, + Some(StringValue("req-123".into())) + ); + assert_eq!( + rows_with_suffix[0].0.values[1].value_data, + Some(StringValue("click".into())) + ); + assert_eq!( + rows_with_suffix[0].0.values[2].value_data, + 
Some(U64Value(100)) + ); + + // Row 1: scroll event + assert_eq!( + rows_with_suffix[1].0.values[0].value_data, + Some(StringValue("req-123".into())) + ); + assert_eq!( + rows_with_suffix[1].0.values[1].value_data, + Some(StringValue("scroll".into())) + ); + assert_eq!( + rows_with_suffix[1].0.values[2].value_data, + Some(U64Value(200)) + ); + + // Row 2: submit event + assert_eq!( + rows_with_suffix[2].0.values[0].value_data, + Some(StringValue("req-123".into())) + ); + assert_eq!( + rows_with_suffix[2].0.values[1].value_data, + Some(StringValue("submit".into())) + ); + assert_eq!( + rows_with_suffix[2].0.values[2].value_data, + Some(U64Value(300)) + ); +} + +/// Test that single object input still works correctly (backward compatibility) +#[test] +fn test_one_to_many_single_object_unchanged() { + let input_value = serde_json::json!({ + "name": "Alice", + "age": 30 + }); + + let pipeline_yaml = r#" +processors: + - vrl: + source: | + .processed = true + . + +transform: + - field: name + type: string + - field: age + type: uint32 + - field: processed + type: boolean +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let status = input_value.into(); + let rows_with_suffix = pipeline + .exec_mut(status, &pipeline_ctx, &mut schema_info) + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result"); + + // Should produce exactly 1 row + assert_eq!(rows_with_suffix.len(), 1); + + let (row, _) = &rows_with_suffix[0]; + assert_eq!(row.values[0].value_data, Some(StringValue("Alice".into()))); + assert_eq!(row.values[1].value_data, Some(U32Value(30))); + assert_eq!(row.values[2].value_data, Some(BoolValue(true))); +} + +/// Test error handling when array contains non-object elements +#[test] +fn test_one_to_many_array_element_validation() { + let input_value = serde_json::json!({ + "items": ["string", 123, true] + }); + + // VRL that returns an array with non-object elements + let pipeline_yaml = r#" +processors: + - vrl: + source: | + .items + +transform: + - field: value + type: string +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); + let (pipeline, mut schema_info, pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let status = input_value.into(); + let result = pipeline.exec_mut(status, &pipeline_ctx, &mut schema_info); + + // Should fail because array elements are not objects + assert!(result.is_err()); + let err = result.unwrap_err(); + let err_msg = err.to_string(); + assert!( + err_msg.contains("must be an object"), + "Expected 'must be an object' error, got: {}", + err_msg + ); +} + +/// Test that empty array produces zero rows +#[test] +fn test_one_to_many_empty_array() { + let input_value = serde_json::json!({ + "events": [] + }); + + let pipeline_yaml = r#" +processors: + - vrl: + source: | + .events + +transform: + - field: value + type: string +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); + let (pipeline, mut schema_info, 
pipeline_def, pipeline_param) = setup_pipeline!(pipeline); + let pipeline_ctx = PipelineContext::new( + &pipeline_def, + &pipeline_param, + session::context::Channel::Unknown, + ); + + let status = input_value.into(); + let rows_with_suffix = pipeline + .exec_mut(status, &pipeline_ctx, &mut schema_info) + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result"); + + // Empty array should produce zero rows + assert_eq!(rows_with_suffix.len(), 0); } diff --git a/src/plugins/Cargo.toml b/src/plugins/Cargo.toml index 14df62c4fa..658e1c95e3 100644 --- a/src/plugins/Cargo.toml +++ b/src/plugins/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] auth.workspace = true +catalog.workspace = true clap.workspace = true cli.workspace = true common-base.workspace = true @@ -17,6 +18,7 @@ common-meta.workspace = true datanode.workspace = true flow.workspace = true frontend.workspace = true +meta-client.workspace = true meta-srv.workspace = true serde.workspace = true snafu.workspace = true diff --git a/src/plugins/src/flownode.rs b/src/plugins/src/flownode.rs index 6b56b008da..9fbb018030 100644 --- a/src/plugins/src/flownode.rs +++ b/src/plugins/src/flownode.rs @@ -30,3 +30,20 @@ pub async fn setup_flownode_plugins( pub async fn start_flownode_plugins(_plugins: Plugins) -> Result<()> { Ok(()) } + +pub mod context { + use std::sync::Arc; + + use catalog::CatalogManagerRef; + use common_meta::FlownodeId; + use common_meta::kv_backend::KvBackendRef; + use flow::FrontendClient; + + /// The context for `GrpcBuilderConfiguratorRef` in flownode. + pub struct GrpcConfigureContext { + pub kv_backend: KvBackendRef, + pub fe_client: Arc, + pub flownode_id: FlownodeId, + pub catalog_manager: CatalogManagerRef, + } +} diff --git a/src/plugins/src/frontend.rs b/src/plugins/src/frontend.rs index 85049d8f80..0d1c1af7b9 100644 --- a/src/plugins/src/frontend.rs +++ b/src/plugins/src/frontend.rs @@ -40,3 +40,25 @@ pub async fn setup_frontend_plugins( pub async fn start_frontend_plugins(_plugins: Plugins) -> Result<()> { Ok(()) } + +pub mod context { + use std::sync::Arc; + + use flow::FrontendClient; + use meta_client::MetaClientRef; + + /// The context for [`catalog::kvbackend::CatalogManagerConfiguratorRef`] in standalone or + /// distributed. + pub enum CatalogManagerConfigureContext { + Distributed(DistributedCatalogManagerConfigureContext), + Standalone(StandaloneCatalogManagerConfigureContext), + } + + pub struct DistributedCatalogManagerConfigureContext { + pub meta_client: MetaClientRef, + } + + pub struct StandaloneCatalogManagerConfigureContext { + pub fe_client: Arc, + } +} diff --git a/src/plugins/src/lib.rs b/src/plugins/src/lib.rs index 9a979a23a1..c973cb3131 100644 --- a/src/plugins/src/lib.rs +++ b/src/plugins/src/lib.rs @@ -13,12 +13,12 @@ // limitations under the License. 
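With the plugin modules made public just below, external plugin code can construct the new configure contexts directly. A sketch (editorial; the crate path and caller are assumptions inferred from this diff):

// Assumed caller code outside the `plugins` crate.
use meta_client::MetaClientRef;
use plugins::frontend::context::{
    CatalogManagerConfigureContext, DistributedCatalogManagerConfigureContext,
};

fn distributed_catalog_ctx(meta_client: MetaClientRef) -> CatalogManagerConfigureContext {
    CatalogManagerConfigureContext::Distributed(DistributedCatalogManagerConfigureContext {
        meta_client,
    })
}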
mod cli; -mod datanode; -mod flownode; -mod frontend; +pub mod datanode; +pub mod flownode; +pub mod frontend; mod meta_srv; mod options; -mod standalone; +pub mod standalone; pub use cli::SubCommand; pub use datanode::{setup_datanode_plugins, start_datanode_plugins}; diff --git a/src/plugins/src/standalone.rs b/src/plugins/src/standalone.rs index 97b1c22aa7..0cb7ee60e5 100644 --- a/src/plugins/src/standalone.rs +++ b/src/plugins/src/standalone.rs @@ -33,3 +33,18 @@ pub async fn setup_standalone_plugins( pub async fn start_standalone_plugins(_plugins: Plugins) -> Result<()> { Ok(()) } + +pub mod context { + use std::sync::Arc; + + use catalog::CatalogManagerRef; + use common_meta::kv_backend::KvBackendRef; + use flow::FrontendClient; + + /// The context for [`common_meta::ddl_manager::DdlManagerConfiguratorRef`] in standalone. + pub struct DdlManagerConfigureContext { + pub kv_backend: KvBackendRef, + pub fe_client: Arc, + pub catalog_manager: CatalogManagerRef, + } +} diff --git a/src/promql/src/extension_plan/histogram_fold.rs b/src/promql/src/extension_plan/histogram_fold.rs index e80d4a7676..f4637e36f0 100644 --- a/src/promql/src/extension_plan/histogram_fold.rs +++ b/src/promql/src/extension_plan/histogram_fold.rs @@ -13,15 +13,15 @@ // limitations under the License. use std::any::Any; +use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::task::Poll; use std::time::Instant; -use common_recordbatch::RecordBatch as GtRecordBatch; use common_telemetry::warn; -use datafusion::arrow::array::AsArray; -use datafusion::arrow::compute::{self, SortOptions, concat_batches}; +use datafusion::arrow::array::{Array, AsArray, StringArray}; +use datafusion::arrow::compute::{SortOptions, concat_batches}; use datafusion::arrow::datatypes::{DataType, Float64Type, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::common::stats::Precision; @@ -36,14 +36,13 @@ use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::expressions::{CastExpr as PhyCast, Column as PhyColumn}; use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, PhysicalExpr, - PlanProperties, RecordBatchStream, SendableRecordBatchStream, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, + Partitioning, PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, }; use datafusion::prelude::{Column, Expr}; use datatypes::prelude::{ConcreteDataType, DataType as GtDataType}; -use datatypes::schema::Schema as GtSchema; -use datatypes::value::{OrderedF64, ValueRef}; -use datatypes::vectors::MutableVector; +use datatypes::value::{OrderedF64, Value, ValueRef}; +use datatypes::vectors::{Helper, MutableVector, VectorRef}; use futures::{Stream, StreamExt, ready}; /// `HistogramFold` will fold the conventional (non-native) histogram ([1]) for later @@ -181,10 +180,33 @@ impl HistogramFold { .index_of_column_by_name(None, &self.ts_column) .unwrap(); + let tag_columns = exec_input + .schema() + .fields() + .iter() + .enumerate() + .filter_map(|(idx, field)| { + if idx == le_column_index || idx == field_column_index || idx == ts_column_index { + None + } else { + Some(Arc::new(PhyColumn::new(field.name(), idx)) as _) + } + }) + .collect::>(); + + let mut partition_exprs = tag_columns.clone(); + partition_exprs.push(Arc::new(PhyColumn::new( + 
self.input.schema().field(ts_column_index).name(), + ts_column_index, + )) as _); + let output_schema: SchemaRef = self.output_schema.inner().clone(); let properties = PlanProperties::new( EquivalenceProperties::new(output_schema.clone()), - Partitioning::UnknownPartitioning(1), + Partitioning::Hash( + partition_exprs.clone(), + exec_input.output_partitioning().partition_count(), + ), EmissionType::Incremental, Boundedness::Bounded, ); @@ -193,6 +215,8 @@ impl HistogramFold { field_column_index, ts_column_index, input: exec_input, + tag_columns, + partition_exprs, quantile: self.quantile.into(), output_schema, metric: ExecutionPlanMetricsSet::new(), @@ -254,6 +278,9 @@ pub struct HistogramFoldExec { /// Index for field column in the schema of input. field_column_index: usize, ts_column_index: usize, + /// Tag columns are all columns except `le`, `field` and `ts` columns. + tag_columns: Vec>, + partition_exprs: Vec>, quantile: f64, metric: ExecutionPlanMetricsSet, properties: PlanProperties, @@ -270,10 +297,10 @@ impl ExecutionPlan for HistogramFoldExec { fn required_input_ordering(&self) -> Vec> { let mut cols = self - .tag_col_exprs() - .into_iter() + .tag_columns + .iter() .map(|expr| PhysicalSortRequirement { - expr, + expr: expr.clone(), options: None, }) .collect::>(); @@ -308,7 +335,7 @@ impl ExecutionPlan for HistogramFoldExec { } fn required_input_distribution(&self) -> Vec { - self.input.required_input_distribution() + vec![Distribution::HashPartitioned(self.partition_exprs.clone())] } fn maintains_input_order(&self) -> Vec { @@ -325,15 +352,27 @@ impl ExecutionPlan for HistogramFoldExec { children: Vec>, ) -> DataFusionResult> { assert!(!children.is_empty()); + let new_input = children[0].clone(); + let properties = PlanProperties::new( + EquivalenceProperties::new(self.output_schema.clone()), + Partitioning::Hash( + self.partition_exprs.clone(), + new_input.output_partitioning().partition_count(), + ), + EmissionType::Incremental, + Boundedness::Bounded, + ); Ok(Arc::new(Self { - input: children[0].clone(), + input: new_input, metric: self.metric.clone(), le_column_index: self.le_column_index, ts_column_index: self.ts_column_index, + tag_columns: self.tag_columns.clone(), + partition_exprs: self.partition_exprs.clone(), quantile: self.quantile, output_schema: self.output_schema.clone(), field_column_index: self.field_column_index, - properties: self.properties.clone(), + properties, })) } @@ -360,6 +399,9 @@ impl ExecutionPlan for HistogramFoldExec { input_buffer: vec![], input, output_schema, + input_schema: self.input.schema(), + mode: FoldMode::Optimistic, + safe_group: None, metric: baseline_metric, batch_size, input_buffered_rows: 0, @@ -392,30 +434,6 @@ impl ExecutionPlan for HistogramFoldExec { } } -impl HistogramFoldExec { - /// Return all the [PhysicalExpr] of tag columns in order. - /// - /// Tag columns are all columns except `le`, `field` and `ts` columns. 
- pub fn tag_col_exprs(&self) -> Vec> { - self.input - .schema() - .fields() - .iter() - .enumerate() - .filter_map(|(idx, field)| { - if idx == self.le_column_index - || idx == self.field_column_index - || idx == self.ts_column_index - { - None - } else { - Some(Arc::new(PhyColumn::new(field.name(), idx)) as _) - } - }) - .collect() - } -} - impl DisplayAs for HistogramFoldExec { fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { match t { @@ -432,6 +450,12 @@ impl DisplayAs for HistogramFoldExec { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum FoldMode { + Optimistic, + Safe, +} + pub struct HistogramFoldStream { // internal states le_column_index: usize, @@ -443,6 +467,9 @@ pub struct HistogramFoldStream { /// Expected output batch size batch_size: usize, output_schema: SchemaRef, + input_schema: SchemaRef, + mode: FoldMode, + safe_group: Option, // buffers input_buffer: Vec, @@ -455,6 +482,13 @@ pub struct HistogramFoldStream { metric: BaselineMetrics, } +#[derive(Debug, Default)] +struct SafeGroup { + tag_values: Vec, + buckets: Vec, + counters: Vec, +} + impl RecordBatchStream for HistogramFoldStream { fn schema(&self) -> SchemaRef { self.output_schema.clone() @@ -480,7 +514,10 @@ impl Stream for HistogramFoldStream { self.metric.elapsed_compute().add_elapsed(timer); break Poll::Ready(Some(result)); } - None => break Poll::Ready(self.take_output_buf()?.map(Ok)), + None => { + self.flush_remaining()?; + break Poll::Ready(self.take_output_buf()?.map(Ok)); + } } }; self.metric.record_poll(poll) @@ -493,22 +530,28 @@ impl HistogramFoldStream { &mut self, input: RecordBatch, ) -> DataFusionResult>> { - let Some(bucket_num) = self.calculate_bucket_num(&input)? else { - return Ok(None); - }; + match self.mode { + FoldMode::Safe => { + self.push_input_buf(input); + self.process_safe_mode_buffer()?; + } + FoldMode::Optimistic => { + self.push_input_buf(input); + let Some(bucket_num) = self.calculate_bucket_num_from_buffer()? else { + return Ok(None); + }; + self.bucket_size = Some(bucket_num); - if self.input_buffered_rows + input.num_rows() < bucket_num { - // not enough rows to fold - self.push_input_buf(input); - return Ok(None); + if self.input_buffered_rows < bucket_num { + // not enough rows to fold + return Ok(None); + } + + self.fold_buf(bucket_num)?; + } } - self.fold_buf(bucket_num, input)?; - if self.output_buffered_rows >= self.batch_size { - return Ok(self.take_output_buf()?.map(Ok)); - } - - Ok(None) + self.maybe_take_output() } /// Generate a group of empty [MutableVector]s from the output schema. @@ -534,62 +577,100 @@ impl HistogramFoldStream { Ok(builders) } - fn calculate_bucket_num(&mut self, batch: &RecordBatch) -> DataFusionResult> { + /// Determines bucket count using buffered batches, concatenating them to + /// detect the first complete bucket that may span batch boundaries. + fn calculate_bucket_num_from_buffer(&mut self) -> DataFusionResult> { if let Some(size) = self.bucket_size { return Ok(Some(size)); } - let inf_pos = self.find_positive_inf(batch)?; - if inf_pos == batch.num_rows() { - // no positive inf found, append to buffer and wait for next batch - self.push_input_buf(batch.clone()); + if self.input_buffer.is_empty() { return Ok(None); } - // else we found the positive inf. 
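// Editorial note (illustrative values, not from this diff): within one tag group the
// `le` strings arrive sorted, e.g. ["0.1", "1", "5", "+Inf"], so the first row whose
// `le` parses to +Inf gives bucket_num = 4. A group that never reaches "+Inf"
// (e.g. ["0.1", "1.0"]) makes the stream switch to FoldMode::Safe and fold the
// buffered rows group by group instead of assuming a fixed bucket count.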
- // calculate the bucket size - let bucket_size = inf_pos + self.input_buffered_rows + 1; - Ok(Some(bucket_size)) + let batch_refs: Vec<&RecordBatch> = self.input_buffer.iter().collect(); + let batch = concat_batches(&self.input_schema, batch_refs)?; + self.find_first_complete_bucket(&batch) + } + + fn find_first_complete_bucket(&self, batch: &RecordBatch) -> DataFusionResult> { + if batch.num_rows() == 0 { + return Ok(None); + } + + let vectors = Helper::try_into_vectors(batch.columns()) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + let le_array = batch.column(self.le_column_index).as_string::(); + + let mut tag_values_buf = Vec::with_capacity(self.normal_indices.len()); + self.collect_tag_values(&vectors, 0, &mut tag_values_buf); + let mut group_start = 0usize; + + for row in 0..batch.num_rows() { + if !self.is_same_group(&vectors, row, &tag_values_buf) { + // new group begins + self.collect_tag_values(&vectors, row, &mut tag_values_buf); + group_start = row; + } + + if Self::is_positive_infinity(le_array, row) { + return Ok(Some(row - group_start + 1)); + } + } + + Ok(None) } /// Fold record batches from input buffer and put to output buffer - fn fold_buf(&mut self, bucket_num: usize, input: RecordBatch) -> DataFusionResult<()> { - self.push_input_buf(input); - // TODO(ruihang): this concat is avoidable. - let batch = concat_batches(&self.input.schema(), self.input_buffer.drain(..).as_ref())?; + fn fold_buf(&mut self, bucket_num: usize) -> DataFusionResult<()> { + let batch = concat_batches(&self.input_schema, self.input_buffer.drain(..).as_ref())?; let mut remaining_rows = self.input_buffered_rows; let mut cursor = 0; - let gt_schema = GtSchema::try_from(self.input.schema()).unwrap(); - let batch = GtRecordBatch::try_from_df_record_batch(Arc::new(gt_schema), batch).unwrap(); + // TODO(LFC): Try to get rid of the Arrow array to vector conversion here. 
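// Editorial note on the optimistic path below: tag values are sampled at the group
// cursor, the next `bucket_num` rows are validated to share those tags and to end
// with a "+Inf" bucket, and the `le`/counter pairs are folded into a single output
// row; on any mismatch the remaining slice is handed over to safe mode.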
+ let vectors = Helper::try_into_vectors(batch.columns()) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + let le_array = batch.column(self.le_column_index); + let le_array = le_array.as_string::(); + let field_array = batch.column(self.field_column_index); + let field_array = field_array.as_primitive::(); + let mut tag_values_buf = Vec::with_capacity(self.normal_indices.len()); + + while remaining_rows >= bucket_num && self.mode == FoldMode::Optimistic { + self.collect_tag_values(&vectors, cursor, &mut tag_values_buf); + if !self.validate_optimistic_group( + &vectors, + le_array, + cursor, + bucket_num, + &tag_values_buf, + ) { + let remaining_input_batch = batch.slice(cursor, remaining_rows); + self.switch_to_safe_mode(remaining_input_batch)?; + return Ok(()); + } - while remaining_rows >= bucket_num { // "sample" normal columns - for normal_index in &self.normal_indices { - let val = batch.column(*normal_index).get(cursor); - self.output_buffer[*normal_index].push_value_ref(&val.as_value_ref()); + for (idx, value) in self.normal_indices.iter().zip(tag_values_buf.iter()) { + self.output_buffer[*idx].push_value_ref(value); } // "fold" `le` and field columns - let le_array = batch.column(self.le_column_index); - let field_array = batch.column(self.field_column_index); - let mut bucket = vec![]; - let mut counters = vec![]; + let mut bucket = Vec::with_capacity(bucket_num); + let mut counters = Vec::with_capacity(bucket_num); for bias in 0..bucket_num { - let le_str_val = le_array.get(cursor + bias); - let le_str_val_ref = le_str_val.as_value_ref(); - let le_str = le_str_val_ref - .try_into_string() - .unwrap() - .expect("le column should not be nullable"); - let le = le_str.parse::().unwrap(); + let position = cursor + bias; + let le = if le_array.is_valid(position) { + le_array.value(position).parse::().unwrap_or(f64::NAN) + } else { + f64::NAN + }; bucket.push(le); - let counter = field_array - .get(cursor + bias) - .as_value_ref() - .try_into_f64() - .unwrap() - .expect("field column should not be nullable"); + let counter = if field_array.is_valid(position) { + field_array.value(position) + } else { + f64::NAN + }; counters.push(counter); } // ignore invalid data @@ -600,9 +681,11 @@ impl HistogramFoldStream { self.output_buffered_rows += 1; } - let remaining_input_batch = batch.into_df_record_batch().slice(cursor, remaining_rows); + let remaining_input_batch = batch.slice(cursor, remaining_rows); self.input_buffered_rows = remaining_input_batch.num_rows(); - self.input_buffer.push(remaining_input_batch); + if self.input_buffered_rows > 0 { + self.input_buffer.push(remaining_input_batch); + } Ok(()) } @@ -612,6 +695,170 @@ impl HistogramFoldStream { self.input_buffer.push(batch); } + fn maybe_take_output(&mut self) -> DataFusionResult>> { + if self.output_buffered_rows >= self.batch_size { + return Ok(self.take_output_buf()?.map(Ok)); + } + Ok(None) + } + + fn switch_to_safe_mode(&mut self, remaining_batch: RecordBatch) -> DataFusionResult<()> { + self.mode = FoldMode::Safe; + self.bucket_size = None; + self.input_buffer.clear(); + self.input_buffered_rows = remaining_batch.num_rows(); + + if self.input_buffered_rows > 0 { + self.input_buffer.push(remaining_batch); + self.process_safe_mode_buffer()?; + } + + Ok(()) + } + + fn collect_tag_values<'a>( + &self, + vectors: &'a [VectorRef], + row: usize, + tag_values: &mut Vec>, + ) { + tag_values.clear(); + for idx in self.normal_indices.iter() { + tag_values.push(vectors[*idx].get_ref(row)); + } + } + + fn 
validate_optimistic_group( + &self, + vectors: &[VectorRef], + le_array: &StringArray, + cursor: usize, + bucket_num: usize, + tag_values: &[ValueRef<'_>], + ) -> bool { + let inf_index = cursor + bucket_num - 1; + if !Self::is_positive_infinity(le_array, inf_index) { + return false; + } + + for offset in 1..bucket_num { + let row = cursor + offset; + for (idx, expected) in self.normal_indices.iter().zip(tag_values.iter()) { + if vectors[*idx].get_ref(row) != *expected { + return false; + } + } + } + true + } + + /// Checks whether a row belongs to the current group (same series). + fn is_same_group( + &self, + vectors: &[VectorRef], + row: usize, + tag_values: &[ValueRef<'_>], + ) -> bool { + self.normal_indices + .iter() + .zip(tag_values.iter()) + .all(|(idx, expected)| vectors[*idx].get_ref(row) == *expected) + } + + fn push_output_row(&mut self, tag_values: &[ValueRef<'_>], result: f64) { + debug_assert_eq!(self.normal_indices.len(), tag_values.len()); + for (idx, value) in self.normal_indices.iter().zip(tag_values.iter()) { + self.output_buffer[*idx].push_value_ref(value); + } + self.output_buffer[self.field_column_index].push_value_ref(&ValueRef::from(result)); + self.output_buffered_rows += 1; + } + + fn finalize_safe_group(&mut self) -> DataFusionResult<()> { + if let Some(group) = self.safe_group.take() { + if group.tag_values.is_empty() { + return Ok(()); + } + + let has_inf = group + .buckets + .last() + .map(|v| v.is_infinite() && v.is_sign_positive()) + .unwrap_or(false); + let result = if group.buckets.len() < 2 || !has_inf { + f64::NAN + } else { + Self::evaluate_row(self.quantile, &group.buckets, &group.counters) + .unwrap_or(f64::NAN) + }; + let mut tag_value_refs = Vec::with_capacity(group.tag_values.len()); + tag_value_refs.extend(group.tag_values.iter().map(|v| v.as_value_ref())); + self.push_output_row(&tag_value_refs, result); + } + Ok(()) + } + + fn process_safe_mode_buffer(&mut self) -> DataFusionResult<()> { + if self.input_buffer.is_empty() { + self.input_buffered_rows = 0; + return Ok(()); + } + + let batch = concat_batches(&self.input_schema, self.input_buffer.drain(..).as_ref())?; + self.input_buffered_rows = 0; + let vectors = Helper::try_into_vectors(batch.columns()) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + let le_array = batch.column(self.le_column_index).as_string::(); + let field_array = batch + .column(self.field_column_index) + .as_primitive::(); + let mut tag_values_buf = Vec::with_capacity(self.normal_indices.len()); + + for row in 0..batch.num_rows() { + self.collect_tag_values(&vectors, row, &mut tag_values_buf); + let should_start_new_group = self + .safe_group + .as_ref() + .is_none_or(|group| !Self::tag_values_equal(&group.tag_values, &tag_values_buf)); + if should_start_new_group { + self.finalize_safe_group()?; + self.safe_group = Some(SafeGroup { + tag_values: tag_values_buf.iter().cloned().map(Value::from).collect(), + buckets: Vec::new(), + counters: Vec::new(), + }); + } + + let Some(group) = self.safe_group.as_mut() else { + continue; + }; + + let bucket = if le_array.is_valid(row) { + le_array.value(row).parse::().unwrap_or(f64::NAN) + } else { + f64::NAN + }; + let counter = if field_array.is_valid(row) { + field_array.value(row) + } else { + f64::NAN + }; + + group.buckets.push(bucket); + group.counters.push(counter); + } + + Ok(()) + } + + fn tag_values_equal(group_values: &[Value], current: &[ValueRef<'_>]) -> bool { + group_values.len() == current.len() + && group_values + .iter() + .zip(current.iter()) + 
.all(|(group, now)| group.as_value_ref() == *now) + } + /// Compute result from output buffer fn take_output_buf(&mut self) -> DataFusionResult> { if self.output_buffered_rows == 0 { @@ -639,41 +886,31 @@ impl HistogramFoldStream { .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) } - /// Find the first `+Inf` which indicates the end of the bucket group - /// - /// If the return value equals to batch's num_rows means the it's not found - /// in this batch - fn find_positive_inf(&self, batch: &RecordBatch) -> DataFusionResult { - // fuse this function. It should not be called when the - // bucket size is already know. - if let Some(bucket_size) = self.bucket_size { - return Ok(bucket_size); - } - let string_le_array = batch.column(self.le_column_index); - let float_le_array = compute::cast(&string_le_array, &DataType::Float64).map_err(|e| { - DataFusionError::Execution(format!( - "cannot cast {} array to float64 array: {:?}", - string_le_array.data_type(), - e - )) - })?; - let le_as_f64_array = float_le_array - .as_primitive_opt::() - .ok_or_else(|| { - DataFusionError::Execution(format!( - "expect a float64 array, but found {}", - float_le_array.data_type() - )) - })?; - for (i, v) in le_as_f64_array.iter().enumerate() { - if let Some(v) = v - && v == f64::INFINITY - { - return Ok(i); + fn flush_remaining(&mut self) -> DataFusionResult<()> { + if self.mode == FoldMode::Optimistic && self.input_buffered_rows > 0 { + let buffered_batches: Vec<_> = self.input_buffer.drain(..).collect(); + if !buffered_batches.is_empty() { + let batch = concat_batches(&self.input_schema, buffered_batches.as_slice())?; + self.switch_to_safe_mode(batch)?; + } else { + self.input_buffered_rows = 0; } } - Ok(batch.num_rows()) + if self.mode == FoldMode::Safe { + self.process_safe_mode_buffer()?; + self.finalize_safe_group()?; + } + + Ok(()) + } + + fn is_positive_infinity(le_array: &StringArray, index: usize) -> bool { + le_array.is_valid(index) + && matches!( + le_array.value(index).parse::(), + Ok(value) if value.is_infinite() && value.is_sign_positive() + ) } /// Evaluate the field column and return the result @@ -702,8 +939,28 @@ impl HistogramFoldStream { } // check input value - debug_assert!(bucket.windows(2).all(|w| w[0] <= w[1]), "{bucket:?}"); - debug_assert!(counter.windows(2).all(|w| w[0] <= w[1]), "{counter:?}"); + if !bucket.windows(2).all(|w| w[0] <= w[1]) { + return Ok(f64::NAN); + } + let counter = { + let needs_fix = + counter.iter().any(|v| !v.is_finite()) || !counter.windows(2).all(|w| w[0] <= w[1]); + if !needs_fix { + Cow::Borrowed(counter) + } else { + let mut fixed = Vec::with_capacity(counter.len()); + let mut prev = 0.0; + for (idx, &v) in counter.iter().enumerate() { + let mut val = if v.is_finite() { v } else { prev }; + if idx > 0 && val < prev { + val = prev; + } + fixed.push(val); + prev = val; + } + Cow::Owned(fixed) + } + }; let total = *counter.last().unwrap(); let expected_pos = total * quantile; @@ -722,6 +979,9 @@ impl HistogramFoldStream { lower_bound = bucket[fit_bucket_pos - 1]; lower_count = counter[fit_bucket_pos - 1]; } + if (upper_count - lower_count).abs() < 1e-10 { + return Ok(f64::NAN); + } Ok(lower_bound + (upper_bound - lower_bound) / (upper_count - lower_count) * (expected_pos - lower_count)) @@ -733,8 +993,8 @@ impl HistogramFoldStream { mod test { use std::sync::Arc; - use datafusion::arrow::array::Float64Array; - use datafusion::arrow::datatypes::{Field, Schema}; + use datafusion::arrow::array::{Float64Array, TimestampMillisecondArray}; + use 
datafusion::arrow::datatypes::{Field, Schema, SchemaRef, TimeUnit}; use datafusion::common::ToDFSchema; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::source::DataSourceExec; @@ -801,9 +1061,89 @@ mod test { )) } + fn build_fold_exec_from_batches( + batches: Vec, + schema: SchemaRef, + quantile: f64, + ts_column_index: usize, + ) -> Arc { + let input: Arc = Arc::new(DataSourceExec::new(Arc::new( + MemorySourceConfig::try_new(&[batches], schema.clone(), None).unwrap(), + ))); + let output_schema: SchemaRef = Arc::new( + HistogramFold::convert_schema(&Arc::new(input.schema().to_dfschema().unwrap()), "le") + .unwrap() + .as_arrow() + .clone(), + ); + + let (tag_columns, partition_exprs, properties) = + build_test_plan_properties(&input, output_schema.clone(), ts_column_index); + + Arc::new(HistogramFoldExec { + le_column_index: 1, + field_column_index: 2, + quantile, + ts_column_index, + input, + output_schema, + tag_columns, + partition_exprs, + metric: ExecutionPlanMetricsSet::new(), + properties, + }) + } + + type PlanPropsResult = ( + Vec>, + Vec>, + PlanProperties, + ); + + fn build_test_plan_properties( + input: &Arc, + output_schema: SchemaRef, + ts_column_index: usize, + ) -> PlanPropsResult { + let tag_columns = input + .schema() + .fields() + .iter() + .enumerate() + .filter_map(|(idx, field)| { + if idx == 1 || idx == 2 || idx == ts_column_index { + None + } else { + Some(Arc::new(PhyColumn::new(field.name(), idx)) as _) + } + }) + .collect::>(); + + let partition_exprs = if tag_columns.is_empty() { + vec![Arc::new(PhyColumn::new( + input.schema().field(ts_column_index).name(), + ts_column_index, + )) as _] + } else { + tag_columns.clone() + }; + + let properties = PlanProperties::new( + EquivalenceProperties::new(output_schema.clone()), + Partitioning::Hash( + partition_exprs.clone(), + input.output_partitioning().partition_count(), + ), + EmissionType::Incremental, + Boundedness::Bounded, + ); + + (tag_columns, partition_exprs, properties) + } + #[tokio::test] async fn fold_overall() { - let memory_exec = Arc::new(prepare_test_data()); + let memory_exec: Arc = Arc::new(prepare_test_data()); let output_schema: SchemaRef = Arc::new( HistogramFold::convert_schema( &Arc::new(memory_exec.schema().to_dfschema().unwrap()), @@ -813,19 +1153,17 @@ mod test { .as_arrow() .clone(), ); - let properties = PlanProperties::new( - EquivalenceProperties::new(output_schema.clone()), - Partitioning::UnknownPartitioning(1), - EmissionType::Incremental, - Boundedness::Bounded, - ); + let (tag_columns, partition_exprs, properties) = + build_test_plan_properties(&memory_exec, output_schema.clone(), 0); let fold_exec = Arc::new(HistogramFoldExec { le_column_index: 1, field_column_index: 2, quantile: 0.4, - ts_column_index: 9999, // not exist but doesn't matter + ts_column_index: 0, input: memory_exec, output_schema, + tag_columns, + partition_exprs, metric: ExecutionPlanMetricsSet::new(), properties, }); @@ -872,6 +1210,187 @@ mod test { assert_eq!(actual, expected_output_schema) } + #[tokio::test] + async fn fallback_to_safe_mode_on_missing_inf() { + let schema = Arc::new(Schema::new(vec![ + Field::new("host", DataType::Utf8, true), + Field::new("le", DataType::Utf8, true), + Field::new("val", DataType::Float64, true), + ])); + let host_column = Arc::new(StringArray::from(vec!["a", "a", "a", "a", "b", "b"])) as _; + let le_column = Arc::new(StringArray::from(vec![ + "0.1", "+Inf", "0.1", "1.0", "0.1", "+Inf", + ])) as _; + let val_column = 
Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 3.0, 1.0, 5.0])) as _; + let batch = + RecordBatch::try_new(schema.clone(), vec![host_column, le_column, val_column]).unwrap(); + let fold_exec = build_fold_exec_from_batches(vec![batch], schema, 0.5, 0); + let session_context = SessionContext::default(); + let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx()) + .await + .unwrap(); + let result_literal = datatypes::arrow::util::pretty::pretty_format_batches(&result) + .unwrap() + .to_string(); + + let expected = String::from( + "+------+-----+ +| host | val | ++------+-----+ +| a | 0.1 | +| a | NaN | +| b | 0.1 | ++------+-----+", + ); + assert_eq!(result_literal, expected); + } + + #[tokio::test] + async fn emit_nan_when_no_inf_present() { + let schema = Arc::new(Schema::new(vec![ + Field::new("host", DataType::Utf8, true), + Field::new("le", DataType::Utf8, true), + Field::new("val", DataType::Float64, true), + ])); + let host_column = Arc::new(StringArray::from(vec!["c", "c"])) as _; + let le_column = Arc::new(StringArray::from(vec!["0.1", "1.0"])) as _; + let val_column = Arc::new(Float64Array::from(vec![1.0, 2.0])) as _; + let batch = + RecordBatch::try_new(schema.clone(), vec![host_column, le_column, val_column]).unwrap(); + let fold_exec = build_fold_exec_from_batches(vec![batch], schema, 0.9, 0); + let session_context = SessionContext::default(); + let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx()) + .await + .unwrap(); + let result_literal = datatypes::arrow::util::pretty::pretty_format_batches(&result) + .unwrap() + .to_string(); + + let expected = String::from( + "+------+-----+ +| host | val | ++------+-----+ +| c | NaN | ++------+-----+", + ); + assert_eq!(result_literal, expected); + } + + #[tokio::test] + async fn safe_mode_handles_misaligned_groups() { + let schema = Arc::new(Schema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("le", DataType::Utf8, true), + Field::new("val", DataType::Float64, true), + ])); + + let ts_column = Arc::new(TimestampMillisecondArray::from(vec![ + 2900000, 2900000, 2900000, 3000000, 3000000, 3000000, 3000000, 3005000, 3005000, + 3010000, 3010000, 3010000, 3010000, 3010000, + ])) as _; + let le_column = Arc::new(StringArray::from(vec![ + "0.1", "1", "5", "0.1", "1", "5", "+Inf", "0.1", "+Inf", "0.1", "1", "3", "5", "+Inf", + ])) as _; + let val_column = Arc::new(Float64Array::from(vec![ + 0.0, 0.0, 0.0, 50.0, 70.0, 110.0, 120.0, 10.0, 30.0, 10.0, 20.0, 30.0, 40.0, 50.0, + ])) as _; + let batch = + RecordBatch::try_new(schema.clone(), vec![ts_column, le_column, val_column]).unwrap(); + let fold_exec = build_fold_exec_from_batches(vec![batch], schema, 0.5, 0); + let session_context = SessionContext::default(); + let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx()) + .await + .unwrap(); + + let mut values = Vec::new(); + for batch in result { + let array = batch.column(1).as_primitive::(); + values.extend(array.iter().map(|v| v.unwrap())); + } + + assert_eq!(values.len(), 4); + assert!(values[0].is_nan()); + assert!((values[1] - 0.55).abs() < 1e-10); + assert!((values[2] - 0.1).abs() < 1e-10); + assert!((values[3] - 2.0).abs() < 1e-10); + } + + #[tokio::test] + async fn missing_buckets_at_first_timestamp() { + let schema = Arc::new(Schema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("le", DataType::Utf8, true), + Field::new("val", 
DataType::Float64, true), + ])); + + let ts_column = Arc::new(TimestampMillisecondArray::from(vec![ + 2_900_000, 3_000_000, 3_000_000, 3_000_000, 3_000_000, 3_005_000, 3_005_000, 3_010_000, + 3_010_000, 3_010_000, 3_010_000, 3_010_000, + ])) as _; + let le_column = Arc::new(StringArray::from(vec![ + "0.1", "0.1", "1", "5", "+Inf", "0.1", "+Inf", "0.1", "1", "3", "5", "+Inf", + ])) as _; + let val_column = Arc::new(Float64Array::from(vec![ + 0.0, 50.0, 70.0, 110.0, 120.0, 10.0, 30.0, 10.0, 20.0, 30.0, 40.0, 50.0, + ])) as _; + + let batch = + RecordBatch::try_new(schema.clone(), vec![ts_column, le_column, val_column]).unwrap(); + let fold_exec = build_fold_exec_from_batches(vec![batch], schema, 0.5, 0); + let session_context = SessionContext::default(); + let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx()) + .await + .unwrap(); + + let mut values = Vec::new(); + for batch in result { + let array = batch.column(1).as_primitive::(); + values.extend(array.iter().map(|v| v.unwrap())); + } + + assert_eq!(values.len(), 4); + assert!(values[0].is_nan()); + assert!((values[1] - 0.55).abs() < 1e-10); + assert!((values[2] - 0.1).abs() < 1e-10); + assert!((values[3] - 2.0).abs() < 1e-10); + } + + #[tokio::test] + async fn missing_inf_in_first_group() { + let schema = Arc::new(Schema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("le", DataType::Utf8, true), + Field::new("val", DataType::Float64, true), + ])); + + let ts_column = Arc::new(TimestampMillisecondArray::from(vec![ + 1000, 1000, 1000, 2000, 2000, 2000, 2000, + ])) as _; + let le_column = Arc::new(StringArray::from(vec![ + "0.1", "1", "5", "0.1", "1", "5", "+Inf", + ])) as _; + let val_column = Arc::new(Float64Array::from(vec![ + 0.0, 0.0, 0.0, 10.0, 20.0, 30.0, 30.0, + ])) as _; + let batch = + RecordBatch::try_new(schema.clone(), vec![ts_column, le_column, val_column]).unwrap(); + let fold_exec = build_fold_exec_from_batches(vec![batch], schema, 0.5, 0); + let session_context = SessionContext::default(); + let result = datafusion::physical_plan::collect(fold_exec, session_context.task_ctx()) + .await + .unwrap(); + + let mut values = Vec::new(); + for batch in result { + let array = batch.column(1).as_primitive::(); + values.extend(array.iter().map(|v| v.unwrap())); + } + + assert_eq!(values.len(), 2); + assert!(values[0].is_nan()); + assert!((values[1] - 0.55).abs() < 1e-10, "{values:?}"); + } + #[test] fn evaluate_row_normal_case() { let bucket = [0.0, 1.0, 2.0, 3.0, 4.0, f64::INFINITY]; @@ -944,11 +1463,11 @@ mod test { } #[test] - #[should_panic] fn evaluate_out_of_order_input() { let bucket = [0.0, 1.0, 2.0, 3.0, 4.0, f64::INFINITY]; let counters = [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]; - HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); + let result = HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); + assert_eq!(0.0, result); } #[test] @@ -966,4 +1485,20 @@ mod test { let result = HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); assert_eq!(3.0, result); } + + #[test] + fn evaluate_non_monotonic_counter() { + let bucket = [0.0, 1.0, 2.0, 3.0, f64::INFINITY]; + let counters = [0.1, 0.2, 0.4, 0.17, 0.5]; + let result = HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); + assert!((result - 1.25).abs() < 1e-10, "{result}"); + } + + #[test] + fn evaluate_nan_counter() { + let bucket = [0.0, 1.0, 2.0, 3.0, f64::INFINITY]; + let counters = [f64::NAN, 1.0, 2.0, 3.0, 3.0]; + let result = 
HistogramFoldStream::evaluate_row(0.5, &bucket, &counters).unwrap(); + assert!((result - 1.5).abs() < 1e-10, "{result}"); + } } diff --git a/src/promql/src/extension_plan/normalize.rs b/src/promql/src/extension_plan/normalize.rs index ccd21a9cd7..9466508607 100644 --- a/src/promql/src/extension_plan/normalize.rs +++ b/src/promql/src/extension_plan/normalize.rs @@ -365,7 +365,7 @@ impl SeriesNormalizeStream { Arc::new(ts_column.clone()) as _ } else { Arc::new(TimestampMillisecondArray::from_iter( - ts_column.iter().map(|ts| ts.map(|ts| ts - self.offset)), + ts_column.iter().map(|ts| ts.map(|ts| ts + self.offset)), )) }; let mut columns = input.columns().to_vec(); @@ -518,11 +518,11 @@ mod test { "+---------------------+--------+------+\ \n| timestamp | value | path |\ \n+---------------------+--------+------+\ - \n| 1970-01-01T00:00:59 | 0.0 | foo |\ - \n| 1970-01-01T00:01:59 | 1.0 | foo |\ - \n| 1969-12-31T23:59:59 | 10.0 | foo |\ - \n| 1970-01-01T00:00:29 | 100.0 | foo |\ - \n| 1970-01-01T00:01:29 | 1000.0 | foo |\ + \n| 1970-01-01T00:01:01 | 0.0 | foo |\ + \n| 1970-01-01T00:02:01 | 1.0 | foo |\ + \n| 1970-01-01T00:00:01 | 10.0 | foo |\ + \n| 1970-01-01T00:00:31 | 100.0 | foo |\ + \n| 1970-01-01T00:01:31 | 1000.0 | foo |\ \n+---------------------+--------+------+", ); diff --git a/src/puffin/src/puffin_manager.rs b/src/puffin/src/puffin_manager.rs index 9f287128c1..060bd4237b 100644 --- a/src/puffin/src/puffin_manager.rs +++ b/src/puffin/src/puffin_manager.rs @@ -32,6 +32,15 @@ use crate::blob_metadata::{BlobMetadata, CompressionCodec}; use crate::error::Result; use crate::file_metadata::FileMetadata; +/// Metrics returned by `PuffinReader::dir` operations. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct DirMetrics { + /// Whether this was a cache hit (true) or cache miss (false). + pub cache_hit: bool, + /// Size of the directory in bytes. + pub dir_size: u64, +} + /// The `PuffinManager` trait provides a unified interface for creating `PuffinReader` and `PuffinWriter`. #[async_trait] pub trait PuffinManager { @@ -106,9 +115,10 @@ pub trait PuffinReader { /// Reads a directory from the Puffin file. /// - /// The returned `GuardWithMetadata` is used to access the directory data and its metadata. + /// The returned tuple contains `GuardWithMetadata` and `DirMetrics`. + /// The `GuardWithMetadata` is used to access the directory data and its metadata. /// Users should hold the `GuardWithMetadata` until they are done with the directory data. - async fn dir(&self, key: &str) -> Result>; + async fn dir(&self, key: &str) -> Result<(GuardWithMetadata, DirMetrics)>; } /// `BlobGuard` is provided by the `PuffinReader` to access the blob data. 
diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager.rs b/src/puffin/src/puffin_manager/fs_puffin_manager.rs index 61f6d5b597..8b4e6e64a7 100644 --- a/src/puffin/src/puffin_manager/fs_puffin_manager.rs +++ b/src/puffin/src/puffin_manager/fs_puffin_manager.rs @@ -56,6 +56,10 @@ impl FsPuffinManager { self.puffin_metadata_cache = puffin_metadata_cache; self } + + pub fn file_accessor(&self) -> &F { + &self.puffin_file_accessor + } } #[async_trait] diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs index 8339d32c95..c660d1e19a 100644 --- a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs +++ b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs @@ -36,7 +36,7 @@ use crate::puffin_manager::file_accessor::PuffinFileAccessor; use crate::puffin_manager::fs_puffin_manager::PuffinMetadataCacheRef; use crate::puffin_manager::fs_puffin_manager::dir_meta::DirMetadata; use crate::puffin_manager::stager::{BoxWriter, DirWriterProviderRef, Stager}; -use crate::puffin_manager::{BlobGuard, GuardWithMetadata, PuffinReader}; +use crate::puffin_manager::{BlobGuard, DirMetrics, GuardWithMetadata, PuffinReader}; /// `FsPuffinReader` is a `PuffinReader` that provides fs readers for puffin files. pub struct FsPuffinReader @@ -130,10 +130,10 @@ where Ok(GuardWithMetadata::new(blob, blob_metadata)) } - async fn dir(&self, key: &str) -> Result> { + async fn dir(&self, key: &str) -> Result<(GuardWithMetadata, DirMetrics)> { let mut file = self.puffin_reader().await?; let blob_metadata = self.get_blob_metadata(key, &mut file).await?; - let dir = self + let (dir, metrics) = self .stager .get_dir( &self.handle, @@ -153,7 +153,7 @@ where ) .await?; - Ok(GuardWithMetadata::new(dir, blob_metadata)) + Ok((GuardWithMetadata::new(dir, blob_metadata), metrics)) } } diff --git a/src/puffin/src/puffin_manager/stager.rs b/src/puffin/src/puffin_manager/stager.rs index 708053bb27..512e94f4e8 100644 --- a/src/puffin/src/puffin_manager/stager.rs +++ b/src/puffin/src/puffin_manager/stager.rs @@ -23,7 +23,7 @@ use futures::AsyncWrite; use futures::future::BoxFuture; use crate::error::Result; -use crate::puffin_manager::{BlobGuard, DirGuard}; +use crate::puffin_manager::{BlobGuard, DirGuard, DirMetrics}; pub type BoxWriter = Box; @@ -72,14 +72,15 @@ pub trait Stager: Send + Sync { /// Retrieves a directory, initializing it if necessary using the provided `init_fn`. /// - /// The returned `DirGuard` is used to access the directory in the filesystem. + /// The returned tuple contains the `DirGuard` and `DirMetrics`. + /// The `DirGuard` is used to access the directory in the filesystem. /// The caller is responsible for holding the `DirGuard` until they are done with the directory. async fn get_dir<'a>( &self, handle: &Self::FileHandle, key: &str, init_fn: Box, - ) -> Result; + ) -> Result<(Self::Dir, DirMetrics)>; /// Stores a directory in the staging area. 
async fn put_dir( diff --git a/src/puffin/src/puffin_manager/stager/bounded_stager.rs b/src/puffin/src/puffin_manager/stager/bounded_stager.rs index 380cce7930..dfb9285452 100644 --- a/src/puffin/src/puffin_manager/stager/bounded_stager.rs +++ b/src/puffin/src/puffin_manager/stager/bounded_stager.rs @@ -41,7 +41,7 @@ use crate::error::{ use crate::puffin_manager::stager::{ BoxWriter, DirWriterProvider, InitBlobFn, InitDirFn, Stager, StagerNotifier, }; -use crate::puffin_manager::{BlobGuard, DirGuard}; +use crate::puffin_manager::{BlobGuard, DirGuard, DirMetrics}; const DELETE_QUEUE_SIZE: usize = 10240; const TMP_EXTENSION: &str = "tmp"; @@ -203,7 +203,7 @@ impl Stager for BoundedStager { handle: &Self::FileHandle, key: &str, init_fn: Box, - ) -> Result { + ) -> Result<(Self::Dir, DirMetrics)> { let handle_str = handle.to_string(); let cache_key = Self::encode_cache_key(&handle_str, key); @@ -242,15 +242,22 @@ impl Stager for BoundedStager { .await .context(CacheGetSnafu)?; + let dir_size = v.size(); if let Some(notifier) = self.notifier.as_ref() { if miss { - notifier.on_cache_miss(v.size()); + notifier.on_cache_miss(dir_size); } else { - notifier.on_cache_hit(v.size()); + notifier.on_cache_hit(dir_size); } } + + let metrics = DirMetrics { + cache_hit: !miss, + dir_size, + }; + match v { - CacheValue::Dir(guard) => Ok(guard), + CacheValue::Dir(guard) => Ok((guard, metrics)), _ => unreachable!(), } } @@ -882,7 +889,7 @@ mod tests { let puffin_file_name = "test_get_dir".to_string(); let key = "key"; - let dir_path = stager + let (dir_path, metrics) = stager .get_dir( &puffin_file_name, key, @@ -901,6 +908,9 @@ mod tests { .await .unwrap(); + assert!(!metrics.cache_hit); + assert!(metrics.dir_size > 0); + for (rel_path, content) in &files_in_dir { let file_path = dir_path.path().join(rel_path); let mut file = tokio::fs::File::open(&file_path).await.unwrap(); @@ -974,7 +984,7 @@ mod tests { ]; let dir_key = "dir_key"; - let guard = stager + let (guard, _metrics) = stager .get_dir( &puffin_file_name, dir_key, @@ -1016,7 +1026,7 @@ mod tests { let buf = reader.read(0..m.content_length).await.unwrap(); assert_eq!(&*buf, b"hello world"); - let dir_path = stager + let (dir_path, metrics) = stager .get_dir( &puffin_file_name, dir_key, @@ -1024,6 +1034,9 @@ mod tests { ) .await .unwrap(); + + assert!(metrics.cache_hit); + assert!(metrics.dir_size > 0); for (rel_path, content) in &files_in_dir { let file_path = dir_path.path().join(rel_path); let mut file = tokio::fs::File::open(&file_path).await.unwrap(); @@ -1151,7 +1164,7 @@ mod tests { ]; // First time to get the directory - let guard_0 = stager + let (guard_0, _metrics) = stager .get_dir( &puffin_file_name, dir_key, @@ -1198,7 +1211,7 @@ mod tests { ); // Second time to get the directory - let guard_1 = stager + let (guard_1, _metrics) = stager .get_dir( &puffin_file_name, dir_key, @@ -1237,7 +1250,7 @@ mod tests { // Third time to get the directory and all guards are dropped drop(guard_0); drop(guard_1); - let guard_2 = stager + let (guard_2, _metrics) = stager .get_dir( &puffin_file_name, dir_key, @@ -1390,7 +1403,7 @@ mod tests { ]; let dir_key = "dir_key"; - let guard = stager + let (guard, _metrics) = stager .get_dir( &puffin_file_name, dir_key, diff --git a/src/puffin/src/puffin_manager/tests.rs b/src/puffin/src/puffin_manager/tests.rs index 715668e40e..f1ee9fabd7 100644 --- a/src/puffin/src/puffin_manager/tests.rs +++ b/src/puffin/src/puffin_manager/tests.rs @@ -356,7 +356,7 @@ async fn check_dir( stager: &BoundedStager, puffin_reader: 
&impl PuffinReader, ) { - let res_dir = puffin_reader.dir(key).await.unwrap(); + let (res_dir, _metrics) = puffin_reader.dir(key).await.unwrap(); let metadata = res_dir.metadata(); assert_eq!( metadata.properties, diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs index ab27666c01..9c81b8d524 100644 --- a/src/query/src/datafusion.rs +++ b/src/query/src/datafusion.rs @@ -32,6 +32,7 @@ use common_recordbatch::adapter::RecordBatchStreamAdapter; use common_recordbatch::{EmptyRecordBatchStream, SendableRecordBatchStream}; use common_telemetry::tracing; use datafusion::catalog::TableFunction; +use datafusion::dataframe::DataFrame; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::analyze::AnalyzeExec; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -49,7 +50,6 @@ use table::TableRef; use table::requests::{DeleteRequest, InsertRequest}; use crate::analyze::DistAnalyzeExec; -use crate::dataframe::DataFrame; pub use crate::datafusion::planner::DfContextProviderAdapter; use crate::dist_plan::{DistPlannerOptions, MergeScanLogicalPlan}; use crate::error::{ @@ -515,13 +515,11 @@ impl QueryEngine for DatafusionQueryEngine { } fn read_table(&self, table: TableRef) -> Result { - Ok(DataFrame::DataFusion( - self.state - .read_table(table) - .context(error::DatafusionSnafu) - .map_err(BoxedError::new) - .context(QueryExecutionSnafu)?, - )) + self.state + .read_table(table) + .context(error::DatafusionSnafu) + .map_err(BoxedError::new) + .context(QueryExecutionSnafu) } fn engine_context(&self, query_ctx: QueryContextRef) -> QueryEngineContext { @@ -682,13 +680,14 @@ impl QueryExecutor for DatafusionQueryEngine { mod tests { use std::sync::Arc; + use arrow::array::{ArrayRef, UInt64Array}; use catalog::RegisterTableRequest; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID}; use common_recordbatch::util; use datafusion::prelude::{col, lit}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::ColumnSchema; - use datatypes::vectors::{Helper, UInt32Vector, UInt64Vector, VectorRef}; + use datatypes::vectors::{Helper, UInt32Vector, VectorRef}; use session::context::{QueryContext, QueryContextBuilder}; use table::table::numbers::{NUMBERS_TABLE_NAME, NumbersTable}; @@ -770,10 +769,8 @@ mod tests { assert_eq!(1, batch.num_columns()); assert_eq!(batch.column(0).len(), 1); - assert_eq!( - *batch.column(0), - Arc::new(UInt64Vector::from_slice([4950])) as VectorRef - ); + let expected = Arc::new(UInt64Array::from_iter_values([4950])) as ArrayRef; + assert_eq!(batch.column(0), &expected); } _ => unreachable!(), } @@ -800,7 +797,7 @@ mod tests { .await .unwrap(); - let DataFrame::DataFusion(df) = engine.read_table(table).unwrap(); + let df = engine.read_table(table).unwrap(); let df = df .select_columns(&["number"]) .unwrap() diff --git a/src/query/src/datafusion/planner.rs b/src/query/src/datafusion/planner.rs index d9c74b9d5a..43e7a04db1 100644 --- a/src/query/src/datafusion/planner.rs +++ b/src/query/src/datafusion/planner.rs @@ -41,6 +41,8 @@ use snafu::{Location, ResultExt}; use crate::error::{CatalogSnafu, Result}; use crate::query_engine::{DefaultPlanDecoder, QueryEngineState}; +mod function_alias; + pub struct DfContextProviderAdapter { engine_state: Arc, session_state: SessionState, @@ -147,7 +149,17 @@ impl ContextProvider for DfContextProviderAdapter { fn get_function_meta(&self, name: &str) -> Option> { self.engine_state.scalar_function(name).map_or_else( - || 
self.session_state.scalar_functions().get(name).cloned(), + || { + self.session_state + .scalar_functions() + .get(name) + .cloned() + .or_else(|| { + function_alias::resolve_scalar(name).and_then(|name| { + self.session_state.scalar_functions().get(name).cloned() + }) + }) + }, |func| { Some(Arc::new(func.provide(FunctionContext { query_ctx: self.query_ctx.clone(), @@ -159,7 +171,17 @@ impl ContextProvider for DfContextProviderAdapter { fn get_aggregate_meta(&self, name: &str) -> Option> { self.engine_state.aggr_function(name).map_or_else( - || self.session_state.aggregate_functions().get(name).cloned(), + || { + self.session_state + .aggregate_functions() + .get(name) + .cloned() + .or_else(|| { + function_alias::resolve_aggregate(name).and_then(|name| { + self.session_state.aggregate_functions().get(name).cloned() + }) + }) + }, |func| Some(Arc::new(func)), ) } @@ -193,12 +215,14 @@ impl ContextProvider for DfContextProviderAdapter { fn udf_names(&self) -> Vec { let mut names = self.engine_state.scalar_names(); names.extend(self.session_state.scalar_functions().keys().cloned()); + names.extend(function_alias::scalar_alias_names().map(|name| name.to_string())); names } fn udaf_names(&self) -> Vec { let mut names = self.engine_state.aggr_names(); names.extend(self.session_state.aggregate_functions().keys().cloned()); + names.extend(function_alias::aggregate_alias_names().map(|name| name.to_string())); names } @@ -233,9 +257,14 @@ impl ContextProvider for DfContextProviderAdapter { .table_functions() .get(name) .cloned() - .ok_or_else(|| { - DataFusionError::Plan(format!("table function '{name}' not found")) - })?; + .or_else(|| { + function_alias::resolve_scalar(name) + .and_then(|alias| self.session_state.table_functions().get(alias).cloned()) + }); + + let tbl_func = tbl_func.ok_or_else(|| { + DataFusionError::Plan(format!("table function '{name}' not found")) + })?; let provider = tbl_func.create_table_provider(&args)?; Ok(provider_as_source(provider)) diff --git a/src/query/src/datafusion/planner/function_alias.rs b/src/query/src/datafusion/planner/function_alias.rs new file mode 100644 index 0000000000..898ef81e93 --- /dev/null +++ b/src/query/src/datafusion/planner/function_alias.rs @@ -0,0 +1,86 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use once_cell::sync::Lazy; + +const SCALAR_ALIASES: &[(&str, &str)] = &[ + // SQL compat aliases. + ("ucase", "upper"), + ("lcase", "lower"), + ("ceiling", "ceil"), + ("mid", "substr"), + // MySQL's RAND([seed]) accepts an optional seed argument, while DataFusion's `random()` + // does not. We alias the name for `rand()` compatibility, and `rand(seed)` will error + // due to mismatched arity. + ("rand", "random"), +]; + +const AGGREGATE_ALIASES: &[(&str, &str)] = &[ + // MySQL compat aliases that don't override existing DataFusion aggregate names. 
+ // + // NOTE: We intentionally do NOT alias `stddev` here, because DataFusion defines `stddev` + // as sample standard deviation while MySQL's `STDDEV` is population standard deviation. + ("std", "stddev_pop"), + ("variance", "var_pop"), +]; + +static SCALAR_FUNCTION_ALIAS: Lazy> = + Lazy::new(|| SCALAR_ALIASES.iter().copied().collect()); + +static AGGREGATE_FUNCTION_ALIAS: Lazy> = + Lazy::new(|| AGGREGATE_ALIASES.iter().copied().collect()); + +pub fn resolve_scalar(name: &str) -> Option<&'static str> { + let name = name.to_ascii_lowercase(); + SCALAR_FUNCTION_ALIAS.get(name.as_str()).copied() +} + +pub fn resolve_aggregate(name: &str) -> Option<&'static str> { + let name = name.to_ascii_lowercase(); + AGGREGATE_FUNCTION_ALIAS.get(name.as_str()).copied() +} + +pub fn scalar_alias_names() -> impl Iterator { + SCALAR_ALIASES.iter().map(|(name, _)| *name) +} + +pub fn aggregate_alias_names() -> impl Iterator { + AGGREGATE_ALIASES.iter().map(|(name, _)| *name) +} + +#[cfg(test)] +mod tests { + use super::{resolve_aggregate, resolve_scalar}; + + #[test] + fn resolves_scalar_aliases_case_insensitive() { + assert_eq!(resolve_scalar("ucase"), Some("upper")); + assert_eq!(resolve_scalar("UCASE"), Some("upper")); + assert_eq!(resolve_scalar("lcase"), Some("lower")); + assert_eq!(resolve_scalar("ceiling"), Some("ceil")); + assert_eq!(resolve_scalar("MID"), Some("substr")); + assert_eq!(resolve_scalar("RAND"), Some("random")); + assert_eq!(resolve_scalar("not_a_real_alias"), None); + } + + #[test] + fn resolves_aggregate_aliases_case_insensitive() { + assert_eq!(resolve_aggregate("std"), Some("stddev_pop")); + assert_eq!(resolve_aggregate("variance"), Some("var_pop")); + assert_eq!(resolve_aggregate("STDDEV"), None); + assert_eq!(resolve_aggregate("not_a_real_alias"), None); + } +} diff --git a/src/query/src/dist_plan/analyzer.rs b/src/query/src/dist_plan/analyzer.rs index 34e035644b..0bdf4dd70a 100644 --- a/src/query/src/dist_plan/analyzer.rs +++ b/src/query/src/dist_plan/analyzer.rs @@ -30,6 +30,7 @@ use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder, Subquery, col as co use datafusion_optimizer::analyzer::AnalyzerRule; use datafusion_optimizer::simplify_expressions::SimplifyExpressions; use datafusion_optimizer::{OptimizerConfig, OptimizerRule}; +use promql::extension_plan::SeriesDivide; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; use table::metadata::TableType; use table::table::adapter::DfTableProviderAdapter; @@ -380,7 +381,23 @@ impl PlanRewriter { } match Categorizer::check_plan(plan, self.partition_cols.clone())? { - Commutativity::Commutative => {} + Commutativity::Commutative => { + // PATCH: we should reconsider SORT's commutativity instead of doing this trick. + // explain: for a fully commutative SeriesDivide, its child Sort plan only serves it. I.e., that + // Sort plan is also fully commutative, instead of conditional commutative. So we can remove + // the generated MergeSort from stage safely. + if let LogicalPlan::Extension(ext_a) = plan + && ext_a.node.name() == SeriesDivide::name() + && let Some(LogicalPlan::Extension(ext_b)) = self.stage.last() + && ext_b.node.name() == MergeSortLogicalPlan::name() + { + // revert last `ConditionalCommutative` result for Sort plan in this case. + // `update_column_requirements` left unchanged because Sort won't generate + // new columns or remove existing columns. 
+ self.stage.pop(); + self.expand_on_next_part_cond_trans_commutative = false; + } + } Commutativity::PartialCommutative => { if let Some(plan) = partial_commutative_transformer(plan) { // notice this plan is parent of current node, so `self.level - 1` when updating column requirements diff --git a/src/query/src/dist_plan/commutativity.rs b/src/query/src/dist_plan/commutativity.rs index c8652b8d52..f8b3b18f1c 100644 --- a/src/query/src/dist_plan/commutativity.rs +++ b/src/query/src/dist_plan/commutativity.rs @@ -187,7 +187,7 @@ impl Categorizer { if partition_cols.is_empty() { Commutativity::Commutative } else { - Commutativity::Unimplemented + Commutativity::PartialCommutative } } LogicalPlan::Unnest(_) => Commutativity::Commutative, diff --git a/src/query/src/dist_plan/merge_scan.rs b/src/query/src/dist_plan/merge_scan.rs index aebf9a457d..a4dd5243a7 100644 --- a/src/query/src/dist_plan/merge_scan.rs +++ b/src/query/src/dist_plan/merge_scan.rs @@ -20,40 +20,35 @@ use ahash::{HashMap, HashSet}; use arrow_schema::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, SortOptions}; use async_stream::stream; use common_catalog::parse_catalog_and_schema_from_db_string; -use common_error::ext::BoxedError; use common_plugins::GREPTIME_EXEC_READ_COST; use common_query::request::QueryRequest; -use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchMetrics}; -use common_recordbatch::error::ExternalSnafu; -use common_recordbatch::{ - DfSendableRecordBatchStream, RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream, -}; +use common_recordbatch::adapter::RecordBatchMetrics; use common_telemetry::tracing_context::TracingContext; +use datafusion::arrow::record_batch::RecordBatch; use datafusion::execution::{SessionState, TaskContext}; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::metrics::{ Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder, MetricsSet, Time, }; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, }; -use datafusion_common::{Column as ColumnExpr, Result}; +use datafusion_common::{Column as ColumnExpr, DataFusionError, Result}; use datafusion_expr::{Expr, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{Distribution, EquivalenceProperties, PhysicalSortExpr}; -use datatypes::schema::{Schema, SchemaRef}; use futures_util::StreamExt; use greptime_proto::v1::region::RegionRequestHeader; use meter_core::data::ReadItem; use meter_macros::read_meter; use session::context::QueryContextRef; -use snafu::ResultExt; use store_api::storage::RegionId; use table::table_name::TableName; use tokio::time::Instant; use crate::dist_plan::analyzer::AliasMapping; -use crate::error::ConvertSchemaSnafu; use crate::metrics::{MERGE_SCAN_ERRORS_TOTAL, MERGE_SCAN_POLL_ELAPSED, MERGE_SCAN_REGIONS}; use crate::region_query::RegionQueryHandlerRef; @@ -140,7 +135,6 @@ pub struct MergeScanExec { table: TableName, regions: Vec, plan: LogicalPlan, - schema: SchemaRef, arrow_schema: ArrowSchemaRef, region_query_handler: RegionQueryHandlerRef, metric: ExecutionPlanMetricsSet, @@ -159,7 +153,6 @@ impl std::fmt::Debug for MergeScanExec { f.debug_struct("MergeScanExec") .field("table", &self.table) .field("regions", &self.regions) - .field("schema", &self.schema) .field("plan", &self.plan) .finish() } @@ 
-238,12 +231,10 @@ impl MergeScanExec { EmissionType::Incremental, Boundedness::Bounded, ); - let schema = Self::arrow_schema_to_schema(arrow_schema.clone())?; Ok(Self { table, regions, plan, - schema, arrow_schema, region_query_handler, metric: ExecutionPlanMetricsSet::new(), @@ -265,7 +256,7 @@ impl MergeScanExec { let regions = self.regions.clone(); let region_query_handler = self.region_query_handler.clone(); let metric = MergeScanMetric::new(&self.metric); - let schema = self.schema.clone(); + let arrow_schema = self.arrow_schema.clone(); let query_ctx = self.query_ctx.clone(); let sub_stage_metrics_moved = self.sub_stage_metrics.clone(); let partition_metrics_moved = self.partition_metrics.clone(); @@ -318,9 +309,8 @@ impl MergeScanExec { .await .map_err(|e| { MERGE_SCAN_ERRORS_TOTAL.inc(); - BoxedError::new(e) - }) - .context(ExternalSnafu)?; + DataFusionError::External(Box::new(e)) + })?; let do_get_cost = do_get_start.elapsed(); ready_timer.stop(); @@ -331,10 +321,11 @@ impl MergeScanExec { let poll_elapsed = poll_timer.elapsed(); poll_duration += poll_elapsed; - let batch = batch?; - // reconstruct batch using `self.schema` - // to remove metadata and correct column name - let batch = RecordBatch::new(schema.clone(), batch.columns().iter().cloned())?; + let batch = batch.map_err(|e| DataFusionError::External(Box::new(e)))?; + let batch = RecordBatch::try_new( + arrow_schema.clone(), + batch.into_df_record_batch().columns().to_vec(), + )?; metric.record_output_batch_rows(batch.num_rows()); if let Some(mut first_consume_timer) = first_consume_timer.take() { first_consume_timer.stop(); @@ -410,12 +401,10 @@ impl MergeScanExec { } })); - Ok(Box::pin(RecordBatchStreamWrapper { - schema: self.schema.clone(), + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.arrow_schema.clone(), stream, - output_ordering: None, - metrics: Default::default(), - })) + ))) } pub fn try_with_new_distribution(&self, distribution: Distribution) -> Option { @@ -453,7 +442,6 @@ impl MergeScanExec { table: self.table.clone(), regions: self.regions.clone(), plan: self.plan.clone(), - schema: self.schema.clone(), arrow_schema: self.arrow_schema.clone(), region_query_handler: self.region_query_handler.clone(), metric: self.metric.clone(), @@ -471,11 +459,6 @@ impl MergeScanExec { }) } - fn arrow_schema_to_schema(arrow_schema: ArrowSchemaRef) -> Result { - let schema = Schema::try_from(arrow_schema).context(ConvertSchemaSnafu)?; - Ok(Arc::new(schema)) - } - pub fn sub_stage_metrics(&self) -> Vec { self.sub_stage_metrics .lock() @@ -614,10 +597,8 @@ impl ExecutionPlan for MergeScanExec { &self, partition: usize, context: Arc, - ) -> Result { - Ok(Box::pin(DfRecordBatchStreamAdapter::new( - self.to_stream(context, partition)?, - ))) + ) -> Result { + self.to_stream(context, partition) } fn metrics(&self) -> Option { diff --git a/src/query/src/lib.rs b/src/query/src/lib.rs index 0c6ccaa41f..2b159a91b1 100644 --- a/src/query/src/lib.rs +++ b/src/query/src/lib.rs @@ -19,7 +19,6 @@ #![feature(box_patterns)] mod analyze; -pub mod dataframe; pub mod datafusion; pub mod dist_plan; pub mod dummy_catalog; diff --git a/src/query/src/optimizer/parallelize_scan.rs b/src/query/src/optimizer/parallelize_scan.rs index c6baecc4b6..26573df758 100644 --- a/src/query/src/optimizer/parallelize_scan.rs +++ b/src/query/src/optimizer/parallelize_scan.rs @@ -62,7 +62,9 @@ impl ParallelizeScan { plan.as_any().downcast_ref::() { let expected_partition_num = config.execution.target_partitions; - if region_scan_exec.is_partition_set() { + if 
region_scan_exec.is_partition_set() + || region_scan_exec.scanner_type().as_str() == "SinglePartition" + { return Ok(Transformed::no(plan)); } @@ -85,11 +87,19 @@ impl ParallelizeScan { && order_expr.options.descending { for ranges in partition_ranges.iter_mut() { - ranges.sort_by(|a, b| b.end.cmp(&a.end)); + // Primary: end descending (larger end first) + // Secondary: start descending (shorter range first when ends are equal) + ranges.sort_by(|a, b| { + b.end.cmp(&a.end).then_with(|| b.start.cmp(&a.start)) + }); } } else { for ranges in partition_ranges.iter_mut() { - ranges.sort_by(|a, b| a.start.cmp(&b.start)); + // Primary: start ascending (smaller start first) + // Secondary: end ascending (shorter range first when starts are equal) + ranges.sort_by(|a, b| { + a.start.cmp(&b.start).then_with(|| a.end.cmp(&b.end)) + }); } } diff --git a/src/query/src/optimizer/test_util.rs b/src/query/src/optimizer/test_util.rs index cc5712b8a5..8258b13490 100644 --- a/src/query/src/optimizer/test_util.rs +++ b/src/query/src/optimizer/test_util.rs @@ -28,7 +28,8 @@ use store_api::metadata::{ ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, }; use store_api::region_engine::{ - RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, + CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole, + RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse, }; use store_api::region_request::RegionRequest; @@ -117,6 +118,21 @@ impl RegionEngine for MetaRegionEngine { unimplemented!() } + async fn remap_manifests( + &self, + _request: RemapManifestsRequest, + ) -> Result { + unimplemented!() + } + + async fn copy_region_from( + &self, + _region_id: RegionId, + _request: CopyRegionFromRequest, + ) -> Result { + unimplemented!() + } + fn role(&self, _region_id: RegionId) -> Option { None } diff --git a/src/query/src/optimizer/windowed_sort.rs b/src/query/src/optimizer/windowed_sort.rs index dcf63f6d73..469f4db159 100644 --- a/src/query/src/optimizer/windowed_sort.rs +++ b/src/query/src/optimizer/windowed_sort.rs @@ -110,12 +110,12 @@ impl WindowedSortPhysicalRule { { sort_input } else { - Arc::new(PartSortExec::new( + Arc::new(PartSortExec::try_new( first_sort_expr.clone(), sort_exec.fetch(), scanner_info.partition_ranges.clone(), sort_input, - )) + )?) }; let windowed_sort_exec = WindowedSortExec::try_new( diff --git a/src/query/src/part_sort.rs b/src/query/src/part_sort.rs index ebf4fddc1e..36e4cc8463 100644 --- a/src/query/src/part_sort.rs +++ b/src/query/src/part_sort.rs @@ -27,6 +27,7 @@ use arrow::array::ArrayRef; use arrow::compute::{concat, concat_batches, take_record_batch}; use arrow_schema::SchemaRef; use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream}; +use common_time::Timestamp; use datafusion::common::arrow::compute::sort_to_indices; use datafusion::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion::execution::{RecordBatchStream, TaskContext}; @@ -48,8 +49,51 @@ use parking_lot::RwLock; use snafu::location; use store_api::region_engine::PartitionRange; +use crate::error::Result; +use crate::window_sort::check_partition_range_monotonicity; use crate::{array_iter_helper, downcast_ts_array}; +/// Get the primary end of a `PartitionRange` based on sort direction. 
+/// +/// - Descending: primary end is `end` (we process highest values first) +/// - Ascending: primary end is `start` (we process lowest values first) +fn get_primary_end(range: &PartitionRange, descending: bool) -> Timestamp { + if descending { range.end } else { range.start } +} + +/// Group consecutive ranges by their primary end value. +/// +/// Returns a vector of (primary_end, start_idx_inclusive, end_idx_exclusive) tuples. +/// Ranges with the same primary end MUST be processed together because they may +/// overlap and contain values that belong to the same "top-k" result. +fn group_ranges_by_primary_end( + ranges: &[PartitionRange], + descending: bool, +) -> Vec<(Timestamp, usize, usize)> { + if ranges.is_empty() { + return vec![]; + } + + let mut groups = Vec::new(); + let mut group_start = 0; + let mut current_primary_end = get_primary_end(&ranges[0], descending); + + for (idx, range) in ranges.iter().enumerate().skip(1) { + let primary_end = get_primary_end(range, descending); + if primary_end != current_primary_end { + // End current group + groups.push((current_primary_end, group_start, idx)); + // Start new group + group_start = idx; + current_primary_end = primary_end; + } + } + // Push the last group + groups.push((current_primary_end, group_start, ranges.len())); + + groups +} + /// Sort input within given PartitionRange /// /// Input is assumed to be segmented by empty RecordBatch, which indicates a new `PartitionRange` is starting @@ -72,12 +116,14 @@ pub struct PartSortExec { } impl PartSortExec { - pub fn new( + pub fn try_new( expression: PhysicalSortExpr, limit: Option, partition_ranges: Vec>, input: Arc, - ) -> Self { + ) -> Result { + check_partition_range_monotonicity(&partition_ranges, expression.options.descending)?; + let metrics = ExecutionPlanMetricsSet::new(); let properties = input.properties(); let properties = PlanProperties::new( @@ -91,7 +137,7 @@ impl PartSortExec { .is_some() .then(|| Self::create_filter(expression.expr.clone())); - Self { + Ok(Self { expression, limit, input, @@ -99,7 +145,7 @@ impl PartSortExec { partition_ranges, properties, filter, - } + }) } /// Add or reset `self.filter` to a new `TopKDynamicFilters`. @@ -185,12 +231,13 @@ impl ExecutionPlan for PartSortExec { } else { internal_err!("No children found")? }; - Ok(Arc::new(Self::new( + let new = Self::try_new( self.expression.clone(), self.limit, self.partition_ranges.clone(), new_input.clone(), - ))) + )?; + Ok(Arc::new(new)) } fn execute( @@ -284,7 +331,6 @@ struct PartSortStream { buffer: PartSortBuffer, expression: PhysicalSortExpr, limit: Option, - produced: usize, input: DfSendableRecordBatchStream, input_complete: bool, schema: SchemaRef, @@ -296,6 +342,11 @@ struct PartSortStream { metrics: BaselineMetrics, context: Arc, root_metrics: ExecutionPlanMetricsSet, + /// Groups of ranges by primary end: (primary_end, start_idx_inclusive, end_idx_exclusive). + /// Ranges in the same group must be processed together before outputting results. + range_groups: Vec<(Timestamp, usize, usize)>, + /// Current group being processed (index into range_groups). 
+ cur_group_idx: usize, } impl PartSortStream { @@ -334,13 +385,16 @@ impl PartSortStream { PartSortBuffer::All(Vec::new()) }; + // Compute range groups by primary end + let descending = sort.expression.options.descending; + let range_groups = group_ranges_by_primary_end(&partition_ranges, descending); + Ok(Self { reservation: MemoryConsumer::new("PartSortStream".to_string()) .register(&context.runtime_env().memory_pool), buffer, expression: sort.expression.clone(), limit, - produced: 0, input, input_complete: false, schema: sort.input.schema(), @@ -351,10 +405,25 @@ impl PartSortStream { metrics: BaselineMetrics::new(&sort.metrics, partition), context, root_metrics: sort.metrics.clone(), + range_groups, + cur_group_idx: 0, }) } } +macro_rules! ts_to_timestamp { + ($t:ty, $unit:expr, $arr:expr) => {{ + let arr = $arr + .as_any() + .downcast_ref::>() + .unwrap(); + + arr.iter() + .map(|v| v.map(|v| Timestamp::new(v, common_time::timestamp::TimeUnit::from(&$unit)))) + .collect_vec() + }}; +} + macro_rules! array_check_helper { ($t:ty, $unit:expr, $arr:expr, $cur_range:expr, $min_max_idx:expr) => {{ if $cur_range.start.unit().as_arrow_time_unit() != $unit @@ -394,21 +463,22 @@ macro_rules! array_check_helper { } impl PartSortStream { - /// check whether the sort column's min/max value is within the partition range + /// check whether the sort column's min/max value is within the current group's effective range. + /// For group-based processing, data from multiple ranges with the same primary end + /// is accumulated together, so we check against the union of all ranges in the group. fn check_in_range( &self, sort_column: &ArrayRef, min_max_idx: (usize, usize), ) -> datafusion_common::Result<()> { - if self.cur_part_idx >= self.partition_ranges.len() { + // Use the group's effective range instead of the current partition range + let Some(cur_range) = self.get_current_group_effective_range() else { internal_err!( - "Partition index out of range: {} >= {} at {}", - self.cur_part_idx, - self.partition_ranges.len(), + "No effective range for current group {} at {}", + self.cur_group_idx, snafu::location!() - )?; - } - let cur_range = self.partition_ranges[self.cur_part_idx]; + )? 
+ }; downcast_ts_array!( sort_column.data_type() => (array_check_helper, sort_column, cur_range, min_max_idx), @@ -430,7 +500,7 @@ impl PartSortStream { sort_column: &ArrayRef, ) -> datafusion_common::Result> { if sort_column.is_empty() { - return Ok(Some(0)); + return Ok(None); } // check if the current partition index is out of range @@ -476,6 +546,103 @@ impl PartSortStream { Ok(()) } + /// A temporary solution for stop read earlier when current group do not overlap with any of those next group + /// If not overlap, we can stop read further input as current top k is final + fn can_stop_early(&mut self) -> datafusion_common::Result { + let topk_cnt = match &self.buffer { + PartSortBuffer::Top(_, cnt) => *cnt, + _ => return Ok(false), + }; + // not fulfill topk yet + if Some(topk_cnt) < self.limit { + return Ok(false); + } + // else check if last value in topk is not in next group range + let topk_buffer = self.sort_top_buffer()?; + let min_batch = topk_buffer.slice(topk_buffer.num_rows() - 1, 1); + let min_sort_column = self.expression.evaluate_to_sort_column(&min_batch)?.values; + let last_val = downcast_ts_array!( + min_sort_column.data_type() => (ts_to_timestamp, min_sort_column), + _ => internal_err!( + "Unsupported data type for sort column: {:?}", + min_sort_column.data_type() + )?, + )[0]; + let Some(last_val) = last_val else { + return Ok(false); + }; + let next_group_primary_end = if self.cur_group_idx + 1 < self.range_groups.len() { + self.range_groups[self.cur_group_idx + 1].0 + } else { + // no next group + return Ok(false); + }; + let descending = self.expression.options.descending; + let not_in_next_group_range = if descending { + last_val >= next_group_primary_end + } else { + last_val < next_group_primary_end + }; + + // refill topk buffer count + self.push_buffer(topk_buffer)?; + + Ok(not_in_next_group_range) + } + + /// Check if the given partition index is within the current group. + fn is_in_current_group(&self, part_idx: usize) -> bool { + if self.cur_group_idx >= self.range_groups.len() { + return false; + } + let (_, start, end) = self.range_groups[self.cur_group_idx]; + part_idx >= start && part_idx < end + } + + /// Advance to the next group. Returns true if there is a next group. + fn advance_to_next_group(&mut self) -> bool { + self.cur_group_idx += 1; + self.cur_group_idx < self.range_groups.len() + } + + /// Get the effective range for the current group. + /// For a group of ranges with the same primary end, the effective range is + /// the union of all ranges in the group. 
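+    /// For example, with a descending sort, ranges [70, 100) and [50, 100) share the primary end 100, so the group's effective range is their union [50, 100).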
+ fn get_current_group_effective_range(&self) -> Option { + if self.cur_group_idx >= self.range_groups.len() { + return None; + } + let (_, start_idx, end_idx) = self.range_groups[self.cur_group_idx]; + if start_idx >= end_idx || start_idx >= self.partition_ranges.len() { + return None; + } + + let ranges_in_group = + &self.partition_ranges[start_idx..end_idx.min(self.partition_ranges.len())]; + if ranges_in_group.is_empty() { + return None; + } + + // Compute union of all ranges in the group + let mut min_start = ranges_in_group[0].start; + let mut max_end = ranges_in_group[0].end; + for range in ranges_in_group.iter().skip(1) { + if range.start < min_start { + min_start = range.start; + } + if range.end > max_end { + max_end = range.end; + } + } + + Some(PartitionRange { + start: min_start, + end: max_end, + num_rows: 0, // Not used for validation + identifier: 0, // Not used for validation + }) + } + /// Sort and clear the buffer and return the sorted record batch /// /// this function will return a empty record batch if the buffer is empty @@ -565,7 +732,6 @@ impl PartSortStream { ) })?; - self.produced += sorted.num_rows(); drop(full_input); // here remove both buffer and full_input memory self.reservation.shrink(2 * total_mem); @@ -627,6 +793,20 @@ impl PartSortStream { Ok(concat_batch) } + /// Sorts current buffer and returns `None` when there is nothing to emit. + fn sorted_buffer_if_non_empty(&mut self) -> datafusion_common::Result> { + if self.buffer.is_empty() { + return Ok(None); + } + + let sorted = self.sort_buffer()?; + if sorted.num_rows() == 0 { + Ok(None) + } else { + Ok(Some(sorted)) + } + } + /// Try to split the input batch if it contains data that exceeds the current partition range. /// /// When the input batch contains data that exceeds the current partition range, this function @@ -634,11 +814,99 @@ impl PartSortStream { /// range will be merged and sorted with previous buffer, and the second part will be registered /// to `evaluating_batch` for next polling. /// - /// Returns `None` if the input batch is empty or fully within the current partition range, and - /// `Some(batch)` otherwise. + /// **Group-based processing**: Ranges with the same primary end are grouped together. + /// We only sort and output when transitioning to a NEW group, not when moving between + /// ranges within the same group. + /// + /// Returns `None` if the input batch is empty or fully within the current partition range + /// (or we're still collecting data within the same group), and `Some(batch)` when we've + /// completed a group and have sorted output. When operating in TopK (limit) mode, this + /// function will not emit intermediate batches; it only prepares state for a single final + /// output. fn split_batch( &mut self, batch: DfRecordBatch, + ) -> datafusion_common::Result> { + if matches!(self.buffer, PartSortBuffer::Top(_, _)) { + self.split_batch_topk(batch)?; + return Ok(None); + } + + self.split_batch_all(batch) + } + + /// Specialized splitting logic for TopK (limit) mode. + /// + /// We only emit once when the TopK buffer is fulfilled or when input is fully consumed. + /// When the buffer is fulfilled and we are about to enter a new group, we stop consuming + /// further ranges. + fn split_batch_topk(&mut self, batch: DfRecordBatch) -> datafusion_common::Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + + let sort_column = self + .expression + .expr + .evaluate(&batch)? 
+ .into_array(batch.num_rows())?; + + let next_range_idx = self.try_find_next_range(&sort_column)?; + let Some(idx) = next_range_idx else { + self.push_buffer(batch)?; + // keep polling input for next batch + return Ok(()); + }; + + let this_range = batch.slice(0, idx); + let remaining_range = batch.slice(idx, batch.num_rows() - idx); + if this_range.num_rows() != 0 { + self.push_buffer(this_range)?; + } + + // Step to next proper PartitionRange + self.cur_part_idx += 1; + + // If we've processed all partitions, mark completion. + if self.cur_part_idx >= self.partition_ranges.len() { + debug_assert!(remaining_range.num_rows() == 0); + self.input_complete = true; + return Ok(()); + } + + // Check if we're still in the same group + let in_same_group = self.is_in_current_group(self.cur_part_idx); + + // When TopK is fulfilled and we are switching to a new group, stop consuming further ranges if possible. + // read from topk heap and determine whether we can stop earlier. + if !in_same_group && self.can_stop_early()? { + self.input_complete = true; + self.evaluating_batch = None; + return Ok(()); + } + + // Transition to a new group if needed + if !in_same_group { + self.advance_to_next_group(); + } + + let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx); + if self.try_find_next_range(&next_sort_column)?.is_some() { + // remaining batch still contains data that exceeds the current partition range + // register the remaining batch for next polling + self.evaluating_batch = Some(remaining_range); + } else if remaining_range.num_rows() != 0 { + // remaining batch is within the current partition range + // push to the buffer and continue polling + self.push_buffer(remaining_range)?; + } + + Ok(()) + } + + fn split_batch_all( + &mut self, + batch: DfRecordBatch, ) -> datafusion_common::Result> { if batch.num_rows() == 0 { return Ok(None); @@ -662,10 +930,40 @@ impl PartSortStream { if this_range.num_rows() != 0 { self.push_buffer(this_range)?; } - // mark end of current PartitionRange - let sorted_batch = self.sort_buffer(); - // step to next proper PartitionRange + + // Step to next proper PartitionRange self.cur_part_idx += 1; + + // If we've processed all partitions, sort and output + if self.cur_part_idx >= self.partition_ranges.len() { + // assert there is no data beyond the last partition range (remaining is empty). 
+ debug_assert!(remaining_range.num_rows() == 0); + + // Sort and output the final group + return self.sorted_buffer_if_non_empty(); + } + + // Check if we're still in the same group + if self.is_in_current_group(self.cur_part_idx) { + // Same group - don't sort yet, keep collecting + let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx); + if self.try_find_next_range(&next_sort_column)?.is_some() { + // remaining batch still contains data that exceeds the current partition range + self.evaluating_batch = Some(remaining_range); + } else { + // remaining batch is within the current partition range + if remaining_range.num_rows() != 0 { + self.push_buffer(remaining_range)?; + } + } + // Return None to continue collecting within the same group + return Ok(None); + } + + // Transitioning to a new group - sort current group and output + let sorted_batch = self.sorted_buffer_if_non_empty()?; + self.advance_to_next_group(); + let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx); if self.try_find_next_range(&next_sort_column)?.is_some() { // remaining batch still contains data that exceeds the current partition range @@ -679,7 +977,7 @@ impl PartSortStream { } } - sorted_batch.map(|x| if x.num_rows() == 0 { None } else { Some(x) }) + Ok(sorted_batch) } pub fn poll_next_inner( @@ -687,13 +985,11 @@ impl PartSortStream { cx: &mut Context<'_>, ) -> Poll>> { loop { - // no more input, sort the buffer and return if self.input_complete { - if self.buffer.is_empty() { - return Poll::Ready(None); - } else { - return Poll::Ready(Some(self.sort_buffer())); + if let Some(sorted_batch) = self.sorted_buffer_if_non_empty()? { + return Poll::Ready(Some(Ok(sorted_batch))); } + return Poll::Ready(None); } // if there is a remaining batch being evaluated from last run, @@ -701,11 +997,19 @@ impl PartSortStream { if let Some(evaluating_batch) = self.evaluating_batch.take() && evaluating_batch.num_rows() != 0 { + // Check if we've already processed all partitions + if self.cur_part_idx >= self.partition_ranges.len() { + // All partitions processed, discard remaining data + if let Some(sorted_batch) = self.sorted_buffer_if_non_empty()? { + return Poll::Ready(Some(Ok(sorted_batch))); + } + return Poll::Ready(None); + } + if let Some(sorted_batch) = self.split_batch(evaluating_batch)? { return Poll::Ready(Some(Ok(sorted_batch))); - } else { - continue; } + continue; } // fetch next batch from input @@ -714,14 +1018,11 @@ impl PartSortStream { Poll::Ready(Some(Ok(batch))) => { if let Some(sorted_batch) = self.split_batch(batch)? 
{ return Poll::Ready(Some(Ok(sorted_batch))); - } else { - continue; } } // input stream end, mark and continue Poll::Ready(None) => { self.input_complete = true; - continue; } Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))), Poll::Pending => return Poll::Pending, @@ -752,6 +1053,10 @@ impl RecordBatchStream for PartSortStream { mod test { use std::sync::Arc; + use arrow::array::{ + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, + }; use arrow::json::ArrayWriter; use arrow_schema::{DataType, Field, Schema, SortOptions, TimeUnit}; use common_time::Timestamp; @@ -762,6 +1067,7 @@ mod test { use super::*; use crate::test_util::{MockInputExec, new_ts_array}; + #[ignore = "hard to gen expected data correctly here, TODO(discord9): fix it later"] #[tokio::test] async fn fuzzy_test() { let test_cnt = 100; @@ -788,7 +1094,7 @@ mod test { nulls_first, }; let limit = if rng.bool() { - Some(rng.usize(0..batch_cnt_bound * batch_size_bound)) + Some(rng.usize(1..batch_cnt_bound * batch_size_bound)) } else { None }; @@ -813,10 +1119,11 @@ mod test { for part_id in 0..rng.usize(0..part_cnt_bound) { // generate each `PartitionRange`'s timestamp range let (start, end) = if descending { + // Use 1..=range_offset_bound to ensure strictly decreasing end values let end = bound_val .map( |i| i - .checked_sub(rng.i64(0..range_offset_bound)) + .checked_sub(rng.i64(1..=range_offset_bound)) .expect("Bad luck, fuzzy test generate data that will overflow, change seed and try again") ) .unwrap_or_else(|| rng.i64(-100000000..100000000)); @@ -826,8 +1133,9 @@ mod test { let end = Timestamp::new(end, unit.into()); (start, end) } else { + // Use 1..=range_offset_bound to ensure strictly increasing start values let start = bound_val - .map(|i| i + rng.i64(0..range_offset_bound)) + .map(|i| i + rng.i64(1..=range_offset_bound)) .unwrap_or_else(|| rng.i64(..)); bound_val = Some(start); let end = start + rng.i64(1..range_size_bound); @@ -896,22 +1204,48 @@ mod test { output_data.push(cur_data); } - let expected_output = output_data - .into_iter() - .map(|a| { - DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, a)]).unwrap() - }) - .map(|rb| { - // trim expected output with limit - if let Some(limit) = limit - && rb.num_rows() > limit - { - rb.slice(0, limit) - } else { - rb + let expected_output = if let Some(limit) = limit { + let mut accumulated = Vec::new(); + let mut seen = 0usize; + for mut range_values in output_data { + seen += range_values.len(); + accumulated.append(&mut range_values); + if seen >= limit { + break; } - }) - .collect_vec(); + } + + if accumulated.is_empty() { + None + } else { + if descending { + accumulated.sort_by(|a, b| b.cmp(a)); + } else { + accumulated.sort(); + } + accumulated.truncate(limit.min(accumulated.len())); + + Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, accumulated)], + ) + .unwrap(), + ) + } + } else { + let batches = output_data + .into_iter() + .map(|a| { + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, a)]).unwrap() + }) + .collect_vec(); + if batches.is_empty() { + None + } else { + Some(concat_batches(&schema, &batches).unwrap()) + } + }; test_cases.push(( case_id, @@ -932,13 +1266,14 @@ mod test { opt, limit, expected_output, + None, ) .await; } } #[tokio::test] - async fn simple_case() { + async fn simple_cases() { let testcases = vec![ ( TimeUnit::Millisecond, @@ -950,6 +1285,8 @@ mod test { None, vec![vec![1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9]], 
), + // Case 1: Descending sort with overlapping ranges that have the same primary end (end=10). + // Ranges [5,10) and [0,10) are grouped together, so their data is merged before sorting. ( TimeUnit::Millisecond, vec![ @@ -958,7 +1295,7 @@ mod test { ], true, None, - vec![vec![9, 8, 7, 6, 5], vec![8, 7, 6, 5, 4, 3, 2, 1]], + vec![vec![9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 3, 2, 1]], ), ( TimeUnit::Millisecond, @@ -994,6 +1331,10 @@ mod test { None, vec![], ), + // Case 5: Data from one batch spans multiple ranges. Ranges with same end are grouped. + // Ranges: [15,20) end=20, [10,15) end=15, [5,10) end=10, [0,10) end=10 + // Groups: {[15,20)}, {[10,15)}, {[5,10), [0,10)} + // The last two ranges are merged because they share end=10. ( TimeUnit::Millisecond, vec![ @@ -1010,8 +1351,7 @@ mod test { vec![ vec![19, 17, 15], vec![12, 11, 10], - vec![9, 8, 7, 6, 5], - vec![4, 3, 2, 1], + vec![9, 8, 7, 6, 5, 4, 3, 2, 1], ], ), ( @@ -1027,7 +1367,7 @@ mod test { ], true, Some(2), - vec![vec![19, 17], vec![12, 11], vec![9, 8], vec![4, 3]], + vec![vec![19, 17]], ), ]; @@ -1072,6 +1412,11 @@ mod test { DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, a)]).unwrap() }) .collect_vec(); + let expected_output = if expected_output.is_empty() { + None + } else { + Some(concat_batches(&schema, &expected_output).unwrap()) + }; run_test( identifier, @@ -1080,6 +1425,7 @@ mod test { opt, limit, expected_output, + None, ) .await; } @@ -1092,99 +1438,1391 @@ mod test { schema: SchemaRef, opt: SortOptions, limit: Option, - expected_output: Vec, + expected_output: Option, + expected_polled_rows: Option, ) { - for rb in &expected_output { - if let Some(limit) = limit { - assert!( - rb.num_rows() <= limit, - "Expect row count in expected output's batch({}) <= limit({})", - rb.num_rows(), - limit - ); - } + if let (Some(limit), Some(rb)) = (limit, &expected_output) { + assert!( + rb.num_rows() <= limit, + "Expect row count in expected output({}) <= limit({})", + rb.num_rows(), + limit + ); } - let (ranges, batches): (Vec<_>, Vec<_>) = input_ranged_data.clone().into_iter().unzip(); - let batches = batches - .into_iter() - .flat_map(|mut cols| { - cols.push(DfRecordBatch::new_empty(schema.clone())); - cols - }) - .collect_vec(); - let mock_input = MockInputExec::new(batches, schema.clone()); + let mut data_partition = Vec::with_capacity(input_ranged_data.len()); + let mut ranges = Vec::with_capacity(input_ranged_data.len()); + for (part_range, batches) in input_ranged_data { + data_partition.push(batches); + ranges.push(part_range); + } - let exec = PartSortExec::new( + let mock_input = Arc::new(MockInputExec::new(data_partition, schema.clone())); + + let exec = PartSortExec::try_new( PhysicalSortExpr { expr: Arc::new(Column::new("ts", 0)), options: opt, }, limit, vec![ranges.clone()], - Arc::new(mock_input), - ); + mock_input.clone(), + ) + .unwrap(); let exec_stream = exec.execute(0, Arc::new(TaskContext::default())).unwrap(); let real_output = exec_stream.map(|r| r.unwrap()).collect::>().await; - // a makeshift solution for compare large data - if real_output != expected_output { - let mut first_diff = 0; - for (idx, (lhs, rhs)) in real_output.iter().zip(expected_output.iter()).enumerate() { - if lhs != rhs { - first_diff = idx; - break; - } - } - println!("first diff batch at {}", first_diff); - println!( - "ranges: {:?}", - ranges - .into_iter() - .map(|r| (r.start.to_chrono_datetime(), r.end.to_chrono_datetime())) - .enumerate() - .collect::>() - ); - - let mut full_msg = String::new(); - { - let mut buf = 
Vec::with_capacity(10 * real_output.len()); - for batch in real_output.iter().skip(first_diff) { - let mut rb_json: Vec = Vec::new(); - let mut writer = ArrayWriter::new(&mut rb_json); - writer.write(batch).unwrap(); - writer.finish().unwrap(); - buf.append(&mut rb_json); - buf.push(b','); - } - // TODO(discord9): better ways to print buf - let buf = String::from_utf8_lossy(&buf); - full_msg += &format!("\ncase_id:{case_id}, real_output \n{buf}\n"); - } - { - let mut buf = Vec::with_capacity(10 * real_output.len()); - for batch in expected_output.iter().skip(first_diff) { - let mut rb_json: Vec = Vec::new(); - let mut writer = ArrayWriter::new(&mut rb_json); - writer.write(batch).unwrap(); - writer.finish().unwrap(); - buf.append(&mut rb_json); - buf.push(b','); - } - let buf = String::from_utf8_lossy(&buf); - full_msg += &format!("case_id:{case_id}, expected_output \n{buf}"); - } - panic!( - "case_{} failed, opt: {:?},\n real output has {} batches, {} rows, expected has {} batches with {} rows\nfull msg: {}", - case_id, - opt, - real_output.len(), - real_output.iter().map(|x| x.num_rows()).sum::(), - expected_output.len(), - expected_output.iter().map(|x| x.num_rows()).sum::(), - full_msg + if limit.is_some() { + assert!( + real_output.len() <= 1, + "case_{case_id} expects a single output batch when limit is set, got {}", + real_output.len() ); } + + let actual_output = if real_output.is_empty() { + None + } else { + Some(concat_batches(&schema, &real_output).unwrap()) + }; + + if let Some(expected_polled_rows) = expected_polled_rows { + let input_pulled_rows = mock_input.metrics().unwrap().output_rows().unwrap(); + assert_eq!(input_pulled_rows, expected_polled_rows); + } + + match (actual_output, expected_output) { + (None, None) => {} + (Some(actual), Some(expected)) => { + if actual != expected { + let mut actual_json: Vec = Vec::new(); + let mut writer = ArrayWriter::new(&mut actual_json); + writer.write(&actual).unwrap(); + writer.finish().unwrap(); + + let mut expected_json: Vec = Vec::new(); + let mut writer = ArrayWriter::new(&mut expected_json); + writer.write(&expected).unwrap(); + writer.finish().unwrap(); + + panic!( + "case_{} failed (limit {limit:?}), opt: {:?},\nreal_output: {}\nexpected: {}", + case_id, + opt, + String::from_utf8_lossy(&actual_json), + String::from_utf8_lossy(&expected_json), + ); + } + } + (None, Some(expected)) => panic!( + "case_{} failed (limit {limit:?}), opt: {:?},\nreal output is empty, expected {} rows", + case_id, + opt, + expected.num_rows() + ), + (Some(actual), None) => panic!( + "case_{} failed (limit {limit:?}), opt: {:?},\nreal output has {} rows, expected empty", + case_id, + opt, + actual.num_rows() + ), + } + } + + /// Test that verifies the limit is correctly applied per partition when + /// multiple batches are received for the same partition. 
+ #[tokio::test] + async fn test_limit_with_multiple_batches_per_partition() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Test case: Multiple batches in a single partition with limit=3 + // Input: 3 batches with [1,2,3], [4,5,6], [7,8,9] all in partition (0,10) + // Expected: Only top 3 values [9,8,7] for descending sort + let input_ranged_data = vec![( + PartitionRange { + start: Timestamp::new(0, unit.into()), + end: Timestamp::new(10, unit.into()), + num_rows: 9, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])]) + .unwrap(), + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5, 6])]) + .unwrap(), + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![7, 8, 9])]) + .unwrap(), + ], + )]; + + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![9, 8, 7])]) + .unwrap(), + ); + + run_test( + 1000, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(3), + expected_output, + None, + ) + .await; + + // Test case: Multiple batches across multiple partitions with limit=2 + // Partition 0: batches [10,11,12], [13,14,15] -> top 2 descending = [15,14] + // Partition 1: batches [1,2,3], [4,5] -> top 2 descending = [5,4] + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(10, unit.into()), + end: Timestamp::new(20, unit.into()), + num_rows: 6, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![10, 11, 12])], + ) + .unwrap(), + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![13, 14, 15])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(0, unit.into()), + end: Timestamp::new(10, unit.into()), + num_rows: 5, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])]) + .unwrap(), + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5])]) + .unwrap(), + ], + ), + ]; + + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![15, 14])]).unwrap(), + ); + + run_test( + 1001, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(2), + expected_output, + None, + ) + .await; + + // Test case: Ascending sort with limit + // Partition: batches [7,8,9], [4,5,6], [1,2,3] -> top 2 ascending = [1,2] + let input_ranged_data = vec![( + PartitionRange { + start: Timestamp::new(0, unit.into()), + end: Timestamp::new(10, unit.into()), + num_rows: 9, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![7, 8, 9])]) + .unwrap(), + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![4, 5, 6])]) + .unwrap(), + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2, 3])]) + .unwrap(), + ], + )]; + + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![1, 2])]).unwrap(), + ); + + run_test( + 1002, + input_ranged_data, + schema.clone(), + SortOptions { + descending: false, + ..Default::default() + }, + Some(2), + expected_output, + None, + ) + .await; + } + + /// Test that verifies early termination behavior. 
+ /// Once we've produced limit * num_partitions rows, we should stop + /// pulling from input stream. + #[tokio::test] + async fn test_early_termination() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Create 3 partitions, each with more data than the limit + // limit=2 per partition, so total expected output = 6 rows + // After producing 6 rows, early termination should kick in + // For descending sort, ranges must be ordered by (end DESC, start DESC) + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(20, unit.into()), + end: Timestamp::new(30, unit.into()), + num_rows: 10, + identifier: 2, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![21, 22, 23, 24, 25])], + ) + .unwrap(), + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![26, 27, 28, 29, 30])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(10, unit.into()), + end: Timestamp::new(20, unit.into()), + num_rows: 10, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![11, 12, 13, 14, 15])], + ) + .unwrap(), + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![16, 17, 18, 19, 20])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(0, unit.into()), + end: Timestamp::new(10, unit.into()), + num_rows: 10, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![1, 2, 3, 4, 5])], + ) + .unwrap(), + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![6, 7, 8, 9, 10])], + ) + .unwrap(), + ], + ), + ]; + + // PartSort won't reorder `PartitionRange` (it assumes it's already ordered), so it will not read other partitions. + // This case is just to verify that early termination works as expected. 
+ // First partition [20, 30) produces top 2 values: 29, 28 + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![29, 28])]).unwrap(), + ); + + run_test( + 1003, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(2), + expected_output, + Some(10), + ) + .await; + } + + /// Example: + /// - Range [70, 100) has data [80, 90, 95] + /// - Range [50, 100) has data [55, 65, 75, 85, 95] + #[tokio::test] + async fn test_primary_end_grouping_with_limit() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Two ranges with the same end (100) - they should be grouped together + // For descending, ranges are ordered by (end DESC, start DESC) + // So [70, 100) comes before [50, 100) (70 > 50) + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 3, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![80, 90, 95])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 5, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![55, 65, 75, 85, 95])], + ) + .unwrap(), + ], + ), + ]; + + // With limit=4, descending: top 4 values from combined data + // Combined: [80, 90, 95, 55, 65, 75, 85, 95] -> sorted desc: [95, 95, 90, 85, 80, 75, 65, 55] + // Top 4: [95, 95, 90, 85] + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![95, 95, 90, 85])], + ) + .unwrap(), + ); + + run_test( + 2000, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(4), + expected_output, + None, + ) + .await; + } + + /// Test case with three ranges demonstrating the "keep pulling" behavior. + /// After processing ranges with end=100, the smallest value in top-k might still + /// be reachable by the next group. + /// + /// Ranges: [70, 100), [50, 100), [40, 95) + /// With descending sort and limit=4: + /// - Group 1 (end=100): [70, 100) and [50, 100) merged + /// - Group 2 (end=95): [40, 95) + /// After group 1, smallest in top-4 is 85. Range [40, 95) could have values >= 85, + /// so we continue to group 2. 
+ #[tokio::test] + async fn test_three_ranges_keep_pulling() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Three ranges, two with same end (100), one with different end (95) + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 3, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![80, 90, 95])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![55, 75, 85])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(40, unit.into()), + end: Timestamp::new(95, unit.into()), + num_rows: 3, + identifier: 2, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![45, 65, 94])], + ) + .unwrap(), + ], + ), + ]; + + // All data: [80, 90, 95, 55, 75, 85, 45, 65, 94] + // Sorted descending: [95, 94, 90, 85, 80, 75, 65, 55, 45] + // With limit=4: should be top 4 largest values across all ranges: [95, 94, 90, 85] + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![95, 94, 90, 85])], + ) + .unwrap(), + ); + + run_test( + 2001, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(4), + expected_output, + None, + ) + .await; + } + + /// Test early termination based on threshold comparison with next group. + /// When the threshold (smallest value for descending) is >= next group's primary end, + /// we can stop early because the next group cannot have better values. + #[tokio::test] + async fn test_threshold_based_early_termination() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Group 1 (end=100) has 6 rows, TopK will keep top 4 + // Group 2 (end=90) has 3 rows - should NOT be processed because + // threshold (96) >= next_primary_end (90) + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 6, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![94, 95, 96, 97, 98, 99])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(90, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![85, 86, 87])], + ) + .unwrap(), + ], + ), + ]; + + // With limit=4, descending: top 4 from group 1 are [99, 98, 97, 96] + // Threshold is 96, next group's primary_end is 90 + // Since 96 >= 90, we stop after group 1 + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![99, 98, 97, 96])], + ) + .unwrap(), + ); + + run_test( + 2002, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(4), + expected_output, + Some(9), // Pull both batches since all rows fall within the first range + ) + .await; + } + + /// Test that we continue to next group when threshold is within next group's range. 
+ /// Even after fulfilling limit, if threshold < next_primary_end (descending), + /// we would need to continue... but limit exhaustion stops us first. + #[tokio::test] + async fn test_continue_when_threshold_in_next_group_range() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Group 1 (end=100) has 6 rows, TopK will keep top 4 + // Group 2 (end=98) has 3 rows - threshold (96) < 98, so next group + // could theoretically have better values. But limit exhaustion stops us. + // Note: Data values must not overlap between ranges to avoid ambiguity. + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 6, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![94, 95, 96, 97, 98, 99])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(98, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + // Values must be < 70 (outside group 1's range) to avoid ambiguity + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![55, 60, 65])], + ) + .unwrap(), + ], + ), + ]; + + // With limit=4, we get [99, 98, 97, 96] from group 1 + // Threshold is 96, next group's primary_end is 98 + // 96 < 98, so threshold check says "could continue" + // But limit is exhausted (0), so we stop anyway + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![99, 98, 97, 96])], + ) + .unwrap(), + ); + + // Note: We pull 9 rows (both batches) because we need to read batch 2 + // to detect the group boundary, even though we stop after outputting group 1. + run_test( + 2003, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(4), + expected_output, + Some(9), // Pull both batches to detect boundary + ) + .await; + } + + /// Test ascending sort with threshold-based early termination. + #[tokio::test] + async fn test_ascending_threshold_early_termination() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // For ascending: primary_end is start, ranges sorted by (start ASC, end ASC) + // Group 1 (start=10) has 6 rows + // Group 2 (start=20) has 3 rows - should NOT be processed because + // threshold (13) < next_primary_end (20) + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(10, unit.into()), + end: Timestamp::new(50, unit.into()), + num_rows: 6, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![10, 11, 12, 13, 14, 15])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(20, unit.into()), + end: Timestamp::new(60, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![25, 30, 35])], + ) + .unwrap(), + ], + ), + // still read this batch to detect group boundary(?) 
+ ( + PartitionRange { + start: Timestamp::new(60, unit.into()), + end: Timestamp::new(70, unit.into()), + num_rows: 2, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![60, 61])]) + .unwrap(), + ], + ), + // after boundary detected, this following one should not be read + ( + PartitionRange { + start: Timestamp::new(61, unit.into()), + end: Timestamp::new(70, unit.into()), + num_rows: 2, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![71, 72])]) + .unwrap(), + ], + ), + ]; + + // With limit=4, ascending: top 4 (smallest) from group 1 are [10, 11, 12, 13] + // Threshold is 13 (largest in top-k), next group's primary_end is 20 + // Since 13 < 20, we stop after group 1 (no value in group 2 can be < 13) + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![10, 11, 12, 13])], + ) + .unwrap(), + ); + + run_test( + 2004, + input_ranged_data, + schema.clone(), + SortOptions { + descending: false, + ..Default::default() + }, + Some(4), + expected_output, + Some(11), // Pull first two batches to detect boundary + ) + .await; + } + + #[tokio::test] + async fn test_ascending_threshold_early_termination_case_two() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // For ascending: primary_end is start, ranges sorted by (start ASC, end ASC) + // Group 1 (start=0) has 4 rows, Group 2 (start=4) has 1 row, Group 3 (start=5) has 4 rows + // After reading all data: [9,10,11,12, 21, 5,6,7,8] + // Sorted ascending: [5,6,7,8, 9,10,11,12, 21] + // With limit=4, output should be smallest 4: [5,6,7,8] + // Algorithm continues reading until start=42 > threshold=8, confirming no smaller values exist + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(0, unit.into()), + end: Timestamp::new(20, unit.into()), + num_rows: 4, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![9, 10, 11, 12])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(4, unit.into()), + end: Timestamp::new(25, unit.into()), + num_rows: 1, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![21])]) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(5, unit.into()), + end: Timestamp::new(25, unit.into()), + num_rows: 4, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![5, 6, 7, 8])], + ) + .unwrap(), + ], + ), + // This still will be read to detect boundary, but should not contribute to output + ( + PartitionRange { + start: Timestamp::new(42, unit.into()), + end: Timestamp::new(52, unit.into()), + num_rows: 2, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![42, 51])]) + .unwrap(), + ], + ), + // This following one should not be read after boundary detected + ( + PartitionRange { + start: Timestamp::new(48, unit.into()), + end: Timestamp::new(53, unit.into()), + num_rows: 2, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![48, 51])]) + .unwrap(), + ], + ), + ]; + + // With limit=4, ascending: after processing all ranges, smallest 4 are [5, 6, 7, 8] + // Threshold is 8 (4th smallest value), algorithm reads until start=42 > threshold=8 + let expected_output = Some( + 
DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![5, 6, 7, 8])]) + .unwrap(), + ); + + run_test( + 2005, + input_ranged_data, + schema.clone(), + SortOptions { + descending: false, + ..Default::default() + }, + Some(4), + expected_output, + Some(11), // Read first 4 ranges to confirm threshold boundary + ) + .await; + } + + /// Test early stop behavior with null values in sort column. + /// Verifies that nulls are handled correctly based on nulls_first option. + #[tokio::test] + async fn test_early_stop_with_nulls() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + true, // nullable + )])); + + // Helper function to create nullable timestamp array + let new_nullable_ts_array = |unit: TimeUnit, arr: Vec<Option<i64>>| -> ArrayRef { + match unit { + TimeUnit::Second => Arc::new(TimestampSecondArray::from(arr)) as ArrayRef, + TimeUnit::Millisecond => Arc::new(TimestampMillisecondArray::from(arr)) as ArrayRef, + TimeUnit::Microsecond => Arc::new(TimestampMicrosecondArray::from(arr)) as ArrayRef, + TimeUnit::Nanosecond => Arc::new(TimestampNanosecondArray::from(arr)) as ArrayRef, + } + }; + + // Test case 1: nulls_first=true, null values should appear first + // Group 1 (end=100): [null, null, 99, 98, 97] -> with limit=3, top 3 are [null, null, 99] + // Threshold is 99, next group end=90, since 99 >= 90, we should stop early + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 5, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_nullable_ts_array( + unit, + vec![Some(99), Some(98), None, Some(97), None], + )], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(90, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_nullable_ts_array( + unit, + vec![Some(89), Some(88), Some(87)], + )], + ) + .unwrap(), + ], + ), + ]; + + // With nulls_first=true, nulls sort before all values + // For descending, order is: null, null, 99, 98, 97 + // With limit=3, we get: null, null, 99 + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_nullable_ts_array(unit, vec![None, None, Some(99)])], + ) + .unwrap(), + ); + + run_test( + 3000, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + nulls_first: true, + }, + Some(3), + expected_output, + Some(8), // Must read both batches to detect group boundary + ) + .await; + + // Test case 2: nulls_last=true, null values should appear last + // Group 1 (end=100): [99, 98, 97, null, null] -> with limit=3, top 3 are [99, 98, 97] + // Threshold is 97, next group end=90, since 97 >= 90, we should stop early + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 5, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_nullable_ts_array( + unit, + vec![Some(99), Some(98), Some(97), None, None], + )], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(90, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_nullable_ts_array( + unit, + vec![Some(89), Some(88), Some(87)], + )], + ) + .unwrap(), + ], + ), + ]; + + // With nulls_last=true
(equivalent to nulls_first=false), values sort before nulls + // For descending, order is: 99, 98, 97, null, null + // With limit=3, we get: 99, 98, 97 + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_nullable_ts_array( + unit, + vec![Some(99), Some(98), Some(97)], + )], + ) + .unwrap(), + ); + + run_test( + 3001, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + nulls_first: false, + }, + Some(3), + expected_output, + Some(8), // Must read both batches to detect group boundary + ) + .await; + } + + /// Test early stop behavior when there's only one group (no next group). + /// In this case, can_stop_early should return false and we should process all data. + #[tokio::test] + async fn test_early_stop_single_group() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Only one group (all ranges have the same end), no next group to compare against + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 6, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![94, 95, 96, 97, 98, 99])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![85, 86, 87])], + ) + .unwrap(), + ], + ), + ]; + + // Even though we have enough data in first range, we must process all + // because there's no next group to compare threshold against + let expected_output = Some( + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![99, 98, 97, 96])], + ) + .unwrap(), + ); + + run_test( + 3002, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(4), + expected_output, + Some(9), // Must read all batches since no early stop is possible + ) + .await; + } + + /// Test early stop behavior when threshold exactly equals next group's boundary. 
+ #[tokio::test] + async fn test_early_stop_exact_boundary_equality() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Test case 1: Descending sort, threshold == next_group_end + // Group 1 (end=100): data up to 90, threshold = 90, next_group_end = 90 + // Since 90 >= 90, we should stop early + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 4, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![92, 91, 90, 89])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(90, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![88, 87, 86])], + ) + .unwrap(), + ], + ), + ]; + + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![92, 91, 90])]) + .unwrap(), + ); + + run_test( + 3003, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(3), + expected_output, + Some(7), // Must read both batches to detect boundary + ) + .await; + + // Test case 2: Ascending sort, threshold == next_group_start + // Group 1 (start=10): data from 10, threshold = 20, next_group_start = 20 + // Since 20 < 20 is false, we should continue + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(10, unit.into()), + end: Timestamp::new(50, unit.into()), + num_rows: 4, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![10, 15, 20, 25])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(20, unit.into()), + end: Timestamp::new(60, unit.into()), + num_rows: 3, + identifier: 1, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![21, 22, 23])], + ) + .unwrap(), + ], + ), + ]; + + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![10, 15, 20])]) + .unwrap(), + ); + + run_test( + 3004, + input_ranged_data, + schema.clone(), + SortOptions { + descending: false, + ..Default::default() + }, + Some(3), + expected_output, + Some(7), // Must read both batches since 20 is not < 20 + ) + .await; + } + + /// Test early stop behavior with empty partition groups. 
+ #[tokio::test] + async fn test_early_stop_with_empty_partitions() { + let unit = TimeUnit::Millisecond; + let schema = Arc::new(Schema::new(vec![Field::new( + "ts", + DataType::Timestamp(unit, None), + false, + )])); + + // Test case 1: First group is empty, second group has data + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 0, + identifier: 0, + }, + vec![ + // Empty batch for first range + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])]) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 0, + identifier: 1, + }, + vec![ + // Empty batch for second range + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])]) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(30, unit.into()), + end: Timestamp::new(80, unit.into()), + num_rows: 4, + identifier: 2, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![74, 75, 76, 77])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(10, unit.into()), + end: Timestamp::new(60, unit.into()), + num_rows: 3, + identifier: 3, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![58, 59, 60])], + ) + .unwrap(), + ], + ), + ]; + + // Group 1 (end=100) is empty, Group 2 (end=80) has data + // Should continue to Group 2 since Group 1 has no data + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![77, 76])]).unwrap(), + ); + + run_test( + 3005, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(2), + expected_output, + Some(7), // Must read until finding actual data + ) + .await; + + // Test case 2: Empty partitions between data groups + let input_ranged_data = vec![ + ( + PartitionRange { + start: Timestamp::new(70, unit.into()), + end: Timestamp::new(100, unit.into()), + num_rows: 4, + identifier: 0, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![96, 97, 98, 99])], + ) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(50, unit.into()), + end: Timestamp::new(90, unit.into()), + num_rows: 0, + identifier: 1, + }, + vec![ + // Empty range - should be skipped + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])]) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(30, unit.into()), + end: Timestamp::new(70, unit.into()), + num_rows: 0, + identifier: 2, + }, + vec![ + // Another empty range + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![])]) + .unwrap(), + ], + ), + ( + PartitionRange { + start: Timestamp::new(10, unit.into()), + end: Timestamp::new(50, unit.into()), + num_rows: 3, + identifier: 3, + }, + vec![ + DfRecordBatch::try_new( + schema.clone(), + vec![new_ts_array(unit, vec![48, 49, 50])], + ) + .unwrap(), + ], + ), + ]; + + // With limit=2 from group 1: [99, 98], threshold=98, next group end=50 + // Since 98 >= 50, we should stop early + let expected_output = Some( + DfRecordBatch::try_new(schema.clone(), vec![new_ts_array(unit, vec![99, 98])]).unwrap(), + ); + + run_test( + 3006, + input_ranged_data, + schema.clone(), + SortOptions { + descending: true, + ..Default::default() + }, + Some(2), + expected_output, + Some(7), // Must read to detect early stop condition + ) + .await; } } diff 
--git a/src/query/src/promql/planner.rs b/src/query/src/promql/planner.rs index 5cc26cee05..8fc90d7f71 100644 --- a/src/query/src/promql/planner.rs +++ b/src/query/src/promql/planner.rs @@ -1414,14 +1414,14 @@ impl PromPlanner { .clone() .gt_eq(DfExpr::Literal( ScalarValue::TimestampMillisecond( - Some(self.ctx.start + offset_duration - self.ctx.lookback_delta - range), + Some(self.ctx.start - offset_duration - self.ctx.lookback_delta - range), None, ), None, )) .and(time_index_expr.lt_eq(DfExpr::Literal( ScalarValue::TimestampMillisecond( - Some(self.ctx.end + offset_duration + self.ctx.lookback_delta), + Some(self.ctx.end - offset_duration + self.ctx.lookback_delta), None, ), None, @@ -1437,14 +1437,14 @@ impl PromPlanner { .clone() .gt_eq(DfExpr::Literal( ScalarValue::TimestampMillisecond( - Some(timestamp + offset_duration - lookback_delta - range), + Some(timestamp - offset_duration - lookback_delta - range), None, ), None, )) .and(time_index_expr.clone().lt_eq(DfExpr::Literal( ScalarValue::TimestampMillisecond( - Some(timestamp + offset_duration + lookback_delta), + Some(timestamp - offset_duration + lookback_delta), None, ), None, diff --git a/src/query/src/query_engine.rs b/src/query/src/query_engine.rs index 34a4fee209..53383cf905 100644 --- a/src/query/src/query_engine.rs +++ b/src/query/src/query_engine.rs @@ -29,6 +29,7 @@ use common_function::handlers::{ }; use common_query::Output; use datafusion::catalog::TableFunction; +use datafusion::dataframe::DataFrame; use datafusion_expr::{AggregateUDF, LogicalPlan}; use datatypes::schema::Schema; pub use default_serializer::{DefaultPlanDecoder, DefaultSerializer}; @@ -36,7 +37,6 @@ use partition::manager::PartitionRuleManagerRef; use session::context::QueryContextRef; use table::TableRef; -use crate::dataframe::DataFrame; use crate::datafusion::DatafusionQueryEngine; use crate::error::Result; use crate::options::QueryOptions; diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index ffe64e7005..9328f5f736 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; use std::fmt; +use std::num::NonZeroUsize; use std::sync::{Arc, RwLock}; use async_trait::async_trait; @@ -34,6 +35,7 @@ use datafusion::execution::SessionStateBuilder; use datafusion::execution::context::{QueryPlanner, SessionConfig, SessionContext, SessionState}; use datafusion::execution::memory_pool::{ GreedyMemoryPool, MemoryConsumer, MemoryLimit, MemoryPool, MemoryReservation, + TrackConsumersPool, }; use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; use datafusion::physical_optimizer::PhysicalOptimizerRule; @@ -437,19 +439,25 @@ impl DfQueryPlanner { } } -/// A wrapper around GreedyMemoryPool that records metrics. +/// A wrapper around TrackConsumersPool that records metrics. /// /// This wrapper intercepts all memory pool operations and updates /// Prometheus metrics for monitoring query memory usage and rejections. 
#[derive(Debug)] struct MetricsMemoryPool { - inner: Arc, + inner: Arc>, } impl MetricsMemoryPool { + // Number of top memory consumers to report in OOM error messages + const TOP_CONSUMERS_TO_REPORT: usize = 5; + fn new(limit: usize) -> Self { Self { - inner: Arc::new(GreedyMemoryPool::new(limit)), + inner: Arc::new(TrackConsumersPool::new( + GreedyMemoryPool::new(limit), + NonZeroUsize::new(Self::TOP_CONSUMERS_TO_REPORT).unwrap(), + )), } } diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs index 693b1aa068..b36f9e4df5 100644 --- a/src/query/src/sql.rs +++ b/src/query/src/sql.rs @@ -15,6 +15,7 @@ mod show_create_table; use std::collections::HashMap; +use std::ops::ControlFlow; use std::sync::Arc; use catalog::CatalogManagerRef; @@ -52,7 +53,7 @@ use regex::Regex; use session::context::{Channel, QueryContextRef}; pub use show_create_table::create_table_stmt; use snafu::{OptionExt, ResultExt, ensure}; -use sql::ast::Ident; +use sql::ast::{Ident, visit_expressions_mut}; use sql::parser::ParserContext; use sql::statements::OptionMap; use sql::statements::create::{CreateDatabase, CreateFlow, CreateView, Partitions, SqlOrTql}; @@ -64,16 +65,15 @@ use sql::statements::statement::Statement; use sqlparser::ast::ObjectName; use store_api::metric_engine_consts::{is_metric_engine, is_metric_engine_internal_column}; use table::TableRef; +use table::metadata::TableInfoRef; use table::requests::{FILE_TABLE_LOCATION_KEY, FILE_TABLE_PATTERN_KEY}; use crate::QueryEngineRef; -use crate::dataframe::DataFrame; use crate::error::{self, Result, UnsupportedVariableSnafu}; use crate::planner::DfLogicalPlanner; const SCHEMAS_COLUMN: &str = "Database"; const OPTIONS_COLUMN: &str = "Options"; -const TABLES_COLUMN: &str = "Tables"; const VIEWS_COLUMN: &str = "Views"; const FLOWS_COLUMN: &str = "Flows"; const FIELD_COLUMN: &str = "Field"; @@ -210,6 +210,29 @@ pub async fn show_databases( .await } +/// Replaces column identifier references in a SQL expression. +/// Used for backward compatibility where old column names should work with new ones. +fn replace_column_in_expr(expr: &mut sqlparser::ast::Expr, from_column: &str, to_column: &str) { + let _ = visit_expressions_mut(expr, |e| { + match e { + sqlparser::ast::Expr::Identifier(ident) => { + if ident.value.eq_ignore_ascii_case(from_column) { + ident.value = to_column.to_string(); + } + } + sqlparser::ast::Expr::CompoundIdentifier(idents) => { + if let Some(last) = idents.last_mut() + && last.value.eq_ignore_ascii_case(from_column) + { + last.value = to_column.to_string(); + } + } + _ => {} + } + ControlFlow::<()>::Continue(()) + }); +} + /// Cast a `show` statement execution into a query from tables in `information_schema`. /// - `table_name`: the table name in `information_schema`, /// - `projects`: query projection, a list of `(column, renamed_column)`, @@ -247,7 +270,7 @@ async fn query_from_information_schema_table( ), })?; - let DataFrame::DataFusion(dataframe) = query_engine.read_table(table)?; + let dataframe = query_engine.read_table(table)?; // Apply filters let dataframe = filters.into_iter().try_fold(dataframe, |df, expr| { @@ -540,15 +563,15 @@ pub async fn show_tables( query_ctx.current_schema() }; - // (dennis): MySQL rename `table_name` to `Tables_in_{schema}`, but we use `Tables` instead. - // I don't want to modify this currently, our dashboard may depend on it. 
+ // MySQL renames `table_name` to `Tables_in_{schema}` for protocol compatibility + let tables_column = format!("Tables_in_{}", schema_name); let projects = if stmt.full { vec![ - (tables::TABLE_NAME, TABLES_COLUMN), + (tables::TABLE_NAME, tables_column.as_str()), (tables::TABLE_TYPE, TABLE_TYPE_COLUMN), ] } else { - vec![(tables::TABLE_NAME, TABLES_COLUMN)] + vec![(tables::TABLE_NAME, tables_column.as_str())] }; let filters = vec![ col(tables::TABLE_SCHEMA).eq(lit(schema_name.clone())), @@ -557,6 +580,16 @@ pub async fn show_tables( let like_field = Some(tables::TABLE_NAME); let sort = vec![col(tables::TABLE_NAME).sort(true, true)]; + // Transform the WHERE clause for backward compatibility: + // Replace "Tables" with "Tables_in_{schema}" to support old queries + let kind = match stmt.kind { + ShowKind::Where(mut filter) => { + replace_column_in_expr(&mut filter, "Tables", &tables_column); + ShowKind::Where(filter) + } + other => other, + }; + query_from_information_schema_table( query_engine, catalog_manager, @@ -567,7 +600,7 @@ pub async fn show_tables( filters, like_field, sort, - stmt.kind, + kind, ) .await } @@ -789,13 +822,12 @@ pub fn show_create_database(database_name: &str, options: OptionMap) -> Result, partitions: Option, query_ctx: QueryContextRef, ) -> Result { - let table_info = table.table_info(); - let table_name = &table_info.name; + let table_name = table_info.name.clone(); let quote_style = query_ctx.quote_style(); @@ -806,7 +838,7 @@ pub fn show_create_table( }); let sql = format!("{}", stmt); let columns = vec![ - Arc::new(StringVector::from(vec![table_name.clone()])) as _, + Arc::new(StringVector::from(vec![table_name])) as _, Arc::new(StringVector::from(vec![sql])) as _, ]; let records = RecordBatches::try_from_columns(SHOW_CREATE_TABLE_OUTPUT_SCHEMA.clone(), columns) @@ -1440,8 +1472,7 @@ mod test { .. 
}) => { let record = record.take().first().cloned().unwrap(); - let data = record.column(0); - Ok(data.get(0).to_string()) + Ok(record.iter_column_as_string(0).next().unwrap().unwrap()) } Ok(_) => unreachable!(), Err(e) => Err(e), diff --git a/src/query/src/sql/show_create_table.rs b/src/query/src/sql/show_create_table.rs index 2c566af508..bf34e61564 100644 --- a/src/query/src/sql/show_create_table.rs +++ b/src/query/src/sql/show_create_table.rs @@ -34,7 +34,9 @@ use sql::statements::create::{Column, ColumnExtensions, CreateTable, TableConstr use sql::statements::{self, OptionMap}; use store_api::metric_engine_consts::{is_metric_engine, is_metric_engine_internal_column}; use table::metadata::{TableInfoRef, TableMeta}; -use table::requests::{FILE_TABLE_META_KEY, TTL_KEY, WRITE_BUFFER_SIZE_KEY}; +use table::requests::{ + COMMENT_KEY as TABLE_COMMENT_KEY, FILE_TABLE_META_KEY, TTL_KEY, WRITE_BUFFER_SIZE_KEY, +}; use crate::error::{ ConvertSqlTypeSnafu, ConvertSqlValueSnafu, GetFulltextOptionsSnafu, @@ -249,6 +251,13 @@ pub fn create_table_stmt( let constraints = create_table_constraints(&table_meta.engine, schema, table_meta, quote_style); + let mut options = create_sql_options(table_meta, schema_options); + if let Some(comment) = &table_info.desc + && options.get(TABLE_COMMENT_KEY).is_none() + { + options.insert(format!("'{TABLE_COMMENT_KEY}'"), comment.clone()); + } + Ok(CreateTable { if_not_exists: true, table_id: table_info.ident.table_id, @@ -256,7 +265,7 @@ pub fn create_table_stmt( columns, engine: table_meta.engine.clone(), constraints, - options: create_sql_options(table_meta, schema_options), + options, partitions: None, }) } diff --git a/src/query/src/test_util.rs b/src/query/src/test_util.rs index f64718b84a..55891b0063 100644 --- a/src/query/src/test_util.rs +++ b/src/query/src/test_util.rs @@ -25,6 +25,7 @@ use arrow_schema::{SchemaRef, TimeUnit}; use common_recordbatch::{DfRecordBatch, DfSendableRecordBatchStream}; use datafusion::execution::{RecordBatchStream, TaskContext}; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; use futures::Stream; @@ -46,13 +47,14 @@ pub fn new_ts_array(unit: TimeUnit, arr: Vec) -> ArrayRef { #[derive(Debug)] pub struct MockInputExec { - input: Vec, + input: Vec>, schema: SchemaRef, properties: PlanProperties, + metrics: ExecutionPlanMetricsSet, } impl MockInputExec { - pub fn new(input: Vec, schema: SchemaRef) -> Self { + pub fn new(input: Vec>, schema: SchemaRef) -> Self { Self { properties: PlanProperties::new( EquivalenceProperties::new(schema.clone()), @@ -62,6 +64,7 @@ impl MockInputExec { ), input, schema, + metrics: ExecutionPlanMetricsSet::new(), } } } @@ -98,22 +101,28 @@ impl ExecutionPlan for MockInputExec { fn execute( &self, - _partition: usize, + partition: usize, _context: Arc, ) -> datafusion_common::Result { let stream = MockStream { - stream: self.input.clone(), + stream: self.input.clone().into_iter().flatten().collect(), schema: self.schema.clone(), idx: 0, + metrics: BaselineMetrics::new(&self.metrics, partition), }; Ok(Box::pin(stream)) } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } } struct MockStream { stream: Vec, schema: SchemaRef, idx: usize, + metrics: BaselineMetrics, } impl Stream for MockStream { @@ -125,7 
+134,7 @@ impl Stream for MockStream { if self.idx < self.stream.len() { let ret = self.stream[self.idx].clone(); self.idx += 1; - Poll::Ready(Some(Ok(ret))) + self.metrics.record_poll(Poll::Ready(Some(Ok(ret)))) } else { Poll::Ready(None) } diff --git a/src/query/src/tests/function.rs b/src/query/src/tests/function.rs index b383daf521..9f6ce0137e 100644 --- a/src/query/src/tests/function.rs +++ b/src/query/src/tests/function.rs @@ -18,7 +18,7 @@ use common_function::scalars::vector::impl_conv::veclit_to_binlit; use common_recordbatch::RecordBatch; use datatypes::prelude::*; use datatypes::schema::{ColumnSchema, Schema}; -use datatypes::vectors::BinaryVector; +use datatypes::vectors::{BinaryVector, Helper}; use rand::Rng; use table::test_util::MemTable; @@ -64,5 +64,6 @@ pub fn get_value_from_batches(column_name: &str, batches: Vec) -> V assert_eq!(batch.column(0).len(), 1); let v = batch.column(0); assert_eq!(1, v.len()); + let v = Helper::try_into_vector(v).unwrap(); v.get(0) } diff --git a/src/query/src/tests/query_engine_test.rs b/src/query/src/tests/query_engine_test.rs index 797d2cf26a..a96abc36d7 100644 --- a/src/query/src/tests/query_engine_test.rs +++ b/src/query/src/tests/query_engine_test.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use arrow::array::{ArrayRef, UInt32Array}; use catalog::RegisterTableRequest; use catalog::memory::MemoryCatalogManager; use common_base::Plugins; @@ -97,11 +98,10 @@ async fn test_datafusion_query_engine() -> Result<()> { let batch = &numbers[0]; assert_eq!(1, batch.num_columns()); assert_eq!(batch.column(0).len(), limit); - let expected: Vec = (0u32..limit as u32).collect(); - assert_eq!( - *batch.column(0), - Arc::new(UInt32Vector::from_slice(expected)) as VectorRef - ); + let expected = Arc::new(UInt32Array::from_iter_values( + (0u32..limit as u32).collect::>(), + )) as ArrayRef; + assert_eq!(batch.column(0), &expected); Ok(()) } diff --git a/src/query/src/tests/vec_avg_test.rs b/src/query/src/tests/vec_avg_test.rs index 46bb3528a9..672cbeaa27 100644 --- a/src/query/src/tests/vec_avg_test.rs +++ b/src/query/src/tests/vec_avg_test.rs @@ -34,7 +34,7 @@ async fn test_vec_avg_aggregator() -> Result<(), common_query::error::Error> { let sql = "SELECT vector FROM vectors"; let vectors = exec_selection(engine, sql).await; - let column = vectors[0].column(0).to_arrow_array(); + let column = vectors[0].column(0); let len = column.len(); for i in 0..column.len() { let v = ScalarValue::try_from_array(&column, i)?; diff --git a/src/query/src/tests/vec_product_test.rs b/src/query/src/tests/vec_product_test.rs index 53eb0d3272..26c275a5cc 100644 --- a/src/query/src/tests/vec_product_test.rs +++ b/src/query/src/tests/vec_product_test.rs @@ -32,7 +32,7 @@ async fn test_vec_product_aggregator() -> Result<(), common_query::error::Error> let sql = "SELECT vector FROM vectors"; let vectors = exec_selection(engine, sql).await; - let column = vectors[0].column(0).to_arrow_array(); + let column = vectors[0].column(0); for i in 0..column.len() { let v = ScalarValue::try_from_array(&column, i)?; let vector = as_veclit(&v)?; diff --git a/src/query/src/tests/vec_sum_test.rs b/src/query/src/tests/vec_sum_test.rs index 2c488c3c53..389bb0724d 100644 --- a/src/query/src/tests/vec_sum_test.rs +++ b/src/query/src/tests/vec_sum_test.rs @@ -34,7 +34,7 @@ async fn test_vec_sum_aggregator() -> Result<(), common_query::error::Error> { let sql = "SELECT vector FROM vectors"; let vectors = exec_selection(engine, sql).await; - let column = vectors[0].column(0).to_arrow_array(); + let 
column = vectors[0].column(0); for i in 0..column.len() { let v = ScalarValue::try_from_array(&column, i)?; let vector = as_veclit(&v)?; diff --git a/src/query/src/window_sort.rs b/src/query/src/window_sort.rs index eb0aa2d071..fad4e95db4 100644 --- a/src/query/src/window_sort.rs +++ b/src/query/src/window_sort.rs @@ -84,23 +84,31 @@ pub struct WindowedSortExec { properties: PlanProperties, } -fn check_partition_range_monotonicity( +/// Checks that partition ranges are sorted correctly for the given sort direction. +/// - Descending: sorted by (end DESC, start DESC) - shorter ranges first when ends are equal +/// - Ascending: sorted by (start ASC, end ASC) - shorter ranges first when starts are equal +pub fn check_partition_range_monotonicity( ranges: &[Vec], descending: bool, ) -> Result<()> { let is_valid = ranges.iter().all(|r| { if descending { - r.windows(2).all(|w| w[0].end >= w[1].end) + // Primary: end descending, Secondary: start descending (shorter range first) + r.windows(2) + .all(|w| w[0].end > w[1].end || (w[0].end == w[1].end && w[0].start >= w[1].start)) } else { - r.windows(2).all(|w| w[0].start <= w[1].start) + // Primary: start ascending, Secondary: end ascending (shorter range first) + r.windows(2).all(|w| { + w[0].start < w[1].start || (w[0].start == w[1].start && w[0].end <= w[1].end) + }) } }); if !is_valid { let msg = if descending { - "Input `PartitionRange`s's upper bound is not monotonic non-increase" + "Input `PartitionRange`s are not sorted by (end DESC, start DESC)" } else { - "Input `PartitionRange`s's lower bound is not monotonic non-decrease" + "Input `PartitionRange`s are not sorted by (start ASC, end ASC)" }; let plain_error = PlainError::new(msg.to_string(), StatusCode::Unexpected); Err(BoxedError::new(plain_error)).context(QueryExecutionSnafu {}) @@ -1259,6 +1267,41 @@ mod test { use super::*; use crate::test_util::{MockInputExec, new_ts_array}; + // Test helpers to reduce duplication + mod helpers { + use datafusion::physical_plan::expressions::Column; + + use super::*; + + pub fn default_sort_opts(descending: bool) -> SortOptions { + SortOptions { + descending, + nulls_first: true, + } + } + + pub fn ts_field(unit: TimeUnit) -> Field { + Field::new("ts", DataType::Timestamp(unit, None), false) + } + + pub fn ts_column() -> Column { + Column::new("ts", 0) + } + + pub fn partition_range(start: i64, end: i64, num_rows: usize, id: usize) -> PartitionRange { + PartitionRange { + start: Timestamp::new_millisecond(start), + end: Timestamp::new_millisecond(end), + num_rows, + identifier: id, + } + } + + pub fn ts_array(values: impl IntoIterator) -> ArrayRef { + Arc::new(TimestampMillisecondArray::from_iter_values(values)) + } + } + #[test] fn test_overlapping() { let testcases = [ @@ -1455,8 +1498,32 @@ mod test { } } + #[allow(clippy::type_complexity)] + fn run_compute_working_ranges_test( + testcases: Vec<( + BTreeMap<(Timestamp, Timestamp), Vec>, + Vec<((Timestamp, Timestamp), BTreeSet)>, + )>, + descending: bool, + ) { + for (input, expected) in testcases { + let expected = expected + .into_iter() + .map(|(r, s)| (r.into(), s)) + .collect_vec(); + let input = input.into_iter().map(|(r, s)| (r.into(), s)).collect(); + assert_eq!( + compute_all_working_ranges(&input, descending), + expected, + "input: {:?}, descending: {}", + input, + descending + ); + } + } + #[test] - fn test_compute_working_ranges_rev() { + fn test_compute_working_ranges_descending() { let testcases = vec![ ( BTreeMap::from([( @@ -1655,23 +1722,11 @@ mod test { ), ]; - for (input, 
expected) in testcases { - let expected = expected - .into_iter() - .map(|(r, s)| (r.into(), s)) - .collect_vec(); - let input = input.into_iter().map(|(r, s)| (r.into(), s)).collect(); - assert_eq!( - compute_all_working_ranges(&input, true), - expected, - "input: {:?}", - input - ); - } + run_compute_working_ranges_test(testcases, true); } #[test] - fn test_compute_working_ranges() { + fn test_compute_working_ranges_ascending() { let testcases = vec![ ( BTreeMap::from([( @@ -1871,19 +1926,7 @@ mod test { ), ]; - for (input, expected) in testcases { - let expected = expected - .into_iter() - .map(|(r, s)| (r.into(), s)) - .collect_vec(); - let input = input.into_iter().map(|(r, s)| (r.into(), s)).collect(); - assert_eq!( - compute_all_working_ranges(&input, false), - expected, - "input: {:?}", - input - ); - } + run_compute_working_ranges_test(testcases, false); } #[test] @@ -2174,6 +2217,7 @@ mod test { #[test] fn test_cmp_with_opts() { let testcases = vec![ + // Test ascending vs descending for Some values ( Some(1), Some(2), @@ -2192,6 +2236,7 @@ mod test { }), std::cmp::Ordering::Greater, ), + // Test Some vs None with nulls_first ( Some(1), None, @@ -2210,6 +2255,7 @@ mod test { }), std::cmp::Ordering::Greater, ), + // Test Some vs None with nulls_last ( Some(1), None, @@ -2228,15 +2274,7 @@ mod test { }), std::cmp::Ordering::Less, ), - ( - None, - None, - Some(SortOptions { - descending: true, - nulls_first: true, - }), - std::cmp::Ordering::Equal, - ), + // Test None vs None - always Equal regardless of sort options ( None, None, @@ -2246,24 +2284,6 @@ mod test { }), std::cmp::Ordering::Equal, ), - ( - None, - None, - Some(SortOptions { - descending: true, - nulls_first: false, - }), - std::cmp::Ordering::Equal, - ), - ( - None, - None, - Some(SortOptions { - descending: false, - nulls_first: false, - }), - std::cmp::Ordering::Equal, - ), ]; for (a, b, opts, expected) in testcases { assert_eq!( @@ -2492,21 +2512,20 @@ mod test { output: Vec, schema: SchemaRef, } - use datafusion::physical_plan::expressions::Column; + impl TestStream { fn new( - ts_col: Column, opt: SortOptions, fetch: Option, - schema: impl Into, + unit: TimeUnit, input: Vec<(PartitionRange, Vec)>, expected: Vec>, ) -> Self { let expression = PhysicalSortExpr { - expr: Arc::new(ts_col), + expr: Arc::new(helpers::ts_column()), options: opt, }; - let schema = Schema::new(schema.into()); + let schema = Schema::new(vec![helpers::ts_field(unit)]); let schema = Arc::new(schema); let input = input .into_iter() @@ -2525,10 +2544,25 @@ mod test { } } + fn new_simple( + descending: bool, + fetch: Option, + input: Vec<(PartitionRange, Vec)>, + expected: Vec>, + ) -> Self { + Self::new( + helpers::default_sort_opts(descending), + fetch, + TimeUnit::Millisecond, + input, + expected, + ) + } + async fn run_test(&self) -> Vec { let (ranges, batches): (Vec<_>, Vec<_>) = self.input.clone().into_iter().unzip(); - let mock_input = MockInputExec::new(batches, self.schema.clone()); + let mock_input = MockInputExec::new(vec![batches], self.schema.clone()); let exec = WindowedSortExec::try_new( self.expression.clone(), @@ -2547,607 +2581,218 @@ mod test { } #[tokio::test] - async fn test_window_sort_stream() { + async fn test_window_sort_empty_and_minimal() { + use helpers::*; let test_cases = [ - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, + // Empty input + TestStream::new_simple(false, None, vec![], vec![]), + // One empty batch, one with data + TestStream::new_simple( + 
false, None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], - vec![], - vec![], - ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, - None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], vec![ - // test one empty - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(2), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([2]))], - ), + (partition_range(1, 2, 1, 0), vec![ts_array([])]), + (partition_range(1, 3, 1, 0), vec![ts_array([2])]), ], - vec![vec![Arc::new(TimestampMillisecondArray::from_iter_values( - [2], - ))]], + vec![vec![ts_array([2])]], ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, + // Both batches empty + TestStream::new_simple( + false, None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], vec![ - // test one empty - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(2), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([]))], - ), + (partition_range(1, 2, 1, 0), vec![ts_array([])]), + (partition_range(1, 3, 1, 0), vec![ts_array([])]), ], vec![], ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, + // Indistinguishable boundary case - value at exact boundary + TestStream::new_simple( + false, None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], vec![ - // test indistinguishable case - // we can't know whether `2` belong to which range - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(2), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([1]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([2]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([1]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([2]))], + (partition_range(1, 2, 1, 0), vec![ts_array([1])]), + (partition_range(1, 3, 1, 0), vec![ts_array([2])]), ], + vec![vec![ts_array([1])], vec![ts_array([2])]], ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, + ]; + + for (idx, testcase) in test_cases.iter().enumerate() { + let output = testcase.run_test().await; + assert_eq!(output, testcase.output, "empty/minimal case {idx} failed"); + } + } + + #[tokio::test] + async fn test_window_sort_overlapping() { + use helpers::*; + let test_cases = [ + // Direct emit - overlapping ranges without merge + TestStream::new_simple( + false, None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], 
vec![ - // test direct emit - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(4), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 2, 3, - ]))], - ), + (partition_range(1, 3, 1, 0), vec![ts_array([1, 2])]), + (partition_range(1, 4, 1, 0), vec![ts_array([2, 3])]), ], vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - // didn't trigger a merge sort/concat here so this is it - vec![Arc::new(TimestampMillisecondArray::from_iter_values([2]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([3]))], + vec![ts_array([1, 2])], + vec![ts_array([2])], + vec![ts_array([3])], ], ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, + // Cross working range batch intersection - triggers merge + TestStream::new_simple( + false, None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], vec![ - // test more of cross working range batch intersection - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(4), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, 3, - ]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 1, 2, 2, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([3]))], + (partition_range(1, 3, 1, 0), vec![ts_array([1, 2])]), + (partition_range(1, 4, 1, 1), vec![ts_array([1, 2, 3])]), ], + vec![vec![ts_array([1, 1, 2, 2])], vec![ts_array([3])]], ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, + // No overlap case - separate ranges + TestStream::new_simple( + false, None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], vec![ - // no overlap, empty intersection batch case - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(4), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, 3, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(4), - end: Timestamp::new_millisecond(6), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 4, 5, - ]))], - ), + (partition_range(1, 3, 1, 0), vec![ts_array([1, 2])]), + (partition_range(1, 4, 1, 1), vec![ts_array([1, 2, 3])]), + (partition_range(4, 6, 1, 1), vec![ts_array([4, 5])]), ], vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 1, 2, 2, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([3]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 4, 5, - ]))], - ], - ), - // test fetch - 
TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, - Some(6), - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], - vec![ - // no overlap, empty intersection batch case - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(4), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, 3, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(3), - end: Timestamp::new_millisecond(6), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 4, 5, - ]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 1, 2, 2, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([3]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([4]))], - ], - ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, - Some(3), - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], - vec![ - // no overlap, empty intersection batch case - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(4), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, 3, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(3), - end: Timestamp::new_millisecond(6), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 4, 5, - ]))], - ), - ], - vec![vec![Arc::new(TimestampMillisecondArray::from_iter_values( - [1, 1, 2], - ))]], - ), - // rev case - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: true, - nulls_first: true, - }, - None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], - vec![ - // reverse order - ( - PartitionRange { - start: Timestamp::new_millisecond(3), - end: Timestamp::new_millisecond(6), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 5, 4, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(4), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 3, 2, 1, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 2, 1, - ]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 5, 4, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([3]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 2, 2, 1, 1, - ]))], - ], - ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, - None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, 
None), - false, - )], - vec![ - // long have subset short run case - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(10), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 5, 9, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(3), - end: Timestamp::new_millisecond(7), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 3, 4, 5, 6, - ]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([1]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 3, 4, 5, 5, 6, 9, - ]))], - ], - ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, - None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], - vec![ - // complex overlap - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(3), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(10), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 3, 4, 5, 6, 8, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(7), - end: Timestamp::new_millisecond(10), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 7, 8, 9, - ]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 1, 2, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 3, 4, 5, 6, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 7, 8, 8, 9, - ]))], - ], - ), - TestStream::new( - Column::new("ts", 0), - SortOptions { - descending: false, - nulls_first: true, - }, - None, - vec![Field::new( - "ts", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - )], - vec![ - // complex subset with having same datapoint - ( - PartitionRange { - start: Timestamp::new_millisecond(1), - end: Timestamp::new_millisecond(11), - num_rows: 1, - identifier: 0, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - ]))], - ), - ( - PartitionRange { - start: Timestamp::new_millisecond(5), - end: Timestamp::new_millisecond(7), - num_rows: 1, - identifier: 1, - }, - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 5, 6, - ]))], - ), - ], - vec![ - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 1, 2, 3, 4, - ]))], - vec![Arc::new(TimestampMillisecondArray::from_iter_values([ - 5, 5, 6, 6, 7, 8, 9, 10, - ]))], + vec![ts_array([1, 1, 2, 2])], + vec![ts_array([3])], + vec![ts_array([4, 5])], ], ), ]; - let indexed_test_cases = test_cases.iter().enumerate().collect_vec(); - - for (idx, testcase) in &indexed_test_cases { + for (idx, testcase) in test_cases.iter().enumerate() { let output = testcase.run_test().await; - assert_eq!(output, testcase.output, "case {idx} failed."); + assert_eq!(output, testcase.output, "overlapping case {idx} failed"); + } + } + + #[tokio::test] + async fn test_window_sort_with_fetch() { + use helpers::*; + let test_cases = [ + // Fetch limit stops at 6 rows + TestStream::new_simple( + false, + Some(6), + vec![ + (partition_range(1, 3, 1, 0), vec![ts_array([1, 2])]), + (partition_range(1, 4, 1, 1), 
vec![ts_array([1, 2, 3])]), + (partition_range(3, 6, 1, 1), vec![ts_array([4, 5])]), + ], + vec![ + vec![ts_array([1, 1, 2, 2])], + vec![ts_array([3])], + vec![ts_array([4])], + ], + ), + // Fetch limit stops at 3 rows + TestStream::new_simple( + false, + Some(3), + vec![ + (partition_range(1, 3, 1, 0), vec![ts_array([1, 2])]), + (partition_range(1, 4, 1, 1), vec![ts_array([1, 2, 3])]), + (partition_range(3, 6, 1, 1), vec![ts_array([4, 5])]), + ], + vec![vec![ts_array([1, 1, 2])]], + ), + ]; + + for (idx, testcase) in test_cases.iter().enumerate() { + let output = testcase.run_test().await; + assert_eq!(output, testcase.output, "fetch case {idx} failed"); + } + } + + #[tokio::test] + async fn test_window_sort_descending() { + use helpers::*; + let test_cases = [ + // Descending order sort + TestStream::new_simple( + true, + None, + vec![ + (partition_range(3, 6, 1, 1), vec![ts_array([5, 4])]), + (partition_range(1, 4, 1, 1), vec![ts_array([3, 2, 1])]), + (partition_range(1, 3, 1, 0), vec![ts_array([2, 1])]), + ], + vec![ + vec![ts_array([5, 4])], + vec![ts_array([3])], + vec![ts_array([2, 2, 1, 1])], + ], + ), + ]; + + for (idx, testcase) in test_cases.iter().enumerate() { + let output = testcase.run_test().await; + assert_eq!(output, testcase.output, "descending case {idx} failed"); + } + } + + #[tokio::test] + async fn test_window_sort_complex() { + use helpers::*; + let test_cases = [ + // Long range with subset short run + TestStream::new_simple( + false, + None, + vec![ + (partition_range(1, 10, 1, 0), vec![ts_array([1, 5, 9])]), + (partition_range(3, 7, 1, 1), vec![ts_array([3, 4, 5, 6])]), + ], + vec![vec![ts_array([1])], vec![ts_array([3, 4, 5, 5, 6, 9])]], + ), + // Complex multi-range overlap + TestStream::new_simple( + false, + None, + vec![ + (partition_range(1, 3, 1, 0), vec![ts_array([1, 2])]), + ( + partition_range(1, 10, 1, 1), + vec![ts_array([1, 3, 4, 5, 6, 8])], + ), + (partition_range(7, 10, 1, 1), vec![ts_array([7, 8, 9])]), + ], + vec![ + vec![ts_array([1, 1, 2])], + vec![ts_array([3, 4, 5, 6])], + vec![ts_array([7, 8, 8, 9])], + ], + ), + // Subset with duplicate datapoints + TestStream::new_simple( + false, + None, + vec![ + ( + partition_range(1, 11, 1, 0), + vec![ts_array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])], + ), + (partition_range(5, 7, 1, 1), vec![ts_array([5, 6])]), + ], + vec![ + vec![ts_array([1, 2, 3, 4])], + vec![ts_array([5, 5, 6, 6, 7, 8, 9, 10])], + ], + ), + ]; + + for (idx, testcase) in test_cases.iter().enumerate() { + let output = testcase.run_test().await; + assert_eq!(output, testcase.output, "complex case {idx} failed"); } } @@ -3192,8 +2837,9 @@ mod test { // generate input data for part_id in 0..rng.usize(0..part_cnt_bound) { let (start, end) = if descending { + // Use 1..=range_offset_bound to ensure strictly decreasing end values let end = bound_val - .map(|i| i - rng.i64(0..range_offset_bound)) + .map(|i| i - rng.i64(1..=range_offset_bound)) .unwrap_or_else(|| rng.i64(..)); bound_val = Some(end); let start = end - rng.i64(1..range_size_bound); @@ -3201,8 +2847,9 @@ mod test { let end = Timestamp::new(end, unit.into()); (start, end) } else { + // Use 1..=range_offset_bound to ensure strictly increasing start values let start = bound_val - .map(|i| i + rng.i64(0..range_offset_bound)) + .map(|i| i + rng.i64(1..=range_offset_bound)) .unwrap_or_else(|| rng.i64(..)); bound_val = Some(start); let end = start + rng.i64(1..range_size_bound); @@ -3234,13 +2881,9 @@ mod test { let output_arr = new_ts_array(unit, output_data); let test_stream = 
TestStream::new( - Column::new("ts", 0), - SortOptions { - descending, - nulls_first: true, - }, + helpers::default_sort_opts(descending), fetch, - vec![Field::new("ts", DataType::Timestamp(unit, None), false)], + unit, input_ranged_data.clone(), vec![vec![output_arr]], ); diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index ee7d4fbdd4..42ab4d1cd0 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -81,17 +81,13 @@ notify.workspace = true object-pool = "0.5" once_cell.workspace = true openmetrics-parser = "0.4" -simd-json.workspace = true -socket2 = "0.5" -# use crates.io version once the following PRs is merged into the nextest release -# 1. fix: Use After Free in PacketReader in https://github.com/databendlabs/opensrv/pull/67 -# 2. Use ring, instead of aws-lc-rs in https://github.com/databendlabs/opensrv/pull/72 -opensrv-mysql = { git = "https://github.com/datafuselabs/opensrv", rev = "a1fb4da215c8693c7e4f62be249a01b7fec52997" } +opensrv-mysql = { git = "https://github.com/datafuselabs/opensrv", tag = "v0.10.0" } opentelemetry-proto.workspace = true operator.workspace = true otel-arrow-rust.workspace = true parking_lot.workspace = true -pgwire = { version = "0.34", default-features = false, features = [ +pg_interval = "0.4" +pgwire = { version = "0.36.3", default-features = false, features = [ "server-api-ring", "pg-ext-types", ] } @@ -113,8 +109,10 @@ rustls-pki-types = "1.0" serde.workspace = true serde_json.workspace = true session.workspace = true +simd-json.workspace = true snafu.workspace = true snap = "1" +socket2 = "0.5" sql.workspace = true store-api.workspace = true strum.workspace = true @@ -128,6 +126,7 @@ tonic-reflection = "0.13" tower = { workspace = true, features = ["full"] } tower-http = { version = "0.6", features = ["full"] } tracing.workspace = true +tracing-opentelemetry.workspace = true urlencoding = "2.1" uuid.workspace = true vrl.workspace = true diff --git a/src/servers/dashboard/VERSION b/src/servers/dashboard/VERSION index 97bd28bcfd..6b72fde647 100644 --- a/src/servers/dashboard/VERSION +++ b/src/servers/dashboard/VERSION @@ -1 +1 @@ -v0.11.7 +v0.11.9 diff --git a/src/servers/src/configurator.rs b/src/servers/src/configurator.rs index 2b2e47d1e7..e8ba8264bd 100644 --- a/src/servers/src/configurator.rs +++ b/src/servers/src/configurator.rs @@ -15,16 +15,45 @@ use std::sync::Arc; use axum::Router as HttpRouter; +use common_error::ext::BoxedError; use tonic::transport::server::Router as GrpcRouter; -pub trait Configurator: Send + Sync { - fn config_http(&self, route: HttpRouter) -> HttpRouter { - route - } +use crate::grpc::builder::GrpcServerBuilder; - fn config_grpc(&self, route: GrpcRouter) -> GrpcRouter { - route - } +/// A configurator that customizes or enhances an HTTP router. +#[async_trait::async_trait] +pub trait HttpConfigurator: Send + Sync { + /// Configures the given HTTP router using the provided context. + async fn configure_http( + &self, + route: HttpRouter, + ctx: C, + ) -> std::result::Result; } -pub type ConfiguratorRef = Arc; +pub type HttpConfiguratorRef = Arc>; + +/// A configurator that customizes or enhances a gRPC router. +#[async_trait::async_trait] +pub trait GrpcRouterConfigurator: Send + Sync { + /// Configures the given gRPC router using the provided context. + async fn configure_grpc_router( + &self, + route: GrpcRouter, + ctx: C, + ) -> std::result::Result; +} + +pub type GrpcRouterConfiguratorRef = Arc>; + +/// A configurator that customizes or enhances a [`GrpcServerBuilder`]. 
+#[async_trait::async_trait] +pub trait GrpcBuilderConfigurator: Send + Sync { + async fn configure( + &self, + builder: GrpcServerBuilder, + ctx: C, + ) -> std::result::Result; +} + +pub type GrpcBuilderConfiguratorRef = Arc>; diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index c7e5c5d07a..c52ddb7b34 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -229,14 +229,33 @@ pub enum Error { error: prost::DecodeError, }, - #[snafu(display("Failed to decode OTLP request"))] + #[snafu(display( + "Failed to decode OTLP request (content-type: {content_type}): {error}. The endpoint only accepts 'application/x-protobuf' format." + ))] DecodeOtlpRequest { + content_type: String, #[snafu(implicit)] location: Location, #[snafu(source)] error: prost::DecodeError, }, + #[snafu(display("Failed to decode Loki request: {error}"))] + DecodeLokiRequest { + #[snafu(implicit)] + location: Location, + #[snafu(source)] + error: prost::DecodeError, + }, + + #[snafu(display( + "Unsupported content type 'application/json'. OTLP endpoint only supports 'application/x-protobuf'. Please configure your OTLP exporter to use protobuf encoding." + ))] + UnsupportedJsonContentType { + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "OTLP metric input have incompatible existing tables, please refer to docs for details" ))] @@ -269,21 +288,6 @@ pub enum Error { error: std::io::Error, }, - #[snafu(display("Failed to send prometheus remote request"))] - SendPromRemoteRequest { - #[snafu(implicit)] - location: Location, - #[snafu(source)] - error: reqwest::Error, - }, - - #[snafu(display("Invalid export metrics config, msg: {}", msg))] - InvalidExportMetricsConfig { - msg: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to compress prometheus remote request"))] CompressPromRemoteRequest { #[snafu(implicit)] @@ -647,6 +651,12 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Service suspended"))] + Suspended { + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -661,7 +671,6 @@ impl ErrorExt for Error { | StartHttp { .. } | StartGrpc { .. } | TcpBind { .. } - | SendPromRemoteRequest { .. } | BuildHttpResponse { .. } | Arrow { .. } | FileWatch { .. } => StatusCode::Internal, @@ -694,11 +703,12 @@ impl ErrorExt for Error { | InvalidOpentsdbJsonRequest { .. } | DecodePromRemoteRequest { .. } | DecodeOtlpRequest { .. } + | DecodeLokiRequest { .. } + | UnsupportedJsonContentType { .. } | CompressPromRemoteRequest { .. } | DecompressSnappyPromRemoteRequest { .. } | DecompressZstdPromRemoteRequest { .. } | InvalidPromRemoteRequest { .. } - | InvalidExportMetricsConfig { .. } | InvalidFlightTicket { .. } | InvalidPrepareStatement { .. } | DataFrame { .. } @@ -773,6 +783,8 @@ impl ErrorExt for Error { HandleOtelArrowRequest { .. } => StatusCode::Internal, Cancelled { .. } => StatusCode::Cancelled, + + Suspended { .. 
} => StatusCode::Suspended, } } @@ -853,7 +865,8 @@ pub fn status_code_to_http_status(status_code: &StatusCode) -> HttpStatusCode { | StatusCode::TableUnavailable | StatusCode::RegionBusy | StatusCode::StorageUnavailable - | StatusCode::External => HttpStatusCode::SERVICE_UNAVAILABLE, + | StatusCode::External + | StatusCode::Suspended => HttpStatusCode::SERVICE_UNAVAILABLE, StatusCode::Internal | StatusCode::Unexpected diff --git a/src/servers/src/export_metrics.rs b/src/servers/src/export_metrics.rs deleted file mode 100644 index aac7e8dda4..0000000000 --- a/src/servers/src/export_metrics.rs +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Duration; - -use common_base::Plugins; -use common_telemetry::metric::{MetricFilter, convert_metric_to_write_request}; -use common_telemetry::{error, info}; -use common_time::Timestamp; -use prost::Message; -use reqwest::header::{HeaderMap, HeaderName, HeaderValue}; -use serde::{Deserialize, Serialize}; -use session::context::QueryContextBuilder; -use snafu::{ResultExt, ensure}; -use tokio::time::{self, Interval}; - -use crate::error::{InvalidExportMetricsConfigSnafu, Result, SendPromRemoteRequestSnafu}; -use crate::prom_store::{snappy_compress, to_grpc_row_insert_requests}; -use crate::query_handler::PromStoreProtocolHandlerRef; - -/// Use to export the metrics generated by greptimedb. -/// -/// Encoded to Prometheus [RemoteWrite format](https://prometheus.io/docs/concepts/remote_write_spec/), -/// and send to Prometheus remote-write compatible receiver (e.g. 
send to `greptimedb` itself) -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] -#[serde(default)] -pub struct ExportMetricsOption { - pub enable: bool, - #[serde(with = "humantime_serde")] - pub write_interval: Duration, - pub self_import: Option, - pub remote_write: Option, -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Default)] -#[serde(default)] -pub struct RemoteWriteOption { - pub url: String, - pub headers: HashMap, -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] -#[serde(default)] -pub struct SelfImportOption { - pub db: String, -} - -impl Default for SelfImportOption { - fn default() -> Self { - Self { - db: "greptime_metrics".to_string(), - } - } -} - -impl Default for ExportMetricsOption { - fn default() -> Self { - Self { - enable: false, - write_interval: Duration::from_secs(30), - self_import: None, - remote_write: None, - } - } -} - -#[derive(Default, Clone)] -pub struct ExportMetricsTask { - config: ExportMetricsOption, - filter: Option, - headers: HeaderMap, - pub send_by_handler: bool, -} - -impl ExportMetricsTask { - pub fn try_new( - config: &ExportMetricsOption, - plugins: Option<&Plugins>, - ) -> Result> { - if !config.enable { - return Ok(None); - } - let filter = plugins.map(|p| p.get::()).unwrap_or(None); - ensure!( - config.write_interval.as_secs() != 0, - InvalidExportMetricsConfigSnafu { - msg: "Expected export metrics write_interval greater than zero" - } - ); - ensure!( - (config.remote_write.is_none() && config.self_import.is_some()) - || (config.remote_write.is_some() && config.self_import.is_none()), - InvalidExportMetricsConfigSnafu { - msg: "Only one of `self_import` or `remote_write` can be used as the export method" - } - ); - if let Some(self_import) = &config.self_import { - ensure!( - !self_import.db.is_empty(), - InvalidExportMetricsConfigSnafu { - msg: "Expected `self_import` metrics `db` not empty" - } - ); - } - let mut headers = HeaderMap::new(); - if let Some(remote_write) = &config.remote_write { - ensure!( - !remote_write.url.is_empty(), - InvalidExportMetricsConfigSnafu { - msg: "Expected `remote_write` metrics `url` not empty" - } - ); - // construct http header - remote_write.headers.iter().try_for_each(|(k, v)| { - let header = match TryInto::::try_into(k) { - Ok(header) => header, - Err(_) => { - return InvalidExportMetricsConfigSnafu { - msg: format!("Export metrics: invalid HTTP header name: {}", k), - } - .fail(); - } - }; - match TryInto::::try_into(v) { - Ok(value) => headers.insert(header, value), - Err(_) => { - return InvalidExportMetricsConfigSnafu { - msg: format!("Export metrics: invalid HTTP header value: {}", v), - } - .fail(); - } - }; - Ok(()) - })?; - } - Ok(Some(Self { - config: config.clone(), - filter, - headers, - send_by_handler: config.self_import.is_some(), - })) - } - - pub fn start(&self, handler: Option) -> Result<()> { - if !self.config.enable { - return Ok(()); - } - let interval = time::interval(self.config.write_interval); - let filter = self.filter.clone(); - let _handle = if let Some(self_import) = &self.config.self_import { - ensure!( - handler.is_some(), - InvalidExportMetricsConfigSnafu { - msg: "Only `frontend` or `standalone` can use `self_import` as export method." 
- } - ); - common_runtime::spawn_global(write_system_metric_by_handler( - self_import.db.clone(), - handler.unwrap(), - filter, - interval, - )) - } else if let Some(remote_write) = &self.config.remote_write { - common_runtime::spawn_global(write_system_metric_by_network( - self.headers.clone(), - remote_write.url.clone(), - filter, - interval, - )) - } else { - unreachable!() - }; - Ok(()) - } -} - -/// Send metrics collected by standard Prometheus [RemoteWrite format](https://prometheus.io/docs/concepts/remote_write_spec/) -pub async fn write_system_metric_by_network( - headers: HeaderMap, - endpoint: String, - filter: Option, - mut interval: Interval, -) { - info!( - "Start export metrics task to endpoint: {}, interval: {}s", - endpoint, - interval.period().as_secs() - ); - // Pass the first tick. Because the first tick completes immediately. - interval.tick().await; - let client = reqwest::Client::new(); - loop { - interval.tick().await; - let metric_families = prometheus::gather(); - let request = convert_metric_to_write_request( - metric_families, - filter.as_ref(), - Timestamp::current_millis().value(), - ); - let resp = match snappy_compress(&request.encode_to_vec()) { - Ok(body) => client - .post(endpoint.as_str()) - .header("X-Prometheus-Remote-Write-Version", "0.1.0") - .header("Content-Type", "application/x-protobuf") - .headers(headers.clone()) - .body(body) - .send() - .await - .context(SendPromRemoteRequestSnafu), - Err(e) => Err(e), - }; - match resp { - Ok(resp) => { - if !resp.status().is_success() { - error!("report export metrics error, msg: {:#?}", resp); - } - } - Err(e) => error!(e; "report export metrics failed"), - }; - } -} - -/// Send metrics collected by our internal handler -/// for case `frontend` and `standalone` dispose it's own metrics, -/// reducing compression and network transmission overhead. -pub async fn write_system_metric_by_handler( - db: String, - handler: PromStoreProtocolHandlerRef, - filter: Option, - mut interval: Interval, -) { - info!( - "Start export metrics task by handler, interval: {}s", - interval.period().as_secs() - ); - // Pass the first tick. Because the first tick completes immediately. 
- interval.tick().await; - let ctx = Arc::new(QueryContextBuilder::default().current_schema(db).build()); - loop { - interval.tick().await; - let metric_families = prometheus::gather(); - let request = convert_metric_to_write_request( - metric_families, - filter.as_ref(), - Timestamp::current_millis().value(), - ); - - let (requests, samples) = match to_grpc_row_insert_requests(&request) { - Ok((requests, samples)) => (requests, samples), - Err(e) => { - error!(e; "Failed to convert gathered metrics to RowInsertRequests"); - continue; - } - }; - - if let Err(e) = handler.write(requests, ctx.clone(), false).await { - error!(e; "report export metrics by handler failed"); - } else { - crate::metrics::PROM_STORE_REMOTE_WRITE_SAMPLES - .with_label_values(&[ctx.get_db_string().as_str()]) - .inc_by(samples as u64); - } - } -} - -#[cfg(test)] -mod test { - use std::time::Duration; - - use crate::export_metrics::{ - ExportMetricsOption, ExportMetricsTask, RemoteWriteOption, SelfImportOption, - }; - - #[tokio::test] - async fn test_config() { - // zero write_interval - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - write_interval: Duration::from_secs(0), - ..Default::default() - }, - None - ) - .is_err() - ); - // none self_import and remote_write - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - ..Default::default() - }, - None - ) - .is_err() - ); - // both self_import and remote_write - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: Some(SelfImportOption::default()), - remote_write: Some(RemoteWriteOption::default()), - ..Default::default() - }, - None - ) - .is_err() - ); - // empty db - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: Some(SelfImportOption { - db: String::default() - }), - remote_write: None, - ..Default::default() - }, - None - ) - .is_err() - ); - // empty url - assert!( - ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: None, - remote_write: Some(RemoteWriteOption { - url: String::default(), - ..Default::default() - }), - ..Default::default() - }, - None - ) - .is_err() - ); - // self import but no handle - let s = ExportMetricsTask::try_new( - &ExportMetricsOption { - enable: true, - self_import: Some(SelfImportOption::default()), - ..Default::default() - }, - None, - ) - .unwrap() - .unwrap(); - assert!(s.start(None).is_err()); - } -} diff --git a/src/servers/src/grpc/builder.rs b/src/servers/src/grpc/builder.rs index ae5c226138..129f07c3c5 100644 --- a/src/servers/src/grpc/builder.rs +++ b/src/servers/src/grpc/builder.rs @@ -12,21 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. 
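To make the intent of the reworked configurator API above concrete, here is a minimal sketch of a plugin implementing the new asynchronous `HttpConfigurator` trait from `src/servers/src/configurator.rs`. It assumes the trait is generic over a context type (the HTTP server passes `()` as the context later in this diff); `MyPlugin` and its `/my-plugin/ping` route are illustrative only and not part of this change.

```rust
use axum::Router as HttpRouter;
use axum::routing;
use common_error::ext::BoxedError;
use servers::configurator::HttpConfigurator;

struct MyPlugin;

#[async_trait::async_trait]
impl HttpConfigurator<()> for MyPlugin {
    async fn configure_http(
        &self,
        route: HttpRouter,
        _ctx: (),
    ) -> Result<HttpRouter, BoxedError> {
        // Unlike the old synchronous `Configurator::config_http`, a configurator
        // can now run async setup work and fail with a `BoxedError`, which the
        // HTTP server surfaces instead of silently using the unmodified router.
        Ok(route.route("/my-plugin/ping", routing::get(|| async { "pong" })))
    }
}
```

The same pattern applies to `GrpcRouterConfigurator` and `GrpcBuilderConfigurator`, which receive the gRPC router and the `GrpcServerBuilder` respectively.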
+use std::convert::Infallible; + use api::v1::frontend::frontend_server::FrontendServer; use api::v1::greptime_database_server::GreptimeDatabaseServer; use api::v1::prometheus_gateway_server::PrometheusGatewayServer; use api::v1::region::region_server::RegionServer; use arrow_flight::flight_service_server::FlightServiceServer; use auth::UserProviderRef; -use common_grpc::error::{Error, InvalidConfigFilePathSnafu, Result}; +use axum::extract::Request; +use axum::response::IntoResponse; +use axum::routing::Route; +use common_grpc::error::{InvalidConfigFilePathSnafu, Result}; use common_runtime::Runtime; +use common_telemetry::warn; use otel_arrow_rust::proto::opentelemetry::arrow::v1::arrow_metrics_service_server::ArrowMetricsServiceServer; use snafu::ResultExt; use tokio::sync::Mutex; use tonic::codec::CompressionEncoding; +use tonic::codegen::Service; use tonic::service::RoutesBuilder; use tonic::service::interceptor::InterceptedService; use tonic::transport::{Identity, ServerTlsConfig}; +use tower::Layer; use crate::grpc::database::DatabaseService; use crate::grpc::flight::{FlightCraftRef, FlightCraftWrapper}; @@ -188,10 +196,7 @@ impl GrpcServerBuilder { // tonic does not support watching for tls config changes // so we don't support it either for now if tls_option.watch { - return Err(Error::NotSupported { - feat: "Certificates watch and reloading for gRPC is not supported at the moment" - .to_string(), - }); + warn!("Certificates watch and reloading for gRPC is NOT supported at the moment"); } self.tls_config = if tls_option.should_force_tls() { let cert = std::fs::read_to_string(tls_option.cert_path) @@ -206,6 +211,23 @@ impl GrpcServerBuilder { Ok(self) } + pub fn add_layer(self, layer: L) -> Self + where + L: Layer + Clone + Send + Sync + 'static, + L::Service: Service + Clone + Send + Sync + 'static, + >::Response: IntoResponse + 'static, + >::Error: Into + 'static, + >::Future: Send + 'static, + { + let routes = self.routes_builder.routes(); + let router = routes.into_axum_router(); + let router = router.layer(layer); + Self { + routes_builder: RoutesBuilder::from(router), + ..self + } + } + pub fn build(self) -> GrpcServer { GrpcServer { routes: Mutex::new(Some(self.routes_builder.routes())), diff --git a/src/servers/src/grpc/flight.rs b/src/servers/src/grpc/flight.rs index 44b307fe71..a3835b14ff 100644 --- a/src/servers/src/grpc/flight.rs +++ b/src/servers/src/grpc/flight.rs @@ -25,12 +25,16 @@ use arrow_flight::{ HandshakeRequest, HandshakeResponse, PollInfo, PutResult, SchemaResult, Ticket, }; use async_trait::async_trait; +use bytes; use bytes::Bytes; use common_grpc::flight::do_put::{DoPutMetadata, DoPutResponse}; -use common_grpc::flight::{FlightEncoder, FlightMessage}; +use common_grpc::flight::{FlightDecoder, FlightEncoder, FlightMessage}; use common_query::{Output, OutputData}; +use common_recordbatch::DfRecordBatch; +use common_telemetry::debug; use common_telemetry::tracing::info_span; use common_telemetry::tracing_context::{FutureExt, TracingContext}; +use datatypes::arrow::datatypes::SchemaRef; use futures::{Stream, future, ready}; use futures_util::{StreamExt, TryStreamExt}; use prost::Message; @@ -41,7 +45,7 @@ use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; use tonic::{Request, Response, Status, Streaming}; -use crate::error::{InvalidParameterSnafu, ParseJsonSnafu, Result, ToJsonSnafu}; +use crate::error::{InvalidParameterSnafu, Result, ToJsonSnafu}; pub use crate::grpc::flight::stream::FlightRecordBatchStream; use 
crate::grpc::greptime_handler::{GreptimeRequestHandler, get_request_type}; use crate::grpc::{FlightCompression, TonicResult, context_auth}; @@ -223,14 +227,15 @@ impl FlightCraft for GreptimeRequestHandler { const MAX_PENDING_RESPONSES: usize = 32; let (tx, rx) = mpsc::channel::>(MAX_PENDING_RESPONSES); - let stream = PutRecordBatchRequestStream { - flight_data_stream: stream, - state: PutRecordBatchRequestStreamState::Init( - query_ctx.current_catalog().to_string(), - query_ctx.current_schema(), - ), + let stream = PutRecordBatchRequestStream::new( + stream, + query_ctx.current_catalog().to_string(), + query_ctx.current_schema(), limiter, - }; + ) + .await?; + // Ack immediately when stream is created successfully (in Init state) + let _ = tx.send(Ok(DoPutResponse::new(0, 0, 0.0))).await; self.put_record_batches(stream, tx, query_ctx).await; let response = ReceiverStream::new(rx) @@ -249,33 +254,33 @@ impl FlightCraft for GreptimeRequestHandler { } } -pub(crate) struct PutRecordBatchRequest { - pub(crate) table_name: TableName, - pub(crate) request_id: i64, - pub(crate) data: FlightData, +pub struct PutRecordBatchRequest { + pub table_name: TableName, + pub request_id: i64, + pub record_batch: DfRecordBatch, + pub schema_bytes: Bytes, + pub flight_data: FlightData, pub(crate) _guard: Option, } impl PutRecordBatchRequest { fn try_new( table_name: TableName, + record_batch: DfRecordBatch, + request_id: i64, + schema_bytes: Bytes, flight_data: FlightData, limiter: Option<&RequestMemoryLimiter>, ) -> Result { - let request_id = if !flight_data.app_metadata.is_empty() { - let metadata: DoPutMetadata = - serde_json::from_slice(&flight_data.app_metadata).context(ParseJsonSnafu)?; - metadata.request_id() - } else { - 0 - }; + let memory_usage = flight_data.data_body.len() + + flight_data.app_metadata.len() + + flight_data.data_header.len(); let _guard = limiter .filter(|limiter| limiter.is_enabled()) .map(|limiter| { - let message_size = flight_data.encoded_len(); limiter - .try_acquire(message_size) + .try_acquire(memory_usage) .map(|guard| { guard.inspect(|g| { METRIC_GRPC_MEMORY_USAGE_BYTES.set(g.current_usage() as i64); @@ -291,93 +296,224 @@ impl PutRecordBatchRequest { Ok(Self { table_name, request_id, - data: flight_data, + record_batch, + schema_bytes, + flight_data, _guard, }) } } -pub(crate) struct PutRecordBatchRequestStream { +pub struct PutRecordBatchRequestStream { flight_data_stream: Streaming, - state: PutRecordBatchRequestStreamState, + catalog: String, + schema_name: String, limiter: Option, + // Client now lazily sends schema data so we cannot eagerly wait for it. + // Instead, we need to decode while receiving record batches. + state: StreamState, } -enum PutRecordBatchRequestStreamState { - Init(String, String), - Started(TableName), +enum StreamState { + Init, + Ready { + table_name: TableName, + schema: SchemaRef, + schema_bytes: Bytes, + decoder: FlightDecoder, + }, +} + +impl PutRecordBatchRequestStream { + /// Creates a new `PutRecordBatchRequestStream` in Init state. + /// The stream will transition to Ready state when it receives the schema message. + pub async fn new( + flight_data_stream: Streaming, + catalog: String, + schema: String, + limiter: Option, + ) -> TonicResult { + Ok(Self { + flight_data_stream, + catalog, + schema_name: schema, + limiter, + state: StreamState::Init, + }) + } + + /// Returns the table name extracted from the flight descriptor. + /// Returns None if the stream is still in Init state. 
+ pub fn table_name(&self) -> Option<&TableName> { + match &self.state { + StreamState::Init => None, + StreamState::Ready { table_name, .. } => Some(table_name), + } + } + + /// Returns the Arrow schema decoded from the first flight message. + /// Returns None if the stream is still in Init state. + pub fn schema(&self) -> Option<&SchemaRef> { + match &self.state { + StreamState::Init => None, + StreamState::Ready { schema, .. } => Some(schema), + } + } + + /// Returns the raw schema bytes in IPC format. + /// Returns None if the stream is still in Init state. + pub fn schema_bytes(&self) -> Option<&Bytes> { + match &self.state { + StreamState::Init => None, + StreamState::Ready { schema_bytes, .. } => Some(schema_bytes), + } + } + + fn extract_table_name(mut descriptor: FlightDescriptor) -> Result { + ensure!( + descriptor.r#type == arrow_flight::flight_descriptor::DescriptorType::Path as i32, + InvalidParameterSnafu { + reason: "expect FlightDescriptor::type == 'Path' only", + } + ); + ensure!( + descriptor.path.len() == 1, + InvalidParameterSnafu { + reason: "expect FlightDescriptor::path has only one table name", + } + ); + Ok(descriptor.path.remove(0)) + } } impl Stream for PutRecordBatchRequestStream { type Item = TonicResult; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - fn extract_table_name(mut descriptor: FlightDescriptor) -> Result { - ensure!( - descriptor.r#type == arrow_flight::flight_descriptor::DescriptorType::Path as i32, - InvalidParameterSnafu { - reason: "expect FlightDescriptor::type == 'Path' only", + loop { + let poll = ready!(self.flight_data_stream.poll_next_unpin(cx)); + + match poll { + Some(Ok(flight_data)) => { + // Clone limiter once to avoid borrowing issues + let limiter = self.limiter.clone(); + + match &mut self.state { + StreamState::Init => { + // First message - expecting schema + let flight_descriptor = match flight_data.flight_descriptor.as_ref() { + Some(descriptor) => descriptor.clone(), + None => { + return Poll::Ready(Some(Err(Status::failed_precondition( + "table to put is not found in flight descriptor", + )))); + } + }; + + let table_name_str = match Self::extract_table_name(flight_descriptor) { + Ok(name) => name, + Err(e) => { + return Poll::Ready(Some(Err(Status::invalid_argument( + e.to_string(), + )))); + } + }; + let table_name = TableName::new( + self.catalog.clone(), + self.schema_name.clone(), + table_name_str, + ); + + // Decode the schema + let mut decoder = FlightDecoder::default(); + let schema_message = decoder.try_decode(&flight_data).map_err(|e| { + Status::invalid_argument(format!("Failed to decode schema: {}", e)) + })?; + + match schema_message { + Some(FlightMessage::Schema(schema)) => { + let schema_bytes = decoder.schema_bytes().ok_or_else(|| { + Status::internal( + "decoder should have schema bytes after decoding schema", + ) + })?; + + // Transition to Ready state with all necessary data + self.state = StreamState::Ready { + table_name, + schema, + schema_bytes, + decoder, + }; + // Continue to next iteration to process RecordBatch messages + continue; + } + _ => { + return Poll::Ready(Some(Err(Status::failed_precondition( + "first message must be a Schema message", + )))); + } + } + } + StreamState::Ready { + table_name, + schema: _, + schema_bytes, + decoder, + } => { + // Extract request_id and body_size from FlightData before decoding + let request_id = if !flight_data.app_metadata.is_empty() { + serde_json::from_slice::(&flight_data.app_metadata) + .map(|meta| meta.request_id()) + 
.unwrap_or_default() + } else { + 0 + }; + + // Decode FlightData to RecordBatch + match decoder.try_decode(&flight_data) { + Ok(Some(FlightMessage::RecordBatch(record_batch))) => { + let table_name = table_name.clone(); + let schema_bytes = schema_bytes.clone(); + return Poll::Ready(Some( + PutRecordBatchRequest::try_new( + table_name, + record_batch, + request_id, + schema_bytes, + flight_data, + limiter.as_ref(), + ) + .map_err(|e| Status::invalid_argument(e.to_string())), + )); + } + Ok(Some(other)) => { + debug!("Unexpected flight message: {:?}", other); + return Poll::Ready(Some(Err(Status::invalid_argument( + "Expected RecordBatch message, got other message type", + )))); + } + Ok(None) => { + // Dictionary batch - processed internally by decoder, continue polling + continue; + } + Err(e) => { + return Poll::Ready(Some(Err(Status::invalid_argument( + format!("Failed to decode RecordBatch: {}", e), + )))); + } + } + } + } } - ); - ensure!( - descriptor.path.len() == 1, - InvalidParameterSnafu { - reason: "expect FlightDescriptor::path has only one table name", + Some(Err(e)) => { + return Poll::Ready(Some(Err(e))); } - ); - Ok(descriptor.path.remove(0)) + None => { + return Poll::Ready(None); + } + } } - - let poll = ready!(self.flight_data_stream.poll_next_unpin(cx)); - let limiter = self.limiter.clone(); - - let result = match &mut self.state { - PutRecordBatchRequestStreamState::Init(catalog, schema) => match poll { - Some(Ok(mut flight_data)) => { - let flight_descriptor = flight_data.flight_descriptor.take(); - let result = if let Some(descriptor) = flight_descriptor { - let table_name = extract_table_name(descriptor) - .map(|x| TableName::new(catalog.clone(), schema.clone(), x)); - let table_name = match table_name { - Ok(table_name) => table_name, - Err(e) => return Poll::Ready(Some(Err(e.into()))), - }; - - let request = PutRecordBatchRequest::try_new( - table_name.clone(), - flight_data, - limiter.as_ref(), - ); - let request = match request { - Ok(request) => request, - Err(e) => return Poll::Ready(Some(Err(e.into()))), - }; - - self.state = PutRecordBatchRequestStreamState::Started(table_name); - - Ok(request) - } else { - Err(Status::failed_precondition( - "table to put is not found in flight descriptor", - )) - }; - Some(result) - } - Some(Err(e)) => Some(Err(e)), - None => None, - }, - PutRecordBatchRequestStreamState::Started(table_name) => poll.map(|x| { - x.and_then(|flight_data| { - PutRecordBatchRequest::try_new( - table_name.clone(), - flight_data, - limiter.as_ref(), - ) - .map_err(Into::into) - }) - }), - }; - Poll::Ready(result) } } diff --git a/src/servers/src/grpc/greptime_handler.rs b/src/servers/src/grpc/greptime_handler.rs index 095c36abb1..c1f146db6d 100644 --- a/src/servers/src/grpc/greptime_handler.rs +++ b/src/servers/src/grpc/greptime_handler.rs @@ -24,7 +24,6 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_catalog::parse_catalog_and_schema_from_db_string; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; -use common_grpc::flight::FlightDecoder; use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; use common_runtime::Runtime; @@ -37,15 +36,14 @@ use futures_util::StreamExt; use session::context::{Channel, QueryContextBuilder, QueryContextRef}; use session::hints::READ_PREFERENCE_HINT; use snafu::{OptionExt, ResultExt}; -use table::TableRef; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TrySendError; +use tonic::Status; use crate::error::{InvalidQuerySnafu, 
JoinTaskSnafu, Result, UnknownHintSnafu}; -use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream}; +use crate::grpc::flight::PutRecordBatchRequestStream; use crate::grpc::{FlightCompression, TonicResult, context_auth}; -use crate::metrics; -use crate::metrics::METRIC_SERVER_GRPC_DB_REQUEST_TIMER; +use crate::metrics::{self, METRIC_SERVER_GRPC_DB_REQUEST_TIMER}; use crate::query_handler::grpc::ServerGrpcQueryHandlerRef; #[derive(Clone)] @@ -134,7 +132,7 @@ impl GreptimeRequestHandler { pub(crate) async fn put_record_batches( &self, - mut stream: PutRecordBatchRequestStream, + stream: PutRecordBatchRequestStream, result_sender: mpsc::Sender>, query_ctx: QueryContextRef, ) { @@ -144,37 +142,24 @@ impl GreptimeRequestHandler { .clone() .unwrap_or_else(common_runtime::global_runtime); runtime.spawn(async move { - // Cached table ref - let mut table_ref: Option = None; + let mut result_stream = handler.handle_put_record_batch_stream(stream, query_ctx); - let mut decoder = FlightDecoder::default(); - while let Some(request) = stream.next().await { - let request = match request { - Ok(request) => request, - Err(e) => { - let _ = result_sender.try_send(Err(e)); - break; + while let Some(result) = result_stream.next().await { + match &result { + Ok(response) => { + // Record the elapsed time metric from the response + metrics::GRPC_BULK_INSERT_ELAPSED.observe(response.elapsed_secs()); } - }; - let PutRecordBatchRequest { - table_name, - request_id, - data, - _guard, - } = request; + Err(e) => { + error!(e; "Failed to handle flight record batches"); + } + } - let timer = metrics::GRPC_BULK_INSERT_ELAPSED.start_timer(); - let result = handler - .put_record_batch(&table_name, &mut table_ref, &mut decoder, data, query_ctx.clone()) - .await - .inspect_err(|e| error!(e; "Failed to handle flight record batches")); - timer.observe_duration(); - let result = result - .map(|x| DoPutResponse::new(request_id, x)) - .map_err(Into::into); - if let Err(e)= result_sender.try_send(result) - && let TrySendError::Closed(_) = e { - warn!(r#""DoPut" client with request_id {} maybe unreachable, abort handling its message"#, request_id); + if let Err(e) = + result_sender.try_send(result.map_err(|e| Status::from_error(Box::new(e)))) + && let TrySendError::Closed(_) = e + { + warn!(r#""DoPut" client maybe unreachable, abort handling its message"#); break; } } diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index 8fa658b6bb..2d2b3a4320 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -13,6 +13,7 @@ // limitations under the License. 
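As a self-contained illustration of the schema-first contract that the refactored `PutRecordBatchRequestStream` above enforces, the sketch below models the `Init` → `Ready` transition with stand-in types (`Msg` and `State` are not the crate's real `FlightMessage`/`StreamState`): the first message must carry the schema, record batches then flow through one by one, and dictionary batches are absorbed by the decoder.

```rust
enum Msg {
    Schema(&'static str),
    RecordBatch(Vec<i64>),
    Dictionary,
}

enum State {
    Init,
    Ready,
}

fn step(state: &mut State, msg: Msg) -> Result<Option<Vec<i64>>, &'static str> {
    match state {
        State::Init => match msg {
            // The first message only carries schema information and
            // transitions the stream into the Ready state.
            Msg::Schema(_) => {
                *state = State::Ready;
                Ok(None)
            }
            _ => Err("first message must be a Schema message"),
        },
        State::Ready => match msg {
            // Subsequent record batches are yielded to the handler.
            Msg::RecordBatch(rows) => Ok(Some(rows)),
            // Dictionary batches are consumed internally; keep polling.
            Msg::Dictionary => Ok(None),
            Msg::Schema(_) => Err("unexpected schema after the stream started"),
        },
    }
}

fn main() {
    let mut state = State::Init;
    assert_eq!(step(&mut state, Msg::Schema("ts: timestamp(ms)")), Ok(None));
    assert_eq!(
        step(&mut state, Msg::RecordBatch(vec![1, 2, 3])),
        Ok(Some(vec![1, 2, 3]))
    );
}
```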
use std::collections::HashMap; +use std::convert::Infallible; use std::fmt::Display; use std::net::SocketAddr; use std::sync::Mutex as StdMutex; @@ -20,9 +21,10 @@ use std::time::Duration; use async_trait::async_trait; use auth::UserProviderRef; -use axum::extract::DefaultBodyLimit; +use axum::extract::{DefaultBodyLimit, Request}; use axum::http::StatusCode as HttpStatusCode; use axum::response::{IntoResponse, Response}; +use axum::routing::Route; use axum::serve::ListenerExt; use axum::{Router, middleware, routing}; use common_base::Plugins; @@ -32,9 +34,7 @@ use common_telemetry::{debug, error, info}; use common_time::Timestamp; use common_time::timestamp::TimeUnit; use datatypes::data_type::DataType; -use datatypes::prelude::ConcreteDataType; use datatypes::schema::SchemaRef; -use datatypes::types::jsonb_to_serde_json; use event::{LogState, LogValidatorRef}; use futures::FutureExt; use http::{HeaderValue, Method}; @@ -44,7 +44,8 @@ use serde_json::Value; use snafu::{ResultExt, ensure}; use tokio::sync::Mutex; use tokio::sync::oneshot::{self, Sender}; -use tower::ServiceBuilder; +use tonic::codegen::Service; +use tower::{Layer, ServiceBuilder}; use tower_http::compression::CompressionLayer; use tower_http::cors::{AllowOrigin, Any, CorsLayer}; use tower_http::decompression::RequestDecompressionLayer; @@ -52,11 +53,11 @@ use tower_http::trace::TraceLayer; use self::authorize::AuthState; use self::result::table_result::TableResponse; -use crate::configurator::ConfiguratorRef; +use crate::configurator::HttpConfiguratorRef; use crate::elasticsearch; use crate::error::{ - AddressBindSnafu, AlreadyStartedSnafu, ConvertSqlValueSnafu, Error, InternalIoSnafu, - InvalidHeaderValueSnafu, Result, ToJsonSnafu, + AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu, + OtherSnafu, Result, }; use crate::http::influxdb::{influxdb_health, influxdb_ping, influxdb_write_v1, influxdb_write_v2}; use crate::http::otlp::OtlpState; @@ -89,6 +90,7 @@ pub mod authorize; #[cfg(feature = "dashboard")] mod dashboard; pub mod dyn_log; +pub mod dyn_trace; pub mod event; pub mod extractor; pub mod handler; @@ -108,6 +110,7 @@ pub mod result; mod timeout; pub mod utils; +use result::HttpOutputWriter; pub(crate) use timeout::DynamicTimeoutLayer; mod hints; @@ -297,30 +300,10 @@ impl HttpRecordsOutput { } else { let num_rows = recordbatches.iter().map(|r| r.num_rows()).sum::(); let mut rows = Vec::with_capacity(num_rows); - let schemas = schema.column_schemas(); - let num_cols = schema.column_schemas().len(); - rows.resize_with(num_rows, || Vec::with_capacity(num_cols)); - let mut finished_row_cursor = 0; for recordbatch in recordbatches { - for (col_idx, col) in recordbatch.columns().iter().enumerate() { - // safety here: schemas length is equal to the number of columns in the recordbatch - let schema = &schemas[col_idx]; - for row_idx in 0..recordbatch.num_rows() { - let value = col.get(row_idx); - // TODO(sunng87): is this duplicated with `map_json_type_to_string` in recordbatch? - let value = if let ConcreteDataType::Json(_json_type) = &schema.data_type - && let datatypes::value::Value::Binary(bytes) = value - { - jsonb_to_serde_json(bytes.as_ref()).context(ConvertSqlValueSnafu)? - } else { - serde_json::Value::try_from(col.get(row_idx)).context(ToJsonSnafu)? 
- }; - - rows[row_idx + finished_row_cursor].push(value); - } - } - finished_row_cursor += recordbatch.num_rows(); + let mut writer = HttpOutputWriter::new(schema.num_columns(), None); + writer.write(recordbatch, &mut rows)?; } Ok(HttpRecordsOutput { @@ -752,6 +735,20 @@ impl HttpServerBuilder { } } + pub fn add_layer(self, layer: L) -> Self + where + L: Layer + Clone + Send + Sync + 'static, + L::Service: Service + Clone + Send + Sync + 'static, + >::Response: IntoResponse + 'static, + >::Error: Into + 'static, + >::Future: Send + 'static, + { + Self { + router: self.router.layer(layer), + ..self + } + } + pub fn build(self) -> HttpServer { let memory_limiter = RequestMemoryLimiter::new(self.options.max_total_body_memory.as_bytes() as usize); @@ -908,6 +905,7 @@ impl HttpServer { Router::new() // handler for changing log level dynamically .route("/log_level", routing::post(dyn_log::dyn_log_handler)) + .route("/enable_trace", routing::post(dyn_trace::dyn_trace_handler)) .nest( "/prof", Router::new() @@ -1225,8 +1223,11 @@ impl Server for HttpServer { ); let mut app = self.make_app(); - if let Some(configurator) = self.plugins.get::() { - app = configurator.config_http(app); + if let Some(configurator) = self.plugins.get::>() { + app = configurator + .configure_http(app, ()) + .await + .context(OtherSnafu)?; } let app = self.build(app)?; let listener = tokio::net::TcpListener::bind(listening) diff --git a/src/servers/src/http/dyn_log.rs b/src/servers/src/http/dyn_log.rs index b82ecdadd6..e9a58c2d74 100644 --- a/src/servers/src/http/dyn_log.rs +++ b/src/servers/src/http/dyn_log.rs @@ -15,7 +15,7 @@ use axum::http::StatusCode; use axum::response::IntoResponse; use common_telemetry::tracing_subscriber::filter; -use common_telemetry::{RELOAD_HANDLE, info}; +use common_telemetry::{LOG_RELOAD_HANDLE, info}; use snafu::OptionExt; use crate::error::{InternalSnafu, InvalidParameterSnafu, Result}; @@ -29,7 +29,7 @@ pub async fn dyn_log_handler(level: String) -> Result { .build() })?; let mut old_filter = None; - RELOAD_HANDLE + LOG_RELOAD_HANDLE .get() .context(InternalSnafu { err_msg: "Reload handle not initialized", diff --git a/src/servers/src/http/dyn_trace.rs b/src/servers/src/http/dyn_trace.rs new file mode 100644 index 0000000000..dcdb74c56a --- /dev/null +++ b/src/servers/src/http/dyn_trace.rs @@ -0,0 +1,54 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
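The `/enable_trace` route registered above is served by the `dyn_trace_handler` defined in the new `dyn_trace.rs` file that follows. A hedged client-side sketch of toggling it at runtime; the `/debug` prefix and port 4000 are assumptions based on how the existing debug routes are mounted, and the request body must parse as a `bool`:

```rust
// Hypothetical smoke test for the dynamic tracing toggle.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();
    let resp = client
        .post("http://127.0.0.1:4000/debug/enable_trace")
        .body("true") // send "false" to disable tracing again
        .send()
        .await?;
    // Expect 200 with "trace enabled", or 503 if the trace reload handle
    // or tracer is not initialized.
    println!("{} {}", resp.status(), resp.text().await?);
    Ok(())
}
```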
+ +use axum::http::StatusCode; +use axum::response::IntoResponse; +use common_telemetry::{TRACE_RELOAD_HANDLE, get_or_init_tracer, info}; + +use crate::error::{InvalidParameterSnafu, Result}; + +#[axum_macros::debug_handler] +pub async fn dyn_trace_handler(enable_str: String) -> Result { + let enable = enable_str.parse::().map_err(|e| { + InvalidParameterSnafu { + reason: format!("Invalid parameter \"enable\": {e:?}"), + } + .build() + })?; + + let Some(trace_reload_handle) = TRACE_RELOAD_HANDLE.get() else { + return Ok(( + StatusCode::SERVICE_UNAVAILABLE, + "trace reload handle is not initialized".to_string(), + )); + }; + + if enable { + let tracer = match get_or_init_tracer() { + Ok(tracer) => tracer, + Err(reason) => { + return Ok((StatusCode::SERVICE_UNAVAILABLE, reason.to_string())); + } + }; + + let trace_layer = tracing_opentelemetry::layer().with_tracer(tracer); + trace_reload_handle.reload(Some(trace_layer)); + info!("trace enabled"); + Ok((StatusCode::OK, "trace enabled".to_string())) + } else { + trace_reload_handle.reload(None); + info!("trace disabled"); + Ok((StatusCode::OK, "trace disabled".to_string())) + } +} diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 24bb844dc7..2390e374a1 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -31,7 +31,7 @@ use axum_extra::TypedHeader; use common_catalog::consts::default_engine; use common_error::ext::{BoxedError, ErrorExt}; use common_query::{Output, OutputData}; -use common_telemetry::{error, warn}; +use common_telemetry::{debug, error, warn}; use headers::ContentType; use lazy_static::lazy_static; use mime_guess::mime; @@ -738,6 +738,11 @@ pub async fn log_ingester( let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?; + debug!( + "receiving logs: {:?}", + serde_json::to_string(&value).unwrap() + ); + query_ctx.set_channel(Channel::Log); let query_ctx = Arc::new(query_ctx); diff --git a/src/servers/src/http/jaeger.rs b/src/servers/src/http/jaeger.rs index 39bea1ac92..148e2ac77a 100644 --- a/src/servers/src/http/jaeger.rs +++ b/src/servers/src/http/jaeger.rs @@ -21,12 +21,14 @@ use axum::Extension; use axum::extract::{Path, Query, State}; use axum::http::{HeaderMap, StatusCode as HttpStatusCode}; use axum::response::IntoResponse; +use axum_extra::TypedHeader; use common_catalog::consts::{PARENT_SPAN_ID_COLUMN, TRACE_TABLE_NAME}; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; use common_recordbatch::util; use common_telemetry::{debug, error, tracing, warn}; +use headers::UserAgent; use serde::{Deserialize, Deserializer, Serialize, de}; use serde_json::Value as JsonValue; use session::context::{Channel, QueryContext}; @@ -54,6 +56,9 @@ pub const JAEGER_QUERY_TABLE_NAME_KEY: &str = "jaeger_query_table_name"; const REF_TYPE_CHILD_OF: &str = "CHILD_OF"; const SPAN_KIND_TIME_FMTS: [&str; 2] = ["%Y-%m-%d %H:%M:%S%.6f%z", "%Y-%m-%d %H:%M:%S%.9f%z"]; +const TRACE_NOT_FOUND_ERROR_CODE: i32 = 404; +const TRACE_NOT_FOUND_ERROR_MSG: &str = "trace not found"; + /// JaegerAPIResponse is the response of Jaeger HTTP API. /// The original version is `structuredResponse` which is defined in https://github.com/jaegertracing/jaeger/blob/main/cmd/query/app/http_handler.go. 
#[derive(Default, Debug, Serialize, Deserialize, PartialEq)] @@ -65,6 +70,22 @@ pub struct JaegerAPIResponse { pub errors: Vec, } +impl JaegerAPIResponse { + pub fn trace_not_found() -> Self { + Self { + data: None, + total: 0, + limit: 0, + offset: 0, + errors: vec![JaegerAPIError { + code: TRACE_NOT_FOUND_ERROR_CODE, + msg: TRACE_NOT_FOUND_ERROR_MSG.to_string(), + trace_id: None, + }], + } + } +} + /// JaegerData is the query result of Jaeger HTTP API. #[derive(Debug, Serialize, Deserialize, PartialEq)] #[serde(untagged)] @@ -340,6 +361,30 @@ pub struct QueryTraceParams { pub end_time: Option, pub min_duration: Option, pub max_duration: Option, + + // The user agent of the trace query, mainly find traces + pub user_agent: TraceUserAgent, +} + +#[derive(Debug, Default, PartialEq, Eq)] +pub enum TraceUserAgent { + Grafana, + // Jaeger-UI does not actually send user agent + // But it's a jaeger API, so let's treat it as jaeger + #[default] + Jaeger, +} + +impl From for TraceUserAgent { + fn from(value: UserAgent) -> Self { + let ua_str = value.as_str().to_lowercase(); + debug!("received user agent: {}", ua_str); + if ua_str.contains("grafana") { + Self::Grafana + } else { + Self::Jaeger + } + } } /// Handle the GET `/api/services` request. @@ -427,7 +472,13 @@ pub async fn handle_get_trace( let end_time_ns = query_params.end.map(|end_us| end_us * 1000); let output = match handler - .get_trace(query_ctx, &trace_id, start_time_ns, end_time_ns) + .get_trace( + query_ctx, + &trace_id, + start_time_ns, + end_time_ns, + query_params.limit, + ) .await { Ok(output) => output, @@ -442,6 +493,10 @@ pub async fn handle_get_trace( match covert_to_records(output).await { Ok(Some(records)) => match traces_from_records(records) { + Ok(traces) if traces.is_empty() => ( + HttpStatusCode::NOT_FOUND, + axum::Json(JaegerAPIResponse::trace_not_found()), + ), Ok(traces) => ( HttpStatusCode::OK, axum::Json(JaegerAPIResponse { @@ -454,7 +509,10 @@ pub async fn handle_get_trace( error_response(err) } }, - Ok(None) => (HttpStatusCode::OK, axum::Json(JaegerAPIResponse::default())), + Ok(None) => ( + HttpStatusCode::NOT_FOUND, + axum::Json(JaegerAPIResponse::trace_not_found()), + ), Err(err) => { error!("Failed to get trace '{}': {:?}", trace_id, err); error_response(err) @@ -470,6 +528,7 @@ pub async fn handle_find_traces( Query(query_params): Query, Extension(mut query_ctx): Extension, TraceTableName(table_name): TraceTableName, + optional_user_agent: Option>, ) -> impl IntoResponse { debug!( "Received Jaeger '/api/traces' request, query_params: {:?}, query_ctx: {:?}", @@ -486,7 +545,10 @@ pub async fn handle_find_traces( .start_timer(); match QueryTraceParams::from_jaeger_query_params(query_params) { - Ok(query_params) => { + Ok(mut query_params) => { + if let Some(TypedHeader(user_agent)) = optional_user_agent { + query_params.user_agent = user_agent.into(); + } let output = handler.find_traces(query_ctx, query_params).await; match output { Ok(output) => match covert_to_records(output).await { @@ -1565,6 +1627,7 @@ mod tests { ("http.method".to_string(), JsonValue::String("GET".to_string())), ("http.path".to_string(), JsonValue::String("/api/v1/users".to_string())), ])), + user_agent: TraceUserAgent::Jaeger, }, ), ]; diff --git a/src/servers/src/http/loki.rs b/src/servers/src/http/loki.rs index f10ab53190..e6f1b064a3 100644 --- a/src/servers/src/http/loki.rs +++ b/src/servers/src/http/loki.rs @@ -43,7 +43,7 @@ use snafu::{OptionExt, ResultExt, ensure}; use vrl::value::{KeyString, Value as VrlValue}; use 
crate::error::{ - DecodeOtlpRequestSnafu, InvalidLokiLabelsSnafu, InvalidLokiPayloadSnafu, ParseJsonSnafu, + DecodeLokiRequestSnafu, InvalidLokiLabelsSnafu, InvalidLokiPayloadSnafu, ParseJsonSnafu, PipelineSnafu, Result, UnsupportedContentTypeSnafu, }; use crate::http::HttpResponse; @@ -492,7 +492,7 @@ impl LokiPbParser { pub fn from_bytes(bytes: Bytes) -> Result { let decompressed = prom_store::snappy_decompress(&bytes).unwrap(); let req = loki_proto::logproto::PushRequest::decode(&decompressed[..]) - .context(DecodeOtlpRequestSnafu)?; + .context(DecodeLokiRequestSnafu)?; Ok(Self { streams: req.streams.into(), diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index fc0656cf0e..4fd2d42122 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -18,9 +18,12 @@ use axum::Extension; use axum::extract::State; use axum::http::header; use axum::response::IntoResponse; +use axum_extra::TypedHeader; use bytes::Bytes; use common_catalog::consts::{TRACE_TABLE_NAME, TRACE_TABLE_NAME_SESSION_KEY}; use common_telemetry::tracing; +use headers::ContentType; +use mime_guess::mime; use opentelemetry_proto::tonic::collector::logs::v1::{ ExportLogsServiceRequest, ExportLogsServiceResponse, }; @@ -39,11 +42,26 @@ use crate::error::{self, PipelineSnafu, Result}; use crate::http::extractor::{ LogTableName, OtlpMetricOptions, PipelineInfo, SelectInfoWrapper, TraceTableName, }; -// use crate::http::header::constants::GREPTIME_METRICS_LEGACY_MODE_HEADER_NAME; use crate::http::header::{CONTENT_TYPE_PROTOBUF, write_cost_header_map}; use crate::metrics::METRIC_HTTP_OPENTELEMETRY_LOGS_ELAPSED; use crate::query_handler::{OpenTelemetryProtocolHandlerRef, PipelineHandler}; +fn is_json_content_type(content_type: Option<&ContentType>) -> bool { + match content_type { + None => false, + Some(ct) => { + let mime: mime::Mime = ct.clone().into(); + mime.subtype() == mime::JSON + } + } +} + +fn content_type_to_string(content_type: Option<&TypedHeader>) -> String { + content_type + .map(|h| h.0.to_string()) + .unwrap_or_else(|| "not specified".to_string()) +} + #[derive(Clone)] pub struct OtlpState { pub with_metric_engine: bool, @@ -56,16 +74,24 @@ pub async fn metrics( State(state): State, Extension(mut query_ctx): Extension, http_opts: OtlpMetricOptions, + content_type: Option>, bytes: Bytes, ) -> Result> { + if is_json_content_type(content_type.as_ref().map(|h| &h.0)) { + return error::UnsupportedJsonContentTypeSnafu {}.fail(); + } + let db = query_ctx.get_db_string(); query_ctx.set_channel(Channel::Otlp); let _timer = crate::metrics::METRIC_HTTP_OPENTELEMETRY_METRICS_ELAPSED .with_label_values(&[db.as_str()]) .start_timer(); - let request = - ExportMetricsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; + let request = ExportMetricsServiceRequest::decode(bytes).with_context(|_| { + error::DecodeOtlpRequestSnafu { + content_type: content_type_to_string(content_type.as_ref()), + } + })?; let OtlpState { with_metric_engine, @@ -101,8 +127,13 @@ pub async fn traces( TraceTableName(table_name): TraceTableName, pipeline_info: PipelineInfo, Extension(mut query_ctx): Extension, + content_type: Option>, bytes: Bytes, ) -> Result> { + if is_json_content_type(content_type.as_ref().map(|h| &h.0)) { + return error::UnsupportedJsonContentTypeSnafu {}.fail(); + } + let db = query_ctx.get_db_string(); let table_name = table_name.unwrap_or_else(|| TRACE_TABLE_NAME.to_string()); @@ -113,8 +144,11 @@ pub async fn traces( let _timer = 
crate::metrics::METRIC_HTTP_OPENTELEMETRY_TRACES_ELAPSED .with_label_values(&[db.as_str()]) .start_timer(); - let request = - ExportTraceServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; + let request = ExportTraceServiceRequest::decode(bytes).with_context(|_| { + error::DecodeOtlpRequestSnafu { + content_type: content_type_to_string(content_type.as_ref()), + } + })?; let pipeline = PipelineWay::from_name_and_default( pipeline_info.pipeline_name.as_deref(), @@ -157,8 +191,13 @@ pub async fn logs( pipeline_info: PipelineInfo, LogTableName(tablename): LogTableName, SelectInfoWrapper(select_info): SelectInfoWrapper, + content_type: Option>, bytes: Bytes, ) -> Result> { + if is_json_content_type(content_type.as_ref().map(|h| &h.0)) { + return error::UnsupportedJsonContentTypeSnafu {}.fail(); + } + let tablename = tablename.unwrap_or_else(|| "opentelemetry_logs".to_string()); let db = query_ctx.get_db_string(); query_ctx.set_channel(Channel::Otlp); @@ -166,7 +205,11 @@ pub async fn logs( let _timer = METRIC_HTTP_OPENTELEMETRY_LOGS_ELAPSED .with_label_values(&[db.as_str()]) .start_timer(); - let request = ExportLogsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; + let request = ExportLogsServiceRequest::decode(bytes).with_context(|_| { + error::DecodeOtlpRequestSnafu { + content_type: content_type_to_string(content_type.as_ref()), + } + })?; let pipeline = PipelineWay::from_name_and_default( pipeline_info.pipeline_name.as_deref(), diff --git a/src/servers/src/http/prometheus.rs b/src/servers/src/http/prometheus.rs index f9d1e1c21b..26a91d51fa 100644 --- a/src/servers/src/http/prometheus.rs +++ b/src/servers/src/http/prometheus.rs @@ -19,16 +19,13 @@ use std::collections::{BTreeMap, HashMap, HashSet}; use std::hash::{Hash, Hasher}; use std::sync::Arc; -use arrow::array::AsArray; +use arrow::array::{Array, AsArray}; use arrow::datatypes::{ - Date32Type, Date64Type, Decimal128Type, DurationMicrosecondType, DurationMillisecondType, - DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, + Date32Type, Date64Type, Decimal128Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, - Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, + UInt8Type, UInt16Type, UInt32Type, UInt64Type, }; -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{DataType, IntervalUnit}; use axum::extract::{Path, Query, State}; use axum::{Extension, Form}; use catalog::CatalogManagerRef; @@ -39,18 +36,13 @@ use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; use common_recordbatch::{RecordBatch, RecordBatches}; use common_telemetry::{debug, tracing}; -use common_time::time::Time; use common_time::util::{current_time_rfc3339, yesterday_rfc3339}; -use common_time::{ - Date, Duration, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp, -}; +use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth}; use common_version::OwnedBuildInfo; use datafusion_common::ScalarValue; use datatypes::prelude::ConcreteDataType; -use datatypes::scalars::ScalarVector; use datatypes::schema::{ColumnSchema, SchemaRef}; use datatypes::types::jsonb_to_string; -use datatypes::vectors::Float64Vector; use 
futures::StreamExt; use futures::future::join_all; use itertools::Itertools; @@ -950,47 +942,12 @@ impl RowWriter { let v = Date::new((array.value(i) / 86_400_000) as i32); self.insert(column, v); } - DataType::Timestamp(time_unit, _) => { - let v = match time_unit { - TimeUnit::Second => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = array.as_primitive::(); - array.value(i) - } - }; - let v = Timestamp::new(v, time_unit.into()); + DataType::Timestamp(_, _) => { + let v = datatypes::arrow_array::timestamp_array_value(array, i); self.insert(column, v.to_iso8601_string()); } - DataType::Time32(time_unit) | DataType::Time64(time_unit) => { - let v = match time_unit { - TimeUnit::Second => { - let array = array.as_primitive::(); - Time::new_second(array.value(i) as i64) - } - TimeUnit::Millisecond => { - let array = array.as_primitive::(); - Time::new_millisecond(array.value(i) as i64) - } - TimeUnit::Microsecond => { - let array = array.as_primitive::(); - Time::new_microsecond(array.value(i)) - } - TimeUnit::Nanosecond => { - let array = array.as_primitive::(); - Time::new_nanosecond(array.value(i)) - } - }; + DataType::Time32(_) | DataType::Time64(_) => { + let v = datatypes::arrow_array::time_array_value(array, i); self.insert(column, v.to_iso8601_string()); } DataType::Interval(interval_unit) => match interval_unit { @@ -1010,26 +967,8 @@ impl RowWriter { self.insert(column, v.to_iso8601_string()); } }, - DataType::Duration(time_unit) => { - let v = match time_unit { - TimeUnit::Second => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = array.as_primitive::(); - array.value(i) - } - }; - let d = Duration::new(v, time_unit.into()); + DataType::Duration(_) => { + let d = datatypes::arrow_array::duration_array_value(array, i); self.insert(column, d); } DataType::List(_) => { @@ -1134,20 +1073,14 @@ fn record_batches_to_labels_name( let field_columns = field_column_indices .iter() .map(|i| { - batch - .column(*i) - .as_any() - .downcast_ref::() - .unwrap() + let column = batch.column(*i); + column.as_primitive::() }) .collect::>(); for row_index in 0..batch.num_rows() { // if all field columns are null, skip this row - if field_columns - .iter() - .all(|c| c.get_data(row_index).is_none()) - { + if field_columns.iter().all(|c| c.is_null(row_index)) { continue; } diff --git a/src/servers/src/http/result.rs b/src/servers/src/http/result.rs index dbad6dc3bc..ec3f6120d0 100644 --- a/src/servers/src/http/result.rs +++ b/src/servers/src/http/result.rs @@ -12,6 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use arrow::array::AsArray; +use arrow::datatypes::{ + Date32Type, Date64Type, Decimal128Type, Float32Type, Float64Type, Int8Type, Int16Type, + Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, + UInt8Type, UInt16Type, UInt32Type, UInt64Type, +}; +use arrow_schema::{DataType, IntervalUnit}; +use common_decimal::Decimal128; +use common_recordbatch::RecordBatch; +use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth}; +use datafusion_common::ScalarValue; +use datatypes::data_type::ConcreteDataType; +use datatypes::value::Value; +use snafu::ResultExt; + +use crate::error::{ + ConvertScalarValueSnafu, DataFusionSnafu, NotSupportedSnafu, Result, ToJsonSnafu, + UnexpectedResultSnafu, +}; + pub(crate) mod arrow_result; pub(crate) mod csv_result; pub mod error_result; @@ -22,3 +42,237 @@ pub(crate) mod json_result; pub(crate) mod null_result; pub(crate) mod prometheus_resp; pub(crate) mod table_result; + +pub struct HttpOutputWriter { + columns: usize, + value_transformer: Option Value>>, + current: Option>, +} + +impl HttpOutputWriter { + pub fn new(columns: usize, value_transformer: Option Value>>) -> Self { + Self { + columns, + value_transformer, + current: None, + } + } + + fn write_bytes(&mut self, bytes: &[u8], datatype: &ConcreteDataType) -> Result<()> { + if datatype.is_json() { + let value = datatypes::types::jsonb_to_serde_json(bytes).map_err(|e| { + UnexpectedResultSnafu { + reason: format!("corrupted jsonb data: {bytes:?}, error: {e}"), + } + .build() + })?; + self.push(value); + Ok(()) + } else { + self.write_value(bytes) + } + } + + fn write_value(&mut self, value: impl Into) -> Result<()> { + let value = value.into(); + + let value = if let Some(f) = &self.value_transformer { + f(value) + } else { + value + }; + + let value = serde_json::Value::try_from(value).context(ToJsonSnafu)?; + self.push(value); + Ok(()) + } + + fn push(&mut self, value: serde_json::Value) { + let current = self + .current + .get_or_insert_with(|| Vec::with_capacity(self.columns)); + current.push(value); + } + + fn finish(&mut self) -> Vec { + self.current.take().unwrap_or_default() + } + + pub fn write( + &mut self, + record_batch: RecordBatch, + rows: &mut Vec>, + ) -> Result<()> { + let schema = record_batch.schema.clone(); + let record_batch = record_batch.into_df_record_batch(); + for i in 0..record_batch.num_rows() { + for (schema, array) in schema + .column_schemas() + .iter() + .zip(record_batch.columns().iter()) + { + if array.is_null(i) { + self.write_value(Value::Null)?; + continue; + } + + match array.data_type() { + DataType::Null => { + self.write_value(Value::Null)?; + } + DataType::Boolean => { + let array = array.as_boolean(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::UInt8 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::UInt16 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::UInt32 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::UInt64 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Int8 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Int16 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Int32 => { + let array = array.as_primitive::(); + let 
v = array.value(i); + self.write_value(v)?; + } + DataType::Int64 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Float32 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Float64 => { + let array = array.as_primitive::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Utf8 => { + let array = array.as_string::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::LargeUtf8 => { + let array = array.as_string::(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Utf8View => { + let array = array.as_string_view(); + let v = array.value(i); + self.write_value(v)?; + } + DataType::Binary => { + let array = array.as_binary::(); + let v = array.value(i); + self.write_bytes(v, &schema.data_type)?; + } + DataType::LargeBinary => { + let array = array.as_binary::(); + let v = array.value(i); + self.write_bytes(v, &schema.data_type)?; + } + DataType::BinaryView => { + let array = array.as_binary_view(); + let v = array.value(i); + self.write_bytes(v, &schema.data_type)?; + } + DataType::Date32 => { + let array = array.as_primitive::(); + let v = Date::new(array.value(i)); + self.write_value(v)?; + } + DataType::Date64 => { + let array = array.as_primitive::(); + // `Date64` values are milliseconds representation of `Date32` values, + // according to its specification. So we convert the `Date64` value here to + // the `Date32` value to process them unified. + let v = Date::new((array.value(i) / 86_400_000) as i32); + self.write_value(v)?; + } + DataType::Timestamp(_, _) => { + let ts = datatypes::arrow_array::timestamp_array_value(array, i); + self.write_value(ts)?; + } + DataType::Time32(_) | DataType::Time64(_) => { + let v = datatypes::arrow_array::time_array_value(array, i); + self.write_value(v)?; + } + DataType::Interval(interval_unit) => match interval_unit { + IntervalUnit::YearMonth => { + let array = array.as_primitive::(); + let v: IntervalYearMonth = array.value(i).into(); + self.write_value(v)?; + } + IntervalUnit::DayTime => { + let array = array.as_primitive::(); + let v: IntervalDayTime = array.value(i).into(); + self.write_value(v)?; + } + IntervalUnit::MonthDayNano => { + let array = array.as_primitive::(); + let v: IntervalMonthDayNano = array.value(i).into(); + self.write_value(v)?; + } + }, + DataType::Duration(_) => { + let d = datatypes::arrow_array::duration_array_value(array, i); + self.write_value(d)?; + } + DataType::List(_) => { + let v = ScalarValue::try_from_array(array, i).context(DataFusionSnafu)?; + let v: Value = v.try_into().context(ConvertScalarValueSnafu)?; + self.write_value(v)?; + } + DataType::Struct(_) => { + let v = ScalarValue::try_from_array(array, i).context(DataFusionSnafu)?; + let v: Value = v.try_into().context(ConvertScalarValueSnafu)?; + self.write_value(v)?; + } + DataType::Decimal128(precision, scale) => { + let array = array.as_primitive::(); + let v = Decimal128::new(array.value(i), *precision, *scale); + self.write_value(v)?; + } + _ => { + return NotSupportedSnafu { + feat: format!("convert {} to http output value", array.data_type()), + } + .fail(); + } + } + } + + rows.push(self.finish()) + } + Ok(()) + } +} diff --git a/src/servers/src/http/result/influxdb_result_v1.rs b/src/servers/src/http/result/influxdb_result_v1.rs index e5f11d8aba..cc374aba70 100644 --- a/src/servers/src/http/result/influxdb_result_v1.rs +++ b/src/servers/src/http/result/influxdb_result_v1.rs @@ -12,35 
+12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -use arrow::array::AsArray; -use arrow::datatypes::{ - Date32Type, Date64Type, Decimal128Type, DurationMicrosecondType, DurationMillisecondType, - DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, - Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, - Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, -}; -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use axum::Json; use axum::http::HeaderValue; use axum::response::{IntoResponse, Response}; -use common_decimal::Decimal128; use common_query::{Output, OutputData}; use common_recordbatch::{RecordBatch, util}; -use common_time::time::Time; -use common_time::{ - Date, Duration, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp, -}; -use datafusion_common::ScalarValue; use serde::{Deserialize, Serialize}; use serde_json::Value; -use snafu::ResultExt; -use crate::error::{ - ConvertScalarValueSnafu, DataFusionSnafu, Error, NotSupportedSnafu, Result, ToJsonSnafu, -}; +use crate::error::{Error, Result}; use crate::http::header::{GREPTIME_DB_HEADER_EXECUTION_TIME, GREPTIME_DB_HEADER_FORMAT}; +use crate::http::result::HttpOutputWriter; use crate::http::result::error_result::ErrorResponse; use crate::http::{Epoch, HttpResponse, ResponseFormat}; @@ -84,8 +66,8 @@ impl TryFrom<(Option, Vec)> for InfluxdbRecordsOutput { } else { // Safety: ensured by previous empty check let first = &recordbatches[0]; - let columns = first - .schema + let schema = first.schema.clone(); + let columns = schema .column_schemas() .iter() .map(|cs| cs.name.clone()) @@ -94,8 +76,23 @@ impl TryFrom<(Option, Vec)> for InfluxdbRecordsOutput { let mut rows = Vec::with_capacity(recordbatches.iter().map(|r| r.num_rows()).sum::()); + let value_transformer = + move |value: datatypes::value::Value| -> datatypes::value::Value { + match (value, epoch) { + (datatypes::value::Value::Timestamp(ts), Some(epoch)) => { + if let Some(converted) = epoch.convert_timestamp(ts) { + datatypes::value::Value::Timestamp(converted) + } else { + datatypes::value::Value::Timestamp(ts) + } + } + (value, _) => value, + } + }; + for recordbatch in recordbatches { - let mut writer = RowWriter::new(epoch, recordbatch.num_columns()); + let mut writer = + HttpOutputWriter::new(schema.num_columns(), Some(Box::new(value_transformer))); writer.write(recordbatch, &mut rows)?; } @@ -104,266 +101,6 @@ impl TryFrom<(Option, Vec)> for InfluxdbRecordsOutput { } } -struct RowWriter { - epoch: Option, - columns: usize, - current: Option>, -} - -impl RowWriter { - fn new(epoch: Option, columns: usize) -> Self { - Self { - epoch, - columns, - current: None, - } - } - - fn push(&mut self, value: impl Into) -> Result<()> { - let value = value.into(); - - let current = self - .current - .get_or_insert_with(|| Vec::with_capacity(self.columns)); - let value = Value::try_from(value).context(ToJsonSnafu)?; - current.push(value); - Ok(()) - } - - fn finish(&mut self) -> Vec { - self.current.take().unwrap_or_default() - } - - fn write(&mut self, record_batch: RecordBatch, rows: &mut Vec>) -> Result<()> { - let record_batch = record_batch.into_df_record_batch(); - for i in 0..record_batch.num_rows() { - for array in 
record_batch.columns().iter() { - if array.is_null(i) { - self.push(datatypes::value::Value::Null)?; - continue; - } - - match array.data_type() { - DataType::Null => { - self.push(datatypes::value::Value::Null)?; - } - DataType::Boolean => { - let array = array.as_boolean(); - let v = array.value(i); - self.push(v)?; - } - DataType::UInt8 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::UInt16 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::UInt32 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::UInt64 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Int8 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Int16 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Int32 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Int64 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Float32 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Float64 => { - let array = array.as_primitive::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Utf8 => { - let array = array.as_string::(); - let v = array.value(i); - self.push(v)?; - } - DataType::LargeUtf8 => { - let array = array.as_string::(); - let v = array.value(i); - self.push(v)?; - } - DataType::Utf8View => { - let array = array.as_string_view(); - let v = array.value(i); - self.push(v)?; - } - DataType::Binary => { - let array = array.as_binary::(); - let v = array.value(i); - self.push(v)?; - } - DataType::LargeBinary => { - let array = array.as_binary::(); - let v = array.value(i); - self.push(v)?; - } - DataType::BinaryView => { - let array = array.as_binary_view(); - let v = array.value(i); - self.push(v)?; - } - DataType::Date32 => { - let array = array.as_primitive::(); - let v = Date::new(array.value(i)); - self.push(v)?; - } - DataType::Date64 => { - let array = array.as_primitive::(); - // `Date64` values are milliseconds representation of `Date32` values, - // according to its specification. So we convert the `Date64` value here to - // the `Date32` value to process them unified. 
- let v = Date::new((array.value(i) / 86_400_000) as i32); - self.push(v)?; - } - DataType::Timestamp(time_unit, _) => { - let v = match time_unit { - TimeUnit::Second => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = array.as_primitive::(); - array.value(i) - } - }; - let mut ts = Timestamp::new(v, time_unit.into()); - if let Some(epoch) = self.epoch - && let Some(converted) = epoch.convert_timestamp(ts) - { - ts = converted; - } - self.push(ts)?; - } - DataType::Time32(time_unit) | DataType::Time64(time_unit) => { - let v = match time_unit { - TimeUnit::Second => { - let array = array.as_primitive::(); - Time::new_second(array.value(i) as i64) - } - TimeUnit::Millisecond => { - let array = array.as_primitive::(); - Time::new_millisecond(array.value(i) as i64) - } - TimeUnit::Microsecond => { - let array = array.as_primitive::(); - Time::new_microsecond(array.value(i)) - } - TimeUnit::Nanosecond => { - let array = array.as_primitive::(); - Time::new_nanosecond(array.value(i)) - } - }; - self.push(v)?; - } - DataType::Interval(interval_unit) => match interval_unit { - IntervalUnit::YearMonth => { - let array = array.as_primitive::(); - let v: IntervalYearMonth = array.value(i).into(); - self.push(v)?; - } - IntervalUnit::DayTime => { - let array = array.as_primitive::(); - let v: IntervalDayTime = array.value(i).into(); - self.push(v)?; - } - IntervalUnit::MonthDayNano => { - let array = array.as_primitive::(); - let v: IntervalMonthDayNano = array.value(i).into(); - self.push(v)?; - } - }, - DataType::Duration(time_unit) => { - let v = match time_unit { - TimeUnit::Second => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = array.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = array.as_primitive::(); - array.value(i) - } - }; - let d = Duration::new(v, time_unit.into()); - self.push(d)?; - } - DataType::List(_) => { - let v = ScalarValue::try_from_array(array, i).context(DataFusionSnafu)?; - let v: datatypes::value::Value = - v.try_into().context(ConvertScalarValueSnafu)?; - self.push(v)?; - } - DataType::Struct(_) => { - let v = ScalarValue::try_from_array(array, i).context(DataFusionSnafu)?; - let v: datatypes::value::Value = - v.try_into().context(ConvertScalarValueSnafu)?; - self.push(v)?; - } - DataType::Decimal128(precision, scale) => { - let array = array.as_primitive::(); - let v = Decimal128::new(array.value(i), *precision, *scale); - self.push(v)?; - } - _ => { - return NotSupportedSnafu { - feat: format!("convert {} to influxdb value", array.data_type()), - } - .fail(); - } - } - } - - rows.push(self.finish()) - } - Ok(()) - } -} - #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] pub struct InfluxdbOutput { pub statement_id: u32, diff --git a/src/servers/src/http/result/prometheus_resp.rs b/src/servers/src/http/result/prometheus_resp.rs index 4bf386bbfd..9ecbe671b4 100644 --- a/src/servers/src/http/result/prometheus_resp.rs +++ b/src/servers/src/http/result/prometheus_resp.rs @@ -16,6 +16,9 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap}; +use arrow::array::{Array, AsArray}; +use arrow::datatypes::{Float64Type, TimestampMillisecondType}; +use 
arrow_schema::DataType; use axum::Json; use axum::http::HeaderValue; use axum::response::{IntoResponse, Response}; @@ -24,8 +27,6 @@ use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; use common_recordbatch::RecordBatches; use datatypes::prelude::ConcreteDataType; -use datatypes::scalars::ScalarVector; -use datatypes::vectors::{Float64Vector, StringVector, TimestampMillisecondVector}; use indexmap::IndexMap; use promql_parser::label::METRIC_NAME; use promql_parser::parser::value::ValueType; @@ -34,7 +35,7 @@ use serde_json::Value; use snafu::{OptionExt, ResultExt}; use crate::error::{ - CollectRecordbatchSnafu, Result, UnexpectedResultSnafu, status_code_to_http_status, + ArrowSnafu, CollectRecordbatchSnafu, Result, UnexpectedResultSnafu, status_code_to_http_status, }; use crate::http::header::{GREPTIME_DB_HEADER_METRICS, collect_plan_metrics}; use crate::http::prometheus::{ @@ -247,13 +248,7 @@ impl PrometheusJsonResponse { // prepare things... let tag_columns = tag_column_indices .iter() - .map(|i| { - batch - .column(*i) - .as_any() - .downcast_ref::() - .unwrap() - }) + .map(|i| batch.column(*i).as_string::()) .collect::>(); let tag_names = tag_column_indices .iter() @@ -261,22 +256,18 @@ impl PrometheusJsonResponse { .collect::>(); let timestamp_column = batch .column(timestamp_column_index) - .as_any() - .downcast_ref::() - .unwrap(); - let casted_field_column = batch - .column(first_field_column_index) - .cast(&ConcreteDataType::float64_datatype()) - .unwrap(); - let field_column = casted_field_column - .as_any() - .downcast_ref::() - .unwrap(); + .as_primitive::(); + + let array = + arrow::compute::cast(batch.column(first_field_column_index), &DataType::Float64) + .context(ArrowSnafu)?; + let field_column = array.as_primitive::(); // assemble rows for row_index in 0..batch.num_rows() { // retrieve value - if let Some(v) = field_column.get_data(row_index) { + if field_column.is_valid(row_index) { + let v = field_column.value(row_index); // ignore all NaN values to reduce the amount of data to be sent. if v.is_nan() { continue; @@ -289,14 +280,13 @@ impl PrometheusJsonResponse { } for (tag_column, tag_name) in tag_columns.iter().zip(tag_names.iter()) { // TODO(ruihang): add test for NULL tag - if let Some(tag_value) = tag_column.get_data(row_index) { - tags.push((tag_name, tag_value)); + if tag_column.is_valid(row_index) { + tags.push((tag_name, tag_column.value(row_index))); } } // retrieve timestamp - let timestamp_millis: i64 = - timestamp_column.get_data(row_index).unwrap().into(); + let timestamp_millis = timestamp_column.value(row_index); let timestamp = timestamp_millis as f64 / 1000.0; buffer diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index c73883f0da..40635cd036 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -16,6 +16,7 @@ #![feature(try_blocks)] #![feature(exclusive_wrapper)] #![feature(if_let_guard)] +#![feature(box_patterns)] use datafusion_expr::LogicalPlan; use datatypes::schema::Schema; @@ -28,7 +29,6 @@ pub mod addrs; pub mod configurator; pub(crate) mod elasticsearch; pub mod error; -pub mod export_metrics; pub mod grpc; pub mod heartbeat_options; mod hint_headers; @@ -56,7 +56,7 @@ pub mod server; pub mod tls; /// Cached SQL and logical plan for database interfaces -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct SqlPlan { query: String, // Store the parsed statement to determine if it is a query and whether to track it. 
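A minimal standalone sketch of the arrow-rs pattern the prometheus_resp.rs hunk above switches to: cast whatever numeric column arrives to Float64 with arrow::compute::cast, then read it through as_primitive::<Float64Type>() instead of downcasting to a concrete vector type. This is not part of the patch; the values_as_f64 helper name is hypothetical.

use std::sync::Arc;

use arrow::array::{Array, ArrayRef, AsArray, Int64Array};
use arrow::compute::cast;
use arrow::datatypes::{DataType, Float64Type};

// Hypothetical helper: cast any numeric column to Float64 and collect the
// values that are non-null and not NaN, mirroring how the Prometheus
// response writer skips nulls and NaN values when assembling rows.
fn values_as_f64(column: &ArrayRef) -> arrow::error::Result<Vec<f64>> {
    let casted = cast(column.as_ref(), &DataType::Float64)?;
    let floats = casted.as_primitive::<Float64Type>();
    Ok((0..floats.len())
        .filter(|&i| floats.is_valid(i) && !floats.value(i).is_nan())
        .map(|i| floats.value(i))
        .collect())
}

fn main() -> arrow::error::Result<()> {
    let column: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)]));
    assert_eq!(values_as_f64(&column)?, vec![1.0, 3.0]);
    Ok(())
}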
diff --git a/src/servers/src/mysql/federated.rs b/src/servers/src/mysql/federated.rs index 7cba3a8d1c..92085f1e58 100644 --- a/src/servers/src/mysql/federated.rs +++ b/src/servers/src/mysql/federated.rs @@ -37,6 +37,8 @@ static SHOW_LOWER_CASE_PATTERN: Lazy = Lazy::new(|| Regex::new("(?i)^(SHOW VARIABLES LIKE 'lower_case_table_names'(.*))").unwrap()); static SHOW_VARIABLES_LIKE_PATTERN: Lazy = Lazy::new(|| Regex::new("(?i)^(SHOW VARIABLES( LIKE (.*))?)").unwrap()); +static SHOW_WARNINGS_PATTERN: Lazy = + Lazy::new(|| Regex::new("(?i)^(/\\* ApplicationName=.*)?SHOW WARNINGS").unwrap()); // SELECT TIMEDIFF(NOW(), UTC_TIMESTAMP()); static SELECT_TIME_DIFF_FUNC_PATTERN: Lazy = @@ -85,8 +87,6 @@ static OTHER_NOT_SUPPORTED_STMT: Lazy = Lazy::new(|| { "(?i)^(/\\*!40101 SET(.*) \\*/)$", // DBeaver. - "(?i)^(SHOW WARNINGS)", - "(?i)^(/\\* ApplicationName=(.*)SHOW WARNINGS)", "(?i)^(/\\* ApplicationName=(.*)SHOW PLUGINS)", "(?i)^(/\\* ApplicationName=(.*)SHOW ENGINES)", "(?i)^(/\\* ApplicationName=(.*)SELECT @@(.*))", @@ -252,6 +252,47 @@ fn check_show_variables(query: &str) -> Option { recordbatches.map(Output::new_with_record_batches) } +/// Build SHOW WARNINGS result from session's warnings +fn show_warnings(session: &SessionRef) -> RecordBatches { + let schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("Level", ConcreteDataType::string_datatype(), false), + ColumnSchema::new("Code", ConcreteDataType::uint16_datatype(), false), + ColumnSchema::new("Message", ConcreteDataType::string_datatype(), false), + ])); + + let warnings = session.warnings(); + let count = warnings.len(); + + let columns = if count > 0 { + vec![ + Arc::new(StringVector::from(vec!["Warning"; count])) as _, + Arc::new(datatypes::vectors::UInt16Vector::from(vec![ + Some(1000u16); + count + ])) as _, + Arc::new(StringVector::from(warnings)) as _, + ] + } else { + vec![ + Arc::new(StringVector::from(Vec::::new())) as _, + Arc::new(datatypes::vectors::UInt16Vector::from( + Vec::>::new(), + )) as _, + Arc::new(StringVector::from(Vec::::new())) as _, + ] + }; + + RecordBatches::try_from_columns(schema, columns).unwrap() +} + +fn check_show_warnings(query: &str, session: &SessionRef) -> Option { + if SHOW_WARNINGS_PATTERN.is_match(query) { + Some(Output::new_with_record_batches(show_warnings(session))) + } else { + None + } +} + // Check for SET or others query, this is the final check of the federated query. fn check_others(query: &str, _query_ctx: QueryContextRef) -> Option { if OTHER_NOT_SUPPORTED_STMT.is_match(query.as_bytes()) { @@ -274,7 +315,7 @@ fn check_others(query: &str, _query_ctx: QueryContextRef) -> Option { pub(crate) fn check( query: &str, query_ctx: QueryContextRef, - _session: SessionRef, + session: SessionRef, ) -> Option { // INSERT don't need MySQL federated check. We assume the query doesn't contain // federated or driver setup command if it starts with a 'INSERT' statement. @@ -287,8 +328,8 @@ pub(crate) fn check( // First to check the query is like "select @@variables". check_select_variable(query, query_ctx.clone()) - // Then to check "show variables like ...". 
.or_else(|| check_show_variables(query)) + .or_else(|| check_show_warnings(query, &session)) // Last check .or_else(|| check_others(query, query_ctx)) } @@ -392,4 +433,64 @@ mod test { +----------------------------------+"; test(query, expected); } + + #[test] + fn test_show_warnings() { + // Test SHOW WARNINGS with no warnings + let session = Arc::new(Session::new(None, Channel::Mysql, Default::default(), 0)); + let output = check("SHOW WARNINGS", QueryContext::arc(), session.clone()); + match output.unwrap().data { + OutputData::RecordBatches(r) => { + assert_eq!(r.iter().map(|b| b.num_rows()).sum::(), 0); + } + _ => unreachable!(), + } + + // Test SHOW WARNINGS with a single warning + session.add_warning("Test warning message".to_string()); + let output = check("SHOW WARNINGS", QueryContext::arc(), session.clone()); + match output.unwrap().data { + OutputData::RecordBatches(r) => { + let expected = "\ ++---------+------+----------------------+ +| Level | Code | Message | ++---------+------+----------------------+ +| Warning | 1000 | Test warning message | ++---------+------+----------------------+"; + assert_eq!(&r.pretty_print().unwrap(), expected); + } + _ => unreachable!(), + } + + // Test SHOW WARNINGS with multiple warnings + session.clear_warnings(); + session.add_warning("First warning".to_string()); + session.add_warning("Second warning".to_string()); + let output = check("SHOW WARNINGS", QueryContext::arc(), session.clone()); + match output.unwrap().data { + OutputData::RecordBatches(r) => { + let expected = "\ ++---------+------+----------------+ +| Level | Code | Message | ++---------+------+----------------+ +| Warning | 1000 | First warning | +| Warning | 1000 | Second warning | ++---------+------+----------------+"; + assert_eq!(&r.pretty_print().unwrap(), expected); + } + _ => unreachable!(), + } + + // Test case insensitivity + let output = check("show warnings", QueryContext::arc(), session.clone()); + assert!(output.is_some()); + + // Test with DBeaver-style comment prefix + let output = check( + "/* ApplicationName=DBeaver */SHOW WARNINGS", + QueryContext::arc(), + session.clone(), + ); + assert!(output.is_some()); + } } diff --git a/src/servers/src/mysql/handler.rs b/src/servers/src/mysql/handler.rs index aa57708ab7..b5fc66de2e 100644 --- a/src/servers/src/mysql/handler.rs +++ b/src/servers/src/mysql/handler.rs @@ -355,7 +355,7 @@ impl MysqlInstanceShim { let _ = guard.remove(&stmt_key); } - fn auth_plugin(&self) -> &str { + fn auth_plugin(&self) -> &'static str { if self .user_provider .as_ref() @@ -385,7 +385,7 @@ impl AsyncMysqlShim for MysqlInstanceShi self.auth_plugin() } - async fn auth_plugin_for_username<'a, 'user>(&'a self, _user: &'user [u8]) -> &'a str { + async fn auth_plugin_for_username(&self, _user: &[u8]) -> &'static str { self.auth_plugin() } @@ -475,6 +475,8 @@ impl AsyncMysqlShim for MysqlInstanceShi p: ParamParser<'a>, w: QueryResultWriter<'a, W>, ) -> Result<()> { + self.session.clear_warnings(); + let query_ctx = self.session.new_query_context(); let db = query_ctx.get_db_string(); let _timer = crate::metrics::METRIC_MYSQL_QUERY_TIMER @@ -500,7 +502,7 @@ impl AsyncMysqlShim for MysqlInstanceShi } }; - writer::write_output(w, query_ctx, outputs).await?; + writer::write_output(w, query_ctx, self.session.clone(), outputs).await?; Ok(()) } @@ -525,7 +527,12 @@ impl AsyncMysqlShim for MysqlInstanceShi .with_label_values(&[crate::metrics::METRIC_MYSQL_TEXTQUERY, db.as_str()]) .start_timer(); + // Clear warnings for non SHOW WARNINGS queries let 
query_upcase = query.to_uppercase(); + if !query_upcase.starts_with("SHOW WARNINGS") { + self.session.clear_warnings(); + } + if query_upcase.starts_with("PREPARE ") { match ParserContext::parse_mysql_prepare_stmt(query, query_ctx.sql_dialect()) { Ok((stmt_name, stmt)) => { @@ -534,7 +541,8 @@ impl AsyncMysqlShim for MysqlInstanceShi match prepare_results { Ok(_) => { let outputs = vec![Ok(Output::new_with_affected_rows(0))]; - writer::write_output(writer, query_ctx, outputs).await?; + writer::write_output(writer, query_ctx, self.session.clone(), outputs) + .await?; return Ok(()); } Err(e) => { @@ -570,7 +578,8 @@ impl AsyncMysqlShim for MysqlInstanceShi return Ok(()); } }; - writer::write_output(writer, query_ctx, outputs).await?; + writer::write_output(writer, query_ctx, self.session.clone(), outputs).await?; + return Ok(()); } Err(e) => { @@ -585,7 +594,7 @@ impl AsyncMysqlShim for MysqlInstanceShi Ok(stmt_name) => { self.do_close(stmt_name); let outputs = vec![Ok(Output::new_with_affected_rows(0))]; - writer::write_output(writer, query_ctx, outputs).await?; + writer::write_output(writer, query_ctx, self.session.clone(), outputs).await?; return Ok(()); } Err(e) => { @@ -598,7 +607,8 @@ impl AsyncMysqlShim for MysqlInstanceShi } let outputs = self.do_query(query, query_ctx.clone()).await; - writer::write_output(writer, query_ctx, outputs).await?; + writer::write_output(writer, query_ctx, self.session.clone(), outputs).await?; + Ok(()) } diff --git a/src/servers/src/mysql/helper.rs b/src/servers/src/mysql/helper.rs index cf92741bea..f765fba2d4 100644 --- a/src/servers/src/mysql/helper.rs +++ b/src/servers/src/mysql/helper.rs @@ -18,7 +18,7 @@ use std::time::Duration; use chrono::NaiveDate; use common_query::prelude::ScalarValue; use common_sql::convert::sql_value_to_value; -use common_time::Timestamp; +use common_time::{Date, Timestamp}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_expr::LogicalPlan; use datatypes::prelude::ConcreteDataType; @@ -210,7 +210,8 @@ pub fn convert_value(param: &ParamValue, t: &ConcreteDataType) -> Result Ok(ScalarValue::Binary(Some(b.to_vec()))), - ConcreteDataType::Timestamp(ts_type) => covert_bytes_to_timestamp(b, ts_type), + ConcreteDataType::Timestamp(ts_type) => convert_bytes_to_timestamp(b, ts_type), + ConcreteDataType::Date(_) => convert_bytes_to_date(b), _ => error::PreparedStmtTypeMismatchSnafu { expected: t, actual: param.coltype, @@ -285,7 +286,7 @@ pub fn convert_expr_to_scalar_value(param: &Expr, t: &ConcreteDataType) -> Resul } } -fn covert_bytes_to_timestamp(bytes: &[u8], ts_type: &TimestampType) -> Result { +fn convert_bytes_to_timestamp(bytes: &[u8], ts_type: &TimestampType) -> Result { let ts = Timestamp::from_str_utc(&String::from_utf8_lossy(bytes)) .map_err(|e| { error::MysqlValueConversionSnafu { @@ -314,6 +315,17 @@ fn covert_bytes_to_timestamp(bytes: &[u8], ts_type: &TimestampType) -> Result Result { + let date = Date::from_str_utc(&String::from_utf8_lossy(bytes)).map_err(|e| { + error::MysqlValueConversionSnafu { + err_msg: e.to_string(), + } + .build() + })?; + + Ok(ScalarValue::Date32(Some(date.val()))) +} + #[cfg(test)] mod tests { use datatypes::types::{ @@ -512,8 +524,28 @@ mod tests { ]; for (input, ts_type, expected) in test_cases { - let result = covert_bytes_to_timestamp(input.as_bytes(), &ts_type).unwrap(); + let result = convert_bytes_to_timestamp(input.as_bytes(), &ts_type).unwrap(); assert_eq!(result, expected); } } + + #[test] + fn test_convert_bytes_to_date() { + let test_cases = vec![ + // 
Standard date format: YYYY-MM-DD + ("1970-01-01", ScalarValue::Date32(Some(0))), + ("1969-12-31", ScalarValue::Date32(Some(-1))), + ("2024-02-29", ScalarValue::Date32(Some(19782))), + ("2024-01-01", ScalarValue::Date32(Some(19723))), + ("2024-12-31", ScalarValue::Date32(Some(20088))), + ("2001-01-02", ScalarValue::Date32(Some(11324))), + ("2050-06-14", ScalarValue::Date32(Some(29384))), + ("2020-03-15", ScalarValue::Date32(Some(18336))), + ]; + + for (input, expected) in test_cases { + let result = convert_bytes_to_date(input.as_bytes()).unwrap(); + assert_eq!(result, expected, "Failed for input: {}", input); + } + } } diff --git a/src/servers/src/mysql/server.rs b/src/servers/src/mysql/server.rs index c27d3ebbda..bda027ca55 100644 --- a/src/servers/src/mysql/server.rs +++ b/src/servers/src/mysql/server.rs @@ -99,7 +99,7 @@ impl MysqlSpawnConfig { } fn tls(&self) -> Option> { - self.tls.get_server_config() + self.tls.get_config() } } diff --git a/src/servers/src/mysql/writer.rs b/src/servers/src/mysql/writer.rs index 2b8495074a..0a4d1138e6 100644 --- a/src/servers/src/mysql/writer.rs +++ b/src/servers/src/mysql/writer.rs @@ -16,22 +16,18 @@ use std::time::Duration; use arrow::array::{Array, AsArray}; use arrow::datatypes::{ - Date32Type, Decimal128Type, DurationMicrosecondType, DurationMillisecondType, - DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, - Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, - Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, - TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, - TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, + Date32Type, Decimal128Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, + Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, UInt8Type, + UInt16Type, UInt32Type, UInt64Type, }; -use arrow_schema::{DataType, IntervalUnit, TimeUnit}; +use arrow_schema::{DataType, IntervalUnit}; use common_decimal::Decimal128; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_query::{Output, OutputData}; use common_recordbatch::{RecordBatch, SendableRecordBatchStream}; use common_telemetry::{debug, error}; -use common_time::time::Time; -use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp}; +use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth}; use datafusion_common::ScalarValue; use datatypes::prelude::ConcreteDataType; use datatypes::schema::SchemaRef; @@ -40,6 +36,7 @@ use futures::StreamExt; use opensrv_mysql::{ Column, ColumnFlags, ColumnType, ErrorKind, OkResponse, QueryResultWriter, RowWriter, }; +use session::SessionRef; use session::context::QueryContextRef; use snafu::prelude::*; use tokio::io::AsyncWrite; @@ -51,9 +48,18 @@ use crate::metrics::*; pub async fn write_output( w: QueryResultWriter<'_, W>, query_context: QueryContextRef, + session: SessionRef, outputs: Vec>, ) -> Result<()> { - let mut writer = Some(MysqlResultWriter::new(w, query_context.clone())); + if let Some(warning) = query_context.warning() { + session.add_warning(warning); + } + + let mut writer = Some(MysqlResultWriter::new( + w, + query_context.clone(), + session.clone(), + )); for output in outputs { let result_writer = writer.take().context(error::InternalSnafu { err_msg: "Sending multiple result set is unsupported", @@ -98,16 +104,19 @@ struct QueryResult { pub 
struct MysqlResultWriter<'a, W: AsyncWrite + Unpin> { writer: QueryResultWriter<'a, W>, query_context: QueryContextRef, + session: SessionRef, } impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { pub fn new( writer: QueryResultWriter<'a, W>, query_context: QueryContextRef, + session: SessionRef, ) -> MysqlResultWriter<'a, W> { MysqlResultWriter::<'a, W> { writer, query_context, + session, } } @@ -135,10 +144,12 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { Self::write_query_result(query_result, self.writer, self.query_context).await?; } OutputData::AffectedRows(rows) => { - let next_writer = Self::write_affected_rows(self.writer, rows).await?; + let next_writer = + Self::write_affected_rows(self.writer, rows, &self.session).await?; return Ok(Some(MysqlResultWriter::new( next_writer, self.query_context, + self.session, ))); } }, @@ -156,10 +167,14 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { async fn write_affected_rows( w: QueryResultWriter<'a, W>, rows: usize, + session: &SessionRef, ) -> Result> { + let warnings = session.warnings_count() as u16; + let next_writer = w .complete_one(OkResponse { affected_rows: rows as u64, + warnings, ..Default::default() }) .await?; @@ -312,26 +327,8 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { let v = Date::new(array.value(i)); row_writer.write_col(v.to_chrono_date())?; } - DataType::Timestamp(time_unit, _) => { - let v = match time_unit { - TimeUnit::Second => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = column.as_primitive::(); - array.value(i) - } - }; - let v = Timestamp::new(v, time_unit.into()); + DataType::Timestamp(_, _) => { + let v = datatypes::arrow_array::timestamp_array_value(column, i); let v = v.to_chrono_datetime_with_timezone(Some(&query_context.timezone())); row_writer.write_col(v)?; } @@ -352,28 +349,11 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { row_writer.write_col(v.to_iso8601_string())?; } }, - DataType::Duration(time_unit) => match time_unit { - TimeUnit::Second => { - let array = column.as_primitive::(); - let v = array.value(i); - row_writer.write_col(Duration::from_secs(v as u64))?; - } - TimeUnit::Millisecond => { - let array = column.as_primitive::(); - let v = array.value(i); - row_writer.write_col(Duration::from_millis(v as u64))?; - } - TimeUnit::Microsecond => { - let array = column.as_primitive::(); - let v = array.value(i); - row_writer.write_col(Duration::from_micros(v as u64))?; - } - TimeUnit::Nanosecond => { - let array = column.as_primitive::(); - let v = array.value(i); - row_writer.write_col(Duration::from_nanos(v as u64))?; - } - }, + DataType::Duration(_) => { + let v: Duration = + datatypes::arrow_array::duration_array_value(column, i).into(); + row_writer.write_col(v)?; + } DataType::List(_) => { let v = ScalarValue::try_from_array(column, i).context(DataFusionSnafu)?; row_writer.write_col(v.to_string())?; @@ -382,37 +362,8 @@ impl<'a, W: AsyncWrite + Unpin> MysqlResultWriter<'a, W> { let v = ScalarValue::try_from_array(column, i).context(DataFusionSnafu)?; row_writer.write_col(v.to_string())?; } - DataType::Time32(time_unit) => { - let time = match time_unit { - TimeUnit::Second => { - let array = column.as_primitive::(); - Time::new_second(array.value(i) as i64) - } - TimeUnit::Millisecond => { - 
let array = column.as_primitive::(); - Time::new_millisecond(array.value(i) as i64) - } - _ => unreachable!( - "`DataType::Time32` has only second and millisecond time units" - ), - }; - let v = time.to_timezone_aware_string(Some(&query_context.timezone())); - row_writer.write_col(v)?; - } - DataType::Time64(time_unit) => { - let time = match time_unit { - TimeUnit::Microsecond => { - let array = column.as_primitive::(); - Time::new_microsecond(array.value(i)) - } - TimeUnit::Nanosecond => { - let array = column.as_primitive::(); - Time::new_nanosecond(array.value(i)) - } - _ => unreachable!( - "`DataType::Time64` has only microsecond and nanosecond time units" - ), - }; + DataType::Time32(_) | DataType::Time64(_) => { + let time = datatypes::arrow_array::time_array_value(column, i); let v = time.to_timezone_aware_string(Some(&query_context.timezone())); row_writer.write_col(v)?; } @@ -498,10 +449,10 @@ pub(crate) fn create_mysql_column( column_type.map(|column_type| Column { column: column_name.to_string(), coltype: column_type, - // TODO(LFC): Currently "table" and "colflags" are not relevant in MySQL server // implementation, will revisit them again in the future. table: String::default(), + collen: 0, // 0 means "use default". colflags, }) } @@ -557,5 +508,6 @@ fn mysql_error_kind(status_code: &StatusCode) -> ErrorKind { StatusCode::FlowNotFound => ErrorKind::ER_NO_SUCH_TABLE, StatusCode::TriggerAlreadyExists => ErrorKind::ER_TABLE_EXISTS_ERROR, StatusCode::TriggerNotFound => ErrorKind::ER_NO_SUCH_TABLE, + StatusCode::Suspended => ErrorKind::ER_SERVER_SHUTDOWN, } } diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index 5a6710f420..f1e4138e63 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -16,9 +16,8 @@ use std::collections::BTreeMap; use std::sync::Arc; use ahash::{HashMap, HashMapExt}; -use api::greptime_proto; use api::v1::helper::time_index_column_schema; -use api::v1::{ColumnDataType, RowInsertRequest, Rows}; +use api::v1::{ColumnDataType, RowInsertRequest, Rows, Value}; use common_time::timestamp::TimeUnit; use pipeline::{ ContextReq, DispatchedTo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, Pipeline, PipelineContext, @@ -154,13 +153,18 @@ async fn run_custom_pipeline( let r = unwrap_or_continue_if_err!(result, skip_error); match r { - PipelineExecOutput::Transformed(TransformedOutput { - opt, - row, - table_suffix, - }) => { - let act_table_name = table_suffix_to_table_name(&table_name, table_suffix); - push_to_map!(transformed_map, (opt, act_table_name), row, arr_len); + PipelineExecOutput::Transformed(TransformedOutput { rows_by_context }) => { + // Process each ContextOpt group separately + for (opt, rows_with_suffix) in rows_by_context { + // Group rows by table name within each context + for (row, table_suffix) in rows_with_suffix { + let act_table_name = table_suffix_to_table_name(&table_name, table_suffix); + transformed_map + .entry((opt.clone(), act_table_name)) + .or_insert_with(|| Vec::with_capacity(arr_len)) + .push(row); + } + } } PipelineExecOutput::DispatchedTo(dispatched_to, val) => { push_to_map!(dispatched, dispatched_to, val, arr_len); @@ -173,22 +177,26 @@ async fn run_custom_pipeline( let mut results = ContextReq::default(); - let s_len = schema_info.schema.len(); - - // if transformed + // Process transformed outputs. Each entry in transformed_map contains + // Vec grouped by (opt, table_name). 
+ let column_count = schema_info.schema.len(); for ((opt, table_name), mut rows) in transformed_map { - for row in rows.iter_mut() { - row.values - .resize(s_len, greptime_proto::v1::Value::default()); + // Pad rows to match final schema size (schema may have evolved during processing) + for row in &mut rows { + let diff = column_count.saturating_sub(row.values.len()); + for _ in 0..diff { + row.values.push(Value { value_data: None }); + } } + results.add_row( - opt, + &opt, RowInsertRequest { rows: Some(Rows { rows, schema: schema_info.schema.clone(), }), - table_name, + table_name: table_name.clone(), }, ); } diff --git a/src/servers/src/postgres/auth_handler.rs b/src/servers/src/postgres/auth_handler.rs index 14450289ba..286c39da64 100644 --- a/src/servers/src/postgres/auth_handler.rs +++ b/src/servers/src/postgres/auth_handler.rs @@ -19,6 +19,7 @@ use ::auth::{Identity, Password, UserInfoRef, UserProviderRef, userinfo_by_name} use async_trait::async_trait; use common_catalog::parse_catalog_and_schema_from_db_string; use common_error::ext::ErrorExt; +use common_time::Timezone; use futures::{Sink, SinkExt}; use pgwire::api::auth::StartupHandler; use pgwire::api::{ClientInfo, PgWireConnectionState, auth}; @@ -171,6 +172,23 @@ impl StartupHandler for PostgresServerHandlerInner { } } + // try to set TimeZone + if let Some(tz) = client.metadata().get("TimeZone") { + match Timezone::from_tz_string(tz) { + Ok(tz) => self.session.set_timezone(tz), + Err(_) => { + send_error( + client, + PgErrorCode::Ec22023 + .to_err_info(format!("Invalid TimeZone: {}", tz)), + ) + .await?; + + return Ok(()); + } + } + } + if self.login_verifier.user_provider.is_some() { client.set_state(PgWireConnectionState::AuthenticationInProgress); client diff --git a/src/servers/src/postgres/fixtures.rs b/src/servers/src/postgres/fixtures.rs index 3b56d99241..dcd7842c95 100644 --- a/src/servers/src/postgres/fixtures.rs +++ b/src/servers/src/postgres/fixtures.rs @@ -22,7 +22,7 @@ use pgwire::api::results::{DataRowEncoder, FieldFormat, FieldInfo, QueryResponse use pgwire::error::PgWireResult; use pgwire::messages::data::DataRow; use regex::Regex; -use session::context::QueryContextRef; +use session::context::{QueryContext, QueryContextRef}; fn build_string_data_rows( schema: Arc>, @@ -60,11 +60,7 @@ static ABORT_TRANSACTION_PATTERN: Lazy = /// Test if given query statement matches the patterns pub(crate) fn matches(query: &str) -> bool { - START_TRANSACTION_PATTERN.is_match(query) - || COMMIT_TRANSACTION_PATTERN.is_match(query) - || ABORT_TRANSACTION_PATTERN.is_match(query) - || SHOW_PATTERN.captures(query).is_some() - || SET_TRANSACTION_PATTERN.is_match(query) + process(query, QueryContext::arc()).is_some() } fn set_transaction_warning(query_ctx: QueryContextRef) { diff --git a/src/servers/src/postgres/handler.rs b/src/servers/src/postgres/handler.rs index daccf9dc26..ad242390ec 100644 --- a/src/servers/src/postgres/handler.rs +++ b/src/servers/src/postgres/handler.rs @@ -28,7 +28,7 @@ use futures::{Sink, SinkExt, Stream, StreamExt, future, stream}; use pgwire::api::portal::{Format, Portal}; use pgwire::api::query::{ExtendedQueryHandler, SimpleQueryHandler}; use pgwire::api::results::{ - DescribePortalResponse, DescribeStatementResponse, QueryResponse, Response, Tag, + DescribePortalResponse, DescribeStatementResponse, FieldInfo, QueryResponse, Response, Tag, }; use pgwire::api::stmt::{QueryParser, StoredStatement}; use pgwire::api::{ClientInfo, ErrorHandler, Type}; @@ -40,6 +40,7 @@ use session::context::QueryContextRef; 
use snafu::ResultExt; use sql::dialect::PostgreSqlDialect; use sql::parser::{ParseOptions, ParserContext}; +use sql::statements::statement::Statement; use crate::SqlPlan; use crate::error::{DataFusionSnafu, Result}; @@ -201,7 +202,7 @@ impl QueryParser for DefaultQueryParser { &self, _client: &C, sql: &str, - _types: &[Type], + _types: &[Option], ) -> PgWireResult { crate::metrics::METRIC_POSTGRES_PREPARED_COUNT.inc(); let query_ctx = self.session.new_query_context(); @@ -341,7 +342,9 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner { C: ClientInfo + Unpin + Send + Sync, { let sql_plan = &stmt.statement; - let (param_types, sql_plan, format) = if let Some(plan) = &sql_plan.plan { + // client provided parameter types, can be empty if client doesn't try to parse statement + let provided_param_types = &stmt.parameter_types; + let server_inferenced_types = if let Some(plan) = &sql_plan.plan { let param_types = plan .get_parameter_types() .context(DataFusionSnafu) @@ -352,14 +355,36 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner { let types = param_types_to_pg_types(¶m_types).map_err(convert_err)?; - (types, sql_plan, &Format::UnifiedBinary) + Some(types) } else { - let param_types = stmt.parameter_types.clone(); - (param_types, sql_plan, &Format::UnifiedBinary) + None }; + let param_count = if provided_param_types.is_empty() { + server_inferenced_types + .as_ref() + .map(|types| types.len()) + .unwrap_or(0) + } else { + provided_param_types.len() + }; + + let param_types = (0..param_count) + .map(|i| { + let client_type = provided_param_types.get(i); + // use server type when client provided type is None (oid: 0 or other invalid values) + match client_type { + Some(Some(client_type)) => client_type.clone(), + _ => server_inferenced_types + .as_ref() + .and_then(|types| types.get(i).cloned()) + .unwrap_or(Type::UNKNOWN), + } + }) + .collect::>(); + if let Some(schema) = &sql_plan.schema { - schema_to_pg(schema, format) + schema_to_pg(schema, &Format::UnifiedBinary) .map(|fields| DescribeStatementResponse::new(param_types, fields)) .map_err(convert_err) } else { @@ -388,21 +413,67 @@ impl ExtendedQueryHandler for PostgresServerHandlerInner { let sql_plan = &portal.statement.statement; let format = &portal.result_column_format; - if let Some(schema) = &sql_plan.schema { - schema_to_pg(schema, format) - .map(DescribePortalResponse::new) - .map_err(convert_err) - } else { - if let Some(mut resp) = - fixtures::process(&sql_plan.query, self.session.new_query_context()) - && let Response::Query(query_response) = resp.remove(0) - { - return Ok(DescribePortalResponse::new( - (*query_response.row_schema()).clone(), - )); + match sql_plan.statement.as_ref() { + Some(Statement::Query(_)) => { + // if the query has a schema, it is managed by datafusion, use the schema + if let Some(schema) = &sql_plan.schema { + schema_to_pg(schema, format) + .map(DescribePortalResponse::new) + .map_err(convert_err) + } else { + // fallback to NoData + Ok(DescribePortalResponse::new(vec![])) + } + } + // We can cover only part of show statements + // these show create statements will return 2 columns + Some(Statement::ShowCreateDatabase(_)) + | Some(Statement::ShowCreateTable(_)) + | Some(Statement::ShowCreateFlow(_)) + | Some(Statement::ShowCreateView(_)) => Ok(DescribePortalResponse::new(vec![ + FieldInfo::new( + "name".to_string(), + None, + None, + Type::TEXT, + format.format_for(0), + ), + FieldInfo::new( + "create_statement".to_string(), + None, + None, + Type::TEXT, + format.format_for(1), + 
), + ])), + // single column show statements + Some(Statement::ShowTables(_)) + | Some(Statement::ShowFlows(_)) + | Some(Statement::ShowViews(_)) => { + Ok(DescribePortalResponse::new(vec![FieldInfo::new( + "name".to_string(), + None, + None, + Type::TEXT, + format.format_for(0), + )])) + } + // we will not support other show statements for extended query protocol at least for now. + // because the return columns is not predictable at this stage + _ => { + // test if query caught by fixture + if let Some(mut resp) = + fixtures::process(&sql_plan.query, self.session.new_query_context()) + && let Response::Query(query_response) = resp.remove(0) + { + Ok(DescribePortalResponse::new( + (*query_response.row_schema()).clone(), + )) + } else { + // fallback to NoData + Ok(DescribePortalResponse::new(vec![])) + } } - - Ok(DescribePortalResponse::new(vec![])) } } } diff --git a/src/servers/src/postgres/server.rs b/src/servers/src/postgres/server.rs index 3c7a711780..3478a6da78 100644 --- a/src/servers/src/postgres/server.rs +++ b/src/servers/src/postgres/server.rs @@ -80,7 +80,7 @@ impl PostgresServer { let process_manager = self.process_manager.clone(); accepting_stream.for_each(move |tcp_stream| { let io_runtime = io_runtime.clone(); - let tls_acceptor = tls_server_config.get_server_config().map(TlsAcceptor::from); + let tls_acceptor = tls_server_config.get_config().map(TlsAcceptor::from); let handler_maker = handler_maker.clone(); let process_id = process_manager.as_ref().map(|p| p.next_id()).unwrap_or(0); diff --git a/src/servers/src/postgres/types.rs b/src/servers/src/postgres/types.rs index b8251a5d95..96a2105d44 100644 --- a/src/servers/src/postgres/types.rs +++ b/src/servers/src/postgres/types.rs @@ -22,21 +22,18 @@ use std::sync::Arc; use arrow::array::{Array, ArrayRef, AsArray}; use arrow::datatypes::{ - Date32Type, Date64Type, Decimal128Type, DurationMicrosecondType, DurationMillisecondType, - DurationNanosecondType, DurationSecondType, Float32Type, Float64Type, Int8Type, Int16Type, + Date32Type, Date64Type, Decimal128Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type, }; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; -use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime}; +use chrono::{DateTime, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; use common_decimal::Decimal128; use common_recordbatch::RecordBatch; use common_time::time::Time; -use common_time::{ - Date, Duration, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp, -}; +use common_time::{Date, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth, Timestamp}; use datafusion_common::ScalarValue; use datafusion_expr::LogicalPlan; use datatypes::arrow::datatypes::DataType as ArrowDataType; @@ -567,26 +564,8 @@ impl RecordBatchRowIterator { }); encoder.encode_field(&date)?; } - DataType::Timestamp(time_unit, _) => { - let v = match time_unit { - TimeUnit::Second => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = column.as_primitive::(); - 
array.value(i) - } - }; - let v = Timestamp::new(v, time_unit.into()); + DataType::Timestamp(_, _) => { + let v = datatypes::arrow_array::timestamp_array_value(column, i); let datetime = v .to_chrono_datetime_with_timezone(Some(&self.query_ctx.timezone())) .map(|v| { @@ -613,26 +592,8 @@ impl RecordBatchRowIterator { encoder.encode_field(&PgInterval::from(v))?; } }, - DataType::Duration(time_unit) => { - let v = match time_unit { - TimeUnit::Second => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Millisecond => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Microsecond => { - let array = column.as_primitive::(); - array.value(i) - } - TimeUnit::Nanosecond => { - let array = column.as_primitive::(); - array.value(i) - } - }; - let d = Duration::new(v, time_unit.into()); + DataType::Duration(_) => { + let d = datatypes::arrow_array::duration_array_value(column, i); match PgInterval::try_from(d) { Ok(i) => encoder.encode_field(&i)?, Err(e) => { @@ -650,25 +611,8 @@ impl RecordBatchRowIterator { DataType::Struct(_) => { encode_struct(&self.query_ctx, Default::default(), encoder)?; } - DataType::Time32(time_unit) | DataType::Time64(time_unit) => { - let v = match time_unit { - TimeUnit::Second => { - let array = column.as_primitive::(); - Time::new_second(array.value(i) as i64) - } - TimeUnit::Millisecond => { - let array = column.as_primitive::(); - Time::new_millisecond(array.value(i) as i64) - } - TimeUnit::Microsecond => { - let array = column.as_primitive::(); - Time::new_microsecond(array.value(i)) - } - TimeUnit::Nanosecond => { - let array = column.as_primitive::(); - Time::new_nanosecond(array.value(i)) - } - }; + DataType::Time32(_) | DataType::Time64(_) => { + let v = datatypes::arrow_array::time_array_value(column, i); encoder.encode_field(&v.to_chrono_time())?; } DataType::Decimal128(precision, scale) => { @@ -773,7 +717,7 @@ pub(super) fn type_pg_to_gt(origin: &Type) -> Result { &Type::INT4 => Ok(ConcreteDataType::int32_datatype()), &Type::INT8 => Ok(ConcreteDataType::int64_datatype()), &Type::VARCHAR | &Type::TEXT => Ok(ConcreteDataType::string_datatype()), - &Type::TIMESTAMP => Ok(ConcreteDataType::timestamp_datatype( + &Type::TIMESTAMP | &Type::TIMESTAMPTZ => Ok(ConcreteDataType::timestamp_datatype( common_time::timestamp::TimeUnit::Millisecond, )), &Type::DATE => Ok(ConcreteDataType::date_datatype()), @@ -805,7 +749,13 @@ pub(super) fn type_pg_to_gt(origin: &Type) -> Result { pub(super) fn parameter_to_string(portal: &Portal, idx: usize) -> PgWireResult { // the index is managed from portal's parameters count so it's safe to // unwrap here. - let param_type = portal.statement.parameter_types.get(idx).unwrap(); + let param_type = portal + .statement + .parameter_types + .get(idx) + .unwrap() + .as_ref() + .unwrap_or(&Type::UNKNOWN); match param_type { &Type::VARCHAR | &Type::TEXT => Ok(format!( "'{}'", @@ -884,7 +834,7 @@ pub(super) fn parameters_to_scalar_values( let mut results = Vec::with_capacity(param_count); let client_param_types = &portal.statement.parameter_types; - let param_types = plan + let server_param_types = plan .get_parameter_types() .context(DataFusionSnafu) .map_err(convert_err)? 
@@ -893,11 +843,11 @@ pub(super) fn parameters_to_scalar_values( .collect::>(); for idx in 0..param_count { - let server_type = param_types + let server_type = server_param_types .get(&format!("${}", idx + 1)) .and_then(|t| t.as_ref()); - let client_type = if let Some(client_given_type) = client_param_types.get(idx) { + let client_type = if let Some(Some(client_given_type)) = client_param_types.get(idx) { client_given_type.clone() } else if let Some(server_provided_type) = &server_type { type_gt_to_pg(server_provided_type).map_err(convert_err)? @@ -1100,7 +1050,7 @@ pub(super) fn parameters_to_scalar_values( None, ), TimestampType::Nanosecond(_) => ScalarValue::TimestampNanosecond( - data.map(|ts| ts.and_utc().timestamp_micros()), + data.and_then(|ts| ts.and_utc().timestamp_nanos_opt()), None, ), }, @@ -1118,6 +1068,38 @@ pub(super) fn parameters_to_scalar_values( ) } } + &Type::TIMESTAMPTZ => { + let data = portal.parameter::>(idx, &client_type)?; + if let Some(server_type) = &server_type { + match server_type { + ConcreteDataType::Timestamp(unit) => match *unit { + TimestampType::Second(_) => { + ScalarValue::TimestampSecond(data.map(|ts| ts.timestamp()), None) + } + TimestampType::Millisecond(_) => ScalarValue::TimestampMillisecond( + data.map(|ts| ts.timestamp_millis()), + None, + ), + TimestampType::Microsecond(_) => ScalarValue::TimestampMicrosecond( + data.map(|ts| ts.timestamp_micros()), + None, + ), + TimestampType::Nanosecond(_) => ScalarValue::TimestampNanosecond( + data.and_then(|ts| ts.timestamp_nanos_opt()), + None, + ), + }, + _ => { + return Err(invalid_parameter_error( + "invalid_parameter_type", + Some(format!("Expected: {}, found: {}", server_type, client_type)), + )); + } + } + } else { + ScalarValue::TimestampMillisecond(data.map(|ts| ts.timestamp_millis()), None) + } + } &Type::DATE => { let data = portal.parameter::(idx, &client_type)?; if let Some(server_type) = &server_type { @@ -1278,6 +1260,204 @@ pub(super) fn parameters_to_scalar_values( ScalarValue::Null } } + &Type::TIMESTAMP_ARRAY => { + let data = portal.parameter::>(idx, &client_type)?; + if let Some(data) = data { + if let Some(ConcreteDataType::List(list_type)) = &server_type { + match list_type.item_type() { + ConcreteDataType::Timestamp(unit) => match *unit { + TimestampType::Second(_) => { + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampSecond( + Some(ts.and_utc().timestamp()), + None, + ) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Second, None), + true, + )) + } + TimestampType::Millisecond(_) => { + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampMillisecond( + Some(ts.and_utc().timestamp_millis()), + None, + ) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + true, + )) + } + TimestampType::Microsecond(_) => { + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampMicrosecond( + Some(ts.and_utc().timestamp_micros()), + None, + ) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Microsecond, None), + true, + )) + } + TimestampType::Nanosecond(_) => { + let values = data + .into_iter() + .filter_map(|ts| { + ts.and_utc().timestamp_nanos_opt().map(|nanos| { + ScalarValue::TimestampNanosecond(Some(nanos), None) + }) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + 
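A short note on the TIMESTAMPTZ handling added above: the client value arrives as a chrono DateTime with a fixed offset, and the target unit comes from the server-side column type; the nanosecond case uses timestamp_nanos_opt() because chrono returns None when the instant cannot be represented as an i64 nanosecond count. The sketch below illustrates only that unit conversion (the Unit enum and to_column_unit helper are illustrative, not part of the patch).

use chrono::{DateTime, FixedOffset};

// Target storage unit of the server-side timestamp column.
enum Unit {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
}

// Convert a client-supplied TIMESTAMPTZ value into the integer representation
// expected by the column's unit. Returns None only on nanosecond overflow.
fn to_column_unit(ts: DateTime<FixedOffset>, unit: Unit) -> Option<i64> {
    match unit {
        Unit::Second => Some(ts.timestamp()),
        Unit::Millisecond => Some(ts.timestamp_millis()),
        Unit::Microsecond => Some(ts.timestamp_micros()),
        Unit::Nanosecond => ts.timestamp_nanos_opt(),
    }
}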
&ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )) + } + }, + _ => { + return Err(invalid_parameter_error( + "invalid_parameter_type", + Some(format!( + "Expected: {}, found: {}", + list_type.item_type(), + client_type + )), + )); + } + } + } else { + // Default to millisecond when no server type is specified + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampMillisecond( + Some(ts.and_utc().timestamp_millis()), + None, + ) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + true, + )) + } + } else { + ScalarValue::Null + } + } + &Type::TIMESTAMPTZ_ARRAY => { + let data = portal.parameter::>>(idx, &client_type)?; + if let Some(data) = data { + if let Some(ConcreteDataType::List(list_type)) = &server_type { + match list_type.item_type() { + ConcreteDataType::Timestamp(unit) => match *unit { + TimestampType::Second(_) => { + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampSecond(Some(ts.timestamp()), None) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Second, None), + true, + )) + } + TimestampType::Millisecond(_) => { + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampMillisecond( + Some(ts.timestamp_millis()), + None, + ) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + true, + )) + } + TimestampType::Microsecond(_) => { + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampMicrosecond( + Some(ts.timestamp_micros()), + None, + ) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Microsecond, None), + true, + )) + } + TimestampType::Nanosecond(_) => { + let values = data + .into_iter() + .filter_map(|ts| { + ts.timestamp_nanos_opt().map(|nanos| { + ScalarValue::TimestampNanosecond(Some(nanos), None) + }) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )) + } + }, + _ => { + return Err(invalid_parameter_error( + "invalid_parameter_type", + Some(format!( + "Expected: {}, found: {}", + list_type.item_type(), + client_type + )), + )); + } + } + } else { + // Default to millisecond when no server type is specified + let values = data + .into_iter() + .map(|ts| { + ScalarValue::TimestampMillisecond(Some(ts.timestamp_millis()), None) + }) + .collect::>(); + ScalarValue::List(ScalarValue::new_list( + &values, + &ArrowDataType::Timestamp(TimeUnit::Millisecond, None), + true, + )) + } + } else { + ScalarValue::Null + } + } _ => Err(invalid_parameter_error( "unsupported_parameter_value", Some(format!("Found type: {}", client_type)), diff --git a/src/servers/src/postgres/types/bytea.rs b/src/servers/src/postgres/types/bytea.rs index 78a2a20bd8..7b8b42f754 100644 --- a/src/servers/src/postgres/types/bytea.rs +++ b/src/servers/src/postgres/types/bytea.rs @@ -14,6 +14,7 @@ use bytes::BufMut; use pgwire::types::ToSqlText; +use pgwire::types::format::FormatOptions; use postgres_types::{IsNull, ToSql, Type}; #[derive(Debug)] @@ -23,11 +24,12 @@ impl ToSqlText for HexOutputBytea<'_> { &self, ty: &Type, out: &mut bytes::BytesMut, + format_options: &FormatOptions, ) -> std::result::Result> where Self: Sized, { - let _ = self.0.to_sql_text(ty, out); + let _ = self.0.to_sql_text(ty, out, format_options); Ok(IsNull::No) } } 
@@ -66,6 +68,7 @@ impl ToSqlText for EscapeOutputBytea<'_> { &self, _ty: &Type, out: &mut bytes::BytesMut, + _format_options: &FormatOptions, ) -> std::result::Result> where Self: Sized, @@ -120,7 +123,9 @@ mod tests { let expected = b"abcklm*\\251T"; let mut out = bytes::BytesMut::new(); - let is_null = input.to_sql_text(&Type::BYTEA, &mut out).unwrap(); + let is_null = input + .to_sql_text(&Type::BYTEA, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(&out[..], expected); @@ -138,7 +143,9 @@ mod tests { let expected = b"\\x68656c6c6f2c20776f726c6421"; let mut out = bytes::BytesMut::new(); - let is_null = input.to_sql_text(&Type::BYTEA, &mut out).unwrap(); + let is_null = input + .to_sql_text(&Type::BYTEA, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(&out[..], expected); diff --git a/src/servers/src/postgres/types/datetime.rs b/src/servers/src/postgres/types/datetime.rs index 700f6dc2b5..5fdd87decf 100644 --- a/src/servers/src/postgres/types/datetime.rs +++ b/src/servers/src/postgres/types/datetime.rs @@ -15,6 +15,7 @@ use bytes::BufMut; use chrono::{NaiveDate, NaiveDateTime}; use pgwire::types::ToSqlText; +use pgwire::types::format::FormatOptions; use postgres_types::{IsNull, ToSql, Type}; use session::session_config::{PGDateOrder, PGDateTimeStyle}; @@ -58,6 +59,7 @@ impl ToSqlText for StylingDate { &self, ty: &Type, out: &mut bytes::BytesMut, + format_options: &FormatOptions, ) -> std::result::Result> where Self: Sized, @@ -71,7 +73,7 @@ impl ToSqlText for StylingDate { out.put_slice(fmt.as_bytes()); } _ => { - self.0.to_sql_text(ty, out)?; + self.0.to_sql_text(ty, out, format_options)?; } } Ok(IsNull::No) @@ -83,6 +85,7 @@ impl ToSqlText for StylingDateTime { &self, ty: &Type, out: &mut bytes::BytesMut, + format_options: &FormatOptions, ) -> Result> where Self: Sized, @@ -103,7 +106,7 @@ impl ToSqlText for StylingDateTime { out.put_slice(fmt.as_bytes()); } _ => { - self.0.to_sql_text(ty, out)?; + self.0.to_sql_text(ty, out, format_options)?; } } Ok(IsNull::No) @@ -151,7 +154,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::ISO, PGDateOrder::MDY); let expected = "1997-12-17"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -160,7 +165,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::ISO, PGDateOrder::YMD); let expected = "1997-12-17"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -169,7 +176,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::ISO, PGDateOrder::DMY); let expected = "1997-12-17"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -178,7 +187,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::German, 
PGDateOrder::MDY); let expected = "17.12.1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -187,7 +198,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::German, PGDateOrder::YMD); let expected = "17.12.1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -196,7 +209,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::German, PGDateOrder::DMY); let expected = "17.12.1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -205,7 +220,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::Postgres, PGDateOrder::MDY); let expected = "12-17-1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -214,7 +231,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::Postgres, PGDateOrder::YMD); let expected = "12-17-1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -223,7 +242,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::Postgres, PGDateOrder::DMY); let expected = "17-12-1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -232,7 +253,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::SQL, PGDateOrder::MDY); let expected = "12/17/1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -241,7 +264,9 @@ mod tests { let styling_date = StylingDate(naive_date, PGDateTimeStyle::SQL, PGDateOrder::YMD); let expected = "12/17/1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -250,7 +275,9 @@ mod tests { let styling_date = StylingDate(naive_date, 
PGDateTimeStyle::SQL, PGDateOrder::DMY); let expected = "17/12/1997"; let mut out = bytes::BytesMut::new(); - let is_null = styling_date.to_sql_text(&Type::DATE, &mut out).unwrap(); + let is_null = styling_date + .to_sql_text(&Type::DATE, &mut out, &FormatOptions::default()) + .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); } @@ -267,7 +294,7 @@ mod tests { let expected = "2021-09-01 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -278,7 +305,7 @@ mod tests { let expected = "2021-09-01 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -289,7 +316,7 @@ mod tests { let expected = "2021-09-01 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -301,7 +328,7 @@ mod tests { let expected = "01.09.2021 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -313,7 +340,7 @@ mod tests { let expected = "01.09.2021 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -325,7 +352,7 @@ mod tests { let expected = "01.09.2021 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -337,7 +364,7 @@ mod tests { let expected = "Wed Sep 01 12:34:56.789012 2021"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -349,7 +376,7 @@ mod tests { let expected = "Wed Sep 01 12:34:56.789012 2021"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -361,7 +388,7 @@ mod tests { let expected = "Wed 01 Sep 12:34:56.789012 2021"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -372,7 +399,7 @@ mod tests { let 
expected = "09/01/2021 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -383,7 +410,7 @@ mod tests { let expected = "09/01/2021 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); @@ -394,7 +421,7 @@ mod tests { let expected = "01/09/2021 12:34:56.789012"; let mut out = bytes::BytesMut::new(); let is_null = styling_datetime - .to_sql_text(&Type::TIMESTAMP, &mut out) + .to_sql_text(&Type::TIMESTAMP, &mut out, &FormatOptions::default()) .unwrap(); assert!(matches!(is_null, IsNull::No)); assert_eq!(out, expected.as_bytes()); diff --git a/src/servers/src/postgres/types/error.rs b/src/servers/src/postgres/types/error.rs index 143f02342a..a76bb4362c 100644 --- a/src/servers/src/postgres/types/error.rs +++ b/src/servers/src/postgres/types/error.rs @@ -295,6 +295,10 @@ pub enum PgErrorCode { /// operator_intervention #[snafu(display("operator_intervention"))] Ec57000 = 3600, + + /// cannot_connect_now + #[snafu(display("cannot_connect_now"))] + Ec57P03 = 3601, // === End of Class 57 — Operator Intervention ===== // === Begin of Class 58 — System Error (errors external to PostgreSQL itself) === @@ -374,6 +378,7 @@ impl From for PgErrorCode { StatusCode::Unsupported => PgErrorCode::Ec0A000, StatusCode::InvalidArguments => PgErrorCode::Ec22023, StatusCode::Cancelled => PgErrorCode::Ec57000, + StatusCode::Suspended => PgErrorCode::Ec57P03, StatusCode::DeadlineExceeded => PgErrorCode::Ec57000, StatusCode::External => PgErrorCode::Ec58000, diff --git a/src/servers/src/postgres/types/interval.rs b/src/servers/src/postgres/types/interval.rs index 5d977ae47e..2734d449b0 100644 --- a/src/servers/src/postgres/types/interval.rs +++ b/src/servers/src/postgres/types/interval.rs @@ -18,7 +18,8 @@ use bytes::{Buf, BufMut}; use common_time::interval::IntervalFormat; use common_time::timestamp::TimeUnit; use common_time::{Duration, IntervalDayTime, IntervalMonthDayNano, IntervalYearMonth}; -use pgwire::types::ToSqlText; +use pgwire::types::format::FormatOptions; +use pgwire::types::{FromSqlText, ToSqlText}; use postgres_types::{FromSql, IsNull, ToSql, Type, to_sql_checked}; use crate::error; @@ -201,6 +202,7 @@ impl ToSqlText for PgInterval { &self, ty: &Type, out: &mut bytes::BytesMut, + _format_options: &FormatOptions, ) -> std::result::Result> where Self: Sized, @@ -215,6 +217,28 @@ impl ToSqlText for PgInterval { } } +impl<'a> FromSqlText<'a> for PgInterval { + fn from_sql_text( + _ty: &Type, + input: &[u8], + _format_options: &FormatOptions, + ) -> std::result::Result> + where + Self: Sized, + { + // only support parsing interval from postgres format + if let Ok(interval) = pg_interval::Interval::from_postgres(str::from_utf8(input)?) 
{ + Ok(PgInterval { + months: interval.months, + days: interval.days, + microseconds: interval.microseconds, + }) + } else { + Err("invalid interval format".into()) + } + } +} + #[cfg(test)] mod tests { use common_time::Duration; diff --git a/src/servers/src/prom_store.rs b/src/servers/src/prom_store.rs index 81268d8663..92c1815145 100644 --- a/src/servers/src/prom_store.rs +++ b/src/servers/src/prom_store.rs @@ -21,18 +21,18 @@ use std::hash::{Hash, Hasher}; use api::prom_store::remote::label_matcher::Type as MatcherType; use api::prom_store::remote::{Label, Query, ReadRequest, Sample, TimeSeries, WriteRequest}; use api::v1::RowInsertRequests; +use arrow::array::{Array, AsArray}; +use arrow::datatypes::{Float64Type, TimestampMillisecondType}; use common_grpc::precision::Precision; use common_query::prelude::{greptime_timestamp, greptime_value}; use common_recordbatch::{RecordBatch, RecordBatches}; use common_telemetry::tracing; -use common_time::timestamp::TimeUnit; +use datafusion::dataframe::DataFrame; use datafusion::prelude::{Expr, col, lit, regexp_match}; use datafusion_common::ScalarValue; use datafusion_expr::LogicalPlan; -use datatypes::prelude::{ConcreteDataType, Value}; use openmetrics_parser::{MetricsExposition, PrometheusType, PrometheusValue}; -use query::dataframe::DataFrame; -use snafu::{OptionExt, ResultExt, ensure}; +use snafu::{OptionExt, ResultExt}; use snap::raw::{Decoder, Encoder}; use crate::error::{self, Result}; @@ -102,8 +102,6 @@ pub fn extract_schema_from_read_request(request: &ReadRequest) -> Option /// Create a DataFrame from a remote Query #[tracing::instrument(skip_all)] pub fn query_to_plan(dataframe: DataFrame, q: &Query) -> Result { - let DataFrame::DataFusion(dataframe) = dataframe; - let start_timestamp_ms = q.start_timestamp_ms; let end_timestamp_ms = q.end_timestamp_ms; @@ -233,6 +231,24 @@ fn collect_timeseries_ids(table_name: &str, recordbatch: &RecordBatch) -> Vec>(), + ) + }) + .collect::>(); + for row in 0..row_count { let mut labels = Vec::with_capacity(recordbatch.num_columns() - 1); labels.push(new_label( @@ -240,20 +256,10 @@ fn collect_timeseries_ids(table_name: &str, recordbatch: &RecordBatch) -> Vec Result() + .with_context(|| error::InvalidPromRemoteReadQueryResultSnafu { msg: format!( "Expect timestamp column of datatype Timestamp(Millisecond), actual {:?}", ts_column.data_type() - ) - } - ); + ), + })?; let field_column = recordbatch.column_by_name(greptime_value()).context( error::InvalidPromRemoteReadQueryResultSnafu { msg: "missing greptime_value column in query result", }, )?; - ensure!( - field_column.data_type() == ConcreteDataType::float64_datatype(), - error::InvalidPromRemoteReadQueryResultSnafu { + let field_column = field_column + .as_primitive_opt::() + .with_context(|| error::InvalidPromRemoteReadQueryResultSnafu { msg: format!( "Expect value column of datatype Float64, actual {:?}", field_column.data_type() - ) - } - ); + ), + })?; // First, collect each row's timeseries id let timeseries_ids = collect_timeseries_ids(table, &recordbatch); @@ -322,14 +326,8 @@ fn recordbatch_to_timeseries(table: &str, recordbatch: RecordBatch) -> Result value.into(), - _ => unreachable!("checked by the \"ensure\" above"), - }; - let timestamp = match ts_column.get(row) { - Value::Timestamp(t) if t.unit() == TimeUnit::Millisecond => t.value(), - _ => unreachable!("checked by the \"ensure\" above"), - }; + let value = field_column.value(row); + let timestamp = ts_column.value(row); let sample = Sample { value, timestamp }; 
timeseries.samples.push(sample); @@ -579,6 +577,7 @@ mod tests { use api::prom_store::remote::LabelMatcher; use api::v1::{ColumnDataType, Row, SemanticType}; use datafusion::prelude::SessionContext; + use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; use datatypes::vectors::{Float64Vector, StringVector, TimestampMillisecondVector}; use table::table::adapter::DfTableProviderAdapter; @@ -653,7 +652,7 @@ mod tests { let table_provider = Arc::new(DfTableProviderAdapter::new(table)); let dataframe = ctx.read_table(table_provider.clone()).unwrap(); - let plan = query_to_plan(DataFrame::DataFusion(dataframe), &q).unwrap(); + let plan = query_to_plan(dataframe, &q).unwrap(); let display_string = format!("{}", plan.display_indent()); let ts_col = greptime_timestamp(); @@ -687,7 +686,7 @@ mod tests { }; let dataframe = ctx.read_table(table_provider).unwrap(); - let plan = query_to_plan(DataFrame::DataFusion(dataframe), &q).unwrap(); + let plan = query_to_plan(dataframe, &q).unwrap(); let display_string = format!("{}", plan.display_indent()); let ts_col = greptime_timestamp(); diff --git a/src/servers/src/prometheus.rs b/src/servers/src/prometheus.rs index 0fbdb97c34..64653570de 100644 --- a/src/servers/src/prometheus.rs +++ b/src/servers/src/prometheus.rs @@ -16,10 +16,10 @@ use catalog::system_schema::information_schema::tables::{ ENGINE as TABLE_ENGINE, TABLE_CATALOG, TABLE_NAME, TABLE_SCHEMA, }; use common_telemetry::tracing; +use datafusion::dataframe::DataFrame; use datafusion::prelude::{Expr, col, lit, regexp_match}; use datafusion_expr::LogicalPlan; use promql_parser::label::{MatchOp, Matcher}; -use query::dataframe::DataFrame; use session::context::QueryContextRef; use snafu::ResultExt; @@ -71,7 +71,6 @@ pub fn metric_name_matchers_to_plan( // Safety: conditions MUST not be empty, reduce always return Some(expr). let conditions = conditions.into_iter().reduce(Expr::and).unwrap(); - let DataFrame::DataFusion(dataframe) = dataframe; let dataframe = dataframe .filter(conditions) .context(error::DataFrameSnafu)? diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index d41f68555b..60efe69faa 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -210,6 +210,7 @@ pub trait JaegerQueryHandler { trace_id: &str, start_time: Option, end_time: Option, + limit: Option, ) -> Result; /// Find traces by query params. It's used for `/api/traces` API. diff --git a/src/servers/src/query_handler/grpc.rs b/src/servers/src/query_handler/grpc.rs index 305fde4448..2403c82905 100644 --- a/src/servers/src/query_handler/grpc.rs +++ b/src/servers/src/query_handler/grpc.rs @@ -12,21 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. 
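The prom_store change above drops the per-row Value conversion in favor of direct typed access to the Arrow arrays. A minimal sketch of that pattern, assuming a millisecond timestamp column and a float64 value column as the remote-read path expects (read_sample is an illustrative helper, not part of the patch):

use arrow::array::{Array, AsArray};
use arrow::datatypes::{Float64Type, TimestampMillisecondType};

// Downcast the columns once, then read rows directly; returns None if either
// column does not have the expected Arrow type.
fn read_sample(ts_col: &dyn Array, value_col: &dyn Array, row: usize) -> Option<(i64, f64)> {
    let ts = ts_col.as_primitive_opt::<TimestampMillisecondType>()?;
    let value = value_col.as_primitive_opt::<Float64Type>()?;
    Some((ts.value(row), value.value(row)))
}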
+use std::pin::Pin; use std::sync::Arc; use api::v1::greptime_request::Request; -use arrow_flight::FlightData; use async_trait::async_trait; use common_base::AffectedRows; use common_error::ext::{BoxedError, ErrorExt}; -use common_grpc::flight::FlightDecoder; +use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; +use futures::Stream; use session::context::QueryContextRef; use snafu::ResultExt; use table::TableRef; -use table::table_name::TableName; use crate::error::{self, Result}; +use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream}; pub type GrpcQueryHandlerRef = Arc + Send + Sync>; pub type ServerGrpcQueryHandlerRef = GrpcQueryHandlerRef; @@ -45,12 +46,16 @@ pub trait GrpcQueryHandler { async fn put_record_batch( &self, - table_name: &TableName, + request: PutRecordBatchRequest, table_ref: &mut Option, - decoder: &mut FlightDecoder, - flight_data: FlightData, ctx: QueryContextRef, ) -> std::result::Result; + + fn handle_put_record_batch_stream( + &self, + stream: PutRecordBatchRequestStream, + ctx: QueryContextRef, + ) -> Pin> + Send>>; } pub struct ServerGrpcQueryHandlerAdapter(GrpcQueryHandlerRef); @@ -78,16 +83,31 @@ where async fn put_record_batch( &self, - table_name: &TableName, + request: PutRecordBatchRequest, table_ref: &mut Option, - decoder: &mut FlightDecoder, - data: FlightData, ctx: QueryContextRef, ) -> Result { self.0 - .put_record_batch(table_name, table_ref, decoder, data, ctx) + .put_record_batch(request, table_ref, ctx) .await .map_err(BoxedError::new) .context(error::ExecuteGrpcRequestSnafu) } + + fn handle_put_record_batch_stream( + &self, + stream: PutRecordBatchRequestStream, + ctx: QueryContextRef, + ) -> Pin> + Send>> { + use futures_util::StreamExt; + Box::pin( + self.0 + .handle_put_record_batch_stream(stream, ctx) + .map(|result| { + result + .map_err(|e| BoxedError::new(e)) + .context(error::ExecuteGrpcRequestSnafu) + }), + ) + } } diff --git a/src/servers/src/request_limiter.rs b/src/servers/src/request_limiter.rs index 62fb4cf216..a93104581f 100644 --- a/src/servers/src/request_limiter.rs +++ b/src/servers/src/request_limiter.rs @@ -133,6 +133,8 @@ impl Drop for RequestMemoryGuard { #[cfg(test)] mod tests { + use tokio::sync::Barrier; + use super::*; #[test] @@ -188,21 +190,33 @@ mod tests { #[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn test_limiter_concurrent() { let limiter = RequestMemoryLimiter::new(1000); + let barrier = Arc::new(Barrier::new(11)); // 10 tasks + main let mut handles = vec![]; // Spawn 10 tasks each trying to acquire 200 bytes for _ in 0..10 { let limiter_clone = limiter.clone(); - let handle = tokio::spawn(async move { limiter_clone.try_acquire(200) }); + let barrier_clone = barrier.clone(); + let handle = tokio::spawn(async move { + barrier_clone.wait().await; + limiter_clone.try_acquire(200) + }); handles.push(handle); } + // Let all tasks start together + barrier.wait().await; + let mut success_count = 0; let mut fail_count = 0; + let mut guards = Vec::new(); for handle in handles { match handle.await.unwrap() { - Ok(Some(_)) => success_count += 1, + Ok(Some(guard)) => { + success_count += 1; + guards.push(guard); + } Err(_) => fail_count += 1, Ok(None) => unreachable!(), } @@ -211,5 +225,6 @@ mod tests { // Only 5 tasks should succeed (5 * 200 = 1000) assert_eq!(success_count, 5); assert_eq!(fail_count, 5); + drop(guards); } } diff --git a/src/servers/src/tls.rs b/src/servers/src/tls.rs index 245bf4c71a..ba4025ab74 100644 --- a/src/servers/src/tls.rs +++ 
b/src/servers/src/tls.rs @@ -15,12 +15,10 @@ use std::fs::File; use std::io::{BufReader, Error as IoError, ErrorKind}; use std::path::Path; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::mpsc::channel; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; -use common_telemetry::{error, info}; -use notify::{EventKind, RecursiveMode, Watcher}; +use common_grpc::reloadable_tls::{ReloadableTlsConfig, TlsConfigLoader}; +use common_telemetry::error; use rustls::ServerConfig; use rustls_pemfile::{Item, certs, read_one}; use rustls_pki_types::{CertificateDer, PrivateKeyDer}; @@ -28,7 +26,7 @@ use serde::{Deserialize, Serialize}; use snafu::ResultExt; use strum::EnumString; -use crate::error::{FileWatchSnafu, InternalIoSnafu, Result}; +use crate::error::{InternalIoSnafu, Result}; /// TlsMode is used for Mysql and Postgres server start up. #[derive(Debug, Default, Serialize, Deserialize, Clone, PartialEq, Eq, EnumString)] @@ -68,7 +66,12 @@ pub struct TlsOption { } impl TlsOption { - pub fn new(mode: Option, cert_path: Option, key_path: Option) -> Self { + pub fn new( + mode: Option, + cert_path: Option, + key_path: Option, + watch: bool, + ) -> Self { let mut tls_option = TlsOption::default(); if let Some(mode) = mode { @@ -83,9 +86,52 @@ impl TlsOption { tls_option.key_path = key_path }; + tls_option.watch = watch; + tls_option } + /// Validates the TLS configuration. + /// + /// Returns an error if: + /// - TLS mode is enabled (not `Disable`) but `cert_path` or `key_path` is empty + /// - TLS mode is `VerifyCa` or `VerifyFull` but `ca_cert_path` is empty + pub fn validate(&self) -> Result<()> { + if self.mode == TlsMode::Disable { + return Ok(()); + } + + // When TLS is enabled, cert_path and key_path are required + if self.cert_path.is_empty() { + return Err(crate::error::Error::Internal { + err_msg: format!( + "TLS mode is {:?} but cert_path is not configured", + self.mode + ), + }); + } + + if self.key_path.is_empty() { + return Err(crate::error::Error::Internal { + err_msg: format!("TLS mode is {:?} but key_path is not configured", self.mode), + }); + } + + // For VerifyCa and VerifyFull modes, ca_cert_path is required for client verification + if matches!(self.mode, TlsMode::VerifyCa | TlsMode::VerifyFull) + && self.ca_cert_path.is_empty() + { + return Err(crate::error::Error::Internal { + err_msg: format!( + "TLS mode is {:?} but ca_cert_path is not configured", + self.mode + ), + }); + } + + Ok(()) + } + pub fn setup(&self) -> Result> { if let TlsMode::Disable = self.mode { return Ok(None); @@ -142,96 +188,41 @@ impl TlsOption { } } -/// A mutable container for TLS server config -/// -/// This struct allows dynamic reloading of server certificates and keys -pub struct ReloadableTlsServerConfig { - tls_option: TlsOption, - config: RwLock>>, - version: AtomicUsize, +pub fn merge_tls_option(main: &TlsOption, other: TlsOption) -> TlsOption { + if other.mode != TlsMode::Disable && other.validate().is_ok() { + return other; + } + main.clone() } -impl ReloadableTlsServerConfig { - /// Create server config by loading configuration from `TlsOption` - pub fn try_new(tls_option: TlsOption) -> Result { - let server_config = tls_option.setup()?; - Ok(Self { - tls_option, - config: RwLock::new(server_config.map(Arc::new)), - version: AtomicUsize::new(0), - }) +impl TlsConfigLoader> for TlsOption { + type Error = crate::error::Error; + + fn load(&self) -> Result>> { + Ok(self.setup()?.map(Arc::new)) } - /// Reread server certificates and keys from file system. 
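A hedged usage sketch of the new TlsOption::new signature (note the added watch flag) and the validate rules introduced above; the certificate paths are placeholders and the snippet assumes the servers crate's tls module is importable as shown.

use servers::tls::{TlsMode, TlsOption};

fn main() {
    let tls = TlsOption::new(
        Some(TlsMode::VerifyCa),
        Some("/etc/greptimedb/server.crt".to_string()),
        Some("/etc/greptimedb/server.key".to_string()),
        true, // watch cert/key files and hot-reload on change
    );
    // VerifyCa also requires ca_cert_path, so validation fails here until a
    // CA certificate is configured.
    assert!(tls.validate().is_err());
}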
- pub fn reload(&self) -> Result<()> { - let server_config = self.tls_option.setup()?; - *self.config.write().unwrap() = server_config.map(Arc::new); - self.version.fetch_add(1, Ordering::Relaxed); - Ok(()) + fn watch_paths(&self) -> Vec<&Path> { + vec![self.cert_path(), self.key_path()] } - /// Get the server config hold by this container - pub fn get_server_config(&self) -> Option> { - self.config.read().unwrap().clone() - } - - /// Get associated `TlsOption` - pub fn get_tls_option(&self) -> &TlsOption { - &self.tls_option - } - - /// Get version of current config - /// - /// this version will auto increase when server config get reloaded. - pub fn get_version(&self) -> usize { - self.version.load(Ordering::Relaxed) + fn watch_enabled(&self) -> bool { + self.mode != TlsMode::Disable && self.watch } } -pub fn maybe_watch_tls_config(tls_server_config: Arc) -> Result<()> { - if !tls_server_config.get_tls_option().watch_enabled() { - return Ok(()); - } +/// Type alias for server-side reloadable TLS config +pub type ReloadableTlsServerConfig = ReloadableTlsConfig, TlsOption>; - let tls_server_config_for_watcher = tls_server_config.clone(); - - let (tx, rx) = channel::>(); - let mut watcher = notify::recommended_watcher(tx).context(FileWatchSnafu { path: "" })?; - - let cert_path = tls_server_config.get_tls_option().cert_path(); - watcher - .watch(cert_path, RecursiveMode::NonRecursive) - .with_context(|_| FileWatchSnafu { - path: cert_path.display().to_string(), - })?; - - let key_path = tls_server_config.get_tls_option().key_path(); - watcher - .watch(key_path, RecursiveMode::NonRecursive) - .with_context(|_| FileWatchSnafu { - path: key_path.display().to_string(), - })?; - - std::thread::spawn(move || { - let _watcher = watcher; - while let Ok(res) = rx.recv() { - if let Ok(event) = res { - match event.kind { - EventKind::Modify(_) | EventKind::Create(_) => { - info!("Detected TLS cert/key file change: {:?}", event); - if let Err(err) = tls_server_config_for_watcher.reload() { - error!(err; "Failed to reload TLS server config"); - } else { - info!("Reloaded TLS cert/key file successfully."); - } - } - _ => {} - } - } +/// Convenience function for watching server TLS configuration +pub fn maybe_watch_server_tls_config( + tls_server_config: Arc, +) -> Result<()> { + common_grpc::reloadable_tls::maybe_watch_tls_config(tls_server_config, || {}).map_err(|e| { + crate::error::Error::Internal { + err_msg: format!("Failed to watch TLS config: {}", e), } - }); - - Ok(()) + }) } #[cfg(test)] @@ -240,15 +231,130 @@ mod tests { use crate::install_ring_crypto_provider; use crate::tls::TlsMode::Disable; + #[test] + fn test_validate_disable_mode() { + let tls = TlsOption { + mode: TlsMode::Disable, + cert_path: String::new(), + key_path: String::new(), + ca_cert_path: String::new(), + watch: false, + }; + assert!(tls.validate().is_ok()); + } + + #[test] + fn test_validate_missing_cert_path() { + let tls = TlsOption { + mode: TlsMode::Require, + cert_path: String::new(), + key_path: "/path/to/key".to_string(), + ca_cert_path: String::new(), + watch: false, + }; + let err = tls.validate().unwrap_err(); + assert!(err.to_string().contains("cert_path")); + } + + #[test] + fn test_validate_missing_key_path() { + let tls = TlsOption { + mode: TlsMode::Require, + cert_path: "/path/to/cert".to_string(), + key_path: String::new(), + ca_cert_path: String::new(), + watch: false, + }; + let err = tls.validate().unwrap_err(); + assert!(err.to_string().contains("key_path")); + } + + #[test] + fn 
test_validate_require_mode_success() { + let tls = TlsOption { + mode: TlsMode::Require, + cert_path: "/path/to/cert".to_string(), + key_path: "/path/to/key".to_string(), + ca_cert_path: String::new(), + watch: false, + }; + assert!(tls.validate().is_ok()); + } + + #[test] + fn test_validate_verify_ca_missing_ca_cert() { + let tls = TlsOption { + mode: TlsMode::VerifyCa, + cert_path: "/path/to/cert".to_string(), + key_path: "/path/to/key".to_string(), + ca_cert_path: String::new(), + watch: false, + }; + let err = tls.validate().unwrap_err(); + assert!(err.to_string().contains("ca_cert_path")); + } + + #[test] + fn test_validate_verify_full_missing_ca_cert() { + let tls = TlsOption { + mode: TlsMode::VerifyFull, + cert_path: "/path/to/cert".to_string(), + key_path: "/path/to/key".to_string(), + ca_cert_path: String::new(), + watch: false, + }; + let err = tls.validate().unwrap_err(); + assert!(err.to_string().contains("ca_cert_path")); + } + + #[test] + fn test_validate_verify_ca_success() { + let tls = TlsOption { + mode: TlsMode::VerifyCa, + cert_path: "/path/to/cert".to_string(), + key_path: "/path/to/key".to_string(), + ca_cert_path: "/path/to/ca".to_string(), + watch: false, + }; + assert!(tls.validate().is_ok()); + } + + #[test] + fn test_validate_verify_full_success() { + let tls = TlsOption { + mode: TlsMode::VerifyFull, + cert_path: "/path/to/cert".to_string(), + key_path: "/path/to/key".to_string(), + ca_cert_path: "/path/to/ca".to_string(), + watch: false, + }; + assert!(tls.validate().is_ok()); + } + + #[test] + fn test_validate_prefer_mode() { + let tls = TlsOption { + mode: TlsMode::Prefer, + cert_path: "/path/to/cert".to_string(), + key_path: "/path/to/key".to_string(), + ca_cert_path: String::new(), + watch: false, + }; + assert!(tls.validate().is_ok()); + } + #[test] fn test_new_tls_option() { - assert_eq!(TlsOption::default(), TlsOption::new(None, None, None)); + assert_eq!( + TlsOption::default(), + TlsOption::new(None, None, None, false) + ); assert_eq!( TlsOption { mode: Disable, ..Default::default() }, - TlsOption::new(Some(Disable), None, None) + TlsOption::new(Some(Disable), None, None, false) ); assert_eq!( TlsOption { @@ -261,7 +367,8 @@ mod tests { TlsOption::new( Some(Disable), Some("/path/to/cert_path".to_string()), - Some("/path/to/key_path".to_string()) + Some("/path/to/key_path".to_string()), + false ) ); } @@ -423,10 +530,11 @@ mod tests { let server_config = Arc::new( ReloadableTlsServerConfig::try_new(server_tls).expect("failed to create server config"), ); - maybe_watch_tls_config(server_config.clone()).expect("failed to watch server config"); + maybe_watch_server_tls_config(server_config.clone()) + .expect("failed to watch server config"); assert_eq!(0, server_config.get_version()); - assert!(server_config.get_server_config().is_some()); + assert!(server_config.get_config().is_some()); let tmp_file = key_path.with_extension("tmp"); std::fs::copy("tests/ssl/server-pkcs8.key", &tmp_file) @@ -448,6 +556,6 @@ mod tests { assert!(version_updated, "TLS config did not reload in time"); assert!(server_config.get_version() > 0); - assert!(server_config.get_server_config().is_some()); + assert!(server_config.get_config().is_some()); } } diff --git a/src/servers/tests/mod.rs b/src/servers/tests/mod.rs index 7d6268215c..3f85b6d3ad 100644 --- a/src/servers/tests/mod.rs +++ b/src/servers/tests/mod.rs @@ -16,12 +16,11 @@ use std::sync::Arc; use api::v1::greptime_request::Request; use api::v1::query_request::Query; -use arrow_flight::FlightData; use 
async_trait::async_trait; use catalog::memory::MemoryCatalogManager; use common_base::AffectedRows; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; -use common_grpc::flight::FlightDecoder; +use common_grpc::flight::do_put::DoPutResponse; use common_query::Output; use datafusion_expr::LogicalPlan; use query::options::QueryOptions; @@ -35,7 +34,6 @@ use session::context::QueryContextRef; use snafu::ensure; use sql::statements::statement::Statement; use table::TableRef; -use table::table_name::TableName; mod http; mod interceptor; @@ -165,14 +163,22 @@ impl GrpcQueryHandler for DummyInstance { async fn put_record_batch( &self, - _table_name: &TableName, + _request: servers::grpc::flight::PutRecordBatchRequest, _table_ref: &mut Option, - _decoder: &mut FlightDecoder, - _data: FlightData, _ctx: QueryContextRef, ) -> std::result::Result { unimplemented!() } + + fn handle_put_record_batch_stream( + &self, + _stream: servers::grpc::flight::PutRecordBatchRequestStream, + _ctx: QueryContextRef, + ) -> std::pin::Pin< + Box> + Send>, + > { + unimplemented!() + } } fn create_testing_instance(table: TableRef) -> DummyInstance { diff --git a/src/session/Cargo.toml b/src/session/Cargo.toml index d6ee98650f..5b8b60f5ab 100644 --- a/src/session/Cargo.toml +++ b/src/session/Cargo.toml @@ -24,6 +24,6 @@ common-telemetry.workspace = true common-time.workspace = true datafusion-common.workspace = true derive_builder.workspace = true -derive_more = { version = "1", default-features = false, features = ["debug"] } +derive_more.workspace = true snafu.workspace = true sql.workspace = true diff --git a/src/session/src/lib.rs b/src/session/src/lib.rs index 8696419014..8d2a3e2141 100644 --- a/src/session/src/lib.rs +++ b/src/session/src/lib.rs @@ -18,7 +18,7 @@ pub mod protocol_ctx; pub mod session_config; pub mod table_name; -use std::collections::HashMap; +use std::collections::{HashMap, VecDeque}; use std::net::SocketAddr; use std::sync::{Arc, RwLock}; use std::time::Duration; @@ -35,6 +35,9 @@ use derive_more::Debug; use crate::context::{Channel, ConnInfo, QueryContextRef}; +/// Maximum number of warnings to store per session (similar to MySQL's max_error_count) +const MAX_WARNINGS: usize = 64; + /// Session for persistent connection such as MySQL, PostgreSQL etc. #[derive(Debug)] pub struct Session { @@ -58,6 +61,8 @@ pub(crate) struct MutableInner { read_preference: ReadPreference, #[debug(skip)] pub(crate) cursors: HashMap>, + /// Warning messages for MySQL SHOW WARNINGS support + warnings: VecDeque, } impl Default for MutableInner { @@ -69,6 +74,7 @@ impl Default for MutableInner { query_timeout: None, read_preference: ReadPreference::Leader, cursors: HashMap::with_capacity(0), + warnings: VecDeque::new(), } } } @@ -156,4 +162,35 @@ impl Session { pub fn process_id(&self) -> u32 { self.process_id } + + pub fn warnings_count(&self) -> usize { + self.mutable_inner.read().unwrap().warnings.len() + } + + pub fn warnings(&self) -> Vec { + self.mutable_inner + .read() + .unwrap() + .warnings + .iter() + .cloned() + .collect() + } + + /// Add a warning message. If the limit is reached, discard the oldest warning. 
+ pub fn add_warning(&self, warning: String) { + let mut inner = self.mutable_inner.write().unwrap(); + if inner.warnings.len() >= MAX_WARNINGS { + inner.warnings.pop_front(); + } + inner.warnings.push_back(warning); + } + + pub fn clear_warnings(&self) { + let mut inner = self.mutable_inner.write().unwrap(); + if inner.warnings.is_empty() { + return; + } + inner.warnings.clear(); + } } diff --git a/src/sql/src/error.rs b/src/sql/src/error.rs index 46fbd29d1a..cb7a71f0e4 100644 --- a/src/sql/src/error.rs +++ b/src/sql/src/error.rs @@ -285,6 +285,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to set VECTOR index option"))] + SetVectorIndexOption { + source: datatypes::error::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Invalid partition number: {}, should be in range [2, 65536]", partition_num @@ -394,7 +401,9 @@ impl ErrorExt for Error { ConvertValue { .. } => StatusCode::Unsupported, PermissionDenied { .. } => StatusCode::PermissionDenied, - SetFulltextOption { .. } | SetSkippingIndexOption { .. } => StatusCode::Unexpected, + SetFulltextOption { .. } + | SetSkippingIndexOption { .. } + | SetVectorIndexOption { .. } => StatusCode::Unexpected, } } diff --git a/src/sql/src/parser.rs b/src/sql/src/parser.rs index 6c2a7e11ab..50cd62360d 100644 --- a/src/sql/src/parser.rs +++ b/src/sql/src/parser.rs @@ -163,6 +163,8 @@ impl ParserContext<'_> { Keyword::TRUNCATE => self.parse_truncate(), + Keyword::COMMENT => self.parse_comment(), + Keyword::SET => self.parse_set_variables(), Keyword::ADMIN => self.parse_admin_command(), @@ -353,7 +355,8 @@ mod tests { let ts_col = columns.first().unwrap(); assert_eq!( expected_type, - sql_data_type_to_concrete_data_type(ts_col.data_type()).unwrap() + sql_data_type_to_concrete_data_type(ts_col.data_type(), &Default::default()) + .unwrap() ); } _ => unreachable!(), diff --git a/src/sql/src/parsers.rs b/src/sql/src/parsers.rs index e3c41c49b2..7d68d5d1ce 100644 --- a/src/sql/src/parsers.rs +++ b/src/sql/src/parsers.rs @@ -14,6 +14,7 @@ pub(crate) mod admin_parser; mod alter_parser; +pub(crate) mod comment_parser; pub(crate) mod copy_parser; pub(crate) mod create_parser; pub(crate) mod cursor_parser; diff --git a/src/sql/src/parsers/comment_parser.rs b/src/sql/src/parsers/comment_parser.rs new file mode 100644 index 0000000000..bd2cb53c5d --- /dev/null +++ b/src/sql/src/parsers/comment_parser.rs @@ -0,0 +1,196 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
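The session-level warning buffer added above is a bounded queue capped at MAX_WARNINGS, evicting the oldest entry when full (similar to MySQL's max_error_count). A minimal standalone sketch of that eviction rule follows; push_warning is an illustrative helper, and the real code guards the queue behind the session's RwLock.

use std::collections::VecDeque;

// Matches the cap used in the patch above.
const MAX_WARNINGS: usize = 64;

// Push a warning, evicting the oldest entry once the cap is reached.
fn push_warning(warnings: &mut VecDeque<String>, warning: String) {
    if warnings.len() >= MAX_WARNINGS {
        warnings.pop_front();
    }
    warnings.push_back(warning);
}

fn main() {
    let mut warnings = VecDeque::new();
    for i in 0..100 {
        push_warning(&mut warnings, format!("warning {i}"));
    }
    assert_eq!(warnings.len(), MAX_WARNINGS);
    // The first 36 warnings were evicted; the oldest remaining one is #36.
    assert_eq!(warnings.front().map(String::as_str), Some("warning 36"));
}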
+ +use snafu::{ResultExt, ensure}; +use sqlparser::ast::ObjectName; +use sqlparser::keywords::Keyword; +use sqlparser::tokenizer::Token; + +use crate::ast::{Ident, ObjectNamePart}; +use crate::error::{self, InvalidSqlSnafu, Result}; +use crate::parser::{FLOW, ParserContext}; +use crate::statements::comment::{Comment, CommentObject}; +use crate::statements::statement::Statement; + +impl ParserContext<'_> { + pub(crate) fn parse_comment(&mut self) -> Result { + let _ = self.parser.next_token(); // consume COMMENT + + if !self.parser.parse_keyword(Keyword::ON) { + return self.expected("ON", self.parser.peek_token()); + } + + let target_token = self.parser.next_token(); + let comment = match target_token.token { + Token::Word(word) if word.keyword == Keyword::TABLE => { + let raw_table = + self.parse_object_name() + .with_context(|_| error::UnexpectedSnafu { + expected: "a table name", + actual: self.peek_token_as_string(), + })?; + let table = Self::canonicalize_object_name(raw_table)?; + CommentObject::Table(table) + } + Token::Word(word) if word.keyword == Keyword::COLUMN => { + self.parse_column_comment_target()? + } + Token::Word(word) + if word.keyword == Keyword::NoKeyword && word.value.eq_ignore_ascii_case(FLOW) => + { + let raw_flow = + self.parse_object_name() + .with_context(|_| error::UnexpectedSnafu { + expected: "a flow name", + actual: self.peek_token_as_string(), + })?; + let flow = Self::canonicalize_object_name(raw_flow)?; + CommentObject::Flow(flow) + } + _ => return self.expected("TABLE, COLUMN or FLOW", target_token), + }; + + if !self.parser.parse_keyword(Keyword::IS) { + return self.expected("IS", self.parser.peek_token()); + } + + let comment_value = if self.parser.parse_keyword(Keyword::NULL) { + None + } else { + Some( + self.parser + .parse_literal_string() + .context(error::SyntaxSnafu)?, + ) + }; + + Ok(Statement::Comment(Comment { + object: comment, + comment: comment_value, + })) + } + + fn parse_column_comment_target(&mut self) -> Result { + let raw = self + .parse_object_name() + .with_context(|_| error::UnexpectedSnafu { + expected: "a column reference", + actual: self.peek_token_as_string(), + })?; + let canonical = Self::canonicalize_object_name(raw)?; + + let mut parts = canonical.0; + ensure!( + parts.len() >= 2, + InvalidSqlSnafu { + msg: "COMMENT ON COLUMN expects .".to_string(), + } + ); + + let column_part = parts.pop().unwrap(); + let ObjectNamePart::Identifier(column_ident) = column_part else { + unreachable!("canonicalized object name should only contain identifiers"); + }; + + let column = ParserContext::canonicalize_identifier(column_ident); + + let mut table_idents: Vec = Vec::with_capacity(parts.len()); + for part in parts { + match part { + ObjectNamePart::Identifier(ident) => table_idents.push(ident), + ObjectNamePart::Function(_) => { + unreachable!("canonicalized object name should only contain identifiers") + } + } + } + + ensure!( + !table_idents.is_empty(), + InvalidSqlSnafu { + msg: "Table name is required before column name".to_string(), + } + ); + + let table = ObjectName::from(table_idents); + + Ok(CommentObject::Column { table, column }) + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use crate::dialect::GreptimeDbDialect; + use crate::parser::{ParseOptions, ParserContext}; + use crate::statements::comment::CommentObject; + use crate::statements::statement::Statement; + + fn parse(sql: &str) -> Statement { + let mut stmts = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, 
ParseOptions::default()) + .unwrap(); + assert_eq!(stmts.len(), 1); + stmts.pop().unwrap() + } + + #[test] + fn test_parse_comment_on_table() { + let stmt = parse("COMMENT ON TABLE mytable IS 'test';"); + match stmt { + Statement::Comment(comment) => { + assert_matches!(comment.object, CommentObject::Table(ref name) if name.to_string() == "mytable"); + assert_eq!(comment.comment.as_deref(), Some("test")); + } + _ => panic!("expected comment statement"), + } + + let stmt = parse("COMMENT ON TABLE mytable IS NULL;"); + match stmt { + Statement::Comment(comment) => { + assert_matches!(comment.object, CommentObject::Table(ref name) if name.to_string() == "mytable"); + assert!(comment.comment.is_none()); + } + _ => panic!("expected comment statement"), + } + } + + #[test] + fn test_parse_comment_on_column() { + let stmt = parse("COMMENT ON COLUMN my_schema.my_table.my_col IS 'desc';"); + match stmt { + Statement::Comment(comment) => match comment.object { + CommentObject::Column { table, column } => { + assert_eq!(table.to_string(), "my_schema.my_table"); + assert_eq!(column.value, "my_col"); + assert_eq!(comment.comment.as_deref(), Some("desc")); + } + _ => panic!("expected column comment"), + }, + _ => panic!("expected comment statement"), + } + } + + #[test] + fn test_parse_comment_on_flow() { + let stmt = parse("COMMENT ON FLOW my_flow IS 'desc';"); + match stmt { + Statement::Comment(comment) => { + assert_matches!(comment.object, CommentObject::Flow(ref name) if name.to_string() == "my_flow"); + assert_eq!(comment.comment.as_deref(), Some("desc")); + } + _ => panic!("expected comment statement"), + } + } +} diff --git a/src/sql/src/parsers/create_parser.rs b/src/sql/src/parsers/create_parser.rs index 157f554071..fe68a07669 100644 --- a/src/sql/src/parsers/create_parser.rs +++ b/src/sql/src/parsers/create_parser.rs @@ -43,6 +43,7 @@ use crate::parser::{FLOW, ParserContext}; use crate::parsers::tql_parser; use crate::parsers::utils::{ self, validate_column_fulltext_create_option, validate_column_skipping_index_create_option, + validate_column_vector_index_create_option, }; use crate::statements::create::{ Column, ColumnExtensions, CreateDatabase, CreateExternalTable, CreateFlow, CreateTable, @@ -60,6 +61,7 @@ pub const EXPIRE: &str = "EXPIRE"; pub const AFTER: &str = "AFTER"; pub const INVERTED: &str = "INVERTED"; pub const SKIPPING: &str = "SKIPPING"; +pub const VECTOR: &str = "VECTOR"; pub type RawIntervalExpr = String; @@ -669,8 +671,7 @@ impl<'a> ParserContext<'a> { // Must immediately parse the JSON datatype format because it is closely after the "JSON" // datatype, like this: "JSON(format = ...)". 
if matches!(data_type, DataType::JSON) { - let options = json::parse_json_datatype_options(parser)?; - extensions.json_datatype_options = Some(options); + extensions.json_datatype_options = json::parse_json_datatype_options(parser)?; } let mut options = vec![]; @@ -856,7 +857,7 @@ impl<'a> ParserContext<'a> { ); let column_type = get_unalias_type(column_type); - let data_type = sql_data_type_to_concrete_data_type(&column_type)?; + let data_type = sql_data_type_to_concrete_data_type(&column_type, column_extensions)?; ensure!( data_type == ConcreteDataType::string_datatype(), InvalidColumnOptionSnafu { @@ -929,6 +930,61 @@ impl<'a> ParserContext<'a> { is_index_declared |= true; } + // vector index + if let Token::Word(word) = parser.peek_token().token + && word.value.eq_ignore_ascii_case(VECTOR) + { + parser.next_token(); + // Consume `INDEX` keyword + ensure!( + parser.parse_keyword(Keyword::INDEX), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: "expect INDEX after VECTOR keyword", + } + ); + + ensure!( + column_extensions.vector_index_options.is_none(), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: "duplicated VECTOR INDEX option", + } + ); + + // Check that column is a vector type + let column_type = get_unalias_type(column_type); + let data_type = sql_data_type_to_concrete_data_type(&column_type, column_extensions)?; + ensure!( + matches!(data_type, ConcreteDataType::Vector(_)), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: "VECTOR INDEX only supports Vector type columns", + } + ); + + let options = parser + .parse_options(Keyword::WITH) + .context(error::SyntaxSnafu)? + .into_iter() + .map(parse_option_string) + .collect::>>()?; + + for (key, _) in options.iter() { + ensure!( + validate_column_vector_index_create_option(key), + InvalidColumnOptionSnafu { + name: column_name.to_string(), + msg: format!("invalid VECTOR INDEX option: {key}"), + } + ); + } + + let options = OptionMap::new(options); + column_extensions.vector_index_options = Some(options); + is_index_declared |= true; + } + Ok(is_index_declared) } @@ -2715,7 +2771,8 @@ CREATE TABLE log ( #[test] fn test_parse_column_extensions_vector() { - let sql = "VECTOR(128)"; + // Test that vector options are parsed from data_type (no additional SQL needed) + let sql = ""; let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, sql); let tokens = tokenizer.tokenize().unwrap(); @@ -2735,7 +2792,8 @@ CREATE TABLE log ( #[test] fn test_parse_column_extensions_vector_invalid() { - let sql = "VECTOR()"; + // Test that vector with no dimension fails + let sql = ""; let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, sql); let tokens = tokenizer.tokenize().unwrap(); @@ -2913,4 +2971,174 @@ CREATE TABLE log ( .unwrap(); assert_eq!("SELECT '10 seconds'::INTERVAL", &stmts[0].to_string()); } + + #[test] + fn test_parse_create_table_vector_index_options() { + // Test basic vector index + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX, +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::CreateTable(c) = &result[0] { + c.columns.iter().for_each(|col| { + if col.name().value == "vec" { + assert!( + col.extensions + .vector_index_options + .as_ref() + .unwrap() + .is_empty() + ); + } + }); + } else { + panic!("should be create_table statement"); + } + + // Test vector index with options + let sql = 
r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX WITH (metric='cosine', connectivity='32', expansion_add='256', expansion_search='128') +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + + if let Statement::CreateTable(c) = &result[0] { + c.columns.iter().for_each(|col| { + if col.name().value == "vec" { + let options = col.extensions.vector_index_options.as_ref().unwrap(); + assert_eq!(options.len(), 4); + assert_eq!(options.get("metric").unwrap(), "cosine"); + assert_eq!(options.get("connectivity").unwrap(), "32"); + assert_eq!(options.get("expansion_add").unwrap(), "256"); + assert_eq!(options.get("expansion_search").unwrap(), "128"); + } + }); + } else { + panic!("should be create_table statement"); + } + } + + #[test] + fn test_parse_create_table_vector_index_invalid_type() { + // Test vector index on non-vector type (should fail) + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + col INT VECTOR INDEX, +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("VECTOR INDEX only supports Vector type columns") + ); + } + + #[test] + fn test_parse_create_table_vector_index_duplicate() { + // Test duplicate vector index (should fail) + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX VECTOR INDEX, +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("duplicated VECTOR INDEX option") + ); + } + + #[test] + fn test_parse_create_table_vector_index_invalid_option() { + // Test invalid option key (should fail) + let sql = r" +CREATE TABLE vectors ( + ts TIMESTAMP TIME INDEX, + vec VECTOR(128) VECTOR INDEX WITH (metric='l2sq', invalid_option='foo') +)"; + let result = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("invalid VECTOR INDEX option") + ); + } + + #[test] + fn test_parse_column_extensions_vector_index() { + // Test vector index on vector type + { + let sql = "VECTOR INDEX WITH (metric = 'l2sq')"; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let mut parser = Parser::new(&dialect).with_tokens(tokens); + let name = Ident::new("vec_col"); + let data_type = + DataType::Custom(vec![Ident::new("VECTOR")].into(), vec!["128".to_string()]); + // First, parse the vector type to set vector_options + let mut extensions = ColumnExtensions { + vector_options: Some(OptionMap::from([( + VECTOR_OPT_DIM.to_string(), + "128".to_string(), + )])), + ..Default::default() + }; + + let result = ParserContext::parse_column_extensions( + &mut parser, + &name, + &data_type, + &mut extensions, + ); + assert!(result.is_ok()); + assert!(extensions.vector_index_options.is_some()); + let vi_options = extensions.vector_index_options.unwrap(); + assert_eq!(vi_options.get("metric"), Some("l2sq")); + } + + // Test vector index on non-vector type (should fail) + { + let sql = "VECTOR INDEX"; + let dialect = GenericDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, sql); + let tokens = tokenizer.tokenize().unwrap(); + let mut parser = 
Parser::new(&dialect).with_tokens(tokens); + let name = Ident::new("num_col"); + let data_type = DataType::Int(None); // Non-vector type + let mut extensions = ColumnExtensions::default(); + let result = ParserContext::parse_column_extensions( + &mut parser, + &name, + &data_type, + &mut extensions, + ); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("VECTOR INDEX only supports Vector type columns") + ); + } + } } diff --git a/src/sql/src/parsers/create_parser/json.rs b/src/sql/src/parsers/create_parser/json.rs index 1556205fef..649a91106a 100644 --- a/src/sql/src/parsers/create_parser/json.rs +++ b/src/sql/src/parsers/create_parser/json.rs @@ -20,7 +20,7 @@ use crate::error::{Result, SyntaxSnafu}; use crate::statements::OptionMap; use crate::util; -pub(super) fn parse_json_datatype_options(parser: &mut Parser<'_>) -> Result { +pub(super) fn parse_json_datatype_options(parser: &mut Parser<'_>) -> Result> { if parser.consume_token(&Token::LParen) { let result = parser .parse_comma_separated0(Parser::parse_sql_option, Token::RParen) @@ -32,9 +32,9 @@ pub(super) fn parse_json_datatype_options(parser: &mut Parser<'_>) -> Result>>() })?; parser.expect_token(&Token::RParen).context(SyntaxSnafu)?; - Ok(OptionMap::new(result)) + Ok(Some(OptionMap::new(result))) } else { - Ok(OptionMap::default()) + Ok(None) } } @@ -53,7 +53,7 @@ mod tests { #[test] fn test_parse_json_datatype_options() { - fn parse(sql: &str) -> OptionMap { + fn parse(sql: &str) -> Option { let Statement::CreateTable(mut create_table) = ParserContext::create_with_dialect( sql, &GreptimeDbDialect {}, @@ -72,8 +72,7 @@ mod tests { assert_eq!(column_def.data_type, DataType::JSON); assert!(column_def.options.is_empty()); - assert!(extensions.json_datatype_options.is_some()); - extensions.json_datatype_options.unwrap() + extensions.json_datatype_options } let sql = r#" @@ -81,7 +80,7 @@ CREATE TABLE json_data ( my_json JSON(format = "partial", unstructured_keys = ["k", "foo.bar", "a.b.c"]), ts TIMESTAMP TIME INDEX, )"#; - let options = parse(sql); + let options = parse(sql).unwrap(); assert_eq!(options.len(), 2); assert_eq!( options.value(JSON_OPT_FORMAT).and_then(|x| x.as_string()), @@ -100,7 +99,7 @@ CREATE TABLE json_data ( my_json JSON(format = "structured"), ts TIMESTAMP TIME INDEX, )"#; - let options = parse(sql); + let options = parse(sql).unwrap(); assert_eq!(options.len(), 1); assert_eq!( options.value(JSON_OPT_FORMAT).and_then(|x| x.as_string()), @@ -112,7 +111,7 @@ CREATE TABLE json_data ( my_json JSON(format = "raw"), ts TIMESTAMP TIME INDEX, )"#; - let options = parse(sql); + let options = parse(sql).unwrap(); assert_eq!(options.len(), 1); assert_eq!( options.value(JSON_OPT_FORMAT).and_then(|x| x.as_string()), @@ -124,7 +123,7 @@ CREATE TABLE json_data ( my_json JSON(), ts TIMESTAMP TIME INDEX, )"#; - let options = parse(sql); + let options = parse(sql).unwrap(); assert!(options.is_empty()); let sql = r#" @@ -133,6 +132,6 @@ CREATE TABLE json_data ( ts TIMESTAMP TIME INDEX, )"#; let options = parse(sql); - assert!(options.is_empty()); + assert!(options.is_none()); } } diff --git a/src/sql/src/parsers/create_parser/trigger.rs b/src/sql/src/parsers/create_parser/trigger.rs index 6b2b0c1eb7..d93cb26f33 100644 --- a/src/sql/src/parsers/create_parser/trigger.rs +++ b/src/sql/src/parsers/create_parser/trigger.rs @@ -25,7 +25,8 @@ pub const NOTIFY: &str = "NOTIFY"; pub const WEBHOOK: &str = "WEBHOOK"; pub const URL: &str = "URL"; pub const FOR: &str = "FOR"; -pub const KEEP_FIRING_FOR: 
&str = "KEEP_FIRING_FOR"; +pub const KEEP: &str = "KEEP"; +pub const FIRING: &str = "FIRING"; const TIMEOUT: &str = "timeout"; @@ -42,7 +43,7 @@ impl<'a> ParserContext<'a> { /// ON () /// EVERY /// [FOR ] - /// [KEEP_FIRING_FOR ] + /// [KEEP FIRING FOR ] /// [LABELS (=, ...)] /// [ANNOTATIONS (=, ...)] /// NOTIFY( @@ -90,14 +91,14 @@ impl<'a> ParserContext<'a> { self.parser.next_token(); r#for.replace(self.parse_trigger_for(true)?); } - Token::Word(w) if w.value.eq_ignore_ascii_case(KEEP_FIRING_FOR) => { + Token::Word(w) if w.value.eq_ignore_ascii_case(KEEP) => { self.parser.next_token(); keep_firing_for.replace(self.parse_trigger_keep_firing_for(true)?); } Token::EOF => break, _ => { return self.expected( - "`ON` or `LABELS` or `ANNOTATIONS` or `NOTIFY` keyword or `FOR` or `KEEP_FIRING_FOR`", + "`ON` or `LABELS` or `ANNOTATIONS` or `NOTIFY` keyword or `FOR` or `KEEP FIRING FOR`", next_token, ); } @@ -237,14 +238,33 @@ impl<'a> ParserContext<'a> { ) -> Result { if !is_first_keyword_matched { if let Token::Word(w) = self.parser.peek_token().token - && w.value.eq_ignore_ascii_case(KEEP_FIRING_FOR) + && w.value.eq_ignore_ascii_case(KEEP) { self.parser.next_token(); } else { - return self.expected("`KEEP_FIRING_FOR` keyword", self.parser.peek_token()); + return self.expected("`KEEP` keyword", self.parser.peek_token()); } } + if let Token::Word(w) = self.parser.peek_token().token + && w.value.eq_ignore_ascii_case(FIRING) + { + self.parser.next_token(); + } else { + return self.expected("`FIRING` keyword", self.parser.peek_token()); + } + + if let Token::Word(w) = self.parser.peek_token().token + && w.value.eq_ignore_ascii_case(FOR) + { + self.parser.next_token(); + } else { + return self.expected( + "`FOR` keyword after `KEEP FIRING`", + self.parser.peek_token(), + ); + } + let (month_day_nano, raw_expr) = self.parse_interval_month_day_nano()?; // Trigger Interval (month_day_nano): the months field is prohibited, @@ -252,7 +272,7 @@ impl<'a> ParserContext<'a> { ensure!( month_day_nano.months == 0, error::InvalidIntervalSnafu { - reason: "year and month is not supported in trigger KEEP_FIRING_FOR duration" + reason: "year and month is not supported in trigger KEEP FIRING FOR duration" .to_string() } ); @@ -529,7 +549,7 @@ IF NOT EXISTS cpu_monitor EVERY '5 minute'::INTERVAL LABELS (label_name=label_val) FOR '1ms'::INTERVAL - KEEP_FIRING_FOR '10 minute'::INTERVAL + KEEP FIRING FOR '10 minute'::INTERVAL ANNOTATIONS (annotation_name=annotation_val) NOTIFY( WEBHOOK alert_manager_1 URL 'http://127.0.0.1:9093' WITH (timeout='1m'), @@ -546,7 +566,7 @@ IF NOT EXISTS cpu_monitor ) LABELS (label_name=label_val) ANNOTATIONS (annotation_name=annotation_val) - KEEP_FIRING_FOR '10 minute'::INTERVAL + KEEP FIRING FOR '10 minute'::INTERVAL FOR '1ms'::INTERVAL ON (SELECT host AS host_label, cpu, memory FROM machine_monitor WHERE cpu > 1) EVERY '5 minute'::INTERVAL @@ -875,29 +895,29 @@ IF NOT EXISTS cpu_monitor #[test] fn test_parse_trigger_keep_firing_for() { // Normal. - let sql = "KEEP_FIRING_FOR '10 minute'::INTERVAL"; + let sql = "KEEP FIRING FOR '10 minute'::INTERVAL"; let mut ctx = ParserContext::new(&GreptimeDbDialect {}, sql).unwrap(); let expr = ctx.parse_trigger_keep_firing_for(false).unwrap(); assert_eq!(expr.duration, Duration::from_secs(600)); assert_eq!(expr.raw_expr, "'10 minute'::INTERVAL"); - // Invalid, missing KEEP_FIRING_FOR keyword. + // Invalid, missing KEEP FIRING FOR keywords. 
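One consequence of splitting the keyword worth noting: the clause is now three separate words matched case-insensitively, so the old single-token spelling no longer parses. A small crate-internal sketch (mirroring the tests around it, and assuming the same feature gating and visibility as the trigger parser itself):

```rust
use crate::dialect::GreptimeDbDialect;
use crate::parser::ParserContext;

fn keep_firing_for_spellings() {
    // New spelling; keyword matching uses eq_ignore_ascii_case, so lower case works too.
    let mut ctx =
        ParserContext::new(&GreptimeDbDialect {}, "keep firing for '10 minute'::INTERVAL").unwrap();
    assert!(ctx.parse_trigger_keep_firing_for(false).is_ok());

    // The old underscored single-token spelling is no longer recognized.
    let mut ctx =
        ParserContext::new(&GreptimeDbDialect {}, "KEEP_FIRING_FOR '10 minute'::INTERVAL").unwrap();
    assert!(ctx.parse_trigger_keep_firing_for(false).is_err());
}
```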
let sql = "'10 minute'::INTERVAL"; let mut ctx = ParserContext::new(&GreptimeDbDialect {}, sql).unwrap(); assert!(ctx.parse_trigger_keep_firing_for(false).is_err()); // Invalid, year not allowed. - let sql = "KEEP_FIRING_FOR '1 year'::INTERVAL"; + let sql = "KEEP FIRING FOR '1 year'::INTERVAL"; let mut ctx = ParserContext::new(&GreptimeDbDialect {}, sql).unwrap(); assert!(ctx.parse_trigger_keep_firing_for(false).is_err()); // Invalid, month not allowed. - let sql = "KEEP_FIRING_FOR '1 month'::INTERVAL"; + let sql = "KEEP FIRING FOR '1 month'::INTERVAL"; let mut ctx = ParserContext::new(&GreptimeDbDialect {}, sql).unwrap(); assert!(ctx.parse_trigger_keep_firing_for(false).is_err()); // Valid, interval less than 1 second is clamped. - let sql = "KEEP_FIRING_FOR '1ms'::INTERVAL"; + let sql = "KEEP FIRING FOR '1ms'::INTERVAL"; let mut ctx = ParserContext::new(&GreptimeDbDialect {}, sql).unwrap(); let expr = ctx.parse_trigger_keep_firing_for(false).unwrap(); assert_eq!(expr.duration, Duration::from_secs(1)); diff --git a/src/sql/src/parsers/utils.rs b/src/sql/src/parsers/utils.rs index 5938018082..74bb6bd803 100644 --- a/src/sql/src/parsers/utils.rs +++ b/src/sql/src/parsers/utils.rs @@ -222,6 +222,29 @@ pub fn validate_column_skipping_index_create_option(key: &str) -> bool { .contains(&key) } +/// Valid options for VECTOR INDEX: +/// - engine: Vector index engine (usearch) +/// - metric: Distance metric (l2sq, cosine, inner_product) +/// - connectivity: HNSW M parameter +/// - expansion_add: ef_construction parameter +/// - expansion_search: ef_search parameter +pub const COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE: &str = "engine"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_METRIC: &str = "metric"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY: &str = "connectivity"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD: &str = "expansion_add"; +pub const COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH: &str = "expansion_search"; + +pub fn validate_column_vector_index_create_option(key: &str) -> bool { + [ + COLUMN_VECTOR_INDEX_OPT_KEY_ENGINE, + COLUMN_VECTOR_INDEX_OPT_KEY_METRIC, + COLUMN_VECTOR_INDEX_OPT_KEY_CONNECTIVITY, + COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_ADD, + COLUMN_VECTOR_INDEX_OPT_KEY_EXPANSION_SEARCH, + ] + .contains(&key) +} + /// Convert an [`IntervalMonthDayNano`] to a [`Duration`]. 
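The allow-list just added to utils.rs is what create_parser.rs consults when it validates a column's `VECTOR INDEX WITH(...)` clause. A simplified mirror of that check (plain `String` errors instead of the snafu-based ones in the real parser):

```rust
use crate::parsers::utils::validate_column_vector_index_create_option;

fn reject_unknown_keys(keys: &[&str]) -> Result<(), String> {
    for key in keys {
        // Only engine / metric / connectivity / expansion_add / expansion_search pass.
        if !validate_column_vector_index_create_option(key) {
            return Err(format!("invalid VECTOR INDEX option: {key}"));
        }
    }
    Ok(())
}

// reject_unknown_keys(&["metric", "connectivity"]) => Ok(())
// reject_unknown_keys(&["metric", "m"])            => Err("invalid VECTOR INDEX option: m")
```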
#[cfg(feature = "enterprise")] pub fn convert_month_day_nano_to_duration( diff --git a/src/sql/src/statements.rs b/src/sql/src/statements.rs index b493375373..a6ee60164a 100644 --- a/src/sql/src/statements.rs +++ b/src/sql/src/statements.rs @@ -14,6 +14,7 @@ pub mod admin; pub mod alter; +pub mod comment; pub mod copy; pub mod create; pub mod cursor; @@ -41,7 +42,8 @@ use common_time::timezone::Timezone; use datatypes::extension::json::{JsonExtensionType, JsonMetadata}; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema}; -use datatypes::types::TimestampType; +use datatypes::types::json_type::JsonNativeType; +use datatypes::types::{JsonFormat, JsonType, TimestampType}; use datatypes::value::Value; use snafu::ResultExt; use sqlparser::ast::{ExactNumberInfo, Ident}; @@ -53,9 +55,9 @@ use crate::ast::{ use crate::error::{ self, ConvertToGrpcDataTypeSnafu, ConvertValueSnafu, Result, SerializeColumnDefaultConstraintSnafu, SetFulltextOptionSnafu, SetJsonStructureSettingsSnafu, - SetSkippingIndexOptionSnafu, SqlCommonSnafu, + SetSkippingIndexOptionSnafu, SetVectorIndexOptionSnafu, SqlCommonSnafu, }; -use crate::statements::create::Column; +use crate::statements::create::{Column, ColumnExtensions}; pub use crate::statements::option_map::OptionMap; pub(crate) use crate::statements::transform::transform_statements; @@ -109,7 +111,7 @@ pub fn column_to_schema( && !is_time_index; let name = column.name().value.clone(); - let data_type = sql_data_type_to_concrete_data_type(column.data_type())?; + let data_type = sql_data_type_to_concrete_data_type(column.data_type(), &column.extensions)?; let default_constraint = parse_column_default_constraint(&name, &data_type, column.options(), timezone) .context(SqlCommonSnafu)?; @@ -145,6 +147,12 @@ pub fn column_to_schema( .context(SetSkippingIndexOptionSnafu)?; } + if let Some(options) = column.extensions.build_vector_index_options()? 
{ + column_schema = column_schema + .with_vector_index_options(&options) + .context(SetVectorIndexOptionSnafu)?; + } + column_schema.set_inverted_index(column.extensions.inverted_index_options.is_some()); if matches!(column.data_type(), SqlDataType::JSON) { @@ -171,7 +179,7 @@ pub fn sql_column_def_to_grpc_column_def( timezone: Option<&Timezone>, ) -> Result { let name = col.name.value.clone(); - let data_type = sql_data_type_to_concrete_data_type(&col.data_type)?; + let data_type = sql_data_type_to_concrete_data_type(&col.data_type, &Default::default())?; let is_nullable = col .options @@ -217,7 +225,10 @@ pub fn sql_column_def_to_grpc_column_def( }) } -pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result { +pub fn sql_data_type_to_concrete_data_type( + data_type: &SqlDataType, + column_extensions: &ColumnExtensions, +) -> Result { match data_type { SqlDataType::BigInt(_) | SqlDataType::Int64 => Ok(ConcreteDataType::int64_datatype()), SqlDataType::BigIntUnsigned(_) => Ok(ConcreteDataType::uint64_datatype()), @@ -269,7 +280,14 @@ pub fn sql_data_type_to_concrete_data_type(data_type: &SqlDataType) -> Result Ok(ConcreteDataType::json_datatype()), + SqlDataType::JSON => { + let format = if column_extensions.json_datatype_options.is_some() { + JsonFormat::Native(Box::new(JsonNativeType::Null)) + } else { + JsonFormat::Jsonb + }; + Ok(ConcreteDataType::Json(JsonType::new(format))) + } // Vector type SqlDataType::Custom(name, d) if name.0.as_slice().len() == 1 @@ -354,7 +372,7 @@ mod tests { fn check_type(sql_type: SqlDataType, data_type: ConcreteDataType) { assert_eq!( data_type, - sql_data_type_to_concrete_data_type(&sql_type).unwrap() + sql_data_type_to_concrete_data_type(&sql_type, &Default::default()).unwrap() ); } @@ -698,6 +716,7 @@ mod tests { skipping_index_options: None, inverted_index_options: None, json_datatype_options: None, + vector_index_options: None, }, }; @@ -708,4 +727,82 @@ mod tests { assert_eq!(fulltext_options.analyzer, FulltextAnalyzer::English); assert!(fulltext_options.case_sensitive); } + + #[test] + fn test_column_to_schema_with_vector_index() { + use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType}; + + // Test with custom metric and parameters + let column = Column { + column_def: ColumnDef { + name: "embedding".into(), + data_type: SqlDataType::Custom( + vec![Ident::new(VECTOR_TYPE_NAME)].into(), + vec!["128".to_string()], + ), + options: vec![], + }, + extensions: ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([ + ("metric".to_string(), "cosine".to_string()), + ("connectivity".to_string(), "32".to_string()), + ("expansion_add".to_string(), "200".to_string()), + ("expansion_search".to_string(), "100".to_string()), + ])), + }, + }; + + let column_schema = column_to_schema(&column, "ts", None).unwrap(); + assert_eq!("embedding", column_schema.name); + assert!(column_schema.is_vector_indexed()); + + let vector_options = column_schema.vector_index_options().unwrap().unwrap(); + assert_eq!(vector_options.engine, VectorIndexEngineType::Usearch); + assert_eq!(vector_options.metric, VectorDistanceMetric::Cosine); + assert_eq!(vector_options.connectivity, 32); + assert_eq!(vector_options.expansion_add, 200); + assert_eq!(vector_options.expansion_search, 100); + } + + #[test] + fn test_column_to_schema_with_vector_index_defaults() { + use 
datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType}; + + // Test with default values (empty options map) + let column = Column { + column_def: ColumnDef { + name: "vec".into(), + data_type: SqlDataType::Custom( + vec![Ident::new(VECTOR_TYPE_NAME)].into(), + vec!["64".to_string()], + ), + options: vec![], + }, + extensions: ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::default()), + }, + }; + + let column_schema = column_to_schema(&column, "ts", None).unwrap(); + assert_eq!("vec", column_schema.name); + assert!(column_schema.is_vector_indexed()); + + let vector_options = column_schema.vector_index_options().unwrap().unwrap(); + // Verify defaults + assert_eq!(vector_options.engine, VectorIndexEngineType::Usearch); + assert_eq!(vector_options.metric, VectorDistanceMetric::L2sq); + assert_eq!(vector_options.connectivity, 16); + assert_eq!(vector_options.expansion_add, 128); + assert_eq!(vector_options.expansion_search, 64); + } } diff --git a/src/sql/src/statements/comment.rs b/src/sql/src/statements/comment.rs new file mode 100644 index 0000000000..ec0f8d37b6 --- /dev/null +++ b/src/sql/src/statements/comment.rs @@ -0,0 +1,67 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{self, Display, Formatter}; + +use serde::Serialize; +use sqlparser_derive::{Visit, VisitMut}; + +use crate::ast::{Ident, ObjectName}; + +/// Represents a SQL COMMENT statement for adding or removing comments on database objects. 
+/// +/// # Examples +/// +/// ```sql +/// COMMENT ON TABLE my_table IS 'This is a table comment'; +/// COMMENT ON COLUMN my_table.my_column IS 'This is a column comment'; +/// COMMENT ON FLOW my_flow IS NULL; +/// ``` +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] +pub struct Comment { + pub object: CommentObject, + pub comment: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, Visit, VisitMut, Serialize)] +pub enum CommentObject { + Table(ObjectName), + Column { table: ObjectName, column: Ident }, + Flow(ObjectName), +} + +impl Display for Comment { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "COMMENT ON {} IS ", self.object)?; + match &self.comment { + Some(comment) => { + let escaped = comment.replace('\'', "''"); + write!(f, "'{}'", escaped) + } + None => f.write_str("NULL"), + } + } +} + +impl Display for CommentObject { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + CommentObject::Table(name) => write!(f, "TABLE {}", name), + CommentObject::Column { table, column } => { + write!(f, "COLUMN {}.{}", table, column) + } + CommentObject::Flow(name) => write!(f, "FLOW {}", name), + } + } +} diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 3c7f6d1731..3791effac0 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -17,7 +17,10 @@ use std::fmt::{Display, Formatter}; use common_catalog::consts::FILE_ENGINE; use datatypes::json::JsonStructureSettings; -use datatypes::schema::{FulltextOptions, SkippingIndexOptions}; +use datatypes::schema::{ + FulltextOptions, SkippingIndexOptions, VectorDistanceMetric, VectorIndexEngineType, + VectorIndexOptions, +}; use itertools::Itertools; use serde::Serialize; use snafu::ResultExt; @@ -133,6 +136,8 @@ pub struct ColumnExtensions { /// /// Inverted index doesn't have options at present. There won't be any options in that map. pub inverted_index_options: Option, + /// Vector index options for HNSW-based vector similarity search. 
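Since the Display impl in comment.rs doubles embedded single quotes, a parsed statement can be rendered back to valid SQL. A small sketch (not part of the diff) using only the types defined in that file:

```rust
use crate::ast::{Ident, ObjectName};
use crate::statements::comment::{Comment, CommentObject};

fn render_comment() -> String {
    let stmt = Comment {
        object: CommentObject::Table(ObjectName::from(vec![Ident::new("jobs")])),
        comment: Some("it's nightly".to_string()),
    };
    // Renders as: COMMENT ON TABLE jobs IS 'it''s nightly'
    stmt.to_string()
}
```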
+    pub vector_index_options: Option<OptionMap>,
     pub json_datatype_options: Option<OptionMap>,
 }
@@ -208,6 +213,15 @@ impl Display for Column {
                 write!(f, " INVERTED INDEX")?;
             }
         }
+
+        if let Some(vector_index_options) = &self.extensions.vector_index_options {
+            if !vector_index_options.is_empty() {
+                let options = vector_index_options.kv_pairs();
+                write!(f, " VECTOR INDEX WITH({})", format_list_comma!(options))?;
+            } else {
+                write!(f, " VECTOR INDEX")?;
+            }
+        }
         Ok(())
     }
 }
@@ -233,6 +247,89 @@ impl ColumnExtensions {
         ))
     }
 
+    pub fn build_vector_index_options(&self) -> Result<Option<VectorIndexOptions>> {
+        let Some(options) = self.vector_index_options.as_ref() else {
+            return Ok(None);
+        };
+
+        let options_map: HashMap<String, String> = options.clone().into_map();
+        let mut result = VectorIndexOptions::default();
+
+        if let Some(s) = options_map.get("engine") {
+            result.engine = s.parse::<VectorIndexEngineType>().map_err(|e| {
+                InvalidSqlSnafu {
+                    msg: format!("invalid VECTOR INDEX engine: {e}"),
+                }
+                .build()
+            })?;
+        }
+
+        if let Some(s) = options_map.get("metric") {
+            result.metric = s.parse::<VectorDistanceMetric>().map_err(|e| {
+                InvalidSqlSnafu {
+                    msg: format!("invalid VECTOR INDEX metric: {e}"),
+                }
+                .build()
+            })?;
+        }
+
+        if let Some(s) = options_map.get("connectivity") {
+            let value = s.parse::<u32>().map_err(|_| {
+                InvalidSqlSnafu {
+                    msg: format!(
+                        "invalid VECTOR INDEX connectivity: {s}, expected positive integer"
+                    ),
+                }
+                .build()
+            })?;
+            if !(2..=2048).contains(&value) {
+                return InvalidSqlSnafu {
+                    msg: "VECTOR INDEX connectivity must be in the range [2, 2048].".to_string(),
+                }
+                .fail();
+            }
+            result.connectivity = value;
+        }
+
+        if let Some(s) = options_map.get("expansion_add") {
+            let value = s.parse::<u32>().map_err(|_| {
+                InvalidSqlSnafu {
+                    msg: format!(
+                        "invalid VECTOR INDEX expansion_add: {s}, expected positive integer"
+                    ),
+                }
+                .build()
+            })?;
+            if value == 0 {
+                return InvalidSqlSnafu {
+                    msg: "VECTOR INDEX expansion_add must be greater than 0".to_string(),
+                }
+                .fail();
+            }
+            result.expansion_add = value;
+        }
+
+        if let Some(s) = options_map.get("expansion_search") {
+            let value = s.parse::<u32>().map_err(|_| {
+                InvalidSqlSnafu {
+                    msg: format!(
+                        "invalid VECTOR INDEX expansion_search: {s}, expected positive integer"
+                    ),
+                }
+                .build()
+            })?;
+            if value == 0 {
+                return InvalidSqlSnafu {
+                    msg: "VECTOR INDEX expansion_search must be greater than 0".to_string(),
+                }
+                .fail();
+            }
+            result.expansion_search = value;
+        }
+
+        Ok(Some(result))
+    }
+
     pub fn build_json_structure_settings(&self) -> Result<Option<JsonStructureSettings>> {
         let Some(options) = self.json_datatype_options.as_ref() else {
             return Ok(None);
@@ -893,4 +990,92 @@ AS SELECT number FROM numbers_input where number > 10"#,
             _ => unreachable!(),
         }
     }
+
+    #[test]
+    fn test_vector_index_options_validation() {
+        use super::{ColumnExtensions, OptionMap};
+
+        // Test zero connectivity should fail
+        let extensions = ColumnExtensions {
+            fulltext_index_options: None,
+            vector_options: None,
+            skipping_index_options: None,
+            inverted_index_options: None,
+            json_datatype_options: None,
+            vector_index_options: Some(OptionMap::from([(
+                "connectivity".to_string(),
+                "0".to_string(),
+            )])),
+        };
+        let result = extensions.build_vector_index_options();
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("connectivity must be in the range [2, 2048]")
+        );
+
+        // Test zero expansion_add should fail
+        let extensions = ColumnExtensions {
+            fulltext_index_options: None,
+            vector_options: None,
+            skipping_index_options: None,
+            inverted_index_options: None,
+            json_datatype_options: None,
+            vector_index_options: 
Some(OptionMap::from([( + "expansion_add".to_string(), + "0".to_string(), + )])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("expansion_add must be greater than 0") + ); + + // Test zero expansion_search should fail + let extensions = ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([( + "expansion_search".to_string(), + "0".to_string(), + )])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("expansion_search must be greater than 0") + ); + + // Test valid values should succeed + let extensions = ColumnExtensions { + fulltext_index_options: None, + vector_options: None, + skipping_index_options: None, + inverted_index_options: None, + json_datatype_options: None, + vector_index_options: Some(OptionMap::from([ + ("connectivity".to_string(), "32".to_string()), + ("expansion_add".to_string(), "200".to_string()), + ("expansion_search".to_string(), "100".to_string()), + ])), + }; + let result = extensions.build_vector_index_options(); + assert!(result.is_ok()); + let options = result.unwrap().unwrap(); + assert_eq!(options.connectivity, 32); + assert_eq!(options.expansion_add, 200); + assert_eq!(options.expansion_search, 100); + } } diff --git a/src/sql/src/statements/create/trigger.rs b/src/sql/src/statements/create/trigger.rs index 606c5882d1..a6b76c2211 100644 --- a/src/sql/src/statements/create/trigger.rs +++ b/src/sql/src/statements/create/trigger.rs @@ -37,7 +37,7 @@ impl Display for CreateTrigger { } if let Some(keep_firing_for) = &self.keep_firing_for { - writeln!(f, " KEEP_FIRING_FOR {}", keep_firing_for)?; + writeln!(f, " KEEP FIRING FOR {}", keep_firing_for)?; } if !self.labels.is_empty() { @@ -153,7 +153,7 @@ mod tests { let sql = r#"CREATE TRIGGER IF NOT EXISTS cpu_monitor ON (SELECT host AS host_label, cpu, memory FROM machine_monitor WHERE cpu > 2) EVERY '1day 5 minute'::INTERVAL FOR '5 minute'::INTERVAL -KEEP_FIRING_FOR '10 minute'::INTERVAL +KEEP FIRING FOR '10 minute'::INTERVAL LABELS (label_name=label_val) ANNOTATIONS (annotation_name=annotation_val) NOTIFY @@ -174,7 +174,7 @@ WEBHOOK alert_manager2 URL 'http://127.0.0.1:9093' WITH (timeout='1m') let expected = r#"CREATE TRIGGER IF NOT EXISTS cpu_monitor ON (SELECT host AS host_label, cpu, memory FROM machine_monitor WHERE cpu > 2) EVERY '1day 5 minute'::INTERVAL FOR '5 minute'::INTERVAL - KEEP_FIRING_FOR '10 minute'::INTERVAL + KEEP FIRING FOR '10 minute'::INTERVAL LABELS (label_name = 'label_val') ANNOTATIONS (annotation_name = 'annotation_val') NOTIFY( diff --git a/src/sql/src/statements/statement.rs b/src/sql/src/statements/statement.rs index f723409a6b..bac7e367cd 100644 --- a/src/sql/src/statements/statement.rs +++ b/src/sql/src/statements/statement.rs @@ -22,6 +22,7 @@ use sqlparser_derive::{Visit, VisitMut}; use crate::error::{ConvertToDfStatementSnafu, Error}; use crate::statements::admin::Admin; use crate::statements::alter::{AlterDatabase, AlterTable}; +use crate::statements::comment::Comment; use crate::statements::copy::Copy; use crate::statements::create::{ CreateDatabase, CreateExternalTable, CreateFlow, CreateTable, CreateTableLike, CreateView, @@ -137,6 +138,8 @@ pub enum Statement { SetVariables(SetVariables), // SHOW VARIABLES 
ShowVariables(ShowVariables), + // COMMENT ON + Comment(Comment), // USE Use(String), // Admin statement(extension) @@ -204,6 +207,7 @@ impl Statement { | Statement::Copy(_) | Statement::TruncateTable(_) | Statement::SetVariables(_) + | Statement::Comment(_) | Statement::Use(_) | Statement::DeclareCursor(_) | Statement::CloseCursor(_) @@ -267,6 +271,7 @@ impl Display for Statement { Statement::TruncateTable(s) => s.fmt(f), Statement::SetVariables(s) => s.fmt(f), Statement::ShowVariables(s) => s.fmt(f), + Statement::Comment(s) => s.fmt(f), Statement::ShowCharset(kind) => { write!(f, "SHOW CHARSET {kind}") } diff --git a/src/sql/src/statements/transform/type_alias.rs b/src/sql/src/statements/transform/type_alias.rs index fb61abf27e..f76eb13ba6 100644 --- a/src/sql/src/statements/transform/type_alias.rs +++ b/src/sql/src/statements/transform/type_alias.rs @@ -33,12 +33,21 @@ use crate::statements::{TimezoneInfo, sql_data_type_to_concrete_data_type}; /// - `TimestampMillisecond`, `Timestamp_ms` for `Timestamp(3)`. /// - `TimestampMicrosecond`, `Timestamp_us` for `Timestamp(6)`. /// - `TimestampNanosecond`, `Timestamp_ns` for `Timestamp(9)`. -/// - `INT8` for `tinyint` +/// - TinyText, MediumText, LongText for `Text`. +/// +/// SQL dialect integer type aliases (MySQL & PostgreSQL): +/// - `INT2` for `smallint` +/// - `INT4` for `int` +/// - `INT8` for `bigint` +/// - `FLOAT4` for `float` +/// - `FLOAT8` for `double` +/// +/// Extended type aliases for Arrow types: /// - `INT16` for `smallint` /// - `INT32` for `int` /// - `INT64` for `bigint` /// - And `UINT8`, `UINT16` etc. for `TinyIntUnsigned` etc. -/// - TinyText, MediumText, LongText for `Text`. +/// pub(crate) struct TypeAliasTransformRule; impl TransformRule for TypeAliasTransformRule { @@ -108,7 +117,9 @@ impl TransformRule for TypeAliasTransformRule { } if get_type_by_alias(data_type).is_some() => { // Safety: checked in the match arm. let new_type = get_type_by_alias(data_type).unwrap(); - if let Ok(new_type) = sql_data_type_to_concrete_data_type(&new_type) { + if let Ok(new_type) = + sql_data_type_to_concrete_data_type(&new_type, &Default::default()) + { *expr = Expr::Function(cast_expr_to_arrow_cast_func( (**cast_expr).clone(), new_type.as_arrow_type().to_string(), @@ -123,9 +134,10 @@ impl TransformRule for TypeAliasTransformRule { expr: cast_expr, .. } => { - if let Ok(concrete_type) = - sql_data_type_to_concrete_data_type(&DataType::Timestamp(*precision, *zone)) - { + if let Ok(concrete_type) = sql_data_type_to_concrete_data_type( + &DataType::Timestamp(*precision, *zone), + &Default::default(), + ) { let new_type = concrete_type.as_arrow_type(); *expr = Expr::Function(cast_expr_to_arrow_cast_func( (**cast_expr).clone(), @@ -153,13 +165,12 @@ fn replace_type_alias(data_type: &mut DataType) { // Remember to update `get_data_type_by_alias_name()` if you modify this method. pub(crate) fn get_type_by_alias(data_type: &DataType) -> Option { match data_type { - // The sqlparser latest version contains the Int8 alias for Postgres Bigint. - // Which means 8 bytes in postgres (not 8 bits). 
- // See https://docs.rs/sqlparser/latest/sqlparser/ast/enum.DataType.html#variant.Int8 DataType::Custom(name, tokens) if name.0.len() == 1 && tokens.is_empty() => { get_data_type_by_alias_name(name.0[0].to_string_unquoted().as_str()) } - DataType::Int8(None) => Some(DataType::TinyInt(None)), + DataType::Int2(None) => Some(DataType::SmallInt(None)), + DataType::Int4(None) => Some(DataType::Int(None)), + DataType::Int8(None) => Some(DataType::BigInt(None)), DataType::Int16 => Some(DataType::SmallInt(None)), DataType::Int32 => Some(DataType::Int(None)), DataType::Int64 => Some(DataType::BigInt(None)), @@ -167,6 +178,8 @@ pub(crate) fn get_type_by_alias(data_type: &DataType) -> Option { DataType::UInt16 => Some(DataType::SmallIntUnsigned(None)), DataType::UInt32 => Some(DataType::IntUnsigned(None)), DataType::UInt64 => Some(DataType::BigIntUnsigned(None)), + DataType::Float4 => Some(DataType::Float(None)), + DataType::Float8 => Some(DataType::Double(ExactNumberInfo::None)), DataType::Float32 => Some(DataType::Float(None)), DataType::Float64 => Some(DataType::Double(ExactNumberInfo::None)), DataType::Bool => Some(DataType::Boolean), @@ -199,8 +212,9 @@ pub(crate) fn get_data_type_by_alias_name(name: &str) -> Option { Some(DataType::Timestamp(Some(9), TimezoneInfo::None)) } // Number type alias - // We keep them for backward compatibility. - "INT8" => Some(DataType::TinyInt(None)), + "INT2" => Some(DataType::SmallInt(None)), + "INT4" => Some(DataType::Int(None)), + "INT8" => Some(DataType::BigInt(None)), "INT16" => Some(DataType::SmallInt(None)), "INT32" => Some(DataType::Int(None)), "INT64" => Some(DataType::BigInt(None)), @@ -208,6 +222,8 @@ pub(crate) fn get_data_type_by_alias_name(name: &str) -> Option { "UINT16" => Some(DataType::SmallIntUnsigned(None)), "UINT32" => Some(DataType::IntUnsigned(None)), "UINT64" => Some(DataType::BigIntUnsigned(None)), + "FLOAT4" => Some(DataType::Float(None)), + "FLOAT8" => Some(DataType::Double(ExactNumberInfo::None)), "FLOAT32" => Some(DataType::Float(None)), "FLOAT64" => Some(DataType::Double(ExactNumberInfo::None)), // String type alias @@ -238,14 +254,29 @@ mod tests { get_data_type_by_alias_name("FLOAT64"), Some(DataType::Double(ExactNumberInfo::None)) ); - assert_eq!( get_data_type_by_alias_name("float32"), Some(DataType::Float(None)) ); + assert_eq!( + get_data_type_by_alias_name("float8"), + Some(DataType::Double(ExactNumberInfo::None)) + ); + assert_eq!( + get_data_type_by_alias_name("float4"), + Some(DataType::Float(None)) + ); assert_eq!( get_data_type_by_alias_name("int8"), - Some(DataType::TinyInt(None)) + Some(DataType::BigInt(None)) + ); + assert_eq!( + get_data_type_by_alias_name("int4"), + Some(DataType::Int(None)) + ); + assert_eq!( + get_data_type_by_alias_name("int2"), + Some(DataType::SmallInt(None)) ); assert_eq!( get_data_type_by_alias_name("INT16"), @@ -394,11 +425,15 @@ CREATE TABLE data_types ( tt tinytext, mt mediumtext, lt longtext, - tint int8, + i2 int2, + i4 int4, + i8 int8, sint int16, i int32, bint int64, v varchar, + f4 float4, + f8 float8, f float32, d float64, b boolean, @@ -423,11 +458,15 @@ CREATE TABLE data_types ( tt TINYTEXT, mt MEDIUMTEXT, lt LONGTEXT, - tint TINYINT, + i2 SMALLINT, + i4 INT, + i8 BIGINT, sint SMALLINT, i INT, bint BIGINT, v VARCHAR, + f4 FLOAT, + f8 DOUBLE, f FLOAT, d DOUBLE, b BOOLEAN, diff --git a/src/standalone/src/options.rs b/src/standalone/src/options.rs index abbfcf64e2..20aad773b1 100644 --- a/src/standalone/src/options.rs +++ b/src/standalone/src/options.rs @@ -28,7 +28,6 @@ use 
frontend::service_config::{ use mito2::config::MitoConfig; use query::options::QueryOptions; use serde::{Deserialize, Serialize}; -use servers::export_metrics::ExportMetricsOption; use servers::grpc::GrpcOptions; use servers::http::HttpOptions; @@ -55,7 +54,6 @@ pub struct StandaloneOptions { pub user_provider: Option, /// Options for different store engines. pub region_engine: Vec, - pub export_metrics: ExportMetricsOption, pub tracing: TracingOptions, pub init_regions_in_background: bool, pub init_regions_parallelism: usize, @@ -85,7 +83,6 @@ impl Default for StandaloneOptions { procedure: ProcedureConfig::default(), flow: FlowConfig::default(), logging: LoggingOptions::default(), - export_metrics: ExportMetricsOption::default(), user_provider: None, region_engine: vec![ RegionEngineConfig::Mito(MitoConfig::default()), @@ -134,8 +131,6 @@ impl StandaloneOptions { meta_client: None, logging: cloned_opts.logging, user_provider: cloned_opts.user_provider, - // Handle the export metrics task run by standalone to frontend for execution - export_metrics: cloned_opts.export_metrics, max_in_flight_write_bytes: cloned_opts.max_in_flight_write_bytes, slow_query: cloned_opts.slow_query, ..Default::default() diff --git a/src/store-api/src/manifest/storage.rs b/src/store-api/src/manifest/storage.rs index 3de1d53199..58a3d27fe8 100644 --- a/src/store-api/src/manifest/storage.rs +++ b/src/store-api/src/manifest/storage.rs @@ -55,7 +55,7 @@ pub trait ManifestLogStorage { /// Delete logs in [start, end) and ignore checkpoints. async fn delete(&self, start: ManifestVersion, end: ManifestVersion) - -> Result<(), Self::Error>; + -> Result<(), Self::Error>; /// Save a checkpoint. async fn save_checkpoint( @@ -66,7 +66,7 @@ pub trait ManifestLogStorage { /// Load the latest checkpoint async fn load_last_checkpoint(&self) - -> Result)>, Self::Error>; + -> Result)>, Self::Error>; /// Delete the checkpoint by version async fn delete_checkpoint(&self, version: ManifestVersion) -> Result<(), Self::Error>; diff --git a/src/store-api/src/metadata.rs b/src/store-api/src/metadata.rs index 4c4b2f3fb9..5cb00004ef 100644 --- a/src/store-api/src/metadata.rs +++ b/src/store-api/src/metadata.rs @@ -640,6 +640,18 @@ impl RegionMetadataBuilder { /// Consumes the builder and build a [RegionMetadata]. pub fn build(self) -> Result { + self.build_with_options(true) + } + + /// Builds metadata without running validation. + /// + /// Intended for file/external engines that should accept arbitrary schemas + /// coming from files. 
+ pub fn build_without_validation(self) -> Result { + self.build_with_options(false) + } + + fn build_with_options(self, validate: bool) -> Result { let skipped = SkippedFields::new(&self.column_metadatas)?; let meta = RegionMetadata { @@ -654,7 +666,9 @@ impl RegionMetadataBuilder { partition_expr: self.partition_expr, }; - meta.validate()?; + if validate { + meta.validate()?; + } Ok(meta) } @@ -1929,6 +1943,96 @@ mod test { ); } + #[test] + fn test_allow_internal_column_name() { + let mut builder = create_builder(); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "__primary_key", + ConcreteDataType::string_datatype(), + false, + ), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 2, + }) + .primary_key(vec![1]); + + let metadata = builder.build_without_validation().unwrap(); + assert_eq!( + "__primary_key", + metadata.column_metadatas[0].column_schema.name + ); + } + + #[test] + fn test_build_without_validation() { + // Primary key points to a Field column, which would normally fail validation. + let mut builder = create_builder(); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .primary_key(vec![2]); + + // Unvalidated build should succeed. + let metadata = builder.build_without_validation().unwrap(); + assert_eq!(vec![2], metadata.primary_key); + + // Validated build still rejects it. + let mut builder = create_builder(); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "ts", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "field", + ConcreteDataType::string_datatype(), + true, + ), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .primary_key(vec![2]); + let err = builder.build().unwrap_err(); + assert!( + err.to_string() + .contains("semantic type of column field should be Tag"), + "unexpected err: {err}" + ); + } + #[test] fn test_debug_for_column_metadata() { let region_metadata = build_test_region_metadata(); diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index 1a19f68551..dd7809a0bf 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -37,7 +37,7 @@ use crate::metadata::RegionMetadataRef; use crate::region_request::{ BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest, }; -use crate::storage::{RegionId, ScanRequest, SequenceNumber}; +use crate::storage::{FileId, RegionId, ScanRequest, SequenceNumber}; /// The settable region role state. #[derive(Debug, PartialEq, Eq, Clone, Copy)] @@ -204,7 +204,7 @@ impl From for GrantedRegion { /// The role of the region. 
/// TODO(weny): rename it to `RegionRoleState` -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum RegionRole { // Readonly region(mito2) Follower, @@ -421,6 +421,8 @@ pub struct QueryScanContext { /// The scanner splits the region into partitions so that each partition can be scanned concurrently. /// You can use this trait to implement an [`ExecutionPlan`](datafusion_physical_plan::ExecutionPlan). pub trait RegionScanner: Debug + DisplayAs + Send { + fn name(&self) -> &str; + /// Returns the properties of the scanner. fn properties(&self) -> &ScannerProperties; @@ -497,6 +499,8 @@ pub enum RegionManifestInfo { Mito { manifest_version: u64, flushed_entry_id: u64, + /// Number of files removed in the manifest's `removed_files` field. + file_removed_cnt: u64, }, Metric { data_manifest_version: u64, @@ -508,10 +512,11 @@ pub enum RegionManifestInfo { impl RegionManifestInfo { /// Creates a new [RegionManifestInfo] for mito2 engine. - pub fn mito(manifest_version: u64, flushed_entry_id: u64) -> Self { + pub fn mito(manifest_version: u64, flushed_entry_id: u64, file_removal_rate: u64) -> Self { Self::Mito { manifest_version, flushed_entry_id, + file_removed_cnt: file_removal_rate, } } @@ -604,6 +609,7 @@ impl Default for RegionManifestInfo { Self::Mito { manifest_version: 0, flushed_entry_id: 0, + file_removed_cnt: 0, } } } @@ -687,6 +693,72 @@ impl SyncManifestResponse { } } +/// Request to remap manifests from old regions to new regions. +#[derive(Debug, Clone)] +pub struct RemapManifestsRequest { + /// The [`RegionId`] of a staging region used to obtain table directory and storage configuration for the remap operation. + pub region_id: RegionId, + /// Regions to remap manifests from. + pub input_regions: Vec, + /// For each old region, which new regions should receive its files + pub region_mapping: HashMap>, + /// New partition expressions for the new regions. + pub new_partition_exprs: HashMap, +} + +/// Response to remap manifests from old regions to new regions. +#[derive(Debug, Clone)] +pub struct RemapManifestsResponse { + /// The new manifests for the new regions. + pub new_manifests: HashMap, +} + +/// Request to copy files from a source region to a target region. +#[derive(Debug, Clone)] +pub struct CopyRegionFromRequest { + /// The [`RegionId`] of the source region. + pub source_region_id: RegionId, + /// The parallelism of the copy operation. + pub parallelism: usize, +} + +#[derive(Debug, Clone)] +pub struct MitoCopyRegionFromResponse { + /// The file ids that were copied from the source region to the target region. + pub copied_file_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct MetricCopyRegionFromResponse { + /// The logical regions that were newly opened after the copy operation. + pub new_opened_logical_region_ids: Vec, +} + +/// Response to copy region from a source region to a target region. +#[derive(Debug, Clone)] +pub enum CopyRegionFromResponse { + Mito(MitoCopyRegionFromResponse), + Metric(MetricCopyRegionFromResponse), +} + +impl CopyRegionFromResponse { + /// Converts the response to a mito2 response. + pub fn into_mito(self) -> Option { + match self { + CopyRegionFromResponse::Mito(response) => Some(response), + CopyRegionFromResponse::Metric(_) => None, + } + } + + /// Converts the response to a metric response. 
+ pub fn into_metric(self) -> Option { + match self { + CopyRegionFromResponse::Metric(response) => Some(response), + CopyRegionFromResponse::Mito(_) => None, + } + } +} + #[async_trait] pub trait RegionEngine: Send + Sync { /// Name of this engine @@ -811,6 +883,19 @@ pub trait RegionEngine: Send + Sync { manifest_info: RegionManifestInfo, ) -> Result; + /// Remaps manifests from old regions to new regions. + async fn remap_manifests( + &self, + request: RemapManifestsRequest, + ) -> Result; + + /// Copies region from a source region to a target region. + async fn copy_region_from( + &self, + region_id: RegionId, + request: CopyRegionFromRequest, + ) -> Result; + /// Sets region role state gracefully. /// /// After the call returns, the engine ensures no more write operations will succeed in the region. @@ -862,6 +947,10 @@ impl Debug for SinglePartitionScanner { } impl RegionScanner for SinglePartitionScanner { + fn name(&self) -> &str { + "SinglePartition" + } + fn properties(&self) -> &ScannerProperties { &self.properties } diff --git a/src/store-api/src/region_request.rs b/src/store-api/src/region_request.rs index 7fb4b7fda0..b582db0a95 100644 --- a/src/store-api/src/region_request.rs +++ b/src/store-api/src/region_request.rs @@ -22,9 +22,10 @@ use api::v1::column_def::{ }; use api::v1::region::bulk_insert_request::Body; use api::v1::region::{ - AlterRequest, AlterRequests, BulkInsertRequest, CloseRequest, CompactRequest, CreateRequest, - CreateRequests, DeleteRequests, DropRequest, DropRequests, FlushRequest, InsertRequests, - OpenRequest, TruncateRequest, alter_request, compact_request, region_request, truncate_request, + AlterRequest, AlterRequests, BuildIndexRequest, BulkInsertRequest, CloseRequest, + CompactRequest, CreateRequest, CreateRequests, DeleteRequests, DropRequest, DropRequests, + FlushRequest, InsertRequests, OpenRequest, TruncateRequest, alter_request, compact_request, + region_request, truncate_request, }; use api::v1::{ self, Analyzer, ArrowIpc, FulltextBackend as PbFulltextBackend, Option as PbOption, Rows, @@ -150,6 +151,7 @@ pub enum RegionRequest { Truncate(RegionTruncateRequest), Catchup(RegionCatchupRequest), BulkInserts(RegionBulkInsertsRequest), + EnterStaging(EnterStagingRequest), } impl RegionRequest { @@ -166,6 +168,7 @@ impl RegionRequest { region_request::Body::Alter(alter) => make_region_alter(alter), region_request::Body::Flush(flush) => make_region_flush(flush), region_request::Body::Compact(compact) => make_region_compact(compact), + region_request::Body::BuildIndex(index) => make_region_build_index(index), region_request::Body::Truncate(truncate) => make_region_truncate(truncate), region_request::Body::Creates(creates) => make_region_creates(creates), region_request::Body::Drops(drops) => make_region_drops(drops), @@ -354,6 +357,14 @@ fn make_region_compact(compact: CompactRequest) -> Result Result> { + let region_id = index.region_id.into(); + Ok(vec![( + region_id, + RegionRequest::BuildIndex(RegionBuildIndexRequest {}), + )]) +} + fn make_region_truncate(truncate: TruncateRequest) -> Result> { let region_id = truncate.region_id.into(); match truncate.kind { @@ -1406,6 +1417,17 @@ impl RegionBulkInsertsRequest { } } +/// Request to stage a region with a new region rule(partition expression). +/// +/// This request transitions a region into the staging mode. +/// It first flushes the memtable for the old region rule if it is not empty, +/// then enters the staging mode with the new region rule. 
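To make the new engine hooks concrete, here is a hedged sketch (not part of the diff) of driving `copy_region_from` from the caller side and unpacking the engine-specific response. It assumes the trait's usual `BoxedError` error type; the region ids and the parallelism value are placeholders.

```rust
use store_api::region_engine::{CopyRegionFromRequest, RegionEngine};
use store_api::storage::RegionId;

async fn copy_ssts(engine: &dyn RegionEngine, target: RegionId, source: RegionId) {
    let request = CopyRegionFromRequest {
        source_region_id: source,
        parallelism: 4,
    };
    match engine.copy_region_from(target, request).await {
        // Mito regions report which SST files were copied; metric regions instead report
        // the logical regions that became visible after the copy.
        Ok(response) => match response.into_mito() {
            Some(mito) => println!("copied {} files", mito.copied_file_ids.len()),
            None => println!("metric engine response"),
        },
        Err(e) => eprintln!("copy failed: {e}"),
    }
}
```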
+#[derive(Debug, Clone)]
+pub struct EnterStagingRequest {
+    /// The partition expression of the staging region.
+    pub partition_expr: String,
+}
+
 impl fmt::Display for RegionRequest {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
@@ -1422,6 +1444,7 @@ impl fmt::Display for RegionRequest {
             RegionRequest::Truncate(_) => write!(f, "Truncate"),
             RegionRequest::Catchup(_) => write!(f, "Catchup"),
             RegionRequest::BulkInserts(_) => write!(f, "BulkInserts"),
+            RegionRequest::EnterStaging(_) => write!(f, "EnterStaging"),
         }
     }
 }
diff --git a/src/store-api/src/sst_entry.rs b/src/store-api/src/sst_entry.rs
index d71e5f0cdc..832bfc1155 100644
--- a/src/store-api/src/sst_entry.rs
+++ b/src/store-api/src/sst_entry.rs
@@ -47,8 +47,8 @@ pub struct ManifestSstEntry {
     pub region_sequence: RegionSeq,
     /// Engine-specific file identifier (string form).
     pub file_id: String,
-    /// Engine-specific index file identifier (string form).
-    pub index_file_id: Option<String>,
+    /// Index version, incremented when the index file is rebuilt.
+    pub index_version: u64,
     /// SST level.
     pub level: u8,
     /// Full path of the SST file in object store.
@@ -91,7 +91,7 @@ impl ManifestSstEntry {
             ColumnSchema::new("region_group", Ty::uint8_datatype(), false),
             ColumnSchema::new("region_sequence", Ty::uint32_datatype(), false),
             ColumnSchema::new("file_id", Ty::string_datatype(), false),
-            ColumnSchema::new("index_file_id", Ty::string_datatype(), true),
+            ColumnSchema::new("index_version", Ty::uint64_datatype(), false),
             ColumnSchema::new("level", Ty::uint8_datatype(), false),
             ColumnSchema::new("file_path", Ty::string_datatype(), false),
             ColumnSchema::new("file_size", Ty::uint64_datatype(), false),
@@ -119,7 +119,7 @@ impl ManifestSstEntry {
         let region_groups = entries.iter().map(|e| e.region_group);
         let region_sequences = entries.iter().map(|e| e.region_sequence);
         let file_ids = entries.iter().map(|e| e.file_id.as_str());
-        let index_file_ids = entries.iter().map(|e| e.index_file_id.as_ref());
+        let index_versions = entries.iter().map(|e| e.index_version);
         let levels = entries.iter().map(|e| e.level);
         let file_paths = entries.iter().map(|e| e.file_path.as_str());
         let file_sizes = entries.iter().map(|e| e.file_size);
@@ -151,7 +151,7 @@ impl ManifestSstEntry {
             Arc::new(UInt8Array::from_iter_values(region_groups)),
             Arc::new(UInt32Array::from_iter_values(region_sequences)),
             Arc::new(StringArray::from_iter_values(file_ids)),
-            Arc::new(StringArray::from_iter(index_file_ids)),
+            Arc::new(UInt64Array::from_iter(index_versions)),
             Arc::new(UInt8Array::from_iter_values(levels)),
             Arc::new(StringArray::from_iter_values(file_paths)),
             Arc::new(UInt64Array::from_iter_values(file_sizes)),
@@ -437,7 +437,7 @@ mod tests {
             region_group: region_group1,
             region_sequence: region_seq1,
             file_id: "f1".to_string(),
-            index_file_id: None,
+            index_version: 0,
             level: 1,
             file_path: "/p1".to_string(),
             file_size: 100,
@@ -461,7 +461,7 @@ mod tests {
             region_group: region_group2,
             region_sequence: region_seq2,
             file_id: "f2".to_string(),
-            index_file_id: Some("idx".to_string()),
+            index_version: 1,
             level: 3,
             file_path: "/p2".to_string(),
             file_size: 200,
@@ -548,13 +548,13 @@
         assert_eq!("f1", file_ids.value(0));
         assert_eq!("f2", file_ids.value(1));

-        let index_file_ids = batch
+        let index_versions = batch
             .column(7)
             .as_any()
-            .downcast_ref::<StringArray>()
+            .downcast_ref::<UInt64Array>()
             .unwrap();
-        assert!(index_file_ids.is_null(0));
-        assert_eq!("idx", index_file_ids.value(1));
+        assert_eq!(0, index_versions.value(0));
+        assert_eq!(1, index_versions.value(1));

         let levels = batch
             .column(8)
diff --git a/src/store-api/src/storage.rs b/src/store-api/src/storage.rs
index 2cafaf027c..b97fe0b3ad 100644
--- a/src/store-api/src/storage.rs
+++ b/src/store-api/src/storage.rs
@@ -26,6 +26,9 @@ pub use datatypes::schema::{
 };

 pub use self::descriptors::*;
-pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, ParseIdError};
-pub use self::requests::{ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector};
+pub use self::file::{FileId, FileRef, FileRefsManifest, GcReport, IndexVersion, ParseIdError};
+pub use self::requests::{
+    ScanRequest, TimeSeriesDistribution, TimeSeriesRowSelector, VectorDistanceMetric,
+    VectorIndexEngine, VectorIndexEngineType, VectorSearchMatches, VectorSearchRequest,
+};
 pub use self::types::{SequenceNumber, SequenceRange};
diff --git a/src/store-api/src/storage/file.rs b/src/store-api/src/storage/file.rs
index a028ec0401..bb7490ccf5 100644
--- a/src/store-api/src/storage/file.rs
+++ b/src/store-api/src/storage/file.rs
@@ -24,6 +24,9 @@ use uuid::Uuid;
 use crate::ManifestVersion;
 use crate::storage::RegionId;

+/// Index version
+pub type IndexVersion = u64;
+
 #[derive(Debug, Snafu, PartialEq)]
 pub struct ParseIdError {
     source: uuid::Error,
@@ -121,6 +124,9 @@ impl GcReport {
             *self_files = dedup.into_iter().collect();
         }
         self.need_retry_regions.extend(other.need_retry_regions);
+        // Remove regions that have succeeded from need_retry_regions
+        self.need_retry_regions
+            .retain(|region| !self.deleted_files.contains_key(region));
     }
 }
diff --git a/src/store-api/src/storage/requests.rs b/src/store-api/src/storage/requests.rs
index 5e9fae3215..e538127e73 100644
--- a/src/store-api/src/storage/requests.rs
+++ b/src/store-api/src/storage/requests.rs
@@ -14,11 +14,66 @@

 use std::fmt::{Display, Formatter};

+use common_error::ext::BoxedError;
 use common_recordbatch::OrderOption;
 use datafusion_expr::expr::Expr;
+// Re-export vector types from datatypes to avoid duplication
+pub use datatypes::schema::{VectorDistanceMetric, VectorIndexEngineType};
 use strum::Display;

-use crate::storage::SequenceNumber;
+use crate::storage::{ColumnId, SequenceNumber};
+
+/// A hint for KNN vector search.
+#[derive(Debug, Clone, PartialEq)]
+pub struct VectorSearchRequest {
+    /// Column ID of the vector column to search.
+    pub column_id: ColumnId,
+    /// The query vector to search for.
+    pub query_vector: Vec<f32>,
+    /// Number of nearest neighbors to return.
+    pub k: usize,
+    /// Distance metric to use (matches the index metric).
+    pub metric: VectorDistanceMetric,
+}
+
+/// Search results from vector index.
+#[derive(Debug, Clone, PartialEq)]
+pub struct VectorSearchMatches {
+    /// Keys (row offsets in the index).
+    pub keys: Vec<u64>,
+    /// Distances from the query vector.
+    pub distances: Vec<f32>,
+}
+
+/// Trait for vector index engines (HNSW implementations).
+///
+/// This trait defines the interface for pluggable vector index engines.
+/// Implementations (e.g., UsearchEngine) are provided by storage engines like mito2.
+pub trait VectorIndexEngine: Send + Sync {
+    /// Adds a vector with the given key.
+    fn add(&mut self, key: u64, vector: &[f32]) -> Result<(), BoxedError>;
+
+    /// Searches for k nearest neighbors.
+    fn search(&self, query: &[f32], k: usize) -> Result<VectorSearchMatches, BoxedError>;
+
+    /// Returns the serialized length.
+    fn serialized_length(&self) -> usize;
+
+    /// Serializes the index to a buffer.
+    fn save_to_buffer(&self, buffer: &mut [u8]) -> Result<(), BoxedError>;
+
+    /// Reserves capacity for vectors.
+    fn reserve(&mut self, capacity: usize) -> Result<(), BoxedError>;
+
+    /// Returns current size (number of vectors).
+    fn size(&self) -> usize;
+
+    /// Returns current capacity.
+    fn capacity(&self) -> usize;
+
+    /// Returns memory usage in bytes.
+    fn memory_usage(&self) -> usize;
+}

 /// A hint on how to select rows from a time-series.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Display)]
@@ -38,7 +93,7 @@ pub enum TimeSeriesDistribution {
     PerSeries,
 }

-#[derive(Default, Clone, Debug, PartialEq, Eq)]
+#[derive(Default, Clone, Debug, PartialEq)]
 pub struct ScanRequest {
     /// Indices of columns to read, `None` to read all columns. This indices is
     /// based on table schema.
@@ -66,6 +121,9 @@ pub struct ScanRequest {
     pub sst_min_sequence: Option<SequenceNumber>,
     /// Optional hint for the distribution of time-series data.
     pub distribution: Option<TimeSeriesDistribution>,
+    /// Optional hint for KNN vector search. When set, the scan should use
+    /// vector index to find the k nearest neighbors.
+    pub vector_search: Option<VectorSearchRequest>,
 }

 impl Display for ScanRequest {
@@ -138,6 +196,16 @@ impl Display for ScanRequest {
         if let Some(distribution) = &self.distribution {
             write!(f, "{}distribution: {}", delimiter.as_str(), distribution)?;
         }
+        if let Some(vector_search) = &self.vector_search {
+            write!(
+                f,
+                "{}vector_search: column_id={}, k={}, metric={}",
+                delimiter.as_str(),
+                vector_search.column_id,
+                vector_search.k,
+                vector_search.metric
+            )?;
+        }
         write!(f, " }}")
     }
 }
diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs
index 6cdf945480..3b4058e083 100644
--- a/src/table/src/requests.rs
+++ b/src/table/src/requests.rs
@@ -395,6 +395,13 @@ pub struct FlushTableRequest {
     pub table_name: String,
 }

+#[derive(Debug, Clone, Default)]
+pub struct BuildIndexTableRequest {
+    pub catalog_name: String,
+    pub schema_name: String,
+    pub table_name: String,
+}
+
 #[derive(Debug, Clone, PartialEq)]
 pub struct CompactTableRequest {
     pub catalog_name: String,
diff --git a/src/table/src/table.rs b/src/table/src/table.rs
index ce36544b73..0ae7d580d8 100644
--- a/src/table/src/table.rs
+++ b/src/table/src/table.rs
@@ -118,6 +118,10 @@ impl Table {
         self.table_info.meta.schema.clone()
     }

+    pub fn schema_ref(&self) -> &SchemaRef {
+        &self.table_info.meta.schema
+    }
+
     /// Get a reference to the table info.
     pub fn table_info(&self) -> TableInfoRef {
         self.table_info.clone()
diff --git a/src/table/src/table/numbers.rs b/src/table/src/table/numbers.rs
index 9b1ed125c0..eb67dcfaef 100644
--- a/src/table/src/table/numbers.rs
+++ b/src/table/src/table/numbers.rs
@@ -166,9 +166,9 @@ impl Stream for NumbersStream {
             batch = batch.project(projection).unwrap();
         }

-        Poll::Ready(Some(RecordBatch::try_from_df_record_batch(
+        Poll::Ready(Some(Ok(RecordBatch::from_df_record_batch(
             self.projected_schema.clone(),
             batch,
-        )))
+        ))))
     }
 }
diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs
index c67f5431e8..5db824e995 100644
--- a/src/table/src/table/scan.rs
+++ b/src/table/src/table/scan.rs
@@ -223,6 +223,11 @@ impl RegionScanExec {
         self.is_partition_set
     }

+    pub fn scanner_type(&self) -> String {
+        let scanner = self.scanner.lock().unwrap();
+        scanner.name().to_string()
+    }
+
     /// Update the partition ranges of underlying scanner.
pub fn with_new_partitions( &self, @@ -444,14 +449,10 @@ impl Stream for StreamWithMetricWrapper { } match result { Ok(record_batch) => { - let batch_mem_size = record_batch - .columns() - .iter() - .map(|vec_ref| vec_ref.memory_size()) - .sum::(); // we don't record elapsed time here // since it's calling storage api involving I/O ops - this.metric.record_mem_usage(batch_mem_size); + this.metric + .record_mem_usage(record_batch.buffer_memory_size()); this.metric.record_output(record_batch.num_rows()); Poll::Ready(Some(Ok(record_batch.into_df_record_batch()))) } diff --git a/src/table/src/test_util/memtable.rs b/src/table/src/test_util/memtable.rs index 224c9e100f..812b5edaf2 100644 --- a/src/table/src/test_util/memtable.rs +++ b/src/table/src/test_util/memtable.rs @@ -29,7 +29,7 @@ use snafu::prelude::*; use store_api::data_source::DataSource; use store_api::storage::{RegionNumber, ScanRequest}; -use crate::error::{SchemaConversionSnafu, TableProjectionSnafu, TablesRecordBatchSnafu}; +use crate::error::{SchemaConversionSnafu, TableProjectionSnafu}; use crate::metadata::{ FilterPushDownType, TableId, TableInfoBuilder, TableMetaBuilder, TableType, TableVersion, }; @@ -146,17 +146,14 @@ impl DataSource for MemtableDataSource { }; let df_recordbatch = df_recordbatch.slice(0, limit); - let recordbatch = RecordBatch::try_from_df_record_batch( + let recordbatch = RecordBatch::from_df_record_batch( Arc::new( Schema::try_from(df_recordbatch.schema()) .context(SchemaConversionSnafu) .map_err(BoxedError::new)?, ), df_recordbatch, - ) - .map_err(BoxedError::new) - .context(TablesRecordBatchSnafu) - .map_err(BoxedError::new)?; + ); Ok(Box::pin(MemtableStream { schema: recordbatch.schema.clone(), diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index 5721f12af9..0dbfededb3 100644 --- a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -54,6 +54,7 @@ log-query = { workspace = true } loki-proto.workspace = true meta-client.workspace = true meta-srv = { workspace = true, features = ["mock"] } +mito2.workspace = true moka.workspace = true mysql_async = { version = "0.35", default-features = false, features = [ "time", diff --git a/tests-integration/src/cluster.rs b/tests-integration/src/cluster.rs index 19c2ce4134..282ab110e7 100644 --- a/tests-integration/src/cluster.rs +++ b/tests-integration/src/cluster.rs @@ -30,13 +30,11 @@ use catalog::kvbackend::{CachedKvBackendBuilder, KvBackendCatalogManagerBuilder, use catalog::process_manager::ProcessManager; use client::Client; use client::client_manager::NodeClients; +use cmd::frontend::create_heartbeat_task; use common_base::Plugins; use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; use common_meta::DatanodeId; use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; -use common_meta::heartbeat::handler::HandlerGroupExecutor; -use common_meta::heartbeat::handler::invalidate_table_cache::InvalidateCacheHandler; -use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; use common_meta::kv_backend::KvBackendRef; use common_meta::kv_backend::chroot::ChrootKvBackend; use common_meta::kv_backend::etcd::EtcdStore; @@ -44,13 +42,12 @@ use common_meta::kv_backend::memory::MemoryKvBackend; use common_meta::peer::Peer; use common_runtime::Builder as RuntimeBuilder; use common_runtime::runtime::BuilderBuild; -use common_stat::ResourceStatImpl; use common_test_util::temp_dir::create_temp_dir; +use common_time::util::DefaultSystemTimer; use 
common_wal::config::{DatanodeWalConfig, MetasrvWalConfig}; use datanode::config::DatanodeOptions; use datanode::datanode::{Datanode, DatanodeBuilder, ProcedureConfig}; use frontend::frontend::{Frontend, FrontendOptions}; -use frontend::heartbeat::HeartbeatTask; use frontend::instance::Instance as FeInstance; use frontend::instance::builder::FrontendBuilder; use frontend::server::Services; @@ -58,14 +55,15 @@ use hyper_util::rt::TokioIo; use meta_client::client::MetaClientBuilder; use meta_srv::cluster::MetaPeerClientRef; use meta_srv::discovery; +use meta_srv::gc::GcSchedulerOptions; use meta_srv::metasrv::{Metasrv, MetasrvOptions, SelectorRef}; use meta_srv::mocks::MockInfo; +use mito2::gc::GcConfig; use object_store::config::ObjectStoreConfig; use rand::Rng; use servers::grpc::GrpcOptions; use servers::grpc::flight::FlightCraftWrapper; use servers::grpc::region_server::RegionServerRequestHandler; -use servers::heartbeat_options::HeartbeatOptions; use servers::server::ServerHandlers; use tempfile::TempDir; use tonic::codec::CompressionEncoding; @@ -102,6 +100,8 @@ pub struct GreptimeDbClusterBuilder { datanodes: Option, datanode_wal_config: DatanodeWalConfig, metasrv_wal_config: MetasrvWalConfig, + datanode_gc_config: GcConfig, + metasrv_gc_config: GcSchedulerOptions, shared_home_dir: Option>, meta_selector: Option, } @@ -133,6 +133,8 @@ impl GreptimeDbClusterBuilder { datanodes: None, datanode_wal_config: DatanodeWalConfig::default(), metasrv_wal_config: MetasrvWalConfig::default(), + datanode_gc_config: GcConfig::default(), + metasrv_gc_config: GcSchedulerOptions::default(), shared_home_dir: None, meta_selector: None, } @@ -168,6 +170,17 @@ impl GreptimeDbClusterBuilder { self } + #[must_use] + pub fn with_datanode_gc_config(mut self, datanode_gc_config: GcConfig) -> Self { + self.datanode_gc_config = datanode_gc_config; + self + } + + pub fn with_metasrv_gc_config(mut self, metasrv_gc_config: GcSchedulerOptions) -> Self { + self.metasrv_gc_config = metasrv_gc_config; + self + } + #[must_use] pub fn with_shared_home_dir(mut self, shared_home_dir: Arc) -> Self { self.shared_home_dir = Some(shared_home_dir); @@ -204,6 +217,7 @@ impl GreptimeDbClusterBuilder { server_addr: "127.0.0.1:3002".to_string(), ..Default::default() }, + gc: self.metasrv_gc_config.clone(), ..Default::default() }; @@ -278,6 +292,7 @@ impl GreptimeDbClusterBuilder { vec![], home_dir, self.datanode_wal_config.clone(), + self.datanode_gc_config.clone(), ) } else { let (opts, guard) = create_tmp_dir_and_datanode_opts( @@ -285,6 +300,7 @@ impl GreptimeDbClusterBuilder { self.store_providers.clone().unwrap_or_default(), &format!("{}-dn-{}", self.cluster_name, datanode_id), self.datanode_wal_config.clone(), + self.datanode_gc_config.clone(), ); guards.push(guard); @@ -319,6 +335,7 @@ impl GreptimeDbClusterBuilder { ) { for _ in 0..100 { let alive_datanodes = discovery::utils::alive_datanodes( + &DefaultSystemTimer, meta_peer_client.as_ref(), Duration::from_secs(u64::MAX), None, @@ -405,31 +422,15 @@ impl GreptimeDbClusterBuilder { ) .build(); - let handlers_executor = HandlerGroupExecutor::new(vec![ - Arc::new(ParseMailboxMessageHandler), - Arc::new(InvalidateCacheHandler::new(cache_registry.clone())), - ]); - let fe_opts = self.build_frontend_options(); - let mut resource_stat = ResourceStatImpl::default(); - resource_stat.start_collect_cpu_usage(); - - let heartbeat_task = HeartbeatTask::new( - &fe_opts, - meta_client.clone(), - HeartbeatOptions::default(), - Arc::new(handlers_executor), - Arc::new(resource_stat), - ); - 
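// The frontend heartbeat task is now created via cmd::frontend::create_heartbeat_task, using the
// frontend options, the meta client, and the instance produced by the FrontendBuilder below.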
let instance = FrontendBuilder::new( fe_opts.clone(), cached_meta_backend.clone(), cache_registry.clone(), catalog_manager, datanode_clients, - meta_client, + meta_client.clone(), Arc::new(ProcessManager::new(fe_opts.grpc.server_addr.clone(), None)), ) .with_local_cache_invalidator(cache_registry) @@ -437,6 +438,8 @@ impl GreptimeDbClusterBuilder { .await .unwrap(); + let heartbeat_task = create_heartbeat_task(&fe_opts, meta_client, &instance); + let instance = Arc::new(instance); // Build the servers for the frontend. @@ -452,7 +455,6 @@ impl GreptimeDbClusterBuilder { instance, servers, heartbeat_task: Some(heartbeat_task), - export_metrics_task: None, } } diff --git a/tests-integration/src/grpc.rs b/tests-integration/src/grpc.rs index b5a130137c..8d4dc9c3ce 100644 --- a/tests-integration/src/grpc.rs +++ b/tests-integration/src/grpc.rs @@ -1256,6 +1256,7 @@ CREATE TABLE {table_name} ( | | | | | ENGINE=mito | | | WITH( | +| | 'comment' = 'Created on insertion', | | | 'compaction.twcs.time_window' = '1d', | | | 'compaction.type' = 'twcs' | | | ) | diff --git a/tests-integration/src/standalone.rs b/tests-integration/src/standalone.rs index 5d7fdfa8ed..b43c000189 100644 --- a/tests-integration/src/standalone.rs +++ b/tests-integration/src/standalone.rs @@ -287,7 +287,6 @@ impl GreptimeDbStandaloneBuilder { instance, servers: ServerHandlers::default(), heartbeat_task: None, - export_metrics_task: None, }; frontend.start().await.unwrap(); @@ -310,6 +309,7 @@ impl GreptimeDbStandaloneBuilder { store_types, &self.instance_name, self.datanode_wal_config.clone(), + Default::default(), ); let kv_backend_config = KvBackendConfig::default(); diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index e667bf7626..5a0619c1cc 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -32,6 +32,7 @@ use common_wal::config::DatanodeWalConfig; use datanode::config::{DatanodeOptions, StorageConfig}; use frontend::instance::Instance; use frontend::service_config::{MysqlOptions, PostgresOptions}; +use mito2::gc::GcConfig; use object_store::config::{ AzblobConfig, FileConfig, GcsConfig, ObjectStoreConfig, OssConfig, S3Config, }; @@ -145,6 +146,7 @@ fn s3_test_config() -> S3Config { secret_access_key: env::var("GT_S3_ACCESS_KEY").unwrap().into(), bucket: env::var("GT_S3_BUCKET").unwrap(), region: Some(env::var("GT_S3_REGION").unwrap()), + endpoint: env::var("GT_S3_ENDPOINT_URL").ok(), ..Default::default() }, ..Default::default() @@ -163,7 +165,7 @@ pub fn get_test_store_config(store_type: &StorageType) -> (ObjectStoreConfig, Te scope: env::var("GT_GCS_SCOPE").unwrap(), credential_path: env::var("GT_GCS_CREDENTIAL_PATH").unwrap().into(), credential: env::var("GT_GCS_CREDENTIAL").unwrap().into(), - endpoint: env::var("GT_GCS_ENDPOINT").unwrap(), + endpoint: env::var("GT_GCS_ENDPOINT").unwrap_or_default(), }, ..Default::default() }; @@ -297,6 +299,7 @@ pub fn create_tmp_dir_and_datanode_opts( store_provider_types: Vec, name: &str, wal_config: DatanodeWalConfig, + gc_config: GcConfig, ) -> (DatanodeOptions, TestGuard) { let home_tmp_dir = create_temp_dir(&format!("gt_data_{name}")); let home_dir = home_tmp_dir.path().to_str().unwrap().to_string(); @@ -314,7 +317,13 @@ pub fn create_tmp_dir_and_datanode_opts( store_providers.push(store); storage_guards.push(StorageGuard(data_tmp_dir)) } - let opts = create_datanode_opts(default_store, store_providers, home_dir, wal_config); + let opts = create_datanode_opts( + default_store, + store_providers, + 
home_dir, + wal_config, + gc_config, + ); ( opts, @@ -330,7 +339,18 @@ pub(crate) fn create_datanode_opts( providers: Vec, home_dir: String, wal_config: DatanodeWalConfig, + gc_config: GcConfig, ) -> DatanodeOptions { + let region_engine = DatanodeOptions::default() + .region_engine + .into_iter() + .map(|mut v| { + if let datanode::config::RegionEngineConfig::Mito(mito_config) = &mut v { + mito_config.gc = gc_config.clone(); + } + v + }) + .collect(); DatanodeOptions { node_id: Some(0), require_lease_before_startup: true, @@ -343,6 +363,7 @@ pub(crate) fn create_datanode_opts( .with_bind_addr(PEER_PLACEHOLDER_ADDR) .with_server_addr(PEER_PLACEHOLDER_ADDR), wal: wal_config, + region_engine, ..Default::default() } } diff --git a/tests-integration/src/tests.rs b/tests-integration/src/tests.rs index db5c00efff..74d713d3ee 100644 --- a/tests-integration/src/tests.rs +++ b/tests-integration/src/tests.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod gc; mod instance_kafka_wal_test; mod instance_noop_wal_test; mod instance_test; diff --git a/tests-integration/src/tests/gc.rs b/tests-integration/src/tests/gc.rs new file mode 100644 index 0000000000..c2b402eb1a --- /dev/null +++ b/tests-integration/src/tests/gc.rs @@ -0,0 +1,262 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
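// End-to-end test for SST garbage collection: it starts a cluster with GC enabled on both the
// metasrv (GcSchedulerOptions) and the datanode mito engine (GcConfig), creates an append-mode
// table, produces four SST files by flushing after each insert, compacts them into one, runs a
// BatchGcProcedure over the table's regions, and finally asserts that only the compacted file
// remains while all inserted rows stay queryable.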
+ +use std::collections::{HashMap, HashSet}; +use std::time::Duration; + +use common_meta::key::TableMetadataManagerRef; +use common_procedure::ProcedureWithId; +use common_telemetry::info; +use common_test_util::recordbatch::check_output_stream; +use futures::TryStreamExt as _; +use itertools::Itertools; +use meta_srv::gc::{BatchGcProcedure, GcSchedulerOptions, Region2Peers}; +use mito2::gc::GcConfig; +use store_api::storage::RegionId; +use table::metadata::TableId; + +use crate::cluster::GreptimeDbClusterBuilder; +use crate::test_util::{StorageType, TempDirGuard, get_test_store_config}; +use crate::tests::test_util::{MockInstanceBuilder, TestContext, execute_sql, wait_procedure}; + +/// Helper function to get table route information for GC procedure +async fn get_table_route( + table_metadata_manager: &TableMetadataManagerRef, + table_id: TableId, +) -> (Region2Peers, Vec) { + // Get physical table route + let (_, physical_table_route) = table_metadata_manager + .table_route_manager() + .get_physical_table_route(table_id) + .await + .unwrap(); + + let mut region_routes = Region2Peers::new(); + let mut regions = Vec::new(); + + // Convert region routes to Region2Peers format + for region_route in physical_table_route.region_routes { + let region_id = region_route.region.id; + let leader_peer = region_route.leader_peer.clone().unwrap(); + let follower_peers = region_route.follower_peers.clone(); + + region_routes.insert(region_id, (leader_peer, follower_peers)); + regions.push(region_id); + } + + (region_routes, regions) +} + +/// Helper function to list all SST files +async fn list_sst_files(test_context: &TestContext) -> HashSet { + let mut sst_files = HashSet::new(); + + for datanode in test_context.datanodes().values() { + let region_server = datanode.region_server(); + let mito = region_server.mito_engine().unwrap(); + let all_files = mito + .all_ssts_from_storage() + .try_collect::>() + .await + .unwrap() + .into_iter() + .map(|e| e.file_path) + .collect_vec(); + sst_files.extend(all_files); + } + + sst_files +} + +async fn distributed_with_gc(store_type: &StorageType) -> (TestContext, TempDirGuard) { + common_telemetry::init_default_ut_logging(); + let test_name = uuid::Uuid::new_v4().to_string(); + let (store_config, guard) = get_test_store_config(store_type); + + let builder = GreptimeDbClusterBuilder::new(&test_name) + .await + .with_metasrv_gc_config(GcSchedulerOptions { + enable: true, + ..Default::default() + }) + .with_datanode_gc_config(GcConfig { + enable: true, + // set lingering time to zero for test speedup + lingering_time: Some(Duration::ZERO), + ..Default::default() + }) + .with_store_config(store_config); + ( + TestContext::new(MockInstanceBuilder::Distributed(builder)).await, + guard, + ) +} + +#[tokio::test] +async fn test_gc_basic_different_store() { + common_telemetry::init_default_ut_logging(); + let store_type = StorageType::build_storage_types_based_on_env(); + for store in store_type { + if store == StorageType::File { + continue; // no point in test gc in fs storage + } + info!("Running GC test with storage type: {}", store); + test_gc_basic(&store).await; + } +} + +async fn test_gc_basic(store_type: &StorageType) { + let (test_context, _guard) = distributed_with_gc(store_type).await; + let instance = test_context.frontend(); + let metasrv = test_context.metasrv(); + + // Step 1: Create table with append_mode to easily generate multiple files + let create_table_sql = r#" + CREATE TABLE test_gc_table ( + ts TIMESTAMP TIME INDEX, + val DOUBLE, + host STRING + 
) WITH (append_mode = 'true') + "#; + execute_sql(&instance, create_table_sql).await; + + // Step 2: Generate SST files by inserting data and flushing multiple times + for i in 0..4 { + let insert_sql = format!( + r#" + INSERT INTO test_gc_table (ts, val, host) VALUES + ('2023-01-0{} 10:00:00', {}, 'host{}'), + ('2023-01-0{} 11:00:00', {}, 'host{}'), + ('2023-01-0{} 12:00:00', {}, 'host{}') + "#, + i + 1, + 10.0 + i as f64, + i, + i + 1, + 20.0 + i as f64, + i, + i + 1, + 30.0 + i as f64, + i + ); + execute_sql(&instance, &insert_sql).await; + + // Flush the table to create SST files + let flush_sql = "ADMIN FLUSH_TABLE('test_gc_table')"; + execute_sql(&instance, flush_sql).await; + } + + // Step 3: Get table information + let table = instance + .catalog_manager() + .table("greptime", "public", "test_gc_table", None) + .await + .unwrap() + .unwrap(); + let table_id = table.table_info().table_id(); + + // List SST files before compaction (for verification) + let sst_files_before_compaction = list_sst_files(&test_context).await; + info!( + "SST files before compaction: {:?}", + sst_files_before_compaction + ); + assert_eq!(sst_files_before_compaction.len(), 4); // 4 files from 4 flushes + + // Step 4: Trigger compaction to create garbage SST files + let compact_sql = "ADMIN COMPACT_TABLE('test_gc_table')"; + execute_sql(&instance, compact_sql).await; + + // Wait for compaction to complete + tokio::time::sleep(Duration::from_secs(2)).await; + + // List SST files after compaction (should have both old and new files) + let sst_files_after_compaction = list_sst_files(&test_context).await; + info!( + "SST files after compaction: {:?}", + sst_files_after_compaction + ); + assert_eq!(sst_files_after_compaction.len(), 5); // 4 old + 1 new + + // Step 5: Get table route information for GC procedure + let (region_routes, regions) = + get_table_route(metasrv.table_metadata_manager(), table_id).await; + + // Step 6: Create and execute BatchGcProcedure + let procedure = BatchGcProcedure::new( + metasrv.mailbox().clone(), + metasrv.options().grpc.server_addr.clone(), + regions.clone(), + false, // full_file_listing + region_routes, + HashMap::new(), // related_regions (empty for this simple test) + Duration::from_secs(10), // timeout + ); + + // Submit the procedure to the procedure manager + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + let procedure_id = procedure_with_id.id; + + let _watcher = metasrv + .procedure_manager() + .submit(procedure_with_id) + .await + .unwrap(); + + // Wait for the procedure to complete + wait_procedure(metasrv.procedure_manager(), procedure_id).await; + + // Step 7: Verify GC results + let sst_files_after_gc = list_sst_files(&test_context).await; + info!("SST files after GC: {:?}", sst_files_after_gc); + assert_eq!(sst_files_after_gc.len(), 1); // Only the compacted file should remain after gc + + // Verify that data is still accessible + let count_sql = "SELECT COUNT(*) FROM test_gc_table"; + let count_output = execute_sql(&instance, count_sql).await; + let expected = r#" ++----------+ +| count(*) | ++----------+ +| 12 | ++----------+"# + .trim(); + check_output_stream(count_output.data, expected).await; + + let select_sql = "SELECT * FROM test_gc_table ORDER BY ts"; + let select_output = execute_sql(&instance, select_sql).await; + let expected = r#" ++---------------------+------+-------+ +| ts | val | host | ++---------------------+------+-------+ +| 2023-01-01T10:00:00 | 10.0 | host0 | +| 2023-01-01T11:00:00 | 20.0 | host0 | +| 
2023-01-01T12:00:00 | 30.0 | host0 | +| 2023-01-02T10:00:00 | 11.0 | host1 | +| 2023-01-02T11:00:00 | 21.0 | host1 | +| 2023-01-02T12:00:00 | 31.0 | host1 | +| 2023-01-03T10:00:00 | 12.0 | host2 | +| 2023-01-03T11:00:00 | 22.0 | host2 | +| 2023-01-03T12:00:00 | 32.0 | host2 | +| 2023-01-04T10:00:00 | 13.0 | host3 | +| 2023-01-04T11:00:00 | 23.0 | host3 | +| 2023-01-04T12:00:00 | 33.0 | host3 | ++---------------------+------+-------+"# + .trim(); + check_output_stream(select_output.data, expected).await; + + // TODO: Add more specific assertions once we have proper file system access + // For now, the test passes if the procedure executes without errors + info!("GC test completed successfully"); +} diff --git a/tests-integration/src/tests/instance_kafka_wal_test.rs b/tests-integration/src/tests/instance_kafka_wal_test.rs index 053521c86f..ed74525aef 100644 --- a/tests-integration/src/tests/instance_kafka_wal_test.rs +++ b/tests-integration/src/tests/instance_kafka_wal_test.rs @@ -18,9 +18,8 @@ use std::sync::atomic::{AtomicU64, Ordering}; use client::DEFAULT_CATALOG_NAME; use common_query::{Output, OutputData}; -use datatypes::arrow::array::AsArray; +use datatypes::arrow::array::{ArrayRef, AsArray, TimestampMillisecondArray}; use datatypes::arrow::datatypes::TimestampMillisecondType; -use datatypes::vectors::{TimestampMillisecondVector, VectorRef}; use frontend::instance::Instance; use itertools::Itertools; use rand::Rng; @@ -85,12 +84,10 @@ async fn test_create_database_and_insert_query( OutputData::Stream(s) => { let batches = common_recordbatch::util::collect(s).await.unwrap(); assert_eq!(1, batches[0].num_columns()); - assert_eq!( - Arc::new(TimestampMillisecondVector::from_vec(vec![ - 1655276557000_i64 - ])) as VectorRef, - *batches[0].column(0) - ); + let expected = Arc::new(TimestampMillisecondArray::from_iter_values(vec![ + 1655276557000_i64, + ])) as ArrayRef; + assert_eq!(batches[0].column(0), &expected); } _ => unreachable!(), } @@ -226,7 +223,7 @@ async fn ensure_data_exists(tables: &[Table], instance: &Arc) { let queried = record_batches .into_iter() .flat_map(|rb| { - let array = rb.column(0).to_arrow_array(); + let array = rb.column(0); let array = array.as_primitive::(); array.iter().flatten().map(|x| x as u64).collect::>() }) diff --git a/tests-integration/src/tests/instance_noop_wal_test.rs b/tests-integration/src/tests/instance_noop_wal_test.rs index 1bc4870fa8..dd9c46cb1d 100644 --- a/tests-integration/src/tests/instance_noop_wal_test.rs +++ b/tests-integration/src/tests/instance_noop_wal_test.rs @@ -19,7 +19,7 @@ use common_wal::config::{DatanodeWalConfig, MetasrvWalConfig}; use crate::cluster::GreptimeDbClusterBuilder; use crate::tests::test_util::{ - MockInstance, MockInstanceBuilder, RebuildableMockInstance, TestContext, execute_sql, + MockInstanceBuilder, RebuildableMockInstance, TestContext, execute_sql, }; pub(crate) async fn distributed_with_noop_wal() -> TestContext { diff --git a/tests-integration/src/tests/instance_test.rs b/tests-integration/src/tests/instance_test.rs index a29e468bf6..db72f75d35 100644 --- a/tests-integration/src/tests/instance_test.rs +++ b/tests-integration/src/tests/instance_test.rs @@ -22,7 +22,9 @@ use common_query::Output; use common_recordbatch::util; use common_test_util::recordbatch::check_output_stream; use common_test_util::temp_dir; -use datatypes::vectors::{StringVector, TimestampMillisecondVector, UInt64Vector, VectorRef}; +use datatypes::arrow::array::{ + ArrayRef, AsArray, StringArray, TimestampMillisecondArray, UInt64Array, 
+}; use frontend::error::{Error, Result}; use frontend::instance::Instance; use operator::error::Error as OperatorError; @@ -77,12 +79,10 @@ async fn test_create_database_and_insert_query(instance: Arc) OutputData::Stream(s) => { let batches = util::collect(s).await.unwrap(); assert_eq!(1, batches[0].num_columns()); - assert_eq!( - Arc::new(TimestampMillisecondVector::from_vec(vec![ - 1655276557000_i64 - ])) as VectorRef, - *batches[0].column(0) - ); + let expected = Arc::new(TimestampMillisecondArray::from_iter_values(vec![ + 1655276557000_i64, + ])) as ArrayRef; + assert_eq!(batches[0].column(0), &expected); } _ => unreachable!(), } @@ -210,7 +210,8 @@ async fn test_show_create_external_table(instance: Arc) { // We can't directly test `show create table` by check_output_stream because the location name length depends on the current filesystem. let record_batches = record_batches.iter().collect::>(); let column = record_batches[0].column_by_name("Create Table").unwrap(); - let actual = column.get(0); + let column = column.as_string::(); + let actual = column.value(0); let expect = format!( r#"CREATE EXTERNAL TABLE IF NOT EXISTS "various_type_csv" ( "c_int" BIGINT NULL, @@ -312,14 +313,11 @@ async fn assert_query_result(instance: &Arc, sql: &str, ts: i64, host: let batches = util::collect(s).await.unwrap(); // let columns = batches[0].df_recordbatch.columns(); assert_eq!(2, batches[0].num_columns()); - assert_eq!( - Arc::new(StringVector::from(vec![host])) as VectorRef, - *batches[0].column(0) - ); - assert_eq!( - Arc::new(TimestampMillisecondVector::from_vec(vec![ts])) as VectorRef, - *batches[0].column(1) - ); + let expected = vec![ + Arc::new(StringArray::from_iter_values(vec![host])) as ArrayRef, + Arc::new(TimestampMillisecondArray::from_iter_values(vec![ts])) as ArrayRef, + ]; + assert_eq!(batches[0].columns(), &expected); } _ => unreachable!(), } @@ -446,10 +444,8 @@ async fn test_execute_query(instance: Arc) { assert_eq!(1, numbers[0].num_columns()); assert_eq!(numbers[0].column(0).len(), 1); - assert_eq!( - Arc::new(UInt64Vector::from_vec(vec![4950_u64])) as VectorRef, - *numbers[0].column(0), - ); + let expected = Arc::new(UInt64Array::from_iter_values(vec![4950_u64])) as ArrayRef; + assert_eq!(numbers[0].column(0), &expected); } _ => unreachable!(), } @@ -482,11 +478,11 @@ async fn test_execute_show_databases_tables(instance: Arc) { check_unordered_output_stream(output, expected).await; let expected = "\ -+---------+ -| Tables | -+---------+ -| numbers | -+---------+\ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+\ "; let output = execute_sql(&instance, "show tables").await; check_unordered_output_stream(output, expected).await; @@ -498,23 +494,23 @@ async fn test_execute_show_databases_tables(instance: Arc) { let output = execute_sql(&instance, "show tables").await; let expected = "\ -+---------+ -| Tables | -+---------+ -| demo | -| numbers | -+---------+\ ++------------------+ +| Tables_in_public | ++------------------+ +| demo | +| numbers | ++------------------+\ "; check_unordered_output_stream(output, expected).await; let output = execute_sql(&instance, "SHOW FULL TABLES WHERE Table_type != 'VIEW'").await; let expected = "\ -+---------+-----------------+ -| Tables | Table_type | -+---------+-----------------+ -| demo | BASE TABLE | -| numbers | LOCAL TEMPORARY | -+---------+-----------------+\ ++------------------+-----------------+ +| Tables_in_public | Table_type | ++------------------+-----------------+ +| demo | 
BASE TABLE | +| numbers | LOCAL TEMPORARY | ++------------------+-----------------+\ "; check_unordered_output_stream(output, expected).await; @@ -524,22 +520,22 @@ async fn test_execute_show_databases_tables(instance: Arc) { ) .await; let expected = "\ -+--------+------------+ -| Tables | Table_type | -+--------+------------+ -| demo | BASE TABLE | -+--------+------------+\ ++------------------+------------+ +| Tables_in_public | Table_type | ++------------------+------------+ +| demo | BASE TABLE | ++------------------+------------+\ "; check_unordered_output_stream(output, expected).await; // show tables like [string] let output = execute_sql(&instance, "show tables like 'de%'").await; let expected = "\ -+--------+ -| Tables | -+--------+ -| demo | -+--------+\ ++------------------+ +| Tables_in_public | ++------------------+ +| demo | ++------------------+\ "; check_unordered_output_stream(output, expected).await; } @@ -1256,11 +1252,11 @@ async fn test_rename_table(instance: Arc) { .await .data; let expect = "\ -+------------+ -| Tables | -+------------+ -| test_table | -+------------+"; ++--------------+ +| Tables_in_db | ++--------------+ +| test_table | ++--------------+"; check_output_stream(output, expect).await; let output = execute_sql_with( @@ -1327,12 +1323,12 @@ async fn test_create_table_after_rename_table(instance: Arc) { assert!(matches!(output, OutputData::AffectedRows(0))); let expect = "\ -+------------+ -| Tables | -+------------+ -| demo | -| test_table | -+------------+"; ++--------------+ +| Tables_in_db | ++--------------+ +| demo | +| test_table | ++--------------+"; let output = execute_sql_with(&instance, "show tables", query_ctx) .await .data; @@ -1520,11 +1516,11 @@ async fn test_use_database(instance: Arc) { .await .data; let expected = "\ -+--------+ -| Tables | -+--------+ -| tb1 | -+--------+"; ++---------------+ +| Tables_in_db1 | ++---------------+ +| tb1 | ++---------------+"; check_output_stream(output, expected).await; let output = execute_sql_with( @@ -2175,7 +2171,8 @@ async fn test_custom_storage(instance: Arc) { let record_batches = record_batches.iter().collect::>(); let column = record_batches[0].column_by_name("Create Table").unwrap(); - let actual = column.get(0); + let column = column.as_string::(); + let actual = column.value(0); let expect = if instance.is_distributed_mode() { format!( diff --git a/tests-integration/src/tests/reconcile_table.rs b/tests-integration/src/tests/reconcile_table.rs index 3e8414436d..d1204cba20 100644 --- a/tests-integration/src/tests/reconcile_table.rs +++ b/tests-integration/src/tests/reconcile_table.rs @@ -24,8 +24,8 @@ use table::table_reference::TableReference; use crate::cluster::GreptimeDbClusterBuilder; use crate::tests::test_util::{ - MockInstance, MockInstanceBuilder, RebuildableMockInstance, TestContext, dump_kvbackend, - execute_sql, restore_kvbackend, try_execute_sql, wait_procedure, + MockInstanceBuilder, RebuildableMockInstance, TestContext, dump_kvbackend, execute_sql, + restore_kvbackend, try_execute_sql, wait_procedure, }; const CREATE_MONITOR_TABLE_SQL: &str = r#" @@ -409,11 +409,11 @@ async fn test_recover_metadata_failed() { // Only grpc_latencies table is visible. 
let output = execute_sql(&test_context.frontend(), "show tables;").await; - let expected = r#"+---------+ -| Tables | -+---------+ -| numbers | -+---------+"#; + let expected = r#"+------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+"#; check_output_stream(output.data, expected).await; // Expect table creation to fail because the region directory already exists. @@ -474,12 +474,12 @@ async fn test_dropped_table() { test_context.rebuild().await; let output = execute_sql(&test_context.frontend(), "show tables;").await; - let expected = r#"+----------------+ -| Tables | -+----------------+ -| grpc_latencies | -| numbers | -+----------------+"#; + let expected = r#"+------------------+ +| Tables_in_public | ++------------------+ +| grpc_latencies | +| numbers | ++------------------+"#; check_output_stream(output.data, expected).await; // We can't query the table because the table is dropped. @@ -531,12 +531,12 @@ async fn test_renamed_table() { check_output_stream(output.data, expected).await; let output = execute_sql(&test_context.frontend(), "show tables;").await; - let expected = r#"+----------------+ -| Tables | -+----------------+ -| grpc_latencies | -| numbers | -+----------------+"#; + let expected = r#"+------------------+ +| Tables_in_public | ++------------------+ +| grpc_latencies | +| numbers | ++------------------+"#; check_output_stream(output.data, expected).await; } diff --git a/tests-integration/src/tests/test_util.rs b/tests-integration/src/tests/test_util.rs index 019ccd79e5..eccca85305 100644 --- a/tests-integration/src/tests/test_util.rs +++ b/tests-integration/src/tests/test_util.rs @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
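// The GC test needs to reach into each datanode's region server to list SST files, so the mock
// instance and TestContext below also expose the datanode map and a frontend accessor alongside
// the existing metasrv handle.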
+use std::collections::HashMap; use std::env; use std::sync::Arc; use async_trait::async_trait; use client::OutputData; +use common_meta::DatanodeId; use common_meta::kv_backend::KvBackendRef; use common_meta::range_stream::{DEFAULT_PAGE_SIZE, PaginationStream}; use common_meta::rpc::KeyValue; @@ -30,6 +32,7 @@ use common_test_util::find_workspace_path; use common_wal::config::kafka::common::{KafkaConnectionConfig, KafkaTopicConfig}; use common_wal::config::kafka::{DatanodeKafkaConfig, MetasrvKafkaConfig}; use common_wal::config::{DatanodeWalConfig, MetasrvWalConfig}; +use datanode::datanode::Datanode; use frontend::error::Result; use frontend::instance::Instance; use futures::TryStreamExt; @@ -95,6 +98,13 @@ impl MockInstanceImpl { MockInstanceImpl::Distributed(instance) => &instance.metasrv, } } + + pub(crate) fn datanodes(&self) -> &HashMap { + match self { + MockInstanceImpl::Standalone(_) => unreachable!(), + MockInstanceImpl::Distributed(instance) => &instance.datanode_instances, + } + } } impl MockInstance for MockInstanceImpl { @@ -185,6 +195,14 @@ impl TestContext { pub(crate) fn metasrv(&self) -> &Arc { self.instance.as_ref().unwrap().metasrv() } + + pub(crate) fn frontend(&self) -> Arc { + self.instance.as_ref().unwrap().frontend() + } + + pub(crate) fn datanodes(&self) -> &HashMap { + self.instance.as_ref().unwrap().datanodes() + } } #[async_trait::async_trait] diff --git a/tests-integration/tests/grpc.rs b/tests-integration/tests/grpc.rs index 6f82d4fc55..447d5afe50 100644 --- a/tests-integration/tests/grpc.rs +++ b/tests-integration/tests/grpc.rs @@ -514,23 +514,24 @@ async fn insert_with_hints_and_assert(db: &Database) { let pretty = record_batches.pretty_print().unwrap(); let expected = "\ -+-------+-------------------------------------+ -| Table | Create Table | -+-------+-------------------------------------+ -| demo | CREATE TABLE IF NOT EXISTS \"demo\" ( | -| | \"host\" STRING NULL, | -| | \"cpu\" DOUBLE NULL, | -| | \"memory\" DOUBLE NULL, | -| | \"ts\" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX (\"ts\"), | -| | PRIMARY KEY (\"host\") | -| | ) | -| | | -| | ENGINE=mito | -| | WITH( | -| | append_mode = 'true' | -| | ) | -+-------+-------------------------------------+\ ++-------+---------------------------------------+ +| Table | Create Table | ++-------+---------------------------------------+ +| demo | CREATE TABLE IF NOT EXISTS \"demo\" ( | +| | \"host\" STRING NULL, | +| | \"cpu\" DOUBLE NULL, | +| | \"memory\" DOUBLE NULL, | +| | \"ts\" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX (\"ts\"), | +| | PRIMARY KEY (\"host\") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Created on insertion', | +| | append_mode = 'true' | +| | ) | ++-------+---------------------------------------+\ "; assert_eq!(pretty, expected); @@ -953,6 +954,7 @@ pub async fn test_grpc_tls_config(store_type: StorageType) { Some(TlsMode::Require), Some(server_cert_path), Some(server_key_path), + false, ); let config = GrpcServerConfig { max_recv_message_size: 1024, @@ -970,6 +972,7 @@ pub async fn test_grpc_tls_config(store_type: StorageType) { server_ca_cert_path: Some(ca_path), client_cert_path: Some(client_cert_path), client_key_path: Some(client_key_path), + watch: false, }; { let grpc_client = @@ -1007,7 +1010,8 @@ pub async fn test_grpc_tls_config(store_type: StorageType) { let runtime = Runtime::builder().build().unwrap(); let grpc_builder = GrpcServerBuilder::new(config.clone(), runtime).with_tls_config(config.tls); - assert!(grpc_builder.is_err()); + // ok but print warning 
+ assert!(grpc_builder.is_ok()); } let _ = fe_grpc_server.shutdown().await; diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index d0b603c800..912bb303b9 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -101,6 +101,7 @@ macro_rules! http_tests { test_health_api, test_status_api, test_config_api, + test_dynamic_tracer_toggle, test_dashboard_path, test_prometheus_remote_write, test_prometheus_remote_special_labels, @@ -121,6 +122,7 @@ macro_rules! http_tests { test_pipeline_context, test_pipeline_with_vrl, test_pipeline_with_hint_vrl, + test_pipeline_one_to_many_vrl, test_pipeline_2, test_pipeline_skip_error, test_pipeline_filter, @@ -1493,12 +1495,16 @@ manifest_checkpoint_distance = 10 experimental_manifest_keep_removed_file_count = 256 experimental_manifest_keep_removed_file_ttl = "1h" compress_manifest = false +experimental_compaction_memory_limit = "unlimited" +experimental_compaction_on_exhausted = "wait" auto_flush_interval = "30m" enable_write_cache = false write_cache_path = "" write_cache_size = "5GiB" preload_index_cache = true index_cache_percent = 20 +enable_refill_cache_on_read = true +manifest_cache_size = "256MiB" sst_write_buffer_size = "8MiB" parallel_scan_channel_size = 32 max_concurrent_scan_files = 384 @@ -1538,8 +1544,8 @@ type = "time_series" [region_engine.mito.gc] enable = false -lingering_time = "5m" -unknown_file_lingering_time = "6h" +lingering_time = "1m" +unknown_file_lingering_time = "1h" max_concurrent_lister_per_gc_job = 32 max_concurrent_gc_job = 4 @@ -1547,10 +1553,6 @@ max_concurrent_gc_job = 4 [region_engine.file] -[export_metrics] -enable = false -write_interval = "30s" - [tracing] [slow_query] @@ -1631,6 +1633,35 @@ fn drop_lines_with_inconsistent_results(input: String) -> String { ) } +pub async fn test_dynamic_tracer_toggle(store_type: StorageType) { + common_telemetry::init_default_ut_logging(); + + let (app, mut guard) = setup_test_http_app(store_type, "test_dynamic_tracer_toggle").await; + let client = TestClient::new(app).await; + + let disable_resp = client + .post("/debug/enable_trace") + .body("false") + .send() + .await; + assert_eq!(disable_resp.status(), StatusCode::OK); + assert_eq!(disable_resp.text().await, "trace disabled"); + + let enable_resp = client.post("/debug/enable_trace").body("true").send().await; + assert_eq!(enable_resp.status(), StatusCode::OK); + assert_eq!(enable_resp.text().await, "trace enabled"); + + let cleanup_resp = client + .post("/debug/enable_trace") + .body("false") + .send() + .await; + assert_eq!(cleanup_resp.status(), StatusCode::OK); + assert_eq!(cleanup_resp.text().await, "trace disabled"); + + guard.remove_all().await; +} + #[cfg(feature = "dashboard")] pub async fn test_dashboard_path(store_type: StorageType) { common_telemetry::init_default_ut_logging(); @@ -1756,7 +1787,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) { expected, ) .await; - let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n on_physical_table = 'f1'\\n)\"]]"; + let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created 
on insertion',\\n on_physical_table = 'f1'\\n)\"]]"; validate_data( "test_prometheus_remote_special_labels_idc3_show_create_table", &client, @@ -1782,7 +1813,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) { expected, ) .await; - let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n on_physical_table = 'f2'\\n)\"]]"; + let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f2'\\n)\"]]"; validate_data( "test_prometheus_remote_special_labels_idc4_show_create_table", &client, @@ -2244,7 +2275,7 @@ transform: assert_eq!(res.status(), StatusCode::OK); // 3. check schema - let expected_schema = "[[\"logs1\",\"CREATE TABLE IF NOT EXISTS \\\"logs1\\\" (\\n \\\"id1\\\" INT NULL INVERTED INDEX,\\n \\\"id2\\\" INT NULL INVERTED INDEX,\\n \\\"logger\\\" STRING NULL,\\n \\\"type\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"log\\\" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false', false_positive_rate = '0.01', granularity = '10240'),\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\"),\\n PRIMARY KEY (\\\"type\\\", \\\"log\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]"; + let expected_schema = "[[\"logs1\",\"CREATE TABLE IF NOT EXISTS \\\"logs1\\\" (\\n \\\"id1\\\" INT NULL INVERTED INDEX,\\n \\\"id2\\\" INT NULL INVERTED INDEX,\\n \\\"logger\\\" STRING NULL,\\n \\\"type\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"log\\\" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false', false_positive_rate = '0.01', granularity = '10240'),\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\"),\\n PRIMARY KEY (\\\"type\\\", \\\"log\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n \'comment\' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]"; validate_data( "pipeline_schema", &client, @@ -3085,9 +3116,10 @@ table_suffix: _${type} // ) // ENGINE=mito // WITH( + // 'comment' = 'Created on insertion', // append_mode = 'true' // ) - let expected = "[[\"d_table_db\",\"CREATE TABLE IF NOT EXISTS \\\"d_table_db\\\" (\\n \\\"id1_root\\\" INT NULL,\\n \\\"id2_root\\\" INT NULL,\\n \\\"type\\\" STRING NULL,\\n \\\"log\\\" STRING NULL,\\n \\\"logger\\\" STRING NULL,\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]"; + let expected = "[[\"d_table_db\",\"CREATE TABLE IF NOT EXISTS \\\"d_table_db\\\" (\\n \\\"id1_root\\\" INT NULL,\\n \\\"id2_root\\\" INT NULL,\\n \\\"type\\\" STRING NULL,\\n \\\"log\\\" STRING NULL,\\n \\\"logger\\\" STRING NULL,\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]"; validate_data( "test_pipeline_context_db", @@ -3102,11 +3134,12 @@ table_suffix: _${type} // ) // ENGINE=mito // WITH( + // 'comment' = 'Created on insertion', // 
append_mode = 'true', // skip_wal = 'true', // ttl = '1day' // ) - let expected = "[[\"d_table_http\",\"CREATE TABLE IF NOT EXISTS \\\"d_table_http\\\" (\\n \\\"id1_root\\\" INT NULL,\\n \\\"id2_root\\\" INT NULL,\\n \\\"type\\\" STRING NULL,\\n \\\"log\\\" STRING NULL,\\n \\\"logger\\\" STRING NULL,\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true',\\n skip_wal = 'true',\\n ttl = '1day'\\n)\"]]"; + let expected = "[[\"d_table_http\",\"CREATE TABLE IF NOT EXISTS \\\"d_table_http\\\" (\\n \\\"id1_root\\\" INT NULL,\\n \\\"id2_root\\\" INT NULL,\\n \\\"type\\\" STRING NULL,\\n \\\"log\\\" STRING NULL,\\n \\\"logger\\\" STRING NULL,\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n \'comment\' = 'Created on insertion',\\n append_mode = 'true',\\n skip_wal = 'true',\\n ttl = '1day'\\n)\"]]"; validate_data( "test_pipeline_context_http", &client, @@ -3259,6 +3292,151 @@ transform: guard.remove_all().await; } +/// Test one-to-many VRL pipeline expansion. +/// This test verifies that a VRL processor can return an array, which results in +/// multiple output rows from a single input row. +pub async fn test_pipeline_one_to_many_vrl(storage_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_http_app_with_frontend(storage_type, "test_pipeline_one_to_many_vrl").await; + + let client = TestClient::new(app).await; + + // Pipeline that expands events array into multiple rows + let pipeline = r#" +processors: + - date: + field: timestamp + formats: + - "%Y-%m-%d %H:%M:%S" + ignore_missing: true + - vrl: + source: | + # Extract events array and expand each event into a separate row + events = del(.events) + base_host = del(.host) + base_timestamp = del(.timestamp) + + # Map each event to a complete row object + map_values(array!(events)) -> |event| { + { + "host": base_host, + "event_type": event.type, + "event_value": event.value, + "timestamp": base_timestamp + } + } + +transform: + - field: host + type: string + - field: event_type + type: string + - field: event_value + type: int32 + - field: timestamp + type: time + index: timestamp +"#; + + // 1. create pipeline + let res = client + .post("/v1/events/pipelines/one_to_many") + .header("Content-Type", "application/x-yaml") + .body(pipeline) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 2. write data - single input with multiple events + let data_body = r#" +[ + { + "host": "server1", + "timestamp": "2024-05-25 20:16:37", + "events": [ + {"type": "cpu", "value": 80}, + {"type": "memory", "value": 60}, + {"type": "disk", "value": 45} + ] + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=metrics&pipeline_name=one_to_many") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 3. verify: one input row should produce three output rows + validate_data( + "test_pipeline_one_to_many_vrl_count", + &client, + "select count(*) from metrics", + "[[3]]", + ) + .await; + + // 4. verify the actual data + validate_data( + "test_pipeline_one_to_many_vrl_data", + &client, + "select host, event_type, event_value from metrics order by event_type", + "[[\"server1\",\"cpu\",80],[\"server1\",\"disk\",45],[\"server1\",\"memory\",60]]", + ) + .await; + + // 5. 
Test with multiple input rows, each producing multiple output rows + let data_body2 = r#" +[ + { + "host": "server2", + "timestamp": "2024-05-25 20:17:00", + "events": [ + {"type": "cpu", "value": 90}, + {"type": "memory", "value": 70} + ] + }, + { + "host": "server3", + "timestamp": "2024-05-25 20:18:00", + "events": [ + {"type": "cpu", "value": 50} + ] + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=metrics&pipeline_name=one_to_many") + .header("Content-Type", "application/json") + .body(data_body2) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + // 6. verify total count: 3 (from first batch) + 2 + 1 = 6 rows + validate_data( + "test_pipeline_one_to_many_vrl_total_count", + &client, + "select count(*) from metrics", + "[[6]]", + ) + .await; + + // 7. verify rows per host + validate_data( + "test_pipeline_one_to_many_vrl_per_host", + &client, + "select host, count(*) as cnt from metrics group by host order by host", + "[[\"server1\",3],[\"server2\",2],[\"server3\",1]]", + ) + .await; + + guard.remove_all().await; +} + pub async fn test_pipeline_2(storage_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = setup_test_http_app_with_frontend(storage_type, "test_pipeline_2").await; @@ -3319,13 +3497,14 @@ transform: // ) // ENGINE=mito // WITH( + // 'comment' = 'Created on insertion', // append_mode = 'true' // ) validate_data( "test_pipeline_2_schema", &client, "show create table d_table", - "[[\"d_table\",\"CREATE TABLE IF NOT EXISTS \\\"d_table\\\" (\\n \\\"id1\\\" INT NULL INVERTED INDEX,\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n \\\"id2\\\" STRING NULL,\\n TIME INDEX (\\\"time\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]", + "[[\"d_table\",\"CREATE TABLE IF NOT EXISTS \\\"d_table\\\" (\\n \\\"id1\\\" INT NULL INVERTED INDEX,\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n \\\"id2\\\" STRING NULL,\\n TIME INDEX (\\\"time\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]", ) .await; @@ -4288,10 +4467,11 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { // ) // ENGINE=metric // WITH( + // 'comment' = 'Created on insertion', // on_physical_table = 'greptime_physical_table', // otlp_metric_compat = 'prom' // ) - let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; + let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n 
\\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; validate_data( "otlp_metrics_all_show_create_table", &client, @@ -4301,7 +4481,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { .await; // select metrics data - let expected = "[[1753780559836,0.0052544,\"arm64\",\"claude-code\",\"claude-3-5-haiku-20241022\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,2.244618,\"arm64\",\"claude-code\",\"claude-sonnet-4-20250514\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]"; + let expected = "[[1753780559836,2.244618,\"arm64\",\"claude-code\",\"claude-sonnet-4-20250514\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,0.0052544,\"arm64\",\"claude-code\",\"claude-3-5-haiku-20241022\",\"25.0.0\",\"com.anthropic.claude_code\",\"\",\"1.0.62\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]"; validate_data( "otlp_metrics_all_select", &client, @@ -4360,10 +4540,11 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { // ) // ENGINE=metric // WITH( + // 'comment' = 'Created on insertion', // on_physical_table = 'greptime_physical_table', // otlp_metric_compat = 'prom' // ) - let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; + let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS 
\\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; validate_data( "otlp_metrics_show_create_table", &client, @@ -4373,7 +4554,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { .await; // select metrics data - let expected = "[[1753780559836,2.244618,\"claude-code\",\"claude-sonnet-4-20250514\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,0.0052544,\"claude-code\",\"claude-3-5-haiku-20241022\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]"; + let expected = "[[1753780559836,0.0052544,\"claude-code\",\"claude-3-5-haiku-20241022\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"],[1753780559836,2.244618,\"claude-code\",\"claude-sonnet-4-20250514\",\"darwin\",\"25.0.0\",\"claude-code\",\"1.0.62\",\"736525A3-F5D4-496B-933E-827AF23A5B97\",\"ghostty\",\"6DA02FD9-B5C5-4E61-9355-9FE8EC9A0CF4\"]]"; validate_data( "otlp_metrics_select", &client, @@ -4423,10 +4604,11 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { // ) // ENGINE=metric // WITH( + // 'comment' = 'Created on insertion', // on_physical_table = 'greptime_physical_table', // otlp_metric_compat = 'prom' // ) - let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; + let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", 
\\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; validate_data( "otlp_metrics_show_create_table_none", &client, @@ -4652,7 +4834,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { let expected = r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#; validate_data("otlp_traces", &client, "select * from mytable;", expected).await; - let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; + let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL 
SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; validate_data( "otlp_traces", &client, @@ -4661,7 +4843,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; - let expected_ddl = r#"[["mytable_services","CREATE TABLE IF NOT EXISTS \"mytable_services\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"service_name\" STRING NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n append_mode = 'false'\n)"]]"#; + let expected_ddl = r#"[["mytable_services","CREATE TABLE IF NOT EXISTS \"mytable_services\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"service_name\" STRING NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'false'\n)"]]"#; validate_data( "otlp_traces", &client, @@ -4904,7 +5086,7 @@ pub async fn test_loki_pb_logs(store_type: StorageType) { assert_eq!(StatusCode::OK, res.status()); // test schema - let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"line\\\" STRING NULL,\\n \\\"structured_metadata\\\" JSON NULL,\\n \\\"service\\\" STRING NULL,\\n \\\"source\\\" STRING NULL,\\n \\\"wadaxi\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"service\\\", \\\"source\\\", \\\"wadaxi\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]"; + let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"line\\\" STRING NULL,\\n \\\"structured_metadata\\\" JSON NULL,\\n \\\"service\\\" STRING NULL,\\n \\\"source\\\" STRING NULL,\\n \\\"wadaxi\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"service\\\", \\\"source\\\", \\\"wadaxi\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n \'comment\' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]"; validate_data( "loki_pb_schema", &client, @@ -5036,9 +5218,10 @@ processors: // ) // ENGINE=mito // WITH( + // 'comment' = 'Created on insertion', // append_mode = 'true' // ) - let expected = 
"[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"loki_label_service\\\" STRING NULL,\\n \\\"loki_label_source\\\" STRING NULL,\\n \\\"loki_label_wadaxi\\\" STRING NULL,\\n \\\"loki_line\\\" STRING NULL,\\n \\\"loki_metadata_key1\\\" STRING NULL,\\n \\\"loki_metadata_key2\\\" STRING NULL,\\n \\\"loki_metadata_key3\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]"; + let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"loki_label_service\\\" STRING NULL,\\n \\\"loki_label_source\\\" STRING NULL,\\n \\\"loki_label_wadaxi\\\" STRING NULL,\\n \\\"loki_line\\\" STRING NULL,\\n \\\"loki_metadata_key1\\\" STRING NULL,\\n \\\"loki_metadata_key2\\\" STRING NULL,\\n \\\"loki_metadata_key3\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]"; validate_data( "loki_pb_schema", &client, @@ -5108,7 +5291,7 @@ pub async fn test_loki_json_logs(store_type: StorageType) { assert_eq!(StatusCode::OK, res.status()); // test schema - let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"line\\\" STRING NULL,\\n \\\"structured_metadata\\\" JSON NULL,\\n \\\"sender\\\" STRING NULL,\\n \\\"source\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"sender\\\", \\\"source\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]"; + let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"line\\\" STRING NULL,\\n \\\"structured_metadata\\\" JSON NULL,\\n \\\"sender\\\" STRING NULL,\\n \\\"source\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"sender\\\", \\\"source\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n \'comment\' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]"; validate_data( "loki_json_schema", &client, @@ -5209,9 +5392,10 @@ processors: // ) // ENGINE=mito // WITH( + // 'comment' = 'Created on insertion', // append_mode = 'true' // ) - let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"loki_label_sender\\\" STRING NULL,\\n \\\"loki_label_source\\\" STRING NULL,\\n \\\"loki_line\\\" STRING NULL,\\n \\\"loki_metadata_key1\\\" STRING NULL,\\n \\\"loki_metadata_key2\\\" STRING NULL,\\n \\\"loki_metadata_key3\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]"; + let expected = "[[\"loki_table_name\",\"CREATE TABLE IF NOT EXISTS \\\"loki_table_name\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"loki_label_sender\\\" STRING NULL,\\n \\\"loki_label_source\\\" STRING NULL,\\n \\\"loki_line\\\" STRING NULL,\\n \\\"loki_metadata_key1\\\" STRING NULL,\\n \\\"loki_metadata_key2\\\" STRING NULL,\\n \\\"loki_metadata_key3\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true'\\n)\"]]"; validate_data( "loki_json_schema", &client, @@ -6039,6 +6223,94 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: 
StorageType) { } } ] + }, + { + "scope": { + "name": "test-jaeger-grafana-ua", + "version": "1.0.0" + }, + "spans": [ + { + "traceId": "5611dce1bc9ebed65352d99a027b08fb", + "spanId": "ffa03416a7b9ea50", + "name": "access-span-1", + "kind": 2, + "startTimeUnixNano": "1738726754600000000", + "endTimeUnixNano": "1738726754700000000", + "attributes": [ + { + "key": "operation.type", + "value": { + "stringValue": "access-span-1" + } + } + ], + "status": { + "message": "success", + "code": 0 + } + }, + { + "traceId": "5611dce1bc9ebed65352d99a027b08fb", + "spanId": "ffa03416a7b9ea51", + "name": "access-span-2", + "kind": 2, + "startTimeUnixNano": "1738726754600001000", + "endTimeUnixNano": "1738726754700001000", + "attributes": [ + { + "key": "operation.type", + "value": { + "stringValue": "access-span-2" + } + } + ], + "status": { + "message": "success", + "code": 0 + } + }, + { + "traceId": "5611dce1bc9ebed65352d99a027b08fb", + "spanId": "ffa03416a7b9ea52", + "name": "access-span-3", + "kind": 2, + "startTimeUnixNano": "1738726754600002000", + "endTimeUnixNano": "1738726754700002000", + "attributes": [ + { + "key": "operation.type", + "value": { + "stringValue": "access-span-3" + } + } + ], + "status": { + "message": "success", + "code": 0 + } + }, + { + "traceId": "5611dce1bc9ebed65352d99a027b08fb", + "spanId": "ffa03416a7b9ea53", + "name": "access-span-4", + "kind": 2, + "startTimeUnixNano": "1738726754600003000", + "endTimeUnixNano": "1738726754700003000", + "attributes": [ + { + "key": "operation.type", + "value": { + "stringValue": "access-span-4" + } + } + ], + "status": { + "message": "success", + "code": 0 + } + } + ] } ], "schemaUrl": "https://opentelemetry.io/schemas/1.4.0" @@ -6092,7 +6364,7 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { .await; assert_eq!(StatusCode::OK, res.status()); - let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id 
>= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n append_mode = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]"; + let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]"; validate_data( "trace_v1_create_table", &client, @@ -6101,7 +6373,7 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { ) .await; - let trace_meta_table_sql = "[[\"mytable_services\",\"CREATE TABLE IF NOT EXISTS \\\"mytable_services\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"service_name\\\" STRING NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'false'\\n)\"]]"; + let trace_meta_table_sql = "[[\"mytable_services\",\"CREATE TABLE IF NOT EXISTS \\\"mytable_services\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"service_name\\\" STRING NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'false'\\n)\"]]"; validate_data( "trace_v1_create_meta_table", &client, @@ -6153,9 +6425,25 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { { "name": "access-redis", "spanKind": "server" + }, + { + "name": "access-span-1", + "spanKind": "server" + }, + { + "name": "access-span-2", + "spanKind": "server" + }, + { + "name": "access-span-3", + "spanKind": "server" + }, + { + "name": "access-span-4", + "spanKind": "server" } ], - "total": 3, + "total": 7, "limit": 
0, "offset": 0, "errors": [] @@ -6177,9 +6465,13 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { "data": [ "access-mysql", "access-pg", - "access-redis" + "access-redis", + "access-span-1", + "access-span-2", + "access-span-3", + "access-span-4" ], - "total": 3, + "total": 7, "limit": 0, "offset": 0, "errors": [] @@ -6441,6 +6733,30 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { let expected: Value = serde_json::from_str(expected).unwrap(); assert_eq!(resp, expected); + // Test `/api/traces/{trace_id}` API for non-existent trace. + let res = client + .get("/v1/jaeger/api/traces/0000000000000000000000000000dead") + .header("x-greptime-trace-table-name", trace_table_name) + .send() + .await; + assert_eq!(StatusCode::NOT_FOUND, res.status()); + let expected = r#"{ + "data": null, + "total": 0, + "limit": 0, + "offset": 0, + "errors": [ + { + "code": 404, + "msg": "trace not found" + } + ] +} +"#; + let resp: Value = serde_json::from_str(&res.text().await).unwrap(); + let expected: Value = serde_json::from_str(expected).unwrap(); + assert_eq!(resp, expected); + // Test `/api/traces` API. let res = client .get("/v1/jaeger/api/traces?service=test-jaeger-query-api&operation=access-mysql&start=1738726754492421&end=1738726754642422&tags=%7B%22operation.type%22%3A%22access-mysql%22%7D") @@ -6601,6 +6917,97 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { let expected: Value = serde_json::from_str(expected).unwrap(); assert_eq!(resp, expected); + // Test `/api/traces` API with Grafana User-Agent. + // When user agent is Grafana, only return at most 3 spans per trace (earliest by timestamp). + // Trace `5611dce1bc9ebed65352d99a027b08fb` has 4 spans, so only 3 should be returned. + let res = client + .get("/v1/jaeger/api/traces?service=test-jaeger-query-api&start=1738726754600000&end=1738726754700004") + .header("x-greptime-trace-table-name", trace_table_name) + .header("User-Agent", "Grafana/8.0.0") + .send() + .await; + assert_eq!(StatusCode::OK, res.status()); + + let resp: Value = serde_json::from_str(&res.text().await).unwrap(); + // Verify that the trace has exactly 3 spans (limited by Grafana user agent) + let data = resp.get("data").unwrap().as_array().unwrap(); + assert_eq!(data.len(), 1, "Expected 1 trace"); + let trace = &data[0]; + assert_eq!( + trace.get("traceID").unwrap().as_str().unwrap(), + "5611dce1bc9ebed65352d99a027b08fb" + ); + let spans = trace.get("spans").unwrap().as_array().unwrap(); + assert_eq!( + spans.len(), + 3, + "Expected 3 spans (limited by Grafana user agent), got {}", + spans.len() + ); + // Verify that the 3 earliest spans are returned (by timestamp ascending) + let span_names: Vec<&str> = spans + .iter() + .map(|s| s.get("operationName").unwrap().as_str().unwrap()) + .collect(); + assert!( + span_names.contains(&"access-span-1"), + "Expected access-span-1 in spans" + ); + assert!( + span_names.contains(&"access-span-2"), + "Expected access-span-2 in spans" + ); + assert!( + span_names.contains(&"access-span-3"), + "Expected access-span-3 in spans" + ); + assert!( + !span_names.contains(&"access-span-4"), + "access-span-4 should NOT be in spans (4th span)" + ); + + // Test `/api/traces` API without User-Agent (default behavior). + // All 4 spans should be returned for the trace. 
+    let res = client
+        .get("/v1/jaeger/api/traces?service=test-jaeger-query-api&start=1738726754600000&end=1738726754700004")
+        .header("x-greptime-trace-table-name", trace_table_name)
+        .send()
+        .await;
+    assert_eq!(StatusCode::OK, res.status());
+
+    let resp: Value = serde_json::from_str(&res.text().await).unwrap();
+    let data = resp.get("data").unwrap().as_array().unwrap();
+    assert_eq!(data.len(), 1, "Expected 1 trace");
+    let trace = &data[0];
+    let spans = trace.get("spans").unwrap().as_array().unwrap();
+    assert_eq!(
+        spans.len(),
+        4,
+        "Expected 4 spans (no user agent limit), got {}",
+        spans.len()
+    );
+
+    // Test `/api/traces` API with Jaeger User-Agent (should return all spans like default).
+    let res = client
+        .get("/v1/jaeger/api/traces?service=test-jaeger-query-api&start=1738726754600000&end=1738726754700004")
+        .header("x-greptime-trace-table-name", trace_table_name)
+        .header("User-Agent", "Jaeger-Query/1.0.0")
+        .send()
+        .await;
+    assert_eq!(StatusCode::OK, res.status());
+
+    let resp: Value = serde_json::from_str(&res.text().await).unwrap();
+    let data = resp.get("data").unwrap().as_array().unwrap();
+    assert_eq!(data.len(), 1, "Expected 1 trace");
+    let trace = &data[0];
+    let spans = trace.get("spans").unwrap().as_array().unwrap();
+    assert_eq!(
+        spans.len(),
+        4,
+        "Expected 4 spans (Jaeger user agent, no limit), got {}",
+        spans.len()
+    );
+
     guard.remove_all().await;
 }
diff --git a/tests-integration/tests/region_migration.rs b/tests-integration/tests/region_migration.rs
index 732f3b77f7..b9f106e183 100644
--- a/tests-integration/tests/region_migration.rs
+++ b/tests-integration/tests/region_migration.rs
@@ -34,9 +34,8 @@ use common_test_util::temp_dir::create_temp_dir;
 use common_wal::config::kafka::common::{KafkaConnectionConfig, KafkaTopicConfig};
 use common_wal::config::kafka::{DatanodeKafkaConfig, MetasrvKafkaConfig};
 use common_wal::config::{DatanodeWalConfig, MetasrvWalConfig};
-use datatypes::prelude::ScalarVector;
-use datatypes::value::Value;
-use datatypes::vectors::{Helper, UInt64Vector};
+use datatypes::arrow::array::AsArray;
+use datatypes::arrow::datatypes::UInt64Type;
 use frontend::error::Result as FrontendResult;
 use frontend::instance::Instance;
 use futures::future::BoxFuture;
@@ -364,7 +363,7 @@ pub async fn test_metric_table_region_migration_by_sql(
     let result = cluster
         .frontend
         .instance
-        .do_query("select * from t1", query_ctx.clone())
+        .do_query("select * from t1 order by host desc", query_ctx.clone())
         .await
         .remove(0);
@@ -380,7 +379,7 @@ pub async fn test_metric_table_region_migration_by_sql(
     let result = cluster
         .frontend
         .instance
-        .do_query("select * from t2", query_ctx)
+        .do_query("select * from t2 order by job desc", query_ctx)
         .await
         .remove(0);
@@ -1189,12 +1188,12 @@ async fn find_region_distribution_by_sql(
     let mut distribution = RegionDistribution::new();
     for batch in recordbatches.take() {
-        let datanode_ids: &UInt64Vector =
-            unsafe { Helper::static_cast(batch.column_by_name("datanode_id").unwrap()) };
-        let region_ids: &UInt64Vector =
-            unsafe { Helper::static_cast(batch.column_by_name("region_id").unwrap()) };
+        let column = batch.column_by_name("datanode_id").unwrap();
+        let datanode_ids = column.as_primitive::<UInt64Type>();
+        let column = batch.column_by_name("region_id").unwrap();
+        let region_ids = column.as_primitive::<UInt64Type>();

-        for (datanode_id, region_id) in datanode_ids.iter_data().zip(region_ids.iter_data()) {
+        for (datanode_id, region_id) in datanode_ids.iter().zip(region_ids.iter()) {
             let (Some(datanode_id),
Some(region_id)) = (datanode_id, region_id) else { unreachable!(); }; @@ -1231,11 +1230,10 @@ async fn trigger_migration_by_sql( info!("SQL result:\n {}", recordbatches.pretty_print().unwrap()); - let Value::String(procedure_id) = recordbatches.take()[0].column(0).get(0) else { - unreachable!(); - }; - - procedure_id.as_utf8().to_string() + let record_batch = &recordbatches.take()[0]; + let column = record_batch.column(0); + let column = column.as_string::(); + column.value(0).to_string() } /// Query procedure state by SQL. @@ -1254,11 +1252,10 @@ async fn query_procedure_by_sql(instance: &Arc, pid: &str) -> String { info!("SQL result:\n {}", recordbatches.pretty_print().unwrap()); - let Value::String(state) = recordbatches.take()[0].column(0).get(0) else { - unreachable!(); - }; - - state.as_utf8().to_string() + let record_batch = &recordbatches.take()[0]; + let column = record_batch.column(0); + let column = column.as_string::(); + column.value(0).to_string() } async fn insert_values(instance: &Arc, ts: u64) -> Vec> { diff --git a/tests-integration/tests/sql.rs b/tests-integration/tests/sql.rs index c28347076a..b3d981b1b0 100644 --- a/tests-integration/tests/sql.rs +++ b/tests-integration/tests/sql.rs @@ -230,7 +230,7 @@ pub async fn test_mysql_crud(store_type: StorageType) { .unwrap(); sqlx::query( - "create table demo(i bigint, ts timestamp time index default current_timestamp, d date default null, dt timestamp(3) default null, b blob default null, j json default null, v vector(3) default null)", + "create table demo(i bigint, ts timestamp time index default current_timestamp, d date default null, dt timestamp(3) default null, b blob default null, j json, v vector(3) default null)", ) .execute(&pool) .await @@ -307,13 +307,7 @@ pub async fn test_mysql_crud(store_type: StorageType) { } }); assert_eq!(json, expected_j); - assert_eq!( - vector, - [1.0f32, 2.0, 3.0] - .iter() - .flat_map(|x| x.to_le_bytes()) - .collect::>() - ); + assert_eq!(vector, "[1,2,3]".as_bytes()); } let rows = sqlx::query("select i from demo where i=?") diff --git a/tests/cases/distributed/explain/step_aggr_advance.result b/tests/cases/distributed/explain/step_aggr_advance.result index a5f7a4aff9..20407ddd73 100644 --- a/tests/cases/distributed/explain/step_aggr_advance.result +++ b/tests/cases/distributed/explain/step_aggr_advance.result @@ -21,37 +21,28 @@ Affected Rows: 0 -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr_optimize_not [2m])); -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[max(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | -| | Projection: 
aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | -| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) | -| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b, c@4 as c] | -| | CooperativeExec | -| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] | -| | CooperativeExec | -| | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | 
++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[max(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] | +| | CooperativeExec | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED @@ -66,30 +57,34 @@ tql analyze (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr | stage | node | plan_| +-+-+-+ | 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED -|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED -|_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 
as c, greptime_timestamp@0 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b, c@4 as c] REDACTED |_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 1_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[max(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED 
+|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED @@ -105,37 +100,32 @@ tql analyze (1752591864, 1752592164, '30s') max by (a, b, c) (max_over_time(aggr -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_optimize_not [2m])); -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= 
TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST] | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST], preserve_partitioning=[true] | -| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) | -| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b] | -| | CooperativeExec | -| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] | -| | CooperativeExec | -| | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.greptime_timestamp]], aggr=[[__sum_merge(__sum_state(prom_max_over_time(greptime_timestamp_range,greptime_value))) AS sum(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.greptime_timestamp]], aggr=[[__sum_state(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: 
tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST] | +| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST], preserve_partitioning=[true] | +| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=REDACTED +| | AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))] | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED @@ -151,29 +141,33 @@ tql analyze (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_op +-+-+-+ | 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST] REDACTED |_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, greptime_timestamp@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0, 1]) REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a, b@3 as b] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[sum(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 
0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[__sum_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, greptime_timestamp@2 as greptime_timestamp], aggr=[__sum_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED @@ -189,37 +183,32 @@ tql analyze (1752591864, 1752592164, '30s') sum by (a, b) (max_over_time(aggr_op -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optimize_not [2m])); 
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.greptime_timestamp]], aggr=[[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST] | -| | SortExec: expr=[a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] | -| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) | -| | SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[a@2 as a, greptime_timestamp@0 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) | -| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a] | -| | CooperativeExec | -| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] | -| | CooperativeExec | -| | MergeScanExec: REDACTED -| | | 
-+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.greptime_timestamp]], aggr=[[__avg_merge(__avg_state(prom_max_over_time(greptime_timestamp_range,greptime_value))) AS avg(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.greptime_timestamp]], aggr=[[__avg_state(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST] | +| | SortExec: expr=[a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] | +| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=REDACTED +| | AggregateExec: mode=Partial, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))] | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | | ++---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE 
(metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED @@ -235,29 +224,33 @@ tql analyze (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optim +-+-+-+ | 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST] REDACTED |_|_|_SortExec: expr=[a@0 ASC NULLS LAST, greptime_timestamp@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, greptime_timestamp@0 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=PartiallySorted([0]) REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), a@2 as a] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[avg(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[__avg_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, greptime_timestamp@0 as greptime_timestamp], aggr=[__avg_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_CoalesceBatchesExec: 
target_batch_size=8192 REDACTED +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, greptime_timestamp@1 as greptime_timestamp], aggr=[__avg_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, greptime_timestamp@0 as greptime_timestamp], aggr=[__avg_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED @@ -273,34 +266,28 @@ tql analyze (1752591864, 1752592164, '30s') avg by (a) (max_over_time(aggr_optim -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time(aggr_optimize_not [2m])); -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, aggr_optimize_not.greptime_timestamp]], aggr=[[count(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: 
offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] | -| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d, greptime_timestamp@4 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST], preserve_partitioning=[true] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, d@5 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted | -| | CooperativeExec | -| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] | -| | CooperativeExec | -| | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, aggr_optimize_not.greptime_timestamp]], aggr=[[count(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, 
aggr_optimize_not.d | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| physical_plan | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] | +| | CooperativeExec | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | | ++---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED @@ -315,17 +302,17 @@ tql analyze (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time | stage | node | plan_| +-+-+-+ | 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d, greptime_timestamp@4 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST], preserve_partitioning=[true] REDACTED -|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED -|_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, d@5 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))], ordering_mode=Sorted REDACTED |_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d, greptime_timestamp@4 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, d@5 as d, greptime_timestamp@0 as greptime_timestamp], 
aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED |_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED @@ -334,7 +321,13 @@ tql analyze (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 1_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, d@3 ASC NULLS LAST, greptime_timestamp@4 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, d@3 as d, greptime_timestamp@4 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, d@5 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[count(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED |_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED @@ -352,35 +345,32 @@ tql analyze (1752591864, 1752592164, '30s') count by (a, b, c, d) (max_over_time -- SQLNESS REPLACE (Hash.*) REDACTED tql explain (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr_optimize_not [2m])); -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, 
aggr_optimize_not.greptime_timestamp]], aggr=[[min(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| physical_plan | SortPreservingMergeExec: [b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] | -| | SortExec: expr=[b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | -| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] | -| | CoalesceBatchesExec: target_batch_size=8192 | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.d ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, aggr_optimize_not.greptime_timestamp]], aggr=[[__min_merge(__min_state(prom_max_over_time(greptime_timestamp_range,greptime_value))) AS min(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Aggregate: groupBy=[[aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d, aggr_optimize_not.greptime_timestamp]], 
aggr=[[__min_state(prom_max_over_time(greptime_timestamp_range,greptime_value))]] | +| | Filter: prom_max_over_time(greptime_timestamp_range,greptime_value) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_max_over_time(greptime_timestamp_range, greptime_value) AS prom_max_over_time(greptime_timestamp_range,greptime_value), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| physical_plan | SortPreservingMergeExec: [b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] | +| | SortExec: expr=[b@0 ASC NULLS LAST, c@1 ASC NULLS LAST, d@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | +| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] | +| | CoalesceBatchesExec: target_batch_size=8192 | | | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[b@2 as b, c@3 as c, d@4 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] | -| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), b@3 as b, c@4 as c, d@5 as d] | -| | CooperativeExec | -| | CooperativeExec | -| | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| | AggregateExec: mode=Partial, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | | ++---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED @@ -399,24 +389,30 @@ tql analyze (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr |_|_|_AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: 
partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[b@2 as b, c@3 as c, d@4 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range,greptime_value)@1 as prom_max_over_time(greptime_timestamp_range,greptime_value), b@3 as b, c@4 as c, d@5 as d] REDACTED -|_|_|_CooperativeExec REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[min(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[__min_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[b@2 as b, c@3 as c, d@4 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[__min_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), b@1 as b, c@2 as c, d@3 as d] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[b@0 as b, c@1 as c, d@2 as d, greptime_timestamp@3 as greptime_timestamp], aggr=[__min_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[b@2 as b, c@3 as c, d@4 as d, greptime_timestamp@0 as greptime_timestamp], aggr=[__min_state(prom_max_over_time(greptime_timestamp_range,greptime_value))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_max_over_time(greptime_timestamp_range,greptime_value)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: 
expr=[greptime_timestamp@4 as greptime_timestamp, prom_max_over_time(greptime_timestamp_range@6, greptime_value@5) as prom_max_over_time(greptime_timestamp_range,greptime_value), b@1 as b, c@2 as c, d@3 as d] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED @@ -432,32 +428,29 @@ tql analyze (1752591864, 1752592164, '30s') min by (b, c, d) (max_over_time(aggr -- SQLNESS REPLACE (Hash.*) REDACTED tql explain sum(aggr_optimize_not); -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| plan_type | plan | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| logical_plan | Sort: aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.greptime_timestamp]], aggr=[[sum(aggr_optimize_not.greptime_value)]] | -| | Projection: aggr_optimize_not.greptime_timestamp, aggr_optimize_not.greptime_value | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-300000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | -| | ]] | -| physical_plan | SortPreservingMergeExec: [greptime_timestamp@0 ASC NULLS LAST] | -| | SortExec: expr=[greptime_timestamp@0 ASC NULLS LAST], preserve_partitioning=[true] | -| | AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] | -| | CoalesceBatchesExec: target_batch_size=8192 | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| plan_type | plan | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| logical_plan | Sort: aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.greptime_timestamp]], aggr=[[__sum_merge(__sum_state(aggr_optimize_not.greptime_value)) AS sum(aggr_optimize_not.greptime_value)]] | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Aggregate: 
groupBy=[[aggr_optimize_not.greptime_timestamp]], aggr=[[__sum_state(aggr_optimize_not.greptime_value)]] | +| | PromInstantManipulate: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-300000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | +| | ]] | +| physical_plan | SortPreservingMergeExec: [greptime_timestamp@0 ASC NULLS LAST] | +| | SortExec: expr=[greptime_timestamp@0 ASC NULLS LAST], preserve_partitioning=[true] | +| | AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] | +| | CoalesceBatchesExec: target_batch_size=8192 | | | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] | -| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, greptime_value@5 as greptime_value] | -| | CooperativeExec | -| | CooperativeExec | -| | MergeScanExec: REDACTED -| | | -+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| | AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | | ++---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -- SQLNESS REPLACE (metrics.*) REDACTED -- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED @@ -477,17 +470,25 @@ tql analyze sum(aggr_optimize_not); |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: partitioning=REDACTED |_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(aggr_optimize_not.greptime_value)] REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, greptime_value@5 as greptime_value] REDACTED -|_|_|_CooperativeExec REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(aggr_optimize_not.greptime_value)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(aggr_optimize_not.greptime_value)] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, greptime_value@5 as greptime_value] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] 
REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(aggr_optimize_not.greptime_value)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(aggr_optimize_not.greptime_value)] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, greptime_value@5 as greptime_value] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED @@ -520,62 +521,61 @@ tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) | | | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp | -| | SubqueryAlias: aggr_optimize_not | -| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | -| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c | -| | MergeSort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, 
aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | -| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not | +| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | SubqueryAlias: aggr_optimize_not | +| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST | +| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | +| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | +| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST | +| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not | | | ]] | | | SubqueryAlias: aggr_optimize_not_count | | | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST | | | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], 
aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] | -| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c | -| | MergeSort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | -| | MergeScan [is_placeholder=false, remote_input=[ | -| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | -| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.d | -| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | -| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | -| | PromSeriesDivide: tags=["a", "b", "c", "d"] | -| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | -| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(300000, None) | -| | TableScan: aggr_optimize_not_count | +| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL | +| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c | +| | PromRangeManipulate: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] | +| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivide: tags=["a", "b", "c", "d"] | +| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST | +| | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.d, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not_count.greptime_value | +| | MergeScan [is_placeholder=false, remote_input=[ | +| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(-420000, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(300000, None) | +| | TableScan: aggr_optimize_not_count | | | ]] | | physical_plan | ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, greptime_timestamp@4 as greptime_timestamp, 
sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@0 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | | | CoalesceBatchesExec: target_batch_size=8192 | | | REDACTED -| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] | -| | CoalesceBatchesExec: target_batch_size=8192 | -| | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) | -| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] | -| | CooperativeExec | -| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] | -| | CooperativeExec | -| | MergeScanExec: REDACTED | | CoalesceBatchesExec: target_batch_size=8192 | | | RepartitionExec: partitioning=REDACTED -| | CoalescePartitionsExec | -| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) | -| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] | +| | CooperativeExec | +| | CooperativeExec | +| | MergeScanExec: REDACTED +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=REDACTED +| | SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] | +| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] | +| | AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | | | CoalesceBatchesExec: target_batch_size=8192 | | | RepartitionExec: partitioning=REDACTED -| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) | -| | ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] | -| | CooperativeExec | -| | SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, 
greptime_timestamp@0 ASC], preserve_partitioning=[true] | -| | CooperativeExec | -| | MergeScanExec: REDACTED +| | AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL | +| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] | +| | PromRangeManipulateExec: req range=[0..0], interval=[300000], eval range=[120000], time index=[greptime_timestamp] | +| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] | +| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] | +| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] | +| | CoalesceBatchesExec: target_batch_size=8192 | +| | RepartitionExec: partitioning=REDACTED +| | CooperativeExec | +| | MergeScanExec: REDACTED | | | +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -591,67 +591,62 @@ tql analyze (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_|_ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, greptime_timestamp@4 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@0 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED +| 0_| 0_|_ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], 
preserve_partitioning=[true] REDACTED -|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED -|_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_MergeScanExec: REDACTED -|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED -|_|_|_RepartitionExec: partitioning=REDACTED |_|_|_CoalescePartitionsExec REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED -|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] REDACTED +|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED +|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED +|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED +|_|_|_SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))], ordering_mode=PartiallySorted([0, 1, 2]) REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 
as b, c@4 as c] REDACTED |_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[a@2 ASC, b@3 ASC, c@4 ASC, d@5 ASC, greptime_timestamp@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED +| 1_| 0_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED +|_|_|_| +| 1_| 1_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0} REDACTED +|_|_|_| +| 1_| 0_|_ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)))@4 as sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED +|_|_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, 
"partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED +| 1_| 1_|_ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)))@4 as sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] REDACTED +|_|_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED -|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED -|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED -|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED -|_|_|_| -| 1_| 0_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED -|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED -|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED 
-|_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED -|_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED -|_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED -|_|_|_| -| 1_| 1_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@2 as a, b@3 as b, c@4 as c, d@5 as d] REDACTED -|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED -|_|_|_FilterExec: prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000))@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not_count.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c, d@3 as d] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,aggr_optimize_not.greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] REDACTED |_|_|_PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["a", "b", "c", "d"] REDACTED diff --git a/tests/cases/distributed/explain/subqueries.result b/tests/cases/distributed/explain/subqueries.result index 3f40eb0c24..260b2963d5 100644 --- a/tests/cases/distributed/explain/subqueries.result +++ b/tests/cases/distributed/explain/subqueries.result @@ -373,7 +373,8 @@ EXPLAIN SELECT DISTINCT x FROM (SELECT a AS x FROM t) sq ORDER BY x; | logical_plan_| Sort: sq.x ASC NULLS LAST_| |_|_Aggregate: groupBy=[[sq.x]], aggr=[[]]_| |_|_MergeScan [is_placeholder=false, remote_input=[_| -|_| Projection: sq.x_| +|_| Distinct:_| +|_|_Projection: sq.x_| |_|_SubqueryAlias: sq_| |_|_Projection: t.a AS x_| |_|_TableScan: t_| diff --git a/tests/cases/distributed/flow-tql/flow_tql.result b/tests/cases/distributed/flow-tql/flow_tql.result index 6afffc6edb..7e56009552 100644 --- a/tests/cases/distributed/flow-tql/flow_tql.result +++ b/tests/cases/distributed/flow-tql/flow_tql.result @@ -15,20 +15,22 @@ Affected Rows: 0 SHOW CREATE TABLE cnt_reqs; -+----------+-------------------------------------------+ -| Table | Create Table | -+----------+-------------------------------------------+ -| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | -| | "count(http_requests.val)" DOUBLE NULL, | -| | "ts" TIMESTAMP(3) NOT NULL, | -| | "status_code" STRING NULL, | -| | TIME INDEX ("ts"), | -| | PRIMARY KEY ("status_code") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+----------+-------------------------------------------+ ++----------+---------------------------------------------------+ +| Table | Create Table | 
++----------+---------------------------------------------------+ +| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | +| | "count(http_requests.val)" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "status_code" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("status_code") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++----------+---------------------------------------------------+ -- test if sink table is tql queryable TQL EVAL (now() - '1m'::interval, now(), '5s') count_values("status_code", cnt_reqs); @@ -113,7 +115,7 @@ Error: 3001(EngineExecuteQuery), Invalid query: TQL query only supports one f64 SHOW TABLES; +------------------------+ -| Tables | +| Tables_in_public | +------------------------+ | http_requests_two_vals | | numbers | @@ -157,20 +159,22 @@ Affected Rows: 0 SHOW CREATE TABLE cnt_reqs; -+----------+-------------------------------------------+ -| Table | Create Table | -+----------+-------------------------------------------+ -| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | -| | "count(http_requests.val)" DOUBLE NULL, | -| | "ts" TIMESTAMP(3) NOT NULL, | -| | "status_code" STRING NULL, | -| | TIME INDEX ("ts"), | -| | PRIMARY KEY ("status_code") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+----------+-------------------------------------------+ ++----------+---------------------------------------------------+ +| Table | Create Table | ++----------+---------------------------------------------------+ +| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | +| | "count(http_requests.val)" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "status_code" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("status_code") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++----------+---------------------------------------------------+ -- test if sink table is tql queryable TQL EVAL (now() - '1m'::interval, now(), '5s') count_values("status_code", cnt_reqs); @@ -258,7 +262,9 @@ SHOW CREATE TABLE rate_reqs; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +-----------+-----------------------------------------------------------+ -- test if sink table is tql queryable @@ -337,7 +343,9 @@ SHOW CREATE TABLE rate_reqs; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +-----------+-----------------------------------------------------------+ -- test if sink table is tql queryable diff --git a/tests/cases/distributed/tql-explain-analyze/analyze.result b/tests/cases/distributed/tql-explain-analyze/analyze.result index 1f90b57d46..ac40094d99 100644 --- a/tests/cases/distributed/tql-explain-analyze/analyze.result +++ b/tests/cases/distributed/tql-explain-analyze/analyze.result @@ -127,10 +127,7 @@ TQL ANALYZE (0, 10, '5s') test; +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_|_SortPreservingMergeExec: [k@2 ASC, l@3 ASC, j@1 ASC] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[k@2 ASC, l@3 ASC, j@1 ASC], preserve_partitioning=[true] REDACTED -|_|_|_CooperativeExec REDACTED +| 0_| 0_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[j] REDACTED @@ -158,10 +155,7 @@ TQL ANALYZE (0, 10, '5s') rate(test[10s]); +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 
0_|_SortPreservingMergeExec: [k@2 ASC, l@3 ASC, j@0 ASC] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[k@2 ASC, l@3 ASC, j@0 ASC], preserve_partitioning=[true] REDACTED -|_|_|_CooperativeExec REDACTED +| 0_| 0_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_ProjectionExec: expr=[j@0 as j, prom_rate(j_range,i,test.j,Int64(10000))@1 as prom_rate(j_range,i,j,Int64(10000)), k@2 as k, l@3 as l] REDACTED diff --git a/tests/cases/standalone/common/aggregate/distinct.result b/tests/cases/standalone/common/aggregate/distinct.result index f36b0b75a6..48331ed45e 100644 --- a/tests/cases/standalone/common/aggregate/distinct.result +++ b/tests/cases/standalone/common/aggregate/distinct.result @@ -105,6 +105,133 @@ EXPLAIN ANALYZE SELECT DISTINCT a FROM test ORDER BY a; |_|_| Total rows: 2_| +-+-+-+ +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE SELECT DISTINCT a, b FROM test ORDER BY a; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 3_| ++-+-+-+ + +DROP TABLE test; + +Affected Rows: 0 + +CREATE TABLE test (a INTEGER, b INTEGER, t TIMESTAMP TIME INDEX) +PARTITION ON COLUMNS(a) ( + a <= 10, + a > 10, +); + +Affected Rows: 0 + +INSERT INTO test VALUES (1, 22, 1), (1, 21, 2), (100, 21, 3), (100, 22, 4); + +Affected Rows: 4 + +SELECT DISTINCT a, b FROM test ORDER BY a, b; + ++-----+----+ +| a | b | ++-----+----+ +| 1 | 21 | +| 1 | 22 | +| 100 | 21 | +| 100 | 22 | ++-----+----+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE SELECT DISTINCT a FROM test ORDER BY a; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=SinglePartitioned, gby=[a@0 as a], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[] REDACTED 
+|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 2_| ++-+-+-+ + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE SELECT DISTINCT a, b FROM test ORDER BY a; + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_SortPreservingMergeExec: [a@0 ASC NULLS LAST] REDACTED +|_|_|_SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeqScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0} REDACTED +|_|_|_| +|_|_| Total rows: 4_| ++-+-+-+ + DROP TABLE test; Affected Rows: 0 diff --git a/tests/cases/standalone/common/aggregate/distinct.sql b/tests/cases/standalone/common/aggregate/distinct.sql index 8fe8cec395..cdf9de5e6d 100644 --- a/tests/cases/standalone/common/aggregate/distinct.sql +++ b/tests/cases/standalone/common/aggregate/distinct.sql @@ -27,4 +27,43 @@ SELECT DISTINCT ON (a) * FROM test ORDER BY a, t DESC; -- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED EXPLAIN ANALYZE SELECT DISTINCT a FROM test ORDER BY a; +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE SELECT DISTINCT a, b FROM test ORDER BY a; + +DROP TABLE test; + +CREATE TABLE test (a INTEGER, b INTEGER, t TIMESTAMP TIME INDEX) +PARTITION ON COLUMNS(a) ( + a <= 10, + a > 10, +); + +INSERT INTO test VALUES (1, 22, 1), (1, 21, 2), (100, 21, 3), (100, 22, 4); + +SELECT DISTINCT a, b FROM test ORDER BY a, b; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED 
+EXPLAIN ANALYZE SELECT DISTINCT a FROM test ORDER BY a; + +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED +-- SQLNESS REPLACE (Hash.*) REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +EXPLAIN ANALYZE SELECT DISTINCT a, b FROM test ORDER BY a; + DROP TABLE test; diff --git a/tests/cases/standalone/common/alter/alter_database.result b/tests/cases/standalone/common/alter/alter_database.result index 8ff458989e..911ef5ddfc 100644 --- a/tests/cases/standalone/common/alter/alter_database.result +++ b/tests/cases/standalone/common/alter/alter_database.result @@ -104,6 +104,216 @@ SHOW CREATE DATABASE alter_database; | alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +----------------+----------------------------------------------+ +ALTER DATABASE alter_database SET 'compaction.type'='twcs'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+----------------------------------------------+ + +ALTER DATABASE alter_database SET 'compaction.twcs.time_window'='2h'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '2h', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+----------------------------------------------+ + +ALTER DATABASE alter_database SET 'compaction.twcs.trigger_file_num'='8'; + +Affected Rows: 0 + +ALTER DATABASE alter_database SET 'compaction.twcs.max_output_file_size'='512MB'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+-----------------------------------------------------+ +| Database | Create Database | ++----------------+-----------------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.max_output_file_size' = '512MB', | +| | 'compaction.twcs.time_window' = '2h', | +| | 'compaction.twcs.trigger_file_num' = '8', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+-----------------------------------------------------+ + +ALTER DATABASE alter_database SET 'compaction.twcs.time_window'='1d'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+-----------------------------------------------------+ +| Database | Create Database | ++----------------+-----------------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.max_output_file_size' = '512MB', | +| | 'compaction.twcs.time_window' = '1d', | +| | 'compaction.twcs.trigger_file_num' = '8', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+-----------------------------------------------------+ + +-- SQLNESS ARG restart=true +SHOW CREATE DATABASE alter_database; + ++----------------+-----------------------------------------------------+ +| Database | Create Database | 
++----------------+-----------------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.max_output_file_size' = '512MB', | +| | 'compaction.twcs.time_window' = '1d', | +| | 'compaction.twcs.trigger_file_num' = '8', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+-----------------------------------------------------+ + +ALTER DATABASE alter_database UNSET 'compaction.twcs.trigger_file_num'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+-----------------------------------------------------+ +| Database | Create Database | ++----------------+-----------------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.max_output_file_size' = '512MB', | +| | 'compaction.twcs.time_window' = '1d', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+-----------------------------------------------------+ + +ALTER DATABASE alter_database UNSET 'compaction.twcs.time_window'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+-----------------------------------------------------+ +| Database | Create Database | ++----------------+-----------------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.max_output_file_size' = '512MB', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+-----------------------------------------------------+ + +ALTER DATABASE alter_database SET 'compaction.twcs.fallback_to_local'='true'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+-----------------------------------------------------+ +| Database | Create Database | ++----------------+-----------------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.fallback_to_local' = 'true', | +| | 'compaction.twcs.max_output_file_size' = '512MB', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+-----------------------------------------------------+ + +ALTER DATABASE alter_database UNSET 'compaction.type'; + +Affected Rows: 0 + +ALTER DATABASE alter_database UNSET 'compaction.twcs.max_output_file_size'; + +Affected Rows: 0 + +ALTER DATABASE alter_database UNSET 'compaction.twcs.fallback_to_local'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | ++----------------+----------------------------------------------+ + +-- SQLNESS ARG restart=true +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | ++----------------+----------------------------------------------+ + +ALTER DATABASE alter_database SET 'invalid.compaction.option'='value'; + +Error: 1004(InvalidArguments), Invalid set database option, key: invalid.compaction.option, value: value + +ALTER DATABASE alter_database SET 'ttl'='1h'; + +Affected Rows: 0 + +ALTER DATABASE alter_database SET 'compaction.type'='twcs'; + +Affected Rows: 0 
+ +ALTER DATABASE alter_database SET 'compaction.twcs.time_window'='30m'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs', | +| | ttl = '1h' | +| | ) | ++----------------+----------------------------------------------+ + +ALTER DATABASE alter_database UNSET 'ttl'; + +Affected Rows: 0 + +SHOW CREATE DATABASE alter_database; + ++----------------+----------------------------------------------+ +| Database | Create Database | ++----------------+----------------------------------------------+ +| alter_database | CREATE DATABASE IF NOT EXISTS alter_database | +| | WITH( | +| | 'compaction.twcs.time_window' = '30m', | +| | 'compaction.type' = 'twcs' | +| | ) | ++----------------+----------------------------------------------+ + DROP DATABASE alter_database; Affected Rows: 0 diff --git a/tests/cases/standalone/common/alter/alter_database.sql b/tests/cases/standalone/common/alter/alter_database.sql index f6491a24dd..1b2f75637a 100644 --- a/tests/cases/standalone/common/alter/alter_database.sql +++ b/tests/cases/standalone/common/alter/alter_database.sql @@ -32,5 +32,63 @@ SHOW CREATE DATABASE alter_database; -- SQLNESS ARG restart=true SHOW CREATE DATABASE alter_database; +ALTER DATABASE alter_database SET 'compaction.type'='twcs'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database SET 'compaction.twcs.time_window'='2h'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database SET 'compaction.twcs.trigger_file_num'='8'; + +ALTER DATABASE alter_database SET 'compaction.twcs.max_output_file_size'='512MB'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database SET 'compaction.twcs.time_window'='1d'; + +SHOW CREATE DATABASE alter_database; + +-- SQLNESS ARG restart=true +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database UNSET 'compaction.twcs.trigger_file_num'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database UNSET 'compaction.twcs.time_window'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database SET 'compaction.twcs.fallback_to_local'='true'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database UNSET 'compaction.type'; + +ALTER DATABASE alter_database UNSET 'compaction.twcs.max_output_file_size'; + +ALTER DATABASE alter_database UNSET 'compaction.twcs.fallback_to_local'; + +SHOW CREATE DATABASE alter_database; + +-- SQLNESS ARG restart=true +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database SET 'invalid.compaction.option'='value'; + +ALTER DATABASE alter_database SET 'ttl'='1h'; + +ALTER DATABASE alter_database SET 'compaction.type'='twcs'; + +ALTER DATABASE alter_database SET 'compaction.twcs.time_window'='30m'; + +SHOW CREATE DATABASE alter_database; + +ALTER DATABASE alter_database UNSET 'ttl'; + +SHOW CREATE DATABASE alter_database; + DROP DATABASE alter_database; diff --git a/tests/cases/standalone/common/alter/alter_metric_table.result b/tests/cases/standalone/common/alter/alter_metric_table.result index 1e51766669..e44e4ae5f9 100644 --- a/tests/cases/standalone/common/alter/alter_metric_table.result +++ b/tests/cases/standalone/common/alter/alter_metric_table.result @@ -4,12 +4,12 @@ Affected Rows: 0 SHOW 
TABLES; -+---------+ -| Tables | -+---------+ -| numbers | -| phy | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | +| phy | ++------------------+ DESC TABLE phy; diff --git a/tests/cases/standalone/common/alter/alter_table_alter_column_set_default.result b/tests/cases/standalone/common/alter/alter_table_alter_column_set_default.result index 30d82e1000..c92f6a4acf 100644 --- a/tests/cases/standalone/common/alter/alter_table_alter_column_set_default.result +++ b/tests/cases/standalone/common/alter/alter_table_alter_column_set_default.result @@ -116,7 +116,7 @@ SHOW CREATE TABLE test1; ALTER TABLE test1 MODIFY COLUMN o SET DEFAULT "not allow"; -Error: 1001(Unsupported), Unsupported expr in default constraint: "not allow" for column: o +Error: 1001(Unsupported), Unsupported default constraint for column: 'o', reason: expr '"not allow"' not supported ALTER TABLE test1 MODIFY COLUMN o SET DEFAULT NULL; diff --git a/tests/cases/standalone/common/catalog/schema.result b/tests/cases/standalone/common/catalog/schema.result index 57d2a2f952..759fadc04e 100644 --- a/tests/cases/standalone/common/catalog/schema.result +++ b/tests/cases/standalone/common/catalog/schema.result @@ -45,19 +45,19 @@ Affected Rows: 0 SHOW TABLES FROM test_public_schema; -+--------+ -| Tables | -+--------+ -| hello | -+--------+ ++------------------------------+ +| Tables_in_test_public_schema | ++------------------------------+ +| hello | ++------------------------------+ SHOW TABLES FROM public; -+---------+ -| Tables | -+---------+ -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ INSERT INTO hello VALUES (2), (3), (4); @@ -75,19 +75,19 @@ SELECT * FROM hello; SHOW TABLES; -+--------+ -| Tables | -+--------+ -| hello | -+--------+ ++------------------------------+ +| Tables_in_test_public_schema | ++------------------------------+ +| hello | ++------------------------------+ SHOW FULL TABLES WHERE Table_type != 'VIEW'; -+--------+------------+ -| Tables | Table_type | -+--------+------------+ -| hello | BASE TABLE | -+--------+------------+ ++------------------------------+------------+ +| Tables_in_test_public_schema | Table_type | ++------------------------------+------------+ +| hello | BASE TABLE | ++------------------------------+------------+ DROP TABLE hello; @@ -104,19 +104,19 @@ SHOW TABLES FROM test_public_schema; SHOW TABLES FROM public; -+---------+ -| Tables | -+---------+ -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ SHOW TABLES FROM public WHERE Tables = 'numbers'; -+---------+ -| Tables | -+---------+ -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ DROP SCHEMA test_public_schema; diff --git a/tests/cases/standalone/common/comment.result b/tests/cases/standalone/common/comment.result new file mode 100644 index 0000000000..19f9b8776b --- /dev/null +++ b/tests/cases/standalone/common/comment.result @@ -0,0 +1,184 @@ +-- Test: COMMENT ON TABLE add & remove +CREATE TABLE comment_table_test ( + pk INT, + val DOUBLE, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY(pk) +); + +Affected Rows: 0 + +-- Add table comment +COMMENT ON TABLE comment_table_test IS 'table level description'; + +Affected Rows: 0 + +SHOW CREATE TABLE comment_table_test; + ++--------------------+---------------------------------------------------+ +| Table | Create Table | 
++--------------------+---------------------------------------------------+ +| comment_table_test | CREATE TABLE IF NOT EXISTS "comment_table_test" ( | +| | "pk" INT NULL, | +| | "val" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("pk") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | comment = 'table level description' | +| | ) | ++--------------------+---------------------------------------------------+ + +-- Remove table comment +COMMENT ON TABLE comment_table_test IS NULL; + +Affected Rows: 0 + +SHOW CREATE TABLE comment_table_test; + ++--------------------+---------------------------------------------------+ +| Table | Create Table | ++--------------------+---------------------------------------------------+ +| comment_table_test | CREATE TABLE IF NOT EXISTS "comment_table_test" ( | +| | "pk" INT NULL, | +| | "val" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("pk") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++--------------------+---------------------------------------------------+ + +DROP TABLE comment_table_test; + +Affected Rows: 0 + +-- Test: COMMENT ON COLUMN add & remove +CREATE TABLE comment_column_test ( + pk INT, + val DOUBLE, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY(pk) +); + +Affected Rows: 0 + +-- Add column comment +COMMENT ON COLUMN comment_column_test.val IS 'value column description'; + +Affected Rows: 0 + +SHOW CREATE TABLE comment_column_test; + ++---------------------+---------------------------------------------------------+ +| Table | Create Table | ++---------------------+---------------------------------------------------------+ +| comment_column_test | CREATE TABLE IF NOT EXISTS "comment_column_test" ( | +| | "pk" INT NULL, | +| | "val" DOUBLE NULL COMMENT 'value column description', | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("pk") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++---------------------+---------------------------------------------------------+ + +-- Remove column comment +COMMENT ON COLUMN comment_column_test.val IS NULL; + +Affected Rows: 0 + +SHOW CREATE TABLE comment_column_test; + ++---------------------+----------------------------------------------------+ +| Table | Create Table | ++---------------------+----------------------------------------------------+ +| comment_column_test | CREATE TABLE IF NOT EXISTS "comment_column_test" ( | +| | "pk" INT NULL, | +| | "val" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("pk") | +| | ) | +| | | +| | ENGINE=mito | +| | | ++---------------------+----------------------------------------------------+ + +DROP TABLE comment_column_test; + +Affected Rows: 0 + +-- Test: COMMENT ON FLOW add & remove +-- Prepare source & sink tables +CREATE TABLE flow_source_comment_test ( + desc_str STRING, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +CREATE TABLE flow_sink_comment_test ( + desc_str STRING, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +CREATE FLOW flow_comment_test +SINK TO flow_sink_comment_test +AS +SELECT desc_str, ts FROM flow_source_comment_test; + +Affected Rows: 0 + +-- Add flow comment +COMMENT ON FLOW flow_comment_test IS 'flow level description'; + +Affected Rows: 0 + +SHOW CREATE FLOW flow_comment_test; + ++-------------------+------------------------------------------------------+ +| Flow | Create Flow | ++-------------------+------------------------------------------------------+ +| 
flow_comment_test | CREATE FLOW IF NOT EXISTS flow_comment_test | +| | SINK TO flow_sink_comment_test | +| | COMMENT 'flow level description' | +| | AS SELECT desc_str, ts FROM flow_source_comment_test | ++-------------------+------------------------------------------------------+ + +-- Remove flow comment +COMMENT ON FLOW flow_comment_test IS NULL; + +Affected Rows: 0 + +SHOW CREATE FLOW flow_comment_test; + ++-------------------+------------------------------------------------------+ +| Flow | Create Flow | ++-------------------+------------------------------------------------------+ +| flow_comment_test | CREATE FLOW IF NOT EXISTS flow_comment_test | +| | SINK TO flow_sink_comment_test | +| | AS SELECT desc_str, ts FROM flow_source_comment_test | ++-------------------+------------------------------------------------------+ + +DROP FLOW flow_comment_test; + +Affected Rows: 0 + +DROP TABLE flow_source_comment_test; + +Affected Rows: 0 + +DROP TABLE flow_sink_comment_test; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/comment.sql b/tests/cases/standalone/common/comment.sql new file mode 100644 index 0000000000..564480563b --- /dev/null +++ b/tests/cases/standalone/common/comment.sql @@ -0,0 +1,65 @@ +-- Test: COMMENT ON TABLE add & remove +CREATE TABLE comment_table_test ( + pk INT, + val DOUBLE, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY(pk) +); + +-- Add table comment +COMMENT ON TABLE comment_table_test IS 'table level description'; +SHOW CREATE TABLE comment_table_test; + +-- Remove table comment +COMMENT ON TABLE comment_table_test IS NULL; +SHOW CREATE TABLE comment_table_test; + +DROP TABLE comment_table_test; + +-- Test: COMMENT ON COLUMN add & remove +CREATE TABLE comment_column_test ( + pk INT, + val DOUBLE, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY(pk) +); + +-- Add column comment +COMMENT ON COLUMN comment_column_test.val IS 'value column description'; +SHOW CREATE TABLE comment_column_test; + +-- Remove column comment +COMMENT ON COLUMN comment_column_test.val IS NULL; +SHOW CREATE TABLE comment_column_test; + +DROP TABLE comment_column_test; + +-- Test: COMMENT ON FLOW add & remove +-- Prepare source & sink tables +CREATE TABLE flow_source_comment_test ( + desc_str STRING, + ts TIMESTAMP TIME INDEX +); + +CREATE TABLE flow_sink_comment_test ( + desc_str STRING, + ts TIMESTAMP TIME INDEX +); + +CREATE FLOW flow_comment_test +SINK TO flow_sink_comment_test +AS +SELECT desc_str, ts FROM flow_source_comment_test; + +-- Add flow comment +COMMENT ON FLOW flow_comment_test IS 'flow level description'; +SHOW CREATE FLOW flow_comment_test; + +-- Remove flow comment +COMMENT ON FLOW flow_comment_test IS NULL; +SHOW CREATE FLOW flow_comment_test; + +DROP FLOW flow_comment_test; +DROP TABLE flow_source_comment_test; +DROP TABLE flow_sink_comment_test; + diff --git a/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.result b/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.result index 999271da8d..3ec38aa7ca 100644 --- a/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.result +++ b/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.result @@ -64,3 +64,149 @@ DROP TABLE demo; Affected Rows: 0 +CREATE TABLE cpu_metrics ( + host STRING, + `usage` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +INSERT INTO cpu_metrics +VALUES + ('host1', 66.6, 1655276557000), + ('host2', 77.7, 1655276558000), + ('host3', 88.8, 1655276559000); + +Affected Rows: 3 + +CREATE TABLE memory_stats ( + host STRING, + used 
DOUBLE, + `free` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +INSERT INTO memory_stats +VALUES + ('host1', 1024, 512, 1655276557000), + ('host2', 2048, 1024, 1655276558000), + ('host3', 4096, 2048, 1655276559000); + +Affected Rows: 3 + +CREATE TABLE event_logs ( + `id` INT, + `message` STRING, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +INSERT INTO event_logs +VALUES + (1, 'start', 1655276557000), + (2, 'processing', 1655276558000), + (3, 'finish', 1655276559000); + +Affected Rows: 3 + +CREATE TABLE sensors ( + sensor_id STRING, + temperature DOUBLE, + pressure INT, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +INSERT INTO sensors +VALUES + ('s1', 36.5, 1001, 1655276557000), + ('s2', 37.2, 1003, 1655276558000), + ('s3', 35.9, 998, 1655276559000); + +Affected Rows: 3 + +COPY DATABASE public TO '${SQLNESS_HOME}/export_parallel/' WITH (format='parquet', parallelism=2); + +Affected Rows: 12 + +DELETE FROM cpu_metrics; + +Affected Rows: 3 + +DELETE FROM memory_stats; + +Affected Rows: 3 + +DELETE FROM event_logs; + +Affected Rows: 3 + +DELETE FROM sensors; + +Affected Rows: 3 + +COPY DATABASE public FROM '${SQLNESS_HOME}/export_parallel/' WITH (parallelism=2); + +Affected Rows: 12 + +SELECT * FROM cpu_metrics; + ++-------+-------+---------------------+ +| host | usage | ts | ++-------+-------+---------------------+ +| host1 | 66.6 | 2022-06-15T07:02:37 | +| host2 | 77.7 | 2022-06-15T07:02:38 | +| host3 | 88.8 | 2022-06-15T07:02:39 | ++-------+-------+---------------------+ + +SELECT * FROM memory_stats; + ++-------+--------+--------+---------------------+ +| host | used | free | ts | ++-------+--------+--------+---------------------+ +| host1 | 1024.0 | 512.0 | 2022-06-15T07:02:37 | +| host2 | 2048.0 | 1024.0 | 2022-06-15T07:02:38 | +| host3 | 4096.0 | 2048.0 | 2022-06-15T07:02:39 | ++-------+--------+--------+---------------------+ + +SELECT * FROM event_logs; + ++----+------------+---------------------+ +| id | message | ts | ++----+------------+---------------------+ +| 1 | start | 2022-06-15T07:02:37 | +| 2 | processing | 2022-06-15T07:02:38 | +| 3 | finish | 2022-06-15T07:02:39 | ++----+------------+---------------------+ + +SELECT * FROM sensors; + ++-----------+-------------+----------+---------------------+ +| sensor_id | temperature | pressure | ts | ++-----------+-------------+----------+---------------------+ +| s1 | 36.5 | 1001 | 2022-06-15T07:02:37 | +| s2 | 37.2 | 1003 | 2022-06-15T07:02:38 | +| s3 | 35.9 | 998 | 2022-06-15T07:02:39 | ++-----------+-------------+----------+---------------------+ + +DROP TABLE cpu_metrics; + +Affected Rows: 0 + +DROP TABLE memory_stats; + +Affected Rows: 0 + +DROP TABLE event_logs; + +Affected Rows: 0 + +DROP TABLE sensors; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.sql b/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.sql index 671070e07b..691bfd95e5 100644 --- a/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.sql +++ b/tests/cases/standalone/common/copy/copy_database_from_fs_parquet.sql @@ -25,3 +25,82 @@ DELETE FROM demo; COPY DATABASE public FROM '${SQLNESS_HOME}/demo/export/parquet_range/' LIMIT 2; DROP TABLE demo; + +CREATE TABLE cpu_metrics ( + host STRING, + `usage` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +INSERT INTO cpu_metrics +VALUES + ('host1', 66.6, 1655276557000), + ('host2', 77.7, 1655276558000), + ('host3', 88.8, 1655276559000); + +CREATE TABLE memory_stats ( + host STRING, + used DOUBLE, + `free` DOUBLE, + ts 
TIMESTAMP TIME INDEX +); + +INSERT INTO memory_stats +VALUES + ('host1', 1024, 512, 1655276557000), + ('host2', 2048, 1024, 1655276558000), + ('host3', 4096, 2048, 1655276559000); + +CREATE TABLE event_logs ( + `id` INT, + `message` STRING, + ts TIMESTAMP TIME INDEX +); + +INSERT INTO event_logs +VALUES + (1, 'start', 1655276557000), + (2, 'processing', 1655276558000), + (3, 'finish', 1655276559000); + +CREATE TABLE sensors ( + sensor_id STRING, + temperature DOUBLE, + pressure INT, + ts TIMESTAMP TIME INDEX +); + +INSERT INTO sensors +VALUES + ('s1', 36.5, 1001, 1655276557000), + ('s2', 37.2, 1003, 1655276558000), + ('s3', 35.9, 998, 1655276559000); + + +COPY DATABASE public TO '${SQLNESS_HOME}/export_parallel/' WITH (format='parquet', parallelism=2); + +DELETE FROM cpu_metrics; + +DELETE FROM memory_stats; + +DELETE FROM event_logs; + +DELETE FROM sensors; + +COPY DATABASE public FROM '${SQLNESS_HOME}/export_parallel/' WITH (parallelism=2); + +SELECT * FROM cpu_metrics; + +SELECT * FROM memory_stats; + +SELECT * FROM event_logs; + +SELECT * FROM sensors; + +DROP TABLE cpu_metrics; + +DROP TABLE memory_stats; + +DROP TABLE event_logs; + +DROP TABLE sensors; diff --git a/tests/cases/standalone/common/copy/copy_from_csv_compressed.result b/tests/cases/standalone/common/copy/copy_from_csv_compressed.result new file mode 100644 index 0000000000..223bbfc26f --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_from_csv_compressed.result @@ -0,0 +1,233 @@ +-- Test compressed CSV import functionality +-- First, create and export data with different compression types +CREATE TABLE test_csv_export( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +-- Insert test data +INSERT INTO test_csv_export(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +Affected Rows: 5 + +-- Export with different compression types +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_uncompressed.csv' WITH (format='csv'); + +Affected Rows: 5 + +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_gzip.csv.gz' WITH (format='csv', compression_type='gzip'); + +Affected Rows: 5 + +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_zstd.csv.zst' WITH (format='csv', compression_type='zstd'); + +Affected Rows: 5 + +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_bzip2.csv.bz2' WITH (format='csv', compression_type='bzip2'); + +Affected Rows: 5 + +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_xz.csv.xz' WITH (format='csv', compression_type='xz'); + +Affected Rows: 5 + +-- Test importing uncompressed CSV +CREATE TABLE test_csv_import_uncompressed( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_csv_import_uncompressed FROM '${SQLNESS_HOME}/import/test_csv_uncompressed.csv' WITH (format='csv'); + +Affected Rows: 5 + +SELECT COUNT(*) as uncompressed_count FROM test_csv_import_uncompressed; + ++--------------------+ +| uncompressed_count | ++--------------------+ +| 5 | ++--------------------+ + +-- Test importing GZIP compressed CSV +CREATE TABLE test_csv_import_gzip( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_csv_import_gzip FROM '${SQLNESS_HOME}/import/test_csv_gzip.csv.gz' WITH (format='csv', compression_type='gzip'); + +Affected Rows: 5 + 
+SELECT COUNT(*) as gzip_count FROM test_csv_import_gzip; + ++------------+ +| gzip_count | ++------------+ +| 5 | ++------------+ + +SELECT `id`, `name`, `value` FROM test_csv_import_gzip WHERE `id` = 1; + ++----+-------+-------+ +| id | name | value | ++----+-------+-------+ +| 1 | Alice | 10.5 | ++----+-------+-------+ + +-- Test importing ZSTD compressed CSV +CREATE TABLE test_csv_import_zstd( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_csv_import_zstd FROM '${SQLNESS_HOME}/import/test_csv_zstd.csv.zst' WITH (format='csv', compression_type='zstd'); + +Affected Rows: 5 + +SELECT COUNT(*) as zstd_count FROM test_csv_import_zstd; + ++------------+ +| zstd_count | ++------------+ +| 5 | ++------------+ + +SELECT `id`, `name`, `value` FROM test_csv_import_zstd WHERE `id` = 2; + ++----+------+-------+ +| id | name | value | ++----+------+-------+ +| 2 | Bob | 20.3 | ++----+------+-------+ + +-- Test importing BZIP2 compressed CSV +CREATE TABLE test_csv_import_bzip2( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_csv_import_bzip2 FROM '${SQLNESS_HOME}/import/test_csv_bzip2.csv.bz2' WITH (format='csv', compression_type='bzip2'); + +Affected Rows: 5 + +SELECT COUNT(*) as bzip2_count FROM test_csv_import_bzip2; + ++-------------+ +| bzip2_count | ++-------------+ +| 5 | ++-------------+ + +SELECT `id`, `name`, `value` FROM test_csv_import_bzip2 WHERE `id` = 3; + ++----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 3 | Charlie | 30.7 | ++----+---------+-------+ + +-- Test importing XZ compressed CSV +CREATE TABLE test_csv_import_xz( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_csv_import_xz FROM '${SQLNESS_HOME}/import/test_csv_xz.csv.xz' WITH (format='csv', compression_type='xz'); + +Affected Rows: 5 + +SELECT COUNT(*) as xz_count FROM test_csv_import_xz; + ++----------+ +| xz_count | ++----------+ +| 5 | ++----------+ + +SELECT `id`, `name`, `value` FROM test_csv_import_xz WHERE `id` = 4; + ++----+-------+-------+ +| id | name | value | ++----+-------+-------+ +| 4 | David | 40.1 | ++----+-------+-------+ + +-- Verify data integrity by comparing all imported tables +SELECT source, count FROM ( + SELECT 'uncompressed' as source, COUNT(*) as count, 1 as order_key FROM test_csv_import_uncompressed + UNION ALL + SELECT 'gzip', COUNT(*) as count, 2 as order_key FROM test_csv_import_gzip + UNION ALL + SELECT 'zstd', COUNT(*) as count, 3 as order_key FROM test_csv_import_zstd + UNION ALL + SELECT 'bzip2', COUNT(*) as count, 4 as order_key FROM test_csv_import_bzip2 + UNION ALL + SELECT 'xz', COUNT(*) as count, 5 as order_key FROM test_csv_import_xz +) AS subquery +ORDER BY order_key; + ++--------------+-------+ +| source | count | ++--------------+-------+ +| uncompressed | 5 | +| gzip | 5 | +| zstd | 5 | +| bzip2 | 5 | +| xz | 5 | ++--------------+-------+ + +-- Clean up +DROP TABLE test_csv_export; + +Affected Rows: 0 + +DROP TABLE test_csv_import_uncompressed; + +Affected Rows: 0 + +DROP TABLE test_csv_import_gzip; + +Affected Rows: 0 + +DROP TABLE test_csv_import_zstd; + +Affected Rows: 0 + +DROP TABLE test_csv_import_bzip2; + +Affected Rows: 0 + +DROP TABLE test_csv_import_xz; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/copy/copy_from_csv_compressed.sql b/tests/cases/standalone/common/copy/copy_from_csv_compressed.sql new file mode 100644 index 
0000000000..8be260b69a --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_from_csv_compressed.sql @@ -0,0 +1,109 @@ +-- Test compressed CSV import functionality +-- First, create and export data with different compression types +CREATE TABLE test_csv_export( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +-- Insert test data +INSERT INTO test_csv_export(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +-- Export with different compression types +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_uncompressed.csv' WITH (format='csv'); +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_gzip.csv.gz' WITH (format='csv', compression_type='gzip'); +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_zstd.csv.zst' WITH (format='csv', compression_type='zstd'); +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_bzip2.csv.bz2' WITH (format='csv', compression_type='bzip2'); +COPY test_csv_export TO '${SQLNESS_HOME}/import/test_csv_xz.csv.xz' WITH (format='csv', compression_type='xz'); + +-- Test importing uncompressed CSV +CREATE TABLE test_csv_import_uncompressed( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_csv_import_uncompressed FROM '${SQLNESS_HOME}/import/test_csv_uncompressed.csv' WITH (format='csv'); + +SELECT COUNT(*) as uncompressed_count FROM test_csv_import_uncompressed; + +-- Test importing GZIP compressed CSV +CREATE TABLE test_csv_import_gzip( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_csv_import_gzip FROM '${SQLNESS_HOME}/import/test_csv_gzip.csv.gz' WITH (format='csv', compression_type='gzip'); + +SELECT COUNT(*) as gzip_count FROM test_csv_import_gzip; +SELECT `id`, `name`, `value` FROM test_csv_import_gzip WHERE `id` = 1; + +-- Test importing ZSTD compressed CSV +CREATE TABLE test_csv_import_zstd( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_csv_import_zstd FROM '${SQLNESS_HOME}/import/test_csv_zstd.csv.zst' WITH (format='csv', compression_type='zstd'); + +SELECT COUNT(*) as zstd_count FROM test_csv_import_zstd; +SELECT `id`, `name`, `value` FROM test_csv_import_zstd WHERE `id` = 2; + +-- Test importing BZIP2 compressed CSV +CREATE TABLE test_csv_import_bzip2( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_csv_import_bzip2 FROM '${SQLNESS_HOME}/import/test_csv_bzip2.csv.bz2' WITH (format='csv', compression_type='bzip2'); + +SELECT COUNT(*) as bzip2_count FROM test_csv_import_bzip2; +SELECT `id`, `name`, `value` FROM test_csv_import_bzip2 WHERE `id` = 3; + +-- Test importing XZ compressed CSV +CREATE TABLE test_csv_import_xz( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_csv_import_xz FROM '${SQLNESS_HOME}/import/test_csv_xz.csv.xz' WITH (format='csv', compression_type='xz'); + +SELECT COUNT(*) as xz_count FROM test_csv_import_xz; +SELECT `id`, `name`, `value` FROM test_csv_import_xz WHERE `id` = 4; + +-- Verify data integrity by comparing all imported tables +SELECT source, count FROM ( + SELECT 'uncompressed' as source, COUNT(*) as count, 1 as order_key FROM test_csv_import_uncompressed + UNION ALL + SELECT 'gzip', COUNT(*) as count, 2 as order_key FROM test_csv_import_gzip + UNION ALL + SELECT 'zstd', COUNT(*) 
as count, 3 as order_key FROM test_csv_import_zstd + UNION ALL + SELECT 'bzip2', COUNT(*) as count, 4 as order_key FROM test_csv_import_bzip2 + UNION ALL + SELECT 'xz', COUNT(*) as count, 5 as order_key FROM test_csv_import_xz +) AS subquery +ORDER BY order_key; + +-- Clean up +DROP TABLE test_csv_export; +DROP TABLE test_csv_import_uncompressed; +DROP TABLE test_csv_import_gzip; +DROP TABLE test_csv_import_zstd; +DROP TABLE test_csv_import_bzip2; +DROP TABLE test_csv_import_xz; diff --git a/tests/cases/standalone/common/copy/copy_from_fs_csv.result b/tests/cases/standalone/common/copy/copy_from_fs_csv.result index 28864ed460..4ebd4f2000 100644 --- a/tests/cases/standalone/common/copy/copy_from_fs_csv.result +++ b/tests/cases/standalone/common/copy/copy_from_fs_csv.result @@ -42,7 +42,7 @@ CREATE TABLE with_json(host string, cpu double, memory double, jsons JSON, ts ti Affected Rows: 0 -Copy with_json FROM '${SQLNESS_HOME}/demo/export/json/demo.json' with (format='json'); +Copy with_json FROM '${SQLNESS_HOME}/demo/export/csv/demo.csv' with (format='csv'); Affected Rows: 3 @@ -72,7 +72,7 @@ select host, cpu, memory, jsons, ts from demo where host != 'host3'; +-------+------+--------+------------------------+----------------------------+ | host | cpu | memory | jsons | ts | +-------+------+--------+------------------------+----------------------------+ -| host1 | 66.6 | 1024 | {"foo":"bar"} | 2022-06-15 07:02:37.000000 | +| host1 | 66.6 | 1024.0 | {"foo":"bar"} | 2022-06-15 07:02:37.000000 | | host2 | 88.8 | 333.3 | {"a":null,"foo":"bar"} | 2022-06-15 07:02:38.000000 | +-------+------+--------+------------------------+----------------------------+ diff --git a/tests/cases/standalone/common/copy/copy_from_fs_csv.sql b/tests/cases/standalone/common/copy/copy_from_fs_csv.sql index 7d3e712da3..49550d9ecb 100644 --- a/tests/cases/standalone/common/copy/copy_from_fs_csv.sql +++ b/tests/cases/standalone/common/copy/copy_from_fs_csv.sql @@ -21,7 +21,7 @@ select * from with_filename order by ts; CREATE TABLE with_json(host string, cpu double, memory double, jsons JSON, ts timestamp time index); -Copy with_json FROM '${SQLNESS_HOME}/demo/export/json/demo.json' with (format='json'); +Copy with_json FROM '${SQLNESS_HOME}/demo/export/csv/demo.csv' with (format='csv'); select host, cpu, memory, json_to_string(jsons), ts from with_json order by ts; diff --git a/tests/cases/standalone/common/copy/copy_from_fs_json.result b/tests/cases/standalone/common/copy/copy_from_fs_json.result index 0c415b1a84..87fae5b912 100644 --- a/tests/cases/standalone/common/copy/copy_from_fs_json.result +++ b/tests/cases/standalone/common/copy/copy_from_fs_json.result @@ -72,7 +72,7 @@ select host, cpu, memory, jsons, ts from demo where host != 'host3'; +-------+------+--------+------------------------+----------------------------+ | host | cpu | memory | jsons | ts | +-------+------+--------+------------------------+----------------------------+ -| host1 | 66.6 | 1024 | {"foo":"bar"} | 2022-06-15 07:02:37.000000 | +| host1 | 66.6 | 1024.0 | {"foo":"bar"} | 2022-06-15 07:02:37.000000 | | host2 | 88.8 | 333.3 | {"a":null,"foo":"bar"} | 2022-06-15 07:02:38.000000 | +-------+------+--------+------------------------+----------------------------+ diff --git a/tests/cases/standalone/common/copy/copy_from_fs_parquet.result b/tests/cases/standalone/common/copy/copy_from_fs_parquet.result index c377160f31..53b7f9b52c 100644 --- a/tests/cases/standalone/common/copy/copy_from_fs_parquet.result +++ 
b/tests/cases/standalone/common/copy/copy_from_fs_parquet.result @@ -110,7 +110,7 @@ select host, cpu, memory, jsons, ts from demo where host != 'host3'; +-------+------+--------+------------------------+----------------------------+ | host | cpu | memory | jsons | ts | +-------+------+--------+------------------------+----------------------------+ -| host1 | 66.6 | 1024 | {"foo":"bar"} | 2022-06-15 07:02:37.000000 | +| host1 | 66.6 | 1024.0 | {"foo":"bar"} | 2022-06-15 07:02:37.000000 | | host2 | 88.8 | 333.3 | {"a":null,"foo":"bar"} | 2022-06-15 07:02:38.000000 | +-------+------+--------+------------------------+----------------------------+ diff --git a/tests/cases/standalone/common/copy/copy_from_json_compressed.result b/tests/cases/standalone/common/copy/copy_from_json_compressed.result new file mode 100644 index 0000000000..28b64f328c --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_from_json_compressed.result @@ -0,0 +1,233 @@ +-- Test compressed JSON import functionality +-- First, create and export data with different compression types +CREATE TABLE test_json_export( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +-- Insert test data +INSERT INTO test_json_export(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +Affected Rows: 5 + +-- Export with different compression types +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_uncompressed.json' WITH (format='json'); + +Affected Rows: 5 + +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_gzip.json.gz' WITH (format='json', compression_type='gzip'); + +Affected Rows: 5 + +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_zstd.json.zst' WITH (format='json', compression_type='zstd'); + +Affected Rows: 5 + +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_bzip2.json.bz2' WITH (format='json', compression_type='bzip2'); + +Affected Rows: 5 + +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_xz.json.xz' WITH (format='json', compression_type='xz'); + +Affected Rows: 5 + +-- Test importing uncompressed JSON +CREATE TABLE test_json_import_uncompressed( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_json_import_uncompressed FROM '${SQLNESS_HOME}/import/test_json_uncompressed.json' WITH (format='json'); + +Affected Rows: 5 + +SELECT COUNT(*) as uncompressed_count FROM test_json_import_uncompressed; + ++--------------------+ +| uncompressed_count | ++--------------------+ +| 5 | ++--------------------+ + +-- Test importing GZIP compressed JSON +CREATE TABLE test_json_import_gzip( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_json_import_gzip FROM '${SQLNESS_HOME}/import/test_json_gzip.json.gz' WITH (format='json', compression_type='gzip'); + +Affected Rows: 5 + +SELECT COUNT(*) as gzip_count FROM test_json_import_gzip; + ++------------+ +| gzip_count | ++------------+ +| 5 | ++------------+ + +SELECT `id`, `name`, `value` FROM test_json_import_gzip WHERE `id` = 1; + ++----+-------+-------+ +| id | name | value | ++----+-------+-------+ +| 1 | Alice | 10.5 | ++----+-------+-------+ + +-- Test importing ZSTD compressed JSON +CREATE TABLE test_json_import_zstd( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + 
+Affected Rows: 0 + +COPY test_json_import_zstd FROM '${SQLNESS_HOME}/import/test_json_zstd.json.zst' WITH (format='json', compression_type='zstd'); + +Affected Rows: 5 + +SELECT COUNT(*) as zstd_count FROM test_json_import_zstd; + ++------------+ +| zstd_count | ++------------+ +| 5 | ++------------+ + +SELECT `id`, `name`, `value` FROM test_json_import_zstd WHERE `id` = 2; + ++----+------+-------+ +| id | name | value | ++----+------+-------+ +| 2 | Bob | 20.3 | ++----+------+-------+ + +-- Test importing BZIP2 compressed JSON +CREATE TABLE test_json_import_bzip2( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_json_import_bzip2 FROM '${SQLNESS_HOME}/import/test_json_bzip2.json.bz2' WITH (format='json', compression_type='bzip2'); + +Affected Rows: 5 + +SELECT COUNT(*) as bzip2_count FROM test_json_import_bzip2; + ++-------------+ +| bzip2_count | ++-------------+ +| 5 | ++-------------+ + +SELECT `id`, `name`, `value` FROM test_json_import_bzip2 WHERE `id` = 3; + ++----+---------+-------+ +| id | name | value | ++----+---------+-------+ +| 3 | Charlie | 30.7 | ++----+---------+-------+ + +-- Test importing XZ compressed JSON +CREATE TABLE test_json_import_xz( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +COPY test_json_import_xz FROM '${SQLNESS_HOME}/import/test_json_xz.json.xz' WITH (format='json', compression_type='xz'); + +Affected Rows: 5 + +SELECT COUNT(*) as xz_count FROM test_json_import_xz; + ++----------+ +| xz_count | ++----------+ +| 5 | ++----------+ + +SELECT `id`, `name`, `value` FROM test_json_import_xz WHERE `id` = 4; + ++----+-------+-------+ +| id | name | value | ++----+-------+-------+ +| 4 | David | 40.1 | ++----+-------+-------+ + +-- Verify data integrity by comparing all imported tables +SELECT source, count FROM ( + SELECT 'uncompressed' as source, COUNT(*) as count, 1 as order_key FROM test_json_import_uncompressed + UNION ALL + SELECT 'gzip', COUNT(*) as count, 2 as order_key FROM test_json_import_gzip + UNION ALL + SELECT 'zstd', COUNT(*) as count, 3 as order_key FROM test_json_import_zstd + UNION ALL + SELECT 'bzip2', COUNT(*) as count, 4 as order_key FROM test_json_import_bzip2 + UNION ALL + SELECT 'xz', COUNT(*) as count, 5 as order_key FROM test_json_import_xz +) AS subquery +ORDER BY order_key; + ++--------------+-------+ +| source | count | ++--------------+-------+ +| uncompressed | 5 | +| gzip | 5 | +| zstd | 5 | +| bzip2 | 5 | +| xz | 5 | ++--------------+-------+ + +-- Clean up +DROP TABLE test_json_export; + +Affected Rows: 0 + +DROP TABLE test_json_import_uncompressed; + +Affected Rows: 0 + +DROP TABLE test_json_import_gzip; + +Affected Rows: 0 + +DROP TABLE test_json_import_zstd; + +Affected Rows: 0 + +DROP TABLE test_json_import_bzip2; + +Affected Rows: 0 + +DROP TABLE test_json_import_xz; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/copy/copy_from_json_compressed.sql b/tests/cases/standalone/common/copy/copy_from_json_compressed.sql new file mode 100644 index 0000000000..8b3b400812 --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_from_json_compressed.sql @@ -0,0 +1,109 @@ +-- Test compressed JSON import functionality +-- First, create and export data with different compression types +CREATE TABLE test_json_export( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +-- Insert test data +INSERT INTO test_json_export(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 
10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +-- Export with different compression types +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_uncompressed.json' WITH (format='json'); +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_gzip.json.gz' WITH (format='json', compression_type='gzip'); +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_zstd.json.zst' WITH (format='json', compression_type='zstd'); +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_bzip2.json.bz2' WITH (format='json', compression_type='bzip2'); +COPY test_json_export TO '${SQLNESS_HOME}/import/test_json_xz.json.xz' WITH (format='json', compression_type='xz'); + +-- Test importing uncompressed JSON +CREATE TABLE test_json_import_uncompressed( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_json_import_uncompressed FROM '${SQLNESS_HOME}/import/test_json_uncompressed.json' WITH (format='json'); + +SELECT COUNT(*) as uncompressed_count FROM test_json_import_uncompressed; + +-- Test importing GZIP compressed JSON +CREATE TABLE test_json_import_gzip( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_json_import_gzip FROM '${SQLNESS_HOME}/import/test_json_gzip.json.gz' WITH (format='json', compression_type='gzip'); + +SELECT COUNT(*) as gzip_count FROM test_json_import_gzip; +SELECT `id`, `name`, `value` FROM test_json_import_gzip WHERE `id` = 1; + +-- Test importing ZSTD compressed JSON +CREATE TABLE test_json_import_zstd( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_json_import_zstd FROM '${SQLNESS_HOME}/import/test_json_zstd.json.zst' WITH (format='json', compression_type='zstd'); + +SELECT COUNT(*) as zstd_count FROM test_json_import_zstd; +SELECT `id`, `name`, `value` FROM test_json_import_zstd WHERE `id` = 2; + +-- Test importing BZIP2 compressed JSON +CREATE TABLE test_json_import_bzip2( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_json_import_bzip2 FROM '${SQLNESS_HOME}/import/test_json_bzip2.json.bz2' WITH (format='json', compression_type='bzip2'); + +SELECT COUNT(*) as bzip2_count FROM test_json_import_bzip2; +SELECT `id`, `name`, `value` FROM test_json_import_bzip2 WHERE `id` = 3; + +-- Test importing XZ compressed JSON +CREATE TABLE test_json_import_xz( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +COPY test_json_import_xz FROM '${SQLNESS_HOME}/import/test_json_xz.json.xz' WITH (format='json', compression_type='xz'); + +SELECT COUNT(*) as xz_count FROM test_json_import_xz; +SELECT `id`, `name`, `value` FROM test_json_import_xz WHERE `id` = 4; + +-- Verify data integrity by comparing all imported tables +SELECT source, count FROM ( + SELECT 'uncompressed' as source, COUNT(*) as count, 1 as order_key FROM test_json_import_uncompressed + UNION ALL + SELECT 'gzip', COUNT(*) as count, 2 as order_key FROM test_json_import_gzip + UNION ALL + SELECT 'zstd', COUNT(*) as count, 3 as order_key FROM test_json_import_zstd + UNION ALL + SELECT 'bzip2', COUNT(*) as count, 4 as order_key FROM test_json_import_bzip2 + UNION ALL + SELECT 'xz', COUNT(*) as count, 5 as order_key FROM test_json_import_xz +) AS subquery +ORDER BY order_key; + +-- Clean up +DROP TABLE test_json_export; +DROP TABLE test_json_import_uncompressed; +DROP TABLE test_json_import_gzip; 
+DROP TABLE test_json_import_zstd; +DROP TABLE test_json_import_bzip2; +DROP TABLE test_json_import_xz; diff --git a/tests/cases/standalone/common/copy/copy_to_csv_compressed.result b/tests/cases/standalone/common/copy/copy_to_csv_compressed.result new file mode 100644 index 0000000000..8b3f8585c2 --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_to_csv_compressed.result @@ -0,0 +1,65 @@ +-- Test compressed CSV export functionality +CREATE TABLE test_csv_compression( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +-- Insert test data +INSERT INTO test_csv_compression(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +Affected Rows: 5 + +-- Test uncompressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_uncompressed.csv' WITH (format='csv'); + +Affected Rows: 5 + +-- Test GZIP compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_gzip.csv.gz' WITH (format='csv', compression_type='gzip'); + +Affected Rows: 5 + +-- Test ZSTD compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_zstd.csv.zst' WITH (format='csv', compression_type='zstd'); + +Affected Rows: 5 + +-- Test BZIP2 compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_bzip2.csv.bz2' WITH (format='csv', compression_type='bzip2'); + +Affected Rows: 5 + +-- Test XZ compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_xz.csv.xz' WITH (format='csv', compression_type='xz'); + +Affected Rows: 5 + +-- Test compressed CSV with custom delimiter b'\t' (ASCII 9) +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_tab_separated.csv.gz' WITH (format='csv', compression_type='gzip', delimiter='9'); + +Affected Rows: 5 + +-- Test compressed CSV with timestamp format +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_timestamp_format.csv.zst' WITH (format='csv', compression_type='zstd', timestamp_format='%Y-%m-%d %H:%M:%S'); + +Affected Rows: 5 + +-- Test compressed CSV with date format +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_date_format.csv.gz' WITH (format='csv', compression_type='gzip', date_format='%Y/%m/%d'); + +Affected Rows: 5 + +-- Clean up +DROP TABLE test_csv_compression; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/copy/copy_to_csv_compressed.sql b/tests/cases/standalone/common/copy/copy_to_csv_compressed.sql new file mode 100644 index 0000000000..1eb899c563 --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_to_csv_compressed.sql @@ -0,0 +1,42 @@ +-- Test compressed CSV export functionality +CREATE TABLE test_csv_compression( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +-- Insert test data +INSERT INTO test_csv_compression(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +-- Test uncompressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_uncompressed.csv' WITH (format='csv'); + +-- Test GZIP compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_gzip.csv.gz' WITH (format='csv', compression_type='gzip'); + +-- Test ZSTD 
compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_zstd.csv.zst' WITH (format='csv', compression_type='zstd'); + +-- Test BZIP2 compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_bzip2.csv.bz2' WITH (format='csv', compression_type='bzip2'); + +-- Test XZ compressed CSV export +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_xz.csv.xz' WITH (format='csv', compression_type='xz'); + +-- Test compressed CSV with custom delimiter b'\t' (ASCII 9) +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_tab_separated.csv.gz' WITH (format='csv', compression_type='gzip', delimiter='9'); + +-- Test compressed CSV with timestamp format +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_timestamp_format.csv.zst' WITH (format='csv', compression_type='zstd', timestamp_format='%Y-%m-%d %H:%M:%S'); + +-- Test compressed CSV with date format +COPY test_csv_compression TO '${SQLNESS_HOME}/export/test_csv_date_format.csv.gz' WITH (format='csv', compression_type='gzip', date_format='%Y/%m/%d'); + +-- Clean up +DROP TABLE test_csv_compression; diff --git a/tests/cases/standalone/common/copy/copy_to_json_compressed.result b/tests/cases/standalone/common/copy/copy_to_json_compressed.result new file mode 100644 index 0000000000..567b8b1853 --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_to_json_compressed.result @@ -0,0 +1,55 @@ +-- Test compressed JSON export functionality +CREATE TABLE test_json_compression( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + ts TIMESTAMP TIME INDEX +); + +Affected Rows: 0 + +-- Insert test data +INSERT INTO test_json_compression(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +Affected Rows: 5 + +-- Test uncompressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_uncompressed.json' WITH (format='json'); + +Affected Rows: 5 + +-- Test GZIP compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_gzip.json.gz' WITH (format='json', compression_type='gzip'); + +Affected Rows: 5 + +-- Test ZSTD compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_zstd.json.zst' WITH (format='json', compression_type='zstd'); + +Affected Rows: 5 + +-- Test BZIP2 compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_bzip2.json.bz2' WITH (format='json', compression_type='bzip2'); + +Affected Rows: 5 + +-- Test XZ compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_xz.json.xz' WITH (format='json', compression_type='xz'); + +Affected Rows: 5 + +-- Test compressed JSON with schema inference limit +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_schema_limit.json.gz' WITH (format='json', compression_type='gzip', schema_infer_max_record=100); + +Affected Rows: 5 + +-- Clean up +DROP TABLE test_json_compression; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/copy/copy_to_json_compressed.sql b/tests/cases/standalone/common/copy/copy_to_json_compressed.sql new file mode 100644 index 0000000000..ba31c290dd --- /dev/null +++ b/tests/cases/standalone/common/copy/copy_to_json_compressed.sql @@ -0,0 +1,36 @@ +-- Test compressed JSON export functionality +CREATE TABLE test_json_compression( + `id` UINT32, + `name` STRING, + `value` DOUBLE, + 
ts TIMESTAMP TIME INDEX +); + +-- Insert test data +INSERT INTO test_json_compression(`id`, `name`, `value`, ts) VALUES + (1, 'Alice', 10.5, 1640995200000), + (2, 'Bob', 20.3, 1640995260000), + (3, 'Charlie', 30.7, 1640995320000), + (4, 'David', 40.1, 1640995380000), + (5, 'Eve', 50.9, 1640995440000); + +-- Test uncompressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_uncompressed.json' WITH (format='json'); + +-- Test GZIP compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_gzip.json.gz' WITH (format='json', compression_type='gzip'); + +-- Test ZSTD compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_zstd.json.zst' WITH (format='json', compression_type='zstd'); + +-- Test BZIP2 compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_bzip2.json.bz2' WITH (format='json', compression_type='bzip2'); + +-- Test XZ compressed JSON export +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_xz.json.xz' WITH (format='json', compression_type='xz'); + +-- Test compressed JSON with schema inference limit +COPY test_json_compression TO '${SQLNESS_HOME}/export/test_json_schema_limit.json.gz' WITH (format='json', compression_type='gzip', schema_infer_max_record=100); + +-- Clean up +DROP TABLE test_json_compression; diff --git a/tests/cases/standalone/common/create/create_metric_table.result b/tests/cases/standalone/common/create/create_metric_table.result index 55f1525a53..86d7ead991 100644 --- a/tests/cases/standalone/common/create/create_metric_table.result +++ b/tests/cases/standalone/common/create/create_metric_table.result @@ -4,12 +4,12 @@ Affected Rows: 0 SHOW TABLES; -+---------+ -| Tables | -+---------+ -| numbers | -| phy | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | +| phy | ++------------------+ DESC TABLE phy; diff --git a/tests/cases/standalone/common/create/create_type_alias.result b/tests/cases/standalone/common/create/create_type_alias.result index bcd4cde30a..ded9e13472 100644 --- a/tests/cases/standalone/common/create/create_type_alias.result +++ b/tests/cases/standalone/common/create/create_type_alias.result @@ -1,6 +1,14 @@ CREATE TABLE data_types ( s string, - tint int8, + i2 int2, + i4 int4, + i8 int8, + f4 float4, + f8 float8, + u64 uint64, + u32 uint32, + u16 uint16, + u8 uint8, sint int16, i INT32, bint INT64, @@ -26,7 +34,15 @@ SHOW CREATE TABLE data_types; +------------+------------------------------------------------------------+ | data_types | CREATE TABLE IF NOT EXISTS "data_types" ( | | | "s" STRING NULL, | -| | "tint" TINYINT NULL, | +| | "i2" SMALLINT NULL, | +| | "i4" INT NULL, | +| | "i8" BIGINT NULL, | +| | "f4" FLOAT NULL, | +| | "f8" DOUBLE NULL, | +| | "u64" BIGINT UNSIGNED NULL, | +| | "u32" INT UNSIGNED NULL, | +| | "u16" SMALLINT UNSIGNED NULL, | +| | "u8" TINYINT UNSIGNED NULL, | | | "sint" SMALLINT NULL, | | | "i" INT NULL, | | | "bint" BIGINT NULL, | @@ -55,7 +71,15 @@ DESC TABLE data_types; | Column | Type | Key | Null | Default | Semantic Type | +--------+----------------------+-----+------+---------------------+---------------+ | s | String | PRI | YES | | TAG | -| tint | Int8 | | YES | | FIELD | +| i2 | Int16 | | YES | | FIELD | +| i4 | Int32 | | YES | | FIELD | +| i8 | Int64 | | YES | | FIELD | +| f4 | Float32 | | YES | | FIELD | +| f8 | Float64 | | YES | | FIELD | +| u64 | UInt64 | | YES | | FIELD | +| u32 | UInt32 | | YES | | FIELD | +| u16 | UInt16 | | 
YES | | FIELD | +| u8 | UInt8 | | YES | | FIELD | | sint | Int16 | | YES | | FIELD | | i | Int32 | | YES | | FIELD | | bint | Int64 | | YES | | FIELD | diff --git a/tests/cases/standalone/common/create/create_type_alias.sql b/tests/cases/standalone/common/create/create_type_alias.sql index 937fb0b506..710035606b 100644 --- a/tests/cases/standalone/common/create/create_type_alias.sql +++ b/tests/cases/standalone/common/create/create_type_alias.sql @@ -1,6 +1,14 @@ CREATE TABLE data_types ( s string, - tint int8, + i2 int2, + i4 int4, + i8 int8, + f4 float4, + f8 float8, + u64 uint64, + u32 uint32, + u16 uint16, + u8 uint8, sint int16, i INT32, bint INT64, diff --git a/tests/cases/standalone/common/create/current_timestamp.result b/tests/cases/standalone/common/create/current_timestamp.result index 09cb8a01a5..8571b62d30 100644 --- a/tests/cases/standalone/common/create/current_timestamp.result +++ b/tests/cases/standalone/common/create/current_timestamp.result @@ -54,7 +54,7 @@ show create table t3; create table t4 (ts timestamp time index default now); -Error: 1001(Unsupported), Unsupported expr in default constraint: now for column: ts +Error: 1001(Unsupported), Unsupported default constraint for column: 'ts', reason: expr 'now' not supported drop table t1; diff --git a/tests/cases/standalone/common/drop/drop_table.result b/tests/cases/standalone/common/drop/drop_table.result index 0f1c079938..50d5494b44 100644 --- a/tests/cases/standalone/common/drop/drop_table.result +++ b/tests/cases/standalone/common/drop/drop_table.result @@ -40,12 +40,12 @@ Error: 4001(TableNotFound), Table not found: greptime.public.bar SHOW TABLES; -+---------+ -| Tables | -+---------+ -| foo | -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| foo | +| numbers | ++------------------+ DROP TABLE IF EXISTS foo, bar; diff --git a/tests/cases/standalone/common/flow/flow_advance_ttl.result b/tests/cases/standalone/common/flow/flow_advance_ttl.result index 05ae665be8..12b27ace13 100644 --- a/tests/cases/standalone/common/flow/flow_advance_ttl.result +++ b/tests/cases/standalone/common/flow/flow_advance_ttl.result @@ -46,6 +46,7 @@ SHOW CREATE TABLE distinct_basic; | | ) | +----------------+-----------------------------------------------------------+ +-- SQLNESS REPLACE \d{4} REDACTED SHOW CREATE TABLE out_distinct_basic; +--------------------+---------------------------------------------------+ @@ -60,7 +61,9 @@ SHOW CREATE TABLE out_distinct_basic; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Sink table for flow flow-id=REDACTED' | +| | ) | +--------------------+---------------------------------------------------+ -- SQLNESS SLEEP 3s @@ -242,7 +245,9 @@ SHOW CREATE TABLE out_distinct_basic; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +--------------------+---------------------------------------------------+ SELECT diff --git a/tests/cases/standalone/common/flow/flow_advance_ttl.sql b/tests/cases/standalone/common/flow/flow_advance_ttl.sql index 141c595e89..9574eabd91 100644 --- a/tests/cases/standalone/common/flow/flow_advance_ttl.sql +++ b/tests/cases/standalone/common/flow/flow_advance_ttl.sql @@ -20,6 +20,7 @@ SELECT flow_name, options FROM INFORMATION_SCHEMA.FLOWS; SHOW CREATE TABLE distinct_basic; +-- SQLNESS REPLACE \d{4} REDACTED SHOW CREATE TABLE out_distinct_basic; -- SQLNESS SLEEP 3s diff --git a/tests/cases/standalone/common/flow/flow_auto_sink_table.result 
b/tests/cases/standalone/common/flow/flow_auto_sink_table.result index f1d229e6e8..90d53b9598 100644 --- a/tests/cases/standalone/common/flow/flow_auto_sink_table.result +++ b/tests/cases/standalone/common/flow/flow_auto_sink_table.result @@ -20,19 +20,21 @@ Affected Rows: 0 SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "sum(numbers_input_basic.number)" BIGINT NULL, | -| | "time_window" TIMESTAMP(9) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("time_window") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | ++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "sum(numbers_input_basic.number)" BIGINT NULL, | +| | "time_window" TIMESTAMP(9) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("time_window") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ -- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED | ADMIN FLUSH_FLOW('test_numbers_basic'); @@ -55,19 +57,21 @@ SELECT 1; -- SQLNESS SLEEP 3s SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "sum(numbers_input_basic.number)" BIGINT NULL, | -| | "time_window" TIMESTAMP(9) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("time_window") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | ++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "sum(numbers_input_basic.number)" BIGINT NULL, | +| | "time_window" TIMESTAMP(9) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("time_window") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ SHOW CREATE FLOW test_numbers_basic; @@ -122,19 +126,21 @@ SELECT 1; -- SQLNESS SLEEP 3s SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "sumup" BIGINT NULL, | -| | "event_time" TIMESTAMP(3) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("event_time") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | 
++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "sumup" BIGINT NULL, | +| | "event_time" TIMESTAMP(3) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("event_time") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ -- SQLNESS ARG restart=true SELECT 1; @@ -158,19 +164,21 @@ SHOW CREATE FLOW test_numbers_basic; SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "sumup" BIGINT NULL, | -| | "event_time" TIMESTAMP(3) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("event_time") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | ++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "sumup" BIGINT NULL, | +| | "event_time" TIMESTAMP(3) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("event_time") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ DROP FLOW test_numbers_basic; diff --git a/tests/cases/standalone/common/flow/flow_basic.result b/tests/cases/standalone/common/flow/flow_basic.result index e089af1781..a465047809 100644 --- a/tests/cases/standalone/common/flow/flow_basic.result +++ b/tests/cases/standalone/common/flow/flow_basic.result @@ -20,19 +20,21 @@ Affected Rows: 0 SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "sum(numbers_input_basic.number)" BIGINT NULL, | -| | "time_window" TIMESTAMP(9) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("time_window") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | ++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "sum(numbers_input_basic.number)" BIGINT NULL, | +| | "time_window" TIMESTAMP(9) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("time_window") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ -- TODO(discord9): confirm if it's necessary to flush flow here? 
-- because flush_flow result is at most 1 @@ -47,19 +49,21 @@ ADMIN FLUSH_FLOW('test_numbers_basic'); SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "sum(numbers_input_basic.number)" BIGINT NULL, | -| | "time_window" TIMESTAMP(9) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("time_window") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | ++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "sum(numbers_input_basic.number)" BIGINT NULL, | +| | "time_window" TIMESTAMP(9) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("time_window") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ -- SQLNESS ARG restart=true SELECT 1; @@ -172,19 +176,21 @@ Affected Rows: 0 SHOW CREATE TABLE out_basic; -+-----------+---------------------------------------------+ -| Table | Create Table | -+-----------+---------------------------------------------+ -| out_basic | CREATE TABLE IF NOT EXISTS "out_basic" ( | -| | "wildcard" BIGINT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("__ts_placeholder") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-----------+---------------------------------------------+ ++-----------+---------------------------------------------------+ +| Table | Create Table | ++-----------+---------------------------------------------------+ +| out_basic | CREATE TABLE IF NOT EXISTS "out_basic" ( | +| | "wildcard" BIGINT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-----------+---------------------------------------------------+ DROP FLOW test_wildcard_basic; @@ -200,19 +206,21 @@ Affected Rows: 0 SHOW CREATE TABLE out_basic; -+-----------+---------------------------------------------+ -| Table | Create Table | -+-----------+---------------------------------------------+ -| out_basic | CREATE TABLE IF NOT EXISTS "out_basic" ( | -| | "wildcard" BIGINT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("__ts_placeholder") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-----------+---------------------------------------------+ ++-----------+---------------------------------------------------+ +| Table | Create Table | ++-----------+---------------------------------------------------+ +| out_basic | CREATE TABLE IF NOT EXISTS "out_basic" ( | +| | "wildcard" BIGINT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-----------+---------------------------------------------------+ -- SQLNESS ARG restart=true 
SELECT 1; @@ -243,19 +251,21 @@ ADMIN FLUSH_FLOW('test_wildcard_basic'); SHOW CREATE TABLE out_basic; -+-----------+---------------------------------------------+ -| Table | Create Table | -+-----------+---------------------------------------------+ -| out_basic | CREATE TABLE IF NOT EXISTS "out_basic" ( | -| | "wildcard" BIGINT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("__ts_placeholder") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-----------+---------------------------------------------+ ++-----------+---------------------------------------------------+ +| Table | Create Table | ++-----------+---------------------------------------------------+ +| out_basic | CREATE TABLE IF NOT EXISTS "out_basic" ( | +| | "wildcard" BIGINT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-----------+---------------------------------------------------+ SELECT wildcard FROM out_basic; @@ -309,7 +319,9 @@ SHOW CREATE TABLE out_distinct_basic; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +--------------------+---------------------------------------------------+ -- TODO(discord9): confirm if it's necessary to flush flow here? @@ -365,7 +377,9 @@ SHOW CREATE TABLE out_distinct_basic; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +--------------------+---------------------------------------------------+ SELECT @@ -637,20 +651,22 @@ Affected Rows: 0 SHOW CREATE TABLE ngx_country; -+-------------+---------------------------------------------+ -| Table | Create Table | -+-------------+---------------------------------------------+ -| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | -| | "country" STRING NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("__ts_placeholder"), | -| | PRIMARY KEY ("country") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------+---------------------------------------------+ ++-------------+---------------------------------------------------+ +| Table | Create Table | ++-------------+---------------------------------------------------+ +| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | +| | "country" STRING NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder"), | +| | PRIMARY KEY ("country") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------+---------------------------------------------------+ INSERT INTO ngx_access_log @@ -670,20 +686,22 @@ ADMIN FLUSH_FLOW('calc_ngx_country'); SHOW CREATE TABLE ngx_country; -+-------------+---------------------------------------------+ -| Table | Create Table | -+-------------+---------------------------------------------+ -| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | -| | "country" STRING NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("__ts_placeholder"), | -| | PRIMARY KEY ("country") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------+---------------------------------------------+ 
++-------------+---------------------------------------------------+ +| Table | Create Table | ++-------------+---------------------------------------------------+ +| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | +| | "country" STRING NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder"), | +| | PRIMARY KEY ("country") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------+---------------------------------------------------+ SELECT country @@ -787,20 +805,22 @@ Affected Rows: 0 SHOW CREATE TABLE ngx_country; -+-------------+--------------------------------------------+ -| Table | Create Table | -+-------------+--------------------------------------------+ -| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | -| | "country" STRING NULL, | -| | "time_window" TIMESTAMP(3) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("time_window"), | -| | PRIMARY KEY ("country") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------+--------------------------------------------+ ++-------------+---------------------------------------------------+ +| Table | Create Table | ++-------------+---------------------------------------------------+ +| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | +| | "country" STRING NULL, | +| | "time_window" TIMESTAMP(3) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("time_window"), | +| | PRIMARY KEY ("country") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------+---------------------------------------------------+ INSERT INTO ngx_access_log @@ -820,20 +840,22 @@ ADMIN FLUSH_FLOW('calc_ngx_country'); SHOW CREATE TABLE ngx_country; -+-------------+--------------------------------------------+ -| Table | Create Table | -+-------------+--------------------------------------------+ -| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | -| | "country" STRING NULL, | -| | "time_window" TIMESTAMP(3) NOT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | TIME INDEX ("time_window"), | -| | PRIMARY KEY ("country") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------+--------------------------------------------+ ++-------------+---------------------------------------------------+ +| Table | Create Table | ++-------------+---------------------------------------------------+ +| ngx_country | CREATE TABLE IF NOT EXISTS "ngx_country" ( | +| | "country" STRING NULL, | +| | "time_window" TIMESTAMP(3) NOT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | TIME INDEX ("time_window"), | +| | PRIMARY KEY ("country") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------+---------------------------------------------------+ SELECT country, @@ -991,11 +1013,11 @@ ADMIN FLUSH_FLOW('temp_monitoring'); -- This table should not exist yet SHOW TABLES LIKE 'temp_alerts'; -+-------------+ -| Tables | -+-------------+ -| temp_alerts | -+-------------+ ++------------------+ +| Tables_in_public | ++------------------+ +| temp_alerts | ++------------------+ INSERT INTO temp_sensor_data @@ -1015,11 +1037,11 @@ ADMIN FLUSH_FLOW('temp_monitoring'); SHOW TABLES LIKE 'temp_alerts'; -+-------------+ -| Tables | -+-------------+ -| temp_alerts | -+-------------+ ++------------------+ +| Tables_in_public | ++------------------+ +| 
temp_alerts | ++------------------+ SELECT sensor_id, @@ -1673,19 +1695,21 @@ Affected Rows: 0 SHOW CREATE TABLE out_num_cnt_basic; -+-------------------+--------------------------------------------------+ -| Table | Create Table | -+-------------------+--------------------------------------------------+ -| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | -| | "avg_after_filter_num" BIGINT NULL, | -| | "update_at" TIMESTAMP(3) NULL, | -| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | -| | TIME INDEX ("__ts_placeholder") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+-------------------+--------------------------------------------------+ ++-------------------+---------------------------------------------------+ +| Table | Create Table | ++-------------------+---------------------------------------------------+ +| out_num_cnt_basic | CREATE TABLE IF NOT EXISTS "out_num_cnt_basic" ( | +| | "avg_after_filter_num" BIGINT NULL, | +| | "update_at" TIMESTAMP(3) NULL, | +| | "__ts_placeholder" TIMESTAMP(3) NOT NULL, | +| | TIME INDEX ("__ts_placeholder") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++-------------------+---------------------------------------------------+ -- TODO(discord9): confirm if it's necessary to flush flow here? -- because flush_flow result is at most 1 diff --git a/tests/cases/standalone/common/function/admin/build_index_table.result b/tests/cases/standalone/common/function/admin/build_index_table.result new file mode 100644 index 0000000000..4951df9e0d --- /dev/null +++ b/tests/cases/standalone/common/function/admin/build_index_table.result @@ -0,0 +1,72 @@ +CREATE TABLE test ( + ts TIMESTAMP TIME INDEX, + msg TEXT, +); + +Affected Rows: 0 + +INSERT INTO test VALUES +(1,"The quick brown fox jumps over the lazy dog"), +(2,"The quick brown fox jumps over the lazy cat"), +(3,"The quick brown fox jumps over the lazy mouse"), +(4,"The quick brown fox jumps over the lazy rabbit"), +(5,"The quick brown fox jumps over the lazy turtle"); + +Affected Rows: 5 + +SELECT * FROM test; + ++-------------------------+------------------------------------------------+ +| ts | msg | ++-------------------------+------------------------------------------------+ +| 1970-01-01T00:00:00.001 | The quick brown fox jumps over the lazy dog | +| 1970-01-01T00:00:00.002 | The quick brown fox jumps over the lazy cat | +| 1970-01-01T00:00:00.003 | The quick brown fox jumps over the lazy mouse | +| 1970-01-01T00:00:00.004 | The quick brown fox jumps over the lazy rabbit | +| 1970-01-01T00:00:00.005 | The quick brown fox jumps over the lazy turtle | ++-------------------------+------------------------------------------------+ + +ADMIN FLUSH_TABLE('test'); + ++---------------------------+ +| ADMIN FLUSH_TABLE('test') | ++---------------------------+ +| 0 | ++---------------------------+ + +-- SQLNESS SLEEP 1s +-- No fulltext index yet +SELECT index_size FROM INFORMATION_SCHEMA.REGION_STATISTICS; + ++------------+ +| index_size | ++------------+ +| 0 | ++------------+ + +ALTER TABLE test MODIFY COLUMN msg SET FULLTEXT INDEX; + +Affected Rows: 0 + +ADMIN BUILD_INDEX('test'); + ++---------------------------+ +| ADMIN BUILD_INDEX('test') | ++---------------------------+ +| 0 | ++---------------------------+ + +-- SQLNESS SLEEP 1s +-- Fulltext index built +SELECT index_size FROM INFORMATION_SCHEMA.REGION_STATISTICS; + ++------------+ +| index_size | ++------------+ +| 318 | ++------------+ + +DROP TABLE test; + +Affected 
Rows: 0 + diff --git a/tests/cases/standalone/common/function/admin/build_index_table.sql b/tests/cases/standalone/common/function/admin/build_index_table.sql new file mode 100644 index 0000000000..da3b84f23d --- /dev/null +++ b/tests/cases/standalone/common/function/admin/build_index_table.sql @@ -0,0 +1,29 @@ +CREATE TABLE test ( + ts TIMESTAMP TIME INDEX, + msg TEXT, +); + +INSERT INTO test VALUES +(1,"The quick brown fox jumps over the lazy dog"), +(2,"The quick brown fox jumps over the lazy cat"), +(3,"The quick brown fox jumps over the lazy mouse"), +(4,"The quick brown fox jumps over the lazy rabbit"), +(5,"The quick brown fox jumps over the lazy turtle"); + +SELECT * FROM test; + +ADMIN FLUSH_TABLE('test'); + +-- SQLNESS SLEEP 1s +-- No fulltext index yet +SELECT index_size FROM INFORMATION_SCHEMA.REGION_STATISTICS; + +ALTER TABLE test MODIFY COLUMN msg SET FULLTEXT INDEX; + +ADMIN BUILD_INDEX('test'); + +-- SQLNESS SLEEP 1s +-- Fulltext index built +SELECT index_size FROM INFORMATION_SCHEMA.REGION_STATISTICS; + +DROP TABLE test; diff --git a/tests/cases/standalone/common/function/expression.result b/tests/cases/standalone/common/function/expression.result index bacf9dda71..de30cea6c2 100644 --- a/tests/cases/standalone/common/function/expression.result +++ b/tests/cases/standalone/common/function/expression.result @@ -96,6 +96,133 @@ SELECT LAST_VALUE('a'); | a | +-----------------------+ +-- MySQL-compatible IF function tests +SELECT IF(true, 'yes', 'no'); + ++------------------------------------------+ +| if(Boolean(true),Utf8("yes"),Utf8("no")) | ++------------------------------------------+ +| yes | ++------------------------------------------+ + +SELECT IF(false, 'yes', 'no'); + ++-------------------------------------------+ +| if(Boolean(false),Utf8("yes"),Utf8("no")) | ++-------------------------------------------+ +| no | ++-------------------------------------------+ + +SELECT IF(NULL, 'yes', 'no'); + ++---------------------------------+ +| if(NULL,Utf8("yes"),Utf8("no")) | ++---------------------------------+ +| no | ++---------------------------------+ + +SELECT IF(1, 'yes', 'no'); + ++-------------------------------------+ +| if(Int64(1),Utf8("yes"),Utf8("no")) | ++-------------------------------------+ +| yes | ++-------------------------------------+ + +SELECT IF(0, 'yes', 'no'); + ++-------------------------------------+ +| if(Int64(0),Utf8("yes"),Utf8("no")) | ++-------------------------------------+ +| no | ++-------------------------------------+ + +SELECT IF(-1, 'yes', 'no'); + ++--------------------------------------+ +| if(Int64(-1),Utf8("yes"),Utf8("no")) | ++--------------------------------------+ +| yes | ++--------------------------------------+ + +SELECT IF(1.5, 'yes', 'no'); + ++-----------------------------------------+ +| if(Float64(1.5),Utf8("yes"),Utf8("no")) | ++-----------------------------------------+ +| yes | ++-----------------------------------------+ + +SELECT IF(0.0, 'yes', 'no'); + ++---------------------------------------+ +| if(Float64(0),Utf8("yes"),Utf8("no")) | ++---------------------------------------+ +| no | ++---------------------------------------+ + +-- Test with table column +SELECT IF(a > 1, 'greater', 'not greater') FROM t; + ++--------------------------------------------------------+ +| if(t.a > Int64(1),Utf8("greater"),Utf8("not greater")) | ++--------------------------------------------------------+ +| not greater | +| not greater | +| greater | ++--------------------------------------------------------+ + +-- Test numeric 
return types +SELECT IF(true, 100, 200); + ++-----------------------------------------+ +| if(Boolean(true),Int64(100),Int64(200)) | ++-----------------------------------------+ +| 100 | ++-----------------------------------------+ + +SELECT IF(false, 100, 200); + ++------------------------------------------+ +| if(Boolean(false),Int64(100),Int64(200)) | ++------------------------------------------+ +| 200 | ++------------------------------------------+ + +-- Test with IFNULL (should already work via DataFusion) +SELECT IFNULL(NULL, 'default'); + ++------------------------------+ +| ifnull(NULL,Utf8("default")) | ++------------------------------+ +| default | ++------------------------------+ + +SELECT IFNULL('value', 'default'); + ++---------------------------------------+ +| ifnull(Utf8("value"),Utf8("default")) | ++---------------------------------------+ +| value | ++---------------------------------------+ + +-- Test COALESCE (should already work via DataFusion) +SELECT COALESCE(NULL, NULL, 'third'); + ++-----------------------------------+ +| coalesce(NULL,NULL,Utf8("third")) | ++-----------------------------------+ +| third | ++-----------------------------------+ + +SELECT COALESCE('first', 'second'); + ++----------------------------------------+ +| coalesce(Utf8("first"),Utf8("second")) | ++----------------------------------------+ +| first | ++----------------------------------------+ + DROP TABLE t; Affected Rows: 0 diff --git a/tests/cases/standalone/common/function/expression.sql b/tests/cases/standalone/common/function/expression.sql index b1de90036f..76d63b4665 100644 --- a/tests/cases/standalone/common/function/expression.sql +++ b/tests/cases/standalone/common/function/expression.sql @@ -24,4 +24,39 @@ SELECT LAST_VALUE(1); SELECT LAST_VALUE('a'); +-- MySQL-compatible IF function tests +SELECT IF(true, 'yes', 'no'); + +SELECT IF(false, 'yes', 'no'); + +SELECT IF(NULL, 'yes', 'no'); + +SELECT IF(1, 'yes', 'no'); + +SELECT IF(0, 'yes', 'no'); + +SELECT IF(-1, 'yes', 'no'); + +SELECT IF(1.5, 'yes', 'no'); + +SELECT IF(0.0, 'yes', 'no'); + +-- Test with table column +SELECT IF(a > 1, 'greater', 'not greater') FROM t; + +-- Test numeric return types +SELECT IF(true, 100, 200); + +SELECT IF(false, 100, 200); + +-- Test with IFNULL (should already work via DataFusion) +SELECT IFNULL(NULL, 'default'); + +SELECT IFNULL('value', 'default'); + +-- Test COALESCE (should already work via DataFusion) +SELECT COALESCE(NULL, NULL, 'third'); + +SELECT COALESCE('first', 'second'); + DROP TABLE t; diff --git a/tests/cases/standalone/common/function/function_alias.result b/tests/cases/standalone/common/function/function_alias.result new file mode 100644 index 0000000000..fe41c83ccb --- /dev/null +++ b/tests/cases/standalone/common/function/function_alias.result @@ -0,0 +1,72 @@ +-- MySQL-compatible function alias tests +-- ucase -> upper +SELECT + ucase('dataFusion') AS ucase_value, + upper('dataFusion') AS upper_value; + ++-------------+-------------+ +| ucase_value | upper_value | ++-------------+-------------+ +| DATAFUSION | DATAFUSION | ++-------------+-------------+ + +-- lcase -> lower +SELECT + lcase('DataFusion') AS lcase_value, + lower('DataFusion') AS lower_value; + ++-------------+-------------+ +| lcase_value | lower_value | ++-------------+-------------+ +| datafusion | datafusion | ++-------------+-------------+ + +-- ceiling -> ceil +SELECT + ceiling(1.2) AS ceiling_pos, + ceil(1.2) AS ceil_pos, + ceiling(-1.2) AS ceiling_neg, + ceil(-1.2) AS ceil_neg; + 
++-------------+----------+-------------+----------+ +| ceiling_pos | ceil_pos | ceiling_neg | ceil_neg | ++-------------+----------+-------------+----------+ +| 2.0 | 2.0 | -1.0 | -1.0 | ++-------------+----------+-------------+----------+ + +-- mid -> substr +SELECT + mid('datafusion', 5, 3) AS mid_value, + substr('datafusion', 5, 3) AS substr_value; + ++-----------+--------------+ +| mid_value | substr_value | ++-----------+--------------+ +| fus | fus | ++-----------+--------------+ + +-- rand -> random +-- NOTE: RAND([seed]) is supported by MySQL, but seed is not supported here. +-- This test only validates that rand() exists and returns values in [0, 1). +SELECT rand() >= 0.0 AND rand() < 1.0 AS rand_in_range; + ++---------------+ +| rand_in_range | ++---------------+ +| true | ++---------------+ + +-- std -> stddev_pop, variance -> var_pop +SELECT + round(std(x), 6) AS std_value, + round(stddev_pop(x), 6) AS stddev_pop_value, + round(variance(x), 6) AS variance_value, + round(var_pop(x), 6) AS var_pop_value +FROM (VALUES (1.0), (2.0), (3.0)) AS t(x); + ++-----------+------------------+----------------+---------------+ +| std_value | stddev_pop_value | variance_value | var_pop_value | ++-----------+------------------+----------------+---------------+ +| 0.816497 | 0.816497 | 0.666667 | 0.666667 | ++-----------+------------------+----------------+---------------+ + diff --git a/tests/cases/standalone/common/function/function_alias.sql b/tests/cases/standalone/common/function/function_alias.sql new file mode 100644 index 0000000000..3582bfe565 --- /dev/null +++ b/tests/cases/standalone/common/function/function_alias.sql @@ -0,0 +1,36 @@ +-- MySQL-compatible function alias tests + +-- ucase -> upper +SELECT + ucase('dataFusion') AS ucase_value, + upper('dataFusion') AS upper_value; + +-- lcase -> lower +SELECT + lcase('DataFusion') AS lcase_value, + lower('DataFusion') AS lower_value; + +-- ceiling -> ceil +SELECT + ceiling(1.2) AS ceiling_pos, + ceil(1.2) AS ceil_pos, + ceiling(-1.2) AS ceiling_neg, + ceil(-1.2) AS ceil_neg; + +-- mid -> substr +SELECT + mid('datafusion', 5, 3) AS mid_value, + substr('datafusion', 5, 3) AS substr_value; + +-- rand -> random +-- NOTE: RAND([seed]) is supported by MySQL, but seed is not supported here. +-- This test only validates that rand() exists and returns values in [0, 1). 
+SELECT rand() >= 0.0 AND rand() < 1.0 AS rand_in_range; + +-- std -> stddev_pop, variance -> var_pop +SELECT + round(std(x), 6) AS std_value, + round(stddev_pop(x), 6) AS stddev_pop_value, + round(variance(x), 6) AS variance_value, + round(var_pop(x), 6) AS var_pop_value +FROM (VALUES (1.0), (2.0), (3.0)) AS t(x); diff --git a/tests/cases/standalone/common/function/geo.result b/tests/cases/standalone/common/function/geo.result index c9a064d405..738ad8d94a 100644 --- a/tests/cases/standalone/common/function/geo.result +++ b/tests/cases/standalone/common/function/geo.result @@ -32,11 +32,11 @@ Error: 3001(EngineExecuteQuery), Cast error: Can't cast value -1 to type UInt8 SELECT h3_latlng_to_cell(37.76938, -122.3889, 8::Int8), h3_latlng_to_cell_string(37.76938, -122.3889, 8::Int8); -+-------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+ -| h3_latlng_to_cell(Float64(37.76938),Float64(-122.3889),arrow_cast(Int64(8),Utf8("Int8"))) | h3_latlng_to_cell_string(Float64(37.76938),Float64(-122.3889),arrow_cast(Int64(8),Utf8("Int8"))) | -+-------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+ -| 613196570438926335 | 88283082e7fffff | -+-------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------+ ++--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ +| h3_latlng_to_cell(Float64(37.76938),Float64(-122.3889),arrow_cast(Int64(8),Utf8("Int64"))) | h3_latlng_to_cell_string(Float64(37.76938),Float64(-122.3889),arrow_cast(Int64(8),Utf8("Int64"))) | ++--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ +| 613196570438926335 | 88283082e7fffff | ++--------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------+ SELECT h3_latlng_to_cell(37.76938, -122.3889, 8::Int16), h3_latlng_to_cell_string(37.76938, -122.3889, 8::Int16); @@ -248,11 +248,11 @@ Error: 3001(EngineExecuteQuery), Cast error: Can't cast value -1 to type UInt8 SELECT geohash(37.76938, -122.3889, 11::Int8); -+----------------------------------------------------------------------------------+ -| geohash(Float64(37.76938),Float64(-122.3889),arrow_cast(Int64(11),Utf8("Int8"))) | -+----------------------------------------------------------------------------------+ -| 9q8yygxneft | -+----------------------------------------------------------------------------------+ ++-----------------------------------------------------------------------------------+ +| geohash(Float64(37.76938),Float64(-122.3889),arrow_cast(Int64(11),Utf8("Int64"))) | ++-----------------------------------------------------------------------------------+ +| 9q8yygxneft | ++-----------------------------------------------------------------------------------+ SELECT geohash(37.76938, -122.3889, 11::Int16); diff --git a/tests/cases/standalone/common/function/json/json_get.result 
b/tests/cases/standalone/common/function/json/json_get.result index 01767387a9..5f17415d0c 100644 --- a/tests/cases/standalone/common/function/json/json_get.result +++ b/tests/cases/standalone/common/function/json/json_get.result @@ -47,6 +47,30 @@ SELECT json_get_string(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); | | +--------------------------------------------------------------------------------+ +SELECT json_to_string(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c')); + ++---------------------------------------------------------------------------------------------------+ +| json_to_string(json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.b.c"))) | ++---------------------------------------------------------------------------------------------------+ +| {"d":42} | ++---------------------------------------------------------------------------------------------------+ + +SELECT json_get_int(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'), 'd'); + ++-----------------------------------------------------------------------------------------------------------+ +| json_get_int(json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.b.c")),Utf8("d")) | ++-----------------------------------------------------------------------------------------------------------+ +| 42 | ++-----------------------------------------------------------------------------------------------------------+ + +SELECT json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.e'); + ++---------------------------------------------------------------------------------+ +| json_get_object(parse_json(Utf8("{"a": {"b": {"c": {"d": 42}}}}")),Utf8("a.e")) | ++---------------------------------------------------------------------------------+ +| | ++---------------------------------------------------------------------------------+ + -- test functions with table rows -- CREATE TABLE jsons(j JSON, ts timestamp time index); @@ -123,6 +147,39 @@ SELECT json_get_int(j, 'a.b["c"]') FROM jsons; | 1 | +----------------------------------------+ +SELECT json_to_string(json_get_object(j, 'a.b')) FROM jsons; + ++------------------------------------------------------+ +| json_to_string(json_get_object(jsons.j,Utf8("a.b"))) | ++------------------------------------------------------+ +| {"c":1} | +| {"c":1.234} | +| {"c":"foo"} | +| {"c":true} | ++------------------------------------------------------+ + +SELECT json_get_string(json_get_object(j, 'a.b'), 'c') FROM jsons; + ++-----------------------------------------------------------------+ +| json_get_string(json_get_object(jsons.j,Utf8("a.b")),Utf8("c")) | ++-----------------------------------------------------------------+ +| 1 | +| 1.234 | +| foo | +| true | ++-----------------------------------------------------------------+ + +SELECT json_get_object(j, 'a.x') FROM jsons; + ++--------------------------------------+ +| json_get_object(jsons.j,Utf8("a.x")) | ++--------------------------------------+ +| | +| | +| | +| | ++--------------------------------------+ + DROP TABLE jsons; Affected Rows: 0 @@ -148,6 +205,10 @@ INSERT INTO jsons VALUES(parse_json('[1.2, 3.1415926535897932384626, -3e123, 1e1 Affected Rows: 1 +INSERT INTO jsons VALUES(parse_json('[{"a": {"i": 1}}, {"a": {"i": 2}}, {"a": {"i": 3}}]'), 5); + +Affected Rows: 1 + SELECT json_get_int(j, '[0]') FROM jsons; +-----------------------------------+ @@ -157,6 +218,7 @@ SELECT json_get_int(j, '[0]') FROM jsons; | 1 | | 1 | | | +| | 
+-----------------------------------+ SELECT json_get_float(j, '[1]') FROM jsons; @@ -168,6 +230,7 @@ SELECT json_get_float(j, '[1]') FROM jsons; | 0.0 | | 0.0 | | 3.141592653589793 | +| | +-------------------------------------+ SELECT json_get_bool(j, '[2]') FROM jsons; @@ -179,6 +242,7 @@ SELECT json_get_bool(j, '[2]') FROM jsons; | false | | | | | +| | +------------------------------------+ SELECT json_get_string(j, '[3]') FROM jsons; @@ -190,8 +254,45 @@ SELECT json_get_string(j, '[3]') FROM jsons; | false | | 2147483648 | | 1e100 | +| | +--------------------------------------------------------+ +SELECT json_to_string(json_get_object(j, '[0]')) FROM jsons; + ++------------------------------------------------------+ +| json_to_string(json_get_object(jsons.j,Utf8("[0]"))) | ++------------------------------------------------------+ +| | +| | +| | +| | +| {"a":{"i":1}} | ++------------------------------------------------------+ + +SELECT json_get_int(json_get_object(j, '[0]'), 'a.i') FROM jsons; + ++----------------------------------------------------------------+ +| json_get_int(json_get_object(jsons.j,Utf8("[0]")),Utf8("a.i")) | ++----------------------------------------------------------------+ +| | +| | +| | +| | +| 1 | ++----------------------------------------------------------------+ + +SELECT json_get_int(json_get_object(j, '[9]'), 'a.i') FROM jsons; + ++----------------------------------------------------------------+ +| json_get_int(json_get_object(jsons.j,Utf8("[9]")),Utf8("a.i")) | ++----------------------------------------------------------------+ +| | +| | +| | +| | +| | ++----------------------------------------------------------------+ + DROP TABLE jsons; Affected Rows: 0 @@ -259,6 +360,27 @@ SELECT json_to_string(j) FROM jsons WHERE CAST(json_get_int(j, 'a.b.c') AS BOOLE | {"a":{"b":{"c":true}}} | +-------------------------+ +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.b'), 'c') == 'foo'; + ++-------------------------+ +| json_to_string(jsons.j) | ++-------------------------+ +| {"a":{"b":{"c":"foo"}}} | ++-------------------------+ + +SELECT json_to_string(j) FROM jsons WHERE json_to_string(json_get_object(j, 'a.b')) == '{"c":1}'; + ++-------------------------+ +| json_to_string(jsons.j) | ++-------------------------+ +| {"a":{"b":{"c":1}}} | ++-------------------------+ + +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.x'), 'c') == 'foo'; + +++ +++ + DROP TABLE jsons; Affected Rows: 0 diff --git a/tests/cases/standalone/common/function/json/json_get.sql b/tests/cases/standalone/common/function/json/json_get.sql index 3247536b07..010a3bd7a7 100644 --- a/tests/cases/standalone/common/function/json/json_get.sql +++ b/tests/cases/standalone/common/function/json/json_get.sql @@ -11,6 +11,12 @@ SELECT json_get_int(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); SELECT json_get_string(parse_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); +SELECT json_to_string(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c')); + +SELECT json_get_int(json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.b.c'), 'd'); + +SELECT json_get_object(parse_json('{"a": {"b": {"c": {"d": 42}}}}'), 'a.e'); + -- test functions with table rows -- CREATE TABLE jsons(j JSON, ts timestamp time index); @@ -32,6 +38,12 @@ SELECT json_get_bool(j, 'a.b.c') FROM jsons; SELECT json_get_int(j, 'a.b["c"]') FROM jsons; +SELECT json_to_string(json_get_object(j, 'a.b')) FROM jsons; + +SELECT 
json_get_string(json_get_object(j, 'a.b'), 'c') FROM jsons; + +SELECT json_get_object(j, 'a.x') FROM jsons; + DROP TABLE jsons; -- test functions with arrays -- @@ -45,6 +57,8 @@ INSERT INTO jsons VALUES(parse_json('[1, 0, -2147483649, 2147483648]'), 3); INSERT INTO jsons VALUES(parse_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4); +INSERT INTO jsons VALUES(parse_json('[{"a": {"i": 1}}, {"a": {"i": 2}}, {"a": {"i": 3}}]'), 5); + SELECT json_get_int(j, '[0]') FROM jsons; SELECT json_get_float(j, '[1]') FROM jsons; @@ -53,6 +67,12 @@ SELECT json_get_bool(j, '[2]') FROM jsons; SELECT json_get_string(j, '[3]') FROM jsons; +SELECT json_to_string(json_get_object(j, '[0]')) FROM jsons; + +SELECT json_get_int(json_get_object(j, '[0]'), 'a.i') FROM jsons; + +SELECT json_get_int(json_get_object(j, '[9]'), 'a.i') FROM jsons; + DROP TABLE jsons; -- test functions in WHERE clause -- @@ -76,4 +96,10 @@ SELECT json_to_string(j) FROM jsons WHERE json_get_bool(j, 'a.b.c') = true; SELECT json_to_string(j) FROM jsons WHERE CAST(json_get_int(j, 'a.b.c') AS BOOLEAN); +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.b'), 'c') == 'foo'; + +SELECT json_to_string(j) FROM jsons WHERE json_to_string(json_get_object(j, 'a.b')) == '{"c":1}'; + +SELECT json_to_string(j) FROM jsons WHERE json_get_string(json_get_object(j, 'a.x'), 'c') == 'foo'; + DROP TABLE jsons; diff --git a/tests/cases/standalone/common/information_schema/ssts.result b/tests/cases/standalone/common/information_schema/ssts.result index f9ac0dd47b..bf0642f667 100644 --- a/tests/cases/standalone/common/information_schema/ssts.result +++ b/tests/cases/standalone/common/information_schema/ssts.result @@ -10,7 +10,7 @@ DESC TABLE information_schema.ssts_manifest; | region_group | UInt8 | | NO | | FIELD | | region_sequence | UInt32 | | NO | | FIELD | | file_id | String | | NO | | FIELD | -| index_file_id | String | | YES | | FIELD | +| index_version | UInt64 | | NO | | FIELD | | level | UInt8 | | NO | | FIELD | | file_path | String | | NO | | FIELD | | file_size | UInt64 | | NO | | FIELD | @@ -97,13 +97,13 @@ ADMIN FLUSH_TABLE('sst_case'); -- SQLNESS REPLACE (/public/\d+) /public/ SELECT * FROM information_schema.ssts_manifest order by file_path; -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | index_file_id | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | num_series | min_ts | max_ts | sequence | origin_region_id | node_id | visible | 
-+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+---------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | index_version | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | num_series | min_ts | max_ts | sequence | origin_region_id | node_id | visible | ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+---------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | 
++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+---------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -- SQLNESS REPLACE (\s+\d+\s+) -- SQLNESS REPLACE ([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}) @@ -165,15 +165,15 @@ ADMIN FLUSH_TABLE('sst_case'); -- SQLNESS REPLACE (/public/\d+) /public/ SELECT * FROM information_schema.ssts_manifest order by file_path; -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | index_file_id | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | num_series | min_ts | max_ts | sequence | origin_region_id | node_id | visible | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -| data/greptime/public// |||||| | || data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | -+----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+--------------------------------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ 
++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+---------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| table_dir | region_id | table_id | region_number | region_group | region_sequence | file_id | index_version | level | file_path | file_size | index_file_path | index_file_size | num_rows | num_row_groups | num_series | min_ts | max_ts | sequence | origin_region_id | node_id | visible | ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+---------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | +| data/greptime/public// |||||| ||| data/greptime/public//_/.parquet || data/greptime/public//_/index/.puffin ||||| | |||| true | ++----------------------------+---------------+----------+---------------+--------------+-----------------+--------------------------------------+---------------+-------+----------------------------------------------------------------------------------------+-----------+---------------------------------------------------------------------------------------------+-----------------+----------+----------------+------------+-------------------------+-------------------------+----------+------------------+---------+---------+ -- SQLNESS REPLACE (\s+\d+\s+) -- SQLNESS REPLACE ([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}) diff --git a/tests/cases/standalone/common/insert/logical_metric_table.result b/tests/cases/standalone/common/insert/logical_metric_table.result index ad6142050d..80e765ccd6 100644 --- a/tests/cases/standalone/common/insert/logical_metric_table.result +++ b/tests/cases/standalone/common/insert/logical_metric_table.result @@ -37,8 +37,8 @@ SELECT * from t2; +------+-------------------------+-----+ | job | ts | val | +------+-------------------------+-----+ -| job2 | 1970-01-01T00:00:00.001 | 1.0 | | job1 | 1970-01-01T00:00:00 | 0.0 | +| job2 | 1970-01-01T00:00:00.001 | 1.0 | +------+-------------------------+-----+ DROP TABLE t1; @@ -67,10 +67,10 @@ SELECT ts, val, __tsid, host, job FROM phy; +-------------------------+-----+----------------------+-------+------+ | ts | val | __tsid | host | job | +-------------------------+-----+----------------------+-------+------+ -| 
1970-01-01T00:00:00.001 | 1.0 | 1128149335081630826 | host2 | | -| 1970-01-01T00:00:00 | 0.0 | 18067404594631612786 | host1 | | -| 1970-01-01T00:00:00.001 | 1.0 | 2176048834144407834 | | job2 | -| 1970-01-01T00:00:00 | 0.0 | 15980333303142110493 | | job1 | +| 1970-01-01T00:00:00.001 | 1.0 | 7947983149541006936 | host2 | | +| 1970-01-01T00:00:00 | 0.0 | 13882403126406556045 | host1 | | +| 1970-01-01T00:00:00 | 0.0 | 6248409809737953425 | | job1 | +| 1970-01-01T00:00:00.001 | 1.0 | 12867770218286207316 | | job2 | +-------------------------+-----+----------------------+-------+------+ DROP TABLE phy; @@ -123,8 +123,8 @@ SELECT * from t2; +------+-------------------------+-----+ | job | ts | val | +------+-------------------------+-----+ -| job2 | 1970-01-01T00:00:00.001 | 1.0 | | job1 | 1970-01-01T00:00:00 | 0.0 | +| job2 | 1970-01-01T00:00:00.001 | 1.0 | +------+-------------------------+-----+ ADMIN flush_table('phy'); @@ -154,10 +154,10 @@ SELECT * from t2; +------+-------------------------+-----+ | job | ts | val | +------+-------------------------+-----+ -| job2 | 1970-01-01T00:00:00.001 | 1.0 | | job3 | 1970-01-01T00:00:00 | 0.0 | -| job4 | 1970-01-01T00:00:00.001 | 1.0 | | job1 | 1970-01-01T00:00:00 | 0.0 | +| job4 | 1970-01-01T00:00:00.001 | 1.0 | +| job2 | 1970-01-01T00:00:00.001 | 1.0 | +------+-------------------------+-----+ DROP TABLE t1; diff --git a/tests/cases/standalone/common/mysql.result b/tests/cases/standalone/common/mysql.result index 5f120aec90..232cd3aed2 100644 --- a/tests/cases/standalone/common/mysql.result +++ b/tests/cases/standalone/common/mysql.result @@ -27,3 +27,78 @@ SHOW DATABASES; | public | +--------------------+ +-- ====================================================== +-- MySQL compatibility tests for JDBC connectors +-- ====================================================== +-- Test MySQL IF() function (issue #7278 compatibility) +-- SQLNESS PROTOCOL MYSQL +SELECT IF(1, 'yes', 'no') as result; + ++--------+ +| result | ++--------+ +| yes | ++--------+ + +-- SQLNESS PROTOCOL MYSQL +SELECT IF(0, 'yes', 'no') as result; + ++--------+ +| result | ++--------+ +| no | ++--------+ + +-- SQLNESS PROTOCOL MYSQL +SELECT IF(NULL, 'yes', 'no') as result; + ++--------+ +| result | ++--------+ +| no | ++--------+ + +-- Test IFNULL (should work via DataFusion) +-- SQLNESS PROTOCOL MYSQL +SELECT IFNULL(NULL, 'default') as result; + ++---------+ +| result | ++---------+ +| default | ++---------+ + +-- SQLNESS PROTOCOL MYSQL +SELECT IFNULL('value', 'default') as result; + ++--------+ +| result | ++--------+ +| value | ++--------+ + +-- Test COALESCE +-- SQLNESS PROTOCOL MYSQL +SELECT COALESCE(NULL, NULL, 'third') as result; + ++--------+ +| result | ++--------+ +| third | ++--------+ + +-- Verify SHOW TABLES column naming +-- SQLNESS PROTOCOL MYSQL +USE public; + +affected_rows: 0 + +-- SQLNESS PROTOCOL MYSQL +SHOW TABLES; + ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ + diff --git a/tests/cases/standalone/common/mysql.sql b/tests/cases/standalone/common/mysql.sql index 31b2db23f9..74b2b12fbe 100644 --- a/tests/cases/standalone/common/mysql.sql +++ b/tests/cases/standalone/common/mysql.sql @@ -6,3 +6,35 @@ SELECT @@version_comment; -- SQLNESS PROTOCOL MYSQL SHOW DATABASES; + +-- ====================================================== +-- MySQL compatibility tests for JDBC connectors +-- ====================================================== + +-- Test MySQL IF() function (issue #7278 compatibility) +-- SQLNESS 
PROTOCOL MYSQL +SELECT IF(1, 'yes', 'no') as result; + +-- SQLNESS PROTOCOL MYSQL +SELECT IF(0, 'yes', 'no') as result; + +-- SQLNESS PROTOCOL MYSQL +SELECT IF(NULL, 'yes', 'no') as result; + +-- Test IFNULL (should work via DataFusion) +-- SQLNESS PROTOCOL MYSQL +SELECT IFNULL(NULL, 'default') as result; + +-- SQLNESS PROTOCOL MYSQL +SELECT IFNULL('value', 'default') as result; + +-- Test COALESCE +-- SQLNESS PROTOCOL MYSQL +SELECT COALESCE(NULL, NULL, 'third') as result; + +-- Verify SHOW TABLES column naming +-- SQLNESS PROTOCOL MYSQL +USE public; + +-- SQLNESS PROTOCOL MYSQL +SHOW TABLES; diff --git a/tests/cases/standalone/common/partition.result b/tests/cases/standalone/common/partition.result index 236da83a59..6171744134 100644 --- a/tests/cases/standalone/common/partition.result +++ b/tests/cases/standalone/common/partition.result @@ -12,15 +12,15 @@ PARTITION ON COLUMNS (a) ( Affected Rows: 0 -- SQLNESS REPLACE (\d{13}) ID -SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name; +SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, partition_description, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name; -+---------------+--------------+------------+----------------+------------------------+-----------------------+ -| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id | -+---------------+--------------+------------+----------------+------------------------+-----------------------+ -| greptime | public | my_table | p0 | a < 1000 | ID | -| greptime | public | my_table | p1 | a >= 1000 AND a < 2000 | ID | -| greptime | public | my_table | p2 | a >= 2000 | ID | -+---------------+--------------+------------+----------------+------------------------+-----------------------+ ++---------------+--------------+------------+----------------+----------------------+------------------------+-----------------------+ +| table_catalog | table_schema | table_name | partition_name | partition_expression | partition_description | greptime_partition_id | ++---------------+--------------+------------+----------------+----------------------+------------------------+-----------------------+ +| greptime | public | my_table | p0 | a | a < 1000 | ID | +| greptime | public | my_table | p1 | a | a >= 1000 AND a < 2000 | ID | +| greptime | public | my_table | p2 | a | a >= 2000 | ID | ++---------------+--------------+------------+----------------+----------------------+------------------------+-----------------------+ -- SQLNESS REPLACE (\d{13}) REGION_ID -- SQLNESS REPLACE (\d{1}) PEER_ID @@ -126,7 +126,7 @@ SELECT table_catalog, table_schema, table_name, partition_name, partition_expres +---------------+--------------+------------+----------------+----------------------+-----------------------+ | table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id | +---------------+--------------+------------+----------------+----------------------+-----------------------+ -| greptime | public | my_table | p0 | | ID | +| greptime | public | my_table | p0 | a | ID | +---------------+--------------+------------+----------------+----------------------+-----------------------+ -- SQLNESS REPLACE (\d{13}) 
REGION_ID diff --git a/tests/cases/standalone/common/partition.sql b/tests/cases/standalone/common/partition.sql index 89c4258be9..7b5e8ea775 100644 --- a/tests/cases/standalone/common/partition.sql +++ b/tests/cases/standalone/common/partition.sql @@ -10,7 +10,7 @@ PARTITION ON COLUMNS (a) ( ); -- SQLNESS REPLACE (\d{13}) ID -SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name; +SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, partition_description, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name; -- SQLNESS REPLACE (\d{13}) REGION_ID -- SQLNESS REPLACE (\d{1}) PEER_ID diff --git a/tests/cases/standalone/common/promql/histogram_multi_partition.result b/tests/cases/standalone/common/promql/histogram_multi_partition.result new file mode 100644 index 0000000000..c0b146ecd6 --- /dev/null +++ b/tests/cases/standalone/common/promql/histogram_multi_partition.result @@ -0,0 +1,91 @@ +-- Minimal repro for histogram quantile over multi-partition input. +create table histogram_gap_bucket ( + ts timestamp time index, + le string, + shard string, + val double, + primary key (shard, le) +) partition on columns (shard) ( + shard < 'n', + shard >= 'n' +); + +Affected Rows: 0 + +insert into histogram_gap_bucket values + (0, '0.5', 'a', 1), + (0, '1', 'a', 2), + (0, '+Inf', 'a', 2), + (0, '0.5', 'z', 2), + (0, '1', 'z', 4), + (0, '+Inf', 'z', 4), + (10000, '0.5', 'a', 1), + (10000, '1', 'a', 2), + (10000, '+Inf', 'a', 2), + (10000, '0.5', 'z', 1), + (10000, '1', 'z', 3), + (10000, '+Inf', 'z', 3); + +Affected Rows: 12 + +-- Ensure the physical plan keeps the required repartition/order before folding buckets. 
+-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE Hash\(\[ts@1\],.* Hash([ts@1],REDACTED +-- SQLNESS REPLACE Hash\(\[le@0,\sts@1\],.* Hash([le@0, ts@1],REDACTED +tql analyze (0, 10, '10s') histogram_quantile(0.5, sum by (le) (histogram_gap_bucket)); + ++-+-+-+ +| stage | node | plan_| ++-+-+-+ +| 0_| 0_|_HistogramFoldExec: le=@0, field=@2, quantile=0.5 REDACTED +|_|_|_SortExec: expr=[ts@1 ASC NULLS LAST, CAST(le@0 AS Float64) ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=Hash([ts@1],REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[le@0 as le, ts@1 as ts], aggr=[sum(histogram_gap_bucket.val)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=Hash([le@0, ts@1],REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[le@0 as le, ts@1 as ts], aggr=[sum(histogram_gap_bucket.val)] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_MergeScanExec: REDACTED +|_|_|_| +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[le@0 as le, ts@1 as ts], aggr=[__sum_state(histogram_gap_bucket.val)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=Hash([le@0, ts@1],REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[le@1 as le, ts@0 as ts], aggr=[__sum_state(histogram_gap_bucket.val)] REDACTED +|_|_|_ProjectionExec: expr=[ts@0 as ts, le@1 as le, val@3 as val] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[10000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: tags=["shard", "le"] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED +|_|_|_| +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[le@0 as le, ts@1 as ts], aggr=[__sum_state(histogram_gap_bucket.val)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=Hash([le@0, ts@1],REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[le@1 as le, ts@0 as ts], aggr=[__sum_state(histogram_gap_bucket.val)] REDACTED +|_|_|_ProjectionExec: expr=[ts@0 as ts, le@1 as le, val@3 as val] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[10000], time index=[ts] REDACTED +|_|_|_PromSeriesDivideExec: tags=["shard", "le"] REDACTED +|_|_|_CooperativeExec REDACTED +|_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":1, "mem_ranges":1, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED +|_|_|_| +|_|_| Total rows: 2_| ++-+-+-+ + +-- SQLNESS SORT_RESULT 2 1 +tql eval (0, 10, '10s') histogram_quantile(0.5, sum by (le) (histogram_gap_bucket)); + ++---------------------+-------------------------------+ +| ts | sum(histogram_gap_bucket.val) | ++---------------------+-------------------------------+ +| 1970-01-01T00:00:00 | 0.5 | +| 1970-01-01T00:00:10 | 0.5833333333333334 | ++---------------------+-------------------------------+ + +drop table histogram_gap_bucket; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/promql/histogram_multi_partition.sql b/tests/cases/standalone/common/promql/histogram_multi_partition.sql new file mode 100644 index 0000000000..b360999fcf --- /dev/null +++ 
b/tests/cases/standalone/common/promql/histogram_multi_partition.sql @@ -0,0 +1,40 @@ +-- Minimal repro for histogram quantile over multi-partition input. +create table histogram_gap_bucket ( + ts timestamp time index, + le string, + shard string, + val double, + primary key (shard, le) +) partition on columns (shard) ( + shard < 'n', + shard >= 'n' +); + +insert into histogram_gap_bucket values + (0, '0.5', 'a', 1), + (0, '1', 'a', 2), + (0, '+Inf', 'a', 2), + (0, '0.5', 'z', 2), + (0, '1', 'z', 4), + (0, '+Inf', 'z', 4), + (10000, '0.5', 'a', 1), + (10000, '1', 'a', 2), + (10000, '+Inf', 'a', 2), + (10000, '0.5', 'z', 1), + (10000, '1', 'z', 3), + (10000, '+Inf', 'z', 3); + +-- Ensure the physical plan keeps the required repartition/order before folding buckets. +-- SQLNESS REPLACE (metrics.*) REDACTED +-- SQLNESS REPLACE (peers.*) REDACTED +-- SQLNESS REPLACE region=\d+\(\d+,\s+\d+\) region=REDACTED +-- SQLNESS REPLACE (-+) - +-- SQLNESS REPLACE (\s\s+) _ +-- SQLNESS REPLACE Hash\(\[ts@1\],.* Hash([ts@1],REDACTED +-- SQLNESS REPLACE Hash\(\[le@0,\sts@1\],.* Hash([le@0, ts@1],REDACTED +tql analyze (0, 10, '10s') histogram_quantile(0.5, sum by (le) (histogram_gap_bucket)); + +-- SQLNESS SORT_RESULT 2 1 +tql eval (0, 10, '10s') histogram_quantile(0.5, sum by (le) (histogram_gap_bucket)); + +drop table histogram_gap_bucket; diff --git a/tests/cases/standalone/common/promql/offset.result b/tests/cases/standalone/common/promql/offset.result index f5787928d8..760e1ac333 100644 --- a/tests/cases/standalone/common/promql/offset.result +++ b/tests/cases/standalone/common/promql/offset.result @@ -46,16 +46,6 @@ tql eval (1500, 1500, '1s') calculate_rate_offset_total; -- SQLNESS SORT_RESULT 3 1 tql eval (1500, 1500, '1s') calculate_rate_offset_total offset 10m; -+---------------------+-------+---+ -| ts | val | x | -+---------------------+-------+---+ -| 1970-01-01T00:25:00 | 140.0 | b | -| 1970-01-01T00:25:00 | 70.0 | a | -+---------------------+-------+---+ - --- SQLNESS SORT_RESULT 3 1 -tql eval (1500, 1500, '1s') calculate_rate_offset_total offset -10m; - +---------------------+------+---+ | ts | val | x | +---------------------+------+---+ @@ -63,9 +53,25 @@ tql eval (1500, 1500, '1s') calculate_rate_offset_total offset -10m; | 1970-01-01T00:25:00 | 60.0 | b | +---------------------+------+---+ +-- SQLNESS SORT_RESULT 3 1 +tql eval (1500, 1500, '1s') calculate_rate_offset_total offset -10m; + ++---------------------+-------+---+ +| ts | val | x | ++---------------------+-------+---+ +| 1970-01-01T00:25:00 | 140.0 | b | +| 1970-01-01T00:25:00 | 70.0 | a | ++---------------------+-------+---+ + -- SQLNESS SORT_RESULT 3 1 tql eval (0, 0, '1s') calculate_rate_offset_total offset 10m; +++ +++ + +-- SQLNESS SORT_RESULT 3 1 +tql eval (0, 0, '1s') calculate_rate_offset_total offset -10m; + +---------------------+------+---+ | ts | val | x | +---------------------+------+---+ @@ -73,18 +79,8 @@ tql eval (0, 0, '1s') calculate_rate_offset_total offset 10m; | 1970-01-01T00:00:00 | 40.0 | b | +---------------------+------+---+ -tql eval (0, 0, '1s') calculate_rate_offset_total offset -10m; - -++ -++ - -tql eval (3000, 3000, '1s') calculate_rate_offset_total offset 10m; - -++ -++ - -- SQLNESS SORT_RESULT 3 1 -tql eval (3000, 3000, '1s') calculate_rate_offset_total offset -10m; +tql eval (3000, 3000, '1s') calculate_rate_offset_total offset 10m; +---------------------+-------+---+ | ts | val | x | @@ -93,6 +89,13 @@ tql eval (3000, 3000, '1s') calculate_rate_offset_total offset -10m; | 1970-01-01T00:50:00 | 
80.0 | a | +---------------------+-------+---+ +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3000, '1s') calculate_rate_offset_total offset -10m; + +++ +++ + +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') rate(calculate_rate_window_total[10m]); ++ diff --git a/tests/cases/standalone/common/promql/offset.sql b/tests/cases/standalone/common/promql/offset.sql index 37981e7929..53dd2c326f 100644 --- a/tests/cases/standalone/common/promql/offset.sql +++ b/tests/cases/standalone/common/promql/offset.sql @@ -42,13 +42,16 @@ tql eval (1500, 1500, '1s') calculate_rate_offset_total offset -10m; -- SQLNESS SORT_RESULT 3 1 tql eval (0, 0, '1s') calculate_rate_offset_total offset 10m; +-- SQLNESS SORT_RESULT 3 1 tql eval (0, 0, '1s') calculate_rate_offset_total offset -10m; +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') calculate_rate_offset_total offset 10m; -- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') calculate_rate_offset_total offset -10m; +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') rate(calculate_rate_window_total[10m]); -- SQLNESS SORT_RESULT 3 1 diff --git a/tests/cases/standalone/common/promql/offset_direction.result b/tests/cases/standalone/common/promql/offset_direction.result new file mode 100644 index 0000000000..50098be162 --- /dev/null +++ b/tests/cases/standalone/common/promql/offset_direction.result @@ -0,0 +1,44 @@ +-- Regression for offset direction: positive offsets should query past data. +create table offset_direction ( + ts timestamp time index, + val double, + host string primary key +); + +Affected Rows: 0 + +insert into offset_direction values + (940000, 10.0, 'a'), + (1000000, 20.0, 'a'), + (1060000, 30.0, 'a'); + +Affected Rows: 3 + +tql eval (1000, 1000, '1s') offset_direction; + ++---------------------+------+------+ +| ts | val | host | ++---------------------+------+------+ +| 1970-01-01T00:16:40 | 20.0 | a | ++---------------------+------+------+ + +tql eval (1000, 1000, '1s') offset_direction offset 60s; + ++---------------------+------+------+ +| ts | val | host | ++---------------------+------+------+ +| 1970-01-01T00:16:40 | 10.0 | a | ++---------------------+------+------+ + +tql eval (1000, 1000, '1s') offset_direction offset -60s; + ++---------------------+------+------+ +| ts | val | host | ++---------------------+------+------+ +| 1970-01-01T00:16:40 | 30.0 | a | ++---------------------+------+------+ + +drop table offset_direction; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/promql/offset_direction.sql b/tests/cases/standalone/common/promql/offset_direction.sql new file mode 100644 index 0000000000..464062b7d1 --- /dev/null +++ b/tests/cases/standalone/common/promql/offset_direction.sql @@ -0,0 +1,20 @@ +-- Regression for offset direction: positive offsets should query past data. 
+ +create table offset_direction ( + ts timestamp time index, + val double, + host string primary key +); + +insert into offset_direction values + (940000, 10.0, 'a'), + (1000000, 20.0, 'a'), + (1060000, 30.0, 'a'); + +tql eval (1000, 1000, '1s') offset_direction; + +tql eval (1000, 1000, '1s') offset_direction offset 60s; + +tql eval (1000, 1000, '1s') offset_direction offset -60s; + +drop table offset_direction; diff --git a/tests/cases/standalone/common/promql/simple_histogram.result b/tests/cases/standalone/common/promql/simple_histogram.result index 1409e7834a..30b48b3455 100644 --- a/tests/cases/standalone/common/promql/simple_histogram.result +++ b/tests/cases/standalone/common/promql/simple_histogram.result @@ -56,6 +56,7 @@ tql eval (3000, 3000, '1s') histogram_quantile(NaN, histogram_bucket); +---------------------+----------+-----+ -- Quantile value in lowest bucket, which is positive. +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="positive"}); +---------------------+----------+-----+ @@ -65,6 +66,7 @@ tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="positive"} +---------------------+----------+-----+ -- Quantile value in lowest bucket, which is negative. +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="negative"}); +---------------------+----------+------+ @@ -128,6 +130,7 @@ tql eval (3000, 3000, '1s') label_replace(histogram_quantile(0.8, histogram_buck -- More realistic with rates. -- This case doesn't contains value because other point are not inserted. -- quantile with rate is covered in other cases +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') histogram_quantile(0.2, rate(histogram_bucket[5m])); ++ @@ -202,6 +205,7 @@ insert into histogram2_bucket values Affected Rows: 50 -- Want results exactly in the middle of the bucket. 
+-- SQLNESS SORT_RESULT 3 1 tql eval (420, 420, '1s') histogram_quantile(0.166, histogram2_bucket); +---------------------+-------+ @@ -210,6 +214,7 @@ tql eval (420, 420, '1s') histogram_quantile(0.166, histogram2_bucket); | 1970-01-01T00:07:00 | 0.996 | +---------------------+-------+ +-- SQLNESS SORT_RESULT 3 1 tql eval (420, 420, '1s') histogram_quantile(0.5, histogram2_bucket); +---------------------+-----+ @@ -218,6 +223,7 @@ tql eval (420, 420, '1s') histogram_quantile(0.5, histogram2_bucket); | 1970-01-01T00:07:00 | 3.0 | +---------------------+-----+ +-- SQLNESS SORT_RESULT 3 1 tql eval (420, 420, '1s') histogram_quantile(0.833, histogram2_bucket); +---------------------+-------------------+ @@ -226,6 +232,7 @@ tql eval (420, 420, '1s') histogram_quantile(0.833, histogram2_bucket); | 1970-01-01T00:07:00 | 4.997999999999999 | +---------------------+-------------------+ +-- SQLNESS SORT_RESULT 3 1 tql eval (2820, 2820, '1s') histogram_quantile(0.166, rate(histogram2_bucket[15m])); +---------------------+------------------------------------------+ @@ -234,6 +241,7 @@ tql eval (2820, 2820, '1s') histogram_quantile(0.166, rate(histogram2_bucket[15m | 1970-01-01T00:47:00 | 0.996 | +---------------------+------------------------------------------+ +-- SQLNESS SORT_RESULT 3 1 tql eval (2820, 2820, '1s') histogram_quantile(0.5, rate(histogram2_bucket[15m])); +---------------------+------------------------------------------+ @@ -242,6 +250,7 @@ tql eval (2820, 2820, '1s') histogram_quantile(0.5, rate(histogram2_bucket[15m]) | 1970-01-01T00:47:00 | 3.0 | +---------------------+------------------------------------------+ +-- SQLNESS SORT_RESULT 3 1 tql eval (2820, 2820, '1s') histogram_quantile(0.833, rate(histogram2_bucket[15m])); +---------------------+------------------------------------------+ @@ -282,6 +291,7 @@ insert into histogram3_bucket values Affected Rows: 12 +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3005, '3s') histogram_quantile(0.5, sum by(le, s) (rate(histogram3_bucket[5m]))); +---+---------------------+-----------------------------------------------+ @@ -319,6 +329,7 @@ insert into histogram4_bucket values Affected Rows: 7 +-- SQLNESS SORT_RESULT 3 1 tql eval (2900, 3000, '100s') histogram_quantile(0.9, histogram4_bucket); +---------------------+---+-----+ @@ -332,6 +343,7 @@ drop table histogram4_bucket; Affected Rows: 0 +-- SQLNESS SORT_RESULT 3 1 tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fff) (rate(greptime_servers_postgres_query_elapsed_bucket{instance=~"xxx"}[1m]))); ++ @@ -348,12 +360,14 @@ CREATE TABLE greptime_servers_postgres_query_elapsed_no_le ( Affected Rows: 0 --- should return empty result instead of error when 'le' column is missing +-- should return empty result instead of error when 'le' column is missing +-- SQLNESS SORT_RESULT 3 1 tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, le) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m]))); ++ ++ +-- SQLNESS SORT_RESULT 3 1 tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fbf) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m]))); ++ @@ -363,3 +377,53 @@ drop table greptime_servers_postgres_query_elapsed_no_le; Affected Rows: 0 +-- test case with some missing buckets +create table histogram5_bucket ( + ts timestamp time index, + le string, + s string, + val double, + primary key (s, le), +); + +Affected Rows: 0 + +insert into histogram5_bucket values + (3000000, "0.1", "a", 0), + -- (3000000, "1", 
"a", 0), + -- (3000000, "5", "a", 0), + -- (3000000, "+Inf", "a", 0), + (3005000, "0.1", "a", 50), + (3005000, "1", "a", 70), + (3005000, "5", "a", 110), + (3005000, "+Inf", "a", 120), + (3010000, "0.1", "a", 10), + -- (3010000, "1", "a", 20), + -- (3010000, "5", "a", 20), + (3010000, "+Inf", "a", 30), + (3015000, "0.1", "a", 10), + (3015000, "1", "a", 10), + (3015000, "3", "a", 20), -- + (3015000, "5", "a", 30), + (3015000, "+Inf", "a", 50); + +Affected Rows: 12 + +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3015, '3s') histogram_quantile(0.5, histogram5_bucket); + ++---------------------+---+--------------------+ +| ts | s | val | ++---------------------+---+--------------------+ +| 1970-01-01T00:50:00 | a | NaN | +| 1970-01-01T00:50:03 | a | NaN | +| 1970-01-01T00:50:06 | a | 0.5499999999999999 | +| 1970-01-01T00:50:09 | a | 0.5499999999999999 | +| 1970-01-01T00:50:12 | a | 0.775 | +| 1970-01-01T00:50:15 | a | 4.0 | ++---------------------+---+--------------------+ + +drop table histogram5_bucket; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/promql/simple_histogram.sql b/tests/cases/standalone/common/promql/simple_histogram.sql index d6dde4cb69..d18df63804 100644 --- a/tests/cases/standalone/common/promql/simple_histogram.sql +++ b/tests/cases/standalone/common/promql/simple_histogram.sql @@ -32,9 +32,11 @@ tql eval (3000, 3000, '1s') histogram_quantile(1.01, histogram_bucket); tql eval (3000, 3000, '1s') histogram_quantile(NaN, histogram_bucket); -- Quantile value in lowest bucket, which is positive. +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="positive"}); -- Quantile value in lowest bucket, which is negative. +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') histogram_quantile(0, histogram_bucket{s="negative"}); -- Quantile value in highest bucket. @@ -57,6 +59,7 @@ tql eval (3000, 3000, '1s') label_replace(histogram_quantile(0.8, histogram_buck -- More realistic with rates. -- This case doesn't contains value because other point are not inserted. -- quantile with rate is covered in other cases +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3000, '1s') histogram_quantile(0.2, rate(histogram_bucket[5m])); drop table histogram_bucket; @@ -122,16 +125,22 @@ insert into histogram2_bucket values (2700000, "+Inf", 30); -- Want results exactly in the middle of the bucket. 
+-- SQLNESS SORT_RESULT 3 1 tql eval (420, 420, '1s') histogram_quantile(0.166, histogram2_bucket); +-- SQLNESS SORT_RESULT 3 1 tql eval (420, 420, '1s') histogram_quantile(0.5, histogram2_bucket); +-- SQLNESS SORT_RESULT 3 1 tql eval (420, 420, '1s') histogram_quantile(0.833, histogram2_bucket); +-- SQLNESS SORT_RESULT 3 1 tql eval (2820, 2820, '1s') histogram_quantile(0.166, rate(histogram2_bucket[15m])); +-- SQLNESS SORT_RESULT 3 1 tql eval (2820, 2820, '1s') histogram_quantile(0.5, rate(histogram2_bucket[15m])); +-- SQLNESS SORT_RESULT 3 1 tql eval (2820, 2820, '1s') histogram_quantile(0.833, rate(histogram2_bucket[15m])); drop table histogram2_bucket; @@ -160,6 +169,7 @@ insert into histogram3_bucket values (3005000, "5", "a", 20), (3005000, "+Inf", "a", 30); +-- SQLNESS SORT_RESULT 3 1 tql eval (3000, 3005, '3s') histogram_quantile(0.5, sum by(le, s) (rate(histogram3_bucket[5m]))); drop table histogram3_bucket; @@ -184,10 +194,12 @@ insert into histogram4_bucket values -- INF here is missing ; +-- SQLNESS SORT_RESULT 3 1 tql eval (2900, 3000, '100s') histogram_quantile(0.9, histogram4_bucket); drop table histogram4_bucket; +-- SQLNESS SORT_RESULT 3 1 tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fff) (rate(greptime_servers_postgres_query_elapsed_bucket{instance=~"xxx"}[1m]))); -- test case where table exists but doesn't have 'le' column should raise error @@ -199,8 +211,44 @@ CREATE TABLE greptime_servers_postgres_query_elapsed_no_le ( PRIMARY KEY (pod, instance) ); --- should return empty result instead of error when 'le' column is missing +-- should return empty result instead of error when 'le' column is missing +-- SQLNESS SORT_RESULT 3 1 tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, le) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m]))); + +-- SQLNESS SORT_RESULT 3 1 tql eval(0, 10, '10s') histogram_quantile(0.99, sum by(pod,instance, fbf) (rate(greptime_servers_postgres_query_elapsed_no_le{instance=~"xxx"}[1m]))); drop table greptime_servers_postgres_query_elapsed_no_le; + +-- test case with some missing buckets +create table histogram5_bucket ( + ts timestamp time index, + le string, + s string, + val double, + primary key (s, le), +); + +insert into histogram5_bucket values + (3000000, "0.1", "a", 0), + -- (3000000, "1", "a", 0), + -- (3000000, "5", "a", 0), + -- (3000000, "+Inf", "a", 0), + (3005000, "0.1", "a", 50), + (3005000, "1", "a", 70), + (3005000, "5", "a", 110), + (3005000, "+Inf", "a", 120), + (3010000, "0.1", "a", 10), + -- (3010000, "1", "a", 20), + -- (3010000, "5", "a", 20), + (3010000, "+Inf", "a", 30), + (3015000, "0.1", "a", 10), + (3015000, "1", "a", 10), + (3015000, "3", "a", 20), -- + (3015000, "5", "a", 30), + (3015000, "+Inf", "a", 50); + +-- SQLNESS SORT_RESULT 3 1 +tql eval (3000, 3015, '3s') histogram_quantile(0.5, histogram5_bucket); + +drop table histogram5_bucket; diff --git a/tests/cases/standalone/common/show/show_databases_tables.result b/tests/cases/standalone/common/show/show_databases_tables.result index bc1dad745e..d817227392 100644 --- a/tests/cases/standalone/common/show/show_databases_tables.result +++ b/tests/cases/standalone/common/show/show_databases_tables.result @@ -25,7 +25,7 @@ Affected Rows: 0 SHOW TABLES; +---------------------------------------+ -| Tables | +| Tables_in_information_schema | +---------------------------------------+ | build_info | | character_sets | @@ -52,7 +52,6 @@ SHOW TABLES; | region_peers | | region_statistics | | routines | -| 
runtime_metrics | | schema_privileges | | schemata | | session_status | @@ -67,16 +66,16 @@ SHOW TABLES; SHOW TABLES LIKE 'tables'; -+--------+ -| Tables | -+--------+ -| tables | -+--------+ ++------------------------------+ +| Tables_in_information_schema | ++------------------------------+ +| tables | ++------------------------------+ SHOW FULL TABLES; +---------------------------------------+-----------------+ -| Tables | Table_type | +| Tables_in_information_schema | Table_type | +---------------------------------------+-----------------+ | build_info | LOCAL TEMPORARY | | character_sets | LOCAL TEMPORARY | @@ -103,7 +102,6 @@ SHOW FULL TABLES; | region_peers | LOCAL TEMPORARY | | region_statistics | LOCAL TEMPORARY | | routines | LOCAL TEMPORARY | -| runtime_metrics | LOCAL TEMPORARY | | schema_privileges | LOCAL TEMPORARY | | schemata | LOCAL TEMPORARY | | session_status | LOCAL TEMPORARY | @@ -148,7 +146,6 @@ SHOW TABLE STATUS; |region_peers||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| |region_statistics||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| |routines||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| -|runtime_metrics||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| |schema_privileges||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| |schemata||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| |session_status||11|Fixed|0|0|0|0|0|0|0|DATETIME|DATETIME||utf8_bin|0||| diff --git a/tests/cases/standalone/common/system/information_schema.result b/tests/cases/standalone/common/system/information_schema.result index cfe2568fdf..04ed933355 100644 --- a/tests/cases/standalone/common/system/information_schema.result +++ b/tests/cases/standalone/common/system/information_schema.result @@ -39,7 +39,6 @@ order by table_schema, table_name; |greptime|information_schema|region_peers|LOCALTEMPORARY|29|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| |greptime|information_schema|region_statistics|LOCALTEMPORARY|35|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| |greptime|information_schema|routines|LOCALTEMPORARY|21|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| -|greptime|information_schema|runtime_metrics|LOCALTEMPORARY|27|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| |greptime|information_schema|schema_privileges|LOCALTEMPORARY|22|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| |greptime|information_schema|schemata|LOCALTEMPORARY|15|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| |greptime|information_schema|session_status|LOCALTEMPORARY|26|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|DATETIME||utf8_bin|0|||Y| @@ -369,12 +368,6 @@ select * from information_schema.columns order by table_schema, table_name, colu | greptime | information_schema | routines | sql_data_access | 21 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | routines | sql_mode | 26 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | routines | sql_path | 22 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | -| greptime | information_schema | runtime_metrics | labels | 3 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | -| greptime | information_schema | 
runtime_metrics | metric_name | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | -| greptime | information_schema | runtime_metrics | peer_addr | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | -| greptime | information_schema | runtime_metrics | peer_type | 5 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | -| greptime | information_schema | runtime_metrics | timestamp | 6 | | | | | 3 | | | | | select,insert | | TimestampMillisecond | timestamp(3) | FIELD | | No | timestamp(3) | | | -| greptime | information_schema | runtime_metrics | value | 2 | | | 22 | | | | | | | select,insert | | Float64 | double | FIELD | | No | double | | | | greptime | information_schema | schema_privileges | grantee | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | schema_privileges | is_grantable | 5 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | schema_privileges | privilege_type | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | @@ -407,9 +400,9 @@ select * from information_schema.columns order by table_schema, table_name, colu | greptime | information_schema | ssts_manifest | file_id | 7 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | ssts_manifest | file_path | 10 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | | | greptime | information_schema | ssts_manifest | file_size | 11 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | -| greptime | information_schema | ssts_manifest | index_file_id | 8 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | | greptime | information_schema | ssts_manifest | index_file_path | 12 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | | | greptime | information_schema | ssts_manifest | index_file_size | 13 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | Yes | bigint unsigned | | | +| greptime | information_schema | ssts_manifest | index_version | 8 | | | 20 | 0 | | | | | | select,insert | | UInt64 | bigint unsigned | FIELD | | No | bigint unsigned | | | | greptime | information_schema | ssts_manifest | level | 9 | | | 3 | 0 | | | | | | select,insert | | UInt8 | tinyint unsigned | FIELD | | No | tinyint unsigned | | | | greptime | information_schema | ssts_manifest | max_ts | 18 | | | | | 9 | | | | | select,insert | | TimestampNanosecond | timestamp(9) | FIELD | | Yes | timestamp(9) | | | | greptime | information_schema | ssts_manifest | min_ts | 17 | | | | | 9 | | | | | select,insert | | TimestampNanosecond | timestamp(9) | FIELD | | Yes | timestamp(9) | | | @@ -832,19 +825,6 @@ SELECT * FROM CHECK_CONSTRAINTS; +--------------------+-------------------+-----------------+--------------+ +--------------------+-------------------+-----------------+--------------+ -DESC TABLE 
RUNTIME_METRICS; - -+-------------+----------------------+-----+------+---------+---------------+ -| Column | Type | Key | Null | Default | Semantic Type | -+-------------+----------------------+-----+------+---------+---------------+ -| metric_name | String | | NO | | FIELD | -| value | Float64 | | NO | | FIELD | -| labels | String | | YES | | FIELD | -| peer_addr | String | | YES | | FIELD | -| peer_type | String | | NO | | FIELD | -| timestamp | TimestampMillisecond | | NO | | FIELD | -+-------------+----------------------+-----+------+---------+---------------+ - DESC TABLE REGION_PEERS; +---------------+--------+-----+------+---------+---------------+ diff --git a/tests/cases/standalone/common/system/information_schema.sql b/tests/cases/standalone/common/system/information_schema.sql index 68bf931cff..992ed8cd14 100644 --- a/tests/cases/standalone/common/system/information_schema.sql +++ b/tests/cases/standalone/common/system/information_schema.sql @@ -130,8 +130,6 @@ DESC TABLE CHECK_CONSTRAINTS; SELECT * FROM CHECK_CONSTRAINTS; -DESC TABLE RUNTIME_METRICS; - DESC TABLE REGION_PEERS; USE INFORMATION_SCHEMA; diff --git a/tests/cases/standalone/common/system/pg_catalog.result b/tests/cases/standalone/common/system/pg_catalog.result index 0aa7f1cc7e..31e80bbf87 100644 --- a/tests/cases/standalone/common/system/pg_catalog.result +++ b/tests/cases/standalone/common/system/pg_catalog.result @@ -13,15 +13,16 @@ SELECT session_user is not null; | t | +----------------------------+ --- session_user and current_schema +-- SQLNESS REPLACE PostgreSQL.* VERSION +-- current_schema -- SQLNESS PROTOCOL POSTGRES -select current_schema(); +select current_schema(), current_schemas(true), current_schemas(false), version(), current_database(); -+------------------+ -| current_schema() | -+------------------+ -| public | -+------------------+ ++------------------+---------------------------------------------------------+---------------------------------+-----------------------------------------+--------------------+ +| current_schema() | current_schemas(Boolean(true)) | current_schemas(Boolean(false)) | version | current_database() | ++------------------+---------------------------------------------------------+---------------------------------+-----------------------------------------+--------------------+ +| public | {public,information_schema,pg_catalog,greptime_private} | {public} | VERSION ++------------------+---------------------------------------------------------+---------------------------------+-----------------------------------------+--------------------+ -- search_path for pg using schema for now FIXME when support real search_path -- SQLNESS PROTOCOL POSTGRES @@ -861,6 +862,77 @@ where relnamespace in ( | foo | +---------+ +-- SQLNESS PROTOCOL POSTGRES +SELECT + CASE WHEN + quote_ident(table_schema) IN ( + SELECT + CASE WHEN trim(s[i]) = '"$user"' THEN user ELSE trim(s[i]) END + FROM + generate_series( + array_lower(string_to_array(current_setting('search_path'),','),1), + array_upper(string_to_array(current_setting('search_path'),','),1) + ) as i, + string_to_array(current_setting('search_path'),',') s + ) + THEN quote_ident(table_name) + ELSE quote_ident(table_schema) || '.' 
|| quote_ident(table_name) + END AS "table" + FROM information_schema.tables + WHERE quote_ident(table_schema) NOT IN ('information_schema', + 'pg_catalog', + '_timescaledb_cache', + '_timescaledb_catalog', + '_timescaledb_internal', + '_timescaledb_config', + 'timescaledb_information', + 'timescaledb_experimental') + ORDER BY CASE WHEN + quote_ident(table_schema) IN ( + SELECT + CASE WHEN trim(s[i]) = '"$user"' THEN user ELSE trim(s[i]) END + FROM + generate_series( + array_lower(string_to_array(current_setting('search_path'),','),1), + array_upper(string_to_array(current_setting('search_path'),','),1) + ) as i, + string_to_array(current_setting('search_path'),',') s + ) THEN 0 ELSE 1 END, 1; + ++----------------+ +| table | ++----------------+ +| my_db.foo | +| public.numbers | ++----------------+ + +-- SQLNESS PROTOCOL POSTGRES +SELECT quote_ident(column_name) AS "column", data_type AS "type" + FROM information_schema.columns + WHERE + CASE WHEN array_length(parse_ident('my_db.foo'),1) = 2 + THEN quote_ident(table_schema) = (parse_ident('my_db.foo'))[1] + AND quote_ident(table_name) = (parse_ident('my_db.foo'))[2] + ELSE quote_ident(table_name) = 'my_db.foo' + AND + quote_ident(table_schema) IN ( + SELECT + CASE WHEN trim(s[i]) = '"$user"' THEN user ELSE trim(s[i]) END + FROM + generate_series( + array_lower(string_to_array(current_setting('search_path'),','),1), + array_upper(string_to_array(current_setting('search_path'),','),1) + ) as i, + string_to_array(current_setting('search_path'),',') s + ) + END; + ++--------+--------------+ +| column | type | ++--------+--------------+ +| ts | timestamp(3) | ++--------+--------------+ + -- SQLNESS PROTOCOL POSTGRES -- SQLNESS REPLACE (\d+\s*) OID select relnamespace, relname, relkind @@ -958,3 +1030,72 @@ use public; +-- PostgreSQL description functions - placeholder returning NULL for compatibility +-- SQLNESS PROTOCOL POSTGRES +SELECT obj_description((SELECT oid FROM pg_class LIMIT 1), 'pg_class') IS NULL AS is_null; + ++---------+ +| is_null | ++---------+ +| t | ++---------+ + +-- SQLNESS PROTOCOL POSTGRES +SELECT obj_description((SELECT oid FROM pg_class LIMIT 1)) IS NULL AS is_null; + ++---------+ +| is_null | ++---------+ +| t | ++---------+ + +-- SQLNESS PROTOCOL POSTGRES +SELECT col_description((SELECT oid FROM pg_class LIMIT 1), 1) IS NULL AS is_null; + ++---------+ +| is_null | ++---------+ +| t | ++---------+ + +-- SQLNESS PROTOCOL POSTGRES +SELECT shobj_description(1, 'pg_database') IS NULL AS is_null; + ++---------+ +| is_null | ++---------+ +| t | ++---------+ + +-- pg_my_temp_schema returns 0 (no temp schema support) +-- SQLNESS PROTOCOL POSTGRES +SELECT pg_my_temp_schema(); + ++---------------------+ +| pg_my_temp_schema() | ++---------------------+ +| 0 | ++---------------------+ + +-- Issue 7313 +-- SQLNESS PROTOCOL POSTGRES +-- SQLNESS REPLACE (\d+\s*) OID +SELECT + oid + ,nspname + ,nspname = ANY (current_schemas(true)) AS is_on_search_path + ,obj_description(oid, 'pg_namespace') AS comment +FROM pg_namespace; SELECT +oid +,nspname +FROM pg_namespace +WHERE oid = pg_my_temp_schema(); + ++-------+--------------------+-------------------+---------+ +| oid | nspname | is_on_search_path | comment | ++-------+--------------------+-------------------+---------+ +| OID| greptime_private | t | | +| OID| information_schema | t | | +| OID| public | t | | ++-------+--------------------+-------------------+---------+ + diff --git a/tests/cases/standalone/common/system/pg_catalog.sql 
b/tests/cases/standalone/common/system/pg_catalog.sql index 960f3cb1ff..979d55b480 100644 --- a/tests/cases/standalone/common/system/pg_catalog.sql +++ b/tests/cases/standalone/common/system/pg_catalog.sql @@ -5,9 +5,10 @@ create database pg_catalog; -- SQLNESS PROTOCOL POSTGRES SELECT session_user is not null; --- session_user and current_schema +-- SQLNESS REPLACE PostgreSQL.* VERSION +-- current_schema -- SQLNESS PROTOCOL POSTGRES -select current_schema(); +select current_schema(), current_schemas(true), current_schemas(false), version(), current_database(); -- search_path for pg using schema for now FIXME when support real search_path -- SQLNESS PROTOCOL POSTGRES @@ -131,6 +132,64 @@ where relnamespace in ( where nspname like 'my%' ); +-- SQLNESS PROTOCOL POSTGRES +SELECT + CASE WHEN + quote_ident(table_schema) IN ( + SELECT + CASE WHEN trim(s[i]) = '"$user"' THEN user ELSE trim(s[i]) END + FROM + generate_series( + array_lower(string_to_array(current_setting('search_path'),','),1), + array_upper(string_to_array(current_setting('search_path'),','),1) + ) as i, + string_to_array(current_setting('search_path'),',') s + ) + THEN quote_ident(table_name) + ELSE quote_ident(table_schema) || '.' || quote_ident(table_name) + END AS "table" + FROM information_schema.tables + WHERE quote_ident(table_schema) NOT IN ('information_schema', + 'pg_catalog', + '_timescaledb_cache', + '_timescaledb_catalog', + '_timescaledb_internal', + '_timescaledb_config', + 'timescaledb_information', + 'timescaledb_experimental') + ORDER BY CASE WHEN + quote_ident(table_schema) IN ( + SELECT + CASE WHEN trim(s[i]) = '"$user"' THEN user ELSE trim(s[i]) END + FROM + generate_series( + array_lower(string_to_array(current_setting('search_path'),','),1), + array_upper(string_to_array(current_setting('search_path'),','),1) + ) as i, + string_to_array(current_setting('search_path'),',') s + ) THEN 0 ELSE 1 END, 1; + +-- SQLNESS PROTOCOL POSTGRES +SELECT quote_ident(column_name) AS "column", data_type AS "type" + FROM information_schema.columns + WHERE + CASE WHEN array_length(parse_ident('my_db.foo'),1) = 2 + THEN quote_ident(table_schema) = (parse_ident('my_db.foo'))[1] + AND quote_ident(table_name) = (parse_ident('my_db.foo'))[2] + ELSE quote_ident(table_name) = 'my_db.foo' + AND + quote_ident(table_schema) IN ( + SELECT + CASE WHEN trim(s[i]) = '"$user"' THEN user ELSE trim(s[i]) END + FROM + generate_series( + array_lower(string_to_array(current_setting('search_path'),','),1), + array_upper(string_to_array(current_setting('search_path'),','),1) + ) as i, + string_to_array(current_setting('search_path'),',') s + ) + END; + -- SQLNESS PROTOCOL POSTGRES -- SQLNESS REPLACE (\d+\s*) OID select relnamespace, relname, relkind @@ -165,3 +224,37 @@ drop table my_db.foo; -- SQLNESS PROTOCOL POSTGRES use public; + +-- PostgreSQL description functions - placeholder returning NULL for compatibility + +-- SQLNESS PROTOCOL POSTGRES +SELECT obj_description((SELECT oid FROM pg_class LIMIT 1), 'pg_class') IS NULL AS is_null; + +-- SQLNESS PROTOCOL POSTGRES +SELECT obj_description((SELECT oid FROM pg_class LIMIT 1)) IS NULL AS is_null; + +-- SQLNESS PROTOCOL POSTGRES +SELECT col_description((SELECT oid FROM pg_class LIMIT 1), 1) IS NULL AS is_null; + +-- SQLNESS PROTOCOL POSTGRES +SELECT shobj_description(1, 'pg_database') IS NULL AS is_null; + +-- pg_my_temp_schema returns 0 (no temp schema support) +-- SQLNESS PROTOCOL POSTGRES +SELECT pg_my_temp_schema(); + +-- Issue 7313 +-- SQLNESS PROTOCOL POSTGRES +-- SQLNESS REPLACE (\d+\s*) OID 
+SELECT + oid + ,nspname + ,nspname = ANY (current_schemas(true)) AS is_on_search_path + + ,obj_description(oid, 'pg_namespace') AS comment + +FROM pg_namespace; SELECT +oid +,nspname +FROM pg_namespace +WHERE oid = pg_my_temp_schema(); diff --git a/tests/cases/standalone/common/system/set_unsupported.result b/tests/cases/standalone/common/system/set_unsupported.result new file mode 100644 index 0000000000..a2d2805af6 --- /dev/null +++ b/tests/cases/standalone/common/system/set_unsupported.result @@ -0,0 +1,80 @@ +-- Test unsupported set variables for MySQL protocol +-- These should succeed with a warning instead of failing +-- Test setting an unsupported variable +-- SQLNESS PROTOCOL MYSQL +SET autocommit = 1; + +affected_rows: 0 + +-- Test setting with @@ prefix (previously this would succeed) +-- SQLNESS PROTOCOL MYSQL +SET @@autocommit = 1; + +affected_rows: 0 + +-- Test setting character_set_client (commonly used by MySQL clients) +-- SQLNESS PROTOCOL MYSQL +SET character_set_client = 'utf8mb4'; + +affected_rows: 0 + +-- Test setting character_set_results +-- SQLNESS PROTOCOL MYSQL +SET character_set_results = 'utf8mb4'; + +affected_rows: 0 + +-- Test setting sql_mode +-- SQLNESS PROTOCOL MYSQL +SET sql_mode = 'STRICT_TRANS_TABLES'; + +affected_rows: 0 + +-- Test multiple unsupported settings +-- SQLNESS PROTOCOL MYSQL +SET @@session.sql_mode = 'TRADITIONAL'; + +affected_rows: 0 + +-- Test NAMES (special MySQL syntax for character set) +-- SQLNESS PROTOCOL MYSQL +SET NAMES utf8mb4; + +affected_rows: 0 + +-- Test collation_connection +-- SQLNESS PROTOCOL MYSQL +SET collation_connection = 'utf8mb4_unicode_ci'; + +affected_rows: 0 + +-- Test SHOW WARNINGS after setting unsupported variable +-- SQLNESS PROTOCOL MYSQL +SET some_unsupported_var = 123; + +affected_rows: 0 + +-- SQLNESS PROTOCOL MYSQL +SHOW WARNINGS; + ++---------+------+-----------------------------------------------+ +| Level | Code | Message | ++---------+------+-----------------------------------------------+ +| Warning | 1000 | Unsupported set variable SOME_UNSUPPORTED_VAR | ++---------+------+-----------------------------------------------+ + +-- Test that warning is cleared after next statement +-- SQLNESS PROTOCOL MYSQL +SELECT 1; + ++----------+ +| Int64(1) | ++----------+ +| 1 | ++----------+ + +-- SQLNESS PROTOCOL MYSQL +SHOW WARNINGS; + +affected_rows: 0 + diff --git a/tests/cases/standalone/common/system/set_unsupported.sql b/tests/cases/standalone/common/system/set_unsupported.sql new file mode 100644 index 0000000000..42ed21320a --- /dev/null +++ b/tests/cases/standalone/common/system/set_unsupported.sql @@ -0,0 +1,48 @@ +-- Test unsupported set variables for MySQL protocol +-- These should succeed with a warning instead of failing + +-- Test setting an unsupported variable +-- SQLNESS PROTOCOL MYSQL +SET autocommit = 1; + +-- Test setting with @@ prefix (previously this would succeed) +-- SQLNESS PROTOCOL MYSQL +SET @@autocommit = 1; + +-- Test setting character_set_client (commonly used by MySQL clients) +-- SQLNESS PROTOCOL MYSQL +SET character_set_client = 'utf8mb4'; + +-- Test setting character_set_results +-- SQLNESS PROTOCOL MYSQL +SET character_set_results = 'utf8mb4'; + +-- Test setting sql_mode +-- SQLNESS PROTOCOL MYSQL +SET sql_mode = 'STRICT_TRANS_TABLES'; + +-- Test multiple unsupported settings +-- SQLNESS PROTOCOL MYSQL +SET @@session.sql_mode = 'TRADITIONAL'; + +-- Test NAMES (special MySQL syntax for character set) +-- SQLNESS PROTOCOL MYSQL +SET NAMES utf8mb4; + +-- Test collation_connection 
+-- SQLNESS PROTOCOL MYSQL +SET collation_connection = 'utf8mb4_unicode_ci'; + +-- Test SHOW WARNINGS after setting unsupported variable +-- SQLNESS PROTOCOL MYSQL +SET some_unsupported_var = 123; + +-- SQLNESS PROTOCOL MYSQL +SHOW WARNINGS; + +-- Test that warning is cleared after next statement +-- SQLNESS PROTOCOL MYSQL +SELECT 1; + +-- SQLNESS PROTOCOL MYSQL +SHOW WARNINGS; diff --git a/tests/cases/standalone/common/system/starrocks_compatibility.result b/tests/cases/standalone/common/system/starrocks_compatibility.result new file mode 100644 index 0000000000..e7a50386b9 --- /dev/null +++ b/tests/cases/standalone/common/system/starrocks_compatibility.result @@ -0,0 +1,204 @@ +-- Test file for StarRocks External Catalog MySQL Compatibility +-- This test simulates the exact queries StarRocks JDBC connector sends +-- Reference: MysqlSchemaResolver.java in StarRocks +-- Setup: Create test table with partitions +CREATE TABLE test_partitions ( + ts TIMESTAMP TIME INDEX, + host STRING PRIMARY KEY, + val DOUBLE +) PARTITION ON COLUMNS (host) (); + +Affected Rows: 0 + +INSERT INTO test_partitions VALUES + ('2024-01-01 00:00:00', 'host1', 1.0), + ('2024-01-01 00:00:00', 'host2', 2.0); + +Affected Rows: 2 + +-- ============================================ +-- Section 1: JDBC DatabaseMetaData API queries +-- ============================================ +-- getCatalogs() -> SHOW DATABASES +SHOW DATABASES; + ++--------------------+ +| Database | ++--------------------+ +| greptime_private | +| information_schema | +| public | ++--------------------+ + +-- getTables(db, null, null, types) with backtick quoting +SHOW FULL TABLES FROM `public` LIKE '%'; + ++------------------+-----------------+ +| Tables_in_public | Table_type | ++------------------+-----------------+ +| numbers | LOCAL TEMPORARY | +| test_partitions | BASE TABLE | ++------------------+-----------------+ + +-- getColumns(db, null, tbl, "%") with backtick quoting +SHOW FULL COLUMNS FROM `test_partitions` FROM `public` LIKE '%'; + ++-------+--------------+-----------+------+------------+---------+---------+---------------+-------+----------------------+ +| Field | Type | Collation | Null | Key | Default | Comment | Privileges | Extra | Greptime_type | ++-------+--------------+-----------+------+------------+---------+---------+---------------+-------+----------------------+ +| host | string | utf8_bin | Yes | PRI | | | select,insert | | String | +| ts | timestamp(3) | | No | TIME INDEX | | | select,insert | | TimestampMillisecond | +| val | double | | Yes | | | | select,insert | | Float64 | ++-------+--------------+-----------+------+------------+---------+---------+---------------+-------+----------------------+ + +-- ============================================ +-- Section 2: INFORMATION_SCHEMA queries +-- ============================================ +-- Schema listing (alternative to SHOW DATABASES) +SELECT catalog_name, schema_name FROM INFORMATION_SCHEMA.SCHEMATA +WHERE schema_name NOT IN ('information_schema', 'pg_catalog') +ORDER BY schema_name; + ++--------------+------------------+ +| catalog_name | schema_name | ++--------------+------------------+ +| greptime | greptime_private | +| greptime | public | ++--------------+------------------+ + +-- Tables listing +SELECT table_catalog, table_schema, table_name, table_type +FROM INFORMATION_SCHEMA.TABLES +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions'; + ++---------------+--------------+-----------------+------------+ +| table_catalog | table_schema | table_name | 
table_type | ++---------------+--------------+-----------------+------------+ +| greptime | public | test_partitions | BASE TABLE | ++---------------+--------------+-----------------+------------+ + +-- Columns listing +SELECT table_schema, table_name, column_name, data_type, is_nullable +FROM INFORMATION_SCHEMA.COLUMNS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' +ORDER BY ordinal_position; + ++--------------+-----------------+-------------+--------------+-------------+ +| table_schema | table_name | column_name | data_type | is_nullable | ++--------------+-----------------+-------------+--------------+-------------+ +| public | test_partitions | ts | timestamp(3) | No | +| public | test_partitions | host | string | Yes | +| public | test_partitions | val | double | Yes | ++--------------+-----------------+-------------+--------------+-------------+ + +-- ============================================ +-- Section 3: StarRocks Partition Queries +-- These are the specific queries StarRocks sends for partition metadata +-- ============================================ +-- List partition names (what StarRocks uses for partition identification) +SELECT PARTITION_DESCRIPTION as NAME +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' + AND PARTITION_NAME IS NOT NULL + AND (PARTITION_METHOD = 'RANGE' or PARTITION_METHOD = 'RANGE COLUMNS') +ORDER BY PARTITION_DESCRIPTION; + ++------+ +| name | ++------+ +| | ++------+ + +-- Get partition columns (StarRocks uses this to identify partition key) +SELECT DISTINCT PARTITION_EXPRESSION +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' + AND PARTITION_NAME IS NOT NULL + AND (PARTITION_METHOD = 'RANGE' or PARTITION_METHOD = 'RANGE COLUMNS') + AND PARTITION_EXPRESSION IS NOT NULL; + ++----------------------+ +| partition_expression | ++----------------------+ +| host | ++----------------------+ + +-- Get partitions with modification time (uses IF() function for NULL handling) +-- StarRocks uses this for cache invalidation +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) DATETIME +SELECT PARTITION_NAME, + IF(UPDATE_TIME IS NULL, CREATE_TIME, UPDATE_TIME) AS MODIFIED_TIME +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' + AND PARTITION_NAME IS NOT NULL +ORDER BY PARTITION_NAME; + ++----------------+---------------------+ +| partition_name | modified_time | ++----------------+---------------------+ +| p0 | DATETIME | ++----------------+---------------------+ + +-- Get table modification time (for non-partitioned tables, StarRocks uses this) +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) DATETIME +SELECT TABLE_NAME AS NAME, + IF(UPDATE_TIME IS NULL, CREATE_TIME, UPDATE_TIME) AS MODIFIED_TIME +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions'; + ++-----------------+---------------------+ +| name | modified_time | ++-----------------+---------------------+ +| test_partitions | DATETIME | ++-----------------+---------------------+ + +-- ============================================ +-- Section 4: Raw PARTITIONS data inspection +-- Verify GreptimeDB returns appropriate partition metadata +-- ============================================ +-- Show what GreptimeDB returns for PARTITIONS +-- SQLNESS REPLACE (\d{13,}) REGION_ID +SELECT table_schema, table_name, partition_name, partition_method, + partition_expression, 
partition_description, greptime_partition_id +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions'; + ++--------------+-----------------+----------------+------------------+----------------------+-----------------------+-----------------------+ +| table_schema | table_name | partition_name | partition_method | partition_expression | partition_description | greptime_partition_id | ++--------------+-----------------+----------------+------------------+----------------------+-----------------------+-----------------------+ +| public | test_partitions | p0 | RANGE | host | | REGION_ID | ++--------------+-----------------+----------------+------------------+----------------------+-----------------------+-----------------------+ + +-- ============================================ +-- Section 5: IF() function tests with timestamps +-- StarRocks heavily uses IF() for NULL timestamp handling +-- ============================================ +SELECT IF(1, 'yes', 'no') as result; + ++--------+ +| result | ++--------+ +| yes | ++--------+ + +SELECT IF(0, 'yes', 'no') as result; + ++--------+ +| result | ++--------+ +| no | ++--------+ + +SELECT IF(NULL, 'yes', 'no') as result; + ++--------+ +| result | ++--------+ +| no | ++--------+ + +-- Cleanup +DROP TABLE test_partitions; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/system/starrocks_compatibility.sql b/tests/cases/standalone/common/system/starrocks_compatibility.sql new file mode 100644 index 0000000000..37ad0cb126 --- /dev/null +++ b/tests/cases/standalone/common/system/starrocks_compatibility.sql @@ -0,0 +1,111 @@ +-- Test file for StarRocks External Catalog MySQL Compatibility +-- This test simulates the exact queries StarRocks JDBC connector sends +-- Reference: MysqlSchemaResolver.java in StarRocks + +-- Setup: Create test table with partitions +CREATE TABLE test_partitions ( + ts TIMESTAMP TIME INDEX, + host STRING PRIMARY KEY, + val DOUBLE +) PARTITION ON COLUMNS (host) (); + +INSERT INTO test_partitions VALUES + ('2024-01-01 00:00:00', 'host1', 1.0), + ('2024-01-01 00:00:00', 'host2', 2.0); + +-- ============================================ +-- Section 1: JDBC DatabaseMetaData API queries +-- ============================================ + +-- getCatalogs() -> SHOW DATABASES +SHOW DATABASES; + +-- getTables(db, null, null, types) with backtick quoting +SHOW FULL TABLES FROM `public` LIKE '%'; + +-- getColumns(db, null, tbl, "%") with backtick quoting +SHOW FULL COLUMNS FROM `test_partitions` FROM `public` LIKE '%'; + +-- ============================================ +-- Section 2: INFORMATION_SCHEMA queries +-- ============================================ + +-- Schema listing (alternative to SHOW DATABASES) +SELECT catalog_name, schema_name FROM INFORMATION_SCHEMA.SCHEMATA +WHERE schema_name NOT IN ('information_schema', 'pg_catalog') +ORDER BY schema_name; + +-- Tables listing +SELECT table_catalog, table_schema, table_name, table_type +FROM INFORMATION_SCHEMA.TABLES +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions'; + +-- Columns listing +SELECT table_schema, table_name, column_name, data_type, is_nullable +FROM INFORMATION_SCHEMA.COLUMNS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' +ORDER BY ordinal_position; + +-- ============================================ +-- Section 3: StarRocks Partition Queries +-- These are the specific queries StarRocks sends for partition metadata +-- ============================================ + +-- List 
partition names (what StarRocks uses for partition identification) +SELECT PARTITION_DESCRIPTION as NAME +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' + AND PARTITION_NAME IS NOT NULL + AND (PARTITION_METHOD = 'RANGE' or PARTITION_METHOD = 'RANGE COLUMNS') +ORDER BY PARTITION_DESCRIPTION; + +-- Get partition columns (StarRocks uses this to identify partition key) +SELECT DISTINCT PARTITION_EXPRESSION +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' + AND PARTITION_NAME IS NOT NULL + AND (PARTITION_METHOD = 'RANGE' or PARTITION_METHOD = 'RANGE COLUMNS') + AND PARTITION_EXPRESSION IS NOT NULL; + +-- Get partitions with modification time (uses IF() function for NULL handling) +-- StarRocks uses this for cache invalidation +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) DATETIME +SELECT PARTITION_NAME, + IF(UPDATE_TIME IS NULL, CREATE_TIME, UPDATE_TIME) AS MODIFIED_TIME +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions' + AND PARTITION_NAME IS NOT NULL +ORDER BY PARTITION_NAME; + +-- Get table modification time (for non-partitioned tables, StarRocks uses this) +-- SQLNESS REPLACE (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}) DATETIME +SELECT TABLE_NAME AS NAME, + IF(UPDATE_TIME IS NULL, CREATE_TIME, UPDATE_TIME) AS MODIFIED_TIME +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions'; + +-- ============================================ +-- Section 4: Raw PARTITIONS data inspection +-- Verify GreptimeDB returns appropriate partition metadata +-- ============================================ + +-- Show what GreptimeDB returns for PARTITIONS +-- SQLNESS REPLACE (\d{13,}) REGION_ID +SELECT table_schema, table_name, partition_name, partition_method, + partition_expression, partition_description, greptime_partition_id +FROM INFORMATION_SCHEMA.PARTITIONS +WHERE TABLE_SCHEMA = 'public' AND TABLE_NAME = 'test_partitions'; + +-- ============================================ +-- Section 5: IF() function tests with timestamps +-- StarRocks heavily uses IF() for NULL timestamp handling +-- ============================================ + +SELECT IF(1, 'yes', 'no') as result; + +SELECT IF(0, 'yes', 'no') as result; + +SELECT IF(NULL, 'yes', 'no') as result; + +-- Cleanup +DROP TABLE test_partitions; diff --git a/tests/cases/standalone/common/tql/partition.result b/tests/cases/standalone/common/tql/partition.result index 401cd68ec0..9869bceb96 100644 --- a/tests/cases/standalone/common/tql/partition.result +++ b/tests/cases/standalone/common/tql/partition.result @@ -73,29 +73,33 @@ tql analyze (0, 10, '1s') 100 - (avg by (k) (irate(t[1m])) * 100); |_|_|_RepartitionExec: partitioning=REDACTED |_|_|_SortPreservingMergeExec: [k@0 ASC NULLS LAST, j@1 ASC NULLS LAST] REDACTED |_|_|_SortExec: expr=[k@0 ASC NULLS LAST, j@1 ASC NULLS LAST], preserve_partitioning=[true] REDACTED -|_|_|_AggregateExec: mode=FinalPartitioned, gby=[k@0 as k, j@1 as j], aggr=[avg(prom_irate(j_range,i))], ordering_mode=PartiallySorted([0]) REDACTED -|_|_|_SortExec: expr=[k@0 ASC NULLS LAST], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=FinalPartitioned, gby=[k@0 as k, j@1 as j], aggr=[avg(prom_irate(j_range,i))] REDACTED |_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: partitioning=REDACTED -|_|_|_AggregateExec: mode=Partial, gby=[k@2 as k, j@0 as j], aggr=[avg(prom_irate(j_range,i))], 
ordering_mode=PartiallySorted([0]) REDACTED -|_|_|_ProjectionExec: expr=[j@0 as j, prom_irate(j_range,i)@1 as prom_irate(j_range,i), k@2 as k] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[k@2 ASC, l@3 ASC, j@0 ASC], preserve_partitioning=[true] REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[k@0 as k, j@1 as j], aggr=[avg(prom_irate(j_range,i))] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[k@0 as k, j@1 as j], aggr=[__avg_state(prom_irate(j_range,i))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[k@2 as k, j@0 as j], aggr=[__avg_state(prom_irate(j_range,i))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_irate(j_range,i)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[j@1 as j, prom_irate(j_range@4, i@0) as prom_irate(j_range,i), k@2 as k, l@3 as l] REDACTED +|_|_|_ProjectionExec: expr=[j@1 as j, prom_irate(j_range@4, i@0) as prom_irate(j_range,i), k@2 as k] REDACTED |_|_|_PromRangeManipulateExec: req range=[0..10000], interval=[1000], eval range=[60000], time index=[j] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[j], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["k", "l"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[k@0 as k, j@1 as j], aggr=[__avg_state(prom_irate(j_range,i))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[k@2 as k, j@0 as j], aggr=[__avg_state(prom_irate(j_range,i))] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_FilterExec: prom_irate(j_range,i)@1 IS NOT NULL REDACTED -|_|_|_ProjectionExec: expr=[j@1 as j, prom_irate(j_range@4, i@0) as prom_irate(j_range,i), k@2 as k, l@3 as l] REDACTED +|_|_|_ProjectionExec: expr=[j@1 as j, prom_irate(j_range@4, i@0) as prom_irate(j_range,i), k@2 as k] REDACTED |_|_|_PromRangeManipulateExec: req range=[0..10000], interval=[1000], eval range=[60000], time index=[j] REDACTED |_|_|_PromSeriesNormalizeExec: offset=[0], time index=[j], filter NaN: [true] REDACTED |_|_|_PromSeriesDivideExec: tags=["k", "l"] REDACTED diff --git a/tests/cases/standalone/common/tql/tql-cte.result b/tests/cases/standalone/common/tql/tql-cte.result index 7127f79d9f..f433dda7c4 100644 --- a/tests/cases/standalone/common/tql/tql-cte.result +++ b/tests/cases/standalone/common/tql/tql-cte.result @@ -676,15 +676,10 @@ WITH time_shifted AS ( ) SELECT * FROM time_shifted; -+---------------------+-----+ -| ts | val | -+---------------------+-----+ -| 1970-01-01T00:00:00 | 3.0 | -| 1970-01-01T00:00:10 | 3.0 | -| 1970-01-01T00:00:20 | 3.0 | -| 1970-01-01T00:00:30 | 3.0 | -| 1970-01-01T00:00:40 | 3.0 | -+---------------------+-----+ ++----+-----+ +| ts | val | ++----+-----+ ++----+-----+ -- SQLNESS REPLACE (peers.*) REDACTED -- SQLNESS REPLACE (partitioning.*) REDACTED @@ -702,7 +697,7 @@ SELECT * FROM time_shifted; | | PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts] | | | 
PromSeriesNormalize: offset=[50000], time index=[ts], filter NaN: [false] | | | PromSeriesDivide: tags=[] | -| | Filter: metric.ts >= TimestampMillisecond(-250000, None) AND metric.ts <= TimestampMillisecond(390000, None) | +| | Filter: metric.ts >= TimestampMillisecond(-350000, None) AND metric.ts <= TimestampMillisecond(290000, None) | | | TableScan: metric | | | ]] | | physical_plan | CooperativeExec | diff --git a/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.result b/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.result index f326523031..ad59b7bd1a 100644 --- a/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.result +++ b/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.result @@ -22,14 +22,14 @@ INSERT INTO test_ttl(ts, val, host) VALUES Affected Rows: 3 -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; +-----+-------+ | val | host | +-----+-------+ +| 1.0 | host1 | | 2.0 | host2 | | 3.0 | host3 | -| 1.0 | host1 | +-----+-------+ -- SQLNESS SLEEP 2s @@ -83,26 +83,26 @@ ADMIN compact_table('phy'); +----------------------------+ --- should not be expired -- -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; +-----+-------+ | val | host | +-----+-------+ +| 1.0 | host1 | | 2.0 | host2 | | 3.0 | host3 | -| 1.0 | host1 | +-----+-------+ -- restart the db, ensure everything is ok -- SQLNESS ARG restart=true -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; +-----+-------+ | val | host | +-----+-------+ +| 1.0 | host1 | | 2.0 | host2 | | 3.0 | host3 | -| 1.0 | host1 | +-----+-------+ DROP TABLE test_ttl; diff --git a/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.sql b/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.sql index f4d37e7fba..d1deaf004a 100644 --- a/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.sql +++ b/tests/cases/standalone/common/ttl/database_ttl_with_metric_engine.sql @@ -13,7 +13,7 @@ INSERT INTO test_ttl(ts, val, host) VALUES (now(), 2, 'host2'), (now(), 3, 'host3'); -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; -- SQLNESS SLEEP 2s ADMIN flush_table('phy'); @@ -35,11 +35,11 @@ ADMIN flush_table('phy'); ADMIN compact_table('phy'); --- should not be expired -- -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; -- restart the db, ensure everything is ok -- SQLNESS ARG restart=true -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; DROP TABLE test_ttl; diff --git a/tests/cases/standalone/common/ttl/metric_engine_ttl.result b/tests/cases/standalone/common/ttl/metric_engine_ttl.result index badcb715f9..6152c0cd58 100644 --- a/tests/cases/standalone/common/ttl/metric_engine_ttl.result +++ b/tests/cases/standalone/common/ttl/metric_engine_ttl.result @@ -13,14 +13,14 @@ INSERT INTO test_ttl(ts, val, host) VALUES Affected Rows: 3 -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; +-----+-------+ | val | host | +-----+-------+ +| 1.0 | host1 | | 2.0 | host2 | | 3.0 | host3 | -| 1.0 | host1 | +-----+-------+ -- SQLNESS SLEEP 2s @@ -74,26 +74,26 @@ ADMIN compact_table('phy'); +----------------------------+ --- should not be expired -- -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; +-----+-------+ | val | host | +-----+-------+ +| 1.0 | host1 | | 2.0 | host2 | | 3.0 | host3 | -| 1.0 | host1 | +-----+-------+ -- restart 
the db, ensure everything is ok -- SQLNESS ARG restart=true -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; +-----+-------+ | val | host | +-----+-------+ +| 1.0 | host1 | | 2.0 | host2 | | 3.0 | host3 | -| 1.0 | host1 | +-----+-------+ DROP TABLE test_ttl; diff --git a/tests/cases/standalone/common/ttl/metric_engine_ttl.sql b/tests/cases/standalone/common/ttl/metric_engine_ttl.sql index a556bc1d9e..c5fce4e916 100644 --- a/tests/cases/standalone/common/ttl/metric_engine_ttl.sql +++ b/tests/cases/standalone/common/ttl/metric_engine_ttl.sql @@ -7,7 +7,7 @@ INSERT INTO test_ttl(ts, val, host) VALUES (now(), 2, 'host2'), (now(), 3, 'host3'); -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; -- SQLNESS SLEEP 2s ADMIN flush_table('phy'); @@ -29,11 +29,11 @@ ADMIN flush_table('phy'); ADMIN compact_table('phy'); --- should not be expired -- -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; -- restart the db, ensure everything is ok -- SQLNESS ARG restart=true -SELECT val, host FROM test_ttl; +SELECT val, host FROM test_ttl ORDER BY host; DROP TABLE test_ttl; diff --git a/tests/cases/standalone/common/types/json/json-structured.result b/tests/cases/standalone/common/types/json/json-structured.result new file mode 100644 index 0000000000..0553831e90 --- /dev/null +++ b/tests/cases/standalone/common/types/json/json-structured.result @@ -0,0 +1,82 @@ +CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured") DEFAULT '{"foo": "bar"}'); + +Error: 1001(Unsupported), Unsupported default constraint for column: 'j', reason: json column cannot have a default value + +CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured")); + +Affected Rows: 0 + +DESC TABLE t; + ++--------+----------------------+-----+------+---------+---------------+ +| Column | Type | Key | Null | Default | Semantic Type | ++--------+----------------------+-----+------+---------+---------------+ +| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | +| j | Json | | YES | | FIELD | ++--------+----------------------+-----+------+---------+---------------+ + +INSERT INTO t VALUES +(1762128001000, '{"int": 1}'), +(1762128002000, '{"int": 2, "list": [0.1, 0.2, 0.3]}'), +(1762128003000, '{"int": 3, "list": [0.4, 0.5, 0.6], "nested": {"a": {"x": "hello"}, "b": {"y": -1}}}'); + +Affected Rows: 3 + +DESC TABLE t; + ++--------+-----------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ +| Column | Type | Key | Null | Default | Semantic Type | ++--------+-----------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ +| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | +| j | Json | | YES | | FIELD | ++--------+-----------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ + +INSERT INTO t VALUES +(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'), +(1762128005000, '{"int": 5, "bool": false, "nested": {"b": {"x": "world"}}}'); + +Affected Rows: 2 + +DESC TABLE t; + 
++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ +| Column | Type | Key | Null | Default | Semantic Type | ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ +| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | +| j | Json | | YES | | FIELD | ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ + +INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}'); + +Affected Rows: 1 + +DESC TABLE t; + ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ +| Column | Type | Key | Null | Default | Semantic Type | ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ +| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP | +| j | Json | | YES | | FIELD | ++--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+ + +INSERT INTO t VALUES (1762128011000, '{}'); + +Error: 1004(InvalidArguments), Invalid InsertRequest, reason: empty json object is not supported, consider adding a dummy field + +SELECT ts, j FROM t order by ts; + ++---------------------+----------------------------------------------------------------------------------------+ +| ts | j | ++---------------------+----------------------------------------------------------------------------------------+ +| 2025-11-03T00:00:01 | {bool: , int: 1, list: , nested: } | +| 2025-11-03T00:00:02 | {bool: , int: 2, list: [0.1, 0.2, 0.3], nested: } | +| 2025-11-03T00:00:03 | {bool: , int: 3, list: [0.4, 0.5, 0.6], nested: {a: {x: hello, y: }, b: {x: , y: -1}}} | +| 2025-11-03T00:00:04 | {bool: true, int: 4, list: , nested: {a: {x: , y: 1}, b: }} | +| 2025-11-03T00:00:05 | {bool: false, int: 5, list: , nested: {a: , b: {x: world, y: }}} | +| 2025-11-03T00:00:06 | {bool: true, int: 6, list: [-6.0], nested: {a: {x: ax, y: 66}, b: {x: bx, y: -66}}} | ++---------------------+----------------------------------------------------------------------------------------+ + +DROP table t; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/types/json/json-structured.sql b/tests/cases/standalone/common/types/json/json-structured.sql new file mode 100644 index 0000000000..8bb10b4b0e --- /dev/null +++ b/tests/cases/standalone/common/types/json/json-structured.sql @@ -0,0 +1,28 @@ +CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured") DEFAULT '{"foo": "bar"}'); + +CREATE TABLE t (ts TIMESTAMP TIME INDEX, j JSON(format = "structured")); + +DESC TABLE t; + +INSERT INTO t VALUES 
+(1762128001000, '{"int": 1}'), +(1762128002000, '{"int": 2, "list": [0.1, 0.2, 0.3]}'), +(1762128003000, '{"int": 3, "list": [0.4, 0.5, 0.6], "nested": {"a": {"x": "hello"}, "b": {"y": -1}}}'); + +DESC TABLE t; + +INSERT INTO t VALUES +(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'), +(1762128005000, '{"int": 5, "bool": false, "nested": {"b": {"x": "world"}}}'); + +DESC TABLE t; + +INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}'); + +DESC TABLE t; + +INSERT INTO t VALUES (1762128011000, '{}'); + +SELECT ts, j FROM t order by ts; + +DROP table t; diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result index 54135b8ecf..c5d2d2750c 100644 --- a/tests/cases/standalone/common/view/create.result +++ b/tests/cases/standalone/common/view/create.result @@ -45,23 +45,23 @@ Affected Rows: 0 SHOW TABLES; -+------------+ -| Tables | -+------------+ -| numbers | -| test_table | -| test_view | -+------------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | +| test_table | +| test_view | ++------------------+ SHOW FULL TABLES; -+------------+-----------------+ -| Tables | Table_type | -+------------+-----------------+ -| numbers | LOCAL TEMPORARY | -| test_table | BASE TABLE | -| test_view | VIEW | -+------------+-----------------+ ++------------------+-----------------+ +| Tables_in_public | Table_type | ++------------------+-----------------+ +| numbers | LOCAL TEMPORARY | +| test_table | BASE TABLE | +| test_view | VIEW | ++------------------+-----------------+ -- psql: \dv SELECT n.nspname as "Schema", @@ -113,7 +113,6 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE; |greptime|information_schema|region_peers|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|information_schema|region_statistics|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|information_schema|routines|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| -|greptime|information_schema|runtime_metrics|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|information_schema|schema_privileges|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|information_schema|schemata|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| |greptime|information_schema|session_status|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|DATETIME||utf8_bin|ID|||Y| @@ -183,11 +182,11 @@ Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.public SHOW TABLES; -+---------+ -| Tables | -+---------+ -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ -- psql: \dv SELECT n.nspname as "Schema", diff --git a/tests/cases/standalone/common/view/view.result b/tests/cases/standalone/common/view/view.result index 1ba9e9ec8f..21d674e54e 100644 --- a/tests/cases/standalone/common/view/view.result +++ b/tests/cases/standalone/common/view/view.result @@ -84,11 +84,11 @@ Affected Rows: 0 SHOW TABLES; -+---------+ -| Tables | -+---------+ -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ SHOW VIEWS; diff --git 
a/tests/cases/standalone/create/recover_created.result b/tests/cases/standalone/create/recover_created.result index c384a7888a..cac18d52c8 100644 --- a/tests/cases/standalone/create/recover_created.result +++ b/tests/cases/standalone/create/recover_created.result @@ -17,11 +17,11 @@ Affected Rows: 0 -- SQLNESS ARG restart=true show tables; -+---------+ -| Tables | -+---------+ -| numbers | -+---------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | ++------------------+ create table t3 (c timestamp time index); diff --git a/tests/cases/standalone/flow-tql/flow_tql.result b/tests/cases/standalone/flow-tql/flow_tql.result index 6fb9386e83..72fe7759ae 100644 --- a/tests/cases/standalone/flow-tql/flow_tql.result +++ b/tests/cases/standalone/flow-tql/flow_tql.result @@ -15,20 +15,22 @@ Affected Rows: 0 SHOW CREATE TABLE cnt_reqs; -+----------+-------------------------------------------+ -| Table | Create Table | -+----------+-------------------------------------------+ -| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | -| | "count(http_requests.val)" DOUBLE NULL, | -| | "ts" TIMESTAMP(3) NOT NULL, | -| | "status_code" STRING NULL, | -| | TIME INDEX ("ts"), | -| | PRIMARY KEY ("status_code") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+----------+-------------------------------------------+ ++----------+---------------------------------------------------+ +| Table | Create Table | ++----------+---------------------------------------------------+ +| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | +| | "count(http_requests.val)" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "status_code" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("status_code") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++----------+---------------------------------------------------+ -- test if sink table is tql queryable TQL EVAL (now() - '1m'::interval, now(), '5s') count_values("status_code", cnt_reqs); @@ -113,7 +115,7 @@ Error: 3001(EngineExecuteQuery), Invalid query: TQL query only supports one f64 SHOW TABLES; +------------------------+ -| Tables | +| Tables_in_public | +------------------------+ | http_requests_two_vals | | numbers | @@ -157,20 +159,22 @@ Affected Rows: 0 SHOW CREATE TABLE cnt_reqs; -+----------+-------------------------------------------+ -| Table | Create Table | -+----------+-------------------------------------------+ -| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | -| | "count(http_requests.val)" DOUBLE NULL, | -| | "ts" TIMESTAMP(3) NOT NULL, | -| | "status_code" STRING NULL, | -| | TIME INDEX ("ts"), | -| | PRIMARY KEY ("status_code") | -| | ) | -| | | -| | ENGINE=mito | -| | | -+----------+-------------------------------------------+ ++----------+---------------------------------------------------+ +| Table | Create Table | ++----------+---------------------------------------------------+ +| cnt_reqs | CREATE TABLE IF NOT EXISTS "cnt_reqs" ( | +| | "count(http_requests.val)" DOUBLE NULL, | +| | "ts" TIMESTAMP(3) NOT NULL, | +| | "status_code" STRING NULL, | +| | TIME INDEX ("ts"), | +| | PRIMARY KEY ("status_code") | +| | ) | +| | | +| | ENGINE=mito | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | ++----------+---------------------------------------------------+ -- test if sink table is tql queryable TQL EVAL (now() - '1m'::interval, now(), '5s') count_values("status_code", cnt_reqs); @@ -258,7 +262,9 @@ SHOW CREATE TABLE 
rate_reqs; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +-----------+-----------------------------------------------------------+ -- test if sink table is tql queryable @@ -337,7 +343,9 @@ SHOW CREATE TABLE rate_reqs; | | ) | | | | | | ENGINE=mito | -| | | +| | WITH( | +| | 'comment' = 'Auto created table by flow engine' | +| | ) | +-----------+-----------------------------------------------------------+ -- test if sink table is tql queryable diff --git a/tests/cases/standalone/tql-explain-analyze/analyze.result b/tests/cases/standalone/tql-explain-analyze/analyze.result index 0d366e5965..67679942d5 100644 --- a/tests/cases/standalone/tql-explain-analyze/analyze.result +++ b/tests/cases/standalone/tql-explain-analyze/analyze.result @@ -127,10 +127,7 @@ TQL ANALYZE (0, 10, '5s') test; +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_|_SortPreservingMergeExec: [k@2 ASC, l@3 ASC, j@1 ASC] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[k@2 ASC, l@3 ASC, j@1 ASC], preserve_partitioning=[true] REDACTED -|_|_|_CooperativeExec REDACTED +| 0_| 0_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[j] REDACTED @@ -158,10 +155,7 @@ TQL ANALYZE (0, 10, '5s') rate(test[10s]); +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_|_SortPreservingMergeExec: [k@2 ASC, l@3 ASC, j@0 ASC] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[k@2 ASC, l@3 ASC, j@0 ASC], preserve_partitioning=[true] REDACTED -|_|_|_CooperativeExec REDACTED +| 0_| 0_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED @@ -199,7 +193,7 @@ TQL ANALYZE FORMAT JSON (0, 10, '5s') test; +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_| {"name":"SortPreservingMergeExec","param":"[k@2 ASC, l@3 ASC, j@1 ASC]","output_rows":0,"REDACTED +| 0_| 0_| {"name":"","param":"","output_rows":0,"REDACTED | 1_| 0_| {"name":"PromInstantManipulateExec","param":"range=[0..10000], lookback=[300000], interval=[5000], time index=[j]","output_rows":0,"REDACTED | 1_| 1_| {"name":"PromInstantManipulateExec","param":"range=[0..10000], lookback=[300000], interval=[5000], time index=[j]","output_rows":0,"REDACTED |_|_| Total rows: 0_| @@ -219,7 +213,7 @@ TQL ANALYZE VERBOSE FORMAT JSON (0, 10, '5s') test; +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_| {"name":"SortPreservingMergeExec","param":"[k@2 ASC, l@3 ASC, j@1 ASC]","output_rows":0,"REDACTED +| 0_| 0_| {"name":"","param":"","output_rows":0,"REDACTED | 1_| 0_| {"name":"PromInstantManipulateExec","param":"range=[0..10000], lookback=[300000], interval=[5000], time index=[j]","output_rows":0,"REDACTED | 1_| 1_| {"name":"PromInstantManipulateExec","param":"range=[0..10000], lookback=[300000], interval=[5000], time index=[j]","output_rows":0,"REDACTED |_|_| Total rows: 0_| @@ -238,10 +232,7 @@ TQL ANALYZE FORMAT TEXT (0, 10, '5s') test; +-+-+-+ | stage | node | plan_| +-+-+-+ -| 0_| 0_|_SortPreservingMergeExec: [k@2 ASC, l@3 ASC, j@1 ASC] REDACTED -|_|_|_CooperativeExec REDACTED -|_|_|_SortExec: expr=[k@2 ASC, l@3 ASC, j@1 ASC], preserve_partitioning=[true] REDACTED -|_|_|_CooperativeExec REDACTED +| 0_| 0_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| | 1_| 0_|_PromInstantManipulateExec: range=[0..10000], lookback=[300000], interval=[5000], time index=[j] REDACTED @@ -298,17 +289,25 @@ TQL ANALYZE sum(test2); 
|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED |_|_|_RepartitionExec: partitioning=REDACTED |_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[sum(test2.greptime_value)] REDACTED -|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, greptime_value@1 as greptime_value] REDACTED -|_|_|_CooperativeExec REDACTED |_|_|_CooperativeExec REDACTED |_|_|_MergeScanExec: REDACTED |_|_|_| -| 1_| 0_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED +| 1_| 0_|_AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(test2.greptime_value)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(test2.greptime_value)] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, greptime_value@1 as greptime_value] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesDivideExec: tags=["shard"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED |_|_|_| -| 1_| 1_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED +| 1_| 1_|_AggregateExec: mode=FinalPartitioned, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(test2.greptime_value)] REDACTED +|_|_|_CoalesceBatchesExec: target_batch_size=8192 REDACTED +|_|_|_RepartitionExec: partitioning=REDACTED +|_|_|_AggregateExec: mode=Partial, gby=[greptime_timestamp@0 as greptime_timestamp], aggr=[__sum_state(test2.greptime_value)] REDACTED +|_|_|_ProjectionExec: expr=[greptime_timestamp@0 as greptime_timestamp, greptime_value@1 as greptime_value] REDACTED +|_|_|_PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[greptime_timestamp] REDACTED |_|_|_PromSeriesDivideExec: tags=["shard"] REDACTED |_|_|_CooperativeExec REDACTED |_|_|_SeriesScan: region=REDACTED, "partition_count":{"count":0, "mem_ranges":0, "files":0, "file_ranges":0}, "distribution":"PerSeries" REDACTED diff --git a/tests/upgrade-compat/standalone/common/test_ttl.result b/tests/upgrade-compat/standalone/common/test_ttl.result index d06bc629b6..c368d13e4c 100644 --- a/tests/upgrade-compat/standalone/common/test_ttl.result +++ b/tests/upgrade-compat/standalone/common/test_ttl.result @@ -26,14 +26,14 @@ Affected Rows: 1 -- SQLNESS ARG version=latest SHOW TABLES; -+---------------+ -| Tables | -+---------------+ -| numbers | -| test_ttl_0s | -| test_ttl_1s | -| test_ttl_none | -+---------------+ ++------------------+ +| Tables_in_public | ++------------------+ +| numbers | +| test_ttl_0s | +| test_ttl_1s | +| test_ttl_none | ++------------------+ SHOW CREATE TABLE test_ttl_1s;