Merge commit 'd304df6e75' into feat/flush-hook-extension-point

2026-06-04 22:30:37 +00:00 · 2026-06-04 04:20:27 -07:00
parent 848886a7f9 d304df6e75
commit 12269ca452
72 changed files with 3536 additions and 423 deletions
--- a/.github/workflows/dev-build.yml
+++ b/.github/workflows/dev-build.yml
@@ -30,7 +30,7 @@ on:
      linux_arm64_runner:
        type: choice
        description: The runner used to build linux-arm64 artifacts
-        default: ec2-c6g.4xlarge-arm64
+        default: ec2-c6g.8xlarge-arm64
        options:
          - ec2-c6g.xlarge-arm64 # 4C8G
          - ec2-c6g.2xlarge-arm64 # 8C16G
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -27,7 +27,7 @@ on:
      linux_arm64_runner:
        type: choice
        description: The runner used to build linux-arm64 artifacts
-        default: ec2-c6g.4xlarge-arm64
+        default: ec2-c6g.8xlarge-arm64
        options:
          - ec2-c6g.xlarge-arm64 # 4C8G
          - ec2-c6g.2xlarge-arm64 # 8C16G
--- a/.github/workflows/nightly-jsonbench.yaml
+++ b/.github/workflows/nightly-jsonbench.yaml
@@ -1,19 +1,81 @@
 name: Nightly JSONBench

 on:
-  schedule:
-    # Trigger at 00:00(Asia/Shanghai) on every weekday.
-    - cron: "0 16 * * 0-4"
+  workflow_run:
+    workflows: [ "GreptimeDB Nightly Build" ]
+    types: [ completed ]
  workflow_dispatch:
+    inputs:
+      run_id:
+        description: The nightly build workflow run id to download GreptimeDB artifacts from
+        required: true
+        type: string
+
+permissions:
+  actions: read
+  contents: read

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 jobs:
+  resolve-artifact:
+    name: Resolve GreptimeDB nightly artifact
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
+    runs-on: ubuntu-latest
+    outputs:
+      artifact-name: ${{ steps.find-artifact.outputs.artifact-name }}
+      run-id: ${{ steps.resolve-run-id.outputs.run-id }}
+    steps:
+      - name: Resolve nightly build run id
+        id: resolve-run-id
+        shell: bash
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
+          INPUT_RUN_ID: ${{ inputs.run_id }}
+        run: |
+          set -euo pipefail
+
+          if [[ "${EVENT_NAME}" == "workflow_dispatch" ]]; then
+            run_id="${INPUT_RUN_ID}"
+          else
+            run_id="${WORKFLOW_RUN_ID}"
+          fi
+
+          if [[ ! "${run_id}" =~ ^[0-9]+$ ]]; then
+            echo "Invalid workflow run id: ${run_id}"
+            exit 1
+          fi
+
+          echo "run-id=${run_id}" >> "${GITHUB_OUTPUT}"
+
+      - name: Find GreptimeDB nightly artifact
+        id: find-artifact
+        shell: bash
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RUN_ID: ${{ steps.resolve-run-id.outputs.run-id }}
+        run: |
+          set -euo pipefail
+
+          artifact_name=$(gh api "repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/artifacts" --paginate \
+            --jq '.artifacts[] | select(.name | test("^greptime-linux-arm64-nightly-[0-9]{8}-[0-9a-f]+$")) | .name' \
+            | head -n 1)
+
+          if [[ -z "${artifact_name}" ]]; then
+            echo "Cannot find linux arm64 nightly artifact in workflow run ${RUN_ID}."
+            exit 1
+          fi
+
+          echo "Download GreptimeDB artifact: ${artifact_name}"
+          echo "artifact-name=${artifact_name}" >> "${GITHUB_OUTPUT}"
+
  allocate-runner:
    name: Allocate runner
-    if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
+    needs: [ resolve-artifact ]
    runs-on: ubuntu-latest
    outputs:
      linux-arm64-runner: ${{ steps.start-linux-arm64-runner.outputs.label }}
@@ -43,55 +105,50 @@ jobs:

  jsonbench:
    name: Run JSONBench
-    if: ${{ github.repository == 'GreptimeTeam/greptimedb' }}
-    needs: [ allocate-runner ]
+    if: ${{ github.repository == 'GreptimeTeam/greptimedb' && (github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success') }}
+    needs: [ resolve-artifact, allocate-runner ]
    runs-on: ${{ needs.allocate-runner.outputs.linux-arm64-runner }}
    timeout-minutes: 120
    env:
-      JSONBENCH_DATA_DIR: /home/runner/data/bluesky
-      JSONBENCH_OUTPUT_PREFIX: _ubuntu-latest
+      JSONBENCH_OUTPUT_PREFIX: _linux-arm64
    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
+      - name: Download GreptimeDB nightly artifact
+        uses: actions/download-artifact@v4
        with:
-          fetch-depth: 0
-          persist-credentials: false
+          name: ${{ needs.resolve-artifact.outputs.artifact-name }}
+          path: greptimedb-artifact
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ needs.resolve-artifact.outputs.run-id }}

-      - uses: arduino/setup-protoc@v3
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-
-      - uses: actions-rust-lang/setup-rust-toolchain@v1
-
-      - name: Rust Cache
-        uses: Swatinem/rust-cache@v2
-        with:
-          shared-key: "nightly-jsonbench"
-          cache-all-crates: "true"
-          save-if: ${{ github.ref == 'refs/heads/main' }}
-
-      - name: Build GreptimeDB
-        run: cargo build --profile nightly --bin greptime
-
-      - name: Reclaim disk space
+      - name: Prepare GreptimeDB binary
+        env:
+          # Pass the artifact name through the environment instead of expanding the
+          # expression inside the script, so its value is never parsed as shell code.
+          ARTIFACT_NAME: ${{ needs.resolve-artifact.outputs.artifact-name }}
        shell: bash
        run: |
          set -euo pipefail

-          mkdir -p "${RUNNER_TEMP}/greptimedb-bin"
-          cp ./target/nightly/greptime "${RUNNER_TEMP}/greptimedb-bin/greptime"
-          chmod +x "${RUNNER_TEMP}/greptimedb-bin/greptime"
-
-          rm -rf ./target
+          # The artifact contains "<artifact-name>.tar.gz", which unpacks to "<artifact-name>/greptime".
+          tar -xzf "greptimedb-artifact/${ARTIFACT_NAME}.tar.gz"
+          cp "${ARTIFACT_NAME}/greptime" ./greptime
+          chmod +x ./greptime
+          rm -rf greptimedb-artifact "${ARTIFACT_NAME}"

      - name: Run JSONBench
+        env:
+          # TODO(LFC): Change to "3" (100m) when JSON2 ingestion performance is optimized.
+          # Quoted so the dataset choice is an explicit string rather than a YAML integer.
+          JSONBENCH_DATASET: "2"
        shell: bash
        run: |
          set -euo pipefail

-          cd "${RUNNER_TEMP}"
-          cp "${RUNNER_TEMP}/greptimedb-bin/greptime" ./greptime
-          chmod +x ./greptime
+          export JSONBENCH_DATA_DIR="/root/data/bluesky"
+          echo "Use JSONBench data directory ${JSONBENCH_DATA_DIR}"
+
+          echo "Cloning JSONBench"
+          git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
+
+          echo "Downloading JSONBench dataset choice ${JSONBENCH_DATASET} to ${JSONBENCH_DATA_DIR}"
+          mkdir -p "${JSONBENCH_DATA_DIR}"
+          # Answer the download script's dataset prompt on stdin. A fixed format string
+          # keeps any '%' or '\' in the value from being interpreted by printf.
+          printf '%s\n' "${JSONBENCH_DATASET}" | ./JSONBench/download_data.sh
+          downloaded_files=$(find "${JSONBENCH_DATA_DIR}" -type f | wc -l)
+          echo "Downloaded JSONBench dataset files: ${downloaded_files}"

          export GREPTIMEDB_STANDALONE__WAL__DIR=greptimedb_data/wal
          export GREPTIMEDB_STANDALONE__STORAGE__DATA_HOME=greptimedb_data
@@ -100,10 +157,12 @@ jobs:
          export GREPTIMEDB_STANDALONE__HTTP__BODY_LIMIT=1GB
          export GREPTIMEDB_STANDALONE__HTTP__TIMEOUT=500s

+          echo "Starting GreptimeDB standalone"
          ./greptime standalone start > greptimedb.log 2>&1 &
          greptime_pid=$!
          trap 'kill "${greptime_pid}" 2>/dev/null || true' EXIT

+          echo "Waiting for GreptimeDB health check"
          until curl -s --fail -o /dev/null http://localhost:4000/health; do
            if ! kill -0 "${greptime_pid}" 2>/dev/null; then
              cat greptimedb.log
@@ -111,12 +170,14 @@ jobs:
            fi
            sleep 1
          done
+          echo "GreptimeDB is ready"

-          git clone --branch greptimedb-new-json --depth 1 https://github.com/GreptimeTeam/JSONBench.git JSONBench
          cp ./greptime JSONBench/greptimedb/greptime

          cd JSONBench/greptimedb
-          ./main.sh 3 "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
+          echo "Running JSONBench main.sh with dataset choice ${JSONBENCH_DATASET} and install=false"
+          # Quote the dataset choice like the other arguments so it is always passed as exactly one word.
+          ./main.sh "${JSONBENCH_DATASET}" "${JSONBENCH_DATA_DIR}" success.log error.log "${JSONBENCH_OUTPUT_PREFIX}" false
+          echo "JSONBench finished"

      - name: Upload JSONBench results
        if: always()
@@ -124,21 +185,21 @@ jobs:
        with:
          name: jsonbench-results
          path: |
-            ${{ runner.temp }}/greptimedb.log
-            ${{ runner.temp }}/JSONBench/greptimedb/*.log
-            ${{ runner.temp }}/JSONBench/greptimedb/*.total_size
-            ${{ runner.temp }}/JSONBench/greptimedb/*.data_size
-            ${{ runner.temp }}/JSONBench/greptimedb/*.index_size
-            ${{ runner.temp }}/JSONBench/greptimedb/*.count
-            ${{ runner.temp }}/JSONBench/greptimedb/*.results_runtime
-            ${{ runner.temp }}/JSONBench/greptimedb/*.query_results
+            ./greptimedb.log
+            ./JSONBench/greptimedb/*.log
+            ./JSONBench/greptimedb/*.total_size
+            ./JSONBench/greptimedb/*.data_size
+            ./JSONBench/greptimedb/*.index_size
+            ./JSONBench/greptimedb/*.count
+            ./JSONBench/greptimedb/*.results_runtime
+            ./JSONBench/greptimedb/*.query_results
          if-no-files-found: ignore
          retention-days: 7

  stop-linux-arm64-runner:
    name: Stop Linux ARM64 runner
    # Runs as the last job, even when earlier jobs fail, as long as a runner instance was allocated, to make sure that the runner is released.
-    if: ${{ always() }}
+    if: ${{ always() && needs.allocate-runner.outputs.linux-arm64-ec2-runner-instance-id != '' }}
    runs-on: ubuntu-latest
    needs: [
      allocate-runner,
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -259,7 +259,7 @@ tracing-opentelemetry = "0.31.0"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] }
 typetag = "0.2"
 uuid = { version = "1.17", features = ["serde", "v4", "v7", "fast-rng"] }
-vrl = "0.25"
+vrl = "0.33"
 zstd = "0.13"
 # DO_NOT_REMOVE_THIS: END_OF_EXTERNAL_DEPENDENCIES

--- a/config/config.md
+++ b/config/config.md
@@ -451,6 +451,7 @@
 | `init_regions_in_background` | Bool | `false` | Initialize all regions in the background during the startup.<br/>By default, it provides services after all regions have been initialized. |
 | `init_regions_parallelism` | Integer | `16` | Parallelism of initializing regions. |
 | `max_concurrent_queries` | Integer | `0` | The maximum concurrent queries allowed to be executed. Zero means unlimited. |
+| `concurrent_query_limiter_timeout` | String | `100ms` | Timeout to acquire a permit from the concurrent query limiter when `max_concurrent_queries` is reached. |
 | `enable_telemetry` | Bool | `true` | Enable telemetry to collect anonymous usage data. Enabled by default. |
 | `http` | -- | -- | The HTTP server options. |
 | `http.addr` | String | `127.0.0.1:4000` | The address to bind the HTTP server. |
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -20,6 +20,9 @@ init_regions_parallelism = 16
 ## The maximum concurrent queries allowed to be executed. Zero means unlimited.
 max_concurrent_queries = 0

+## Timeout to acquire a permit from the concurrent query limiter when `max_concurrent_queries` is reached.
+concurrent_query_limiter_timeout = "100ms"
+
 ## Enable telemetry to collect anonymous usage data. Enabled by default.
 #+ enable_telemetry = true

--- a/src/cli/src/data/export_v2/command.rs
+++ b/src/cli/src/data/export_v2/command.rs
@@ -1077,7 +1077,9 @@ async fn verify_snapshot(storage: &OpenDalStorage) -> Result<VerifyReport> {
            ));
        }
        let data_files = storage.list_files_recursive("data/").await?;
-        if let Some(path) = data_files.first() {
+        // Report the lexicographically smallest path so the message is stable
+        // regardless of listing order across backends.
+        if let Some(path) = data_files.iter().min() {
            report.push_error(format!(
                "Schema-only snapshot should not contain data files (found '{}')",
                path
@@ -1103,75 +1105,113 @@ fn summarize_chunks(manifest: &Manifest) -> VerifyChunkSummary {
    }
 }

+/// A data file declared by a completed chunk that is expected to exist in storage.
+#[derive(Debug)]
+struct ChunkFile {
+    chunk_id: u32,
+    path: String,
+}
+
+/// Expected snapshot contents derived purely from the manifest (no object-store IO).
+///
+/// Separating planning from scanning makes it obvious which problems come from
+/// the manifest alone and which require comparing against actual storage.
+#[derive(Debug, Default)]
+struct VerifyPlan {
+    /// Valid data files declared by completed chunks; each must exist in storage.
+    files_to_check: Vec<ChunkFile>,
+    /// All syntactically-safe data paths declared by any chunk, regardless of
+    /// status. Used as the orphan-detection baseline so a listed-but-invalid
+    /// file is not also reported as unexpected.
+    claimed_data_files: HashSet<String>,
+    /// Total data-file references in completed chunks (valid + invalid).
+    data_files_total: usize,
+    /// Problems detectable from the manifest alone.
+    problems: Vec<VerifyProblem>,
+}
+
+/// Actual data files discovered under `data/` (the only object-store IO in
+/// chunk/data-file verification).
+#[derive(Debug)]
+struct VerifyDataScan {
+    existing_data_files: HashSet<String>,
+}
+
+/// Result of reconciling the manifest plan against the storage scan.
+#[derive(Debug, Default)]
+struct VerifyOutcome {
+    data_files_total: usize,
+    data_files_verified: usize,
+    problems: Vec<VerifyProblem>,
+}
+
 async fn verify_chunks_and_data_files(
    storage: &OpenDalStorage,
    report: &mut VerifyReport,
 ) -> Result<()> {
-    let existing_files: HashSet<_> = storage
-        .list_files_recursive("data/")
-        .await?
-        .into_iter()
-        .collect();
-    let mut data_files_total = 0;
-    let mut data_files_verified = 0;
-    let mut problems = Vec::new();
-    let mut seen_chunk_ids = HashSet::new();
-    let mut claimed_data_files = HashSet::new();
+    let plan = build_verify_plan(&report.manifest);
+    let scan = scan_data_files(storage).await?;
+    let outcome = reconcile_plan_with_scan(plan, &scan);

-    for chunk in &report.manifest.chunks {
+    report.data_files_total = outcome.data_files_total;
+    report.data_files_verified = outcome.data_files_verified;
+    report.problems.extend(outcome.problems);
+
+    Ok(())
+}
+
+/// Builds the expected-state plan from the manifest. Pure; performs no IO.
+fn build_verify_plan(manifest: &Manifest) -> VerifyPlan {
+    let mut plan = VerifyPlan::default();
+    let mut seen_chunk_ids = HashSet::new();
+
+    for chunk in &manifest.chunks {
        if !seen_chunk_ids.insert(chunk.id) {
-            problems.push(VerifyProblem {
+            plan.problems.push(VerifyProblem {
                severity: VerifySeverity::Error,
                message: format!("Chunk {}: duplicate chunk id", chunk.id),
            });
        }
        for file in &chunk.files {
            if let Some(path) = safe_manifest_data_file_path(file) {
-                claimed_data_files.insert(path.to_string());
+                plan.claimed_data_files.insert(path.to_string());
            }
        }

        match chunk.status {
            ChunkStatus::Completed => {
                if chunk.files.is_empty() {
-                    problems.push(VerifyProblem {
+                    plan.problems.push(VerifyProblem {
                        severity: VerifySeverity::Error,
                        message: format!("Chunk {}: completed chunk has no data files", chunk.id),
                    });
                    continue;
                }
-                let allowed_prefixes = report
-                    .manifest
+                let allowed_prefixes = manifest
                    .schemas
                    .iter()
                    .map(|schema| data_dir_for_schema_chunk(schema, chunk.id))
                    .collect::<Vec<_>>();
                for file in &chunk.files {
-                    data_files_total += 1;
-                    let Some(path) = valid_manifest_data_file_path(file, &allowed_prefixes) else {
-                        problems.push(VerifyProblem {
+                    plan.data_files_total += 1;
+                    match valid_manifest_data_file_path(file, &allowed_prefixes) {
+                        Some(path) => plan.files_to_check.push(ChunkFile {
+                            chunk_id: chunk.id,
+                            path: path.to_string(),
+                        }),
+                        None => plan.problems.push(VerifyProblem {
                            severity: VerifySeverity::Error,
                            message: format!(
                                "Chunk {}: invalid data file path '{}'",
                                chunk.id, file
                            ),
-                        });
-                        continue;
-                    };
-
-                    if existing_files.contains(path) {
-                        data_files_verified += 1;
-                    } else {
-                        problems.push(VerifyProblem {
-                            severity: VerifySeverity::Error,
-                            message: format!("Chunk {}: missing file '{}'", chunk.id, path),
-                        });
+                        }),
                    }
                }
            }
            ChunkStatus::Skipped => {
                if !chunk.files.is_empty() {
-                    problems.push(VerifyProblem {
+                    plan.problems.push(VerifyProblem {
                        severity: VerifySeverity::Error,
                        message: format!(
                            "Chunk {}: skipped chunk should not list data files",
@@ -1181,20 +1221,20 @@ async fn verify_chunks_and_data_files(
                }
            }
            ChunkStatus::Pending => {
-                problems.push(VerifyProblem {
+                plan.problems.push(VerifyProblem {
                    severity: VerifySeverity::Error,
                    message: format!("Chunk {}: status is 'pending'", chunk.id),
                });
            }
            ChunkStatus::InProgress => {
-                problems.push(VerifyProblem {
+                plan.problems.push(VerifyProblem {
                    severity: VerifySeverity::Error,
                    message: format!("Chunk {}: status is 'in_progress'", chunk.id),
                });
            }
            ChunkStatus::Failed => {
                let reason = chunk.error.as_deref().unwrap_or("unknown error");
-                problems.push(VerifyProblem {
+                plan.problems.push(VerifyProblem {
                    severity: VerifySeverity::Error,
                    message: format!("Chunk {}: status is 'failed' (error: {})", chunk.id, reason),
                });
@@ -1202,20 +1242,60 @@ async fn verify_chunks_and_data_files(
        }
    }

-    for path in &existing_files {
-        if !claimed_data_files.contains(path) {
+    plan
+}
+
+/// Lists all data files under `data/`. This is the only object-store IO in
+/// chunk/data-file verification.
+async fn scan_data_files(storage: &OpenDalStorage) -> Result<VerifyDataScan> {
+    let existing_data_files = storage
+        .list_files_recursive("data/")
+        .await?
+        .into_iter()
+        .collect();
+    Ok(VerifyDataScan {
+        existing_data_files,
+    })
+}
+
+/// Reconciles the manifest plan against the storage scan. Pure; performs no IO.
+///
+/// Emits missing-file problems for expected files absent from storage and
+/// unexpected-file problems for storage files no chunk claims. Unexpected files
+/// are sorted by path so output is deterministic regardless of listing order.
+fn reconcile_plan_with_scan(plan: VerifyPlan, scan: &VerifyDataScan) -> VerifyOutcome {
+    let mut problems = plan.problems;
+    let mut data_files_verified = 0;
+
+    for file in &plan.files_to_check {
+        if scan.existing_data_files.contains(&file.path) {
+            data_files_verified += 1;
+        } else {
            problems.push(VerifyProblem {
                severity: VerifySeverity::Error,
-                message: format!("Unexpected data file '{}' is not listed in manifest", path),
+                message: format!("Chunk {}: missing file '{}'", file.chunk_id, file.path),
            });
        }
    }

-    report.data_files_total = data_files_total;
-    report.data_files_verified = data_files_verified;
-    report.problems.extend(problems);
+    let mut orphans: Vec<&String> = scan
+        .existing_data_files
+        .iter()
+        .filter(|path| !plan.claimed_data_files.contains(*path))
+        .collect();
+    orphans.sort();
+    for path in orphans {
+        problems.push(VerifyProblem {
+            severity: VerifySeverity::Error,
+            message: format!("Unexpected data file '{}' is not listed in manifest", path),
+        });
+    }

-    Ok(())
+    VerifyOutcome {
+        data_files_total: plan.data_files_total,
+        data_files_verified,
+        problems,
+    }
 }

 fn valid_manifest_data_file_path<'a>(
@@ -2294,6 +2374,90 @@ mod tests {
        );
    }

+    #[test]
+    fn test_build_verify_plan_classifies_chunks_without_io() {
+        let mut manifest = test_manifest(
+            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+            false,
+            true,
+        );
+        // test_manifest(complete) gives: chunk 1 completed (1 file), chunk 2 skipped.
+        let mut failed = ChunkMeta::new(3, TimeRange::unbounded());
+        failed.mark_failed("boom".to_string());
+        manifest.chunks.push(failed);
+        manifest
+            .chunks
+            .push(ChunkMeta::new(4, TimeRange::unbounded()));
+
+        let plan = build_verify_plan(&manifest);
+
+        assert_eq!(plan.files_to_check.len(), 1);
+        assert_eq!(plan.files_to_check[0].chunk_id, 1);
+        assert_eq!(plan.files_to_check[0].path, "data/public/1/file.parquet");
+        assert_eq!(plan.data_files_total, 1);
+        assert!(
+            plan.claimed_data_files
+                .contains("data/public/1/file.parquet")
+        );
+        assert_eq!(plan.problems.len(), 2);
+        assert!(
+            plan.problems
+                .iter()
+                .any(|problem| problem.message.contains("status is 'failed'"))
+        );
+        assert!(
+            plan.problems
+                .iter()
+                .any(|problem| problem.message.contains("status is 'pending'"))
+        );
+    }
+
+    #[tokio::test]
+    async fn test_verify_snapshot_produces_deterministic_problem_output() {
+        let dir = tempdir().unwrap();
+        let manifest = test_manifest(
+            chrono::Utc.with_ymd_and_hms(2026, 1, 1, 0, 0, 0).unwrap(),
+            false,
+            true,
+        );
+        write_root_manifest(dir.path(), manifest);
+        write_snapshot_file(dir.path(), "schema/schemas.json", b"[]");
+        write_default_ddl_files(dir.path());
+        write_snapshot_file(dir.path(), "data/public/1/file.parquet", b"data");
+        // Many orphan files under a known chunk prefix to stress ordering.
+        for i in 0..50 {
+            write_snapshot_file(
+                dir.path(),
+                &format!("data/public/1/orphan_{:02}.parquet", i),
+                b"x",
+            );
+        }
+
+        let storage = file_storage_for_dir(dir.path());
+        let messages = |report: &VerifyReport| {
+            report
+                .problems
+                .iter()
+                .map(|problem| problem.message.clone())
+                .collect::<Vec<_>>()
+        };
+        let first = messages(&verify_snapshot(&storage).await.unwrap());
+        let second = messages(&verify_snapshot(&storage).await.unwrap());
+
+        // Output is identical across runs despite HashSet-based scanning.
+        assert_eq!(first, second);
+
+        let orphans = first
+            .iter()
+            .filter(|message| message.contains("Unexpected data file"))
+            .cloned()
+            .collect::<Vec<_>>();
+        assert_eq!(orphans.len(), 50);
+        let mut sorted = orphans.clone();
+        sorted.sort();
+        assert_eq!(orphans, sorted);
+    }
+
    fn write_test_manifest(root: &std::path::Path, dir: &str, manifest: Manifest) {
        let snapshot_dir = root.join(dir);
        std::fs::create_dir_all(&snapshot_dir).unwrap();
--- a/src/cmd/src/datanode/scanbench.rs
+++ b/src/cmd/src/datanode/scanbench.rs
@@ -524,6 +524,7 @@ impl ScanbenchCommand {
            options: HashMap::default(),
            skip_wal_replay: !self.enable_wal,
            checkpoint: None,
+            requirements: Default::default(),
        };

        engine
--- a/src/common/datasource/src/file_format.rs
+++ b/src/common/datasource/src/file_format.rs
@@ -61,6 +61,7 @@ pub const FORMAT_COMPRESSION_TYPE: &str = "compression_type";
 pub const FORMAT_DELIMITER: &str = "delimiter";
 pub const FORMAT_SCHEMA_INFER_MAX_RECORD: &str = "schema_infer_max_record";
 pub const FORMAT_HAS_HEADER: &str = "has_header";
+pub const FORMAT_SKIP_BAD_RECORDS: &str = "skip_bad_records";
 pub const FORMAT_TYPE: &str = "format";
 pub const FILE_PATTERN: &str = "pattern";
 pub const TIMESTAMP_FORMAT: &str = "timestamp_format";
--- a/src/common/datasource/src/file_format/csv.rs
+++ b/src/common/datasource/src/file_format/csv.rs
@@ -13,15 +13,24 @@
 // limitations under the License.

 use std::collections::HashMap;
+use std::io;
 use std::str::FromStr;
+use std::sync::Arc;
+use std::task::Poll;

 use arrow::csv::reader::Format;
 use arrow::csv::{self, WriterBuilder};
+use arrow::error::ArrowError;
 use arrow::record_batch::RecordBatch;
-use arrow_schema::Schema;
+use arrow_schema::{Schema, SchemaRef};
 use async_trait::async_trait;
+use bytes::{Buf, Bytes};
 use common_runtime;
+use common_telemetry::warn;
 use datafusion::physical_plan::SendableRecordBatchStream;
+use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
+use futures::StreamExt;
+use futures::stream::BoxStream;
 use object_store::ObjectStore;
 use snafu::ResultExt;
 use tokio_util::compat::FuturesAsyncReadCompatExt;
@@ -34,9 +43,12 @@ use crate::file_format::{self, FileFormat, stream_to_file};
 use crate::share_buffer::SharedBuffer;
 use crate::util::normalize_infer_schema;

+const SKIP_BAD_RECORDS_BATCH_SIZE: usize = 1;
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct CsvFormat {
    pub has_header: bool,
+    pub skip_bad_records: bool,
    pub delimiter: u8,
    pub schema_infer_max_record: Option<usize>,
    pub compression_type: CompressionType,
@@ -76,13 +88,11 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
                })?);
        };
        if let Some(has_header) = value.get(file_format::FORMAT_HAS_HEADER) {
-            format.has_header = has_header.parse().map_err(|_| {
-                error::ParseFormatSnafu {
-                    key: file_format::FORMAT_HAS_HEADER,
-                    value: has_header,
-                }
-                .build()
-            })?;
+            format.has_header = parse_bool(file_format::FORMAT_HAS_HEADER, has_header)?;
+        };
+        if let Some(skip_bad_records) = value.get(file_format::FORMAT_SKIP_BAD_RECORDS) {
+            format.skip_bad_records =
+                parse_bool(file_format::FORMAT_SKIP_BAD_RECORDS, skip_bad_records)?;
        };
        if let Some(timestamp_format) = value.get(file_format::TIMESTAMP_FORMAT) {
            format.timestamp_format = Some(timestamp_format.clone());
@@ -97,10 +107,17 @@ impl TryFrom<&HashMap<String, String>> for CsvFormat {
    }
 }

+fn parse_bool(key: &'static str, value: &str) -> Result<bool> {
+    value
+        .parse()
+        .map_err(|_| error::ParseFormatSnafu { key, value }.build())
+}
+
 impl Default for CsvFormat {
    fn default() -> Self {
        Self {
            has_header: true,
+            skip_bad_records: false,
            delimiter: b',',
            schema_infer_max_record: Some(file_format::DEFAULT_SCHEMA_INFER_MAX_RECORD),
            compression_type: CompressionType::Uncompressed,
@@ -189,10 +206,136 @@ impl DfRecordBatchEncoder for csv::Writer<SharedBuffer> {
    }
 }

+/// Builds a CSV stream that can skip selected record-level parse/cast errors.
+///
+/// This recovery path intentionally uses one-record batches. It is slower than
+/// normal CSV scanning, but keeps each parse/cast failure isolated to a single
+/// record. Arrow's CSV decoder clears buffered rows before type parsing, so a
+/// failed multi-row flush cannot be safely retried row by row without replaying
+/// input bytes.
+pub async fn tolerant_csv_stream(
+    store: &ObjectStore,
+    path: &str,
+    schema: SchemaRef,
+    projection: Vec<usize>,
+    format: &CsvFormat,
+) -> Result<SendableRecordBatchStream> {
+    let meta = store
+        .stat(path)
+        .await
+        .context(error::ReadObjectSnafu { path })?;
+
+    let reader = store
+        .reader(path)
+        .await
+        .context(error::ReadObjectSnafu { path })?
+        .into_bytes_stream(0..meta.content_length())
+        .await
+        .context(error::ReadObjectSnafu { path })?;
+
+    let reader = format.compression_type.convert_stream(reader).boxed();
+    tolerant_csv_stream_from_reader(
+        reader,
+        path,
+        schema,
+        projection,
+        format.has_header,
+        format.delimiter,
+    )
+}
+
+fn tolerant_csv_stream_from_reader(
+    reader: BoxStream<'static, io::Result<Bytes>>,
+    path: &str,
+    schema: SchemaRef,
+    projection: Vec<usize>,
+    has_header: bool,
+    delimiter: u8,
+) -> Result<SendableRecordBatchStream> {
+    let projected_schema = Arc::new(
+        schema
+            .project(&projection)
+            .context(error::InferSchemaSnafu)?,
+    );
+    let mut decoder = csv::ReaderBuilder::new(schema)
+        .with_header(has_header)
+        .with_delimiter(delimiter)
+        .with_batch_size(SKIP_BAD_RECORDS_BATCH_SIZE)
+        .with_projection(projection)
+        .build_decoder();
+
+    let path = path.to_string();
+    let mut upstream = reader.fuse();
+    let mut buffered = Bytes::new();
+    let mut input_finished = false;
+    let stream = futures::stream::poll_fn(move |cx| {
+        loop {
+            while !input_finished {
+                if buffered.is_empty() {
+                    match futures::ready!(upstream.poll_next_unpin(cx)) {
+                        Some(Ok(bytes)) if bytes.is_empty() => continue,
+                        Some(Ok(bytes)) => buffered = bytes,
+                        Some(Err(error)) => return Poll::Ready(Some(Err(error.into()))),
+                        None => input_finished = true,
+                    }
+                }
+
+                let decoded = decoder.decode(buffered.as_ref())?;
+                if decoded > 0 {
+                    buffered.advance(decoded);
+                    continue;
+                }
+
+                if decoder.capacity() == 0 || input_finished {
+                    break;
+                }
+
+                if buffered.is_empty() {
+                    continue;
+                }
+
+                return Poll::Ready(Some(Err(ArrowError::ParseError(
+                    "CSV decoder made no progress while input bytes remain".to_string(),
+                ))));
+            }
+
+            match decoder.flush() {
+                Ok(Some(batch)) => return Poll::Ready(Some(Ok(batch))),
+                Ok(None) if input_finished => return Poll::Ready(None),
+                Ok(None) => continue,
+                Err(error) if is_skippable_arrow_error(&error) => {
+                    warn!(
+                        "Skipping bad CSV record while copying from {}: {}",
+                        path, error
+                    );
+                }
+                Err(error) => return Poll::Ready(Some(Err(error))),
+            }
+        }
+    })
+    .map(|result: std::result::Result<RecordBatch, ArrowError>| result.map_err(Into::into));
+
+    Ok(Box::pin(RecordBatchStreamAdapter::new(
+        projected_schema,
+        stream,
+    )))
+}
+
+pub fn is_skippable_arrow_error(error: &ArrowError) -> bool {
+    matches!(
+        error,
+        ArrowError::ParseError(_)
+            | ArrowError::CastError(_)
+            | ArrowError::ComputeError(_)
+            | ArrowError::InvalidArgumentError(_)
+    )
+}
+
 #[cfg(test)]
 mod tests {
    use std::sync::Arc;

+    use arrow_schema::{DataType, Field};
    use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
    use common_recordbatch::{RecordBatch, RecordBatches};
    use common_test_util::find_workspace_path;
@@ -205,7 +348,7 @@ mod tests {
    use super::*;
    use crate::file_format::{
        FORMAT_COMPRESSION_TYPE, FORMAT_DELIMITER, FORMAT_HAS_HEADER,
-        FORMAT_SCHEMA_INFER_MAX_RECORD, FileFormat, file_to_stream,
+        FORMAT_SCHEMA_INFER_MAX_RECORD, FORMAT_SKIP_BAD_RECORDS, FileFormat, file_to_stream,
    };
    use crate::test_util::{format_schema, test_store};

@@ -331,11 +474,29 @@ mod tests {
                schema_infer_max_record: Some(2000),
                delimiter: b'\t',
                has_header: false,
+                skip_bad_records: false,
                timestamp_format: None,
                time_format: None,
                date_format: None
            }
        );
+
+        let map = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "true".to_string())]);
+        let format = CsvFormat::try_from(&map).unwrap();
+
+        assert_eq!(
+            format,
+            CsvFormat {
+                skip_bad_records: true,
+                ..CsvFormat::default()
+            }
+        );
+    }
+
+    #[test]
+    fn test_try_from_rejects_invalid_bool_options() {
+        let map = HashMap::from([(FORMAT_SKIP_BAD_RECORDS.to_string(), "yes".to_string())]);
+        assert!(CsvFormat::try_from(&map).is_err());
    }

    #[tokio::test]
@@ -496,4 +657,63 @@ mod tests {
            assert_eq!(expected, pretty_print);
        }
    }
+
+    #[tokio::test]
+    async fn test_tolerant_csv_stream_continues_after_parse_error() {
+        let temp_dir = common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream");
+        let csv_file_path = temp_dir.path().join("input.csv");
+        std::fs::write(
+            &csv_file_path,
+            "id,name,value\n1,Alice,10.5\nbad,Bad,20.0\nworse,Bad,21.0\n2,Bob,30.5",
+        )
+        .unwrap();
+
+        let store = test_store("/");
+        let schema = Arc::new(arrow_schema::Schema::new(vec![
+            Field::new("id", DataType::UInt32, false),
+            Field::new("name", DataType::Utf8, false),
+            Field::new("value", DataType::Float64, false),
+        ]));
+        let path = csv_file_path.to_str().unwrap();
+
+        let stream =
+            tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
+                .await
+                .unwrap();
+        let batches = stream.try_collect::<Vec<_>>().await.unwrap();
+        let pretty_print = arrow::util::pretty::pretty_format_batches(&batches)
+            .unwrap()
+            .to_string();
+        let expected = r#"+----+-------+-------+
+| id | name  | value |
+----+-------+-------+
+| 1  | Alice | 10.5  |
+| 2  | Bob   | 30.5  |
+----+-------+-------+"#;
+        assert_eq!(expected, pretty_print);
+    }
+
+    #[tokio::test]
+    async fn test_tolerant_csv_stream_fails_on_structural_csv_error() {
+        let temp_dir =
+            common_test_util::temp_dir::create_temp_dir("test_tolerant_csv_stream_csv_error");
+        let csv_file_path = temp_dir.path().join("input.csv");
+        std::fs::write(&csv_file_path, "id,name,value\n1,Alice,10.5\n2,Bob\n").unwrap();
+
+        let store = test_store("/");
+        let schema = Arc::new(arrow_schema::Schema::new(vec![
+            Field::new("id", DataType::UInt32, false),
+            Field::new("name", DataType::Utf8, false),
+            Field::new("value", DataType::Float64, false),
+        ]));
+        let path = csv_file_path.to_str().unwrap();
+
+        let stream =
+            tolerant_csv_stream(&store, path, schema, vec![0, 1, 2], &CsvFormat::default())
+                .await
+                .unwrap();
+        let error = stream.try_collect::<Vec<_>>().await.unwrap_err();
+
+        assert!(error.to_string().contains("incorrect number of fields"));
+    }
 }
--- a/src/common/meta/src/instruction.rs
+++ b/src/common/meta/src/instruction.rs
@@ -18,7 +18,7 @@ use std::time::Duration;

 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use store_api::region_engine::SyncRegionFromRequest;
-use store_api::region_request::RegionFlushReason;
+use store_api::region_request::{RegionFlushReason, RegionRequirements};
 use store_api::storage::{FileRefsManifest, GcReport, RegionId, RegionNumber};
 use strum::Display;
 use table::metadata::TableId;
@@ -179,12 +179,24 @@ impl Display for OpenRegion {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
-            "OpenRegion(region_ident={}, region_storage_path={})",
-            self.region_ident, self.region_storage_path
+            "OpenRegion(region_ident={}, region_storage_path={}, reason={:?})",
+            self.region_ident, self.region_storage_path, self.reason
        )
    }
 }

+/// The reason why an open region instruction is triggered.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+pub enum OpenRegionReason {
+    /// Open triggered before region migration.
+    RegionMigration,
+    /// Open triggered by region failover.
+    RegionFailover,
+    /// Open triggered when adding a follower region.
+    #[cfg(feature = "enterprise")]
+    RegionFollower,
+}
+
 #[serde_with::serde_as]
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct OpenRegion {
@@ -196,6 +208,10 @@ pub struct OpenRegion {
    pub region_wal_options: HashMap<RegionNumber, String>,
    #[serde(default)]
    pub skip_wal_replay: bool,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub reason: Option<OpenRegionReason>,
+    #[serde(default)]
+    pub requirements: RegionRequirements,
 }

 impl OpenRegion {
@@ -205,6 +221,8 @@ impl OpenRegion {
        region_options: HashMap<String, String>,
        region_wal_options: HashMap<RegionNumber, String>,
        skip_wal_replay: bool,
+        reason: Option<OpenRegionReason>,
+        requirements: RegionRequirements,
    ) -> Self {
        Self {
            region_ident,
@@ -212,6 +230,8 @@ impl OpenRegion {
            region_options,
            region_wal_options,
            skip_wal_replay,
+            reason,
+            requirements,
        }
    }
 }
@@ -1126,11 +1146,13 @@ mod tests {
            HashMap::new(),
            HashMap::new(),
            false,
+            None,
+            RegionRequirements::empty(),
        )]);

        let serialized = serde_json::to_string(&open_region).unwrap();
        assert_eq!(
-            r#"{"OpenRegions":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false}]}"#,
+            r#"{"OpenRegions":[{"region_ident":{"datanode_id":2,"table_id":1024,"region_number":1,"engine":"mito2"},"region_storage_path":"test/foo","region_options":{},"region_wal_options":{},"skip_wal_replay":false,"requirements":{"object_storage":false}}]}"#,
            serialized
        );

@@ -1213,6 +1235,8 @@ mod tests {
            HashMap::new(),
            HashMap::new(),
            false,
+            None,
+            RegionRequirements::empty(),
        )]);
        assert_eq!(open_region_instruction, open_region);

@@ -1368,10 +1392,41 @@ mod tests {
            region_options,
            region_wal_options: HashMap::new(),
            skip_wal_replay: false,
+            reason: None,
+            requirements: RegionRequirements::empty(),
        };
        assert_eq!(expected, deserialized);
    }

+    #[test]
+    fn test_serialize_open_region_with_reason_and_requirements() {
+        let open_region = OpenRegion::new(
+            RegionIdent {
+                datanode_id: 2,
+                table_id: 1024,
+                region_number: 1,
+                engine: "mito2".to_string(),
+            },
+            "test/foo",
+            HashMap::new(),
+            HashMap::new(),
+            false,
+            Some(OpenRegionReason::RegionMigration),
+            RegionRequirements::object_storage(),
+        );
+
+        let serialized = serde_json::to_string(&open_region).unwrap();
+        assert!(serialized.contains(r#""reason":"RegionMigration""#));
+        assert!(serialized.contains(r#""object_storage":true"#));
+
+        let deserialized: OpenRegion = serde_json::from_str(&serialized).unwrap();
+        assert_eq!(Some(OpenRegionReason::RegionMigration), deserialized.reason);
+        assert_eq!(
+            RegionRequirements::object_storage(),
+            deserialized.requirements
+        );
+    }
+
    #[test]
    fn test_flush_regions_creation() {
        let region_id = RegionId::new(1024, 1);
--- a/src/datanode/src/config.rs
+++ b/src/datanode/src/config.rs
@@ -14,6 +14,8 @@

 //! Datanode configurations

+use std::time::Duration;
+
 use common_base::readable_size::ReadableSize;
 use common_config::{Configurable, DEFAULT_DATA_HOME};
 use common_options::memory::MemoryOptions;
@@ -75,6 +77,10 @@ pub struct DatanodeOptions {
    pub wal: DatanodeWalConfig,
    pub storage: StorageConfig,
    pub max_concurrent_queries: usize,
+    /// Timeout to acquire a permit from the concurrent query limiter when
+    /// `max_concurrent_queries` is reached. Only effective when the limiter is enabled.
+    #[serde(with = "humantime_serde")]
+    pub concurrent_query_limiter_timeout: Duration,
    /// Options for different store engines.
    pub region_engine: Vec<RegionEngineConfig>,
    pub logging: LoggingOptions,
@@ -131,6 +137,7 @@ impl Default for DatanodeOptions {
            wal: DatanodeWalConfig::default(),
            storage: StorageConfig::default(),
            max_concurrent_queries: 0,
+            concurrent_query_limiter_timeout: Duration::from_millis(100),
            region_engine: vec![
                RegionEngineConfig::Mito(MitoConfig::default()),
                RegionEngineConfig::File(FileEngineConfig::default()),
--- a/src/datanode/src/datanode.rs
+++ b/src/datanode/src/datanode.rs
@@ -445,8 +445,7 @@ impl DatanodeBuilder {
            event_listener,
            table_provider_factory,
            opts.max_concurrent_queries,
-            //TODO: revaluate the hardcoded timeout on the next version of datanode concurrency limiter.
-            Duration::from_millis(100),
+            opts.concurrent_query_limiter_timeout,
            opts.grpc.flight_compression,
        );

--- a/src/datanode/src/heartbeat/handler.rs
+++ b/src/datanode/src/heartbeat/handler.rs
@@ -313,7 +313,7 @@ mod tests {
    use mito2::test_util::{CreateRequestBuilder, TestEnv};
    use store_api::path_utils::table_dir;
    use store_api::region_engine::RegionRole;
-    use store_api::region_request::{RegionCloseRequest, RegionRequest};
+    use store_api::region_request::{RegionCloseRequest, RegionRequest, RegionRequirements};
    use store_api::storage::RegionId;
    use tokio::sync::mpsc::{self, Receiver};

@@ -442,6 +442,8 @@ mod tests {
            HashMap::new(),
            HashMap::new(),
            false,
+            None,
+            RegionRequirements::empty(),
        )])
    }

--- a/src/datanode/src/heartbeat/handler/open_region.rs
+++ b/src/datanode/src/heartbeat/handler/open_region.rs
@@ -14,6 +14,7 @@

 use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
 use common_meta::wal_provider::prepare_wal_options;
+use common_telemetry::info;
 use store_api::path_utils::table_dir;
 use store_api::region_request::{PathType, RegionOpenRequest};
 use store_api::storage::RegionId;
@@ -41,8 +42,13 @@ impl InstructionHandler for OpenRegionsHandler {
                    mut region_options,
                    region_wal_options,
                    skip_wal_replay,
+                    reason,
+                    requirements,
                } = open_region;
                let region_id = RegionId::new(region_ident.table_id, region_ident.region_number);
+                info!(
+                    "Received open region instruction, region_id: {region_id}, reason: {reason:?}"
+                );
                prepare_wal_options(&mut region_options, region_id, &region_wal_options);
                let request = RegionOpenRequest {
                    engine: region_ident.engine,
@@ -51,6 +57,7 @@ impl InstructionHandler for OpenRegionsHandler {
                    options: region_options,
                    skip_wal_replay,
                    checkpoint: None,
+                    requirements,
                };
                (region_id, request)
            })
@@ -85,7 +92,7 @@ mod tests {
    use mito2::engine::MITO_ENGINE_NAME;
    use mito2::test_util::{CreateRequestBuilder, TestEnv};
    use store_api::path_utils::table_dir;
-    use store_api::region_request::{RegionCloseRequest, RegionRequest};
+    use store_api::region_request::{RegionCloseRequest, RegionRequest, RegionRequirements};
    use store_api::storage::RegionId;

    use crate::heartbeat::handler::RegionHeartbeatResponseHandler;
@@ -98,17 +105,21 @@ mod tests {
    ) -> Instruction {
        let region_idents = region_ids
            .into_iter()
-            .map(|region_id| OpenRegion {
-                region_ident: RegionIdent {
-                    datanode_id: 0,
-                    table_id: region_id.table_id(),
-                    region_number: region_id.region_number(),
-                    engine: MITO_ENGINE_NAME.to_string(),
-                },
-                region_storage_path: storage_path.to_string(),
-                region_options: HashMap::new(),
-                region_wal_options: HashMap::new(),
-                skip_wal_replay: false,
+            .map(|region_id| {
+                OpenRegion::new(
+                    RegionIdent {
+                        datanode_id: 0,
+                        table_id: region_id.table_id(),
+                        region_number: region_id.region_number(),
+                        engine: MITO_ENGINE_NAME.to_string(),
+                    },
+                    storage_path,
+                    HashMap::new(),
+                    HashMap::new(),
+                    false,
+                    None,
+                    RegionRequirements::empty(),
+                )
            })
            .collect();

--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -49,6 +49,7 @@ use common_telemetry::{debug, error, info, warn};
 use dashmap::DashMap;
 use datafusion::datasource::TableProvider;
 use datafusion_common::tree_node::TreeNode;
+use datatypes::schema::SchemaRef;
 use either::Either;
 use futures_util::Stream;
 use futures_util::future::try_join_all;
@@ -82,7 +83,7 @@ use store_api::region_request::{
    RegionOpenRequest, RegionRequest,
 };
 use store_api::storage::RegionId;
-use tokio::sync::{Semaphore, SemaphorePermit};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio::time::timeout;
 use tonic::{Request, Response, Result as TonicResult};

@@ -257,7 +258,7 @@ impl RegionServer {
        request: api::v1::region::QueryRequest,
        query_ctx: QueryContextRef,
    ) -> Result<SendableRecordBatchStream> {
-        let _permit = if let Some(p) = &self.inner.parallelism {
+        let permit = if let Some(p) = &self.inner.parallelism {
            Some(p.acquire().await?)
        } else {
            None
@@ -298,14 +299,13 @@ impl RegionServer {
            )
            .await?;

-        Ok(wrap_flow_region_watermark_stream(
-            stream, region_id, &query_ctx,
-        ))
+        let stream = wrap_flow_region_watermark_stream(stream, region_id, &query_ctx);
+        Ok(maybe_guard_stream(stream, permit))
    }

    #[tracing::instrument(skip_all)]
    pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
-        let _permit = if let Some(p) = &self.inner.parallelism {
+        let permit = if let Some(p) = &self.inner.parallelism {
            Some(p.acquire().await?)
        } else {
            None
@@ -332,9 +332,8 @@ impl RegionServer {
            .handle_read(QueryRequest { plan, ..request }, query_ctx.clone())
            .await?;

-        Ok(wrap_flow_region_watermark_stream(
-            stream, region_id, &query_ctx,
-        ))
+        let stream = wrap_flow_region_watermark_stream(stream, region_id, &query_ctx);
+        Ok(maybe_guard_stream(stream, permit))
    }

    /// Returns all opened and reportable regions.
@@ -1058,7 +1057,7 @@ struct RegionServerInner {
 }

 struct RegionServerParallelism {
-    semaphore: Semaphore,
+    semaphore: Arc<Semaphore>,
    timeout: Duration,
 }

@@ -1071,19 +1070,68 @@ impl RegionServerParallelism {
            return None;
        }
        Some(RegionServerParallelism {
-            semaphore: Semaphore::new(max_concurrent_queries),
+            semaphore: Arc::new(Semaphore::new(max_concurrent_queries)),
            timeout: concurrent_query_limiter_timeout,
        })
    }

-    pub async fn acquire(&self) -> Result<SemaphorePermit<'_>> {
-        timeout(self.timeout, self.semaphore.acquire())
+    pub async fn acquire(&self) -> Result<OwnedSemaphorePermit> {
+        timeout(self.timeout, self.semaphore.clone().acquire_owned())
            .await
            .context(ConcurrentQueryLimiterTimeoutSnafu)?
            .context(ConcurrentQueryLimiterClosedSnafu)
    }
 }

+/// Wraps a record batch stream and holds a concurrency permit for as long as the
+/// stream is alive; the permit is released when the stream is dropped, so
+/// `max_concurrent_queries` bounds in-flight read streams, not just query planning.
+struct PermitGuardedStream {
+    inner: SendableRecordBatchStream,
+    _permit: OwnedSemaphorePermit,
+}
+
+impl RecordBatchStream for PermitGuardedStream {
+    fn name(&self) -> &str {
+        self.inner.name()
+    }
+
+    fn schema(&self) -> SchemaRef {
+        self.inner.schema()
+    }
+
+    fn output_ordering(&self) -> Option<&[OrderOption]> {
+        self.inner.output_ordering()
+    }
+
+    fn metrics(&self) -> Option<RecordBatchMetrics> {
+        self.inner.metrics()
+    }
+}
+
+impl Stream for PermitGuardedStream {
+    type Item = common_recordbatch::error::Result<RecordBatch>;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.inner.as_mut().poll_next(cx)
+    }
+}
+
+/// Wraps `stream` so that `permit` is held until the stream is dropped. Returns
+/// `stream` unchanged when `permit` is `None` (limiter disabled).
+fn maybe_guard_stream(
+    stream: SendableRecordBatchStream,
+    permit: Option<OwnedSemaphorePermit>,
+) -> SendableRecordBatchStream {
+    match permit {
+        Some(permit) => Box::pin(PermitGuardedStream {
+            inner: stream,
+            _permit: permit,
+        }),
+        None => stream,
+    }
+}
+
 enum CurrentEngine {
    Engine(RegionEngineRef),
    EarlyReturn(AffectedRows),
@@ -2057,6 +2105,7 @@ mod tests {
                    options: Default::default(),
                    skip_wal_replay: false,
                    checkpoint: None,
+                    requirements: Default::default(),
                }),
            )
            .await
@@ -2235,6 +2284,7 @@ mod tests {
                            options: Default::default(),
                            skip_wal_replay: false,
                            checkpoint: None,
+                            requirements: Default::default(),
                        },
                    ),
                    (
@@ -2246,6 +2296,7 @@ mod tests {
                            options: Default::default(),
                            skip_wal_replay: false,
                            checkpoint: None,
+                            requirements: Default::default(),
                        },
                    ),
                ],
@@ -2268,6 +2319,7 @@ mod tests {
                            options: Default::default(),
                            skip_wal_replay: false,
                            checkpoint: None,
+                            requirements: Default::default(),
                        },
                    ),
                    (
@@ -2279,6 +2331,7 @@ mod tests {
                            options: Default::default(),
                            skip_wal_replay: false,
                            checkpoint: None,
+                            requirements: Default::default(),
                        },
                    ),
                ],
--- a/src/datanode/src/utils.rs
+++ b/src/datanode/src/utils.rs
@@ -175,6 +175,7 @@ pub async fn build_region_open_requests(
                options,
                skip_wal_replay: false,
                checkpoint,
+                requirements: Default::default(),
            },
        ));
    }
@@ -193,6 +194,7 @@ pub async fn build_region_open_requests(
                    options,
                    skip_wal_replay: true,
                    checkpoint: None,
+                    requirements: Default::default(),
                },
            ));
        }
--- a/src/file-engine/src/engine.rs
+++ b/src/file-engine/src/engine.rs
@@ -32,7 +32,7 @@ use store_api::region_engine::{
 };
 use store_api::region_request::{
    AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
-    RegionRequest,
+    RegionRequest, RegionRequirements,
 };
 use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
 use tokio::sync::Mutex;
@@ -186,6 +186,24 @@ struct EngineInner {

 type EngineInnerRef = Arc<EngineInner>;

+fn ensure_open_requirements(
+    requirements: RegionRequirements,
+    object_store: &ObjectStore,
+) -> EngineResult<()> {
+    if !requirements.object_storage {
+        return Ok(());
+    }
+
+    ensure!(
+        object_store::util::is_object_storage(object_store),
+        UnsupportedSnafu {
+            operation: "open region with object storage requirement on non-object storage"
+        }
+    );
+
+    Ok(())
+}
+
 impl EngineInner {
    fn new(object_store: ObjectStore) -> Self {
        Self {
@@ -289,6 +307,8 @@ impl EngineInner {
            return Ok(0);
        }

+        ensure_open_requirements(request.requirements, &self.object_store)?;
+
        let res = FileRegion::open(region_id, request, &self.object_store).await;
        let region = res.inspect_err(|err| {
            error!(
@@ -356,3 +376,53 @@ impl EngineInner {
        self.regions.read().unwrap().contains_key(&region_id)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use object_store::services::{Fs, S3};
+
+    use super::*;
+    use crate::error::Error;
+
+    fn build_fs_object_store() -> ObjectStore {
+        ObjectStore::new(Fs::default().root("/tmp"))
+            .unwrap()
+            .finish()
+    }
+
+    fn build_s3_object_store() -> ObjectStore {
+        ObjectStore::new(
+            S3::default()
+                .bucket("test-bucket")
+                .region("us-east-1")
+                .disable_ec2_metadata(),
+        )
+        .unwrap()
+        .finish()
+    }
+
+    #[test]
+    fn test_empty_open_requirements_are_supported() {
+        ensure_open_requirements(RegionRequirements::empty(), &build_fs_object_store()).unwrap();
+    }
+
+    #[test]
+    fn test_object_storage_open_requirement_rejects_fs_object_store() {
+        let err = ensure_open_requirements(
+            RegionRequirements::object_storage(),
+            &build_fs_object_store(),
+        )
+        .unwrap_err();
+
+        assert!(matches!(err, Error::Unsupported { .. }));
+    }
+
+    #[test]
+    fn test_object_storage_open_requirement_accepts_s3_object_store() {
+        ensure_open_requirements(
+            RegionRequirements::object_storage(),
+            &build_s3_object_store(),
+        )
+        .unwrap();
+    }
+}
--- a/src/file-engine/src/region.rs
+++ b/src/file-engine/src/region.rs
@@ -181,6 +181,7 @@ mod tests {
            options: HashMap::default(),
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        };

        let region = FileRegion::open(region_id, request, &object_store)
@@ -238,6 +239,7 @@ mod tests {
            options: HashMap::default(),
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        };
        let err = FileRegion::open(region_id, request, &object_store)
            .await
--- a/src/flow/src/batching_mode/engine.rs
+++ b/src/flow/src/batching_mode/engine.rs
@@ -630,8 +630,11 @@ impl BatchingEngine {
        let engine = self.query_engine.clone();
        let frontend = self.frontend_client.clone();

-        // check execute once first to detect any error early
+        // Create sink table if needed, then validate an existing/created sink schema before
+        // spawning the background task. This catches user-created sink schema mismatches at
+        // CREATE FLOW time instead of surfacing them later in the execution loop.
        task.check_or_create_sink_table(&engine, &frontend).await?;
+        task.validate_sink_table_schema(&engine).await?;

        let (start_tx, start_rx) = oneshot::channel();

--- a/src/flow/src/batching_mode/task.rs
+++ b/src/flow/src/batching_mode/task.rs
@@ -265,6 +265,36 @@ impl BatchingTask {
        Ok(None)
    }

+    /// Validates that the sink table schema can accept this flow's output.
+    ///
+    /// This is a dry-run of the same schema matching logic used by runtime insert-plan
+    /// generation, but without adding dirty-window filters or executing the query. It is used
+    /// during CREATE FLOW to catch existing sink table mismatches early.
+    pub async fn validate_sink_table_schema(&self, engine: &QueryEngineRef) -> Result<(), Error> {
+        let (table, _) = get_table_info_df_schema(
+            self.config.catalog_manager.clone(),
+            self.config.sink_table_name.clone(),
+        )
+        .await?;
+
+        let table_meta = &table.table_info().meta;
+        let merge_mode_last_non_null =
+            is_merge_mode_last_non_null(&table_meta.options.extra_options);
+        let primary_key_indices = table_meta.primary_key_indices.clone();
+        let query_ctx = self.state.read().unwrap().query_ctx.clone();
+
+        gen_plan_with_matching_schema(
+            &self.config.query,
+            query_ctx,
+            engine.clone(),
+            table_meta.schema.clone(),
+            &primary_key_indices,
+            merge_mode_last_non_null,
+        )
+        .await
+        .map(|_| ())
+    }
+
    async fn is_table_exist(&self, table_name: &[String; 3]) -> Result<bool, Error> {
        self.config
            .catalog_manager
@@ -929,7 +959,7 @@ impl BatchingTask {
        let (expire_lower_bound, expire_upper_bound) =
            match (expire_time_window_bound, &self.config.query_type) {
                (Some((Some(l), Some(u))), QueryType::Sql) => (l, u),
-                (None, QueryType::Sql) => {
+                (None, QueryType::Sql) if self.config.flow_eval_interval.is_none() => {
                    // if it's sql query and no time window lower/upper bound is found, just return the original query(with auto columns)
                    // use sink_table_meta to add to query the `update_at` and `__ts_placeholder` column's value too for compatibility reason
                    debug!(
@@ -950,7 +980,8 @@ impl BatchingTask {
                }
                _ => {
                    // Clean dirty windows for full-query/non-scoped paths,
-                    // such as TQL, that cannot use a time-window filter.
+                    // such as TQL or evaluation-interval SQL without a recognized
+                    // time-window expression, that cannot use a time-window filter.
                    let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal();

                    let plan_info = self
--- a/src/flow/src/batching_mode/task/test.rs
+++ b/src/flow/src/batching_mode/task/test.rs
@@ -974,6 +974,38 @@ async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() {
    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
 }

+#[tokio::test]
+async fn test_no_time_window_sql_with_eval_interval_generates_plan_without_dirty_signal() {
+    let TestTaskParts {
+        mut task,
+        query_engine,
+        ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        "missing_sink",
+    )
+    .await;
+    Arc::get_mut(&mut task.config)
+        .expect("test task config should be uniquely owned")
+        .flow_eval_interval = Some(Duration::from_secs(60));
+    task.state.write().unwrap().dirty_time_windows.clean();
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), false),
+        ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true),
+    ]));
+
+    let plan = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
+        .await
+        .unwrap()
+        .expect(
+            "eval-interval SQL without a time-window expr should run by interval, not dirty signal",
+        );
+
+    assert!(plan.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+}
+
 #[tokio::test]
 async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
    let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
--- a/src/flow/src/batching_mode/utils.rs
+++ b/src/flow/src/batching_mode/utils.rs
@@ -33,9 +33,10 @@ use datafusion_common::{
 };
 use datafusion_expr::logical_plan::{Aggregate, TableScan};
 use datafusion_expr::{
-    Distinct, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, and, binary_expr,
-    bitwise_and, bitwise_or, bitwise_xor, is_null, or, when,
+    Distinct, ExprSchemable, JoinType, LogicalPlan, LogicalPlanBuilder, Operator, Projection, and,
+    binary_expr, bitwise_and, bitwise_or, bitwise_xor, is_null, or, when,
 };
+use datatypes::prelude::ConcreteDataType;
 use datatypes::schema::{ColumnSchema, SchemaRef};
 use query::QueryEngineRef;
 use query::parser::{DEFAULT_LOOKBACK_STRING, PromQuery, QueryLanguageParser, QueryStatement};
@@ -955,7 +956,7 @@ pub(crate) async fn gen_plan_with_matching_schema(
        .clone()
        .rewrite(&mut add_auto_column)
        .with_context(|_| DatafusionSnafu {
-            context: format!("Failed to rewrite plan:\n {}\n", plan),
+            context: "Failed to rewrite plan".to_string(),
        })?
        .data;
    Ok(plan)
@@ -1090,33 +1091,23 @@ impl ColumnMatcherRewriter {
    }

    /// modify the exprs in place so that it matches the schema and some auto columns are added
-    fn modify_project_exprs(&mut self, mut exprs: Vec<Expr>) -> DfResult<Vec<Expr>> {
+    fn modify_project_exprs(
+        &mut self,
+        mut exprs: Vec<Expr>,
+        input_schema: &DFSchema,
+    ) -> DfResult<Vec<Expr>> {
        if self.allow_partial {
            return self.modify_project_exprs_with_partial(exprs);
        }

+        let original_exprs = exprs.clone();
+
        let all_names = self
            .schema
            .column_schemas()
            .iter()
            .map(|c| c.name.clone())
            .collect::<BTreeSet<_>>();
-        // first match by position
-        for (idx, expr) in exprs.iter_mut().enumerate() {
-            if !all_names.contains(&expr.qualified_name().1)
-                && let Some(col_name) = self
-                    .schema
-                    .column_schemas()
-                    .get(idx)
-                    .map(|c| c.name.clone())
-            {
-                // if the data type mismatched, later check_execute will error out
-                // hence no need to check it here, beside, optimize pass might be able to cast it
-                // so checking here is not necessary
-                *expr = expr.clone().alias(col_name);
-            }
-        }
-
        // add columns if have different column count
        let query_col_cnt = exprs.len();
        let table_col_cnt = self.schema.column_schemas().len();
@@ -1140,10 +1131,9 @@ impl ColumnMatcherRewriter {
                // is the update at column
                exprs.push(datafusion::prelude::now().alias(&last_col_schema.name));
            } else {
-                // helpful error message
-                return Err(DataFusionError::Plan(format!(
-                    "Expect the last column in table to be timestamp column, found column {} with type {:?}",
-                    last_col_schema.name, last_col_schema.data_type
+                return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
+                    &original_exprs,
+                    self.schema.as_ref(),
                )));
            }
        } else if query_col_cnt + 2 == table_col_cnt {
@@ -1170,14 +1160,110 @@ impl ColumnMatcherRewriter {
                )));
            }
        } else {
-            return Err(DataFusionError::Plan(format!(
-                "Expect table have 0,1 or 2 columns more than query columns, found {} query columns {:?}, {} table columns {:?}",
-                query_col_cnt,
-                exprs,
-                table_col_cnt,
-                self.schema.column_schemas()
+            return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
+                &original_exprs,
+                self.schema.as_ref(),
            )));
        }
+
+        self.match_extra_output_columns(exprs, input_schema, &original_exprs, &all_names)
+    }
+
+    /// Aliases flow output columns unknown to the sink schema, matching them by position only.
+    ///
+    /// An output whose name is not a sink column is aliased to the sink column at the same index,
+    /// but only when that sink column is itself missing from the flow output and the two types
+    /// are not obviously incompatible (`input_schema` supplies the expression types). Any other
+    /// extra or missing column fails with a plan error; `original_exprs` is only used to render it.
+    ///
+    /// Cross-position remaps such as `record_time_window2 -> record_time_window` are rejected on
+    /// purpose: they are easy to confuse with real schema mismatches, so the flow output should
+    /// be given the sink column name explicitly.
+    fn match_extra_output_columns(
+        &self,
+        mut exprs: Vec<Expr>,
+        input_schema: &DFSchema,
+        original_exprs: &[Expr],
+        all_names: &BTreeSet<String>,
+    ) -> DfResult<Vec<Expr>> {
+        let mut output_names = exprs
+            .iter()
+            .map(|expr| expr.qualified_name().1)
+            .collect::<Vec<_>>();
+        let output_name_set = output_names.iter().cloned().collect::<BTreeSet<_>>();
+        let extra_expr_indices = output_names
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, name)| (!all_names.contains(name)).then_some(idx))
+            .collect::<Vec<_>>();
+        let missing_sink_indices = self
+            .schema
+            .column_schemas()
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, column)| (!output_name_set.contains(&column.name)).then_some(idx))
+            .collect::<Vec<_>>();
+
+        if extra_expr_indices.is_empty() && missing_sink_indices.is_empty() {
+            return Ok(exprs);
+        }
+
+        if extra_expr_indices.len() != missing_sink_indices.len() {
+            return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
+                original_exprs,
+                self.schema.as_ref(),
+            )));
+        }
+
+        let mut positional_matches = Vec::new();
+        for expr_idx in extra_expr_indices {
+            if !missing_sink_indices.contains(&expr_idx) {
+                return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
+                    original_exprs,
+                    self.schema.as_ref(),
+                )));
+            }
+
+            let target_col_schema = &self.schema.column_schemas()[expr_idx];
+            let expr_type =
+                ConcreteDataType::from_arrow_type(&exprs[expr_idx].get_type(input_schema)?);
+            if is_obviously_incompatible_positional_match(&expr_type, &target_col_schema.data_type)
+            {
+                return Err(DataFusionError::Plan(format!(
+                    "Cannot match flow output column '{}' to sink column '{}' by position: incompatible data types, flow output type is {:?}, sink column type is {:?}. {}",
+                    output_names[expr_idx],
+                    target_col_schema.name,
+                    expr_type,
+                    target_col_schema.data_type,
+                    format_flow_sink_schema_mismatch(original_exprs, self.schema.as_ref())
+                )));
+            }
+
+            let target_name = target_col_schema.name.clone();
+            positional_matches.push(format!(
+                "{} -> {} (flow output type: {:?}, sink column type: {:?})",
+                output_names[expr_idx], target_name, expr_type, target_col_schema.data_type
+            ));
+            exprs[expr_idx] = exprs[expr_idx].clone().alias(target_name.clone());
+            output_names[expr_idx] = target_name;
+        }
+
+        if !positional_matches.is_empty() {
+            debug!(
+                "Matched flow output columns to sink columns by position: {:?}",
+                positional_matches
+            );
+        }
+
+        let duplicated_output_names = duplicate_names(&output_names);
+        if !duplicated_output_names.is_empty() {
+            return Err(DataFusionError::Plan(format!(
+                "Flow output schema contains duplicate column(s) after schema matching {:?}. {}",
+                duplicated_output_names,
+                format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
+            )));
+        }
+
        Ok(exprs)
    }

@@ -1186,12 +1272,9 @@ impl ColumnMatcherRewriter {
        let query_col_cnt = exprs.len();

        if query_col_cnt > table_col_cnt {
-            return Err(DataFusionError::Plan(format!(
-                "Expect query column count <= table column count, found {} query columns {:?}, {} table columns {:?}",
-                query_col_cnt,
-                exprs,
-                table_col_cnt,
-                self.schema.column_schemas()
+            return Err(DataFusionError::Plan(format_flow_sink_schema_mismatch(
+                &exprs,
+                self.schema.as_ref(),
            )));
        }

@@ -1209,8 +1292,9 @@ impl ColumnMatcherRewriter {
            .collect();
        if !missing.is_empty() {
            return Err(DataFusionError::Plan(format!(
-                "Column(s) {:?} required by sink table are missing from flow output when merge_mode=last_non_null",
-                missing
+                "Column(s) {:?} required by sink table are missing from flow output when merge_mode=last_non_null. {}",
+                missing,
+                format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
            )));
        }

@@ -1250,8 +1334,9 @@ impl ColumnMatcherRewriter {
        if !remap.is_empty() {
            let extra: Vec<_> = remap.keys().cloned().collect();
            return Err(DataFusionError::Plan(format!(
-                "Flow output has extra column(s) {:?} not found in sink schema when merge_mode=last_non_null",
-                extra
+                "Flow output has extra column(s) {:?} not found in sink schema when merge_mode=last_non_null. {}",
+                extra,
+                format_flow_sink_schema_mismatch(&exprs, self.schema.as_ref())
            )));
        }

@@ -1281,6 +1366,80 @@ impl ColumnMatcherRewriter {
    }
 }

+fn is_obviously_incompatible_positional_match(
+    expr_type: &ConcreteDataType,
+    sink_type: &ConcreteDataType,
+) -> bool {
+    // Coarse type-family guard for legacy positional aliasing; deliberately weaker than type
+    // equality. Untyped NULL coerces to any target, and numeric width/sign differences are left
+    // to downstream coercion, so only a change of family (e.g. timestamp vs string) is rejected.
+    let trivially_compatible = expr_type.is_null() || expr_type == sink_type;
+    let family_membership = [
+        (expr_type.is_timestamp(), sink_type.is_timestamp()),
+        (expr_type.is_string(), sink_type.is_string()),
+        (expr_type.is_boolean(), sink_type.is_boolean()),
+        (expr_type.is_json(), sink_type.is_json()),
+        (expr_type.is_vector(), sink_type.is_vector()),
+    ];
+
+    !trivially_compatible && family_membership.iter().any(|(expr, sink)| expr != sink)
+}
+
+/// Returns every name occurring more than once in `names`, each reported once, in sorted order.
+fn duplicate_names(names: &[String]) -> Vec<String> {
+    let mut seen = HashSet::new();
+    let repeated = names
+        .iter()
+        .map(String::as_str)
+        .filter(|name| !seen.insert(*name))
+        .collect::<BTreeSet<_>>();
+    repeated.into_iter().map(str::to_string).collect()
+}
+
+/// Renders the shared "flow output vs. sink table" schema mismatch diagnostic.
+///
+/// Flow column names are the name component (`.1`) of `Expr::qualified_name`. The message lists
+/// both column lists in their original order, followed by the names found on only one side;
+/// those two lists are sorted and deduplicated.
+fn format_flow_sink_schema_mismatch(
+    query_exprs: &[Expr],
+    sink_schema: &datatypes::schema::Schema,
+) -> String {
+    let flow_names: Vec<String> = query_exprs
+        .iter()
+        .map(|expr| expr.qualified_name().1)
+        .collect();
+    let sink_names: Vec<&str> = sink_schema
+        .column_schemas()
+        .iter()
+        .map(|column| column.name.as_str())
+        .collect();
+
+    // Ordered sets yield sorted, duplicate-free differences. The results are collected back
+    // into `Vec`s so that `{:?}` keeps rendering them as `[..]` lists rather than `{..}` sets,
+    // which would change the message text.
+    let flow_name_set: BTreeSet<&str> = flow_names.iter().map(String::as_str).collect();
+    let sink_name_set: BTreeSet<&str> = sink_names.iter().copied().collect();
+    let extra_flow_columns = flow_name_set
+        .difference(&sink_name_set)
+        .copied()
+        .collect::<Vec<_>>();
+    let missing_sink_columns = sink_name_set
+        .difference(&flow_name_set)
+        .copied()
+        .collect::<Vec<_>>();
+
+    format!(
+        "Flow output schema does not match sink table schema: found {} flow output columns and {} sink table columns. flow output columns: {:?}, sink table columns: {:?}, extra flow columns not in sink: {:?}, missing sink columns from flow output: {:?}",
+        flow_names.len(),
+        sink_names.len(),
+        flow_names,
+        sink_names,
+        extra_flow_columns,
+        missing_sink_columns
+    )
+}
+
 impl TreeNodeRewriter for ColumnMatcherRewriter {
    type Node = LogicalPlan;
    fn f_down(&mut self, mut node: Self::Node) -> DfResult<Transformed<Self::Node>> {
@@ -1327,7 +1486,7 @@ impl TreeNodeRewriter for ColumnMatcherRewriter {
        // if not, wrap it in a projection
        if let LogicalPlan::Projection(project) = &node {
            let exprs = project.expr.clone();
-            let exprs = self.modify_project_exprs(exprs)?;
+            let exprs = self.modify_project_exprs(exprs, project.input.schema())?;

            self.is_rewritten = true;
            let new_plan =
@@ -1341,7 +1500,7 @@ impl TreeNodeRewriter for ColumnMatcherRewriter {
                    field.name(),
                )));
            }
-            let exprs = self.modify_project_exprs(exprs)?;
+            let exprs = self.modify_project_exprs(exprs, node.schema())?;
            self.is_rewritten = true;
            let new_plan =
                LogicalPlan::Projection(Projection::try_new(exprs, Arc::new(node.clone()))?);
--- a/src/flow/src/batching_mode/utils/test.rs
+++ b/src/flow/src/batching_mode/utils/test.rs
@@ -14,6 +14,7 @@

 use std::sync::Arc;

+use catalog::RegisterTableRequest;
 use common_recordbatch::RecordBatch;
 use common_time::Timestamp;
 use datafusion_common::tree_node::TreeNode as _;
@@ -29,7 +30,9 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
 use table::test_util::MemTable;

 use super::*;
+use crate::batching_mode::BatchingModeOptions;
 use crate::batching_mode::state::FilterExprInfo;
+use crate::batching_mode::task::{BatchingTask, TaskArgs};
 use crate::test_utils::create_test_query_engine;

 fn u32_table(table_name: &str, columns: Vec<&str>, rows: usize) -> TableRef {
@@ -432,9 +435,7 @@ async fn test_add_auto_column_rewriter() {
        // error datatype mismatch
        (
            "SELECT number, ts FROM numbers_with_ts",
-            Err(
-                "Expect the last column in table to be timestamp column, found column atat with type Int8",
-            ),
+            Err("missing sink columns from flow output: [\"atat\"]"),
            vec![
                ColumnSchema::new("number", ConcreteDataType::int32_datatype(), true),
                ColumnSchema::new(
@@ -498,6 +499,383 @@ async fn test_add_auto_column_rewriter() {
    }
 }

+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_reports_extra_flow_columns_before_positional_alias() {
+    // `extra` has no sink counterpart: expect a schema mismatch, not a positional alias.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new(
+            "max(numbers_with_ts.number)",
+            ConcreteDataType::uint32_datatype(),
+            true,
+        ),
+    ]));
+
+    let message = gen_plan_with_matching_schema(
+        "SELECT number, number AS extra, ts, max(number) FROM numbers_with_ts GROUP BY number, ts",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap_err()
+    .to_string();
+
+    for expected in [
+        "Flow output schema does not match sink table schema",
+        "flow output columns",
+        "sink table columns",
+        "extra flow columns not in sink",
+        "extra",
+    ] {
+        assert!(message.contains(expected), "{message}");
+    }
+    assert!(
+        !message.contains("extra AS ts"),
+        "schema error should not primarily expose positional alias: {message}"
+    );
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_rejects_positional_alias_type_mismatch() {
+    // `not_time` (an alias of `number`) lines up with the `event_time` timestamp column purely
+    // by position; the type-family guard must reject that alias instead of emitting
+    // `not_time AS event_time`.
+    let sql =
+        "SELECT number, number AS not_time, max(number) FROM numbers_with_ts GROUP BY number";
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "event_time",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new(
+            "max(numbers_with_ts.number)",
+            ConcreteDataType::uint32_datatype(),
+            true,
+        ),
+    ]));
+
+    let message = gen_plan_with_matching_schema(
+        sql,
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap_err()
+    .to_string();
+
+    let positional_error =
+        "Cannot match flow output column 'not_time' to sink column 'event_time' by position";
+    assert!(message.contains(positional_error), "{message}");
+    assert!(message.contains("incompatible data types"), "{message}");
+    assert!(
+        !message.contains("not_time AS event_time"),
+        "schema error should not expose an incompatible positional alias: {message}"
+    );
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_rejects_cross_position_extra_column_match() {
+    // `time_window2` sits at index 2 but `time_window` is sink index 1: no cross-position remap.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "time_window",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            true,
+        ),
+    ]));
+
+    let message = gen_plan_with_matching_schema(
+        "SELECT number, ts, date_bin('5 minutes', ts) AS time_window2 FROM numbers_with_ts GROUP BY number, ts, time_window2",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap_err()
+    .to_string();
+
+    assert!(
+        message.contains("Flow output schema does not match sink table schema"),
+        "{message}"
+    );
+    for expected in ["time_window2", "time_window"] {
+        assert!(message.contains(expected), "{message}");
+    }
+    assert!(!message.contains("DuplicateUnqualifiedField"), "{message}");
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_accepts_out_of_order_matching_names() {
+    // Every flow output name exists in the sink, only in a different order, so nothing needs
+    // aliasing: the plan must keep the flow's own column order and must not end up with a
+    // duplicated output name.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "time_window",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            true,
+        ),
+    ]));
+
+    let plan = gen_plan_with_matching_schema(
+        "SELECT number, ts, date_bin('5 minutes', ts) AS time_window FROM numbers_with_ts GROUP BY number, ts, time_window",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap();
+
+    let output_names: Vec<String> = plan
+        .schema()
+        .fields()
+        .iter()
+        .map(|field| field.name().clone())
+        .collect();
+
+    let expected: Vec<String> = ["number", "ts", "time_window"]
+        .into_iter()
+        .map(str::to_string)
+        .collect();
+    assert_eq!(output_names, expected);
+
+    assert!(duplicate_names(&output_names).is_empty());
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_allows_numeric_positional_alias() {
+    // `number` is not a sink column name, but the sink column at the same index
+    // (`renamed_number`, int64) is missing from the flow output, so the legacy positional alias
+    // is still expected to apply.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("renamed_number", ConcreteDataType::int64_datatype(), true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+    ]));
+
+    let plan = gen_plan_with_matching_schema(
+        "SELECT number, ts FROM numbers_with_ts",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap();
+    let rendered = df_plan_to_sql(&plan).unwrap();
+
+    let expected =
+        "SELECT numbers_with_ts.number AS renamed_number, numbers_with_ts.ts FROM numbers_with_ts";
+    assert_eq!(expected, rendered);
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_allows_null_positional_alias() {
+    // An untyped `NULL` output named `label_placeholder` sits at the index of the string sink
+    // column `label`; the positional alias is expected to be accepted and the output renamed
+    // to `label`.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new("label", ConcreteDataType::string_datatype(), true),
+    ]));
+
+    let plan = gen_plan_with_matching_schema(
+        "SELECT number, NULL AS label_placeholder FROM numbers_with_ts",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap();
+
+    let output_names: Vec<String> = plan
+        .schema()
+        .fields()
+        .iter()
+        .map(|field| field.name().clone())
+        .collect();
+    let rendered = df_plan_to_sql(&plan).unwrap();
+
+    let expected = vec!["number".to_string(), "label".to_string()];
+    assert_eq!(output_names, expected);
+    assert!(rendered.contains("NULL AS label"), "{rendered}");
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_accepts_matching_flow_schema() {
+    // Control case: the flow output names match the sink columns one to one and in the same
+    // order, so the rendered plan is expected to carry no extra alias or column beyond what the
+    // query itself spelled out.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new("extra", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new(
+            "max(numbers_with_ts.number)",
+            ConcreteDataType::uint32_datatype(),
+            true,
+        ),
+    ]));
+
+    let plan = gen_plan_with_matching_schema(
+        "SELECT number, number AS extra, ts, max(number) FROM numbers_with_ts GROUP BY number, ts",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[],
+        false,
+    )
+    .await
+    .unwrap();
+    let rendered = df_plan_to_sql(&plan).unwrap();
+
+    let expected =
+        "SELECT numbers_with_ts.number, numbers_with_ts.number AS extra, numbers_with_ts.ts, max(numbers_with_ts.number) FROM numbers_with_ts GROUP BY numbers_with_ts.number, numbers_with_ts.ts";
+    assert_eq!(expected, rendered);
+}
+
+#[tokio::test]
+async fn test_validate_sink_table_schema_rejects_existing_sink_missing_flow_column() {
+    // The flow outputs `number`, `extra` and the aggregate, while the pre-existing sink table
+    // only has `number` and the aggregate column, so validating the task against that table
+    // must fail with the schema mismatch diagnostic.
+    let qualified = |table: &str| {
+        [
+            "greptime".to_string(),
+            "public".to_string(),
+            table.to_string(),
+        ]
+    };
+    let sink_table_name = qualified("existing_sink");
+    let sql = "SELECT number, number AS extra, max(number) FROM numbers_with_ts GROUP BY number";
+
+    let query_engine = create_test_query_engine();
+    let query_ctx = QueryContext::arc();
+    let plan = sql_to_df_plan(query_ctx.clone(), query_engine.clone(), sql, true)
+        .await
+        .unwrap();
+
+    // Register the sink table up front so that it already exists when the task is validated.
+    let catalog_manager = catalog::memory::new_memory_catalog_manager().unwrap();
+    let sink_columns = vec!["number", "max(numbers_with_ts.number)"];
+    let register_request = RegisterTableRequest {
+        catalog: sink_table_name[0].clone(),
+        schema: sink_table_name[1].clone(),
+        table_name: sink_table_name[2].clone(),
+        table_id: 4096,
+        table: u32_table("existing_sink", sink_columns, 0),
+    };
+    catalog_manager.register_table_sync(register_request).unwrap();
+
+    // `_shutdown_tx` is a named binding (not `_`), so the sender lives until the test returns.
+    let (_shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
+    let task_args = TaskArgs {
+        flow_id: 1,
+        query: sql,
+        plan,
+        time_window_expr: None,
+        expire_after: None,
+        sink_table_name,
+        source_table_names: vec![qualified("numbers_with_ts")],
+        query_ctx,
+        catalog_manager,
+        shutdown_rx,
+        batch_opts: Arc::new(BatchingModeOptions::default()),
+        flow_eval_interval: None,
+    };
+    let task = BatchingTask::try_new(task_args).unwrap();
+
+    let message = task
+        .validate_sink_table_schema(&query_engine)
+        .await
+        .unwrap_err()
+        .to_string();
+
+    assert!(
+        message.contains("Flow output schema does not match sink table schema"),
+        "{message}"
+    );
+    assert!(message.contains("extra"), "{message}");
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_allow_partial_fills_nullable_columns() {
+    // The flow omits the nullable sink column `optional_value`; with partial matching requested
+    // (the final `true` argument, per this test's name) the plan is expected to fill it in as
+    // `NULL AS optional_value`.
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), false),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
+    ]));
+
+    let plan = gen_plan_with_matching_schema(
+        "SELECT number, ts FROM numbers_with_ts",
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[0],
+        true,
+    )
+    .await
+    .unwrap();
+    let rendered = df_plan_to_sql(&plan).unwrap();
+
+    let expected =
+        "SELECT numbers_with_ts.number, numbers_with_ts.ts, NULL AS optional_value FROM numbers_with_ts";
+    assert_eq!(expected, rendered);
+}
+
 #[tokio::test]
 async fn test_find_group_by_exprs() {
    let testcases = vec![
@@ -1491,3 +1869,118 @@ async fn test_analyze_incremental_aggregate_plan_rejects_cast_wrapped_alias() {
        );
    }
 }
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_last_non_null_rejects_missing_primary_key_column() {
+    // Sink table: primary key `number` (primary_key_indices = [0]), time index `ts`, nullable
+    // `optional_value`, matched on the merge_mode=last_non_null path (final `true` argument).
+    // The flow query only selects `ts`, so the required primary-key column `number` is absent
+    // from the flow output.
+    let sql = "SELECT ts FROM numbers_with_ts";
+
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
+    ]));
+
+    let message = gen_plan_with_matching_schema(
+        sql,
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[0],
+        true,
+    )
+    .await
+    .unwrap_err()
+    .to_string();
+
+    // The error must state the violated requirement and mention the missing column.
+    let expected =
+        "required by sink table are missing from flow output when merge_mode=last_non_null";
+    assert!(message.contains(expected), "{message}");
+    assert!(message.contains("number"), "{message}");
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_last_non_null_rejects_missing_time_index_column() {
+    // Sink table: primary key `number` (primary_key_indices = [0]), time index `ts`, nullable
+    // `optional_value`, matched on the merge_mode=last_non_null path (final `true` argument).
+    // The flow query only selects `number`, so the required time-index column `ts` is absent
+    // from the flow output.
+    let sql = "SELECT number FROM numbers_with_ts";
+
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
+    ]));
+
+    let message = gen_plan_with_matching_schema(
+        sql,
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[0],
+        true,
+    )
+    .await
+    .unwrap_err()
+    .to_string();
+
+    // The error must state the violated requirement and mention the missing column.
+    let expected =
+        "required by sink table are missing from flow output when merge_mode=last_non_null";
+    assert!(message.contains(expected), "{message}");
+    assert!(message.contains("ts"), "{message}");
+}
+
+#[tokio::test]
+async fn test_gen_plan_with_matching_schema_last_non_null_rejects_extra_flow_column() {
+    // Sink table on the merge_mode=last_non_null path with three columns: `number` (pk),
+    // `ts` (time index) and the nullable `optional_value`. The flow outputs `number`, `extra`
+    // and `ts`, so `extra` has no sink column. Query and table both have three columns, which
+    // passes the column-count check and reaches the extra-column branch.
+    let sql = "SELECT number, number AS extra, ts FROM numbers_with_ts";
+
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", ConcreteDataType::uint32_datatype(), true),
+        ColumnSchema::new(
+            "ts",
+            ConcreteDataType::timestamp_millisecond_datatype(),
+            false,
+        )
+        .with_time_index(true),
+        ColumnSchema::new("optional_value", ConcreteDataType::uint32_datatype(), true),
+    ]));
+
+    let message = gen_plan_with_matching_schema(
+        sql,
+        QueryContext::arc(),
+        create_test_query_engine(),
+        sink_schema,
+        &[0],
+        true,
+    )
+    .await
+    .unwrap_err()
+    .to_string();
+
+    assert!(message.contains("extra column(s)"), "{message}");
+    assert!(message.contains("extra"), "{message}");
+    assert!(
+        message.contains("Flow output schema does not match sink table schema"),
+        "{message}"
+    );
+}
--- a/src/frontend/src/server.rs
+++ b/src/frontend/src/server.rs
@@ -288,7 +288,6 @@ where

        let http_server = builder
            .with_metrics_handler(MetricsHandler)
-            .with_plugins(self.plugins.clone())
            .with_greptime_config_options(toml)
            .build();
        Ok(http_server)
--- a/src/meta-client/src/client.rs
+++ b/src/meta-client/src/client.rs
@@ -1344,7 +1344,7 @@ mod tests {

        // Generates rough 10MB data, which is larger than the default grpc message size limit.
        for i in 0..10 {
-            let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random()).collect();
+            let data: Vec<u8> = (0..1024 * 1024).map(|_| rng.random::<u8>()).collect();
            in_memory
                .put(
                    PutRequest::new()
--- a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs
@@ -18,7 +18,9 @@ use std::ops::Div;
 use api::v1::meta::MailboxMessage;
 use common_meta::RegionIdent;
 use common_meta::distributed_time_constants::default_distributed_time_constants;
-use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
+use common_meta::instruction::{
+    Instruction, InstructionReply, OpenRegion, OpenRegionReason, SimpleReply,
+};
 use common_meta::key::datanode_table::RegionInfo;
 use common_procedure::{Context as ProcedureContext, Status};
 use common_telemetry::info;
@@ -26,12 +28,13 @@ use common_telemetry::tracing_context::TracingContext;
 use serde::{Deserialize, Serialize};
 use snafu::{OptionExt, ResultExt};
 use store_api::region_engine::RegionRole;
+use store_api::region_request::RegionRequirements;
 use tokio::time::Instant;

 use crate::error::{self, Result};
 use crate::handler::HeartbeatMailbox;
 use crate::procedure::region_migration::flush_leader_region::PreFlushRegion;
-use crate::procedure::region_migration::{Context, State};
+use crate::procedure::region_migration::{Context, RegionMigrationTriggerReason, State};
 use crate::service::mailbox::Channel;

 #[derive(Debug, Serialize, Deserialize)]
@@ -67,6 +70,10 @@ impl OpenCandidateRegion {
        let region_ids = ctx.persistent_ctx.region_ids.clone();
        let from_peer_id = ctx.persistent_ctx.from_peer.id;
        let to_peer_id = ctx.persistent_ctx.to_peer.id;
+        let reason = match ctx.persistent_ctx.trigger_reason {
+            RegionMigrationTriggerReason::Failover => OpenRegionReason::RegionFailover,
+            _ => OpenRegionReason::RegionMigration,
+        };
        let datanode_table_values = ctx.get_from_peer_datanode_table_values().await?;
        let mut open_regions = Vec::with_capacity(region_ids.len());

@@ -97,6 +104,8 @@ impl OpenCandidateRegion {
                region_options,
                region_wal_options,
                true,
+                Some(reason),
+                RegionRequirements::object_storage(),
            ));
        }

@@ -233,18 +242,20 @@ mod tests {
    }

    fn new_mock_open_instruction(datanode_id: DatanodeId, region_id: RegionId) -> Instruction {
+        // Wraps one mock MITO2 `OpenRegion` in `Instruction::OpenRegions`, tagged with the
+        // `RegionMigration` reason and the object storage requirements.
-        Instruction::OpenRegions(vec![OpenRegion {
-            region_ident: RegionIdent {
+        Instruction::OpenRegions(vec![OpenRegion::new(
+            RegionIdent {
                datanode_id,
                table_id: region_id.table_id(),
                region_number: region_id.region_number(),
                engine: MITO2_ENGINE.to_string(),
            },
-            region_storage_path: "/bar/foo/region/".to_string(),
-            region_options: Default::default(),
-            region_wal_options: Default::default(),
-            skip_wal_replay: true,
-        }])
+            "/bar/foo/region/",
+            Default::default(),
+            Default::default(),
+            true,
+            Some(OpenRegionReason::RegionMigration),
+            RegionRequirements::object_storage(),
+        )])
    }

    #[tokio::test]
@@ -263,6 +274,57 @@ mod tests {
        assert!(!err.is_retryable());
    }

+    /// Checks that the built open instruction carries the reason derived from
+    /// the migration trigger and requires object storage in both cases.
+    #[tokio::test]
+    async fn test_build_open_region_instruction_reason() {
+        let state = OpenCandidateRegion;
+        let mut persistent_context = new_persistent_context();
+        let region_id = persistent_context.region_ids[0];
+        let leader = Peer::empty(persistent_context.from_peer.id);
+        let env = TestingEnv::new();
+
+        // Register a physical table whose single region is led by `from_peer`.
+        let routes = vec![RegionRoute {
+            region: Region::new_test(region_id),
+            leader_peer: Some(leader),
+            ..Default::default()
+        }];
+        env.table_metadata_manager()
+            .create_table_metadata(
+                new_test_table_info(1024),
+                TableRouteValue::physical(routes),
+                HashMap::default(),
+            )
+            .await
+            .unwrap();
+
+        // The untouched persistent context must yield the migration reason.
+        let mut ctx = env
+            .context_factory()
+            .new_context(persistent_context.clone());
+        let opened = state
+            .build_open_region_instruction(&mut ctx)
+            .await
+            .unwrap()
+            .into_open_regions()
+            .unwrap();
+        assert_eq!(Some(OpenRegionReason::RegionMigration), opened[0].reason);
+        assert_eq!(RegionRequirements::object_storage(), opened[0].requirements);
+
+        // A failover trigger must be reported as a region failover.
+        persistent_context.trigger_reason = RegionMigrationTriggerReason::Failover;
+        let mut ctx = env.context_factory().new_context(persistent_context);
+        let opened = state
+            .build_open_region_instruction(&mut ctx)
+            .await
+            .unwrap()
+            .into_open_regions()
+            .unwrap();
+        assert_eq!(Some(OpenRegionReason::RegionFailover), opened[0].reason);
+        assert_eq!(RegionRequirements::object_storage(), opened[0].requirements);
+    }
+
    #[tokio::test]
    async fn test_datanode_is_unreachable() {
        let state = OpenCandidateRegion;
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -620,6 +620,7 @@ mod test {
            options: physical_region_option,
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        };
        engine
            .handle_request(physical_region_id, RegionRequest::Open(open_request))
@@ -644,6 +645,7 @@ mod test {
            options: HashMap::new(),
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        };
        engine
            .handle_request(
@@ -721,6 +723,7 @@ mod test {
            options: physical_region_option,
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        };
        // Opening an already opened region should succeed.
        // Since the region is already open, no metadata recovery operations will be performed.
@@ -749,6 +752,7 @@ mod test {
            options: physical_region_option,
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        };
        let err = metric_engine
            .handle_request(physical_region_id, RegionRequest::Open(open_request))
@@ -854,6 +858,7 @@ mod test {
                        options: options.clone(),
                        skip_wal_replay: true,
                        checkpoint: None,
+                        requirements: Default::default(),
                    },
                )
            })
--- a/src/metric-engine/src/engine/open.rs
+++ b/src/metric-engine/src/engine/open.rs
@@ -222,6 +222,7 @@ impl MetricEngineInner {
                entry_id: checkpoint.metadata_entry_id.unwrap_or_default(),
                metadata_entry_id: None,
            }),
+            requirements: request.requirements,
        };

        let mut data_region_options = request.options;
@@ -239,6 +240,7 @@ impl MetricEngineInner {
                entry_id: checkpoint.entry_id,
                metadata_entry_id: None,
            }),
+            requirements: request.requirements,
        };

        (open_metadata_region_request, open_data_region_request)
--- a/src/metric-engine/src/engine/sync/region.rs
+++ b/src/metric-engine/src/engine/sync/region.rs
@@ -321,6 +321,7 @@ mod tests {
                    options: physical_region_option,
                    skip_wal_replay: false,
                    checkpoint: None,
+                    requirements: Default::default(),
                }),
            )
            .await
--- a/src/metric-engine/src/test_util.rs
+++ b/src/metric-engine/src/test_util.rs
@@ -144,6 +144,7 @@ impl TestEnv {
                    options: physical_region_option,
                    skip_wal_replay: true,
                    checkpoint: None,
+                    requirements: Default::default(),
                }),
            )
            .await
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 default = []
 test = ["common-test-util", "rstest", "rstest_reuse", "rskafka"]
 testing = ["test"]
+test-shared-fs-region-migration = []
 enterprise = []
 vector_index = ["dep:roaring", "index/vector_index"]

--- a/src/mito2/src/engine/alter_test.rs
+++ b/src/mito2/src/engine/alter_test.rs
@@ -277,6 +277,7 @@ async fn test_alter_region_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -481,6 +482,7 @@ async fn test_put_after_alter_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -844,6 +846,7 @@ async fn test_alter_column_fulltext_options_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -979,6 +982,7 @@ async fn test_alter_column_set_inverted_index_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1248,6 +1252,7 @@ async fn test_alter_region_sst_format_with_flush() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1366,6 +1371,7 @@ async fn test_alter_region_sst_format_without_flush() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1492,6 +1498,7 @@ async fn test_alter_region_sst_format_flat_to_pk_with_flush() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1610,6 +1617,7 @@ async fn test_alter_region_sst_format_flat_to_pk_without_flush() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1725,6 +1733,7 @@ async fn test_alter_region_append_mode_with_flush() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1843,6 +1852,7 @@ async fn test_alter_region_append_mode_without_flush() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/append_mode_test.rs
+++ b/src/mito2/src/engine/append_mode_test.rs
@@ -348,6 +348,7 @@ async fn test_alter_append_mode_clears_merge_mode_with_format(flat_format: bool)
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/basic_test.rs
+++ b/src/mito2/src/engine/basic_test.rs
@@ -196,6 +196,7 @@ async fn test_region_replay_with_format(factory: Option<LogStoreFactory>, flat_f
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/batch_catchup_test.rs
+++ b/src/mito2/src/engine/batch_catchup_test.rs
@@ -160,6 +160,7 @@ async fn test_batch_catchup_with_format(factory: Option<LogStoreFactory>, flat_f
                    skip_wal_replay: true,
                    path_type: PathType::Bare,
                    checkpoint: None,
+                    requirements: Default::default(),
                },
            )
        })
--- a/src/mito2/src/engine/batch_open_test.rs
+++ b/src/mito2/src/engine/batch_open_test.rs
@@ -136,6 +136,7 @@ async fn test_batch_open_with_format(factory: Option<LogStoreFactory>, flat_form
                    skip_wal_replay: false,
                    path_type: PathType::Bare,
                    checkpoint: None,
+                    requirements: Default::default(),
                },
            )
        })
@@ -149,6 +150,7 @@ async fn test_batch_open_with_format(factory: Option<LogStoreFactory>, flat_form
            skip_wal_replay: false,
            path_type: PathType::Bare,
            checkpoint: None,
+            requirements: Default::default(),
        },
    ));

@@ -221,6 +223,7 @@ async fn test_batch_open_err_with_format(factory: Option<LogStoreFactory>, flat_
                    skip_wal_replay: false,
                    path_type: PathType::Bare,
                    checkpoint: None,
+                    requirements: Default::default(),
                },
            )
        })
--- a/src/mito2/src/engine/bump_committed_sequence_test.rs
+++ b/src/mito2/src/engine/bump_committed_sequence_test.rs
@@ -112,6 +112,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -151,6 +152,7 @@ async fn test_bump_committed_sequence_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/catchup_test.rs
+++ b/src/mito2/src/engine/catchup_test.rs
@@ -97,6 +97,7 @@ async fn test_catchup_with_last_entry_id(factory: Option<LogStoreFactory>) {
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -218,6 +219,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option<LogStoreFacto
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -321,6 +323,7 @@ async fn test_catchup_without_last_entry_id(factory: Option<LogStoreFactory>) {
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -423,6 +426,7 @@ async fn test_catchup_with_manifest_update(factory: Option<LogStoreFactory>) {
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -527,6 +531,7 @@ async fn open_region(
                skip_wal_replay,
                path_type: PathType::Bare,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -622,6 +627,7 @@ async fn test_local_catchup(factory: Option<LogStoreFactory>) {
                skip_wal_replay: true,
                path_type: PathType::Bare,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/compaction_test.rs
+++ b/src/mito2/src/engine/compaction_test.rs
@@ -1023,6 +1023,7 @@ async fn test_change_region_compaction_window_with_format(flat_format: bool) {
                options: Default::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -1125,6 +1126,7 @@ async fn test_open_overwrite_compaction_window_with_format(flat_format: bool) {
                options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/open_test.rs
+++ b/src/mito2/src/engine/open_test.rs
@@ -64,6 +64,7 @@ async fn test_engine_open_empty_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -110,6 +111,7 @@ async fn test_engine_open_existing_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -237,6 +239,7 @@ async fn test_engine_region_open_with_options_with_format(flat_format: bool) {
                options: HashMap::from([("ttl".to_string(), "4d".to_string())]),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -297,6 +300,7 @@ async fn test_engine_region_open_with_custom_store_with_format(flat_format: bool
                options: HashMap::from([("storage".to_string(), "Gcs".to_string())]),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -392,6 +396,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) {
                options: Default::default(),
                skip_wal_replay: true,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -431,6 +436,7 @@ async fn test_open_region_skip_wal_replay_with_format(flat_format: bool) {
                options: Default::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -484,6 +490,7 @@ async fn test_open_region_wait_for_opening_region_ok_with_format(flat_format: bo
                    options: HashMap::default(),
                    skip_wal_replay: false,
                    checkpoint: None,
+                    requirements: Default::default(),
                }),
            )
            .await
@@ -535,6 +542,7 @@ async fn test_open_region_wait_for_opening_region_err_with_format(flat_format: b
                    options: HashMap::default(),
                    skip_wal_replay: false,
                    checkpoint: None,
+                    requirements: Default::default(),
                }),
            )
            .await
@@ -691,6 +699,7 @@ async fn test_open_backfills_partition_expr_with_fetcher() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -725,6 +734,7 @@ async fn test_open_backfills_partition_expr_with_fetcher() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -766,6 +776,7 @@ async fn test_open_keeps_none_without_fetcher() {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/parallel_test.rs
+++ b/src/mito2/src/engine/parallel_test.rs
@@ -52,6 +52,7 @@ async fn scan_in_parallel(
                skip_wal_replay: false,
                path_type: PathType::Bare,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/skip_wal_test.rs
+++ b/src/mito2/src/engine/skip_wal_test.rs
@@ -87,6 +87,7 @@ async fn test_close_region_skip_wal(insert: bool) {
                options: request.options.clone(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -154,6 +155,7 @@ async fn test_close_follower_region_skip_wal() {
                options: request.options.clone(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -271,6 +273,7 @@ async fn test_close_region_after_truncate_skip_wal() {
                options: request.options,
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/sync_test.rs
+++ b/src/mito2/src/engine/sync_test.rs
@@ -127,6 +127,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
                // Ensure the region is not replayed from the WAL.
                skip_wal_replay: true,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -239,6 +240,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) {
                // Ensure the region is not replayed from the WAL.
                skip_wal_replay: true,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/engine/truncate_test.rs
+++ b/src/mito2/src/engine/truncate_test.rs
@@ -323,6 +323,7 @@ async fn test_engine_truncate_reopen_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
@@ -447,6 +448,7 @@ async fn test_engine_truncate_during_flush_with_format(flat_format: bool) {
                options: HashMap::default(),
                skip_wal_replay: false,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -916,6 +916,20 @@ pub enum Error {
        source: Arc<Error>,
    },

+    #[snafu(display(
+        "Region {} does not satisfy open requirement '{}': {}",
+        region_id,
+        requirement,
+        reason
+    ))]
+    OpenRegionRequirement {
+        region_id: RegionId,
+        requirement: &'static str,
+        reason: &'static str,
+        #[snafu(implicit)]
+        location: Location,
+    },
+
    #[snafu(display("Failed to parse job id"))]
    ParseJobId {
        #[snafu(implicit)]
@@ -1376,6 +1390,7 @@ impl ErrorExt for Error {
            PrimaryKeyLengthMismatch { .. } => StatusCode::InvalidArguments,
            InvalidSender { .. } => StatusCode::InvalidArguments,
            InvalidSchedulerState { .. } => StatusCode::InvalidArguments,
+            OpenRegionRequirement { .. } => StatusCode::InvalidArguments,
            DeleteSsts { .. } | DeleteIndex { .. } | DeleteIndexes { .. } => {
                StatusCode::StorageUnavailable
            }
--- a/src/mito2/src/region/opener.rs
+++ b/src/mito2/src/region/opener.rs
@@ -27,8 +27,9 @@ use futures::future::BoxFuture;
 use log_store::kafka::log_store::KafkaLogStore;
 use log_store::noop::log_store::NoopLogStore;
 use log_store::raft_engine::log_store::RaftEngineLogStore;
+use object_store::ObjectStore;
 use object_store::manager::ObjectStoreManagerRef;
-use object_store::util::normalize_dir;
+use object_store::util::{is_object_storage, normalize_dir};
 use snafu::{OptionExt, ResultExt, ensure};
 use store_api::logstore::LogStore;
 use store_api::logstore::provider::Provider;
@@ -36,7 +37,7 @@ use store_api::metadata::{
    ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
 };
 use store_api::region_engine::RegionRole;
-use store_api::region_request::PathType;
+use store_api::region_request::{PathType, RegionRequirements};
 use store_api::storage::{ColumnId, RegionId};
 use tokio::sync::Semaphore;

@@ -46,8 +47,8 @@ use crate::cache::file_cache::{FileCache, FileType, IndexKey};
 use crate::config::MitoConfig;
 use crate::error;
 use crate::error::{
-    EmptyRegionDirSnafu, InvalidMetadataSnafu, ObjectStoreNotFoundSnafu, RegionCorruptedSnafu,
-    Result, StaleLogEntrySnafu,
+    EmptyRegionDirSnafu, InvalidMetadataSnafu, InvalidRegionOptionsSnafu, ObjectStoreNotFoundSnafu,
+    RegionCorruptedSnafu, Result, StaleLogEntrySnafu,
 };
 use crate::manifest::action::RegionManifest;
 use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
@@ -206,6 +207,29 @@ impl RegionOpener {
        Ok(self)
    }

+    /// Ensures the current region open request satisfies its requirements.
+    ///
+    /// Only the `object_storage` requirement is inspected. When it is set, the
+    /// object store resolved from the parsed region options must pass
+    /// `supports_open_region_object_storage_requirement`.
+    ///
+    /// # Errors
+    ///
+    /// - `InvalidRegionOptions` if `self.options` is `None`.
+    /// - Whatever `get_object_store` returns when the store cannot be resolved.
+    /// - `OpenRegionRequirement` if the resolved store fails the check.
+    pub(crate) fn ensure_open_requirements(&self, requirements: RegionRequirements) -> Result<()> {
+        if !requirements.object_storage {
+            return Ok(());
+        }
+
+        // Build the error context lazily so the success path does not allocate
+        // the reason string.
+        let options = self
+            .options
+            .as_ref()
+            .with_context(|| InvalidRegionOptionsSnafu {
+                reason: "missing region options before requirement check".to_string(),
+            })?;
+        let object_store = get_object_store(&options.storage, &self.object_store_manager)?;
+
+        ensure!(
+            supports_open_region_object_storage_requirement(&object_store),
+            error::OpenRegionRequirementSnafu {
+                region_id: self.region_id,
+                requirement: "object storage",
+                reason: "region data must be accessible from another datanode",
+            }
+        );
+
+        Ok(())
+    }
+
    /// Sets the cache manager for the region.
    pub(crate) fn cache(mut self, cache_manager: Option<CacheManagerRef>) -> Self {
        self.cache_manager = cache_manager;
@@ -597,6 +621,21 @@ impl RegionOpener {
    }
 }

+/// Returns whether `object_store` satisfies the object storage open requirement.
+///
+/// Without the `test-shared-fs-region-migration` feature this is exactly
+/// `is_object_storage`. With the feature enabled, stores whose scheme equals
+/// `FS_SCHEME` are accepted as well: integration tests can configure multiple
+/// datanodes to share the same temporary home dir, which makes file storage
+/// accessible to all test datanodes. Production file storage still does not
+/// satisfy this requirement.
+fn supports_open_region_object_storage_requirement(object_store: &ObjectStore) -> bool {
+    if is_object_storage(object_store) {
+        return true;
+    }
+
+    cfg!(feature = "test-shared-fs-region-migration")
+        && object_store.info().scheme() == object_store::services::FS_SCHEME
+}
+
 /// Creates a version builder from a region manifest.
 pub(crate) fn version_builder_from_manifest(
    manifest: &RegionManifest,
@@ -1172,14 +1211,17 @@ mod tests {
    use datatypes::arrow::array::{ArrayRef, BinaryArray, Int64Array};
    use datatypes::arrow::record_batch::RecordBatch;
    use object_store::ObjectStore;
-    use object_store::services::{Fs, Memory};
+    use object_store::services::{Fs, Memory, S3};
    use parquet::arrow::ArrowWriter;
    use parquet::file::metadata::KeyValue;
    use parquet::file::properties::WriterProperties;
    use store_api::region_request::PathType;
    use store_api::storage::{FileId, RegionId};

-    use super::{preload_parquet_meta_cache_for_files, sanitize_region_options};
+    use super::{
+        preload_parquet_meta_cache_for_files, sanitize_region_options,
+        supports_open_region_object_storage_requirement,
+    };
    use crate::cache::CacheManager;
    use crate::cache::file_cache::{FileType, IndexKey};
    use crate::manifest::action::{RegionManifest, RemovedFilesRecord};
@@ -1207,6 +1249,48 @@ mod tests {
        }
    }

+    /// Builds an `ObjectStore` on the `Fs` service rooted at `/tmp`.
+    fn build_fs_object_store() -> ObjectStore {
+        let fs_builder = Fs::default().root("/tmp");
+        ObjectStore::new(fs_builder).unwrap().finish()
+    }
+
+    #[test]
+    #[cfg(not(feature = "test-shared-fs-region-migration"))]
+    fn test_open_requirement_rejects_fs_object_store() {
+        // Without the test-only feature, an `Fs` store must not satisfy the requirement.
+        let fs_store = build_fs_object_store();
+        let accepted = supports_open_region_object_storage_requirement(&fs_store);
+
+        assert!(!accepted);
+    }
+
+    #[test]
+    #[cfg(feature = "test-shared-fs-region-migration")]
+    fn test_open_requirement_accepts_shared_fs_object_store_for_tests() {
+        // With the test-only feature, an `Fs` store is accepted.
+        let fs_store = build_fs_object_store();
+        let accepted = supports_open_region_object_storage_requirement(&fs_store);
+
+        assert!(accepted);
+    }
+
+    #[test]
+    fn test_open_requirement_accepts_s3_object_store() {
+        // An S3-backed store must satisfy the requirement in every feature configuration.
+        let s3_builder = S3::default()
+            .bucket("test-bucket")
+            .region("us-east-1")
+            .disable_ec2_metadata();
+        let s3_store = ObjectStore::new(s3_builder).unwrap().finish();
+        let accepted = supports_open_region_object_storage_requirement(&s3_store);
+
+        assert!(accepted);
+    }
+
    #[test]
    fn test_sanitize_region_options_options_format_wins() {
        // Manifest persisted PrimaryKey, but the re-parsed options now request Flat
--- a/src/mito2/src/test_util.rs
+++ b/src/mito2/src/test_util.rs
@@ -1307,6 +1307,7 @@ pub async fn reopen_region(
                skip_wal_replay: false,
                path_type: PathType::Bare,
                checkpoint: None,
+                requirements: Default::default(),
            }),
        )
        .await
--- a/src/mito2/src/worker/handle_open.rs
+++ b/src/mito2/src/worker/handle_open.rs
@@ -87,14 +87,11 @@ impl<S: LogStore> RegionWorkerLoop<S> {
        else {
            return;
        };
-        if let Err(err) = self.check_and_cleanup_region(region_id, &request).await {
-            sender.send(Err(err));
-            return;
-        }
        info!("Try to open region {}, worker: {}", region_id, self.id);
        sanitize_open_request_options(&mut request.options);

        // Open region from specific region dir.
+        let requirements = request.requirements;
        let opener = match RegionOpener::new(
            region_id,
            &request.table_dir,
@@ -112,7 +109,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
        .cache(Some(self.cache_manager.clone()))
        .wal_entry_reader(wal_entry_receiver.map(|receiver| Box::new(receiver) as _))
        .replay_checkpoint(request.checkpoint.map(|checkpoint| checkpoint.entry_id))
-        .parse_options(request.options)
+        .parse_options(request.options.clone())
        {
            Ok(opener) => opener,
            Err(err) => {
@@ -121,6 +118,16 @@ impl<S: LogStore> RegionWorkerLoop<S> {
            }
        };

+        if let Err(err) = opener.ensure_open_requirements(requirements) {
+            sender.send(Err(err));
+            return;
+        }
+
+        if let Err(err) = self.check_and_cleanup_region(region_id, &request).await {
+            sender.send(Err(err));
+            return;
+        }
+
        let now = Instant::now();
        let regions = self.regions.clone();
        let wal = self.wal.clone();
--- a/src/object-store/src/util.rs
+++ b/src/object-store/src/util.rs
@@ -22,11 +22,17 @@ use opendal::layers::{
    LoggingInterceptor, LoggingLayer, RetryEvent, RetryInterceptor, RetryLayer, TracingLayer,
 };
 use opendal::raw::{AccessorInfo, HttpClient, Operation};
+use opendal::services::FS_SCHEME;
 use snafu::ResultExt;

 use crate::config::HttpClientConfig;
 use crate::{ObjectStore, error};

+/// Tells whether `object_store` counts as object storage.
+///
+/// Any store whose scheme differs from `FS_SCHEME` (the local filesystem
+/// service) is treated as object storage.
+pub fn is_object_storage(object_store: &ObjectStore) -> bool {
+    let info = object_store.info();
+    info.scheme() != FS_SCHEME
+}
+
 /// Join two paths and normalize the output dir.
 ///
 /// The output dir is always ends with `/`. e.g.
@@ -249,7 +255,11 @@ impl RetryInterceptor for PrintDetailedError {

 #[cfg(test)]
 mod tests {
+    use opendal::services::Fs;
+
    use super::*;
+    use crate::ObjectStore;
+    use crate::util::is_object_storage;

    #[test]
    fn test_normalize_dir() {
@@ -289,4 +299,14 @@ mod tests {
        assert_eq!("/abc", join_path("//", "/abc"));
        assert_eq!("abc/def", join_path("abc/", "//def"));
    }
+
+    #[test]
+    fn test_fs_is_not_object_storage() {
+        let fs_builder = Fs::default().root("/tmp");
+        let fs_store = ObjectStore::new(fs_builder).unwrap().finish();
+
+        // Pin the reported scheme first, then the classification derived from it.
+        assert_eq!(FS_SCHEME, fs_store.info().scheme());
+        assert!(!is_object_storage(&fs_store));
+    }
 }
--- a/src/operator/src/statement/copy_table_from.rs
+++ b/src/operator/src/statement/copy_table_from.rs
@@ -15,11 +15,15 @@
 use std::collections::HashMap;
 use std::future::Future;
 use std::path::Path;
+use std::pin::Pin;
 use std::sync::Arc;
+use std::task::{Context, Poll};

 use client::{Output, OutputData, OutputMeta};
 use common_base::readable_size::ReadableSize;
-use common_datasource::file_format::csv::CsvFormat;
+use common_datasource::file_format::csv::{
+    CsvFormat, is_skippable_arrow_error, tolerant_csv_stream,
+};
 use common_datasource::file_format::json::JsonFormat;
 use common_datasource::file_format::orc::{ReaderAdapter, infer_orc_schema, new_orc_stream_reader};
 use common_datasource::file_format::{FileFormat, Format, file_to_stream};
@@ -33,10 +37,13 @@ use common_telemetry::{debug, tracing};
 use datafusion::datasource::physical_plan::{CsvSource, FileSource, JsonSource};
 use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder;
 use datafusion::parquet::arrow::arrow_reader::ArrowReaderMetadata;
+use datafusion_common::DataFusionError;
+use datafusion_common::arrow::error::ArrowError;
 use datafusion_common::config::CsvOptions;
 use datafusion_expr::Expr;
 use datatypes::arrow::compute::can_cast_types;
 use datatypes::arrow::datatypes::{DataType as ArrowDataType, Schema, SchemaRef};
+use datatypes::arrow::record_batch::RecordBatch;
 use datatypes::vectors::Helper;
 use futures_util::StreamExt;
 use object_store::{Entry, EntryMode, ObjectStore};
@@ -221,23 +228,42 @@ impl StatementExecutor {
                let csv_source = CsvSource::new(schema.clone())
                    .with_csv_options(options)
                    .with_batch_size(DEFAULT_BATCH_SIZE);
-                let stream = file_to_stream(
-                    object_store,
-                    path,
-                    csv_source,
-                    Some(projection),
-                    format.compression_type,
-                )
-                .await
-                .context(error::BuildFileStreamSnafu)?;
+                let stream = if format.skip_bad_records {
+                    let reader_schema =
+                        csv_reader_schema_for_skip_bad_records(schema, &compat_schema);
+                    tolerant_csv_stream(
+                        object_store,
+                        path,
+                        Arc::new(reader_schema),
+                        projection.clone(),
+                        format,
+                    )
+                    .await
+                    .context(error::BuildFileStreamSnafu)?
+                } else {
+                    file_to_stream(
+                        object_store,
+                        path,
+                        csv_source,
+                        Some(projection),
+                        format.compression_type,
+                    )
+                    .await
+                    .context(error::BuildFileStreamSnafu)?
+                };

-                Ok(Box::pin(
+                let stream = Box::pin(
                    // The projection is already applied in the CSV reader when we created the stream,
                    // so we pass None here to avoid double projection which would cause schema mismatch errors.
                    RecordBatchStreamTypeAdapter::new(output_schema, stream, None)
                        .with_filter(filters)
                        .context(error::PhysicalExprSnafu)?,
-                ))
+                );
+                if format.skip_bad_records {
+                    Ok(Box::pin(SkipBadRecordsStream::new(stream, path)))
+                } else {
+                    Ok(stream)
+                }
            }
            FileMetadata::Json {
                path,
@@ -469,6 +495,58 @@ fn gen_insert_output(rows_inserted: usize, insert_cost: usize) -> Output {
    )
 }

+/// Stream adapter that drops skippable record errors yielded by `inner`.
+///
+/// `path` identifies the source file in the warning logged for each dropped error.
+struct SkipBadRecordsStream {
+    inner: DfSendableRecordBatchStream,
+    path: String,
+}
+
+impl SkipBadRecordsStream {
+    /// Wraps `inner` and stores `path` for use in log messages.
+    fn new(inner: DfSendableRecordBatchStream, path: impl Into<String>) -> Self {
+        Self {
+            inner,
+            path: path.into(),
+        }
+    }
+}
+
+impl datafusion::physical_plan::RecordBatchStream for SkipBadRecordsStream {
+    /// Returns the schema of the wrapped stream; this adapter does not change it.
+    fn schema(&self) -> SchemaRef {
+        self.inner.schema()
+    }
+}
+
+impl futures::Stream for SkipBadRecordsStream {
+    type Item = datafusion_common::Result<RecordBatch>;
+
+    /// Polls the wrapped stream, discarding every item that is a skippable record error.
+    ///
+    /// Each discarded error is logged as a warning together with the source path. Any
+    /// other poll result (a batch, a non-skippable error, end of stream, or `Pending`) is
+    /// returned unchanged.
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let stream = self.get_mut();
+        loop {
+            let polled = stream.inner.as_mut().poll_next(cx);
+            if let Poll::Ready(Some(Err(error))) = &polled {
+                if is_skippable_record_error(error) {
+                    common_telemetry::warn!(
+                        "Skipping bad record while copying from {}: {}",
+                        stream.path,
+                        error
+                    );
+                    // Poll again right away: the skipped error is not surfaced to the caller.
+                    continue;
+                }
+            }
+            return polled;
+        }
+    }
+}
+
+fn is_skippable_record_error(error: &DataFusionError) -> bool {
+    match error {
+        DataFusionError::ArrowError(error, _) => is_skippable_arrow_error(error),
+        DataFusionError::External(error) => error
+            .downcast_ref::<ArrowError>()
+            .is_some_and(is_skippable_arrow_error),
+        DataFusionError::Context(_, error) => is_skippable_record_error(error),
+        _ => false,
+    }
+}
+
 /// Executes all pending inserts all at once, drain pending requests and reset pending bytes.
 async fn batch_insert(
    pending: &mut Vec<impl Future<Output = Result<Output>>>,
@@ -498,6 +576,59 @@ fn can_cast_types_for_greptime(from: &ArrowDataType, to: &ArrowDataType) -> bool
    can_cast_types(from, to)
 }

+/// Builds the schema handed to the CSV reader when `skip_bad_records` is enabled.
+///
+/// Column order, column names and schema metadata come from `file`. A column takes the
+/// same-named field of `compat` when the CSV reader can parse that field's type, and keeps
+/// its `file` field otherwise (including when `compat` has no such column).
+fn csv_reader_schema_for_skip_bad_records(file: &SchemaRef, compat: &SchemaRef) -> Schema {
+    let mut fields = Vec::with_capacity(file.fields().len());
+    for file_field in file.fields().iter() {
+        let parseable_compat = compat
+            .fields()
+            .find(file_field.name())
+            .map(|(_, field)| field)
+            .filter(|field| can_csv_reader_parse_type(field.data_type()));
+
+        fields.push(match parseable_compat {
+            Some(compat_field) => compat_field.clone(),
+            None => file_field.clone(),
+        });
+    }
+
+    Schema::new_with_metadata(fields, file.metadata().clone())
+}
+
+/// Returns whether `data_type` is one this module lets the CSV reader parse directly.
+///
+/// Accepted are booleans, decimals, integers, floats, dates, times, timestamps, `Null`,
+/// `Utf8`, `Utf8View`, and dictionaries whose value type is exactly `Utf8`.
+fn can_csv_reader_parse_type(data_type: &ArrowDataType) -> bool {
+    match data_type {
+        ArrowDataType::Dictionary(_, value_type) => {
+            matches!(value_type.as_ref(), ArrowDataType::Utf8)
+        }
+        other => matches!(
+            other,
+            ArrowDataType::Boolean
+                | ArrowDataType::Decimal32(_, _)
+                | ArrowDataType::Decimal64(_, _)
+                | ArrowDataType::Decimal128(_, _)
+                | ArrowDataType::Decimal256(_, _)
+                | ArrowDataType::Int8
+                | ArrowDataType::Int16
+                | ArrowDataType::Int32
+                | ArrowDataType::Int64
+                | ArrowDataType::UInt8
+                | ArrowDataType::UInt16
+                | ArrowDataType::UInt32
+                | ArrowDataType::UInt64
+                | ArrowDataType::Float32
+                | ArrowDataType::Float64
+                | ArrowDataType::Date32
+                | ArrowDataType::Date64
+                | ArrowDataType::Time32(_)
+                | ArrowDataType::Time64(_)
+                | ArrowDataType::Timestamp(_, _)
+                | ArrowDataType::Null
+                | ArrowDataType::Utf8
+                | ArrowDataType::Utf8View
+        ),
+    }
+}
+
 fn ensure_schema_compatible(from: &SchemaRef, to: &SchemaRef) -> Result<()> {
    let not_match = from
        .fields
@@ -780,4 +911,31 @@ mod tests {
            assert_eq!(test.0.project(&fp).unwrap(), test.1.project(&tp).unwrap());
        }
    }
+
+    /// The reader schema takes the table-side type for parseable columns and keeps the
+    /// file-side type for columns the CSV reader cannot parse.
+    #[test]
+    fn test_csv_reader_schema_for_skip_bad_records() {
+        // Every column of the file schema is `Utf8`.
+        let file_schema = make_test_schema(&[
+            Field::new("id", DataType::Utf8, true),
+            Field::new("jsons", DataType::Utf8, true),
+            Field::new("ts", DataType::Utf8, true),
+        ]);
+        let compat_schema = make_test_schema(&[
+            Field::new("id", DataType::UInt32, true),
+            Field::new("jsons", DataType::Binary, true),
+            Field::new(
+                "ts",
+                DataType::Timestamp(datatypes::arrow::datatypes::TimeUnit::Millisecond, None),
+                true,
+            ),
+        ]);
+
+        let reader_schema = csv_reader_schema_for_skip_bad_records(&file_schema, &compat_schema);
+
+        // `UInt32` is parseable, so the compat type wins.
+        assert_eq!(reader_schema.field(0).data_type(), &DataType::UInt32);
+        // `Binary` is not in the parseable set, so the file-side `Utf8` is kept.
+        assert_eq!(reader_schema.field(1).data_type(), &DataType::Utf8);
+        // Timestamps are parseable, so the compat type wins again.
+        assert_eq!(
+            reader_schema.field(2).data_type(),
+            compat_schema.field(2).data_type()
+        );
+    }
 }
--- a/src/pipeline/benches/processor.rs
+++ b/src/pipeline/benches/processor.rs
@@ -233,6 +233,36 @@ transform:
    parse(&Content::Yaml(pipeline_yaml)).unwrap()
 }

+/// Builds the pipeline used by the "vrl processor" benchmark group.
+///
+/// The pipeline has a single `vrl` processor whose script copies `.service` and `.host`
+/// into alias fields, deletes `.unused`, and sets `.processed` to `true`; the transform
+/// section then declares the five output fields. Panics if the YAML fails to parse.
+fn prepare_vrl_pipeline() -> Pipeline {
+    let pipeline_yaml = r#"
+---
+description: Minimal VRL processor benchmark
+
+processors:
+  - vrl:
+      source: |
+        .service_alias = .service
+        .host_alias = .host
+        del(.unused)
+        .processed = true
+        .
+
+transform:
+  - field: service
+    type: string
+  - field: host
+    type: string
+  - field: service_alias
+    type: string
+  - field: host_alias
+    type: string
+  - field: processed
+    type: boolean
+"#;
+
+    parse(&Content::Yaml(pipeline_yaml)).unwrap()
+}
+
 fn criterion_benchmark(c: &mut Criterion) {
    let input_value_str = include_str!("./data.log");
    let input_value = Deserializer::from_str(input_value_str)
@@ -262,6 +292,41 @@ fn criterion_benchmark(c: &mut Criterion) {
        })
    });
    group.finish();
+
+    let vrl_input_value = (0..128)
+        .map(|i| {
+            serde_json::json!({
+                "service": "frontend",
+                "host": format!("host-{i}"),
+                "unused": "drop-me"
+            })
+            .into()
+        })
+        .collect::<Vec<VrlValue>>();
+    let vrl_pipeline = prepare_vrl_pipeline();
+
+    let (vrl_pipeline, mut vrl_schema_info, vrl_pipeline_def, vrl_pipeline_param) =
+        setup_pipeline!(vrl_pipeline);
+    let vrl_pipeline_ctx = PipelineContext::new(
+        &vrl_pipeline_def,
+        &vrl_pipeline_param,
+        session::context::Channel::Unknown,
+    );
+
+    let mut group = c.benchmark_group("vrl processor");
+    group.sample_size(50);
+    group.bench_function("processor mut", |b| {
+        b.iter(|| {
+            processor_mut(
+                black_box(vrl_pipeline.clone()),
+                black_box(&vrl_pipeline_ctx),
+                black_box(&mut vrl_schema_info),
+                black_box(vrl_input_value.clone()),
+            )
+            .unwrap();
+        })
+    });
+    group.finish();
 }

 // Testing the pipeline's performance in converting Json to Rows
--- a/src/pipeline/src/etl/processor/vrl_processor.rs
+++ b/src/pipeline/src/etl/processor/vrl_processor.rs
@@ -12,9 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+use std::cell::RefCell;
 use std::collections::BTreeMap;

 use chrono_tz::Tz;
+use once_cell::sync::Lazy;
 use snafu::{OptionExt, ensure};
 use vrl::compiler::runtime::Runtime;
 use vrl::compiler::{Program, TargetValue, compile};
@@ -31,6 +33,12 @@ use crate::etl::processor::yaml_string;
 pub(crate) const PROCESSOR_VRL: &str = "vrl";
 const SOURCE: &str = "source";

+/// UTC timezone handed to `Runtime::resolve`; constructed lazily on first access.
+static UTC_TIMEZONE: Lazy<TimeZone> = Lazy::new(|| TimeZone::Named(Tz::UTC));
+
+thread_local! {
+    // One VRL runtime per thread, reused across calls instead of building a new one each
+    // time. `RefCell` provides the mutable access that `resolve` and `clear` require.
+    static VRL_RUNTIME: RefCell<Runtime> = RefCell::new(Runtime::default());
+}
+
 #[derive(Debug)]
 pub struct VrlProcessor {
    source: String,
@@ -74,10 +82,14 @@ impl VrlProcessor {
            secrets: Secrets::default(),
        };

-        let timezone = TimeZone::Named(Tz::UTC);
-        let mut runtime = Runtime::default();
-        let re = runtime
-            .resolve(&mut target, &self.program, &timezone)
+        let re = VRL_RUNTIME
+            .with(|runtime| {
+                let mut runtime = runtime.borrow_mut();
+                runtime.clear();
+                let result = runtime.resolve(&mut target, &self.program, &UTC_TIMEZONE);
+                runtime.clear();
+                result
+            })
            .map_err(|e| {
                ExecuteVrlSnafu {
                    msg: e.get_expression_error().to_string(),
--- a/src/servers/src/configurator.rs
+++ b/src/servers/src/configurator.rs
@@ -14,25 +14,11 @@

 use std::sync::Arc;

-use axum::Router as HttpRouter;
 use common_error::ext::BoxedError;
 use tonic::transport::server::Router as GrpcRouter;

 use crate::grpc::builder::GrpcServerBuilder;

-/// A configurator that customizes or enhances an HTTP router.
-#[async_trait::async_trait]
-pub trait HttpConfigurator<C>: Send + Sync {
-    /// Configures the given HTTP router using the provided context.
-    async fn configure_http(
-        &self,
-        route: HttpRouter,
-        ctx: C,
-    ) -> std::result::Result<HttpRouter, BoxedError>;
-}
-
-pub type HttpConfiguratorRef<C> = Arc<dyn HttpConfigurator<C>>;
-
 /// A configurator that customizes or enhances a gRPC router.
 #[async_trait::async_trait]
 pub trait GrpcRouterConfigurator<C>: Send + Sync {
--- a/src/servers/src/grpc.rs
+++ b/src/servers/src/grpc.rs
@@ -24,7 +24,7 @@ pub mod prom_query_gateway;
 pub mod region_server;

 use std::any::Any;
-use std::net::SocketAddr;
+use std::net::{IpAddr, SocketAddr};
 use std::time::Duration;

 use api::v1::health_check_server::{HealthCheck, HealthCheckServer};
@@ -95,14 +95,8 @@ impl GrpcOptions {
        if self.server_addr.is_empty() {
            match local_ip_address::local_ip() {
                Ok(ip) => {
-                    let detected_addr = format!(
-                        "{}:{}",
-                        ip,
-                        self.bind_addr
-                            .split(':')
-                            .nth(1)
-                            .unwrap_or(DEFAULT_GRPC_ADDR_PORT)
-                    );
+                    let port = port_from_bind_addr(&self.bind_addr);
+                    let detected_addr = format_server_addr(ip, port);
                    info!("Using detected: {} as server address", detected_addr);
                    self.server_addr = detected_addr;
                }
@@ -131,7 +125,18 @@ impl GrpcOptions {
    }
 }

-const DEFAULT_GRPC_ADDR_PORT: &str = "4001";
+/// Port used for the detected server address when the bind address carries no usable port.
+const DEFAULT_GRPC_ADDR_PORT: u16 = 4001;
+
+/// Extracts the port from a `host:port` style bind address.
+///
+/// The text after the last `:` is parsed as a `u16`, which covers IPv4 (`127.0.0.1:3002`),
+/// bracketed IPv6 (`[::]:3002`) and host names (`example.local:3002`).
+/// [`DEFAULT_GRPC_ADDR_PORT`] is returned when there is no `:`, when the suffix is not a
+/// valid port, or when `bind_addr` is a bare IP literal without any port.
+fn port_from_bind_addr(bind_addr: &str) -> u16 {
+    // A bare IP literal carries no port. Without this guard the last hextet of an
+    // unbracketed IPv6 address such as `::1` would be mistaken for the port.
+    if bind_addr.parse::<IpAddr>().is_ok() {
+        return DEFAULT_GRPC_ADDR_PORT;
+    }
+
+    bind_addr
+        .rsplit_once(':')
+        .and_then(|(_, port)| port.parse().ok())
+        .unwrap_or(DEFAULT_GRPC_ADDR_PORT)
+}
+
+/// Renders `ip` and `port` as a socket address string; IPv6 addresses are bracketed.
+fn format_server_addr(ip: IpAddr, port: u16) -> String {
+    let socket_addr = SocketAddr::from((ip, port));
+    socket_addr.to_string()
+}

 const DEFAULT_INTERNAL_GRPC_ADDR_PORT: &str = "4010";

@@ -415,3 +420,36 @@ impl Server for GrpcServer {
        self
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
+
+    use super::{DEFAULT_GRPC_ADDR_PORT, format_server_addr, port_from_bind_addr};
+
+    /// The port is taken from after the last `:` for IPv4, bracketed IPv6 and host names;
+    /// an address without a parseable port yields the default.
+    #[test]
+    fn test_port_from_bind_addr() {
+        assert_eq!(3002, port_from_bind_addr("127.0.0.1:3002"));
+        assert_eq!(3002, port_from_bind_addr("[::]:3002"));
+        assert_eq!(
+            3002,
+            port_from_bind_addr("greptimedb-metasrv.default.svc.cluster.local:3002")
+        );
+        // No `:` at all, so the default port is used.
+        assert_eq!(
+            DEFAULT_GRPC_ADDR_PORT,
+            port_from_bind_addr("invalid-bind-addr")
+        );
+    }
+
+    /// IPv4 addresses render as `ip:port`, IPv6 addresses as `[ip]:port`.
+    #[test]
+    fn test_format_server_addr() {
+        assert_eq!(
+            "127.0.0.1:3002",
+            format_server_addr(IpAddr::V4(Ipv4Addr::LOCALHOST), 3002)
+        );
+        assert_eq!(
+            "[::1]:3002",
+            format_server_addr(IpAddr::V6(Ipv6Addr::LOCALHOST), 3002)
+        );
+    }
+}
--- a/src/servers/src/http.rs
+++ b/src/servers/src/http.rs
@@ -27,7 +27,6 @@ use axum::response::{IntoResponse, Response};
 use axum::routing::Route;
 use axum::serve::ListenerExt;
 use axum::{Router, middleware, routing};
-use common_base::Plugins;
 use common_base::readable_size::ReadableSize;
 use common_recordbatch::RecordBatch;
 use common_telemetry::{error, info};
@@ -52,11 +51,9 @@ use tower_http::trace::TraceLayer;

 use self::authorize::AuthState;
 use self::result::table_result::TableResponse;
-use crate::configurator::HttpConfiguratorRef;
 use crate::elasticsearch;
 use crate::error::{
-    AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu,
-    OtherSnafu, Result,
+    AddressBindSnafu, AlreadyStartedSnafu, Error, InternalIoSnafu, InvalidHeaderValueSnafu, Result,
 };
 use crate::http::influxdb::{influxdb_health, influxdb_ping, influxdb_write_v1, influxdb_write_v2};
 use crate::http::otlp::OtlpState;
@@ -139,9 +136,6 @@ pub struct HttpServer {
    user_provider: Option<UserProviderRef>,
    memory_limiter: ServerMemoryLimiter,

-    // plugins
-    plugins: Plugins,
-
    // server configs
    options: HttpOptions,
    bind_addr: Option<SocketAddr>,
@@ -516,7 +510,6 @@ pub struct DashboardState {

 pub struct HttpServerBuilder {
    options: HttpOptions,
-    plugins: Plugins,
    user_provider: Option<UserProviderRef>,
    router: Router,
    memory_limiter: ServerMemoryLimiter,
@@ -526,7 +519,6 @@ impl HttpServerBuilder {
    pub fn new(options: HttpOptions) -> Self {
        Self {
            options,
-            plugins: Plugins::default(),
            user_provider: None,
            router: Router::new(),
            memory_limiter: ServerMemoryLimiter::default(),
@@ -687,10 +679,6 @@ impl HttpServerBuilder {
        Self { router, ..self }
    }

-    pub fn with_plugins(self, plugins: Plugins) -> Self {
-        Self { plugins, ..self }
-    }
-
    pub fn with_greptime_config_options(self, opts: String) -> Self {
        let config_router = HttpServer::route_config(GreptimeOptionsConfigState {
            greptime_config_options: opts,
@@ -748,7 +736,6 @@ impl HttpServerBuilder {
            options: self.options,
            user_provider: self.user_provider,
            shutdown_tx: Mutex::new(None),
-            plugins: self.plugins,
            router: StdMutex::new(self.router),
            bind_addr: None,
            memory_limiter: self.memory_limiter,
@@ -1237,14 +1224,7 @@ impl Server for HttpServer {
                AlreadyStartedSnafu { server: "HTTP" }
            );

-            let mut app = self.make_app();
-            if let Some(configurator) = self.plugins.get::<HttpConfiguratorRef<()>>() {
-                app = configurator
-                    .configure_http(app, ())
-                    .await
-                    .context(OtherSnafu)?;
-            }
-            let app = self.build(app)?;
+            let app = self.build(self.make_app())?;
            let listener = tokio::net::TcpListener::bind(listening)
                .await
                .context(AddressBindSnafu { addr: listening })?
--- a/src/sql/src/parsers/copy_parser.rs
+++ b/src/sql/src/parsers/copy_parser.rs
@@ -401,6 +401,28 @@ mod tests {
        }
    }

+    /// `COPY ... FROM` keeps the `FORMAT` and `SKIP_BAD_RECORDS` options, and both are
+    /// retrievable from `with` through their lowercase keys.
+    #[test]
+    fn test_parse_copy_table_from_csv_options() {
+        let sql =
+            "COPY my_table FROM '/tmp/test.csv' WITH (FORMAT = 'CSV', SKIP_BAD_RECORDS = 'false')";
+        let mut result =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
+                .unwrap();
+        assert_eq!(1, result.len());
+
+        let statement = result.remove(0);
+        assert_matches!(statement, Statement::Copy { .. });
+        match statement {
+            Statement::Copy(crate::statements::copy::Copy::CopyTable(CopyTable::From(
+                copy_table,
+            ))) => {
+                // Option values are returned exactly as written in the statement.
+                assert_eq!(copy_table.with.get("format"), Some("CSV"));
+                assert_eq!(copy_table.with.get("skip_bad_records"), Some("false"));
+            }
+            _ => unreachable!(),
+        }
+    }
+
    #[test]
    fn test_parse_copy_table_to() {
        struct Test<'a> {
--- a/src/sql/src/util.rs
+++ b/src/sql/src/util.rs
@@ -27,7 +27,7 @@ use serde::Serialize;
 use snafu::ensure;
 use sqlparser::ast::{
    Array, Expr, Ident, ObjectName, ObjectNamePart, SetExpr, SqlOption, StructField, TableFactor,
-    Value, ValueWithSpan,
+    TableWithJoins, Value, ValueWithSpan,
 };
 use sqlparser_derive::{Visit, VisitMut};

@@ -195,7 +195,7 @@ pub fn extract_tables_from_query(query: &SqlOrTql) -> impl Iterator<Item = Objec

    match query {
        SqlOrTql::Sql(query, _) => {
-            extract_tables_from_set_expr(&query.inner.body, &mut names);
+            extract_tables_from_sql_query(&query.inner, &mut names);
            extract_tables_from_hybrid_cte_query(query, &mut names);
        }
        SqlOrTql::Tql(tql, _) => extract_tables_from_tql(tql, &mut names),
@@ -205,26 +205,34 @@ pub fn extract_tables_from_query(query: &SqlOrTql) -> impl Iterator<Item = Objec
 }

 fn extract_tables_from_hybrid_cte_query(query: &Query, sql_names: &mut HashSet<ObjectName>) {
-    let mut tql_names = HashSet::new();
-    let mut cte_names: HashSet<String> = HashSet::new();
    if let Some(hybrid_cte) = &query.hybrid_cte {
+        let mut cte_names: HashSet<String> = hybrid_cte
+            .cte_tables
+            .iter()
+            .map(|cte| ParserContext::canonicalize_identifier(cte.name.clone()).value)
+            .collect();
+        remove_cte_names(sql_names, &cte_names);
+
+        cte_names.clear();
        for cte in &hybrid_cte.cte_tables {
-            cte_names.insert(ParserContext::canonicalize_identifier(cte.name.clone()).value);
-            if let CteContent::Tql(tql) = &cte.content {
-                extract_tables_from_tql(tql, &mut tql_names);
+            let cte_name = ParserContext::canonicalize_identifier(cte.name.clone()).value;
+            let mut cte_query_names = HashSet::new();
+            match &cte.content {
+                CteContent::Sql(cte_query) => {
+                    extract_tables_from_sql_query(cte_query, &mut cte_query_names)
+                }
+                CteContent::Tql(tql) => extract_tables_from_tql(tql, &mut cte_query_names),
+            }
+            if hybrid_cte.recursive {
+                cte_names.insert(cte_name.clone());
+            }
+            remove_cte_names(&mut cte_query_names, &cte_names);
+            sql_names.extend(cte_query_names);
+            if !hybrid_cte.recursive {
+                cte_names.insert(cte_name);
            }
        }
    }
-
-    if let Some(with) = &query.inner.with {
-        for cte in &with.cte_tables {
-            cte_names.insert(ParserContext::canonicalize_identifier(cte.alias.name.clone()).value);
-        }
-    }
-
-    remove_cte_names(sql_names, &cte_names);
-
-    sql_names.extend(tql_names);
 }

 fn remove_cte_names(names: &mut HashSet<ObjectName>, cte_names: &HashSet<String>) {
@@ -339,6 +347,33 @@ pub fn location_to_index(sql: &str, location: &sqlparser::tokenizer::Location) -
    index - 1
 }

+/// Helper function for [extract_tables_from_query].
+///
+/// Handle [sqlparser::ast::Query]: collects the tables referenced by the `WITH` clause and
+/// by the query body into `names`, leaving out references that resolve to a CTE in scope.
+/// In a non-recursive `WITH`, a CTE only sees the CTEs declared before it; in a recursive
+/// one it also sees itself.
+fn extract_tables_from_sql_query(query: &sqlparser::ast::Query, names: &mut HashSet<ObjectName>) {
+    let mut scope = HashSet::new();
+    if let Some(with) = &query.with {
+        for cte in &with.cte_tables {
+            let mut referenced = HashSet::new();
+            extract_tables_from_sql_query(&cte.query, &mut referenced);
+
+            let alias = ParserContext::canonicalize_identifier(cte.alias.name.clone()).value;
+            if with.recursive {
+                // The CTE may refer to itself, so it enters the scope before filtering.
+                scope.insert(alias);
+                remove_cte_names(&mut referenced, &scope);
+            } else {
+                // The CTE's own name is not yet visible inside its definition.
+                remove_cte_names(&mut referenced, &scope);
+                scope.insert(alias);
+            }
+            names.extend(referenced);
+        }
+    }
+
+    // The body sees every CTE declared by this query.
+    let mut body_tables = HashSet::new();
+    extract_tables_from_set_expr(&query.body, &mut body_tables);
+    remove_cte_names(&mut body_tables, &scope);
+    names.extend(body_tables);
+}
+
 /// Helper function for [extract_tables_from_query].
 ///
 /// Handle [SetExpr].
@@ -346,14 +381,11 @@ fn extract_tables_from_set_expr(set_expr: &SetExpr, names: &mut HashSet<ObjectNa
    match set_expr {
        SetExpr::Select(select) => {
            for from in &select.from {
-                table_factor_to_object_name(&from.relation, names);
-                for join in &from.joins {
-                    table_factor_to_object_name(&join.relation, names);
-                }
+                extract_tables_from_table_with_joins(from, names);
            }
        }
        SetExpr::Query(query) => {
-            extract_tables_from_set_expr(&query.body, names);
+            extract_tables_from_sql_query(query, names);
        }
        SetExpr::SetOperation { left, right, .. } => {
            extract_tables_from_set_expr(left, names);
@@ -363,12 +395,47 @@ fn extract_tables_from_set_expr(set_expr: &SetExpr, names: &mut HashSet<ObjectNa
    };
 }

+/// Helper function for [extract_tables_from_query].
+///
+/// Handle [TableWithJoins]: visits the leading relation first, then the relation of every
+/// join in declaration order.
+fn extract_tables_from_table_with_joins(
+    table_with_joins: &TableWithJoins,
+    names: &mut HashSet<ObjectName>,
+) {
+    let joined = table_with_joins.joins.iter().map(|join| &join.relation);
+    for relation in std::iter::once(&table_with_joins.relation).chain(joined) {
+        table_factor_to_object_name(relation, names);
+    }
+}
+
 /// Helper function for [extract_tables_from_query].
 ///
 /// Handle [TableFactor].
 fn table_factor_to_object_name(table_factor: &TableFactor, names: &mut HashSet<ObjectName>) {
-    if let TableFactor::Table { name, .. } = table_factor {
-        names.insert(name.to_owned());
+    match table_factor {
+        TableFactor::Table { name, .. } => {
+            names.insert(name.to_owned());
+        }
+        TableFactor::Derived { subquery, .. } => {
+            extract_tables_from_sql_query(subquery, names);
+        }
+        TableFactor::NestedJoin {
+            table_with_joins, ..
+        } => {
+            extract_tables_from_table_with_joins(table_with_joins, names);
+        }
+        TableFactor::Pivot { table, .. }
+        | TableFactor::Unpivot { table, .. }
+        | TableFactor::MatchRecognize { table, .. } => {
+            table_factor_to_object_name(table, names);
+        }
+        TableFactor::TableFunction { .. }
+        | TableFactor::Function { .. }
+        | TableFactor::UNNEST { .. }
+        | TableFactor::JsonTable { .. }
+        | TableFactor::OpenJsonTable { .. }
+        | TableFactor::XmlTable { .. }
+        | TableFactor::SemanticView { .. } => {}
    }
 }

@@ -458,6 +525,91 @@ TQL EVAL (now() - '15s'::interval, now(), '5s') count_values("status_code", {__n
        }
    }

+    /// Tables referenced only inside derived tables (subqueries in `FROM` and `LEFT JOIN`)
+    /// must be reported for a `CREATE FLOW` query.
+    #[test]
+    fn test_extract_tables_from_sql_query_with_derived_join() {
+        let sql = r#"
+CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
+EVAL INTERVAL '1m' AS
+SELECT a.symbol, b.mark_price
+FROM (
+    SELECT inst_id AS symbol, max(ts) AS mark_iv_ts
+    FROM flow_batch_join_opt_summary
+    GROUP BY inst_id
+) a
+LEFT JOIN (
+    SELECT symbol, max(mark_price) AS mark_price
+    FROM flow_batch_join_market_v5
+    WHERE "type" = 'OPTION_MARK'
+    GROUP BY symbol
+) b ON a.symbol = b.symbol;
+"#;
+        let mut stmts =
+            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
+                .unwrap();
+        let Statement::CreateFlow(create_flow) = stmts.pop().unwrap() else {
+            unreachable!()
+        };
+
+        // Sort the names so the comparison does not depend on set iteration order.
+        let mut tables = extract_tables_from_query(&create_flow.query)
+            .map(|table| format_raw_object_name(&table))
+            .collect_vec();
+        tables.sort();
+        assert_eq!(
+            vec![
+                "flow_batch_join_market_v5".to_string(),
+                "flow_batch_join_opt_summary".to_string(),
+            ],
+            tables
+        );
+    }
+
+    /// CTE names are filtered according to their scope rather than globally.
+    #[test]
+    fn test_extract_tables_from_sql_query_with_cte_scopes() {
+        let testcases = vec![
+            // A non-recursive CTE that selects from a table sharing its own name: the inner
+            // reference is a real table and must be kept.
+            (
+                r#"
+WITH source AS (
+    SELECT * FROM source
+)
+SELECT * FROM source;
+"#,
+                vec!["source".to_string()],
+            ),
+            // A later CTE referring to an earlier one: only the physical table remains.
+            (
+                r#"
+WITH first_cte AS (
+    SELECT * FROM physical_source
+), second_cte AS (
+    SELECT * FROM first_cte
+)
+SELECT * FROM second_cte;
+"#,
+                vec!["physical_source".to_string()],
+            ),
+        ];
+
+        for (sql, expected_tables) in testcases {
+            let mut stmts = ParserContext::create_with_dialect(
+                sql,
+                &GreptimeDbDialect {},
+                ParseOptions::default(),
+            )
+            .unwrap();
+            let Statement::Query(query) = stmts.pop().unwrap() else {
+                unreachable!()
+            };
+
+            let mut tables = HashSet::new();
+            extract_tables_from_sql_query(&query.inner, &mut tables);
+            // Sort the names so the comparison does not depend on set iteration order.
+            let mut tables = tables
+                .into_iter()
+                .map(|table| format_raw_object_name(&table))
+                .collect_vec();
+            tables.sort();
+            assert_eq!(expected_tables, tables);
+        }
+    }
+
    #[test]
    fn test_extract_tables_from_tql_query_with_schema_matcher() {
        let sql = r#"
--- a/src/store-api/src/region_request.rs
+++ b/src/store-api/src/region_request.rs
@@ -315,6 +315,7 @@ fn make_region_open(open: OpenRequest) -> Result<Vec<(RegionId, RegionRequest)>>
            options: open.options,
            skip_wal_replay: false,
            checkpoint: None,
+            requirements: Default::default(),
        }),
    )])
 }
@@ -566,6 +567,28 @@ pub struct RegionDropRequest {
    pub partial_drop: bool,
 }

+/// Requirements for a region request.
+///
+/// The derived `Default` leaves every requirement unset, and `#[serde(default)]` makes
+/// fields that are missing from serialized input fall back to that default.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(default)]
+pub struct RegionRequirements {
+    /// Whether the region data must be backed by object storage.
+    pub object_storage: bool,
+}
+
+impl RegionRequirements {
+    /// Creates requirements with nothing demanded; equal to the `Default` value.
+    pub fn empty() -> Self {
+        Self {
+            object_storage: false,
+        }
+    }
+
+    /// Creates requirements demanding that the region data is backed by object storage.
+    pub fn object_storage() -> Self {
+        let mut requirements = Self::empty();
+        requirements.object_storage = true;
+        requirements
+    }
+}
+
 /// Open region request.
 #[derive(Debug, Clone)]
 pub struct RegionOpenRequest {
@@ -581,6 +604,8 @@ pub struct RegionOpenRequest {
    pub skip_wal_replay: bool,
    /// Replay checkpoint.
    pub checkpoint: Option<ReplayCheckpoint>,
+    /// Requirements for opening the region.
+    pub requirements: RegionRequirements,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
--- a/tests-integration/Cargo.toml
+++ b/tests-integration/Cargo.toml
@@ -63,7 +63,7 @@ log-query = { workspace = true }
 loki-proto.workspace = true
 meta-client.workspace = true
 meta-srv = { workspace = true, features = ["mock"] }
-mito2.workspace = true
+mito2 = { workspace = true, features = ["test-shared-fs-region-migration"] }
 object-store.workspace = true
 operator = { workspace = true, features = ["testing"] }
 plugins.workspace = true
--- a/tests/cases/standalone/common/copy/copy_from_fs_csv.result
+++ b/tests/cases/standalone/common/copy/copy_from_fs_csv.result
@@ -183,6 +183,24 @@ select * from csv_null_prefix_import;
 | final | 2023-11-14T22:13:23 |
 +-------+---------------------+

+CREATE TABLE csv_skip_bad_records(host_id int, host_name string, reading_value double, ts timestamp time index);
+
+Affected Rows: 0
+
+-- SQLNESS ENV PWD
+Copy csv_skip_bad_records FROM '$PWD/tests/data/csv/skip_bad_records.csv' WITH (format='csv', skip_bad_records='true');
+
+Affected Rows: 2
+
+select * from csv_skip_bad_records order by ts;
+
+---------+-----------+---------------+---------------------+
+| host_id | host_name | reading_value | ts                  |
+---------+-----------+---------------+---------------------+
+| 1       | Alice     | 10.5          | 2024-01-01T00:00:00 |
+| 2       | Bob       | 30.5          | 2024-01-01T00:00:02 |
+---------+-----------+---------------+---------------------+
+
 drop table demo;

 Affected Rows: 0
@@ -219,3 +237,7 @@ drop table csv_null_prefix_import;

 Affected Rows: 0

+drop table csv_skip_bad_records;
+
+Affected Rows: 0
+
--- a/tests/cases/standalone/common/copy/copy_from_fs_csv.sql
+++ b/tests/cases/standalone/common/copy/copy_from_fs_csv.sql
@@ -73,6 +73,13 @@ Copy csv_null_prefix_import FROM '${SQLNESS_HOME}/demo/export/csv_null_prefix.cs

 select * from csv_null_prefix_import;

+CREATE TABLE csv_skip_bad_records(host_id int, host_name string, reading_value double, ts timestamp time index);
+
+-- SQLNESS ENV PWD
+Copy csv_skip_bad_records FROM '$PWD/tests/data/csv/skip_bad_records.csv' WITH (format='csv', skip_bad_records='true');
+
+select * from csv_skip_bad_records order by ts;
+
 drop table demo;

 drop table with_filename;
@@ -90,3 +97,5 @@ drop table demo_with_less_columns;
 drop table csv_null_prefix;

 drop table csv_null_prefix_import;
+
+drop table csv_skip_bad_records;
--- a/tests/cases/standalone/common/flow/flow_batch_join_subquery.result
+++ b/tests/cases/standalone/common/flow/flow_batch_join_subquery.result
@@ -0,0 +1,130 @@
+CREATE DATABASE flow_join_fixture;
+
+Affected Rows: 1
+
+CREATE TABLE flow_join_fixture."left_samples" (
+    source_id STRING,
+    left_value DOUBLE,
+    event_ts TIMESTAMP,
+    observed_at TIMESTAMP TIME INDEX
+);
+
+Affected Rows: 0
+
+CREATE TABLE flow_join_fixture."right_samples" (
+    source_id STRING,
+    right_value DOUBLE,
+    sample_kind STRING,
+    event_ts TIMESTAMP,
+    observed_at TIMESTAMP TIME INDEX
+);
+
+Affected Rows: 0
+
+-- Verify batching flow creation accepts aggregate subqueries joined by LEFT JOIN.
+CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
+EVAL INTERVAL '5m' AS
+SELECT
+    l.source_id,
+    l.measure_name,
+    l.bucket_time,
+    l.left_event_ts,
+    l.left_value,
+    r.right_event_ts,
+    r.right_value
+FROM (
+    SELECT
+        source_id,
+        'sample' AS measure_name,
+        date_trunc('minute', now()) AS bucket_time,
+        max(event_ts) AS left_event_ts,
+        last_value(left_value ORDER BY observed_at) AS left_value
+    FROM
+        flow_join_fixture."left_samples"
+    WHERE
+        observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
+            AND date_trunc('minute', now())
+    GROUP BY
+        source_id
+) l
+LEFT JOIN (
+    SELECT
+        source_id,
+        'sample' AS measure_name,
+        date_trunc('minute', now()) AS bucket_time,
+        max(event_ts) AS right_event_ts,
+        last_value(right_value ORDER BY observed_at) AS right_value
+    FROM
+        flow_join_fixture."right_samples"
+    WHERE
+        observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
+            AND date_trunc('minute', now())
+        AND sample_kind = 'primary'
+    GROUP BY
+        source_id
+) r ON l.source_id = r.source_id AND l.bucket_time = r.bucket_time;
+
+Affected Rows: 0
+
+SELECT
+    source_table_names LIKE '%left_samples%' AS has_left_source,
+    source_table_names LIKE '%right_samples%' AS has_right_source,
+    options LIKE '%"flow_type":"batching"%' AS is_batching_flow
+FROM
+    INFORMATION_SCHEMA.FLOWS
+WHERE
+    flow_name = 'flow_batch_join_subquery';
+
+-----------------+------------------+------------------+
+| has_left_source | has_right_source | is_batching_flow |
+-----------------+------------------+------------------+
+| true            | true             | true             |
+-----------------+------------------+------------------+
+
+INSERT INTO flow_join_fixture."left_samples" VALUES
+    ('source-a', 0.12, date_trunc('minute', now()), date_trunc('minute', now()));
+
+Affected Rows: 1
+
+INSERT INTO flow_join_fixture."right_samples" VALUES
+    ('source-a', 100.5, 'primary', date_trunc('minute', now()), date_trunc('minute', now()));
+
+Affected Rows: 1
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_batch_join_subquery');
+
+----------------------------------------------+
+| ADMIN FLUSH_FLOW('flow_batch_join_subquery') |
+----------------------------------------------+
+|  FLOW_FLUSHED  |
+----------------------------------------------+
+
+SELECT source_id, measure_name, left_value, right_value FROM flow_batch_join_sink ORDER BY source_id;
+
+-----------+--------------+------------+-------------+
+| source_id | measure_name | left_value | right_value |
+-----------+--------------+------------+-------------+
+| source-a  | sample       | 0.12       | 100.5       |
+-----------+--------------+------------+-------------+
+
+DROP FLOW flow_batch_join_subquery;
+
+Affected Rows: 0
+
+DROP TABLE flow_batch_join_sink;
+
+Affected Rows: 0
+
+DROP TABLE flow_join_fixture."left_samples";
+
+Affected Rows: 0
+
+DROP TABLE flow_join_fixture."right_samples";
+
+Affected Rows: 0
+
+DROP DATABASE flow_join_fixture;
+
+Affected Rows: 0
+
--- a/tests/cases/standalone/common/flow/flow_batch_join_subquery.sql
+++ b/tests/cases/standalone/common/flow/flow_batch_join_subquery.sql
@@ -0,0 +1,85 @@
+CREATE DATABASE flow_join_fixture;
+
+CREATE TABLE flow_join_fixture."left_samples" (
+    source_id STRING,
+    left_value DOUBLE,
+    event_ts TIMESTAMP,
+    observed_at TIMESTAMP TIME INDEX
+);
+
+CREATE TABLE flow_join_fixture."right_samples" (
+    source_id STRING,
+    right_value DOUBLE,
+    sample_kind STRING,
+    event_ts TIMESTAMP,
+    observed_at TIMESTAMP TIME INDEX
+);
+
+-- Verify batching flow creation accepts aggregate subqueries joined by LEFT JOIN.
+CREATE FLOW flow_batch_join_subquery SINK TO flow_batch_join_sink
+EVAL INTERVAL '5m' AS
+SELECT
+    l.source_id,
+    l.measure_name,
+    l.bucket_time,
+    l.left_event_ts,
+    l.left_value,
+    r.right_event_ts,
+    r.right_value
+FROM (
+    SELECT
+        source_id,
+        'sample' AS measure_name,
+        date_trunc('minute', now()) AS bucket_time,
+        max(event_ts) AS left_event_ts,
+        last_value(left_value ORDER BY observed_at) AS left_value
+    FROM
+        flow_join_fixture."left_samples"
+    WHERE
+        observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
+            AND date_trunc('minute', now())
+    GROUP BY
+        source_id
+) l
+LEFT JOIN (
+    SELECT
+        source_id,
+        'sample' AS measure_name,
+        date_trunc('minute', now()) AS bucket_time,
+        max(event_ts) AS right_event_ts,
+        last_value(right_value ORDER BY observed_at) AS right_value
+    FROM
+        flow_join_fixture."right_samples"
+    WHERE
+        observed_at BETWEEN date_trunc('minute', now()) - INTERVAL '5 minutes'
+            AND date_trunc('minute', now())
+        AND sample_kind = 'primary'
+    GROUP BY
+        source_id
+) r ON l.source_id = r.source_id AND l.bucket_time = r.bucket_time;
+
+SELECT
+    source_table_names LIKE '%left_samples%' AS has_left_source,
+    source_table_names LIKE '%right_samples%' AS has_right_source,
+    options LIKE '%"flow_type":"batching"%' AS is_batching_flow
+FROM
+    INFORMATION_SCHEMA.FLOWS
+WHERE
+    flow_name = 'flow_batch_join_subquery';
+
+INSERT INTO flow_join_fixture."left_samples" VALUES
+    ('source-a', 0.12, date_trunc('minute', now()), date_trunc('minute', now()));
+
+INSERT INTO flow_join_fixture."right_samples" VALUES
+    ('source-a', 100.5, 'primary', date_trunc('minute', now()), date_trunc('minute', now()));
+
+-- SQLNESS REPLACE (ADMIN\sFLUSH_FLOW\('\w+'\)\s+\|\n\+-+\+\n\|\s+)[0-9]+\s+\| $1 FLOW_FLUSHED  |
+ADMIN FLUSH_FLOW('flow_batch_join_subquery');
+
+SELECT source_id, measure_name, left_value, right_value FROM flow_batch_join_sink ORDER BY source_id;
+
+DROP FLOW flow_batch_join_subquery;
+DROP TABLE flow_batch_join_sink;
+DROP TABLE flow_join_fixture."left_samples";
+DROP TABLE flow_join_fixture."right_samples";
+DROP DATABASE flow_join_fixture;
--- a/tests/cases/standalone/common/flow/flow_last_non_null.result
+++ b/tests/cases/standalone/common/flow/flow_last_non_null.result
@@ -162,6 +162,8 @@ CREATE TABLE approx_rate (

 Affected Rows: 0

+-- Without merge_mode=last_non_null, this partial output is rejected at CREATE FLOW time.
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
 CREATE FLOW find_approx_rate SINK TO approx_rate AS
 SELECT
    (max(byte) - min(byte)) / 30.0 as rate,
@@ -172,24 +174,7 @@ from
 GROUP BY
    time_window;

-Affected Rows: 0
-
-INSERT INTO
-    bytes_log
-VALUES
-    (NULL, '2023-01-01 00:00:01'),
-    (300, '2023-01-01 00:00:31');
-
-Affected Rows: 2
-
-- should return error
-ADMIN FLUSH_FLOW('find_approx_rate');
-
-Error: 1002(Unexpected), Failed to execute admin function flush_flow: Execution error: Internal error: 1003
-
-DROP FLOW find_approx_rate;
-
-Affected Rows: 0
+Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 3 flow output columns and 4 sink table columns. flow output columns: [\"rate\", \"time_window\", \"update_at\"], sink table columns: [\"rate\", \"time_window\", \"update_at\", \"bb\"], extra flow columns not in sink: [], missing sink columns from flow output: [\"bb\"]") in context: Failed to rewrite plan

 DROP TABLE bytes_log;

--- a/tests/cases/standalone/common/flow/flow_last_non_null.sql
+++ b/tests/cases/standalone/common/flow/flow_last_non_null.sql
@@ -84,6 +84,8 @@ CREATE TABLE approx_rate (
    TIME INDEX(time_window)
 );

+-- Without merge_mode=last_non_null, this partial output is rejected at CREATE FLOW time.
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
 CREATE FLOW find_approx_rate SINK TO approx_rate AS
 SELECT
    (max(byte) - min(byte)) / 30.0 as rate,
@@ -93,16 +95,5 @@ from
    bytes_log
 GROUP BY
    time_window;
-
-INSERT INTO
-    bytes_log
-VALUES
-    (NULL, '2023-01-01 00:00:01'),
-    (300, '2023-01-01 00:00:31');
-
-- should return error
-ADMIN FLUSH_FLOW('find_approx_rate');
-
-DROP FLOW find_approx_rate;
 DROP TABLE bytes_log;
 DROP TABLE approx_rate;
--- a/tests/cases/standalone/common/flow/flow_sink_schema_mismatch.result
+++ b/tests/cases/standalone/common/flow/flow_sink_schema_mismatch.result
@@ -0,0 +1,123 @@
+-- Verify that batching flow rejects CREATE FLOW when the pre-existing sink
+-- table schema does not match the flow output (create-time validation, not runtime).
+CREATE TABLE source_mm (
+    "number" INT,
+    extra STRING,
+    ts TIMESTAMP TIME INDEX
+);
+
+Affected Rows: 0
+
+-- Pre-create a sink table that is intentionally missing the "extra" column.
+-- This case validates batching mode at CREATE FLOW time, before any INSERT/FLUSH.
+CREATE TABLE sink_mm (
+    "number" INT,
+    time_window TIMESTAMP TIME INDEX,
+    cnt BIGINT
+);
+
+Affected Rows: 0
+
+-- This CREATE FLOW should fail immediately: the flow outputs (number, extra, time_window, cnt)
+-- but sink_mm has only (number, time_window, cnt).
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW mismatch_flow SINK TO sink_mm AS
+SELECT
+    "number",
+    extra,
+    date_bin(INTERVAL '1 second', ts) as time_window,
+    count(*) as cnt
+FROM
+    source_mm
+GROUP BY
+    "number", extra, time_window;
+
+Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 4 flow output columns and 3 sink table columns. flow output columns: [\"number\", \"extra\", \"time_window\", \"cnt\"], sink table columns: [\"number\", \"time_window\", \"cnt\"], extra flow columns not in sink: [\"extra\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
+
+DROP TABLE source_mm;
+
+Affected Rows: 0
+
+DROP TABLE sink_mm;
+
+Affected Rows: 0
+
+-- TQL/PromQL flows use the same create-time sink schema validation path.
+CREATE TABLE tql_source_mm (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    loc STRING,
+    PRIMARY KEY (sensor, loc)
+);
+
+Affected Rows: 0
+
+-- Pre-create a TQL sink table that is intentionally missing the "sensor" tag column.
+CREATE TABLE tql_sink_mm (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX
+);
+
+Affected Rows: 0
+
+-- This CREATE FLOW should fail immediately: the TQL output has (value, sensor, ts),
+-- but tql_sink_mm has only (value, ts).
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW tql_mismatch_flow
+SINK TO tql_sink_mm
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (tql_source_mm) AS value;
+
+Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 3 flow output columns and 2 sink table columns. flow output columns: [\"value\", \"sensor\", \"ts\"], sink table columns: [\"value\", \"ts\"], extra flow columns not in sink: [\"sensor\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
+
+DROP TABLE tql_source_mm;
+
+Affected Rows: 0
+
+DROP TABLE tql_sink_mm;
+
+Affected Rows: 0
+
+-- Real merge_mode=last_non_null sink options should enable partial schema validation.
+CREATE TABLE lnn_source_mm (
+    device STRING,
+    val DOUBLE,
+    ts TIMESTAMP TIME INDEX
+);
+
+Affected Rows: 0
+
+CREATE TABLE lnn_sink_mm (
+    device STRING,
+    time_window TIMESTAMP TIME INDEX,
+    cnt BIGINT,
+    PRIMARY KEY (device)
+) WITH('merge_mode'='last_non_null');
+
+Affected Rows: 0
+
+-- This CREATE FLOW should fail through the last_non_null partial validator: the
+-- sink primary key "device" is required but absent from the flow output.
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW lnn_missing_pk_flow
+SINK TO lnn_sink_mm AS
+SELECT
+    date_bin(INTERVAL '1 second', ts) as time_window,
+    count(*) as cnt
+FROM
+    lnn_source_mm
+GROUP BY
+    time_window;
+
+Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Column(s) [\"device\"] required by sink table are missing from flow output when merge_mode=last_non_null. Flow output schema does not match sink table schema: found 2 flow output columns and 3 sink table columns. flow output columns: [\"time_window\", \"cnt\"], sink table columns: [\"device\", \"time_window\", \"cnt\"], extra flow columns not in sink: [], missing sink columns from flow output: [\"device\"]") in context: Failed to rewrite plan
+
+DROP TABLE lnn_source_mm;
+
+Affected Rows: 0
+
+DROP TABLE lnn_sink_mm;
+
+Affected Rows: 0
+
--- a/tests/cases/standalone/common/flow/flow_sink_schema_mismatch.sql
+++ b/tests/cases/standalone/common/flow/flow_sink_schema_mismatch.sql
@@ -0,0 +1,89 @@
+-- Verify that batching flow rejects CREATE FLOW when the pre-existing sink
+-- table schema does not match the flow output (create-time validation, not runtime).
+CREATE TABLE source_mm (
+    "number" INT,
+    extra STRING,
+    ts TIMESTAMP TIME INDEX
+);
+
+-- Pre-create a sink table that is intentionally missing the "extra" column.
+-- This case validates batching mode at CREATE FLOW time, before any INSERT/FLUSH.
+CREATE TABLE sink_mm (
+    "number" INT,
+    time_window TIMESTAMP TIME INDEX,
+    cnt BIGINT
+);
+
+-- This CREATE FLOW should fail immediately: the flow outputs (number, extra, time_window, cnt)
+-- but sink_mm has only (number, time_window, cnt).
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW mismatch_flow SINK TO sink_mm AS
+SELECT
+    "number",
+    extra,
+    date_bin(INTERVAL '1 second', ts) as time_window,
+    count(*) as cnt
+FROM
+    source_mm
+GROUP BY
+    "number", extra, time_window;
+
+DROP TABLE source_mm;
+DROP TABLE sink_mm;
+
+-- TQL/PromQL flows use the same create-time sink schema validation path.
+CREATE TABLE tql_source_mm (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX,
+    sensor STRING,
+    loc STRING,
+    PRIMARY KEY (sensor, loc)
+);
+
+-- Pre-create a TQL sink table that is intentionally missing the "sensor" tag column.
+CREATE TABLE tql_sink_mm (
+    `value` DOUBLE,
+    ts TIMESTAMP TIME INDEX
+);
+
+-- This CREATE FLOW should fail immediately: the TQL output has (value, sensor, ts),
+-- but tql_sink_mm has only (value, ts).
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW tql_mismatch_flow
+SINK TO tql_sink_mm
+EVAL INTERVAL '1m' AS
+TQL EVAL (now() - '1m'::interval, now(), '1m')
+avg by(sensor) (tql_source_mm) AS value;
+
+DROP TABLE tql_source_mm;
+DROP TABLE tql_sink_mm;
+
+-- Real merge_mode=last_non_null sink options should enable partial schema validation.
+CREATE TABLE lnn_source_mm (
+    device STRING,
+    val DOUBLE,
+    ts TIMESTAMP TIME INDEX
+);
+
+CREATE TABLE lnn_sink_mm (
+    device STRING,
+    time_window TIMESTAMP TIME INDEX,
+    cnt BIGINT,
+    PRIMARY KEY (device)
+) WITH('merge_mode'='last_non_null');
+
+-- This CREATE FLOW should fail through the last_non_null partial validator: the
+-- sink primary key "device" is required but absent from the flow output.
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW lnn_missing_pk_flow
+SINK TO lnn_sink_mm AS
+SELECT
+    date_bin(INTERVAL '1 second', ts) as time_window,
+    count(*) as cnt
+FROM
+    lnn_source_mm
+GROUP BY
+    time_window;
+
+DROP TABLE lnn_source_mm;
+DROP TABLE lnn_sink_mm;
--- a/tests/cases/standalone/flow-tql/flow_tql_missing_value_sink_schema.result
+++ b/tests/cases/standalone/flow-tql/flow_tql_missing_value_sink_schema.result
@@ -0,0 +1,90 @@
+-- Regression for a TQL flow whose pre-created sink table is missing the value
+-- output column. The labels are intentionally minimal and anonymous.
+CREATE DATABASE source_schema;
+
+Affected Rows: 1
+
+CREATE DATABASE sink_schema;
+
+Affected Rows: 1
+
+USE source_schema;
+
+Affected Rows: 0
+
+CREATE TABLE metric_input (
+  namespace STRING NULL,
+  app STRING NULL,
+  greptime_timestamp TIMESTAMP(3) NOT NULL,
+  greptime_value DOUBLE NULL,
+  TIME INDEX (greptime_timestamp),
+  PRIMARY KEY (namespace, app)
+);
+
+Affected Rows: 0
+
+INSERT INTO metric_input VALUES
+  ('ns', 'app-a', '2026-01-23T03:40:00Z', 10.0),
+  ('ns', 'app-a', '2026-01-23T03:50:00Z', 20.0);
+
+Affected Rows: 2
+
+USE sink_schema;
+
+Affected Rows: 0
+
+-- Intentionally omit greptime_value DOUBLE from the pre-created sink table.
+CREATE TABLE missing_value_sink (
+  namespace STRING NULL,
+  app STRING NULL,
+  greptime_timestamp TIMESTAMP(3) NOT NULL,
+  TIME INDEX (greptime_timestamp),
+  PRIMARY KEY (namespace, app)
+)
+ENGINE=mito;
+
+Affected Rows: 0
+
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW missing_value_flow
+SINK TO sink_schema.missing_value_sink
+EVAL INTERVAL '3600 s'
+AS TQL EVAL (
+  date_bin('2m'::interval, now() - '2m'::interval),
+  date_bin('2m'::interval, now() - '2m'::interval),
+  '1h'
+)
+  avg by (namespace, app) (
+    avg_over_time(metric_input{__schema__="source_schema"}[1h])
+  );
+
+Error: 3001(EngineExecuteQuery), Datafusion error: Plan("Flow output schema does not match sink table schema: found 4 flow output columns and 3 sink table columns. flow output columns: [\"namespace\", \"app\", \"greptime_timestamp\", \"avg(prom_avg_over_time(greptime_timestamp_range,greptime_value))\"], sink table columns: [\"namespace\", \"app\", \"greptime_timestamp\"], extra flow columns not in sink: [\"avg(prom_avg_over_time(greptime_timestamp_range,greptime_value))\"], missing sink columns from flow output: []") in context: Failed to rewrite plan
+
+DROP FLOW IF EXISTS missing_value_flow;
+
+Affected Rows: 0
+
+DROP TABLE missing_value_sink;
+
+Affected Rows: 0
+
+USE source_schema;
+
+Affected Rows: 0
+
+DROP TABLE metric_input;
+
+Affected Rows: 0
+
+USE public;
+
+Affected Rows: 0
+
+DROP DATABASE sink_schema;
+
+Affected Rows: 0
+
+DROP DATABASE source_schema;
+
+Affected Rows: 0
+
--- a/tests/cases/standalone/flow-tql/flow_tql_missing_value_sink_schema.sql
+++ b/tests/cases/standalone/flow-tql/flow_tql_missing_value_sink_schema.sql
@@ -0,0 +1,55 @@
+-- Regression for a TQL flow whose pre-created sink table is missing the value
+-- output column. The labels are intentionally minimal and anonymous.
+
+CREATE DATABASE source_schema;
+CREATE DATABASE sink_schema;
+
+USE source_schema;
+
+CREATE TABLE metric_input (
+  namespace STRING NULL,
+  app STRING NULL,
+  greptime_timestamp TIMESTAMP(3) NOT NULL,
+  greptime_value DOUBLE NULL,
+  TIME INDEX (greptime_timestamp),
+  PRIMARY KEY (namespace, app)
+);
+
+INSERT INTO metric_input VALUES
+  ('ns', 'app-a', '2026-01-23T03:40:00Z', 10.0),
+  ('ns', 'app-a', '2026-01-23T03:50:00Z', 20.0);
+
+USE sink_schema;
+
+-- Intentionally omit greptime_value DOUBLE from the pre-created sink table.
+CREATE TABLE missing_value_sink (
+  namespace STRING NULL,
+  app STRING NULL,
+  greptime_timestamp TIMESTAMP(3) NOT NULL,
+  TIME INDEX (greptime_timestamp),
+  PRIMARY KEY (namespace, app)
+)
+ENGINE=mito;
+
+-- SQLNESS REPLACE (in\scontext:\sFailed\sto\srewrite\splan:\sError\sduring\splanning:.*) in context: Failed to rewrite plan
+CREATE FLOW missing_value_flow
+SINK TO sink_schema.missing_value_sink
+EVAL INTERVAL '3600 s'
+AS TQL EVAL (
+  date_bin('2m'::interval, now() - '2m'::interval),
+  date_bin('2m'::interval, now() - '2m'::interval),
+  '1h'
+)
+  avg by (namespace, app) (
+    avg_over_time(metric_input{__schema__="source_schema"}[1h])
+  );
+
+DROP FLOW IF EXISTS missing_value_flow;
+DROP TABLE missing_value_sink;
+
+USE source_schema;
+DROP TABLE metric_input;
+
+USE public;
+DROP DATABASE sink_schema;
+DROP DATABASE source_schema;
--- a/tests/data/csv/skip_bad_records.csv
+++ b/tests/data/csv/skip_bad_records.csv
@@ -0,0 +1,4 @@
+host_id,host_name,reading_value,ts
+1,Alice,10.5,2024-01-01T00:00:00
+bad,Bad,20.0,2024-01-01T00:00:01
+2,Bob,30.5,2024-01-01T00:00:02