Merge branch 'main' into feat/flush-hook-extension-point

2026-06-03 05:40:40 +00:00 · 2026-06-01 22:12:09 -07:00
parent 655f3a959c c1b0418377
commit 848886a7f9
85 changed files with 2690 additions and 1760 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2278,6 +2278,7 @@ dependencies = [
 "futures",
 "lazy_static",
 "object-store",
+ "object_store_opendal",
 "orc-rust",
 "parquet",
 "paste",
@@ -5102,6 +5103,7 @@ dependencies = [
 "datatypes",
 "futures",
 "object-store",
+ "object_store_opendal",
 "serde",
 "serde_json",
 "snafu 0.8.6",
@@ -8320,6 +8322,7 @@ dependencies = [
 "datafusion-common",
 "datafusion-expr",
 "datatypes",
+ "derive_more",
 "dotenv",
 "either",
 "futures",
@@ -9074,8 +9077,9 @@ dependencies = [

 [[package]]
 name = "object_store_opendal"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eb12a624a41fce745838d0ef3701ff6c47797c13cd18ad3612fd2a3134fdbd8"
 dependencies = [
 "async-trait",
 "bytes",
@@ -9162,8 +9166,9 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"

 [[package]]
 name = "opendal"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96c9c85ce253ff87225e7669979d877a20c98a06604ec9d6dd5f4473e08f1ae1"
 dependencies = [
 "ctor",
 "opendal-core",
@@ -9183,8 +9188,9 @@ dependencies = [

 [[package]]
 name = "opendal-core"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4f8607c90e2c963a91467f50fb49fbc7fb3d573f88cea219ca59ccd3740b309"
 dependencies = [
 "anyhow",
 "base64 0.22.1",
@@ -9210,8 +9216,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-concurrent-limit"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f"
 dependencies = [
 "futures",
 "http 1.3.1",
@@ -9221,8 +9228,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-logging"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "58ada45c6d81d1aa4c9305d0c7d4bc317c59c85866a0908a2d75a7a978aa5ee2"
 dependencies = [
 "log",
 "opendal-core",
@@ -9230,8 +9238,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-observe-metrics-common"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "628b0228fdbd13c3d9d50eee4341f2eb82ca5b44991e4c68f07c84cc823e2d12"
 dependencies = [
 "futures",
 "http 1.3.1",
@@ -9240,8 +9249,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-prometheus"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0487bdb1357097ec8654781bad03ef310282517738e2864ebde69e27aaafc5ec"
 dependencies = [
 "opendal-core",
 "opendal-layer-observe-metrics-common",
@@ -9250,8 +9260,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-retry"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b2a25a718afb81fad81cb9a0580a1cb989221fa2317f888c6a37f8dad408eb7"
 dependencies = [
 "backon",
 "log",
@@ -9260,8 +9271,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-timeout"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e91f731724c213af81e9d03517859c8fc47b4578e64ad61ae4f099f10fe36e3"
 dependencies = [
 "opendal-core",
 "tokio",
@@ -9269,8 +9281,9 @@ dependencies = [

 [[package]]
 name = "opendal-layer-tracing"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90c6fc9df6da1f0dafbdf55fa48525f1643aefbe7da8f46936e869e2a5b8a34f"
 dependencies = [
 "futures",
 "http 1.3.1",
@@ -9280,8 +9293,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-azblob"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a"
 dependencies = [
 "base64 0.22.1",
 "bytes",
@@ -9300,8 +9314,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-azure-common"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051"
 dependencies = [
 "http 1.3.1",
 "opendal-core",
@@ -9309,8 +9324,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-fs"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e89a665fef0e6bd249cf5ea47fc174b7ba892159bee4b9382528b1ca873a2c"
 dependencies = [
 "bytes",
 "log",
@@ -9322,8 +9338,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-gcs"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47"
 dependencies = [
 "async-trait",
 "bytes",
@@ -9342,8 +9359,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-http"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb6af628a0bf14075b957179444927e1df40dc7addef382b585a05ef015a077b"
 dependencies = [
 "http 1.3.1",
 "log",
@@ -9353,8 +9371,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-oss"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace"
 dependencies = [
 "bytes",
 "http 1.3.1",
@@ -9369,8 +9388,9 @@ dependencies = [

 [[package]]
 name = "opendal-service-s3"
-version = "0.56.0"
-source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "313d46c9f5ae70bca26b7c3e3fbb9b639292625f28af73aa016f47e788af9deb"
 dependencies = [
 "base64 0.22.1",
 "bytes",
@@ -14102,9 +14122,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"

 [[package]]
 name = "tar"
-version = "0.4.45"
+version = "0.4.46"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
+checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840"
 dependencies = [
 "filetime",
 "libc",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -178,7 +178,7 @@ nalgebra = "0.33"
 nix = { version = "0.30.1", default-features = false, features = ["event", "fs", "process"] }
 notify = "8.0"
 num_cpus = "1.16"
-object_store_opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7" }
+object_store_opendal = "0.57"
 once_cell = "1.18"
 opentelemetry-proto = { version = "0.31", features = [
    "gen-tonic",
--- a/config/config.md
+++ b/config/config.md
@@ -14,6 +14,7 @@
 | --- | -----| ------- | ----------- |
 | `default_timezone` | String | Unset | The default timezone of the server. |
 | `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
+| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.<br/>When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. |
 | `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
 | `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
 | `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
@@ -230,6 +231,7 @@
 | --- | -----| ------- | ----------- |
 | `default_timezone` | String | Unset | The default timezone of the server. |
 | `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
+| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.<br/>When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. |
 | `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
 | `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
 | `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
@@ -628,6 +630,7 @@
 | `flow.batching_mode.experimental_frontend_scan_timeout` | String | `30s` | Flow wait for available frontend timeout,<br/>if failed to find available frontend after frontend_scan_timeout elapsed, return error<br/>which prevent flownode from starting |
 | `flow.batching_mode.experimental_max_filter_num_per_query` | Integer | `20` | Maximum number of filters allowed in a single query |
 | `flow.batching_mode.experimental_time_window_merge_threshold` | Integer | `3` | Time window merge distance |
+| `flow.batching_mode.experimental_enable_incremental_read` | Bool | `false` | Whether to enable experimental flow incremental source reads.<br/>When disabled, batching flows always execute full-snapshot queries.<br/>Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true'). |
 | `flow.batching_mode.read_preference` | String | `Leader` | Read preference of the Frontend client. |
 | `flow.batching_mode.frontend_tls` | -- | -- | -- |
 | `flow.batching_mode.frontend_tls.enabled` | Bool | `false` | Whether to enable TLS for client. |
--- a/config/flownode.example.toml
+++ b/config/flownode.example.toml
@@ -31,6 +31,10 @@ node_id = 14
 #+experimental_max_filter_num_per_query=20
 ## Time window merge distance
 #+experimental_time_window_merge_threshold=3
+## Whether to enable experimental flow incremental source reads.
+## When disabled, batching flows always execute full-snapshot queries.
+## Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true').
+#+experimental_enable_incremental_read=false
 ## Read preference of the Frontend client.
 #+read_preference="Leader"
 [flow.batching_mode.frontend_tls]
--- a/config/frontend.example.toml
+++ b/config/frontend.example.toml
@@ -6,6 +6,10 @@ default_timezone = "UTC"
 ## @toml2docs:none-default
 default_column_prefix = "greptime"

+## Server-side global switch for auto table creation on write.
+## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`.
+#+ auto_create_table = true
+
 ## The user provider for authentication.
 ## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
 ## @toml2docs:none-default
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -6,6 +6,10 @@ default_timezone = "UTC"
 ## @toml2docs:none-default
 default_column_prefix = "greptime"

+## Server-side global switch for auto table creation on write.
+## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`.
+#+ auto_create_table = true
+
 ## The user provider for authentication.
 ## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
 ## @toml2docs:none-default
--- a/docs/rfcs/2026-05-28-table-semantic-layer.md
+++ b/docs/rfcs/2026-05-28-table-semantic-layer.md
@@ -0,0 +1,157 @@
+---
+Feature Name: Table Semantic Layer
+Tracking Issue: TBD
+Date: 2026-05-28
+Author: "Dennis Zhuang <killme2008@gmail.com>"
+---
+
+# Summary
+
+Attach a thin layer of semantic metadata to each table so machine consumers — LLM agents, alert generators, dashboard builders, MCP servers, ETL pipelines — can align it with the observability concepts they already know (OTel instrument kinds, Prometheus naming conventions, UCUM units, semantic conventions, severity numbers, OTel ↔ Prometheus translation rules).
+
+The mechanism reuses what already exists in `table_options` (the same slot that today carries `table_data_model` and `otlp_metric_compat`): a reserved `greptime.semantic.*` namespace, plus standard SQL column `COMMENT` for field-level supplements, plus an `information_schema.semantic_tables` view as the discovery entry point. No new protocol, no new DDL keyword.
+
+Per-table identity only. Cross-table relationships are deferred.
+
+# Motivation
+
+GreptimeDB already ingests OTLP metrics / traces / logs and Prometheus remote write. Each protocol carries rich metadata on the wire (instrument kind, temporality, unit, scope, resource, semantic-conventions version), and most of it is dropped when rows land in a table:
+
+- An `opentelemetry_traces` table looks like any wide table; signal type, source, and field provenance must be guessed from naming.
+- The OTel-to-Prometheus translation in v0.16+ actively drops scope attributes and most resource attributes; the table never records *what was dropped*.
+- Prometheus remote write v1 metadata is unreliable by protocol, but downstream tables do not flag whether `counter` typing was *declared* or *inferred* from the `_total` suffix.
+- Mixed-temporality data (OTel delta + Prometheus cumulative in the same table) is unrecoverable from schema alone.
+
+The audience is broader than LLM agents. Alert generators need to choose between `rate()` and absolute thresholds, and need units to pick sensible bounds. Dashboard builders pick visualisations by signal type. MCP servers surface a structured tool catalog instead of free-text descriptions. ETL pipelines need lineage to know whether a `service_name` column is `resource.service.name` or a free-form label. All of them currently guess from column names. The metadata needed to remove the guesswork already exists at ingest time; we just do not preserve it.
+
+# Goals
+
+1. Tag every ingested table with a stable identity using existing SQL surfaces — no new protocol, no new DDL keyword.
+2. Record the lossy transformations the ingestion path performs (dropped attributes, scope handling, type inference vs. declaration).
+3. Expose one `information_schema` view as the consumer-facing discovery entry point.
+4. Keep the layer optional and additive — tables without these options keep working unchanged.
+
+# Non-Goals
+
+- Cross-table relationship modelling. Deferred to a follow-up RFC.
+- Bespoke storage. Reuse `table_options` and column `COMMENT`.
+- Semantic enforcement at query time. The layer is descriptive, not coercive.
+- New wire protocol. Upstream standardisation is mentioned only as a future direction.
+
+# Proposal
+
+## Three mechanisms
+
+1. **`greptime.semantic.*` table options** — table-level identity and lineage. Carried inside the existing `table_options` blob. This is the same slot that today carries `table_data_model = 'greptime_trace_v1'` and `otlp_metric_compat = 'prom'`, so the mechanism is generalising what the OTLP trace auto-create path already does.
+2. **Column `COMMENT`** — column-level supplements ("this column is `resource.service.name`"; "this column carries delta values"). Standard SQL.
+3. **`information_schema.semantic_tables` view** — a denormalised projection of the options, registered through the existing `with_extra_table_factories()` hook. Tables without a `greptime.semantic.*` option do not appear in the view.
+
+## Vocabulary
+
+All keys are flat strings under the `greptime.semantic.` prefix; values are strings; unknown keys are tolerated so the vocabulary can grow without coordinated rollouts.
+
+**Common (all signals)**
+
+| Key | Example |
+| --- | --- |
+| `greptime.semantic.signal_type` | `trace` / `log` / `metric` / `event` |
+| `greptime.semantic.source` | `opentelemetry` / `prometheus` / `elasticsearch` / `loki` / `custom` |
+| `greptime.semantic.source_version` | protocol or SDK version, e.g. `v2` (Prom remote write), `1.30.0` (optional) |
+| `greptime.semantic.pipeline` | `greptime_trace_v1` (subsumes the existing `table_data_model` value) |
+
+**Trace**: `greptime.semantic.trace.conventions` (the version of the OpenTelemetry semantic conventions used in this table, lifted from `schema_url`, e.g. `otel-semconv-1.27`), `greptime.semantic.trace.has_events`, `greptime.semantic.trace.has_links`.
+
+**Metric** — v1 assumes one metric type per table, which is how both Prom RW and the post-v0.16 OTel ingestion path land data today; mixed-type tables are a follow-up.
+
+| Key | Example |
+| --- | --- |
+| `greptime.semantic.metric.type` | `counter` / `gauge` / `histogram` / `summary` / `updown_counter` / `gauge_histogram` / `info` / `stateset` |
+| `greptime.semantic.metric.unit` | UCUM, e.g. `s`, `By`, `{request}` |
+| `greptime.semantic.metric.temporality` | `cumulative` / `delta` (OTel only) |
+| `greptime.semantic.metric.monotonic` | `true` / `false` |
+| `greptime.semantic.metric.metadata_quality` | `declared` (OTLP / Prom RW v2 / exposition) or `inferred` (Prom RW v1, name-suffix guess) |
+| `greptime.semantic.metric.original_name` | Pre-translation OTel name when the table name was Prometheus-ised |
+
+`metadata_quality = inferred` is the load-bearing field for confidence-aware tooling: an inferred counter should be re-checked before betting on `rate()`-style semantics.
+
+**Log**: `greptime.semantic.log.severity_scheme` (`otlp` / `syslog` / `custom`), `greptime.semantic.log.body_format` (`string` / `json` / `mixed`).
+
+**Resource / scope preservation**: `greptime.semantic.resource.attributes_preserved` (JSON array string of attrs promoted to columns), `greptime.semantic.resource.attributes_dropped` (`true` / `false`), `greptime.semantic.scope.preserved` (`true` / `false`). These answer the most common downstream question: "is this data missing because it was dropped, or because it lives on a different column than I think?" List-shaped values use JSON array strings rather than comma-separated text to avoid escaping and ordering ambiguity.
+
+## Conflict and update semantics
+
+Two design decisions worth pinning down up front, because they constrain everything else:
+
+- **Conflict.** Some table-level keys (`trace.conventions` lifted from `schema_url`, `metric.temporality`, ...) cannot represent the truth when a long-lived table sees rows from multiple sources. v1 records `mixed` or `unknown` rather than a fictitious single value. Downstream consumers must treat any single-valued semantic key as best-effort, not strong evidence.
+- **Update.** Semantic options are stamped at table creation. v1 does not specify an update path; promoting `metadata_quality` from `inferred` to `declared`, refreshing `resource.attributes_preserved`, or revising `trace.conventions` on later writes is deferred. If real usage shows update is needed, it lands as a separate RFC.
+
+## `information_schema.semantic_tables`
+
+A consumer's first SQL on connect:
+
+```sql
+SELECT table_catalog, table_schema, table_name, signal_type, source, pipeline
+FROM information_schema.semantic_tables;
+```
+
+returns one row per semantic-tagged table. The view exposes a stable set of core columns (`table_catalog`, `table_schema`, `table_name`, `signal_type`, `source`, `source_version`, `pipeline`) plus a `semantic_options` JSON column carrying the rest of the `greptime.semantic.*` keys verbatim. Future keys appear inside `semantic_options` without forcing a view-schema change; only widely-used keys are ever promoted to first-class columns.
+
+# Implementation Plan
+
+Four phases, each independently shippable.
+
+1. **Identity.** Stamp `signal_type` and `source` on every auto-create path. The OTLP paths already have natural injection points; Prom remote write is the one non-trivial path because metric-engine logical tables share physical storage (see Open Question 2).
+2. **Metric specifics.** Add type / unit / temporality / monotonic / metadata_quality / original_name at OTel metric and Prom RW ingestion sites; the data is already at hand inside the OTel translator.
+3. **Resource / scope lineage.** Record what the OTel-to-Prometheus translation kept and dropped.
+4. **`information_schema.semantic_tables` view + documentation** as a stable user-facing contract.
+
+# Relationship to OpenTelemetry standardisation
+
+OTel today standardises what producers emit and how data collectors are managed; the read side — what a backend exposes back to clients — is deliberately vendor turf. OTLP is one-way; OpAMP is agent management; OTEP-0243 (App Telemetry Schema) is producer-side; `schema_url` is producer-stated with no reverse. Adjacent precedents — Prometheus `/api/v1/metadata`, Loki labels API, Tempo tags, Jaeger services, ad-hoc MCP servers — are all vendor-specific.
+
+This is a real gap. The shape we propose locally (signal-agnostic, `schema_url`-aware, structured around a small vocabulary) is deliberately close to what a future upstream OTEP for a backend-catalog read API could look like, with Weaver's *Resolved Telemetry Schema* as the natural data model. We do not commit to driving such an OTEP here; we do commit to keeping the local shape close enough that a future upstream proposal does not force a breaking migration.
+
+# Alternatives
+
+- **New DDL syntax (`SEMANTIC trace WITH (...)`).** Cleaner-looking but non-standard and forces every client to learn it. The metadata is not interesting enough to justify a new keyword.
+- **Dedicated `_semantic` system table.** Doubles the storage path for what is static per-table KV and adds lifecycle questions (drop, backfill). A view over `table_options` covers the same access pattern.
+- **Column comments only.** Discovery (`WHERE signal_type = 'trace'`) becomes a full-text problem. Comments are good for column-level supplements, not for identity.
+- **Encode everything into the table name.** What we do today. Every new field becomes a new naming convention.
+
+# Open Questions
+
+1. **Namespace prefix.** `greptime.semantic.*` vs. bare `semantic.*`. v1 picks the vendored prefix; alias or migrate if a community standard later emerges.
+2. **Prom RW injection point.** Metric-engine logical tables share physical storage, so per-logical-table options need a hook that does not exist as cleanly as the OTLP trace branch. A short spike is needed before Phase 1 lands for Prom RW.
+3. **Mixed-type metric tables.** When ingestion modes that pack multiple metric types into one table appear, `metric.type` migrates from table-level to row-level. v1 leaves a `metric.type = 'mixed'` marker and punts.
+4. **Stability surface.** Top-level keys (`signal_type`, `source`) are stable; sub-namespaces (`metric.*`, ...) are evolving until v1.0 of the layer is declared.
+
+# Future Work
+
+- **Cross-table relationships.** Paired trace/services tables, metric/info pairing, JOIN hints. Its own RFC.
+- **Producer SDK/client identity.** An optional `greptime.semantic.source.sdk` key recording the emitting client (e.g. `opentelemetry-go`, `opentelemetry-java`, `opentelemetry-collector`). Because a single table can receive data from multiple SDKs (a shared trace table is the common case), mixed producers collapse to `mixed`, following the same conflict rule as the table-level keys above.
+- **Backfill** for tables created before this feature shipped.
+- **Upstream proposal.** Carry the shape into a community proposal — likely an OTEP for an OTLP-Catalog read API plus an MCP binding — informed by Greptime's local usage data.
+
+# References
+
+OpenTelemetry:
+- [OTLP specification](https://opentelemetry.io/docs/specs/otlp/)
+- [OTel Schemas (`schema_url`)](https://opentelemetry.io/docs/specs/otel/schemas/)
+- [Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/)
+- [OTEP-0243: App Telemetry Schema](https://github.com/open-telemetry/oteps/blob/main/text/0243-app-telemetry-schema-vision-roadmap.md)
+- [OpAMP specification](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md)
+- [Weaver: Resolved Telemetry Schema](https://github.com/open-telemetry/weaver)
+- [2025 Stability Proposal](https://opentelemetry.io/blog/2025/stability-proposal-announcement/)
+
+Prometheus / OpenMetrics:
+- [Prometheus Remote Write 1.0](https://prometheus.io/docs/specs/prw/remote_write_spec/)
+- [Prometheus Remote Write 2.0](https://prometheus.io/docs/specs/prw/remote_write_spec_2_0/)
+- [Prometheus exposition formats](https://prometheus.io/docs/instrumenting/exposition_formats/)
+- [Prometheus HTTP API: `/api/v1/metadata`](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata)
+
+Units and conventions:
+- [UCUM — Unified Code for Units of Measure](https://ucum.org/)
+
+GreptimeDB:
+- [OTLP ingestion guide](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/)
+- [Trace data model](https://docs.greptime.com/user-guide/traces/data-model/)
--- a/src/catalog/src/kvbackend/table_cache.rs
+++ b/src/catalog/src/kvbackend/table_cache.rs
@@ -14,7 +14,9 @@

 use std::sync::Arc;

-use common_meta::cache::{CacheContainer, Initializer, TableInfoCacheRef, TableNameCacheRef};
+use common_meta::cache::{
+    CacheContainer, InitStrategy, Initializer, TableInfoCacheRef, TableNameCacheRef,
+};
 use common_meta::error::{Result as MetaResult, ValueNotExistSnafu};
 use common_meta::instruction::CacheIdent;
 use futures::future::BoxFuture;
@@ -38,7 +40,14 @@ pub fn new_table_cache(
 ) -> TableCache {
    let init = init_factory(table_info_cache, table_name_cache);

-    CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
+    CacheContainer::with_strategy(
+        name,
+        cache,
+        Box::new(invalidator),
+        init,
+        filter,
+        InitStrategy::VersionChecked,
+    )
 }

 fn init_factory(
--- a/src/cmd/src/datanode.rs
+++ b/src/cmd/src/datanode.rs
@@ -79,7 +79,7 @@ impl App for Instance {
    }

    async fn start(&mut self) -> Result<()> {
-        plugins::start_datanode_plugins(self.datanode.plugins())
+        plugins::start_datanode_plugins(&self.datanode)
            .await
            .context(StartDatanodeSnafu)?;

--- a/src/cmd/src/flownode.rs
+++ b/src/cmd/src/flownode.rs
@@ -90,7 +90,7 @@ impl App for Instance {
    }

    async fn start(&mut self) -> Result<()> {
-        plugins::start_flownode_plugins(self.flownode.flow_engine().plugins().clone())
+        plugins::start_flownode_plugins(&self.flownode)
            .await
            .context(StartFlownodeSnafu)?;

--- a/src/cmd/src/frontend.rs
+++ b/src/cmd/src/frontend.rs
@@ -95,8 +95,7 @@ impl App for Instance {
    }

    async fn start(&mut self) -> Result<()> {
-        let plugins = self.frontend.instance.plugins().clone();
-        plugins::start_frontend_plugins(plugins)
+        plugins::start_frontend_plugins(&self.frontend.instance)
            .await
            .context(error::StartFrontendSnafu)?;

--- a/src/cmd/src/metasrv.rs
+++ b/src/cmd/src/metasrv.rs
@@ -68,7 +68,7 @@ impl App for Instance {
    }

    async fn start(&mut self) -> Result<()> {
-        plugins::start_metasrv_plugins(self.instance.plugins())
+        plugins::start_metasrv_plugins(&self.instance)
            .await
            .context(StartMetaServerSnafu)?;

--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -164,7 +164,7 @@ impl App for Instance {
            .start(self.leader_services_context.clone())
            .await?;

-        plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
+        plugins::start_frontend_plugins(&self.frontend.instance)
            .await
            .context(error::StartFrontendSnafu)?;

--- a/src/cmd/tests/load_config_test.rs
+++ b/src/cmd/tests/load_config_test.rs
@@ -114,6 +114,7 @@ fn test_load_frontend_example_config() {
        component: FrontendOptions {
            default_timezone: Some("UTC".to_string()),
            default_column_prefix: Some("greptime".to_string()),
+            auto_create_table: true,
            meta_client: Some(MetaClientOptions {
                metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
                timeout: Duration::from_secs(3),
@@ -267,6 +268,7 @@ fn test_load_standalone_example_config() {
        component: StandaloneOptions {
            default_timezone: Some("UTC".to_string()),
            default_column_prefix: Some("greptime".to_string()),
+            auto_create_table: true,
            wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig {
                dir: Some(format!("{}/{}", DEFAULT_DATA_HOME, WAL_DIR)),
                sync_period: Some(Duration::from_secs(10)),
--- a/src/common/datasource/Cargo.toml
+++ b/src/common/datasource/Cargo.toml
@@ -33,6 +33,7 @@ datatypes.workspace = true
 futures.workspace = true
 lazy_static.workspace = true
 object-store.workspace = true
+object_store_opendal.workspace = true
 orc-rust = { version = "0.8", default-features = false, features = ["async"] }
 parquet.workspace = true
 paste.workspace = true
--- a/src/common/datasource/src/file_format.rs
+++ b/src/common/datasource/src/file_format.rs
@@ -316,7 +316,7 @@ pub async fn file_to_stream(
            .with_file_compression_type(df_compression)
            .build();

-    let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
+    let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
    let file_opener = config.file_source().create_file_opener(store, &config, 0)?;
    let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new())?;

--- a/src/common/datasource/src/file_format/tests.rs
+++ b/src/common/datasource/src/file_format/tests.rs
@@ -44,7 +44,7 @@ struct Test<'a> {

 impl Test<'_> {
    async fn run(self, store: &ObjectStore) {
-        let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
+        let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
        let file_opener = self
            .file_source
            .create_file_opener(store, &self.config, 0)
--- a/src/common/datasource/src/object_store/oss.rs
+++ b/src/common/datasource/src/object_store/oss.rs
@@ -27,12 +27,14 @@ const ACCESS_KEY_ID: &str = "access_key_id";
 const ACCESS_KEY_SECRET: &str = "access_key_secret";
 const ROOT: &str = "root";
 const ALLOW_ANONYMOUS: &str = "allow_anonymous";
+const SKIP_SIGNATURE: &str = "skip_signature";

 /// Check if the key is supported in OSS configuration.
 pub fn is_supported_in_oss(key: &str) -> bool {
    [
        ROOT,
        ALLOW_ANONYMOUS,
+        SKIP_SIGNATURE,
        BUCKET,
        ENDPOINT,
        ACCESS_KEY_ID,
@@ -61,18 +63,23 @@ pub fn build_oss_backend(
        builder = builder.access_key_secret(access_key_secret);
    }

-    if let Some(allow_anonymous) = connection.get(ALLOW_ANONYMOUS) {
-        let allow = allow_anonymous.as_str().parse::<bool>().map_err(|e| {
+    if let Some((key, value)) = connection
+        .get(SKIP_SIGNATURE)
+        .map(|value| (SKIP_SIGNATURE, value))
+        .or_else(|| {
+            connection
+                .get(ALLOW_ANONYMOUS)
+                .map(|value| (ALLOW_ANONYMOUS, value))
+        })
+    {
+        let skip_signature = value.as_str().parse::<bool>().map_err(|e| {
            error::InvalidConnectionSnafu {
-                msg: format!(
-                    "failed to parse the option {}={}, {}",
-                    ALLOW_ANONYMOUS, allow_anonymous, e
-                ),
+                msg: format!("failed to parse the option {}={}, {}", key, value, e),
            }
            .build()
        })?;
-        if allow {
-            builder = builder.allow_anonymous();
+        if skip_signature {
+            builder = builder.skip_signature();
        }
    }

@@ -93,6 +100,7 @@ mod tests {
    fn test_is_supported_in_oss() {
        assert!(is_supported_in_oss(ROOT));
        assert!(is_supported_in_oss(ALLOW_ANONYMOUS));
+        assert!(is_supported_in_oss(SKIP_SIGNATURE));
        assert!(is_supported_in_oss(BUCKET));
        assert!(is_supported_in_oss(ENDPOINT));
        assert!(is_supported_in_oss(ACCESS_KEY_ID));
--- a/src/common/datasource/src/test_util.rs
+++ b/src/common/datasource/src/test_util.rs
@@ -103,7 +103,7 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
        test_util::TEST_BATCH_SIZE,
        schema.clone(),
        FileCompressionType::UNCOMPRESSED,
-        Arc::new(object_store::compat::OpendalStore::new(store.clone())),
+        Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
        true,
    );

@@ -157,7 +157,7 @@ pub async fn setup_stream_to_csv_test(

    let csv_opener = csv_source
        .create_file_opener(
-            Arc::new(object_store::compat::OpendalStore::new(store.clone())),
+            Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
            &config,
            0,
        )
--- a/src/common/meta/src/cache.rs
+++ b/src/common/meta/src/cache.rs
@@ -17,7 +17,7 @@ mod flow;
 mod registry;
 mod table;

-pub use container::{CacheContainer, Initializer, Invalidator, TokenFilter};
+pub use container::{CacheContainer, InitStrategy, Initializer, Invalidator, TokenFilter};
 pub use flow::{TableFlownodeSetCache, TableFlownodeSetCacheRef, new_table_flownode_set_cache};
 pub use registry::{
    CacheRegistry, CacheRegistryBuilder, CacheRegistryRef, LayeredCacheRegistry,
--- a/src/common/meta/src/ddl/create_flow.rs
+++ b/src/common/meta/src/ddl/create_flow.rs
@@ -437,11 +437,13 @@ pub fn defer_on_missing_source(flow_task: &CreateFlowTask) -> Result<bool> {
 pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> {
    for key in flow_task.flow_options.keys() {
        match key.as_str() {
-            DEFER_ON_MISSING_SOURCE_KEY | FlowType::FLOW_TYPE_KEY => {}
+            DEFER_ON_MISSING_SOURCE_KEY
+            | FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY
+            | FlowType::FLOW_TYPE_KEY => {}
            unknown => {
                return UnexpectedSnafu {
                    err_msg: format!(
-                        "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}"
+                        "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}, {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}"
                    ),
                }
                .fail();
@@ -487,6 +489,9 @@ pub enum FlowType {
    Streaming,
 }

+pub const FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY: &str =
+    "experimental_enable_incremental_read";
+
 impl FlowType {
    pub const BATCHING: &str = "batching";
    pub const STREAMING: &str = "streaming";
--- a/src/common/meta/src/ddl/tests/create_flow.rs
+++ b/src/common/meta/src/ddl/tests/create_flow.rs
@@ -24,8 +24,9 @@ use table::table_name::TableName;

 use crate::ddl::DdlContext;
 use crate::ddl::create_flow::{
-    CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType,
-    defer_on_missing_source,
+    CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY,
+    FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType, defer_on_missing_source,
+    validate_flow_options,
 };
 use crate::ddl::test_util::create_table::test_create_table_task;
 use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
@@ -275,6 +276,22 @@ fn test_defer_on_missing_source_invalid_value() {
    );
 }

+#[test]
+fn test_validate_flow_options_allows_incremental_read_option() {
+    let mut task = test_create_flow_task(
+        "my_flow",
+        vec![],
+        TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"),
+        false,
+    );
+    task.flow_options.insert(
+        FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
+        "true".to_string(),
+    );
+
+    validate_flow_options(&task).unwrap();
+}
+
 #[tokio::test]
 async fn test_create_flow_rejects_unknown_option_in_meta_task() {
    let mut task = test_create_flow_task(
--- a/src/file-engine/Cargo.toml
+++ b/src/file-engine/Cargo.toml
@@ -29,6 +29,7 @@ datafusion-expr.workspace = true
 datatypes.workspace = true
 futures.workspace = true
 object-store.workspace = true
+object_store_opendal.workspace = true
 serde = { version = "1.0", features = ["derive"] }
 serde_json.workspace = true
 snafu.workspace = true
--- a/src/file-engine/src/query/file_stream.rs
+++ b/src/file-engine/src/query/file_stream.rs
@@ -61,7 +61,7 @@ fn build_record_batch_stream(
            .with_file_group(FileGroup::new(files))
            .build();

-    let store = Arc::new(object_store::compat::OpendalStore::new(
+    let store = Arc::new(object_store_opendal::OpendalStore::new(
        scan_plan_config.store.clone(),
    ));

--- a/src/flow/src/batching_mode.rs
+++ b/src/flow/src/batching_mode.rs
@@ -23,7 +23,6 @@ use session::ReadPreference;
 mod checkpoint;
 pub(crate) mod engine;
 pub(crate) mod frontend_client;
-mod incremental_filter;
 mod state;
 mod table_creator;
 mod task;
@@ -55,6 +54,10 @@ pub struct BatchingModeOptions {
    pub experimental_max_filter_num_per_query: usize,
    /// Time window merge distance
    pub experimental_time_window_merge_threshold: usize,
+    /// Whether to enable experimental flow incremental source reads.
+    ///
+    /// When disabled, batching flows always execute full-snapshot queries.
+    pub experimental_enable_incremental_read: bool,
    /// Read preference of the Frontend client.
    pub read_preference: ReadPreference,
    /// TLS option for client connections to frontends.
@@ -72,6 +75,7 @@ impl Default for BatchingModeOptions {
            experimental_frontend_scan_timeout: Duration::from_secs(30),
            experimental_max_filter_num_per_query: 20,
            experimental_time_window_merge_threshold: 3,
+            experimental_enable_incremental_read: false,
            read_preference: Default::default(),
            frontend_tls: None,
        }
--- a/src/flow/src/batching_mode/engine.rs
+++ b/src/flow/src/batching_mode/engine.rs
@@ -21,7 +21,7 @@ use std::time::Duration;
 use api::v1::flow::DirtyWindowRequests;
 use catalog::CatalogManagerRef;
 use common_error::ext::BoxedError;
-use common_meta::ddl::create_flow::FlowType;
+use common_meta::ddl::create_flow::{FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType};
 use common_meta::key::TableMetadataManagerRef;
 use common_meta::key::flow::FlowMetadataManagerRef;
 use common_meta::key::flow::flow_state::FlowStat;
@@ -38,6 +38,7 @@ use session::context::QueryContext;
 use snafu::{OptionExt, ResultExt, ensure};
 use sql::parsers::utils::is_tql;
 use store_api::metric_engine_consts::is_metric_engine_internal_column;
+use store_api::mito_engine_options::APPEND_MODE_KEY;
 use store_api::storage::{RegionId, TableId};
 use table::table_reference::TableReference;
 use tokio::sync::{RwLock, oneshot};
@@ -428,6 +429,55 @@ async fn get_table_info(
 }

 impl BatchingEngine {
+    /// Resolves the batching options for one flow: the engine-wide options with the
+    /// per-flow `FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY` override applied, if any.
+    fn batch_opts_for_flow_options(
+        &self,
+        flow_options: &HashMap<String, String>,
+    ) -> Result<Arc<BatchingModeOptions>, Error> {
+        let Some(raw) = flow_options.get(FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY) else {
+            // No per-flow override: share the engine-wide options instead of copying them.
+            return Ok(self.batch_opts.clone());
+        };
+        let enabled = raw.parse::<bool>().map_err(|_| {
+            InvalidQuerySnafu {
+                reason: format!(
+                    "Invalid flow option {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}: {raw}"
+                ),
+            }
+            .build()
+        })?;
+        let mut batch_opts = (*self.batch_opts).clone();
+        batch_opts.experimental_enable_incremental_read = enabled;
+        Ok(Arc::new(batch_opts))
+    }
+
+    /// True when `extra_options` maps `APPEND_MODE_KEY` to "true", ignoring ASCII case.
+    fn table_options_enable_append_mode(extra_options: &HashMap<String, String>) -> bool {
+        let append_mode = extra_options.get(APPEND_MODE_KEY);
+        matches!(append_mode, Some(flag) if flag.eq_ignore_ascii_case("true"))
+    }
+
+    /// Returns an `UnsupportedSnafu` error when incremental read is enabled in `batch_opts`
+    /// but `extra_options` of source table `table_name` do not enable append mode.
+    fn ensure_incremental_source_append_only(
+        batch_opts: &BatchingModeOptions,
+        table_name: &[String; 3],
+        extra_options: &HashMap<String, String>,
+    ) -> Result<(), Error> {
+        let incremental = batch_opts.experimental_enable_incremental_read;
+        ensure!(
+            !incremental || Self::table_options_enable_append_mode(extra_options),
+            UnsupportedSnafu {
+                reason: format!(
+                    "Flow incremental read requires append-only source table, but source table `{}` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read",
+                    table_name.join(".")
+                ),
+            }
+        );
+        Ok(())
+    }
+
    pub async fn create_flow_inner(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
        let CreateFlowArgs {
            flow_id,
@@ -494,6 +544,8 @@ impl BatchingEngine {
            }
        );

+        let batch_opts = self.batch_opts_for_flow_options(&flow_options)?;
+
        let mut source_table_names = Vec::with_capacity(2);
        for src_id in source_table_ids {
            // also check table option to see if ttl!=instant
@@ -509,6 +561,11 @@ impl BatchingEngine {
                    ),
                }
            );
+            Self::ensure_incremental_source_append_only(
+                &batch_opts,
+                &table_name,
+                &table_info.table_info.meta.options.extra_options,
+            )?;

            source_table_names.push(table_name);
        }
@@ -563,7 +620,7 @@ impl BatchingEngine {
            query_ctx,
            catalog_manager: self.catalog_manager.clone(),
            shutdown_rx: rx,
-            batch_opts: self.batch_opts.clone(),
+            batch_opts,
            flow_eval_interval: eval_interval.map(|secs| Duration::from_secs(secs as u64)),
        };

@@ -808,7 +865,7 @@ impl BatchingEngine {
        });

        let res = task
-            .gen_exec_once(
+            .execute_once_serialized(
                &self.query_engine,
                &self.frontend_client,
                cur_dirty_window_cnt,
@@ -946,6 +1003,76 @@ mod tests {
        )
    }

+    #[tokio::test]
+    async fn test_flow_option_overrides_incremental_read_switch() {
+        let engine = new_test_engine().await;
+
+        let default_opts = engine.batch_opts_for_flow_options(&HashMap::new()).unwrap();
+        assert!(!default_opts.experimental_enable_incremental_read);
+
+        let enabled_opts = engine
+            .batch_opts_for_flow_options(&HashMap::from([(
+                FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
+                "true".to_string(),
+            )]))
+            .unwrap();
+        assert!(enabled_opts.experimental_enable_incremental_read);
+    }
+
+    #[test]
+    fn test_table_options_enable_append_mode() {
+        assert!(!BatchingEngine::table_options_enable_append_mode(
+            &HashMap::new()
+        ));
+        assert!(!BatchingEngine::table_options_enable_append_mode(
+            &HashMap::from([(APPEND_MODE_KEY.to_string(), "false".to_string())])
+        ));
+        assert!(BatchingEngine::table_options_enable_append_mode(
+            &HashMap::from([(APPEND_MODE_KEY.to_string(), "TRUE".to_string())])
+        ));
+    }
+
+    #[test]
+    fn test_incremental_source_append_only_enforcement() {
+        let table_name = [
+            "greptime".to_string(),
+            "public".to_string(),
+            "numbers".to_string(),
+        ];
+        let disabled_opts = BatchingModeOptions::default();
+        let enabled_opts = BatchingModeOptions {
+            experimental_enable_incremental_read: true,
+            ..Default::default()
+        };
+        let non_append_options = HashMap::new();
+        let append_options = HashMap::from([(APPEND_MODE_KEY.to_string(), "true".to_string())]);
+
+        BatchingEngine::ensure_incremental_source_append_only(
+            &disabled_opts,
+            &table_name,
+            &non_append_options,
+        )
+        .expect("disabled incremental read should not require append-only source");
+        BatchingEngine::ensure_incremental_source_append_only(
+            &enabled_opts,
+            &table_name,
+            &append_options,
+        )
+        .expect("append-only source should be accepted when incremental read is enabled");
+
+        let err = BatchingEngine::ensure_incremental_source_append_only(
+            &enabled_opts,
+            &table_name,
+            &non_append_options,
+        )
+        .expect_err("non-append source should be rejected when incremental read is enabled");
+        assert!(
+            err.to_string()
+                .contains("Flow incremental read requires append-only source table"),
+            "{err}"
+        );
+    }
+
    async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) {
        let query_engine = create_test_query_engine();
        let ctx = QueryContext::arc();
--- a/src/flow/src/batching_mode/incremental_filter.rs
+++ b/src/flow/src/batching_mode/incremental_filter.rs
@@ -1,222 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use common_telemetry::tracing::debug;
-use datafusion_expr::Expr;
-use datatypes::schema::Schema;
-
-use crate::batching_mode::state::FilterExprInfo;
-use crate::batching_mode::utils::IncrementalAggregateAnalysis;
-use crate::{Error, FlowId};
-
-pub(super) fn build_sink_dirty_time_window_filter_expr(
-    flow_id: FlowId,
-    analysis: &IncrementalAggregateAnalysis,
-    sink_schema: &Schema,
-    dirty_filter: Option<&FilterExprInfo>,
-) -> Result<Option<Expr>, Error> {
-    let Some(dirty_filter) = dirty_filter else {
-        return Ok(None);
-    };
-
-    let Some(sink_filter_col) =
-        infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, dirty_filter)
-    else {
-        return Ok(None);
-    };
-
-    dirty_filter.predicate_for_col(&sink_filter_col)
-}
-
-fn infer_sink_time_window_filter_col(
-    flow_id: FlowId,
-    analysis: &IncrementalAggregateAnalysis,
-    sink_schema: &Schema,
-    dirty_filter: &FilterExprInfo,
-) -> Option<String> {
-    if analysis.group_key_names.is_empty() {
-        return None;
-    }
-
-    let is_timestamp_group_key = |name: &str| {
-        analysis.group_key_names.iter().any(|key| key == name)
-            && sink_schema
-                .column_schema_by_name(name)
-                .is_some_and(|col| col.data_type.is_timestamp())
-    };
-
-    if is_timestamp_group_key(&dirty_filter.col_name) {
-        return Some(dirty_filter.col_name.clone());
-    }
-
-    let candidates = analysis
-        .group_key_names
-        .iter()
-        .filter(|name| is_timestamp_group_key(name))
-        .cloned()
-        .collect::<Vec<_>>();
-
-    match candidates.as_slice() {
-        [name] => Some(name.clone()),
-        [] => {
-            debug!(
-                "Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}",
-                flow_id, analysis.group_key_names
-            );
-            None
-        }
-        _ => {
-            debug!(
-                "Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}",
-                flow_id, candidates
-            );
-            None
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use datatypes::prelude::ConcreteDataType;
-    use datatypes::schema::ColumnSchema;
-    use pretty_assertions::assert_eq;
-
-    use super::*;
-    use crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL;
-    use crate::batching_mode::state::FilterExprInfo;
-    use crate::batching_mode::utils::IncrementalAggregateAnalysis;
-
-    fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis {
-        IncrementalAggregateAnalysis {
-            group_key_names: group_key_names
-                .into_iter()
-                .map(|name| name.to_string())
-                .collect(),
-            merge_columns: vec![],
-            literal_columns: vec![],
-            output_field_names: vec![],
-            unsupported_exprs: vec![],
-        }
-    }
-
-    fn test_dirty_filter(col_name: &str) -> FilterExprInfo {
-        FilterExprInfo {
-            expr: datafusion_expr::col(col_name),
-            col_name: col_name.to_string(),
-            time_ranges: vec![],
-            window_size: chrono::Duration::seconds(1),
-        }
-    }
-
-    fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema {
-        Schema::new(
-            columns
-                .into_iter()
-                .map(|(name, data_type)| ColumnSchema::new(name, data_type, true))
-                .collect(),
-        )
-    }
-
-    #[test]
-    fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() {
-        let analysis = test_analysis_with_group_keys(vec!["ts", "host"]);
-        let sink_schema = test_sink_schema(vec![
-            ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
-            ("host", ConcreteDataType::string_datatype()),
-        ]);
-        let dirty_filter = test_dirty_filter("ts");
-
-        assert_eq!(
-            Some("ts".to_string()),
-            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
-        );
-    }
-
-    #[test]
-    fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() {
-        let analysis = test_analysis_with_group_keys(vec!["host", "time_window"]);
-        let sink_schema = test_sink_schema(vec![
-            ("host", ConcreteDataType::string_datatype()),
-            (
-                "time_window",
-                ConcreteDataType::timestamp_millisecond_datatype(),
-            ),
-            (
-                AUTO_CREATED_UPDATE_AT_TS_COL,
-                ConcreteDataType::timestamp_millisecond_datatype(),
-            ),
-        ]);
-        let dirty_filter = test_dirty_filter("ts");
-
-        assert_eq!(
-            Some("time_window".to_string()),
-            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
-        );
-    }
-
-    #[test]
-    fn test_infer_sink_time_window_filter_col_skips_global_aggregate() {
-        let analysis = test_analysis_with_group_keys(vec![]);
-        let sink_schema = test_sink_schema(vec![
-            ("number", ConcreteDataType::uint32_datatype()),
-            (
-                "time_window",
-                ConcreteDataType::timestamp_millisecond_datatype(),
-            ),
-        ]);
-        let dirty_filter = test_dirty_filter("ts");
-
-        assert_eq!(
-            None,
-            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
-        );
-    }
-
-    #[test]
-    fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() {
-        let analysis = test_analysis_with_group_keys(vec!["host", "device"]);
-        let sink_schema = test_sink_schema(vec![
-            ("host", ConcreteDataType::string_datatype()),
-            ("device", ConcreteDataType::string_datatype()),
-            (
-                AUTO_CREATED_UPDATE_AT_TS_COL,
-                ConcreteDataType::timestamp_millisecond_datatype(),
-            ),
-        ]);
-        let dirty_filter = test_dirty_filter("ts");
-
-        assert_eq!(
-            None,
-            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
-        );
-    }
-
-    #[test]
-    fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() {
-        let analysis = test_analysis_with_group_keys(vec!["ts", "time_window"]);
-        let sink_schema = test_sink_schema(vec![
-            ("ts", ConcreteDataType::timestamp_millisecond_datatype()),
-            (
-                "time_window",
-                ConcreteDataType::timestamp_millisecond_datatype(),
-            ),
-        ]);
-        let dirty_filter = test_dirty_filter("source_ts");
-
-        assert_eq!(
-            None,
-            infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
-        );
-    }
-}
--- a/src/flow/src/batching_mode/state.rs
+++ b/src/flow/src/batching_mode/state.rs
@@ -66,12 +66,20 @@ pub struct TaskState {
 }
 impl TaskState {
    pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
+        Self::with_dirty_time_windows(query_ctx, shutdown_rx, DirtyTimeWindows::default())
+    }
+
+    pub fn with_dirty_time_windows(
+        query_ctx: QueryContextRef,
+        shutdown_rx: oneshot::Receiver<()>,
+        dirty_time_windows: DirtyTimeWindows,
+    ) -> Self {
        Self {
            query_ctx,
            last_update_time: Instant::now(),
            last_query_duration: Duration::from_secs(0),
            last_exec_time_millis: None,
-            dirty_time_windows: Default::default(),
+            dirty_time_windows,
            checkpoint_mode: CheckpointMode::FullSnapshot,
            checkpoints: Default::default(),
            incremental_disabled: false,
@@ -264,6 +272,16 @@ impl DirtyTimeWindows {
            time_window_merge_threshold,
        }
    }
+
+    #[cfg(test)]
+    pub(crate) fn max_filter_num_per_query(&self) -> usize {
+        self.max_filter_num_per_query
+    }
+
+    #[cfg(test)]
+    pub(crate) fn time_window_merge_threshold(&self) -> usize {
+        self.time_window_merge_threshold
+    }
 }

 impl Default for DirtyTimeWindows {
@@ -681,7 +699,7 @@ impl DirtyTimeWindows {
    }
 }

-fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
+pub(crate) fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
    let value = Value::from(value);
    let value = value
        .try_to_scalar_value(&value.data_type())
--- a/src/flow/src/batching_mode/task.rs
+++ b/src/flow/src/batching_mode/task.rs
@@ -27,7 +27,7 @@ use datafusion::datasource::DefaultTableSource;
 use datafusion::sql::unparser::expr_to_sql;
 use datafusion_common::DFSchemaRef;
 use datafusion_common::tree_node::{Transformed, TreeNode};
-use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp};
+use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp, col, lit};
 use datatypes::schema::Schema;
 use query::QueryEngineRef;
 use query::options::FLOW_INCREMENTAL_MODE;
@@ -38,14 +38,16 @@ use sql::parsers::utils::is_tql;
 use store_api::mito_engine_options::MERGE_MODE_KEY;
 use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
 use table::table::adapter::DfTableProviderAdapter;
-use tokio::sync::oneshot;
 use tokio::sync::oneshot::error::TryRecvError;
+use tokio::sync::{Mutex, oneshot};
 use tokio::time::Instant;

 use crate::batching_mode::BatchingModeOptions;
 use crate::batching_mode::checkpoint::checkpoint_mode_label;
 use crate::batching_mode::frontend_client::{FrontendClient, PeerDesc};
-use crate::batching_mode::state::{CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState};
+use crate::batching_mode::state::{
+    CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState, to_df_literal,
+};
 use crate::batching_mode::table_creator::{QueryType, create_table_with_expr};
 use crate::batching_mode::time_window::TimeWindowExpr;
 use crate::batching_mode::utils::{
@@ -67,12 +69,6 @@ use crate::{Error, FlowId};
 mod ckpt;
 mod inc;

-/// Maximum number of dirty time-window predicates attached to one incremental
-/// SQL query. This keeps generated OR filters bounded so Substrait encoding and
-/// downstream planning remain predictable; if the backlog is larger, the flow
-/// drains one capped batch and postpones checkpoint advancement to a later run.
-const MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS: usize = 4096;
-
 /// The task's config, immutable once created
 #[derive(Clone)]
 pub struct TaskConfig {
@@ -113,6 +109,10 @@ fn is_merge_mode_last_non_null(options: &HashMap<String, String>) -> bool {
 pub struct BatchingTask {
    pub config: Arc<TaskConfig>,
    pub state: Arc<RwLock<TaskState>>,
+    /// Serializes plan generation, execution, checkpoint advancement, and dirty
+    /// window restoration for this flow. Without this, a manual flush and the
+    /// background loop can process the same checkpoint range concurrently.
+    execution_lock: Arc<Mutex<()>>,
 }

 /// Arguments for creating batching task
@@ -150,6 +150,16 @@ pub enum DirtyRestore {
    Unscoped(DirtyTimeWindows),
 }

+/// Outcome of one flow evaluation: the generated query (if any) and its result.
+struct ExecuteOnceOutcome {
+    /// The generated `PlanInfo`, or `None` when generation failed or produced no query.
+    new_query: Option<PlanInfo>,
+    /// Result of the evaluation: `Ok(Some((affected_rows, elapsed)))` when a query was
+    /// executed, `Ok(None)` when no query was generated because there was no dirty
+    /// signal, and `Err(_)` when plan generation or execution failed.
+    result: Result<Option<(usize, Duration)>, Error>,
+}
+
 impl BatchingTask {
    #[allow(clippy::too_many_arguments)]
    pub fn try_new(
@@ -168,6 +178,18 @@ impl BatchingTask {
            flow_eval_interval,
        }: TaskArgs<'_>,
    ) -> Result<Self, Error> {
+        let mut state = TaskState::with_dirty_time_windows(
+            query_ctx.clone(),
+            shutdown_rx,
+            DirtyTimeWindows::new(
+                batch_opts.experimental_max_filter_num_per_query,
+                batch_opts.experimental_time_window_merge_threshold,
+            ),
+        );
+        if !batch_opts.experimental_enable_incremental_read {
+            state.disable_incremental();
+        }
+
        Ok(Self {
            config: Arc::new(TaskConfig {
                flow_id,
@@ -182,7 +204,8 @@ impl BatchingTask {
                batch_opts,
                flow_eval_interval,
            }),
-            state: Arc::new(RwLock::new(TaskState::new(query_ctx, shutdown_rx))),
+            state: Arc::new(RwLock::new(state)),
+            execution_lock: Arc::new(Mutex::new(())),
        })
    }

@@ -251,40 +274,75 @@ impl BatchingTask {
            .context(ExternalSnafu)
    }

-    pub async fn gen_exec_once(
+    pub(crate) async fn execute_once_serialized(
        &self,
        engine: &QueryEngineRef,
        frontend_client: &Arc<FrontendClient>,
        max_window_cnt: Option<usize>,
    ) -> Result<Option<(usize, Duration)>, Error> {
-        if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? {
+        let outcome = self
+            .execute_once_serialized_with_outcome(engine, frontend_client, max_window_cnt)
+            .await;
+        outcome.result
+    }
+
+    /// Acquires `execution_lock`, runs `execute_once_unlocked` while holding it, and
+    /// returns the resulting `ExecuteOnceOutcome` unchanged.
+    async fn execute_once_serialized_with_outcome(
+        &self,
+        engine: &QueryEngineRef,
+        frontend_client: &Arc<FrontendClient>,
+        max_window_cnt: Option<usize>,
+    ) -> ExecuteOnceOutcome {
+        let _serialized = self.execution_lock.lock().await;
+        let evaluation = self.execute_once_unlocked(engine, frontend_client, max_window_cnt);
+        evaluation.await
+    }
+
+    /// Executes one flow evaluation. Caller must hold `execution_lock`.
+    async fn execute_once_unlocked(
+        &self,
+        engine: &QueryEngineRef,
+        frontend_client: &Arc<FrontendClient>,
+        max_window_cnt: Option<usize>,
+    ) -> ExecuteOnceOutcome {
+        let new_query = match self.gen_insert_plan_unlocked(engine, max_window_cnt).await {
+            Ok(new_query) => new_query,
+            Err(err) => {
+                return ExecuteOnceOutcome {
+                    new_query: None,
+                    result: Err(err),
+                };
+            }
+        };
+
+        if let Some(new_query) = new_query {
            debug!("Generate new query: {}", new_query.plan);
-            let dirty_filter = match &new_query.dirty_restore {
-                DirtyRestore::Scoped(f) => Some(f),
-                _ => None,
-            };
-            match self
-                .execute_logical_plan(
+            let res = self
+                .execute_logical_plan_unlocked(
                    frontend_client,
                    &new_query.plan,
-                    dirty_filter,
                    new_query.can_advance_checkpoints,
                )
-                .await
-            {
-                Ok(result) => Ok(result),
-                Err(err) => {
-                    self.handle_executed_query_failure(Some(&new_query));
-                    Err(err)
-                }
+                .await;
+            if res.is_err() {
+                self.handle_executed_query_failure(Some(&new_query));
+            }
+            ExecuteOnceOutcome {
+                new_query: Some(new_query),
+                result: res,
            }
        } else {
            debug!("Generate no query");
-            Ok(None)
+            ExecuteOnceOutcome {
+                new_query: None,
+                result: Ok(None),
+            }
        }
    }

-    pub async fn gen_insert_plan(
+    /// Generates the insert plan. Caller must reach this through the serialized path.
+    async fn gen_insert_plan_unlocked(
        &self,
        engine: &QueryEngineRef,
        max_window_cnt: Option<usize>,
@@ -388,11 +446,11 @@ impl BatchingTask {
        Ok(())
    }

-    pub async fn execute_logical_plan(
+    /// Executes the insert plan. Caller must reach this through the serialized path.
+    async fn execute_logical_plan_unlocked(
        &self,
        frontend_client: &Arc<FrontendClient>,
        plan: &LogicalPlan,
-        dirty_filter: Option<&FilterExprInfo>,
        can_advance_checkpoints: bool,
    ) -> Result<Option<(usize, Duration)>, Error> {
        let instant = Instant::now();
@@ -426,8 +484,7 @@ impl BatchingTask {
        // For incremental-mode SQL queries, attempt to rewrite the delta aggregate
        // plan into a safe delta-LEFT-JOIN-sink form before deciding on extensions.
        let incremental_plan = if can_advance_checkpoints {
-            self.prepare_plan_for_incremental(&plan, dirty_filter)
-                .await?
+            self.prepare_plan_for_incremental(&plan).await?
        } else {
            None
        };
@@ -580,6 +637,112 @@ impl BatchingTask {
        })
    }

+    /// Hands `dirty_windows` back to the task state by passing them to
+    /// `add_dirty_windows` on the state's `dirty_time_windows`, under the
+    /// state write lock.
+    fn restore_unscoped_dirty_windows(&self, dirty_windows: &DirtyTimeWindows) {
+        let mut state = self.state.write().unwrap();
+        state.dirty_time_windows.add_dirty_windows(dirty_windows);
+    }
+
+    /// Passes `result` through unchanged; when it is an `Err`, the given
+    /// `dirty_windows` are first restored into the task state.
+    fn restore_unscoped_dirty_windows_on_err<T>(
+        &self,
+        dirty_windows: &DirtyTimeWindows,
+        result: Result<T, Error>,
+    ) -> Result<T, Error> {
+        if result.is_err() {
+            self.restore_unscoped_dirty_windows(dirty_windows);
+        }
+        result
+    }
+
+    /// Copies the current dirty windows and then cleans them from the state,
+    /// both while holding a single state write lock.
+    ///
+    /// Returns `(is_dirty, drained)`: `is_dirty` is `true` when the copy is
+    /// non-empty, and `drained` is the copy itself.
+    fn drain_dirty_windows_signal(&self) -> (bool, DirtyTimeWindows) {
+        let mut guard = self.state.write().unwrap();
+        let drained = guard.dirty_time_windows.clone();
+        guard.dirty_time_windows.clean();
+        (!drained.is_empty(), drained)
+    }
+
+    /// Builds the plan for the configured query via
+    /// `gen_plan_with_matching_schema` and wraps it in a `PlanInfo`.
+    ///
+    /// When `retention_filter` is `Some((col_name, lower_bound, context))`, a
+    /// `col_name >= lower_bound` predicate is applied to the plan through
+    /// `AddFilterRewriter`; `context` is only used in the error message.
+    ///
+    /// Each fallible step restores `dirty_windows_to_restore` into the task
+    /// state before its error is returned. On success the windows are carried
+    /// in `DirtyRestore::Unscoped` and `can_advance_checkpoints` is `true`.
+    #[allow(clippy::too_many_arguments)]
+    async fn gen_unfiltered_plan_info(
+        &self,
+        engine: QueryEngineRef,
+        query_ctx: QueryContextRef,
+        sink_table_schema: Arc<Schema>,
+        primary_key_indices: &[usize],
+        allow_partial: bool,
+        dirty_windows_to_restore: DirtyTimeWindows,
+        retention_filter: Option<(&str, Timestamp, &'static str)>,
+    ) -> Result<PlanInfo, Error> {
+        let generated = gen_plan_with_matching_schema(
+            &self.config.query,
+            query_ctx,
+            engine,
+            sink_table_schema,
+            primary_key_indices,
+            allow_partial,
+        )
+        .await;
+        let mut plan =
+            self.restore_unscoped_dirty_windows_on_err(&dirty_windows_to_restore, generated)?;
+
+        if let Some((col_name, lower_bound, context)) = retention_filter {
+            let lower_literal = self.restore_unscoped_dirty_windows_on_err(
+                &dirty_windows_to_restore,
+                to_df_literal(lower_bound),
+            )?;
+            let predicate = col(col_name).gt_eq(lit(lower_literal));
+            let mut rewriter = AddFilterRewriter::new(predicate);
+            // Rewrite a clone so the original plan can still be printed in the
+            // error context if the rewrite fails.
+            let rewritten = plan
+                .clone()
+                .rewrite(&mut rewriter)
+                .with_context(|_| DatafusionSnafu {
+                    context: format!(
+                        "Failed to apply {context} expire_after filter to plan:\n {}\n",
+                        plan
+                    ),
+                })
+                .map(|transformed| transformed.data);
+            plan = self
+                .restore_unscoped_dirty_windows_on_err(&dirty_windows_to_restore, rewritten)?;
+        }
+
+        let dirty_restore = DirtyRestore::Unscoped(dirty_windows_to_restore);
+        Ok(PlanInfo {
+            plan,
+            dirty_restore,
+            can_advance_checkpoints: true,
+        })
+    }
+
+    /// Drains the dirty-window signal and, only when it was non-empty, builds
+    /// a plan through `gen_unfiltered_plan_info`.
+    ///
+    /// Returns `Ok(None)` when no dirty windows were recorded, otherwise the
+    /// generated `PlanInfo` wrapped in `Some`, or the generation error.
+    async fn gen_unfiltered_plan_info_if_dirty(
+        &self,
+        engine: QueryEngineRef,
+        query_ctx: QueryContextRef,
+        sink_table_schema: Arc<Schema>,
+        primary_key_indices: &[usize],
+        allow_partial: bool,
+        retention_filter: Option<(&str, Timestamp, &'static str)>,
+    ) -> Result<Option<PlanInfo>, Error> {
+        match self.drain_dirty_windows_signal() {
+            (false, _) => {
+                debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
+                Ok(None)
+            }
+            (true, dirty_windows_to_restore) => {
+                let plan_info = self
+                    .gen_unfiltered_plan_info(
+                        engine,
+                        query_ctx,
+                        sink_table_schema,
+                        primary_key_indices,
+                        allow_partial,
+                        dirty_windows_to_restore,
+                        retention_filter,
+                    )
+                    .await?;
+                Ok(Some(plan_info))
+            }
+        }
+    }
+
    fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) {
        if let Some(query) = query {
            self.restore_dirty_windows_after_failure(query);
@@ -626,33 +789,11 @@ impl BatchingTask {

            let min_refresh = self.config.batch_opts.experimental_min_refresh_duration;

-            let new_query = match self.gen_insert_plan(&engine, max_window_cnt).await {
-                Ok(new_query) => new_query,
-                Err(err) => {
-                    common_telemetry::error!(err; "Failed to generate query for flow={}", self.config.flow_id);
-                    // also sleep for a little while before try again to prevent flooding logs
-                    tokio::time::sleep(min_refresh).await;
-                    continue;
-                }
-            };
+            let outcome = self
+                .execute_once_serialized_with_outcome(&engine, &frontend_client, max_window_cnt)
+                .await;

-            let res = if let Some(new_query) = &new_query {
-                let dirty_filter = match &new_query.dirty_restore {
-                    DirtyRestore::Scoped(f) => Some(f),
-                    _ => None,
-                };
-                self.execute_logical_plan(
-                    &frontend_client,
-                    &new_query.plan,
-                    dirty_filter,
-                    new_query.can_advance_checkpoints,
-                )
-                .await
-            } else {
-                Ok(None)
-            };
-
-            match res {
+            match outcome.result {
                // normal execute, sleep for some time before doing next query
                Ok(Some(_)) => {
                    // can increase max_window_cnt to query more windows next time
@@ -703,11 +844,10 @@ impl BatchingTask {
                }
                // TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed
                Err(err) => {
-                    self.handle_executed_query_failure(new_query.as_ref());
                    METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT
                        .with_label_values(&[&flow_id_str])
                        .inc();
-                    match new_query {
+                    match outcome.new_query {
                        Some(query) => {
                            common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan);
                            // TODO(discord9): add some backoff here? half the query time window or what
@@ -743,6 +883,20 @@ impl BatchingTask {
        create_table_with_expr(&plan, &self.config.sink_table_name, &self.config.query_type)
    }

+    /// Returns `true` when the checkpoint mode is `Incremental`, incremental
+    /// reads are not disabled, and the flow's query type is SQL.
+    fn should_use_unfiltered_incremental_delta(&self) -> bool {
+        let state = self.state.read().unwrap();
+        if state.checkpoint_mode() != CheckpointMode::Incremental {
+            return false;
+        }
+        if state.is_incremental_disabled() {
+            return false;
+        }
+        matches!(self.config.query_type, QueryType::Sql)
+    }
+
+    /// Returns `true` when the checkpoint mode is `FullSnapshot`, incremental
+    /// reads are not disabled, and the flow's query type is SQL.
+    fn should_use_unfiltered_full_snapshot_seeding(&self) -> bool {
+        let state = self.state.read().unwrap();
+        if state.checkpoint_mode() != CheckpointMode::FullSnapshot {
+            return false;
+        }
+        if state.is_incremental_disabled() {
+            return false;
+        }
+        matches!(self.config.query_type, QueryType::Sql)
+    }
+
    /// will merge and use the first ten time window in query
    async fn gen_query_with_time_window(
        &self,
@@ -783,83 +937,35 @@ impl BatchingTask {
                        self.config.flow_id
                    );
                    // clean dirty time window too, this could be from create flow's check_execute
-                    let (is_dirty, dirty_windows_to_restore) = {
-                        let mut state = self.state.write().unwrap();
-                        let dirty_windows_to_restore = state.dirty_time_windows.clone();
-                        let is_dirty = !dirty_windows_to_restore.is_empty();
-                        state.dirty_time_windows.clean();
-                        (is_dirty, dirty_windows_to_restore)
-                    };
-
-                    if !is_dirty {
-                        // no dirty data, hence no need to update
-                        debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
-                        return Ok(None);
-                    }
-
-                    let plan = match gen_plan_with_matching_schema(
-                        &self.config.query,
-                        query_ctx,
-                        engine,
-                        sink_table_schema.clone(),
-                        primary_key_indices,
-                        allow_partial,
-                    )
-                    .await
-                    {
-                        Ok(plan) => plan,
-                        Err(err) => {
-                            self.state
-                                .write()
-                                .unwrap()
-                                .dirty_time_windows
-                                .add_dirty_windows(&dirty_windows_to_restore);
-                            return Err(err);
-                        }
-                    };
-
-                    return Ok(Some(PlanInfo {
-                        plan,
-                        dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
-                        can_advance_checkpoints: true,
-                    }));
+                    return self
+                        .gen_unfiltered_plan_info_if_dirty(
+                            engine,
+                            query_ctx,
+                            sink_table_schema.clone(),
+                            primary_key_indices,
+                            allow_partial,
+                            None,
+                        )
+                        .await;
                }
                _ => {
                    // Clean dirty windows for full-query/non-scoped paths,
                    // such as TQL, that cannot use a time-window filter.
-                    let dirty_windows_to_restore = {
-                        let mut state = self.state.write().unwrap();
-                        let dirty_windows_to_restore = state.dirty_time_windows.clone();
-                        state.dirty_time_windows.clean();
-                        dirty_windows_to_restore
-                    };
+                    let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal();

-                    let plan = match gen_plan_with_matching_schema(
-                        &self.config.query,
-                        query_ctx,
-                        engine,
-                        sink_table_schema.clone(),
-                        primary_key_indices,
-                        allow_partial,
-                    )
-                    .await
-                    {
-                        Ok(plan) => plan,
-                        Err(err) => {
-                            self.state
-                                .write()
-                                .unwrap()
-                                .dirty_time_windows
-                                .add_dirty_windows(&dirty_windows_to_restore);
-                            return Err(err);
-                        }
-                    };
+                    let plan_info = self
+                        .gen_unfiltered_plan_info(
+                            engine,
+                            query_ctx,
+                            sink_table_schema.clone(),
+                            primary_key_indices,
+                            allow_partial,
+                            dirty_windows_to_restore,
+                            None,
+                        )
+                        .await?;

-                    return Ok(Some(PlanInfo {
-                        plan,
-                        dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
-                        can_advance_checkpoints: true,
-                    }));
+                    return Ok(Some(plan_info));
                }
            };

@@ -889,22 +995,61 @@ impl BatchingTask {
                ),
            })?;

+        if self.should_use_unfiltered_full_snapshot_seeding() {
+            // A full-snapshot query that can seed/refresh incremental
+            // checkpoints must not use dirty-window predicates. Rows can be
+            // written after dirty windows are drained but before the source scan
+            // snapshot opens; a stale dirty-window filter could exclude those
+            // rows while the returned watermark includes them, causing the next
+            // incremental read to skip them forever. Execute an unfiltered full
+            // snapshot instead, and keep dirty windows only as the scheduling and
+            // failure-restoration signal.
+            let retention_filter = self
+                .config
+                .expire_after
+                .map(|_| (col_name.as_str(), expire_lower_bound, "full-snapshot"));
+            return self
+                .gen_unfiltered_plan_info_if_dirty(
+                    engine,
+                    query_ctx,
+                    sink_table_schema.clone(),
+                    primary_key_indices,
+                    allow_partial,
+                    retention_filter,
+                )
+                .await;
+        }
+
+        if self.should_use_unfiltered_incremental_delta() {
+            // In incremental mode, source correctness is defined by the
+            // per-region sequence range `(checkpoint, scan-open snapshot]`, not
+            // by dirty-window predicates. Dirty windows are only a scheduling
+            // signal here. Applying a stale dirty-window filter to the source can
+            // exclude rows that are inside the returned watermark and make a
+            // checkpoint advance skip them forever. The sink side is also left
+            // unfiltered by dirty windows; the incremental rewrite joins the
+            // delta groups with the full sink state for correctness. Future
+            // dynamic filters can prune sink reads as a pure optimization.
+            let retention_filter = self
+                .config
+                .expire_after
+                .map(|_| (col_name.as_str(), expire_lower_bound, "incremental"));
+            return self
+                .gen_unfiltered_plan_info_if_dirty(
+                    engine,
+                    query_ctx,
+                    sink_table_schema.clone(),
+                    primary_key_indices,
+                    allow_partial,
+                    retention_filter,
+                )
+                .await;
+        }
+
        let (expr, can_advance_checkpoints) = {
            let mut state = self.state.write().unwrap();
-            let window_cnt = if state.checkpoint_mode() == CheckpointMode::Incremental
-                && !state.is_incremental_disabled()
-                && matches!(self.config.query_type, QueryType::Sql)
-            {
-                // Incremental scans are bounded by region sequence checkpoints,
-                // so the dirty-window filter only narrows sink-side/time-window
-                // work. Drain more windows than normal, but keep a hard cap to
-                // avoid building a huge OR filter after a long downtime. If
-                // windows remain, checkpoints won't advance this round.
-                MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS
-            } else {
-                max_window_cnt
-                    .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query)
-            };
+            let window_cnt = max_window_cnt
+                .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query);
            let expr = state.dirty_time_windows.gen_filter_exprs(
                &col_name,
                Some(expire_lower_bound),
--- a/src/flow/src/batching_mode/task/inc.rs
+++ b/src/flow/src/batching_mode/task/inc.rs
@@ -26,8 +26,7 @@ use snafu::ResultExt;
 use table::metadata::TableId;

 use crate::Error;
-use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr;
-use crate::batching_mode::state::{CheckpointMode, FilterExprInfo};
+use crate::batching_mode::state::CheckpointMode;
 use crate::batching_mode::table_creator::QueryType;
 use crate::batching_mode::task::BatchingTask;
 use crate::batching_mode::utils::{
@@ -74,7 +73,6 @@ impl BatchingTask {
    pub(super) async fn prepare_plan_for_incremental(
        &self,
        plan: &LogicalPlan,
-        dirty_filter: Option<&FilterExprInfo>,
    ) -> Result<Option<LogicalPlan>, Error> {
        let is_incremental_sql = {
            let state = self.state.read().unwrap();
@@ -152,31 +150,12 @@ impl BatchingTask {
                return Ok(None);
            }
        };
-        let sink_schema = sink_table.table_info().meta.schema.clone();
-        let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr(
-            self.config.flow_id,
-            &analysis,
-            &sink_schema,
-            dirty_filter,
-        ) {
-            Ok(filter) => filter,
-            Err(err) => {
-                warn!(
-                    "Flow {} failed to build sink dirty time window filter; \
-                     falling back to full snapshot for this round: {:?}",
-                    self.config.flow_id, err
-                );
-                self.state.write().unwrap().mark_full_snapshot();
-                return Ok(None);
-            }
-        };
-
        let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge(
            &inner_plan,
            &analysis,
            sink_table,
            &self.config.sink_table_name,
-            sink_dirty_filter,
+            None,
        )
        .await
        {
--- a/src/flow/src/batching_mode/task/test.rs
+++ b/src/flow/src/batching_mode/task/test.rs
@@ -25,7 +25,9 @@ use datatypes::data_type::ConcreteDataType as CDT;
 use datatypes::schema::ColumnSchema;
 use datatypes::vectors::{TimestampMillisecondVector, UInt32Vector, VectorRef};
 use pretty_assertions::assert_eq;
-use query::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY};
+use query::options::{
+    FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY, QueryOptions,
+};
 use session::context::QueryContext;
 use table::test_util::MemTable;

@@ -38,6 +40,13 @@ use crate::batching_mode::state::CheckpointMode;
 use crate::batching_mode::time_window::find_time_window_expr;
 use crate::test_utils::create_test_query_engine;

+/// Shared test options: the defaults with
+/// `experimental_enable_incremental_read` switched on.
+fn incremental_batch_opts() -> Arc<BatchingModeOptions> {
+    let opts = BatchingModeOptions {
+        experimental_enable_incremental_read: true,
+        ..Default::default()
+    };
+    Arc::new(opts)
+}
+
 async fn new_test_task_and_plan_with_missing_sink() -> (BatchingTask, LogicalPlan) {
    new_test_task_engine_and_plan_with_query(
        "SELECT number, ts FROM numbers_with_ts",
@@ -60,6 +69,15 @@ impl TestTaskParts {
 }

 async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) -> TestTaskParts {
+    new_test_task_engine_and_plan_with_query_and_opts(query, sink_table, incremental_batch_opts())
+        .await
+}
+
+async fn new_test_task_engine_and_plan_with_query_and_opts(
+    query: &str,
+    sink_table: &str,
+    batch_opts: Arc<BatchingModeOptions>,
+) -> TestTaskParts {
    let query_engine = create_test_query_engine();
    let ctx = QueryContext::arc();
    let plan = sql_to_df_plan(
@@ -91,7 +109,7 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str)
        query_ctx: ctx,
        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
        shutdown_rx: rx,
-        batch_opts: Arc::new(BatchingModeOptions::default()),
+        batch_opts,
        flow_eval_interval: None,
    })
    .unwrap();
@@ -103,6 +121,75 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str)
    }
 }

+/// A task built from default `BatchingModeOptions` must report incremental
+/// reads as disabled.
+#[tokio::test]
+async fn test_incremental_read_is_disabled_by_default() {
+    let default_opts = Arc::new(BatchingModeOptions::default());
+    let task = new_test_task_engine_and_plan_with_query_and_opts(
+        "SELECT number, ts FROM numbers_with_ts",
+        "numbers_with_ts",
+        default_opts,
+    )
+    .await
+    .task;
+
+    let state = task.state.read().unwrap();
+    assert!(state.is_incremental_disabled());
+}
+
+/// The task's dirty time windows must report the filter-count and
+/// merge-threshold values supplied through `BatchingModeOptions`.
+#[tokio::test]
+async fn test_dirty_time_windows_uses_batch_opts() {
+    let custom_opts = BatchingModeOptions {
+        experimental_max_filter_num_per_query: 7,
+        experimental_time_window_merge_threshold: 11,
+        ..Default::default()
+    };
+    let task = new_test_task_engine_and_plan_with_query_and_opts(
+        "SELECT number, ts FROM numbers_with_ts",
+        "numbers_with_ts",
+        Arc::new(custom_opts),
+    )
+    .await
+    .task;
+
+    let guard = task.state.read().unwrap();
+    let windows = &guard.dirty_time_windows;
+    assert_eq!(7, windows.max_filter_num_per_query());
+    assert_eq!(11, windows.time_window_merge_threshold());
+}
+
+/// `execute_once_serialized` must not finish while `execution_lock` is held
+/// elsewhere, and must complete (here with an error) once it is released.
+#[tokio::test]
+async fn test_execute_once_serialized_waits_for_execution_lock() {
+    let TestTaskParts {
+        task, query_engine, ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        "missing_sink",
+    )
+    .await;
+    // `_handler` is a named binding, so it is kept alive until the test ends.
+    let (client, _handler) = FrontendClient::from_empty_grpc_handler(QueryOptions::default());
+    let frontend_client = Arc::new(client);
+
+    // Take the lock before spawning so the spawned call has to wait for it.
+    let held_lock = task.execution_lock.clone().lock_owned().await;
+    let pending = {
+        let task = task.clone();
+        let engine = query_engine.clone();
+        let client = frontend_client.clone();
+        tokio::spawn(async move { task.execute_once_serialized(&engine, &client, None).await })
+    };
+
+    tokio::time::sleep(Duration::from_millis(20)).await;
+    assert!(
+        !pending.is_finished(),
+        "execute_once_serialized should wait for execution_lock"
+    );
+
+    drop(held_lock);
+    let joined = tokio::time::timeout(Duration::from_secs(1), pending)
+        .await
+        .expect("execute_once_serialized should finish once execution_lock is released");
+    let outcome = joined.expect("execute_once_serialized task should not panic");
+    outcome.expect_err("missing sink should fail after acquiring execution_lock");
+}
+
 async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
    let query_engine = create_test_query_engine();
    let ctx = QueryContext::arc();
@@ -147,7 +234,7 @@ async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
        query_ctx: ctx,
        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
        shutdown_rx: rx,
-        batch_opts: Arc::new(BatchingModeOptions::default()),
+        batch_opts: incremental_batch_opts(),
        flow_eval_interval: None,
    })
    .unwrap();
@@ -226,6 +313,14 @@ fn dirty_range(start: i64, end: i64) -> DirtyTimeWindows {
    dirty
 }

+/// Returns the current Unix time in whole seconds minus ten, as an `i64`,
+/// for use as the `expire_after` value in the retention-filter tests.
+fn expire_after_for_retention_filter_test() -> i64 {
+    let since_epoch = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .expect("Time went backwards");
+    (since_epoch.as_secs() - 10) as i64
+}
+
 async fn assert_unscoped_failure_restore(
    consumed_dirty_windows: DirtyTimeWindows,
    current_dirty_windows: DirtyTimeWindows,
@@ -626,6 +721,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after
    .await;
    {
        let mut state = task.state.write().unwrap();
+        state.disable_incremental();
        state
            .dirty_time_windows
            .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
@@ -657,7 +753,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after
 }

 #[tokio::test]
-async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_safety() {
+async fn test_incremental_plan_consumes_dirty_signal_for_checkpoint_safety() {
    let TestTaskParts {
        task,
        query_engine,
@@ -692,6 +788,192 @@ async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_
    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
 }

+/// In full-snapshot mode with incremental reads enabled, the generated plan
+/// must drain the dirty windows, allow checkpoint advance, and contain no
+/// `Filter:` node.
+#[tokio::test]
+async fn test_full_snapshot_seeding_for_incremental_does_not_add_dirty_window_filter() {
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
+    )
+    .await;
+    {
+        let mut state = task.state.write().unwrap();
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert!(!state.is_incremental_disabled());
+        // Two disjoint dirty windows: [0s, 5s) and [30s, 35s).
+        for (start, end) in [(0, 5), (30, 35)] {
+            state.dirty_time_windows.add_window(
+                Timestamp::new_second(start),
+                Some(Timestamp::new_second(end)),
+            );
+        }
+    }
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), false),
+        time_window_column,
+    ]));
+
+    let plan_info = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+
+    let plan_text = plan_info.plan.to_string();
+    assert!(plan_info.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+    assert!(!plan_text.contains("Filter:"), "{plan_text}");
+}
+
+/// In full-snapshot mode with `expire_after` set, the generated plan must
+/// still carry a `ts >= ...` retention filter while draining dirty windows.
+#[tokio::test]
+async fn test_full_snapshot_seeding_applies_expire_after_retention_filter() {
+    let TestTaskParts {
+        mut task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
+    )
+    .await;
+    {
+        let mut state = task.state.write().unwrap();
+        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
+        assert!(!state.is_incremental_disabled());
+        let window_start = Timestamp::new_second(0);
+        let window_end = Timestamp::new_second(5);
+        state
+            .dirty_time_windows
+            .add_window(window_start, Some(window_end));
+    }
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), false),
+        time_window_column,
+    ]));
+
+    let config = Arc::get_mut(&mut task.config).expect("test task config should be uniquely owned");
+    config.expire_after = Some(expire_after_for_retention_filter_test());
+    let plan_info = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(plan_info.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+    let plan_text = plan_info.plan.to_string();
+    assert!(
+        plan_text.contains("Filter: ts >= TimestampMillisecond("),
+        "{plan_text}"
+    );
+}
+
+/// After checkpoints have been advanced, the generated plan must allow
+/// checkpoint advance and contain no `Filter:` node.
+#[tokio::test]
+async fn test_incremental_plan_does_not_add_dirty_window_filter() {
+    let TestTaskParts {
+        task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
+    )
+    .await;
+    {
+        let mut state = task.state.write().unwrap();
+        let checkpoints = HashMap::from([(1_u64, 10_u64)]);
+        state.advance_checkpoints(checkpoints);
+        let window_start = Timestamp::new_second(0);
+        let window_end = Timestamp::new_second(5);
+        state
+            .dirty_time_windows
+            .add_window(window_start, Some(window_end));
+    }
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), false),
+        time_window_column,
+    ]));
+
+    let plan_info = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+
+    let plan_text = plan_info.plan.to_string();
+    assert!(plan_info.can_advance_checkpoints);
+    assert!(!plan_text.contains("Filter:"), "{plan_text}");
+}
+
+/// After checkpoints have been advanced and with `expire_after` set, the
+/// generated plan must carry a `ts >= ...` retention filter while draining
+/// dirty windows.
+#[tokio::test]
+async fn test_incremental_delta_applies_expire_after_retention_filter() {
+    let TestTaskParts {
+        mut task,
+        query_engine,
+        ..
+    } = new_time_window_test_task_with_query(
+        "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
+    )
+    .await;
+    {
+        let mut state = task.state.write().unwrap();
+        let checkpoints = HashMap::from([(1_u64, 10_u64)]);
+        state.advance_checkpoints(checkpoints);
+        let window_start = Timestamp::new_second(0);
+        let window_end = Timestamp::new_second(5);
+        state
+            .dirty_time_windows
+            .add_window(window_start, Some(window_end));
+    }
+    let time_window_column =
+        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
+            .with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), false),
+        time_window_column,
+    ]));
+
+    let config = Arc::get_mut(&mut task.config).expect("test task config should be uniquely owned");
+    config.expire_after = Some(expire_after_for_retention_filter_test());
+    let plan_info = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
+        .await
+        .unwrap()
+        .unwrap();
+
+    assert!(plan_info.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+    let plan_text = plan_info.plan.to_string();
+    assert!(
+        plan_text.contains("Filter: ts >= TimestampMillisecond("),
+        "{plan_text}"
+    );
+}
+
+/// With the query type switched to TQL and no dirty windows recorded, plan
+/// generation must still return a plan that allows checkpoint advance.
+#[tokio::test]
+async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() {
+    let TestTaskParts {
+        mut task,
+        query_engine,
+        ..
+    } = new_test_task_engine_and_plan_with_query(
+        "SELECT number, ts FROM numbers_with_ts",
+        "missing_sink",
+    )
+    .await;
+    let config = Arc::get_mut(&mut task.config).expect("test task config should be uniquely owned");
+    config.query_type = QueryType::Tql;
+    // Start from an explicitly empty dirty signal.
+    task.state.write().unwrap().dirty_time_windows.clean();
+    let ts_column =
+        ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true);
+    let sink_schema = Arc::new(Schema::new(vec![
+        ColumnSchema::new("number", CDT::uint32_datatype(), false),
+        ts_column,
+    ]));
+
+    let plan_info = task
+        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
+        .await
+        .unwrap()
+        .expect("non-scoped path should generate a plan even with an empty dirty signal");
+
+    assert!(plan_info.can_advance_checkpoints);
+    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
+}
+
 #[tokio::test]
 async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
    let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
@@ -773,7 +1055,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
        query_ctx: ctx,
        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
        shutdown_rx: rx,
-        batch_opts: Arc::new(BatchingModeOptions::default()),
+        batch_opts: incremental_batch_opts(),
        flow_eval_interval: None,
    })
    .unwrap();
@@ -788,10 +1070,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
        CheckpointMode::Incremental
    );

-    let incremental_plan = task
-        .prepare_plan_for_incremental(&dml_plan, None)
-        .await
-        .unwrap();
+    let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap();
    assert!(incremental_plan.is_none());
    let state = task.state.read().unwrap();
    assert!(state.is_incremental_disabled());
@@ -852,7 +1131,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite
        query_ctx: ctx,
        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
        shutdown_rx: rx,
-        batch_opts: Arc::new(BatchingModeOptions::default()),
+        batch_opts: incremental_batch_opts(),
        flow_eval_interval: None,
    })
    .unwrap();
@@ -866,10 +1145,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite
        CheckpointMode::Incremental
    );

-    let incremental_plan = task
-        .prepare_plan_for_incremental(&dml_plan, None)
-        .await
-        .unwrap();
+    let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap();
    assert!(incremental_plan.is_none());
    let state = task.state.read().unwrap();
    assert!(!state.is_incremental_disabled());
@@ -928,7 +1204,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o
        query_ctx: ctx,
        catalog_manager: query_engine.engine_state().catalog_manager().clone(),
        shutdown_rx: rx,
-        batch_opts: Arc::new(BatchingModeOptions::default()),
+        batch_opts: incremental_batch_opts(),
        flow_eval_interval: None,
    })
    .unwrap();
@@ -939,7 +1215,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o
        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));

    let incremental_plan = task
-        .prepare_plan_for_incremental(&dml_plan, None)
+        .prepare_plan_for_incremental(&dml_plan)
        .await
        .unwrap()
        .expect("plain GROUP BY is incremental-safe without a rewrite");
@@ -962,7 +1238,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
    task.state.write().unwrap().dirty_time_windows.set_dirty();

    let plan_info = task
-        .gen_insert_plan(&query_engine, None)
+        .gen_insert_plan_unlocked(&query_engine, None)
        .await
        .unwrap()
        .unwrap();
@@ -973,7 +1249,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
        .unwrap()
        .advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
    let incremental_plan = task
-        .prepare_plan_for_incremental(&plan_info.plan, None)
+        .prepare_plan_for_incremental(&plan_info.plan)
        .await
        .unwrap();
    let incremental_safe = incremental_plan.is_some();
@@ -1078,11 +1354,11 @@ async fn test_insert_plan_matching_failure_restores_consumed_dirty_marker() {
    register_number_only_sink(&query_engine, sink_table);
    task.state.write().unwrap().dirty_time_windows.set_dirty();

-    let result = task.gen_insert_plan(&query_engine, None).await;
+    let result = task.gen_insert_plan_unlocked(&query_engine, None).await;

    assert!(result.is_err());
    let _err = match result {
-        Ok(_) => panic!("gen_insert_plan should fail with a sink column mismatch"),
+        Ok(_) => panic!("gen_insert_plan_unlocked should fail with a sink column mismatch"),
        Err(err) => err,
    };
    let state = task.state.read().unwrap();
--- a/src/flow/src/batching_mode/utils/test.rs
+++ b/src/flow/src/batching_mode/utils/test.rs
@@ -1288,9 +1288,10 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {

 #[tokio::test]
 async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() {
-    // This verifies the rewrite placement when callers supply an already
-    // inferred sink dirty-window predicate. The task-level inference rules are
-    // covered by `infer_sink_time_window_filter_col` tests in task.rs.
+    // This verifies the rewrite placement when callers supply a sink predicate.
+    // The production incremental flow path currently leaves sink scans
+    // unfiltered for correctness and relies on future dynamic filters for
+    // pruning.
    let query_engine = create_test_query_engine();
    let ctx = QueryContext::arc();
    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
--- a/src/flow/src/server.rs
+++ b/src/flow/src/server.rs
@@ -566,11 +566,15 @@ impl FrontendInvoker {
                name: TABLE_FLOWNODE_SET_CACHE_NAME,
            })?;

+        // TODO(auto_create_table): flow sink tables are created through a controlled
+        // `CREATE FLOW` path, not client writes, so they are intentionally exempt from
+        // the frontend's global auto-create switch. Revisit if flow should honor it.
        let inserter = Arc::new(Inserter::new(
            catalog_manager.clone(),
            partition_manager.clone(),
            node_manager.clone(),
            table_flownode_cache,
+            true,
        ));

        let deleter = Arc::new(Deleter::new(
--- a/src/frontend/src/frontend.rs
+++ b/src/frontend/src/frontend.rs
@@ -44,6 +44,11 @@ pub struct FrontendOptions {
    pub node_id: Option<String>,
    pub default_timezone: Option<String>,
    pub default_column_prefix: Option<String>,
+    /// Server-side global switch for auto table creation on write.
+    /// Acts as an upper bound: when `false`, missing tables are never auto-created
+    /// even if a request sets the `auto_create_table` hint to `true`. When `true`
+    /// (default), the per-request hint still applies. Default: `true`.
+    pub auto_create_table: bool,
    /// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
    /// Set to 0 to disable the limit. Default: "0" (unlimited)
    pub max_in_flight_write_bytes: ReadableSize,
@@ -82,6 +87,7 @@ impl Default for FrontendOptions {
            node_id: None,
            default_timezone: None,
            default_column_prefix: None,
+            auto_create_table: true,
            max_in_flight_write_bytes: ReadableSize(0),
            write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
            http: HttpOptions::default(),
--- a/src/frontend/src/instance/builder.rs
+++ b/src/frontend/src/instance/builder.rs
@@ -185,6 +185,7 @@ impl FrontendBuilder {
            partition_manager.clone(),
            node_manager.clone(),
            table_flownode_cache,
+            self.options.auto_create_table,
        ));
        let deleter = Arc::new(Deleter::new(
            self.catalog_manager.clone(),
--- a/src/frontend/src/instance/otlp.rs
+++ b/src/frontend/src/instance/otlp.rs
@@ -43,7 +43,12 @@ use servers::query_handler::{
 };
 use session::context::QueryContextRef;
 use snafu::{IntoError, ResultExt};
-use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM};
+use table::requests::{
+    OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE,
+    SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_TRACE_HAS_EVENTS,
+    SEMANTIC_TRACE_HAS_LINKS, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC,
+    SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1,
+};

 use crate::instance::Instance;
 use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type;
@@ -131,12 +136,14 @@ impl OpenTelemetryProtocolHandler for Instance {
        let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
        OTLP_METRICS_ROWS.inc_by(rows as u64);

-        let ctx = if !is_legacy {
+        let ctx = {
            let mut c = (*ctx).clone();
-            c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
+            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
+            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
+            if !is_legacy {
+                c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
+            }
            Arc::new(c)
-        } else {
-            ctx
        };

        // If the user uses the legacy path, it is by default without metric engine.
@@ -211,6 +218,15 @@ impl OpenTelemetryProtocolHandler for Instance {
            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
        interceptor_ref.pre_execute(ctx.clone())?;

+        // `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here
+        // reaches the context that drives table auto-create.
+        let ctx = {
+            let mut c = (*ctx).clone();
+            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG);
+            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
+            Arc::new(c)
+        };
+
        let opt_req = otlp::logs::to_grpc_insert_requests(
            request,
            pipeline,
@@ -256,6 +272,23 @@ impl Instance {
        ctx: QueryContextRef,
    ) -> ServerResult<TraceIngestOutcome> {
        let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
+
+        // Only the main span table gets the identity; the derived `_services` /
+        // `_operations` lookup tables keep the unstamped `ctx`.
+        let main_ctx = {
+            let mut c = (*ctx).clone();
+            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE);
+            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
+            if is_trace_v1_model {
+                c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1);
+                c.set_extension(SEMANTIC_TRACE_HAS_EVENTS, "true");
+                c.set_extension(SEMANTIC_TRACE_HAS_LINKS, "true");
+                // schema_url is row-level, so conventions is unknown at table level.
+                c.set_extension(SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN);
+            }
+            Arc::new(c)
+        };
+
        let ingest_ctx = TraceChunkIngestContext {
            pipeline_handler,
            pipeline,
@@ -278,7 +311,7 @@ impl Instance {
                .map(|chunk| chunk.collect::<Vec<_>>())
                .collect::<Vec<_>>();
            for chunk in chunks {
-                self.ingest_trace_chunk(&ingest_ctx, chunk, ctx.clone(), &mut ingest_state)
+                self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state)
                    .await?;
            }
        }
--- a/src/meta-srv/src/procedure/repartition.rs
+++ b/src/meta-srv/src/procedure/repartition.rs
@@ -440,7 +440,17 @@ impl Context {
        };
        let _ = self
            .cache_invalidator
-            .invalidate(&ctx, &[CacheIdent::TableId(table_id)])
+            .invalidate(
+                &ctx,
+                &[
+                    CacheIdent::TableId(table_id),
+                    CacheIdent::TableName(TableName {
+                        catalog_name: self.persistent_ctx.catalog_name.clone(),
+                        schema_name: self.persistent_ctx.schema_name.clone(),
+                        table_name: self.persistent_ctx.table_name.clone(),
+                    }),
+                ],
+            )
            .await;
        Ok(())
    }
--- a/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs
+++ b/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs
@@ -95,10 +95,19 @@ impl State for UpdatePartitionMetadata {

        let mut new_table_info = table_info_value.table_info.clone();
        new_table_info.meta.partition_key_indices = partition_key_indices;
+        common_telemetry::info!(
+            "Update table partition metadata, table_id: {}, partition_key_indices: {:?}, partition_columns: {:?}",
+            table_id,
+            new_table_info.meta.partition_key_indices,
+            new_table_info
+                .meta
+                .partition_column_names()
+                .cloned()
+                .collect::<Vec<_>>(),
+        );
        ctx.update_table_info(&table_info_value, table_info_value.update(new_table_info))
            .await?;
-        // We don't invalidate cache here because the subsequent AllocateRegion step
-        // will update the table route and invalidate the cache accordingly.
+        ctx.invalidate_table_cache().await?;

        Ok((
            Box::new(AllocateRegion::new(self.plan_entries.clone())),
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -50,6 +50,7 @@ datafusion-common.workspace = true
 datafusion-expr.workspace = true
 datatypes.workspace = true
 dashmap.workspace = true
+derive_more.workspace = true
 dotenv.workspace = true
 either.workspace = true
 futures.workspace = true
--- a/src/mito2/src/compaction.rs
+++ b/src/mito2/src/compaction.rs
@@ -150,6 +150,7 @@ impl CompactionScheduler {
    }

    /// Schedules a compaction for the region.
+    /// Returns whether a compaction is scheduled.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn schedule_compaction(
        &mut self,
@@ -161,7 +162,7 @@ impl CompactionScheduler {
        manifest_ctx: &ManifestContextRef,
        schema_metadata_manager: SchemaMetadataManagerRef,
        max_parallelism: usize,
-    ) -> Result<()> {
+    ) -> Result<bool> {
        // skip compaction if region is in staging state
        let current_state = manifest_ctx.current_state();
        if current_state == RegionRoleState::Leader(RegionLeaderState::Staging) {
@@ -170,7 +171,7 @@ impl CompactionScheduler {
                region_id, compact_options
            );
            waiter.send(Ok(0));
-            return Ok(());
+            return Ok(false);
        }

        if let Some(status) = self.region_status.get_mut(&region_id) {
@@ -192,7 +193,7 @@ impl CompactionScheduler {
                    );
                }
            }
-            return Ok(());
+            return Ok(false);
        }

        // The region can compact directly.
@@ -209,7 +210,7 @@ impl CompactionScheduler {
            max_parallelism,
        );

-        let result = match self
+        match self
            .schedule_compaction_request(request, compact_options)
            .await
        {
@@ -220,14 +221,12 @@ impl CompactionScheduler {
                status.active_compaction = Some(active_compaction);
                self.region_status.insert(region_id, status);

-                Ok(())
+                self.listener.on_compaction_scheduled(region_id);
+                Ok(true)
            }
-            Ok(None) => Ok(()),
+            Ok(None) => Ok(false),
            Err(e) => Err(e),
-        };
-
-        self.listener.on_compaction_scheduled(region_id);
-        result
+        }
    }

    // Handle pending manual compaction request for the region.
@@ -334,6 +333,27 @@ impl CompactionScheduler {
            // And skip try to schedule next compaction task.
            return pending_ddl_requests;
        }
+        Vec::new()
+    }
+
+    /// Returns `true` if the region has a tracked status whose `active_compaction` is set.
+    pub(crate) fn is_compacting(&self, region_id: RegionId) -> bool {
+        self.region_status
+            .get(&region_id)
+            .is_some_and(|status| status.active_compaction.is_some())
+    }
+
+    /// Schedules next compaction upon a finished compaction.
+    /// Returns whether the compaction is scheduled.
+    pub(crate) async fn schedule_next_compaction(
+        &mut self,
+        region_id: RegionId,
+        manifest_ctx: &ManifestContextRef,
+        schema_metadata_manager: SchemaMetadataManagerRef,
+    ) -> bool {
+        let Some(status) = self.region_status.get_mut(&region_id) else {
+            return false;
+        };

        // We should always try to compact the region until picker returns None.
        let request = status.new_compaction_request(
@@ -364,20 +384,21 @@ impl CompactionScheduler {
                    "Successfully scheduled next compaction for region id: {}",
                    region_id
                );
+                true
            }
            Ok(None) => {
                // No further compaction tasks can be scheduled; cleanup the `CompactionStatus` for this region.
                // All DDL requests and pending compaction requests have already been processed.
                // Safe to remove the region from status tracking.
                self.region_status.remove(&region_id);
+                false
            }
            Err(e) => {
                error!(e; "Failed to schedule next compaction for region {}", region_id);
                self.remove_region_on_failure(region_id, Arc::new(e));
+                false
            }
        }
-
-        Vec::new()
    }

    /// Notifies the scheduler that the compaction job is cancelled cooperatively.
@@ -1435,7 +1456,7 @@ mod tests {
        let manifest_ctx = env
            .mock_manifest_context(version_control.current().version.metadata.clone())
            .await;
-        scheduler
+        let scheduled = scheduler
            .schedule_compaction(
                builder.region_id(),
                compact_request::Options::Regular(Default::default()),
@@ -1448,6 +1469,7 @@ mod tests {
            )
            .await
            .unwrap();
+        assert!(!scheduled);
        let output = output_rx.await.unwrap().unwrap();
        assert_eq!(output, 0);
        assert!(scheduler.region_status.is_empty());
@@ -1456,7 +1478,7 @@ mod tests {
        let version_control = Arc::new(builder.push_l0_file(0, 1000).build());
        let (output_tx, output_rx) = oneshot::channel();
        let waiter = OptionOutputTx::from(output_tx);
-        scheduler
+        let scheduled = scheduler
            .schedule_compaction(
                builder.region_id(),
                compact_request::Options::Regular(Default::default()),
@@ -1469,11 +1491,67 @@ mod tests {
            )
            .await
            .unwrap();
+        assert!(!scheduled);
        let output = output_rx.await.unwrap().unwrap();
        assert_eq!(output, 0);
        assert!(scheduler.region_status.is_empty());
    }

+    /// `schedule_compaction` must return `true` when a compaction job is actually submitted.
+    #[tokio::test]
+    async fn test_schedule_compaction_returns_true_when_task_scheduled() {
+        let job_scheduler = Arc::new(VecScheduler::default());
+        let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone());
+        let (tx, _rx) = mpsc::channel(4);
+        let mut scheduler = env.mock_compaction_scheduler(tx);
+        let mut builder = VersionControlBuilder::new();
+        let region_id = builder.region_id();
+        let end = 1000 * 1000;
+        // Five overlapping L0 files are enough for the regular picker to create a task.
+        let version_control = Arc::new(
+            builder
+                .push_l0_file(0, end)
+                .push_l0_file(10, end)
+                .push_l0_file(50, end)
+                .push_l0_file(80, end)
+                .push_l0_file(90, end)
+                .build(),
+        );
+        let manifest_ctx = env
+            .mock_manifest_context(version_control.current().version.metadata.clone())
+            .await;
+        let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
+        schema_metadata_manager
+            .register_region_table_info(
+                region_id.table_id(),
+                "test_table",
+                "test_catalog",
+                "test_schema",
+                None,
+                kv_backend,
+            )
+            .await;
+
+        let scheduled = scheduler
+            .schedule_compaction(
+                region_id,
+                Options::Regular(Default::default()),
+                &version_control,
+                &env.access_layer,
+                OptionOutputTx::none(),
+                &manifest_ctx,
+                schema_metadata_manager,
+                1,
+            )
+            .await
+            .unwrap();
+
+        // The worker uses this result to decide whether to update last_schedule_compaction_millis.
+        assert!(scheduled);
+        assert_eq!(1, job_scheduler.num_jobs());
+        assert!(scheduler.region_status.contains_key(&region_id));
+    }
+
    #[tokio::test]
    async fn test_schedule_on_finished() {
        common_telemetry::init_default_ut_logging();
@@ -1511,7 +1589,7 @@ mod tests {
        let manifest_ctx = env
            .mock_manifest_context(version_control.current().version.metadata.clone())
            .await;
-        scheduler
+        let scheduled = scheduler
            .schedule_compaction(
                region_id,
                compact_request::Options::Regular(Default::default()),
@@ -1525,6 +1603,7 @@ mod tests {
            .await
            .unwrap();
        // Should schedule 1 compaction.
+        assert!(scheduled);
        assert_eq!(1, scheduler.region_status.len());
        assert_eq!(1, job_scheduler.num_jobs());
        let data = version_control.current();
@@ -1543,7 +1622,7 @@ mod tests {
        );
        // The task is pending.
        let (tx, _rx) = oneshot::channel();
-        scheduler
+        let scheduled = scheduler
            .schedule_compaction(
                region_id,
                compact_request::Options::Regular(Default::default()),
@@ -1556,6 +1635,7 @@ mod tests {
            )
            .await
            .unwrap();
+        assert!(!scheduled);
        assert_eq!(1, scheduler.region_status.len());
        assert_eq!(1, job_scheduler.num_jobs());
        assert!(
@@ -1571,6 +1651,10 @@ mod tests {
        scheduler
            .on_compaction_finished(region_id, &manifest_ctx, schema_metadata_manager.clone())
            .await;
+        let scheduled = scheduler
+            .schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager.clone())
+            .await;
+        assert!(scheduled);
        assert_eq!(1, scheduler.region_status.len());
        assert_eq!(2, job_scheduler.num_jobs());

@@ -1583,7 +1667,7 @@ mod tests {
        );
        let (tx, _rx) = oneshot::channel();
        // The task is pending.
-        scheduler
+        let scheduled = scheduler
            .schedule_compaction(
                region_id,
                compact_request::Options::Regular(Default::default()),
@@ -1596,6 +1680,7 @@ mod tests {
            )
            .await
            .unwrap();
+        assert!(!scheduled);
        assert_eq!(2, job_scheduler.num_jobs());
        assert!(
            !scheduler
@@ -2329,6 +2414,15 @@ mod tests {
            .await;

        assert!(pending_ddls.is_empty());
+        assert!(scheduler.region_status.contains_key(&region_id));
+
+        let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager();
+        // With no compactable files, next scheduling returns false and removes
+        // the status without creating a background task.
+        let scheduled = scheduler
+            .schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager)
+            .await;
+        assert!(!scheduled);
        assert!(!scheduler.region_status.contains_key(&region_id));
    }

@@ -2371,6 +2465,14 @@ mod tests {
            .await;

        assert!(pending_ddls.is_empty());
+        assert!(scheduler.region_status.contains_key(&region_id));
+
+        let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager();
+        // The failing scheduler simulates a submit error; callers must see false.
+        let scheduled = scheduler
+            .schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager)
+            .await;
+        assert!(!scheduled);
        assert!(!scheduler.region_status.contains_key(&region_id));
    }

--- a/src/mito2/src/compaction/run.rs
+++ b/src/mito2/src/compaction/run.rs
@@ -15,6 +15,9 @@
 //! This file contains code to find sorted runs in a set if ranged items and
 //! along with the best way to merge these items to satisfy the desired run count.

+use std::cmp::Ordering;
+use std::collections::BinaryHeap;
+
 use bytes::{Buf, Bytes};
 use common_base::BitVec;
 use common_base::readable_size::ReadableSize;
@@ -423,6 +426,133 @@ where
    runs
 }

+/// Groups `items` into sorted runs of non-overlapping items, returned in creation order.
+/// `items` is sorted by `sort_ranged_items` first; an empty input yields no runs.
+/// The run selection is meant to match the one of `find_sorted_runs`.
+pub(crate) fn find_sorted_runs_by_time_range<T>(items: &mut [T]) -> Vec<SortedRun<T>>
+where
+    T: Item,
+{
+    if items.is_empty() {
+        return vec![];
+    }
+    sort_ranged_items(items);
+
+    use derive_more::{Eq, PartialEq};
+
+    /// `SortedRun` with a creation sequence `i`; equality only compares `i` (`run` is skipped).
+    #[derive(PartialEq, Eq)]
+    struct Run<T: Item> {
+        i: usize,
+        #[partial_eq(skip)]
+        run: SortedRun<T>,
+    }
+
+    impl<T: Item> Run<T> {
+        fn new(i: usize, item: &T) -> Run<T> {
+            let mut run = SortedRun::default();
+            run.push_item(item.clone());
+            Run { i, run }
+        }
+
+        fn push_item(&mut self, item: &T) {
+            self.run.push_item(item.clone());
+        }
+    }
+
+    impl<T: Item> PartialOrd for Run<T> {
+        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    /// Sort by run's `end` desc, then `start` asc, then creation sequence `i` asc.
+    impl<T: Item> Ord for Run<T> {
+        fn cmp(&self, other: &Self) -> Ordering {
+            let l_run = &self.run;
+            let r_run = &other.run;
+
+            // Safety: `start` and `end` must both exist because `Run::new`, the only way to create
+            // a `Run` in this function's scope, pushes an item into it immediately.
+            let l_end = l_run.end.unwrap();
+            let r_end = r_run.end.unwrap();
+            r_end
+                .cmp(&l_end)
+                .then_with(|| {
+                    let l_start = l_run.start.unwrap();
+                    let r_start = r_run.start.unwrap();
+                    l_start.cmp(&r_start)
+                })
+                .then_with(|| self.i.cmp(&other.i))
+        }
+    }
+
+    /// Wrapper around the `Run` above, ordered by reversed `i` so the earliest run compares greatest.
+    #[derive(PartialEq, Eq)]
+    struct Wrapper<T: Item>(Run<T>);
+
+    impl<T: Item> PartialOrd for Wrapper<T> {
+        fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    impl<T: Item> Ord for Wrapper<T> {
+        fn cmp(&self, other: &Self) -> Ordering {
+            other.0.i.cmp(&self.0.i)
+        }
+    }
+
+    // Two heaps are used to find, for each item, the earliest created run that does not
+    // overlap with the item's range.
+    //
+    // Heap 1 (`runs_sort_by_end`) stores the runs that may still overlap with the current item;
+    // its top is the run with the minimal "end".
+    //
+    // Heap 2 (`runs_sort_by_index`) stores the runs whose "end"s do not overlap with the current
+    // item; its top is the earliest created run.
+    //
+    // Finding a suitable run for an item works like this:
+    // 1. move runs from heap 1 to heap 2 until the top of heap 1 overlaps with the current item;
+    // 2. heap 2 now has all the runs that can accept the current item, pop its top (if heap 2 is
+    //    empty, a new run is created for the item instead);
+    // 3. the top is the earliest created run, push the current item into it;
+    // 4. because the run has changed, push it back to heap 1;
+    // 5. check the next item. Important: the runs left in heap 2 need not go back to heap 1,
+    //    because the items are sorted by "start": heap 2's runs all have "end"s no greater than
+    //    the next item's "start".
+    //
+    // Heap 2 only exists to align with the run selection outcomes of the original
+    // `find_sorted_runs` implementation. If only the invariant that each run has non-overlapping
+    // items were needed, heap 2 could be removed to make the code simpler.
+
+    let mut runs_sort_by_end = BinaryHeap::<Run<T>>::new();
+    let mut runs_sort_by_index = BinaryHeap::<Wrapper<T>>::new();
+    let mut i = 0;
+
+    for item in items {
+        let (start, _) = item.range();
+
+        while let Some(run) = runs_sort_by_end.pop_if(|x| x.run.end.unwrap() <= start) {
+            runs_sort_by_index.push(Wrapper(run));
+        }
+
+        let Some(mut run) = runs_sort_by_index.pop() else {
+            i += 1;
+            runs_sort_by_end.push(Run::new(i, item));
+            continue;
+        };
+
+        run.0.push_item(item);
+        runs_sort_by_end.push(run.0);
+    }
+
+    let mut runs = runs_sort_by_end.into_vec();
+    runs.extend(runs_sort_by_index.into_vec().into_iter().map(|x| x.0));
+    runs.sort_unstable_by_key(|run| run.i);
+    runs.into_iter().map(|x| x.run).collect()
+}
+
 /// Finds a set of files with minimum penalty to merge that can reduce the total num of runs.
 /// The penalty of merging is defined as the size of all overlapping files between two runs.
 pub fn reduce_runs<T: Item>(mut runs: Vec<SortedRun<T>>) -> Vec<T> {
@@ -599,6 +729,8 @@ mod tests {
        expected_runs: &[Vec<(i64, i64)>],
    ) -> Vec<SortedRun<MockFile>> {
        let mut files = build_items(ranges);
+        let mut files_clone = files.clone();
+
        let runs = find_sorted_runs(&mut files);

        let result_file_ranges: Vec<Vec<_>> = runs
@@ -606,6 +738,13 @@ mod tests {
            .map(|r| r.items.iter().map(|f| f.range()).collect())
            .collect();
        assert_eq!(&expected_runs, &result_file_ranges);
+
+        let runs_by_time_range = find_sorted_runs_by_time_range(&mut files_clone);
+        let results: Vec<Vec<_>> = runs_by_time_range
+            .iter()
+            .map(|r| r.items.iter().map(|f| f.range()).collect())
+            .collect();
+        assert_eq!(&expected_runs, &results);
        runs
    }

--- a/src/mito2/src/compaction/twcs.rs
+++ b/src/mito2/src/compaction/twcs.rs
@@ -22,14 +22,15 @@ use common_telemetry::{debug, info};
 use common_time::Timestamp;
 use common_time::timestamp::TimeUnit;
 use common_time::timestamp_millis::BucketAligned;
+use rayon::prelude::*;
 use store_api::storage::RegionId;

 use crate::compaction::buckets::infer_time_bucket;
 use crate::compaction::compactor::CompactionRegion;
 use crate::compaction::picker::{Picker, PickerOutput};
 use crate::compaction::run::{
-    FileGroup, Item, Ranged, find_sorted_runs, merge_primary_key_ranges, merge_seq_files,
-    primary_key_ranges_overlap, reduce_runs,
+    FileGroup, Item, Ranged, find_sorted_runs, find_sorted_runs_by_time_range,
+    merge_primary_key_ranges, merge_seq_files, primary_key_ranges_overlap, reduce_runs,
 };
 use crate::compaction::{CompactionOutput, get_expired_ssts};
 use crate::sst::file::{FileHandle, Level, overlaps};
@@ -64,11 +65,10 @@ impl TwcsPicker {
        time_windows: &mut BTreeMap<i64, Window>,
        active_window: Option<i64>,
    ) -> Vec<CompactionOutput> {
-        let mut output = vec![];
-        for (window, files) in time_windows {
-            if files.files.is_empty() {
-                continue;
-            }
+        let find_inputs = |files: &Window,
+                           windows: &BTreeMap<i64, Window>|
+         -> (Vec<FileGroup>, bool) {
+            let window = &files.time_window;
            let mut files_to_merge: Vec<_> = files.files().cloned().collect();

            // Filter out large files in append mode - they won't benefit from compaction
@@ -88,13 +88,18 @@ impl TwcsPicker {
                );
            }

-            let sorted_runs = find_sorted_runs(&mut files_to_merge);
+            let sorted_runs = if files_to_merge.len() < 1024 {
+                find_sorted_runs(&mut files_to_merge)
+            } else {
+                find_sorted_runs_by_time_range(&mut files_to_merge)
+            };
            let found_runs = sorted_runs.len();
            // We only remove deletion markers if we found less than 2 runs and not in append mode.
            // because after compaction there will be no overlapping files.
-            let filter_deleted = !files.overlapping && found_runs <= 2 && !self.append_mode;
+            let filter_deleted =
+                found_runs <= 2 && !self.append_mode && !window_has_overlap(files, windows);
            if found_runs == 0 {
-                continue;
+                return (vec![], filter_deleted);
            }

            let mut inputs = if found_runs > 1 {
@@ -102,7 +107,7 @@ impl TwcsPicker {
            } else {
                let run = sorted_runs.last().unwrap();
                if run.items().len() < self.trigger_file_num {
-                    continue;
+                    return (vec![], filter_deleted);
                }
                // no overlapping files, try merge small files
                merge_seq_files(run.items(), self.max_output_file_size)
@@ -144,6 +149,26 @@ impl TwcsPicker {
                    filter_deleted,
                    &inputs,
                );
+            }
+            (inputs, filter_deleted)
+        };
+
+        let mut output = vec![];
+        let windows = time_windows
+            .values()
+            .filter(|w| !w.files.is_empty())
+            .collect::<Vec<_>>();
+        let chunk_size = self.max_background_tasks.unwrap_or(windows.len()).max(1);
+        'chunks: for chunk in windows.chunks(chunk_size) {
+            for (inputs, filter_deleted) in chunk
+                .par_iter() // compute the inputs of each window in parallel
+                .map(|window| find_inputs(window, time_windows))
+                .collect::<Vec<_>>()
+            {
+                if inputs.is_empty() {
+                    continue;
+                }
+
                output.push(CompactionOutput {
                    output_level: LEVEL_COMPACTED, // always compact to l1
                    inputs: inputs.into_iter().flat_map(|fg| fg.into_files()).collect(),
@@ -158,7 +183,7 @@ impl TwcsPicker {
                        "Region ({:?}) compaction task size larger than max background tasks({}), remaining tasks discarded",
                        region_id, max_background_tasks
                    );
-                    break;
+                    break 'chunks;
                }
            }
        }
@@ -268,7 +293,6 @@ struct Window {
    // created from the same compaction task.
    files: HashMap<Option<NonZeroU64>, FileGroup>,
    time_window: i64,
-    overlapping: bool,
    primary_key_range: Option<(bytes::Bytes, bytes::Bytes)>,
 }

@@ -283,7 +307,6 @@ impl Window {
            end,
            files,
            time_window: 0,
-            overlapping: false,
            primary_key_range,
        }
    }
@@ -346,37 +369,21 @@ fn assign_to_windows<'a>(
            }
        }
    }
-    if windows.is_empty() {
-        return BTreeMap::new();
-    }
+    windows.into_iter().collect()
+}

-    let mut windows = windows.into_values().collect::<Vec<_>>();
-    windows.sort_unstable_by(|l, r| l.start.cmp(&r.start).then(l.end.cmp(&r.end).reverse()));
-
-    for idx in 0..windows.len() {
-        let lhs_range = windows[idx].range();
-        for next_idx in idx + 1..windows.len() {
-            let rhs_range = windows[next_idx].range();
-            if rhs_range.0 > lhs_range.1 {
-                break;
-            }
-
-            let windows_overlap = overlaps(&lhs_range, &rhs_range)
-                && match (
-                    &windows[idx].primary_key_range,
-                    &windows[next_idx].primary_key_range,
-                ) {
-                    (Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
+fn window_has_overlap(this: &Window, windows: &BTreeMap<i64, Window>) -> bool {
+    windows
+        .values()
+        .filter(|that| this.time_window != that.time_window)
+        .any(|that| {
+            overlaps(&this.range(), &that.range()) && {
+                match (&this.primary_key_range, &that.primary_key_range) {
+                    (Some(l), Some(r)) => primary_key_ranges_overlap(l, r),
                    _ => true,
-                };
-            if windows_overlap {
-                windows[idx].overlapping = true;
-                windows[next_idx].overlapping = true;
+                }
            }
-        }
-    }
-
-    windows.into_iter().map(|w| (w.time_window, w)).collect()
+        })
 }

 /// Finds the latest active writing window among all files.
@@ -606,7 +613,8 @@ mod tests {

        for (expected_window, overlapping, window_files) in expected_files {
            let actual_window = windows.get(expected_window).unwrap();
-            assert_eq!(*overlapping, actual_window.overlapping);
+            let actual_overlapping = window_has_overlap(actual_window, &windows);
+            assert_eq!(*overlapping, actual_overlapping);
            let mut file_ranges = actual_window
                .files
                .values()
@@ -744,7 +752,8 @@ mod tests {

        let windows = assign_to_windows(files.iter(), 2);

-        assert!(!windows.get(&2).unwrap().overlapping);
+        let overlapping = window_has_overlap(windows.get(&2).unwrap(), &windows);
+        assert!(!overlapping);
    }

    #[test]
@@ -773,7 +782,8 @@ mod tests {

        let windows = assign_to_windows(files.iter(), 2);

-        assert!(!windows.get(&4).unwrap().overlapping);
+        let overlapping = window_has_overlap(windows.get(&4).unwrap(), &windows);
+        assert!(!overlapping);
    }

    struct CompactionPickerTestCase {
--- a/src/mito2/src/engine/edit_region_test.rs
+++ b/src/mito2/src/engine/edit_region_test.rs
@@ -21,6 +21,7 @@ use common_error::ext::ErrorExt;
 use common_error::status_code::StatusCode;
 use common_recordbatch::DfRecordBatch;
 use common_test_util::flight::encode_to_flight_data;
+use common_time::Timestamp;
 use common_time::util::current_time_millis;
 use datatypes::arrow::array::{ArrayRef, Float64Array, StringArray, TimestampMillisecondArray};
 use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
@@ -67,7 +68,8 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
        default_flat_format: flat_format,
        ..Default::default()
    };
-    let time_provider = Arc::new(MockTimeProvider::new(current_time_millis()));
+    let initial_time = current_time_millis();
+    let time_provider = Arc::new(MockTimeProvider::new(initial_time));
    let engine = env
        .create_engine_with_time(
            config.clone(),
@@ -99,14 +101,22 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
        .await
        .unwrap();
    let region = engine.get_region(region_id).unwrap();
+    let initial_schedule_time = region.last_schedule_compaction_millis();
+    assert_eq!(initial_time, initial_schedule_time);

-    let new_edit = || RegionEdit {
-        files_to_add: vec![FileMeta {
-            region_id: region.region_id,
-            file_id: FileId::random(),
-            level: 0,
-            ..Default::default()
-        }],
+    let new_edit = |file_starts: &[i64]| RegionEdit {
+        files_to_add: file_starts
+            .iter()
+            .map(|start| FileMeta {
+                region_id: region.region_id,
+                file_id: FileId::random(),
+                time_range: (
+                    Timestamp::new_millisecond(*start),
+                    Timestamp::new_millisecond(1000 * 1000),
+                ),
+                ..Default::default()
+            })
+            .collect(),
        files_to_remove: vec![],
        timestamp_ms: None,
        compaction_time_window: None,
@@ -115,19 +125,23 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
        committed_sequence: None,
    };
    engine
-        .edit_region(region.region_id, new_edit())
+        .edit_region(region.region_id, new_edit(&[0, 10, 50, 80]))
        .await
        .unwrap();
    // Asserts that the compaction of the region is not scheduled,
    // because the minimum time interval between two compactions is not passed.
    assert_eq!(rx.try_recv(), Err(oneshot::error::TryRecvError::Empty));
+    assert_eq!(
+        initial_schedule_time,
+        region.last_schedule_compaction_millis()
+    );

    // Simulates the time has passed the min compaction interval,
-    time_provider
-        .set_now(current_time_millis() + config.min_compaction_interval.as_millis() as i64);
+    let next_schedule_time = initial_time + config.min_compaction_interval.as_millis() as i64;
+    time_provider.set_now(next_schedule_time);
    // ... then edits the region again,
    engine
-        .edit_region(region.region_id, new_edit())
+        .edit_region(region.region_id, new_edit(&[90]))
        .await
        .unwrap();
    // ... finally asserts that the compaction of the region is scheduled.
@@ -136,6 +150,9 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
        .unwrap()
        .unwrap();
    assert_eq!(region_id, actual);
+    // Wait for the `last_schedule_compaction_millis` to update.
+    tokio::time::sleep(Duration::from_millis(100)).await;
+    assert_eq!(next_schedule_time, region.last_schedule_compaction_millis());
 }

 #[tokio::test]
--- a/src/mito2/src/lib.rs
+++ b/src/mito2/src/lib.rs
@@ -18,6 +18,7 @@

 #![feature(debug_closure_helpers)]
 #![feature(duration_constructors)]
+#![feature(binary_heap_pop_if)]

 #[cfg(any(test, feature = "test"))]
 #[cfg_attr(feature = "test", allow(unused))]
--- a/src/mito2/src/region.rs
+++ b/src/mito2/src/region.rs
@@ -157,8 +157,8 @@ pub struct MitoRegion {
    pub(crate) provider: Provider,
    /// Last flush time in millis.
    last_flush_millis: AtomicI64,
-    /// Last compaction time in millis.
-    last_compaction_millis: AtomicI64,
+    /// Last time a compaction was scheduled, in millis.
+    last_schedule_compaction_millis: AtomicI64,
    /// Provider to get current time.
    time_provider: TimeProviderRef,
    /// The topic's latest entry id since the region's last flushing.
@@ -251,15 +251,16 @@ impl MitoRegion {
        self.last_flush_millis.store(now, Ordering::Relaxed);
    }

-    /// Returns last compaction timestamp in millis.
-    pub(crate) fn last_compaction_millis(&self) -> i64 {
-        self.last_compaction_millis.load(Ordering::Relaxed)
+    /// Returns the timestamp in millis at which a compaction was last scheduled.
+    pub(crate) fn last_schedule_compaction_millis(&self) -> i64 {
+        self.last_schedule_compaction_millis.load(Ordering::Relaxed)
    }

-    /// Update compaction time to current time.
-    pub(crate) fn update_compaction_millis(&self) {
+    /// Sets the last compaction schedule time to the time provider's current time.
+    pub(crate) fn update_schedule_compaction_millis(&self) {
        let now = self.time_provider.current_time_millis();
-        self.last_compaction_millis.store(now, Ordering::Relaxed);
+        self.last_schedule_compaction_millis
+            .store(now, Ordering::Relaxed);
    }

    /// Returns the table dir.
@@ -1727,7 +1728,7 @@ mod tests {
            file_purger: crate::test_util::new_noop_file_purger(),
            provider: Provider::noop_provider(),
            last_flush_millis: Default::default(),
-            last_compaction_millis: Default::default(),
+            last_schedule_compaction_millis: Default::default(),
            time_provider: Arc::new(StdTimeProvider),
            topic_latest_entry_id: Default::default(),
            written_bytes: Arc::new(AtomicU64::new(0)),
@@ -2084,7 +2085,7 @@ mod tests {
            file_purger: crate::test_util::new_noop_file_purger(),
            provider: Provider::noop_provider(),
            last_flush_millis: Default::default(),
-            last_compaction_millis: Default::default(),
+            last_schedule_compaction_millis: Default::default(),
            time_provider: Arc::new(StdTimeProvider),
            topic_latest_entry_id: Default::default(),
            written_bytes: Arc::new(AtomicU64::new(0)),
--- a/src/mito2/src/region/opener.rs
+++ b/src/mito2/src/region/opener.rs
@@ -345,7 +345,7 @@ impl RegionOpener {
            ),
            provider,
            last_flush_millis: AtomicI64::new(now),
-            last_compaction_millis: AtomicI64::new(now),
+            last_schedule_compaction_millis: AtomicI64::new(now),
            time_provider: self.time_provider.clone(),
            topic_latest_entry_id: AtomicU64::new(0),
            written_bytes: Arc::new(AtomicU64::new(0)),
@@ -581,7 +581,7 @@ impl RegionOpener {
            file_purger,
            provider: provider.clone(),
            last_flush_millis: AtomicI64::new(now),
-            last_compaction_millis: AtomicI64::new(now),
+            last_schedule_compaction_millis: AtomicI64::new(now),
            time_provider: self.time_provider.clone(),
            topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id),
            written_bytes: Arc::new(AtomicU64::new(0)),
--- a/src/mito2/src/worker/handle_compaction.rs
+++ b/src/mito2/src/worker/handle_compaction.rs
@@ -13,7 +13,7 @@
 // limitations under the License.

 use api::v1::region::compact_request;
-use common_telemetry::{error, info, warn};
+use common_telemetry::{debug, error, info};
 use store_api::logstore::LogStore;
 use store_api::region_request::RegionCompactRequest;
 use store_api::storage::RegionId;
@@ -80,7 +80,6 @@ impl<S> RegionWorkerLoop<S> {
                return;
            }
        };
-        region.update_compaction_millis();

        region.version_control.apply_edit(
            Some(request.edit.clone()),
@@ -118,6 +117,31 @@ impl<S> RegionWorkerLoop<S> {
            )
            .await;
        self.handle_ddl_requests(&mut pending_ddls).await;
+
+        if self.compaction_scheduler.is_compacting(region_id) {
+            return;
+        }
+
+        let now = self.time_provider.current_time_millis();
+        if now - region.last_schedule_compaction_millis()
+            >= self.config.min_compaction_interval.as_millis() as i64
+        {
+            debug!(
+                "minimal compaction interval time {:?} has passed, scheduling next compaction",
+                self.config.min_compaction_interval
+            );
+            if self
+                .compaction_scheduler
+                .schedule_next_compaction(
+                    region_id,
+                    &region.manifest_ctx,
+                    self.schema_metadata_manager.clone(),
+                )
+                .await
+            {
+                region.update_schedule_compaction_millis();
+            }
+        }
    }

    pub(crate) async fn handle_compaction_cancelled(
@@ -160,9 +184,14 @@ impl<S> RegionWorkerLoop<S> {
            return;
        }
        let now = self.time_provider.current_time_millis();
-        if now - region.last_compaction_millis()
+        if now - region.last_schedule_compaction_millis()
            >= self.config.min_compaction_interval.as_millis() as i64
-            && let Err(e) = self
+        {
+            debug!(
+                "minimal compaction interval time {:?} has passed, scheduling next compaction",
+                self.config.min_compaction_interval
+            );
+            match self
                .compaction_scheduler
                .schedule_compaction(
                    region.region_id,
@@ -175,11 +204,13 @@ impl<S> RegionWorkerLoop<S> {
                    1, // Default for automatic compaction
                )
                .await
-        {
-            warn!(
-                "Failed to schedule compaction for region: {}, err: {}",
-                region.region_id, e
-            );
+            {
+                Ok(true) => region.update_schedule_compaction_millis(),
+                Ok(false) => {}
+                Err(e) => {
+                    error!(e; "Failed to schedule compaction for region: {}", region.region_id)
+                }
+            }
        }
    }
 }
--- a/src/object-store/Cargo.toml
+++ b/src/object-store/Cargo.toml
@@ -24,7 +24,7 @@ derive_builder = { workspace = true, optional = true }
 futures.workspace = true
 humantime-serde.workspace = true
 lazy_static.workspace = true
-opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7", features = [
+opendal = { version = "0.57", features = [
    "layers-tracing",
    "layers-prometheus",
    "services-azblob",
--- a/src/object-store/src/compat.rs
+++ b/src/object-store/src/compat.rs
--- a/src/object-store/src/layers/mock.rs
+++ b/src/object-store/src/layers/mock.rs
@@ -21,7 +21,7 @@ pub use opendal::raw::{
    Access, Layer, LayeredAccess, OpDelete, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead,
    RpWrite, oio,
 };
-use opendal::raw::{OpCopy, RpCopy};
+use opendal::raw::{OpCopier, OpCopy, RpCopy};
 pub use opendal::{Buffer, Error, ErrorKind, Metadata, Result};

 pub type MockWriterFactory = Arc<dyn Fn(&str, OpWrite, oio::Writer) -> oio::Writer + Send + Sync>;
@@ -146,6 +146,7 @@ impl<A: Access> LayeredAccess for MockAccessor<A> {
    type Writer = MockWriter;
    type Lister = MockLister;
    type Deleter = MockDeleter;
+    type Copier = oio::Copier;

    fn inner(&self) -> &Self::Inner {
        &self.inner
@@ -222,15 +223,24 @@ impl<A: Access> LayeredAccess for MockAccessor<A> {
        }
    }

-    async fn copy(&self, from: &str, to: &str, args: OpCopy) -> Result<RpCopy> {
-        let Some(copy_interceptor) = self.copy_interceptor.as_ref() else {
-            return self.inner.copy(from, to, args).await;
-        };
+    async fn copy(
+        &self,
+        from: &str,
+        to: &str,
+        args: OpCopy,
+        opts: OpCopier,
+    ) -> Result<(RpCopy, Self::Copier)> {
+        if let Some(result) = self
+            .copy_interceptor
+            .as_ref()
+            .and_then(|copy_interceptor| copy_interceptor(from, to, args.clone()))
+        {
+            return result.map(|rp_copy| (rp_copy, Box::new(()) as oio::Copier));
+        }

-        let Some(result) = copy_interceptor(from, to, args.clone()) else {
-            return self.inner.copy(from, to, args).await;
-        };
-
-        result
+        self.inner
+            .copy(from, to, args, opts)
+            .await
+            .map(|(rp_copy, copier)| (rp_copy, Box::new(copier) as oio::Copier))
    }
 }
--- a/src/object-store/src/lib.rs
+++ b/src/object-store/src/lib.rs
@@ -18,7 +18,6 @@ pub use opendal::{
    FuturesAsyncWriter, Lister, Operator as ObjectStore, Reader, Result, Writer, services,
 };

-pub mod compat;
 pub mod config;
 pub mod error;
 pub mod factory;
--- a/src/operator/src/insert.rs
+++ b/src/operator/src/insert.rs
@@ -63,6 +63,7 @@ use table::metadata::TableInfo;
 use table::requests::{
    AUTO_CREATE_TABLE_KEY, InsertRequest as TableInsertRequest, TABLE_DATA_MODEL,
    TABLE_DATA_MODEL_TRACE_V1, TRACE_TABLE_PARTITIONS_HINT_KEY, VALID_TABLE_OPTION_KEYS,
+    is_semantic_option_key,
 };
 use table::table_reference::TableReference;

@@ -83,6 +84,10 @@ pub struct Inserter {
    pub(crate) partition_manager: PartitionRuleManagerRef,
    pub(crate) node_manager: NodeManagerRef,
    pub(crate) table_flownode_set_cache: TableFlownodeSetCacheRef,
+    /// Server-side upper bound for auto table creation on write.
+    /// When `false`, missing tables are never auto-created regardless of the
+    /// per-request `auto_create_table` hint. When `true`, the hint still applies.
+    auto_create_table: bool,
 }

 pub type InserterRef = Arc<Inserter>;
@@ -135,12 +140,14 @@ impl Inserter {
        partition_manager: PartitionRuleManagerRef,
        node_manager: NodeManagerRef,
        table_flownode_set_cache: TableFlownodeSetCacheRef,
+        auto_create_table: bool,
    ) -> Self {
        Self {
            catalog_manager,
            partition_manager,
            node_manager,
            table_flownode_set_cache,
+            auto_create_table,
        }
    }

@@ -469,6 +476,30 @@ impl Inserter {
        Ok(inserts)
    }

+    /// Returns `Ok(None)` if auto table creation is allowed, or `Ok(Some(reason))` if it
+    /// is disabled; the global config is checked first, then the request hint. Returns an
+    /// error if the `auto_create_table` hint is present but is not a valid boolean.
+    fn auto_create_disabled_reason(&self, ctx: &QueryContextRef) -> Result<Option<&'static str>> {
+        let auto_create_table_hint = ctx
+            .extension(AUTO_CREATE_TABLE_KEY)
+            .map(|v| v.parse::<bool>())
+            .transpose()
+            .map_err(|_| {
+                InvalidInsertRequestSnafu {
+                    reason: "`auto_create_table` hint must be a boolean",
+                }
+                .build()
+            })?
+            .unwrap_or(true);
+        Ok(if !self.auto_create_table {
+            Some("auto-create table is disabled by frontend config")
+        } else if !auto_create_table_hint {
+            Some("`auto_create_table` hint is disabled")
+        } else {
+            None
+        })
+    }
+
    /// Creates or alter tables on demand:
    /// - if table does not exist, create table by inferred CreateExpr
    /// - if table exist, check if schema matches. If any new column found, alter table by inferred `AlterExpr`
@@ -498,19 +529,7 @@ impl Inserter {
        let schema = ctx.current_schema();

        let mut table_infos = HashMap::new();
-        // If `auto_create_table` hint is disabled, skip creating/altering tables.
-        let auto_create_table_hint = ctx
-            .extension(AUTO_CREATE_TABLE_KEY)
-            .map(|v| v.parse::<bool>())
-            .transpose()
-            .map_err(|_| {
-                InvalidInsertRequestSnafu {
-                    reason: "`auto_create_table` hint must be a boolean",
-                }
-                .build()
-            })?
-            .unwrap_or(true);
-        if !auto_create_table_hint {
+        if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? {
            let mut instant_table_ids = HashSet::new();
            for req in &requests.inserts {
                let table = self
@@ -518,8 +537,8 @@ impl Inserter {
                    .await?
                    .context(InvalidInsertRequestSnafu {
                        reason: format!(
-                            "Table `{}` does not exist, and `auto_create_table` hint is disabled",
-                            req.table_name
+                            "Table `{}` does not exist, and {}",
+                            req.table_name, disabled_reason
                        ),
                    })?;
                let table_info = table.table_info();
@@ -767,6 +786,16 @@ impl Inserter {
            return Ok(());
        }

+        // Gate here too; otherwise the physical table would still be created while auto-create is disabled.
+        if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? {
+            return InvalidInsertRequestSnafu {
+                reason: format!(
+                    "Physical table `{physical_table}` does not exist, and {disabled_reason}"
+                ),
+            }
+            .fail();
+        }
+
        let table_reference = TableReference::full(catalog_name, &schema_name, &physical_table);
        info!("Physical metric table `{table_reference}` does not exist, try creating table");

@@ -1061,6 +1090,13 @@ pub fn fill_table_options_for_create(
        }
    }

+    // Semantic keys are prefix-matched, not in the fixed allowlist above.
+    for (key, value) in ctx.extensions() {
+        if is_semantic_option_key(&key) {
+            table_options.insert(key, value);
+        }
+    }
+
    match create_type {
        AutoCreateTableType::Logical(physical_table) => {
            table_options.insert(
@@ -1333,6 +1369,7 @@ mod tests {
                Cache::new(100),
                kv_backend.clone(),
            )),
+            true,
        );
        let alter_expr = inserter
            .get_alter_table_expr_on_demand(&mut req, &table, &ctx, true, true)
@@ -1362,6 +1399,34 @@ mod tests {
        assert!(!table_options.contains_key(APPEND_MODE_KEY));
    }

+    #[test]
+    fn test_fill_table_options_copies_semantic_extensions() {
+        use table::requests::{
+            SEMANTIC_PER_TABLE_INDEX_KEY, SEMANTIC_SIGNAL_TYPE, SEMANTIC_SOURCE,
+            SIGNAL_TYPE_METRIC, SOURCE_OPENTELEMETRY,
+        };
+
+        let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
+        ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
+        ctx.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
+        // The internal transport key must NOT be copied into table options.
+        ctx.set_extension(SEMANTIC_PER_TABLE_INDEX_KEY, "{}");
+        let ctx = Arc::new(ctx);
+        let mut table_options = Default::default();
+
+        fill_table_options_for_create(&mut table_options, &AutoCreateTableType::Physical, &ctx);
+
+        assert_eq!(
+            Some(SIGNAL_TYPE_METRIC),
+            table_options.get(SEMANTIC_SIGNAL_TYPE).map(String::as_str)
+        );
+        assert_eq!(
+            Some(SOURCE_OPENTELEMETRY),
+            table_options.get(SEMANTIC_SOURCE).map(String::as_str)
+        );
+        assert!(!table_options.contains_key(SEMANTIC_PER_TABLE_INDEX_KEY));
+    }
+
    #[test]
    fn test_last_non_null_create_options_preserve_default_with_append_mode_false() {
        let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
--- a/src/operator/src/statement/ddl.rs
+++ b/src/operator/src/statement/ddl.rs
@@ -35,7 +35,9 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_reado
 use common_catalog::{format_full_flow_name, format_full_table_name};
 use common_error::ext::BoxedError;
 use common_meta::cache_invalidator::Context;
-use common_meta::ddl::create_flow::{DEFER_ON_MISSING_SOURCE_KEY, FlowType};
+use common_meta::ddl::create_flow::{
+    DEFER_ON_MISSING_SOURCE_KEY, FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType,
+};
 use common_meta::instruction::CacheIdent;
 use common_meta::key::schema_name::{SchemaName, SchemaNameKey};
 use common_meta::procedure_executor::ExecutorContext;
@@ -114,7 +116,10 @@ struct DdlSubmitOptions {
    timeout: Duration,
 }

-const ALLOWED_FLOW_OPTIONS: [&str; 1] = [DEFER_ON_MISSING_SOURCE_KEY];
+const ALLOWED_FLOW_OPTIONS: [&str; 2] = [
+    DEFER_ON_MISSING_SOURCE_KEY,
+    FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY,
+];

 fn build_procedure_id_output(procedure_id: Vec<u8>) -> Result<Output> {
    let procedure_id = String::from_utf8_lossy(&procedure_id).to_string();
@@ -187,7 +192,9 @@ fn validate_and_normalize_flow_options(
            }

            let normalized_value = match key.as_str() {
-                DEFER_ON_MISSING_SOURCE_KEY => normalize_flow_bool_option(&key, &value)?,
+                DEFER_ON_MISSING_SOURCE_KEY | FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY => {
+                    normalize_flow_bool_option(&key, &value)?
+                }
                _ => {
                    return InvalidSqlSnafu {
                        err_msg: format!(
@@ -2478,12 +2485,23 @@ mod test {

    #[test]
    fn test_validate_and_normalize_flow_options_valid() {
-        let options =
-            HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string())]);
+        let options = HashMap::from([
+            (DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string()),
+            (
+                FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
+                "FALSE".to_string(),
+            ),
+        ]);

        assert_eq!(
            validate_and_normalize_flow_options(options).unwrap(),
-            HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),)])
+            HashMap::from([
+                (DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),),
+                (
+                    FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
+                    "false".to_string(),
+                )
+            ])
        );
    }

@@ -2497,7 +2515,7 @@ mod test {

        assert!(
            err.to_string()
-                .contains("unknown flow option 'foo', supported options: defer_on_missing_source")
+                .contains("unknown flow option 'foo', supported options: defer_on_missing_source, experimental_enable_incremental_read")
        );
    }

--- a/src/plugins/src/datanode.rs
+++ b/src/plugins/src/datanode.rs
@@ -14,6 +14,7 @@

 use common_base::Plugins;
 use datanode::config::DatanodeOptions;
+use datanode::datanode::Datanode;
 use datanode::error::Result;

 use crate::options::PluginOptions;
@@ -28,6 +29,6 @@ pub async fn setup_datanode_plugins(
    Ok(())
 }

-pub async fn start_datanode_plugins(_plugins: Plugins) -> Result<()> {
+pub async fn start_datanode_plugins(_instance: &Datanode) -> Result<()> {
    Ok(())
 }
--- a/src/plugins/src/flownode.rs
+++ b/src/plugins/src/flownode.rs
@@ -13,8 +13,8 @@
 // limitations under the License.

 use common_base::Plugins;
-use flow::FlownodeOptions;
 use flow::error::Result;
+use flow::{FlownodeInstance, FlownodeOptions};

 use crate::options::PluginOptions;

@@ -27,7 +27,7 @@ pub async fn setup_flownode_plugins(
    Ok(())
 }

-pub async fn start_flownode_plugins(_plugins: Plugins) -> Result<()> {
+pub async fn start_flownode_plugins(_instance: &FlownodeInstance) -> Result<()> {
    Ok(())
 }

--- a/src/plugins/src/frontend.rs
+++ b/src/plugins/src/frontend.rs
@@ -17,6 +17,7 @@ use common_base::Plugins;
 use common_meta::cache::CacheRegistryBuilder;
 use frontend::error::{IllegalAuthConfigSnafu, Result};
 use frontend::frontend::FrontendOptions;
+use frontend::instance::Instance;
 use snafu::ResultExt;

 use crate::options::PluginOptions;
@@ -51,7 +52,7 @@ pub async fn setup_frontend_dynamic_plugins(
    Ok(())
 }

-pub async fn start_frontend_plugins(_plugins: Plugins) -> Result<()> {
+pub async fn start_frontend_plugins(_instance: &Instance) -> Result<()> {
    Ok(())
 }

--- a/src/plugins/src/lib.rs
+++ b/src/plugins/src/lib.rs
@@ -26,4 +26,4 @@ pub use flownode::{setup_flownode_plugins, start_flownode_plugins};
 pub use frontend::{setup_frontend_plugins, start_frontend_plugins};
 pub use meta_srv::{setup_metasrv_plugins, start_metasrv_plugins};
 pub use options::PluginOptions;
-pub use standalone::{setup_standalone_plugins, start_standalone_plugins};
+pub use standalone::setup_standalone_plugins;
--- a/src/plugins/src/meta_srv.rs
+++ b/src/plugins/src/meta_srv.rs
@@ -13,6 +13,7 @@
 // limitations under the License.

 use common_base::Plugins;
+use meta_srv::bootstrap::MetasrvInstance;
 use meta_srv::error::Result;
 use meta_srv::metasrv::MetasrvOptions;

@@ -27,6 +28,6 @@ pub async fn setup_metasrv_plugins(
    Ok(())
 }

-pub async fn start_metasrv_plugins(_plugins: Plugins) -> Result<()> {
+pub async fn start_metasrv_plugins(_instance: &MetasrvInstance) -> Result<()> {
    Ok(())
 }
--- a/src/plugins/src/standalone.rs
+++ b/src/plugins/src/standalone.rs
@@ -31,10 +31,6 @@ pub async fn setup_standalone_plugins(
    Ok(())
 }

-pub async fn start_standalone_plugins(_plugins: Plugins) -> Result<()> {
-    Ok(())
-}
-
 /// Allows standalone plugins to add cache invalidators to the layered registry.
 pub fn configure_cache_registry(_plugins: &Plugins) -> Option<CacheRegistryBuilder> {
    None
--- a/src/servers/src/http/prom_store.rs
+++ b/src/servers/src/http/prom_store.rs
@@ -31,6 +31,10 @@ use prost::Message;
 use serde::{Deserialize, Serialize};
 use session::context::{Channel, QueryContext};
 use snafu::prelude::*;
+use table::requests::{
+    METADATA_QUALITY_INFERRED, SEMANTIC_METRIC_METADATA_QUALITY, SEMANTIC_SIGNAL_TYPE,
+    SEMANTIC_SOURCE, SIGNAL_TYPE_METRIC, SOURCE_PROMETHEUS,
+};

 use crate::error::{self, InternalSnafu, PipelineSnafu, Result};
 use crate::http::extractor::PipelineInfo;
@@ -108,6 +112,13 @@ pub async fn remote_write(
        .clone()
        .unwrap_or_else(|| GREPTIME_PHYSICAL_TABLE.to_string());
    query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table.clone());
+    // Stamp the Prometheus metric identity here, before `as_req_iter` splits into the
+    // batched and direct write paths, so both inherit it (the batched path bypasses
+    // `PromStoreProtocolHandler::write`). Prom RW v1 metadata is weak, so the metric type
+    // is inferred from naming and the metadata quality is stamped as `inferred`.
+    query_ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
+    query_ctx.set_extension(SEMANTIC_SOURCE, SOURCE_PROMETHEUS);
+    query_ctx.set_extension(SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_INFERRED);
    let query_ctx = Arc::new(query_ctx);
    let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED
        .with_label_values(&[db.as_str()])
--- a/src/sql/src/parsers/utils.rs
+++ b/src/sql/src/parsers/utils.rs
@@ -40,7 +40,7 @@ use snafu::{ResultExt, ensure};
 use sqlparser::dialect::Dialect;
 use sqlparser::keywords::Keyword;
 use sqlparser::parser::Parser;
-use table::requests::validate_table_option;
+use table::requests::{SEMANTIC_PREFIX, validate_semantic_option, validate_table_option};

 use crate::error::{
    ConvertToLogicalExpressionSnafu, InvalidSqlSnafu, InvalidTableOptionSnafu, ParseSqlValueSnafu,
@@ -395,8 +395,18 @@ pub fn parse_with_options(parser: &mut Parser) -> Result<OptionMap> {
        .into_iter()
        .map(parse_option_string)
        .collect::<Result<HashMap<String, OptionValue>>>()?;
-    for key in options.keys() {
-        ensure!(validate_table_option(key), InvalidTableOptionSnafu { key });
+    for (key, value) in &options {
+        if key.starts_with(SEMANTIC_PREFIX) {
+            // Semantic keys are whitelisted and value-checked against their domain,
+            // so a user cannot set an unknown key or an out-of-range value.
+            let value = value.as_string().unwrap_or_default();
+            ensure!(
+                validate_semantic_option(key, value),
+                InvalidTableOptionSnafu { key }
+            );
+        } else {
+            ensure!(validate_table_option(key), InvalidTableOptionSnafu { key });
+        }
    }
    Ok(OptionMap::new(options))
 }
--- a/src/sql/src/statements/create.rs
+++ b/src/sql/src/statements/create.rs
@@ -868,7 +868,25 @@ ENGINE=mito
 ";
        let result =
            ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
-        assert_matches!(result, Err(Error::InvalidTableOption { .. }))
+        assert_matches!(result, Err(Error::InvalidTableOption { .. }));
+
+        // A whitelisted semantic key with an in-domain value is accepted.
+        let semantic = |with: &str| {
+            let sql =
+                format!("create table demo(host string, ts timestamp time index) with({with});");
+            ParserContext::create_with_dialect(&sql, &GreptimeDbDialect {}, ParseOptions::default())
+        };
+        assert!(semantic("'greptime.semantic.signal_type'='metric'").is_ok());
+        // An out-of-domain value is rejected.
+        assert_matches!(
+            semantic("'greptime.semantic.signal_type'='spans'"),
+            Err(Error::InvalidTableOption { .. })
+        );
+        // An unknown key under the semantic prefix is rejected.
+        assert_matches!(
+            semantic("'greptime.semantic.bogus'='x'"),
+            Err(Error::InvalidTableOption { .. })
+        );
    }

    #[test]
--- a/src/standalone/src/options.rs
+++ b/src/standalone/src/options.rs
@@ -38,6 +38,10 @@ pub struct StandaloneOptions {
    pub enable_telemetry: bool,
    pub default_timezone: Option<String>,
    pub default_column_prefix: Option<String>,
+    /// Server-side global switch for auto table creation on write.
+    /// Upper bound: when `false`, missing tables are never auto-created even if a
+    /// request sets the `auto_create_table` hint to `true`. Default: `true`.
+    pub auto_create_table: bool,
    /// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
    /// Set to 0 to disable the limit. Default: "0" (unlimited)
    pub max_in_flight_write_bytes: ReadableSize,
@@ -77,6 +81,7 @@ impl Default for StandaloneOptions {
            enable_telemetry: true,
            default_timezone: None,
            default_column_prefix: None,
+            auto_create_table: true,
            max_in_flight_write_bytes: ReadableSize(0),
            write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
            http: HttpOptions::default(),
@@ -130,6 +135,7 @@ impl StandaloneOptions {
        let cloned_opts = self.clone();
        FrontendOptions {
            default_timezone: cloned_opts.default_timezone,
+            auto_create_table: cloned_opts.auto_create_table,
            max_in_flight_write_bytes: cloned_opts.max_in_flight_write_bytes,
            write_bytes_exhausted_policy: cloned_opts.write_bytes_exhausted_policy,
            http: cloned_opts.http,
--- a/src/table/src/requests.rs
+++ b/src/table/src/requests.rs
@@ -48,6 +48,9 @@ use crate::error::{ParseTableOptionSnafu, Result};
 use crate::metadata::{TableId, TableVersion};
 use crate::table_reference::TableReference;

+mod semantic;
+pub use semantic::*;
+
 pub const FILE_TABLE_META_KEY: &str = "__private.file_table_meta";
 pub const FILE_TABLE_LOCATION_KEY: &str = "location";
 pub const FILE_TABLE_PATTERN_KEY: &str = "pattern";
@@ -129,6 +132,12 @@ pub fn validate_table_option(key: &str) -> bool {
        return true;
    }

+    // Semantic-layer keys are admitted only via the closed whitelist in the `semantic`
+    // module: unknown keys under the reserved prefix are rejected, not passed through.
+    if is_semantic_option_key(key) {
+        return true;
+    }
+
    VALID_TABLE_OPTION_KEYS.contains(&key) || VALID_DDL_OPTION_KEYS.contains(&key)
 }

@@ -490,6 +499,14 @@ mod tests {
        assert!(validate_table_option(STORAGE_KEY));
        assert!(validate_table_option(MEMTABLE_BULK_MERGE_THRESHOLD));
        assert!(!validate_table_option("foo"));
+
+        // Only whitelisted semantic keys are accepted.
+        assert!(validate_table_option(SEMANTIC_SIGNAL_TYPE));
+        assert!(validate_table_option(SEMANTIC_METRIC_TYPE));
+        // Unknown semantic key, near-miss, and the internal transport key are rejected.
+        assert!(!validate_table_option("greptime.semantic.future.key"));
+        assert!(!validate_table_option("greptime.semanticx"));
+        assert!(!validate_table_option(SEMANTIC_PER_TABLE_INDEX_KEY));
    }

    #[test]
--- a/src/table/src/requests/semantic.rs
+++ b/src/table/src/requests/semantic.rs
@@ -0,0 +1,280 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Table semantic layer vocabulary.
+//!
+//! A thin layer of semantic metadata attached to a table via `table_options`, so
+//! machine consumers (LLM agents, alert/dashboard builders, MCP servers, ETL) can
+//! align a table with the observability concept it stands for without guessing
+//! from column names. See `docs/rfcs/2026-05-28-table-semantic-layer.md`.
+//!
+//! All public table-option keys share the [`SEMANTIC_PREFIX`] namespace and are
+//! string-valued. [`is_semantic_option_key`] gates them through
+//! [`crate::requests::validate_table_option`], so they are accepted both on the
+//! ingestion auto-create path and on explicit `CREATE TABLE ... WITH (...)` DDL.
+
+/// Reserved prefix for every public semantic table-option key.
+pub const SEMANTIC_PREFIX: &str = "greptime.semantic.";
+
+/// Internal `QueryContext` extension key carrying the per-table semantic index
+/// (a `{table_name -> {semantic_key: value}}` JSON blob) from the ingestion
+/// encode path to the auto-create site. Deliberately OUTSIDE [`SEMANTIC_PREFIX`]
+/// so it is not a valid table option and never leaks into a table's options.
+pub const SEMANTIC_PER_TABLE_INDEX_KEY: &str = "greptime.internal.semantic.per_table_index";
+
+// ---- Common keys (all signals) ----
+
+/// Signal kind: one of [`SIGNAL_TYPE_TRACE`] / [`SIGNAL_TYPE_LOG`] /
+/// [`SIGNAL_TYPE_METRIC`] / [`SIGNAL_TYPE_EVENT`].
+pub const SEMANTIC_SIGNAL_TYPE: &str = "greptime.semantic.signal_type";
+/// Ingestion ecosystem, e.g. [`SOURCE_OPENTELEMETRY`] / [`SOURCE_PROMETHEUS`].
+pub const SEMANTIC_SOURCE: &str = "greptime.semantic.source";
+/// Optional protocol or SDK version string, e.g. `v2` (Prom remote write), `1.30.0`.
+pub const SEMANTIC_SOURCE_VERSION: &str = "greptime.semantic.source_version";
+/// Internal ingestion pipeline / data model, e.g. `greptime_trace_v1`.
+pub const SEMANTIC_PIPELINE: &str = "greptime.semantic.pipeline";
+
+// ---- Trace keys ----
+
+/// Semantic-conventions version the rows conform to (e.g. `otel-semconv-1.27`),
+/// or [`SEMANTIC_VALUE_UNKNOWN`] / [`SEMANTIC_VALUE_MIXED`] when not single-valued.
+pub const SEMANTIC_TRACE_CONVENTIONS: &str = "greptime.semantic.trace.conventions";
+/// Whether `span_events` are preserved on the table.
+pub const SEMANTIC_TRACE_HAS_EVENTS: &str = "greptime.semantic.trace.has_events";
+/// Whether `span_links` are preserved on the table.
+pub const SEMANTIC_TRACE_HAS_LINKS: &str = "greptime.semantic.trace.has_links";
+
+// ---- Metric keys (populated in Phase 2) ----
+
+/// Instrument kind: `counter` / `gauge` / `histogram` / `summary` /
+/// `updown_counter` / `gauge_histogram` / `info` / `stateset`.
+pub const SEMANTIC_METRIC_TYPE: &str = "greptime.semantic.metric.type";
+/// UCUM unit, e.g. `s`, `By`, `{request}`.
+pub const SEMANTIC_METRIC_UNIT: &str = "greptime.semantic.metric.unit";
+/// `cumulative` / `delta` (OTel only).
+pub const SEMANTIC_METRIC_TEMPORALITY: &str = "greptime.semantic.metric.temporality";
+/// `true` / `false` for sum / counter typed data.
+pub const SEMANTIC_METRIC_MONOTONIC: &str = "greptime.semantic.metric.monotonic";
+/// [`METADATA_QUALITY_DECLARED`] when the protocol stated the type, or
+/// [`METADATA_QUALITY_INFERRED`] when guessed from a name suffix.
+pub const SEMANTIC_METRIC_METADATA_QUALITY: &str = "greptime.semantic.metric.metadata_quality";
+/// Pre-translation OTel metric name when the table name was Prometheus-ised.
+pub const SEMANTIC_METRIC_ORIGINAL_NAME: &str = "greptime.semantic.metric.original_name";
+
+// ---- Log keys (populated in Phase 3) ----
+
+/// `otlp` / `syslog` / `custom` — which mapping to use for `severity_number`.
+pub const SEMANTIC_LOG_SEVERITY_SCHEME: &str = "greptime.semantic.log.severity_scheme";
+/// `string` / `json` / `mixed` — how to parse `body`.
+pub const SEMANTIC_LOG_BODY_FORMAT: &str = "greptime.semantic.log.body_format";
+
+// ---- Resource / scope preservation keys (populated in Phase 3) ----
+
+/// JSON array string of resource attributes promoted to first-class columns.
+pub const SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED: &str =
+    "greptime.semantic.resource.attributes_preserved";
+/// `true` / `false` — whether any resource attribute was dropped at ingest.
+pub const SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED: &str =
+    "greptime.semantic.resource.attributes_dropped";
+/// `true` / `false` — whether `scope.name` / `scope.version` survive on the row.
+pub const SEMANTIC_SCOPE_PRESERVED: &str = "greptime.semantic.scope.preserved";
+
+// ---- Value constants ----
+
+pub const SIGNAL_TYPE_TRACE: &str = "trace";
+pub const SIGNAL_TYPE_LOG: &str = "log";
+pub const SIGNAL_TYPE_METRIC: &str = "metric";
+pub const SIGNAL_TYPE_EVENT: &str = "event";
+
+pub const SOURCE_OPENTELEMETRY: &str = "opentelemetry";
+pub const SOURCE_PROMETHEUS: &str = "prometheus";
+
+pub const METADATA_QUALITY_DECLARED: &str = "declared";
+pub const METADATA_QUALITY_INFERRED: &str = "inferred";
+
+/// Sentinel for a key that cannot be determined at stamp time.
+pub const SEMANTIC_VALUE_UNKNOWN: &str = "unknown";
+/// Sentinel for a single-valued key that saw conflicting sources.
+pub const SEMANTIC_VALUE_MIXED: &str = "mixed";
+
+/// Every recognised public semantic table-option key. The set is a closed
+/// whitelist: keys under [`SEMANTIC_PREFIX`] that are not listed here are rejected,
+/// so an unknown key like `greptime.semantic.unknown_key` does not silently land
+/// in a table's options. Adding a key to the vocabulary means adding it here.
+pub const SEMANTIC_OPTION_KEYS: &[&str] = &[
+    SEMANTIC_SIGNAL_TYPE,
+    SEMANTIC_SOURCE,
+    SEMANTIC_SOURCE_VERSION,
+    SEMANTIC_PIPELINE,
+    SEMANTIC_TRACE_CONVENTIONS,
+    SEMANTIC_TRACE_HAS_EVENTS,
+    SEMANTIC_TRACE_HAS_LINKS,
+    SEMANTIC_METRIC_TYPE,
+    SEMANTIC_METRIC_UNIT,
+    SEMANTIC_METRIC_TEMPORALITY,
+    SEMANTIC_METRIC_MONOTONIC,
+    SEMANTIC_METRIC_METADATA_QUALITY,
+    SEMANTIC_METRIC_ORIGINAL_NAME,
+    SEMANTIC_LOG_SEVERITY_SCHEME,
+    SEMANTIC_LOG_BODY_FORMAT,
+    SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED,
+    SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED,
+    SEMANTIC_SCOPE_PRESERVED,
+];
+
+/// Returns true if `key` is a recognised semantic table-option key (whitelist).
+///
+/// Note this is membership, not a prefix test: unknown keys under
+/// [`SEMANTIC_PREFIX`] are rejected, and the internal
+/// [`SEMANTIC_PER_TABLE_INDEX_KEY`] (outside the prefix) never matches.
+pub fn is_semantic_option_key(key: &str) -> bool {
+    SEMANTIC_OPTION_KEYS.contains(&key)
+}
+
+/// Validates a `greptime.semantic.*` option's `value` against its allowed domain.
+///
+/// Open-value keys (unit, original_name, version, pipeline, conventions, the
+/// preserved-attributes list) accept any non-empty string. Closed-domain keys
+/// accept a fixed set, plus the `unknown` sentinel, plus `mixed` for the keys
+/// where one long-lived table can legitimately see multiple values. Keys not in
+/// [`SEMANTIC_OPTION_KEYS`] are rejected.
+pub fn validate_semantic_option(key: &str, value: &str) -> bool {
+    match key {
+        SEMANTIC_SOURCE_VERSION
+        | SEMANTIC_PIPELINE
+        | SEMANTIC_METRIC_UNIT
+        | SEMANTIC_METRIC_ORIGINAL_NAME
+        | SEMANTIC_TRACE_CONVENTIONS
+        | SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED => !value.is_empty(),
+
+        SEMANTIC_SIGNAL_TYPE => matches!(value, "trace" | "log" | "metric" | "event" | "unknown"),
+        SEMANTIC_SOURCE => matches!(
+            value,
+            "opentelemetry"
+                | "prometheus"
+                | "elasticsearch"
+                | "loki"
+                | "custom"
+                | "mixed"
+                | "unknown"
+        ),
+        SEMANTIC_METRIC_TYPE => matches!(
+            value,
+            "counter"
+                | "gauge"
+                | "histogram"
+                | "summary"
+                | "updown_counter"
+                | "gauge_histogram"
+                | "info"
+                | "stateset"
+                | "mixed"
+                | "unknown"
+        ),
+        SEMANTIC_METRIC_TEMPORALITY => {
+            matches!(value, "cumulative" | "delta" | "mixed" | "unknown")
+        }
+        SEMANTIC_METRIC_MONOTONIC
+        | SEMANTIC_TRACE_HAS_EVENTS
+        | SEMANTIC_TRACE_HAS_LINKS
+        | SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED
+        | SEMANTIC_SCOPE_PRESERVED => matches!(value, "true" | "false" | "unknown"),
+        SEMANTIC_METRIC_METADATA_QUALITY => matches!(value, "declared" | "inferred" | "unknown"),
+        SEMANTIC_LOG_SEVERITY_SCHEME => matches!(value, "otlp" | "syslog" | "custom" | "unknown"),
+        SEMANTIC_LOG_BODY_FORMAT => matches!(value, "string" | "json" | "mixed" | "unknown"),
+
+        _ => false,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_is_semantic_option_key() {
+        assert!(is_semantic_option_key(SEMANTIC_SIGNAL_TYPE));
+        assert!(is_semantic_option_key(SEMANTIC_METRIC_TYPE));
+
+        // Unknown keys under the prefix are not whitelisted.
+        assert!(!is_semantic_option_key("greptime.semantic.future.key"));
+        assert!(!is_semantic_option_key("greptime.semantic.unknown_key"));
+        // Near-misses must not match.
+        assert!(!is_semantic_option_key("greptime.semanticx"));
+        assert!(!is_semantic_option_key("semantic.signal_type"));
+        assert!(!is_semantic_option_key("table_data_model"));
+        // The internal transport key must never be treated as a table option.
+        assert!(!is_semantic_option_key(SEMANTIC_PER_TABLE_INDEX_KEY));
+    }
+
+    #[test]
+    fn test_validate_semantic_option() {
+        // Enum keys reject out-of-domain values.
+        assert!(validate_semantic_option(SEMANTIC_SIGNAL_TYPE, "metric"));
+        assert!(!validate_semantic_option(SEMANTIC_SIGNAL_TYPE, "spans"));
+        assert!(validate_semantic_option(SEMANTIC_METRIC_TYPE, "counter"));
+        assert!(validate_semantic_option(SEMANTIC_METRIC_TYPE, "mixed"));
+        assert!(!validate_semantic_option(SEMANTIC_METRIC_TYPE, "bogus"));
+
+        // Booleans, sentinels, open values.
+        assert!(validate_semantic_option(SEMANTIC_TRACE_HAS_EVENTS, "true"));
+        assert!(!validate_semantic_option(SEMANTIC_TRACE_HAS_EVENTS, "yes"));
+        assert!(validate_semantic_option(
+            SEMANTIC_METRIC_TEMPORALITY,
+            "unknown"
+        ));
+        assert!(validate_semantic_option(SEMANTIC_METRIC_UNIT, "By"));
+        assert!(!validate_semantic_option(SEMANTIC_METRIC_UNIT, ""));
+
+        // Unknown key is rejected regardless of value.
+        assert!(!validate_semantic_option(
+            "greptime.semantic.future.key",
+            "x"
+        ));
+
+        // Drift guard: every value stamped by the ingestion path must validate.
+        assert!(validate_semantic_option(
+            SEMANTIC_SIGNAL_TYPE,
+            SIGNAL_TYPE_TRACE
+        ));
+        assert!(validate_semantic_option(
+            SEMANTIC_SIGNAL_TYPE,
+            SIGNAL_TYPE_METRIC
+        ));
+        assert!(validate_semantic_option(
+            SEMANTIC_SIGNAL_TYPE,
+            SIGNAL_TYPE_LOG
+        ));
+        assert!(validate_semantic_option(
+            SEMANTIC_SOURCE,
+            SOURCE_OPENTELEMETRY
+        ));
+        assert!(validate_semantic_option(SEMANTIC_SOURCE, SOURCE_PROMETHEUS));
+        assert!(validate_semantic_option(
+            SEMANTIC_METRIC_METADATA_QUALITY,
+            METADATA_QUALITY_INFERRED
+        ));
+        assert!(validate_semantic_option(
+            SEMANTIC_TRACE_CONVENTIONS,
+            SEMANTIC_VALUE_UNKNOWN
+        ));
+        // An empty value never validates, for any whitelisted key.
+        for key in SEMANTIC_OPTION_KEYS {
+            assert!(
+                !validate_semantic_option(key, ""),
+                "empty value should never validate for {key}"
+            );
+        }
+    }
+}
--- a/tests-fuzz/src/context.rs
+++ b/tests-fuzz/src/context.rs
@@ -200,6 +200,15 @@ impl TableContext {
                partitions.remove_bound(removed_idx)?;
                partition_def.exprs = partitions.generate()?;
            }
+            RepartitionExpr::AlterPartitions(partition) => {
+                ensure!(
+                    self.partition.is_none(),
+                    error::UnexpectedSnafu {
+                        violated: format!("Table {} already has partition", self.name),
+                    }
+                );
+                self.partition = Some(partition.partition);
+            }
        }

        Ok(self)
--- a/tests-fuzz/src/generator/create_expr.rs
+++ b/tests-fuzz/src/generator/create_expr.rs
@@ -44,6 +44,7 @@ pub struct CreateTableExprGenerator<R: Rng + 'static> {
    #[builder(setter(into))]
    engine: String,
    partition: usize,
+    partition_column: bool,
    if_not_exists: bool,
    #[builder(setter(into))]
    name: Ident,
@@ -67,6 +68,7 @@ impl<R: Rng + 'static> Default for CreateTableExprGenerator<R> {
            engine: DEFAULT_ENGINE.to_string(),
            if_not_exists: false,
            partition: 0,
+            partition_column: false,
            name: Ident::new(""),
            with_clause: HashMap::default(),
            name_generator: Box::new(MappedGenerator::new(WordGenerator, random_capitalize_map)),
@@ -95,7 +97,7 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
        let mut builder = CreateTableExprBuilder::default();
        let mut columns = Vec::with_capacity(self.columns);
        let mut primary_keys = vec![];
-        let need_partible_column = self.partition > 1;
+        let need_partible_column = self.partition > 1 || self.partition_column;
        let mut column_names = self.name_generator.choose(rng, self.columns);

        if self.columns == 1 {
@@ -123,13 +125,15 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
                )
                .remove(0);

-                // Generates partition bounds.
-                let partition_def = generate_partition_def(
-                    self.partition,
-                    column.column_type.clone(),
-                    name.clone(),
-                );
-                builder.partition(partition_def);
+                if self.partition > 1 {
+                    // Generates partition bounds.
+                    let partition_def = generate_partition_def(
+                        self.partition,
+                        column.column_type.clone(),
+                        name.clone(),
+                    );
+                    builder.partition(partition_def);
+                }
                columns.push(column);
            }
            // Generates the ts column.
@@ -178,11 +182,12 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
    }
 }

-fn generate_partition_def(
+pub fn generate_partition_def(
    partitions: usize,
    column_type: ConcreteDataType,
    column_name: Ident,
 ) -> PartitionDef {
+    assert!(partitions > 1, "partitions must be greater than 1");
    let bounds = generate_partition_bounds(&column_type, partitions - 1);
    let partitions = SimplePartitions::new(column_name.clone(), bounds);
    let partition_exprs = partitions.generate().unwrap();
@@ -193,24 +198,23 @@ fn generate_partition_def(
    }
 }

-fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> {
-    if partitions <= 1 {
-        return None;
-    }
-
-    let partition_column = Column {
+fn metric_partition_column() -> Column {
+    Column {
        name: Ident::new("host"),
        column_type: ConcreteDataType::string_datatype(),
        options: vec![ColumnOption::PrimaryKey],
-    };
+    }
+}
+
+pub fn generate_metric_partition_def(partitions: usize) -> PartitionDef {
+    assert!(partitions > 1, "partitions must be greater than 1");
+    let partition_column = metric_partition_column();
    let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1);
    let partitions = SimplePartitions::new(partition_column.name.clone(), bounds);
-    let partition_def = PartitionDef {
+    PartitionDef {
        columns: vec![partitions.column_name.clone()],
        exprs: partitions.generate().unwrap(),
-    };
-
-    Some((partition_column, partition_def))
+    }
 }

 /// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type.
@@ -223,6 +227,8 @@ pub struct CreatePhysicalTableExprGenerator<R: Rng + 'static> {
    if_not_exists: bool,
    #[builder(default = "0")]
    partition: usize,
+    #[builder(default = "false")]
+    partition_column: bool,
    #[builder(default, setter(into))]
    with_clause: HashMap<String, String>,
 }
@@ -252,11 +258,13 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreatePhysicalTableExpr

        let mut partition = None;
        let mut primary_keys = vec![];
-        if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) {
-            columns.push(partition_column);
-            partition = Some(partition_def);
+        if self.partition > 1 || self.partition_column {
+            columns.push(metric_partition_column());
            primary_keys.push(columns.len() - 1);
        }
+        if self.partition > 1 {
+            partition = Some(generate_metric_partition_def(self.partition));
+        }

        Ok(CreateTableExpr {
            table_name: self.name_generator.generate(rng),
@@ -387,6 +395,7 @@ mod tests {

    use super::*;
    use crate::context::TableContext;
+    use crate::ir::PARTIBLE_DATA_TYPES;

    #[test]
    fn test_float64() {
@@ -423,6 +432,18 @@ mod tests {
            .unwrap();
        assert_eq!(expr.columns.len(), 10);
        assert!(expr.partition.is_none());
+
+        let expr = CreateTableExprGeneratorBuilder::default()
+            .columns(10)
+            .partition(1)
+            .partition_column(true)
+            .build()
+            .unwrap()
+            .generate(&mut rng)
+            .unwrap();
+        assert_eq!(expr.columns.len(), 10);
+        assert!(expr.partition.is_none());
+        assert!(PARTIBLE_DATA_TYPES.contains(&expr.columns[0].column_type));
    }

    #[test]
@@ -516,6 +537,25 @@ mod tests {
        assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3);
    }

+    #[test]
+    fn test_create_physical_table_expr_generator_with_partition_column() {
+        let mut rng = rand::rng();
+        let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default()
+            .partition(1)
+            .partition_column(true)
+            .if_not_exists(false)
+            .build()
+            .unwrap()
+            .generate(&mut rng)
+            .unwrap();
+
+        assert_eq!(physical_table_expr.engine, "metric");
+        assert!(physical_table_expr.partition.is_none());
+        assert_eq!(physical_table_expr.columns.len(), 3);
+        assert_eq!(physical_table_expr.columns[2].name, Ident::new("host"));
+        assert_eq!(physical_table_expr.primary_keys, vec![2]);
+    }
+
    #[test]
    fn test_create_logical_table_expr_generator_without_partition_column() {
        let mut rng = rand::rng();
--- a/tests-fuzz/src/ir.rs
+++ b/tests-fuzz/src/ir.rs
@@ -30,7 +30,7 @@ use std::time::Duration;
 pub use alter_expr::{AlterTableExpr, AlterTableOption};
 use common_time::timestamp::TimeUnit;
 use common_time::{Date, Timestamp};
-pub use create_expr::{CreateDatabaseExpr, CreateTableExpr};
+pub use create_expr::{CreateDatabaseExpr, CreateTableExpr, PartitionDef};
 use datatypes::data_type::ConcreteDataType;
 use datatypes::types::TimestampType;
 use datatypes::value::Value;
@@ -40,7 +40,7 @@ use lazy_static::lazy_static;
 pub use partition_expr::SimplePartitions;
 use rand::Rng;
 use rand::seq::{IndexedRandom, SliceRandom};
-pub use repartition_expr::RepartitionExpr;
+pub use repartition_expr::{AlterTablePartitionsExpr, RepartitionExpr};
 use serde::{Deserialize, Serialize};

 use self::insert_expr::RowValues;
--- a/tests-fuzz/src/ir/repartition_expr.rs
+++ b/tests-fuzz/src/ir/repartition_expr.rs
@@ -16,6 +16,7 @@ use partition::expr::PartitionExpr;
 use serde::{Deserialize, Serialize};

 use crate::ir::Ident;
+use crate::ir::create_expr::PartitionDef;

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct SplitPartitionExpr {
@@ -34,10 +35,19 @@ pub struct MergePartitionExpr {
    pub wait: bool,
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AlterTablePartitionsExpr {
+    pub table_name: Ident,
+    pub partition: PartitionDef,
+    #[serde(default = "default_wait")]
+    pub wait: bool,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum RepartitionExpr {
    Split(SplitPartitionExpr),
    Merge(MergePartitionExpr),
+    AlterPartitions(AlterTablePartitionsExpr),
 }

 const fn default_wait() -> bool {
--- a/tests-fuzz/src/translator/mysql/repartition_expr.rs
+++ b/tests-fuzz/src/translator/mysql/repartition_expr.rs
@@ -15,7 +15,10 @@
 use partition::expr::PartitionExpr;

 use crate::error::Result;
-use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr};
+use crate::ir::create_expr::PartitionDef;
+use crate::ir::repartition_expr::{
+    AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr,
+};
 use crate::translator::DslTranslator;

 pub struct RepartitionExprTranslator;
@@ -59,10 +62,38 @@ impl DslTranslator<RepartitionExpr, String> for RepartitionExprTranslator {
                    table_name, merge_exprs, wait_clause
                ))
            }
+            RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
+                table_name,
+                partition,
+                wait,
+            }) => {
+                let partition_clause = format_partition_clause(partition);
+                let wait_clause = format_wait_clause(*wait);
+                Ok(format!(
+                    "ALTER TABLE {} {}{};",
+                    table_name, partition_clause, wait_clause
+                ))
+            }
        }
    }
 }

+fn format_partition_clause(partition: &PartitionDef) -> String {
+    let columns = partition
+        .columns
+        .iter()
+        .map(|column| column.to_string())
+        .collect::<Vec<_>>()
+        .join(", ");
+    let exprs = partition
+        .exprs
+        .iter()
+        .map(format_partition_expr_sql)
+        .collect::<Vec<_>>()
+        .join(",\n  ");
+    format!("PARTITION ON COLUMNS ({columns}) (\n  {exprs}\n)")
+}
+
 fn format_partition_expr_sql(expr: &PartitionExpr) -> String {
    expr.to_parser_expr().to_string()
 }
@@ -79,9 +110,15 @@ fn format_wait_clause(wait: bool) -> String {
 mod tests {
    use datatypes::value::Value;
    use partition::expr::col;
+    use sql::dialect::GreptimeDbDialect;
+    use sql::parser::{ParseOptions, ParserContext};

    use super::RepartitionExprTranslator;
-    use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr};
+    use crate::ir::Ident;
+    use crate::ir::create_expr::PartitionDef;
+    use crate::ir::repartition_expr::{
+        AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr,
+    };
    use crate::translator::DslTranslator;

    #[test]
@@ -149,4 +186,61 @@ mod tests {
 );"#;
        assert_eq!(sql, expected);
    }
+
+    /// With `wait: true` the translated statement carries no `WITH` block; the three
+    /// range rules are emitted one per line and the result must still parse.
+    #[test]
+    fn test_translate_alter_table_partitions_expr() {
+        let low = col("id").lt(Value::Int32(10));
+        let mid = col("id")
+            .gt_eq(Value::Int32(10))
+            .and(col("id").lt(Value::Int32(20)));
+        let high = col("id").gt_eq(Value::Int32(20));
+        let partition = PartitionDef {
+            columns: vec![Ident::new("id")],
+            exprs: vec![low, mid, high],
+        };
+        let alter = AlterTablePartitionsExpr {
+            table_name: "demo".into(),
+            partition,
+            wait: true,
+        };
+
+        let sql = RepartitionExprTranslator
+            .translate(&RepartitionExpr::AlterPartitions(alter))
+            .unwrap();
+
+        let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (id) (
+  id < 10,
+  id >= 10 AND id < 20,
+  id >= 20
+);"#;
+        assert_eq!(sql, expected);
+        assert_repartition_sql_parseable(&sql);
+    }
+
+    /// With `wait: false` the translated statement ends with a `WITH (WAIT = false)`
+    /// block after the partition rules, and the result must still parse.
+    #[test]
+    fn test_translate_alter_table_partitions_expr_wait_false() {
+        let partition = PartitionDef {
+            columns: vec![Ident::new("host")],
+            exprs: vec![
+                col("host").lt(Value::from("m")),
+                col("host").gt_eq(Value::from("m")),
+            ],
+        };
+        let alter = AlterTablePartitionsExpr {
+            table_name: "demo".into(),
+            partition,
+            wait: false,
+        };
+
+        let sql = RepartitionExprTranslator
+            .translate(&RepartitionExpr::AlterPartitions(alter))
+            .unwrap();
+
+        let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (host) (
+  host < 'm',
+  host >= 'm'
+) WITH (
+  WAIT = false
+);"#;
+        assert_eq!(sql, expected);
+        assert_repartition_sql_parseable(&sql);
+    }
+
+    /// Panics unless `sql` parses under the GreptimeDB dialect into exactly one statement.
+    fn assert_repartition_sql_parseable(sql: &str) {
+        let dialect = GreptimeDbDialect {};
+        let options = ParseOptions::default();
+        let parsed = ParserContext::create_with_dialect(sql, &dialect, options).unwrap();
+        assert_eq!(parsed.len(), 1);
+    }
 }
--- a/tests-fuzz/src/validator/partition.rs
+++ b/tests-fuzz/src/validator/partition.rs
@@ -21,7 +21,8 @@ use crate::ir::Ident;
 use crate::ir::create_expr::PartitionDef;

 const PARTITIONS_INFO_SCHEMA_SQL: &str = "SELECT table_catalog, table_schema, table_name, \
-partition_name, partition_expression, partition_description, greptime_partition_id, \
+partition_name, COALESCE(partition_expression, '') AS partition_expression, \
+COALESCE(partition_description, '') AS partition_description, greptime_partition_id, \
 partition_ordinal_position FROM information_schema.partitions WHERE table_name = ? \
 ORDER BY partition_ordinal_position;";

@@ -91,3 +92,20 @@ pub fn assert_partitions(expected: &PartitionDef, actual: &[PartitionInfo]) -> R

    Ok(())
 }
+
+/// Checks that information schema reports no partition rule for the table: either
+/// no rows at all, or a single row whose expression and description are both empty.
+///
+/// Returns an `Assert` error listing the rows that were found otherwise.
+pub fn assert_unpartitioned(actual: &[PartitionInfo]) -> Result<()> {
+    let unpartitioned = match actual {
+        [] => true,
+        [only] => only.partition_expression.is_empty() && only.partition_description.is_empty(),
+        _ => false,
+    };
+
+    ensure!(
+        unpartitioned,
+        error::AssertSnafu {
+            reason: format!("Expected unpartitioned table, got partitions: {actual:?}"),
+        }
+    );
+
+    Ok(())
+}
--- a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs
+++ b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs
@@ -36,14 +36,15 @@ use tests_fuzz::fake::{
 use tests_fuzz::generator::Generator;
 use tests_fuzz::generator::create_expr::{
    CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder,
+    generate_metric_partition_def,
 };
 use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
 use tests_fuzz::generator::repartition_expr::{
    MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
 };
 use tests_fuzz::ir::{
-    CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value,
-    generate_unique_timestamp_for_mysql_with_clock,
+    AlterTablePartitionsExpr, CreateTableExpr, Ident, InsertIntoExpr, PartitionDef,
+    RepartitionExpr, generate_random_value, generate_unique_timestamp_for_mysql_with_clock,
 };
 use tests_fuzz::translator::DslTranslator;
 use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator;
@@ -94,6 +95,7 @@ fn generate_create_physical_table_expr<R: Rng + 'static>(
        ))))
        .if_not_exists(rng.random_bool(0.5))
        .partition(partitions)
+        .partition_column(partitions <= 1)
        .build()
        .unwrap()
        .generate(rng)
@@ -158,12 +160,6 @@ async fn create_metric_tables<R: Rng + 'static>(
        })?;
    info!("Create physical table: {create_physical_sql}, result: {result:?}");
    let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr));
-    ensure!(
-        physical_table_ctx.partition.is_some(),
-        error::AssertSnafu {
-            reason: "Physical metric table must have partition".to_string()
-        }
-    );

    let mut logical_tables = BTreeMap::new();
    let mut create_logical_sqls = HashMap::new();
@@ -436,6 +432,11 @@ fn repartition_operation<R: Rng + 'static>(
    table_ctx: &TableContextRef,
    rng: &mut R,
 ) -> Result<RepartitionExpr> {
+    if table_ctx.partition.is_none() {
+        let partition = generate_metric_partition_def(rng.random_range(2..8));
+        return Ok(alter_table_partitions_expr(table_ctx, partition, true));
+    }
+
    let split = rng.random_bool(0.5);
    if table_ctx.partition.as_ref().unwrap().exprs.len() <= 2 || split {
        let expr = SplitPartitionExprGeneratorBuilder::default()
@@ -454,19 +455,35 @@ fn repartition_operation<R: Rng + 'static>(
    }
 }

+/// Builds the `RepartitionExpr::AlterPartitions` IR node that applies `partition` to
+/// the table named in `table_ctx`, forwarding the `wait` flag unchanged.
+fn alter_table_partitions_expr(
+    table_ctx: &TableContextRef,
+    partition: PartitionDef,
+    wait: bool,
+) -> RepartitionExpr {
+    let table_name = table_ctx.name.clone();
+    let alter = AlterTablePartitionsExpr {
+        table_name,
+        partition,
+        wait,
+    };
+    RepartitionExpr::AlterPartitions(alter)
+}
+
 impl Arbitrary<'_> for FuzzInput {
    fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
        let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
        let mut rng = ChaChaRng::seed_from_u64(seed);
-        let partitions =
-            get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
+        let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
+            if rng.random_bool(0.5) {
+                1
+            } else {
+                rng.random_range(2..8)
+            }
+        });
        let max_tables = get_gt_fuzz_input_max_tables();
        let tables = get_fuzz_override::<usize>("TABLES")
            .unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables)));
-        let max_actions = get_gt_fuzz_input_max_alter_actions();
+        let max_actions = std::cmp::min(128, get_gt_fuzz_input_max_alter_actions());
        let actions = get_fuzz_override::<usize>("ACTIONS")
            .unwrap_or_else(|| rng.random_range(1..max_actions));
-
        Ok(FuzzInput {
            seed,
            actions,
@@ -536,7 +553,11 @@ async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) ->
    tokio::time::sleep(Duration::from_millis(100)).await;

    for i in 0..input.actions {
-        let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len();
+        let partition_num = physical_table_ctx
+            .partition
+            .as_ref()
+            .map(|partition| partition.exprs.len())
+            .unwrap_or_default();
        info!(
            "partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}",
            i + 1,
--- a/tests-fuzz/targets/ddl/fuzz_repartition_table.rs
+++ b/tests-fuzz/targets/ddl/fuzz_repartition_table.rs
@@ -33,14 +33,15 @@ use tests_fuzz::fake::{
    uppercase_and_keyword_backtick_map,
 };
 use tests_fuzz::generator::Generator;
-use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
+use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def};
 use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
 use tests_fuzz::generator::repartition_expr::{
    MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
 };
 use tests_fuzz::ir::{
-    CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue,
-    SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock,
+    AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator,
+    PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value,
+    generate_unique_timestamp_for_mysql_with_clock,
 };
 use tests_fuzz::translator::DslTranslator;
 use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
@@ -75,8 +76,13 @@ impl Arbitrary<'_> for FuzzInput {
    fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
        let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
        let mut rng = ChaChaRng::seed_from_u64(seed);
-        let partitions =
-            get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
+        let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
+            if rng.random_bool(0.5) {
+                1
+            } else {
+                rng.random_range(2..8)
+            }
+        });
        let max_actions = get_gt_fuzz_input_max_alter_actions();
        let actions = get_fuzz_override::<usize>("ACTIONS")
            .unwrap_or_else(|| rng.random_range(1..max_actions));
@@ -99,6 +105,7 @@ fn generate_create_expr<R: Rng + 'static>(
        )))
        .columns(5)
        .partition(input.partitions)
+        .partition_column(input.partitions <= 1)
        .engine("mito")
        .ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator))
        .build()
@@ -122,7 +129,7 @@ fn build_insert_expr<R: Rng + 'static>(
    let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone());
    let counter = Arc::new(AtomicUsize::new(0));
    let counter_clone = counter.clone();
-    let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len();
+    let partition_len = partitions.bounds.len() + 1;
    let row = rng.random_range(partition_len..partition_len * 2);

    let moved_partitions = partitions.clone();
@@ -150,6 +157,28 @@ fn build_insert_expr<R: Rng + 'static>(
    insert_generator.generate(rng).unwrap()
 }

+/// Builds the `RepartitionExpr::AlterPartitions` IR node that applies `partition` to
+/// the table named in `table_ctx`, forwarding the `wait` flag unchanged.
+fn alter_table_partitions_expr(
+    table_ctx: &TableContextRef,
+    partition: PartitionDef,
+    wait: bool,
+) -> RepartitionExpr {
+    let table_name = table_ctx.name.clone();
+    let alter = AlterTablePartitionsExpr {
+        table_name,
+        partition,
+        wait,
+    };
+    RepartitionExpr::AlterPartitions(alter)
+}
+
+/// Builds an `AlterPartitions` expression for `table_ctx` from a freshly generated
+/// partition definition.
+///
+/// The definition is produced by `generate_partition_def` from the type and name of
+/// the table's first column, with a size argument drawn from `2..8`. Indexing
+/// `columns[0]` panics if the table context has no columns.
+fn alter_table_partitions_expr_from_table_ctx<R: Rng + 'static>(
+    table_ctx: &TableContextRef,
+    rng: &mut R,
+    wait: bool,
+) -> RepartitionExpr {
+    let first_column = &table_ctx.columns[0];
+    let size = rng.random_range(2..8);
+    let partition = generate_partition_def(
+        size,
+        first_column.column_type.clone(),
+        first_column.name.clone(),
+    );
+    alter_table_partitions_expr(table_ctx, partition, wait)
+}
+
 async fn execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> {
    let mut delay = Duration::from_millis(100);
    let mut attempt = 0;
@@ -236,9 +265,36 @@ async fn execute_repartition_table(ctx: FuzzContext, input: FuzzInput) -> Result
        inserted_rows: 0,
    }));

+    let mut action_start = 0;
+    if table_ctx.partition.is_none() {
+        let expr = alter_table_partitions_expr_from_table_ctx(&table_ctx, &mut rng, true);
+        let translator = RepartitionExprTranslator;
+        let sql = translator.translate(&expr)?;
+        info!("Initial partition sql: {sql}");
+        let result = sqlx::query(&sql)
+            .execute(&ctx.greptime)
+            .await
+            .context(error::ExecuteQuerySnafu { sql: &sql })?;
+        info!("Initial partition result: {result:?}");
+        table_ctx = Arc::new(Arc::unwrap_or_clone(table_ctx).repartition(expr).unwrap());
+        shared_state.lock().unwrap().table_ctx = table_ctx.clone();
+
+        let partition_entries = validator::partition::fetch_partitions_info_schema(
+            &ctx.greptime,
+            "public".into(),
+            &table_ctx.name,
+        )
+        .await?;
+        validator::partition::assert_partitions(
+            table_ctx.partition.as_ref().unwrap(),
+            &partition_entries,
+        )?;
+        action_start = 1;
+    }
+
    let writer_rng = ChaChaRng::seed_from_u64(input.seed);
    let writer_task = tokio::spawn(write_loop(writer_rng, ctx.clone(), shared_state.clone()));
-    for i in 0..input.actions {
+    for i in action_start..input.actions {
        let partition_num = table_ctx.partition.as_ref().unwrap().exprs.len();
        info!(
            "partition_num: {partition_num}, action: {}/{}",
--- a/tests-fuzz/targets/ddl/fuzz_repartition_table_chaos.rs
+++ b/tests-fuzz/targets/ddl/fuzz_repartition_table_chaos.rs
@@ -34,14 +34,15 @@ use tests_fuzz::fake::{
    uppercase_and_keyword_backtick_map,
 };
 use tests_fuzz::generator::Generator;
-use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
+use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def};
 use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
 use tests_fuzz::generator::repartition_expr::{
    MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
 };
 use tests_fuzz::ir::{
-    CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue,
-    SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock,
+    AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator,
+    PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value,
+    generate_unique_timestamp_for_mysql_with_clock,
 };
 use tests_fuzz::translator::DslTranslator;
 use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
@@ -93,13 +94,17 @@ impl Arbitrary<'_> for FuzzInput {
        let mut rng = ChaChaRng::seed_from_u64(seed);
        let rows = get_fuzz_override::<usize>("ROWS")
            .unwrap_or_else(|| rng.random_range(2..get_gt_fuzz_input_max_rows()));
-        let partitions =
-            get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
+        let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
+            if rng.random_bool(0.5) {
+                1
+            } else {
+                rng.random_range(2..8)
+            }
+        });
        let chaos_delay_ms =
            get_fuzz_override::<u64>("CHAOS_DELAY_MS").unwrap_or_else(|| rng.random_range(0..5000));
        let chaos_hold_secs =
            get_fuzz_override::<u64>("CHAOS_HOLD_SECS").unwrap_or_else(|| rng.random_range(10..20));
-
        Ok(FuzzInput {
            seed,
            rows,
@@ -127,6 +132,7 @@ fn generate_create_expr<R: Rng + 'static>(
        )))
        .columns(5)
        .partition(input.partitions)
+        .partition_column(input.partitions <= 1)
        .engine("mito")
        .ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator))
        .build()
@@ -144,7 +150,7 @@ fn build_insert_expr<R: Rng + 'static>(
    let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone());
    let counter = Arc::new(AtomicUsize::new(0));
    let counter_clone = counter.clone();
-    let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len();
+    let partition_len = partitions.bounds.len() + 1;
    let moved_partitions = partitions.clone();
    let insert_generator = InsertExprGeneratorBuilder::default()
        .table_ctx(table_ctx.clone())
@@ -202,10 +208,12 @@ async fn create_table(ctx: &FuzzContext, expr: &CreateTableExpr) -> Result<Table
 async fn insert_initial_rows<R: Rng + 'static>(
    ctx: &FuzzContext,
    table_ctx: &TableContextRef,
+    partition_def: &PartitionDef,
    rng: &mut R,
    rows: usize,
 ) -> Result<u64> {
-    let partitions = SimplePartitions::from_table_ctx(table_ctx).unwrap();
+    let partitions =
+        SimplePartitions::from_exprs(partition_def.columns[0].clone(), &partition_def.exprs)?;
    let clock = Arc::new(Mutex::new(Timestamp::current_millis()));
    let insert_expr = build_insert_expr(table_ctx, rng, &partitions, &clock, rows);
    let inserted_rows = insert_expr.values_list.len() as u64;
@@ -260,6 +268,28 @@ fn repartition_operation<R: Rng + 'static>(
    }
 }

+/// Builds the `RepartitionExpr::AlterPartitions` IR node that applies `partition` to
+/// the table named in `table_ctx`, forwarding the `wait` flag unchanged.
+fn alter_table_partitions_expr(
+    table_ctx: &TableContextRef,
+    partition: PartitionDef,
+    wait: bool,
+) -> RepartitionExpr {
+    let table_name = table_ctx.name.clone();
+    let alter = AlterTablePartitionsExpr {
+        table_name,
+        partition,
+        wait,
+    };
+    RepartitionExpr::AlterPartitions(alter)
+}
+
+/// Builds an `AlterPartitions` expression for `table_ctx` from a freshly generated
+/// partition definition.
+///
+/// The definition is produced by `generate_partition_def` from the type and name of
+/// the table's first column, with a size argument drawn from `2..8`. Indexing
+/// `columns[0]` panics if the table context has no columns.
+fn alter_table_partitions_expr_from_table_ctx<R: Rng + 'static>(
+    table_ctx: &TableContextRef,
+    rng: &mut R,
+    wait: bool,
+) -> RepartitionExpr {
+    let first_column = &table_ctx.columns[0];
+    let size = rng.random_range(2..8);
+    let partition = generate_partition_def(
+        size,
+        first_column.column_type.clone(),
+        first_column.name.clone(),
+    );
+    alter_table_partitions_expr(table_ctx, partition, wait)
+}
+
 async fn submit_repartition_procedure(ctx: &FuzzContext, expr: &RepartitionExpr) -> Result<String> {
    let translator = RepartitionExprTranslator;
    let sql = translator.translate(expr)?;
@@ -334,10 +364,13 @@ async fn validate_terminal_metadata(
            after_table_ctx.partition.as_ref().unwrap(),
            &partition_entries,
        )?,
-        ProcedureTerminalState::Failed => validator::partition::assert_partitions(
-            before_table_ctx.partition.as_ref().unwrap(),
-            &partition_entries,
-        )?,
+        ProcedureTerminalState::Failed => {
+            if let Some(partition) = before_table_ctx.partition.as_ref() {
+                validator::partition::assert_partitions(partition, &partition_entries)?;
+            } else {
+                validator::partition::assert_unpartitioned(&partition_entries)?;
+            }
+        }
    }

    Ok(())
@@ -359,7 +392,21 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result

    let create_expr = generate_create_expr(&input, &mut rng)?;
    let before_table_ctx = create_table(&ctx, &create_expr).await?;
-    let inserted_rows = insert_initial_rows(&ctx, &before_table_ctx, &mut rng, input.rows).await?;
+    let insert_partition = create_expr.partition.clone().unwrap_or_else(|| {
+        generate_partition_def(
+            2,
+            before_table_ctx.columns[0].column_type.clone(),
+            before_table_ctx.columns[0].name.clone(),
+        )
+    });
+    let inserted_rows = insert_initial_rows(
+        &ctx,
+        &before_table_ctx,
+        &insert_partition,
+        &mut rng,
+        input.rows,
+    )
+    .await?;
    validate_table_rows(&ctx, &before_table_ctx, inserted_rows).await?;

    let before_entries = validator::partition::fetch_partitions_info_schema(
@@ -370,7 +417,11 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result
    .await?;
    info!("Before repartition partition entries: {before_entries:?}");

-    let repartition_expr = repartition_operation(&before_table_ctx, &mut rng, false)?;
+    let repartition_expr = if before_table_ctx.partition.is_some() {
+        repartition_operation(&before_table_ctx, &mut rng, false)?
+    } else {
+        alter_table_partitions_expr_from_table_ctx(&before_table_ctx, &mut rng, false)
+    };
    let after_table_ctx = Arc::new(
        Arc::unwrap_or_clone(before_table_ctx.clone())
            .repartition(repartition_expr.clone())
--- a/tests-integration/src/standalone.rs
+++ b/tests-integration/src/standalone.rs
@@ -80,6 +80,7 @@ pub struct GreptimeDbStandaloneBuilder {
    default_store: Option<StorageType>,
    plugin: Option<Plugins>,
    slow_query_options: SlowQueryOptions,
+    auto_create_table: bool,
 }

 impl GreptimeDbStandaloneBuilder {
@@ -97,9 +98,16 @@ impl GreptimeDbStandaloneBuilder {
                threshold: Duration::from_secs(1),
                ..Default::default()
            },
+            auto_create_table: true,
        }
    }

+    /// Overrides the `auto_create_table` flag (initialised to `true` by the builder);
+    /// the value is later copied into `StandaloneOptions::auto_create_table`.
+    #[must_use]
+    pub fn with_auto_create_table(self, auto_create_table: bool) -> Self {
+        Self {
+            auto_create_table,
+            ..self
+        }
+    }
+
    #[must_use]
    pub fn with_default_store_type(self, store_type: StorageType) -> Self {
        Self {
@@ -347,6 +355,7 @@ impl GreptimeDbStandaloneBuilder {
            wal: self.metasrv_wal_config.clone().into(),
            grpc: GrpcOptions::default().with_server_addr("127.0.0.1:4001"),
            slow_query: self.slow_query_options.clone(),
+            auto_create_table: self.auto_create_table,
            ..StandaloneOptions::default()
        };

--- a/tests-integration/src/test_util.rs
+++ b/tests-integration/src/test_util.rs
@@ -16,6 +16,7 @@ use std::env;
 use std::fmt::Display;
 use std::net::SocketAddr;
 use std::sync::Arc;
+use std::time::Duration;

 use auth::{DefaultPermissionChecker, PermissionCheckerRef, UserProviderRef};
 use axum::Router;
@@ -49,6 +50,7 @@ use servers::http::{HttpOptions, HttpServerBuilder};
 use servers::metrics_handler::MetricsHandler;
 use servers::mysql::server::{MysqlServer, MysqlSpawnConfig, MysqlSpawnRef};
 use servers::otel_arrow::OtelArrowServiceHandler;
+use servers::pending_rows_batcher::PendingRowsBatcher;
 use servers::postgres::PostgresServer;
 use servers::prom_remote_write::validation::PromValidationMode;
 use servers::query_handler::sql::SqlQueryHandler;
@@ -564,6 +566,24 @@ async fn run_sql(sql: &str, instance: &GreptimeDbStandalone) {
 pub async fn setup_test_prom_app_with_frontend(
    store_type: StorageType,
    name: &str,
+) -> (Router, TestGuard) {
+    setup_test_prom_app_with_frontend_inner(store_type, name, false).await
+}
+
+/// Variant of [`setup_test_prom_app_with_frontend`] with the pending-rows batcher
+/// switched on, so Prometheus remote write takes the batched (metric-engine) path
+/// rather than the direct `PromStoreProtocolHandler::write` path.
+pub async fn setup_test_prom_app_with_frontend_batched(
+    store_type: StorageType,
+    name: &str,
+) -> (Router, TestGuard) {
+    let enable_batcher = true;
+    setup_test_prom_app_with_frontend_inner(store_type, name, enable_batcher).await
+}
+
+async fn setup_test_prom_app_with_frontend_inner(
+    store_type: StorageType,
+    name: &str,
+    enable_batcher: bool,
 ) -> (Router, TestGuard) {
    unsafe {
        std::env::set_var("TZ", "UTC");
@@ -617,6 +637,24 @@ pub async fn setup_test_prom_app_with_frontend(
        ..Default::default()
    };
    let frontend_ref = instance.fe_instance().clone();
+    // Mirror the production wiring at `frontend::server`: build the batcher from the
+    // instance's managers. A short flush interval keeps the test responsive.
+    let pending_rows_batcher = if enable_batcher {
+        PendingRowsBatcher::try_new(
+            frontend_ref.partition_manager().clone(),
+            frontend_ref.node_manager().clone(),
+            frontend_ref.catalog_manager().clone(),
+            true,
+            frontend_ref.clone(),
+            Duration::from_millis(50),
+            1000,
+            4,
+            64,
+            64,
+        )
+    } else {
+        None
+    };
    let http_server = HttpServerBuilder::new(http_opts)
        .with_sql_handler(frontend_ref.clone())
        .with_logs_handler(instance.fe_instance().clone())
@@ -625,7 +663,7 @@ pub async fn setup_test_prom_app_with_frontend(
            Some(frontend_ref.clone()),
            true,
            PromValidationMode::Strict,
-            None,
+            pending_rows_batcher,
        )
        .with_prometheus_handler(frontend_ref)
        .with_greptime_config_options(instance.opts.datanode_options().to_toml().unwrap())
@@ -649,6 +687,20 @@ pub async fn setup_grpc_server_with_user_provider(
    setup_grpc_server_with(store_type, name, user_provider, None, None).await
 }

+/// Starts a gRPC server on top of a standalone instance built with
+/// `with_auto_create_table(false)`, for tests of the server-side global switch.
+/// No user provider, gRPC config or memory limiter is passed to the server setup.
+pub async fn setup_grpc_server_with_auto_create_table_disabled(
+    store_type: StorageType,
+    name: &str,
+) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
+    let builder = GreptimeDbStandaloneBuilder::new(name)
+        .with_default_store_type(store_type)
+        .with_auto_create_table(false);
+    let standalone = builder.build().await;
+    setup_grpc_server_for_instance(standalone, None, None, None).await
+}
+
 pub async fn setup_grpc_server_with(
    store_type: StorageType,
    name: &str,
@@ -657,7 +709,17 @@ pub async fn setup_grpc_server_with(
    memory_limiter: Option<servers::request_memory_limiter::ServerMemoryLimiter>,
 ) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
    let instance = setup_standalone_instance(name, store_type).await;
+    setup_grpc_server_for_instance(instance, user_provider, grpc_config, memory_limiter).await
+}

+/// Builds and starts a gRPC server on top of an already-constructed standalone
+/// instance. This is the shared core behind the `setup_grpc_server_*` helpers.
+async fn setup_grpc_server_for_instance(
+    instance: GreptimeDbStandalone,
+    user_provider: Option<UserProviderRef>,
+    grpc_config: Option<GrpcServerConfig>,
+    memory_limiter: Option<servers::request_memory_limiter::ServerMemoryLimiter>,
+) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
    let runtime: Runtime = RuntimeBuilder::default()
        .worker_threads(2)
        .thread_name("grpc-handlers")
--- a/tests-integration/tests/grpc.rs
+++ b/tests-integration/tests/grpc.rs
@@ -44,7 +44,8 @@ use servers::request_memory_limiter::ServerMemoryLimiter;
 use servers::server::Server;
 use servers::tls::{TlsMode, TlsOption};
 use tests_integration::test_util::{
-    StorageType, setup_grpc_server, setup_grpc_server_with, setup_grpc_server_with_user_provider,
+    StorageType, setup_grpc_server, setup_grpc_server_with,
+    setup_grpc_server_with_auto_create_table_disabled, setup_grpc_server_with_user_provider,
 };
 use tonic::Request;
 use tonic::metadata::MetadataValue;
@@ -82,6 +83,7 @@ macro_rules! grpc_tests {
                test_invalid_dbname,
                test_auto_create_table,
                test_auto_create_table_with_hints,
+                test_auto_create_table_disabled_by_config,
                test_otel_arrow_auth,
                test_insert_and_select,
                test_dbname,
@@ -405,6 +407,81 @@ pub async fn test_auto_create_table_with_hints(store_type: StorageType) {
    let _ = fe_grpc_server.shutdown().await;
 }

+/// Exercises the frontend-wide auto-create switch over gRPC when it is turned off:
+/// inserts into missing tables must be rejected even though the request carries the
+/// `auto_create_table=true` hint, both on the plain path and on the path selected by
+/// the `physical_table` hint, and the hinted physical table must not show up in
+/// `SHOW TABLES` afterwards. The global config is thus an upper bound on the hints.
+pub async fn test_auto_create_table_disabled_by_config(store_type: StorageType) {
+    let test_name = "test_auto_create_table_disabled_by_config";
+    // Bound to a named variable (not `_`) so the instance is dropped only at the end
+    // of the test.
+    let (_instance, fe_grpc_server) =
+        setup_grpc_server_with_auto_create_table_disabled(store_type, test_name).await;
+
+    let server_addr = fe_grpc_server.bind_addr().unwrap().to_string();
+    let db = Database::new(
+        DEFAULT_CATALOG_NAME,
+        DEFAULT_SCHEMA_NAME,
+        Client::with_urls(vec![server_addr]),
+    );
+
+    // Case 1: plain row insert into a missing table, with the hint requesting auto-create.
+    let (host, cpu, mem, ts) = expect_data();
+    let plain_insert = InsertRequests {
+        inserts: vec![InsertRequest {
+            table_name: "demo".to_string(),
+            columns: vec![host, cpu, mem, ts],
+            row_count: 4,
+        }],
+    };
+    let err = db
+        .insert_with_hints(plain_insert, &[("auto_create_table", "true")])
+        .await
+        .unwrap_err()
+        .to_string();
+    assert!(
+        err.contains("does not exist") && err.contains("disabled by frontend config"),
+        "unexpected error: {err}"
+    );
+
+    // Case 2: metric path, selected by the additional `physical_table` hint.
+    let (host, cpu, mem, ts) = expect_data();
+    let metric_insert = InsertRequests {
+        inserts: vec![InsertRequest {
+            table_name: "demo_metric".to_string(),
+            columns: vec![host, cpu, mem, ts],
+            row_count: 4,
+        }],
+    };
+    let metric_hints = [
+        ("auto_create_table", "true"),
+        ("physical_table", "greptime_physical_table"),
+    ];
+    let err = db
+        .insert_with_hints(metric_insert, &metric_hints)
+        .await
+        .unwrap_err()
+        .to_string();
+    assert!(
+        err.contains("does not exist") && err.contains("disabled by frontend config"),
+        "unexpected error: {err}"
+    );
+
+    // The rejected metric write must not have left the physical table behind.
+    let output = db.sql("SHOW TABLES").await.unwrap();
+    let listing = match output.data {
+        OutputData::AffectedRows(_) => unreachable!(),
+        OutputData::RecordBatches(batches) => batches,
+        OutputData::Stream(stream) => RecordBatches::try_collect(stream).await.unwrap(),
+    };
+    let tables = listing.pretty_print().unwrap();
+    assert!(
+        !tables.contains("greptime_physical_table"),
+        "physical table leaked despite disabled auto-create:\n{tables}"
+    );
+
+    let _ = fe_grpc_server.shutdown().await;
+}
+
 fn expect_data() -> (Column, Column, Column, Column) {
    // testing data:
    let expected_host_col = Column {
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -71,6 +71,7 @@ use tests_integration::test_util::{
    StorageType, setup_test_http_app, setup_test_http_app_with_frontend,
    setup_test_http_app_with_frontend_and_slow_query_threshold,
    setup_test_http_app_with_frontend_and_user_provider, setup_test_prom_app_with_frontend,
+    setup_test_prom_app_with_frontend_batched,
 };
 use urlencoding::encode;
 use yaml_rust::YamlLoader;
@@ -117,6 +118,7 @@ macro_rules! http_tests {
                test_dashboard_path,
                test_dashboard_api,
                test_prometheus_remote_write,
+                test_prometheus_remote_write_batched,
                test_prometheus_remote_special_labels,
                test_prometheus_remote_schema_labels,
                test_prometheus_remote_write_with_pipeline,
@@ -1491,6 +1493,7 @@ mem_threshold_on_create = "auto"
    let expected_toml_str = format!(
        r#"
 enable_telemetry = true
+auto_create_table = true
 max_in_flight_write_bytes = "0KiB"
 write_bytes_exhausted_policy = "wait"
 init_regions_in_background = false
@@ -1601,6 +1604,7 @@ experimental_grpc_max_retries = 3
 experimental_frontend_scan_timeout = "30s"
 experimental_max_filter_num_per_query = 20
 experimental_time_window_merge_threshold = 3
+experimental_enable_incremental_read = false
 read_preference = "Leader"

 [logging]
@@ -1954,6 +1958,18 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) {
    )
    .await;

+    // Prom RW tables carry the metric identity; type is inferred from naming.
+    validate_data(
+        "prometheus_remote_write_semantic_identity",
+        &client,
+        "select count(*) from information_schema.tables where table_name = 'metric2' \
+         and create_options like '%greptime.semantic.signal_type=metric%' \
+         and create_options like '%greptime.semantic.source=prometheus%' \
+         and create_options like '%greptime.semantic.metric.metadata_quality=inferred%';",
+        "[[1]]",
+    )
+    .await;
+
    // Write snappy encoded data with new labels
    let write_request = WriteRequest {
        timeseries: mock_timeseries_new_label(),
@@ -1975,6 +1991,48 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) {
    guard.remove_all().await;
 }

+/// Covers the batched (pending-rows-batcher) Prometheus remote write path, which
+/// bypasses `PromStoreProtocolHandler::write`. Posts a snappy-compressed write and
+/// verifies that table `metric2` is created asynchronously and still carries the
+/// Prometheus semantic identity stamped on the shared request context.
+pub async fn test_prometheus_remote_write_batched(store_type: StorageType) {
+    common_telemetry::init_default_ut_logging();
+    let (app, mut guard) =
+        setup_test_prom_app_with_frontend_batched(store_type, "prometheus_remote_write_batched")
+            .await;
+    let client = TestClient::new(app).await;
+
+    let write_request = WriteRequest {
+        timeseries: prom_store::mock_timeseries(),
+        ..Default::default()
+    };
+    let serialized_request = write_request.encode_to_vec();
+    let compressed_request =
+        prom_store::snappy_compress(&serialized_request).expect("failed to encode snappy");
+
+    let res = client
+        .post("/v1/prometheus/write")
+        .header("Content-Encoding", "snappy")
+        .body(compressed_request)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::NO_CONTENT);
+
+    // The batcher flushes asynchronously, so poll until exactly one `metric2` row
+    // matches all three semantic options (signal_type/source/metadata_quality).
+    wait_for_data(
+        &client,
+        "select count(*) from information_schema.tables where table_name = 'metric2' \
+         and create_options like '%greptime.semantic.signal_type=metric%' \
+         and create_options like '%greptime.semantic.source=prometheus%' \
+         and create_options like '%greptime.semantic.metric.metadata_quality=inferred%'",
+        "[[1]]",
+    )
+    .await;
+
+    guard.remove_all().await;
+}
+
 pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
    common_telemetry::init_default_ut_logging();
    let (app, mut guard) =
@@ -2023,7 +2081,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
        expected,
    )
    .await;
-    let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  \'comment\' = 'Created on insertion',\\n  on_physical_table = 'f1'\\n)\"]]";
+    let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  \'comment\' = 'Created on insertion',\\n  'greptime.semantic.metric.metadata_quality' = 'inferred',\\n  'greptime.semantic.signal_type' = 'metric',\\n  'greptime.semantic.source' = 'prometheus',\\n  on_physical_table = 'f1'\\n)\"]]";
    validate_data(
        "test_prometheus_remote_special_labels_idc3_show_create_table",
        &client,
@@ -2049,7 +2107,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
        expected,
    )
    .await;
-    let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  \'comment\' = 'Created on insertion',\\n  on_physical_table = 'f2'\\n)\"]]";
+    let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  \'comment\' = 'Created on insertion',\\n  'greptime.semantic.metric.metadata_quality' = 'inferred',\\n  'greptime.semantic.signal_type' = 'metric',\\n  'greptime.semantic.source' = 'prometheus',\\n  on_physical_table = 'f2'\\n)\"]]";
    validate_data(
        "test_prometheus_remote_special_labels_idc4_show_create_table",
        &client,
@@ -5025,6 +5083,28 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
    let expected = "[[\"claude_code_cost_usage_USD_total\"],[\"claude_code_token_usage_tokens_total\"],[\"demo\"],[\"greptime_physical_table\"],[\"numbers\"]]";
    validate_data("otlp_metrics_all_tables", &client, "show tables;", expected).await;

+    // Metric-engine logical table carries the semantic identity. Match substrings
+    // because extra_options ordering is not stable.
+    validate_data(
+        "otlp_metrics_semantic_identity",
+        &client,
+        "select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \
+         and create_options like '%greptime.semantic.signal_type=metric%' \
+         and create_options like '%greptime.semantic.source=opentelemetry%';",
+        "[[1]]",
+    )
+    .await;
+    // OTLP metric type is declared, so Phase 1 must not stamp `metadata_quality`
+    // here (Phase 2 adds it as `declared`).
+    validate_data(
+        "otlp_metrics_no_metadata_quality",
+        &client,
+        "select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \
+         and create_options like '%metadata_quality%';",
+        "[[0]]",
+    )
+    .await;
+
    // CREATE TABLE IF NOT EXISTS "claude_code_cost_usage_USD_total" (
    //   "greptime_timestamp" TIMESTAMP(3) NOT NULL,
    //   "greptime_value" DOUBLE NULL,
@@ -5049,7 +5129,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
    //   on_physical_table = 'greptime_physical_table',
    //   otlp_metric_compat = 'prom'
    // )
-    let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  \\\"host_arch\\\" STRING NULL,\\n  \\\"job\\\" STRING NULL,\\n  \\\"model\\\" STRING NULL,\\n  \\\"os_version\\\" STRING NULL,\\n  \\\"otel_scope_name\\\" STRING NULL,\\n  \\\"otel_scope_schema_url\\\" STRING NULL,\\n  \\\"otel_scope_version\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL,\\n  \\\"service_version\\\" STRING NULL,\\n  \\\"session_id\\\" STRING NULL,\\n  \\\"terminal_type\\\" STRING NULL,\\n  \\\"user_id\\\" STRING NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\"),\\n  PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  \'comment\' = 'Created on insertion',\\n  on_physical_table = 'greptime_physical_table',\\n  otlp_metric_compat = 'prom'\\n)\"]]";
+    let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  \\\"host_arch\\\" STRING NULL,\\n  \\\"job\\\" STRING NULL,\\n  \\\"model\\\" STRING NULL,\\n  \\\"os_version\\\" STRING NULL,\\n  \\\"otel_scope_name\\\" STRING NULL,\\n  \\\"otel_scope_schema_url\\\" STRING NULL,\\n  \\\"otel_scope_version\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL,\\n  \\\"service_version\\\" STRING NULL,\\n  \\\"session_id\\\" STRING NULL,\\n  \\\"terminal_type\\\" STRING NULL,\\n  \\\"user_id\\\" STRING NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\"),\\n  PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  \'comment\' = 'Created on insertion',\\n  'greptime.semantic.signal_type' = 'metric',\\n  'greptime.semantic.source' = 'opentelemetry',\\n  on_physical_table = 'greptime_physical_table',\\n  otlp_metric_compat = 'prom'\\n)\"]]";
    validate_data(
        "otlp_metrics_all_show_create_table",
        &client,
@@ -5122,7 +5202,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
    //     on_physical_table = 'greptime_physical_table',
    //     otlp_metric_compat = 'prom'
    //   )
-    let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  \\\"job\\\" STRING NULL,\\n  \\\"model\\\" STRING NULL,\\n  \\\"os_type\\\" STRING NULL,\\n  \\\"os_version\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL,\\n  \\\"service_version\\\" STRING NULL,\\n  \\\"session_id\\\" STRING NULL,\\n  \\\"terminal_type\\\" STRING NULL,\\n  \\\"user_id\\\" STRING NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\"),\\n  PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  'comment' = 'Created on insertion',\\n  on_physical_table = 'greptime_physical_table',\\n  otlp_metric_compat = 'prom'\\n)\"]]";
+    let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  \\\"job\\\" STRING NULL,\\n  \\\"model\\\" STRING NULL,\\n  \\\"os_type\\\" STRING NULL,\\n  \\\"os_version\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL,\\n  \\\"service_version\\\" STRING NULL,\\n  \\\"session_id\\\" STRING NULL,\\n  \\\"terminal_type\\\" STRING NULL,\\n  \\\"user_id\\\" STRING NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\"),\\n  PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  'comment' = 'Created on insertion',\\n  'greptime.semantic.signal_type' = 'metric',\\n  'greptime.semantic.source' = 'opentelemetry',\\n  on_physical_table = 'greptime_physical_table',\\n  otlp_metric_compat = 'prom'\\n)\"]]";
    validate_data(
        "otlp_metrics_show_create_table",
        &client,
@@ -5186,7 +5266,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
    //     on_physical_table = 'greptime_physical_table',
    //     otlp_metric_compat = 'prom'
    //   )
-    let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  \\\"job\\\" STRING NULL,\\n  \\\"model\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL,\\n  \\\"service_version\\\" STRING NULL,\\n  \\\"session_id\\\" STRING NULL,\\n  \\\"terminal_type\\\" STRING NULL,\\n  \\\"user_id\\\" STRING NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\"),\\n  PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  'comment' = 'Created on insertion',\\n  on_physical_table = 'greptime_physical_table',\\n  otlp_metric_compat = 'prom'\\n)\"]]";
+    let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n  \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n  \\\"greptime_value\\\" DOUBLE NULL,\\n  \\\"job\\\" STRING NULL,\\n  \\\"model\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL,\\n  \\\"service_version\\\" STRING NULL,\\n  \\\"session_id\\\" STRING NULL,\\n  \\\"terminal_type\\\" STRING NULL,\\n  \\\"user_id\\\" STRING NULL,\\n  TIME INDEX (\\\"greptime_timestamp\\\"),\\n  PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n  'comment' = 'Created on insertion',\\n  'greptime.semantic.signal_type' = 'metric',\\n  'greptime.semantic.source' = 'opentelemetry',\\n  on_physical_table = 'greptime_physical_table',\\n  otlp_metric_compat = 'prom'\\n)\"]]";
    validate_data(
        "otlp_metrics_show_create_table_none",
        &client,
@@ -5493,7 +5573,22 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
    let expected = r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#;
    validate_data("otlp_traces", &client, "select * from mytable;", expected).await;

-    let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n  trace_id < '1',\n  trace_id >= '1' AND trace_id < '2',\n  trace_id >= '2' AND trace_id < '3',\n  trace_id >= '3' AND trace_id < '4',\n  trace_id >= '4' AND trace_id < '5',\n  trace_id >= '5' AND trace_id < '6',\n  trace_id >= '6' AND trace_id < '7',\n  trace_id >= '7' AND trace_id < '8',\n  trace_id >= '8' AND trace_id < '9',\n  trace_id >= '9' AND trace_id < 'a',\n  trace_id >= 'a' AND trace_id < 'b',\n  trace_id >= 'b' AND trace_id < 'c',\n  trace_id >= 'c' AND trace_id < 'd',\n  trace_id >= 'd' AND trace_id < 'e',\n  trace_id >= 'e' AND trace_id < 'f',\n  trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
+    // The trace v1 main table carries the trace identity (events/links preserved as
+    // JSON columns by the v1 model).
+    validate_data(
+        "otlp_traces_semantic_identity",
+        &client,
+        "select count(*) from information_schema.tables where table_name = 'mytable' \
+         and create_options like '%greptime.semantic.signal_type=trace%' \
+         and create_options like '%greptime.semantic.source=opentelemetry%' \
+         and create_options like '%greptime.semantic.pipeline=greptime_trace_v1%' \
+         and create_options like '%greptime.semantic.trace.has_events=true%' \
+         and create_options like '%greptime.semantic.trace.has_links=true%';",
+        "[[1]]",
+    )
+    .await;
+
+    let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n  trace_id < '1',\n  trace_id >= '1' AND trace_id < '2',\n  trace_id >= '2' AND trace_id < '3',\n  trace_id >= '3' AND trace_id < '4',\n  trace_id >= '4' AND trace_id < '5',\n  trace_id >= '5' AND trace_id < '6',\n  trace_id >= '6' AND trace_id < '7',\n  trace_id >= '7' AND trace_id < '8',\n  trace_id >= '8' AND trace_id < '9',\n  trace_id >= '9' AND trace_id < 'a',\n  trace_id >= 'a' AND trace_id < 'b',\n  trace_id >= 'b' AND trace_id < 'c',\n  trace_id >= 'c' AND trace_id < 'd',\n  trace_id >= 'd' AND trace_id < 'e',\n  trace_id >= 'e' AND trace_id < 'f',\n  trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  'greptime.semantic.pipeline' = 'greptime_trace_v1',\n  'greptime.semantic.signal_type' = 'trace',\n  'greptime.semantic.source' = 'opentelemetry',\n  'greptime.semantic.trace.conventions' = 'unknown',\n  
'greptime.semantic.trace.has_events' = 'true',\n  'greptime.semantic.trace.has_links' = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
    validate_data(
        "otlp_traces",
        &client,
@@ -5546,7 +5641,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
    )
    .await;
    assert_eq!(StatusCode::OK, res.status());
-    let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
+    let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  'greptime.semantic.pipeline' = 'greptime_trace_v1',\n  'greptime.semantic.signal_type' = 'trace',\n  'greptime.semantic.source' = 'opentelemetry',\n  'greptime.semantic.trace.conventions' = 'unknown',\n  'greptime.semantic.trace.has_events' = 'true',\n  'greptime.semantic.trace.has_links' = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
    validate_data(
        "otlp_traces",
        &client,
@@ -5583,7 +5678,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
    )
    .await;
    assert_eq!(StatusCode::OK, res.status());
-    let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n  trace_id < '4',\n  trace_id >= '4' AND trace_id < '8',\n  trace_id >= '8' AND trace_id < 'c',\n  trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
+    let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n  trace_id < '4',\n  trace_id >= '4' AND trace_id < '8',\n  trace_id >= '8' AND trace_id < 'c',\n  trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  'greptime.semantic.pipeline' = 'greptime_trace_v1',\n  'greptime.semantic.signal_type' = 'trace',\n  'greptime.semantic.source' = 'opentelemetry',\n  'greptime.semantic.trace.conventions' = 'unknown',\n  'greptime.semantic.trace.has_events' = 'true',\n  'greptime.semantic.trace.has_links' = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
    validate_data(
        "otlp_traces",
        &client,
@@ -5620,7 +5715,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
    )
    .await;
    assert_eq!(StatusCode::OK, res.status());
-    let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n  trace_id < '08',\n  trace_id >= '08' AND trace_id < '10',\n  trace_id >= '10' AND trace_id < '18',\n  trace_id >= '18' AND trace_id < '20',\n  trace_id >= '20' AND trace_id < '28',\n  trace_id >= '28' AND trace_id < '30',\n  trace_id >= '30' AND trace_id < '38',\n  trace_id >= '38' AND trace_id < '40',\n  trace_id >= '40' AND trace_id < '48',\n  trace_id >= '48' AND trace_id < '50',\n  trace_id >= '50' AND trace_id < '58',\n  trace_id >= '58' AND trace_id < '60',\n  trace_id >= '60' AND trace_id < '68',\n  trace_id >= '68' AND trace_id < '70',\n  trace_id >= '70' AND trace_id < '78',\n  trace_id >= '78' AND trace_id < '80',\n  trace_id >= '80' AND trace_id < '88',\n  trace_id >= '88' AND trace_id < '90',\n  trace_id >= '90' AND trace_id < '98',\n  trace_id >= '98' AND trace_id < 'a0',\n  trace_id >= 'a0' AND trace_id < 'a8',\n  trace_id >= 'a8' AND trace_id < 'b0',\n  
trace_id >= 'b0' AND trace_id < 'b8',\n  trace_id >= 'b8' AND trace_id < 'c0',\n  trace_id >= 'c0' AND trace_id < 'c8',\n  trace_id >= 'c8' AND trace_id < 'd0',\n  trace_id >= 'd0' AND trace_id < 'd8',\n  trace_id >= 'd8' AND trace_id < 'e0',\n  trace_id >= 'e0' AND trace_id < 'e8',\n  trace_id >= 'e8' AND trace_id < 'f0',\n  trace_id >= 'f0' AND trace_id < 'f8',\n  trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
+    let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n  \"timestamp\" TIMESTAMP(9) NOT NULL,\n  \"timestamp_end\" TIMESTAMP(9) NULL,\n  \"duration_nano\" BIGINT UNSIGNED NULL,\n  \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_id\" STRING NULL,\n  \"span_kind\" STRING NULL,\n  \"span_name\" STRING NULL,\n  \"span_status_code\" STRING NULL,\n  \"span_status_message\" STRING NULL,\n  \"trace_state\" STRING NULL,\n  \"scope_name\" STRING NULL,\n  \"scope_version\" STRING NULL,\n  \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n  \"span_attributes.net.peer.ip\" STRING NULL,\n  \"span_attributes.peer.service\" STRING NULL,\n  \"span_events\" JSON NULL,\n  \"span_links\" JSON NULL,\n  TIME INDEX (\"timestamp\"),\n  PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n  trace_id < '08',\n  trace_id >= '08' AND trace_id < '10',\n  trace_id >= '10' AND trace_id < '18',\n  trace_id >= '18' AND trace_id < '20',\n  trace_id >= '20' AND trace_id < '28',\n  trace_id >= '28' AND trace_id < '30',\n  trace_id >= '30' AND trace_id < '38',\n  trace_id >= '38' AND trace_id < '40',\n  trace_id >= '40' AND trace_id < '48',\n  trace_id >= '48' AND trace_id < '50',\n  trace_id >= '50' AND trace_id < '58',\n  trace_id >= '58' AND trace_id < '60',\n  trace_id >= '60' AND trace_id < '68',\n  trace_id >= '68' AND trace_id < '70',\n  trace_id >= '70' AND trace_id < '78',\n  trace_id >= '78' AND trace_id < '80',\n  trace_id >= '80' AND trace_id < '88',\n  trace_id >= '88' AND trace_id < '90',\n  trace_id >= '90' AND trace_id < '98',\n  trace_id >= '98' AND trace_id < 'a0',\n  trace_id >= 'a0' AND trace_id < 'a8',\n  trace_id >= 'a8' AND trace_id < 'b0',\n  
trace_id >= 'b0' AND trace_id < 'b8',\n  trace_id >= 'b8' AND trace_id < 'c0',\n  trace_id >= 'c0' AND trace_id < 'c8',\n  trace_id >= 'c8' AND trace_id < 'd0',\n  trace_id >= 'd0' AND trace_id < 'd8',\n  trace_id >= 'd8' AND trace_id < 'e0',\n  trace_id >= 'e0' AND trace_id < 'e8',\n  trace_id >= 'e8' AND trace_id < 'f0',\n  trace_id >= 'f0' AND trace_id < 'f8',\n  trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n  'comment' = 'Created on insertion',\n  append_mode = 'true',\n  'greptime.semantic.pipeline' = 'greptime_trace_v1',\n  'greptime.semantic.signal_type' = 'trace',\n  'greptime.semantic.source' = 'opentelemetry',\n  'greptime.semantic.trace.conventions' = 'unknown',\n  'greptime.semantic.trace.has_events' = 'true',\n  'greptime.semantic.trace.has_links' = 'true',\n  table_data_model = 'greptime_trace_v1'\n)"]]"#;
    validate_data(
        "otlp_traces",
        &client,
@@ -6283,6 +6378,17 @@ pub async fn test_otlp_logs(store_type: StorageType) {
            expected,
        )
        .await;
+
+        // The auto-created log table carries the log identity.
+        validate_data(
+            "otlp_logs_semantic_identity",
+            &client,
+            "select count(*) from information_schema.tables where table_name = 'opentelemetry_logs' \
+             and create_options like '%greptime.semantic.signal_type=log%' \
+             and create_options like '%greptime.semantic.source=opentelemetry%';",
+            "[[1]]",
+        )
+        .await;
    }

    {
@@ -7718,7 +7824,7 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) {
    .await;
    assert_eq!(StatusCode::OK, res.status());

-    let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n  \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n  \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n  \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n  \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n  \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n  \\\"span_id\\\" STRING NULL,\\n  \\\"span_kind\\\" STRING NULL,\\n  \\\"span_name\\\" STRING NULL,\\n  \\\"span_status_code\\\" STRING NULL,\\n  \\\"span_status_message\\\" STRING NULL,\\n  \\\"trace_state\\\" STRING NULL,\\n  \\\"scope_name\\\" STRING NULL,\\n  \\\"scope_version\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n  \\\"span_attributes.operation.type\\\" STRING NULL,\\n  \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n  \\\"span_attributes.peer.service\\\" STRING NULL,\\n  \\\"span_events\\\" JSON NULL,\\n  \\\"span_links\\\" JSON NULL,\\n  TIME INDEX (\\\"timestamp\\\"),\\n  PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n  trace_id < '1',\\n  trace_id >= '1' AND trace_id < '2',\\n  trace_id >= '2' AND trace_id < '3',\\n  trace_id >= '3' AND trace_id < '4',\\n  trace_id >= '4' AND trace_id < '5',\\n  trace_id >= '5' AND trace_id < '6',\\n  trace_id >= '6' AND trace_id < '7',\\n  trace_id >= '7' AND trace_id < '8',\\n  trace_id >= '8' AND trace_id < '9',\\n  trace_id >= '9' AND trace_id < 'a',\\n  trace_id >= 'a' AND trace_id < 'b',\\n  trace_id >= 'b' AND trace_id < 'c',\\n  trace_id >= 'c' AND trace_id < 'd',\\n  trace_id >= 'd' AND trace_id < 'e',\\n  trace_id >= 'e' AND trace_id < 'f',\\n  trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n  'comment' = 'Created on insertion',\\n  append_mode = 'true',\\n  table_data_model = 
'greptime_trace_v1',\\n  ttl = '7days'\\n)\"]]";
+    let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n  \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n  \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n  \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n  \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n  \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n  \\\"span_id\\\" STRING NULL,\\n  \\\"span_kind\\\" STRING NULL,\\n  \\\"span_name\\\" STRING NULL,\\n  \\\"span_status_code\\\" STRING NULL,\\n  \\\"span_status_message\\\" STRING NULL,\\n  \\\"trace_state\\\" STRING NULL,\\n  \\\"scope_name\\\" STRING NULL,\\n  \\\"scope_version\\\" STRING NULL,\\n  \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n  \\\"span_attributes.operation.type\\\" STRING NULL,\\n  \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n  \\\"span_attributes.peer.service\\\" STRING NULL,\\n  \\\"span_events\\\" JSON NULL,\\n  \\\"span_links\\\" JSON NULL,\\n  TIME INDEX (\\\"timestamp\\\"),\\n  PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n  trace_id < '1',\\n  trace_id >= '1' AND trace_id < '2',\\n  trace_id >= '2' AND trace_id < '3',\\n  trace_id >= '3' AND trace_id < '4',\\n  trace_id >= '4' AND trace_id < '5',\\n  trace_id >= '5' AND trace_id < '6',\\n  trace_id >= '6' AND trace_id < '7',\\n  trace_id >= '7' AND trace_id < '8',\\n  trace_id >= '8' AND trace_id < '9',\\n  trace_id >= '9' AND trace_id < 'a',\\n  trace_id >= 'a' AND trace_id < 'b',\\n  trace_id >= 'b' AND trace_id < 'c',\\n  trace_id >= 'c' AND trace_id < 'd',\\n  trace_id >= 'd' AND trace_id < 'e',\\n  trace_id >= 'e' AND trace_id < 'f',\\n  trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n  'comment' = 'Created on insertion',\\n  append_mode = 'true',\\n  'greptime.semantic.pipeline' = 
'greptime_trace_v1',\\n  'greptime.semantic.signal_type' = 'trace',\\n  'greptime.semantic.source' = 'opentelemetry',\\n  'greptime.semantic.trace.conventions' = 'unknown',\\n  'greptime.semantic.trace.has_events' = 'true',\\n  'greptime.semantic.trace.has_links' = 'true',\\n  table_data_model = 'greptime_trace_v1',\\n  ttl = '7days'\\n)\"]]";
    validate_data(
        "trace_v1_create_table",
        &client,
--- a/tests/cases/standalone/common/flow/flow_incremental_aggr.result
+++ b/tests/cases/standalone/common/flow/flow_incremental_aggr.result
@@ -1,3 +1,31 @@
+-- Incremental aggregate reads only support append-only source tables because
+-- update/upsert sources need old-value compensation.
+CREATE TABLE incremental_non_append_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+);
+
+Affected Rows: 0
+
+CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
+SELECT
+    sum(n) AS total,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    incremental_non_append_input
+GROUP BY
+    time_window;
+
+Error: 3001(EngineExecuteQuery), Unsupported: Flow incremental read requires append-only source table, but source table `greptime.public.incremental_non_append_input` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read
+
+DROP TABLE incremental_non_append_input;
+
+Affected Rows: 0
+
 CREATE TABLE incremental_aggr_input (
    host_id INT,
    n INT,
@@ -9,7 +37,9 @@ CREATE TABLE incremental_aggr_input (

 Affected Rows: 0

-CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
+CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
 SELECT
    sum(n) AS total,
    min(n) AS min_n,
--- a/tests/cases/standalone/common/flow/flow_incremental_aggr.sql
+++ b/tests/cases/standalone/common/flow/flow_incremental_aggr.sql
@@ -1,3 +1,25 @@
+-- Incremental aggregate reads only support append-only source tables because
+-- update/upsert sources need old-value compensation.
+CREATE TABLE incremental_non_append_input (
+    host_id INT,
+    n INT,
+    ts TIMESTAMP TIME INDEX,
+    PRIMARY KEY(host_id)
+);
+
+CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
+SELECT
+    sum(n) AS total,
+    date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
+FROM
+    incremental_non_append_input
+GROUP BY
+    time_window;
+
+DROP TABLE incremental_non_append_input;
+
 CREATE TABLE incremental_aggr_input (
    host_id INT,
    n INT,
@@ -7,7 +29,9 @@ CREATE TABLE incremental_aggr_input (
    append_mode = 'true'
 );

-CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
+CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
 SELECT
    sum(n) AS total,
    min(n) AS min_n,
--- a/tests/cases/standalone/common/flow/flow_incremental_memtable.result
+++ b/tests/cases/standalone/common/flow/flow_incremental_memtable.result
@@ -12,7 +12,9 @@ CREATE TABLE flow_incr_memtable_input (

 Affected Rows: 0

-CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
+CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
 SELECT
    sum(n) AS total,
    min(n) AS min_n,
--- a/tests/cases/standalone/common/flow/flow_incremental_memtable.sql
+++ b/tests/cases/standalone/common/flow/flow_incremental_memtable.sql
@@ -10,7 +10,9 @@ CREATE TABLE flow_incr_memtable_input (
    append_mode = 'true'
 );

-CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
+CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
 SELECT
    sum(n) AS total,
    min(n) AS min_n,
--- a/tests/cases/standalone/common/flow/flow_incremental_partitioned.result
+++ b/tests/cases/standalone/common/flow/flow_incremental_partitioned.result
@@ -17,7 +17,9 @@ WITH (

 Affected Rows: 0

-CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
+CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
 SELECT
    sum(n) AS total,
    min(n) AS min_n,
--- a/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql
+++ b/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql
@@ -15,7 +15,9 @@ WITH (
    append_mode = 'true'
 );

-CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
+CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink
+WITH (experimental_enable_incremental_read = 'true')
+AS
 SELECT
    sum(n) AS total,
    min(n) AS min_n,
--- a/tests/cases/standalone/common/flow/show_create_flow.result
+++ b/tests/cases/standalone/common/flow/show_create_flow.result
@@ -476,7 +476,7 @@ SINK TO out_num_cnt_show
 WITH (access_key_id = [true])
 AS SELECT number AS n1 FROM numbers_input_show where number > 10;

-Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source
+Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source, experimental_enable_incremental_read

 DROP FLOW filter_numbers_show;