diff --git a/Cargo.lock b/Cargo.lock index 63ba289947..a65159d26a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2278,6 +2278,7 @@ dependencies = [ "futures", "lazy_static", "object-store", + "object_store_opendal", "orc-rust", "parquet", "paste", @@ -5102,6 +5103,7 @@ dependencies = [ "datatypes", "futures", "object-store", + "object_store_opendal", "serde", "serde_json", "snafu 0.8.6", @@ -8320,6 +8322,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datatypes", + "derive_more", "dotenv", "either", "futures", @@ -9074,8 +9077,9 @@ dependencies = [ [[package]] name = "object_store_opendal" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eb12a624a41fce745838d0ef3701ff6c47797c13cd18ad3612fd2a3134fdbd8" dependencies = [ "async-trait", "bytes", @@ -9162,8 +9166,9 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" [[package]] name = "opendal" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c9c85ce253ff87225e7669979d877a20c98a06604ec9d6dd5f4473e08f1ae1" dependencies = [ "ctor", "opendal-core", @@ -9183,8 +9188,9 @@ dependencies = [ [[package]] name = "opendal-core" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4f8607c90e2c963a91467f50fb49fbc7fb3d573f88cea219ca59ccd3740b309" dependencies = [ "anyhow", "base64 0.22.1", @@ -9210,8 +9216,9 @@ dependencies = [ [[package]] name = 
"opendal-layer-concurrent-limit" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f" dependencies = [ "futures", "http 1.3.1", @@ -9221,8 +9228,9 @@ dependencies = [ [[package]] name = "opendal-layer-logging" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58ada45c6d81d1aa4c9305d0c7d4bc317c59c85866a0908a2d75a7a978aa5ee2" dependencies = [ "log", "opendal-core", @@ -9230,8 +9238,9 @@ dependencies = [ [[package]] name = "opendal-layer-observe-metrics-common" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628b0228fdbd13c3d9d50eee4341f2eb82ca5b44991e4c68f07c84cc823e2d12" dependencies = [ "futures", "http 1.3.1", @@ -9240,8 +9249,9 @@ dependencies = [ [[package]] name = "opendal-layer-prometheus" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0487bdb1357097ec8654781bad03ef310282517738e2864ebde69e27aaafc5ec" dependencies = [ "opendal-core", "opendal-layer-observe-metrics-common", @@ -9250,8 +9260,9 @@ dependencies = [ [[package]] name = "opendal-layer-retry" -version = "0.56.0" -source = 
"git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2a25a718afb81fad81cb9a0580a1cb989221fa2317f888c6a37f8dad408eb7" dependencies = [ "backon", "log", @@ -9260,8 +9271,9 @@ dependencies = [ [[package]] name = "opendal-layer-timeout" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e91f731724c213af81e9d03517859c8fc47b4578e64ad61ae4f099f10fe36e3" dependencies = [ "opendal-core", "tokio", @@ -9269,8 +9281,9 @@ dependencies = [ [[package]] name = "opendal-layer-tracing" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c6fc9df6da1f0dafbdf55fa48525f1643aefbe7da8f46936e869e2a5b8a34f" dependencies = [ "futures", "http 1.3.1", @@ -9280,8 +9293,9 @@ dependencies = [ [[package]] name = "opendal-service-azblob" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a" dependencies = [ "base64 0.22.1", "bytes", @@ -9300,8 +9314,9 @@ dependencies = [ [[package]] name = "opendal-service-azure-common" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051" dependencies = [ "http 1.3.1", "opendal-core", @@ -9309,8 +9324,9 @@ dependencies = [ [[package]] name = "opendal-service-fs" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e89a665fef0e6bd249cf5ea47fc174b7ba892159bee4b9382528b1ca873a2c" dependencies = [ "bytes", "log", @@ -9322,8 +9338,9 @@ dependencies = [ [[package]] name = "opendal-service-gcs" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47" dependencies = [ "async-trait", "bytes", @@ -9342,8 +9359,9 @@ dependencies = [ [[package]] name = "opendal-service-http" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb6af628a0bf14075b957179444927e1df40dc7addef382b585a05ef015a077b" dependencies = [ "http 1.3.1", "log", @@ -9353,8 +9371,9 @@ dependencies = [ [[package]] name = "opendal-service-oss" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace" dependencies = [ "bytes", "http 1.3.1", @@ -9369,8 +9388,9 @@ dependencies = [ 
[[package]] name = "opendal-service-s3" -version = "0.56.0" -source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "313d46c9f5ae70bca26b7c3e3fbb9b639292625f28af73aa016f47e788af9deb" dependencies = [ "base64 0.22.1", "bytes", @@ -14102,9 +14122,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.45" +version = "0.4.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840" dependencies = [ "filetime", "libc", diff --git a/Cargo.toml b/Cargo.toml index 32407f31cf..56200a24d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -178,7 +178,7 @@ nalgebra = "0.33" nix = { version = "0.30.1", default-features = false, features = ["event", "fs", "process"] } notify = "8.0" num_cpus = "1.16" -object_store_opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7" } +object_store_opendal = "0.57" once_cell = "1.18" opentelemetry-proto = { version = "0.31", features = [ "gen-tonic", diff --git a/config/config.md b/config/config.md index 0fae0caaa4..d9cffaf122 100644 --- a/config/config.md +++ b/config/config.md @@ -14,6 +14,7 @@ | --- | -----| ------- | ----------- | | `default_timezone` | String | Unset | The default timezone of the server. | | `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | +| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.
When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. | | `user_provider` | String | Unset | The user provider for authentication.
Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" | | `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
Set to 0 to disable the limit. Default: "0" (unlimited) | | `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.
Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" | @@ -230,6 +231,7 @@ | --- | -----| ------- | ----------- | | `default_timezone` | String | Unset | The default timezone of the server. | | `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. | +| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.
When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. | | `user_provider` | String | Unset | The user provider for authentication.
Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" | | `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
Set to 0 to disable the limit. Default: "0" (unlimited) | | `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.
Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" | @@ -628,6 +630,7 @@ | `flow.batching_mode.experimental_frontend_scan_timeout` | String | `30s` | Flow wait for available frontend timeout,
if failed to find available frontend after frontend_scan_timeout elapsed, return error
which prevent flownode from starting | | `flow.batching_mode.experimental_max_filter_num_per_query` | Integer | `20` | Maximum number of filters allowed in a single query | | `flow.batching_mode.experimental_time_window_merge_threshold` | Integer | `3` | Time window merge distance | +| `flow.batching_mode.experimental_enable_incremental_read` | Bool | `false` | Whether to enable experimental flow incremental source reads.
When disabled, batching flows always execute full-snapshot queries.
Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true'). | | `flow.batching_mode.read_preference` | String | `Leader` | Read preference of the Frontend client. | | `flow.batching_mode.frontend_tls` | -- | -- | -- | | `flow.batching_mode.frontend_tls.enabled` | Bool | `false` | Whether to enable TLS for client. | diff --git a/config/flownode.example.toml b/config/flownode.example.toml index 2c053e6e8c..ff8a9e4a50 100644 --- a/config/flownode.example.toml +++ b/config/flownode.example.toml @@ -31,6 +31,10 @@ node_id = 14 #+experimental_max_filter_num_per_query=20 ## Time window merge distance #+experimental_time_window_merge_threshold=3 +## Whether to enable experimental flow incremental source reads. +## When disabled, batching flows always execute full-snapshot queries. +## Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true'). +#+experimental_enable_incremental_read=false ## Read preference of the Frontend client. #+read_preference="Leader" [flow.batching_mode.frontend_tls] diff --git a/config/frontend.example.toml b/config/frontend.example.toml index 39f38fbef9..a044aebda6 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -6,6 +6,10 @@ default_timezone = "UTC" ## @toml2docs:none-default default_column_prefix = "greptime" +## Server-side global switch for auto table creation on write. +## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. +#+ auto_create_table = true + ## The user provider for authentication. 
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" ## @toml2docs:none-default diff --git a/config/standalone.example.toml b/config/standalone.example.toml index d5c42e744c..5740e0e1cf 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -6,6 +6,10 @@ default_timezone = "UTC" ## @toml2docs:none-default default_column_prefix = "greptime" +## Server-side global switch for auto table creation on write. +## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. +#+ auto_create_table = true + ## The user provider for authentication. ## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" ## @toml2docs:none-default diff --git a/docs/rfcs/2026-05-28-table-semantic-layer.md b/docs/rfcs/2026-05-28-table-semantic-layer.md new file mode 100644 index 0000000000..e4d899d704 --- /dev/null +++ b/docs/rfcs/2026-05-28-table-semantic-layer.md @@ -0,0 +1,157 @@ +--- +Feature Name: Table Semantic Layer +Tracking Issue: TBD +Date: 2026-05-28 +Author: "Dennis Zhuang " +--- + +# Summary + +Attach a thin layer of semantic metadata to each table so machine consumers — LLM agents, alert generators, dashboard builders, MCP servers, ETL pipelines — can align it with the observability concepts they already know (OTel instrument kinds, Prometheus naming conventions, UCUM units, semantic conventions, severity numbers, OTel ↔ Prometheus translation rules). + +The mechanism reuses what already exists in `table_options` (the same slot that today carries `table_data_model` and `otlp_metric_compat`): a reserved `greptime.semantic.*` namespace, plus standard SQL column `COMMENT` for field-level supplements, plus an `information_schema.semantic_tables` view as the discovery entry point. No new protocol, no new DDL keyword. + +Per-table identity only. 
Cross-table relationships are deferred. + +# Motivation + +GreptimeDB already ingests OTLP metrics / traces / logs and Prometheus remote write. Each protocol carries rich metadata on the wire (instrument kind, temporality, unit, scope, resource, semantic-conventions version), and most of it is dropped when rows land in a table: + +- An `opentelemetry_traces` table looks like any wide table; signal type, source, and field provenance must be guessed from naming. +- The OTel-to-Prometheus translation in v0.16+ actively drops scope attributes and most resource attributes; the table never records *what was dropped*. +- Prometheus remote write v1 metadata is unreliable by protocol, but downstream tables do not flag whether `counter` typing was *declared* or *inferred* from the `_total` suffix. +- Mixed-temporality data (OTel delta + Prometheus cumulative in the same table) is unrecoverable from schema alone. + +The audience is broader than LLM agents. Alert generators need to choose between `rate()` and absolute thresholds, and need units to pick sensible bounds. Dashboard builders pick visualisations by signal type. MCP servers surface a structured tool catalog instead of free-text descriptions. ETL pipelines need lineage to know whether a `service_name` column is `resource.service.name` or a free-form label. All of them currently guess from column names; the metadata to remove the guess already exists at ingest time, we just do not preserve it. + +# Goals + +1. Tag every ingested table with a stable identity using existing SQL surfaces — no new protocol, no new DDL keyword. +2. Record the lossy transformations the ingestion path performs (dropped attributes, scope handling, type inference vs. declaration). +3. Expose one `information_schema` view as the consumer-facing discovery entry point. +4. Keep the layer optional and additive — tables without these options keep working unchanged. + +# Non-Goals + +- Cross-table relationship modelling. Deferred to a follow-up RFC. 
+- Bespoke storage. Reuse `table_options` and column `COMMENT`. +- Semantic enforcement at query time. The layer is descriptive, not coercive. +- New wire protocol. Upstream standardisation is mentioned only as a future direction. + +# Proposal + +## Three mechanisms + +1. **`greptime.semantic.*` table options** — table-level identity and lineage. Carried inside the existing `table_options` blob. This is the same slot that today carries `table_data_model = 'greptime_trace_v1'` and `otlp_metric_compat = 'prom'`, so the mechanism is generalising what the OTLP trace auto-create path already does. +2. **Column `COMMENT`** — column-level supplements ("this column is `resource.service.name`"; "this column carries delta values"). Standard SQL. +3. **`information_schema.semantic_tables` view** — a denormalised projection of the options, registered through the existing `with_extra_table_factories()` hook. Tables without a `greptime.semantic.*` option do not appear in the view. + +## Vocabulary + +All keys are flat strings under the `greptime.semantic.` prefix; values are strings; unknown keys are tolerated so the vocabulary can grow without coordinated rollouts. + +**Common (all signals)** + +| Key | Example | +| --- | --- | +| `greptime.semantic.signal_type` | `trace` / `log` / `metric` / `event` | +| `greptime.semantic.source` | `opentelemetry` / `prometheus` / `elasticsearch` / `loki` / `custom` | +| `greptime.semantic.source_version` | protocol or SDK version, e.g. `v2` (Prom remote write), `1.30.0` (optional) | +| `greptime.semantic.pipeline` | `greptime_trace_v1` (subsumes the existing `table_data_model` value) | + +**Trace**: `greptime.semantic.trace.conventions` (e.g. `otel-semconv-1.27`, lifted from `schema_url`, which is the version of the OpenTelemetry semantic conventions used in this table), `greptime.semantic.trace.has_events`, `greptime.semantic.trace.has_links`. 
+ +**Metric** — v1 assumes one metric type per table, which is how both Prom RW and the post-v0.16 OTel ingestion path land data today; mixed-type tables are a follow-up. + +| Key | Example | +| --- | --- | +| `greptime.semantic.metric.type` | `counter` / `gauge` / `histogram` / `summary` / `updown_counter` / `gauge_histogram` / `info` / `stateset` | +| `greptime.semantic.metric.unit` | UCUM, e.g. `s`, `By`, `{request}` | +| `greptime.semantic.metric.temporality` | `cumulative` / `delta` (OTel only) | +| `greptime.semantic.metric.monotonic` | `true` / `false` | +| `greptime.semantic.metric.metadata_quality` | `declared` (OTLP / Prom RW v2 / exposition) or `inferred` (Prom RW v1, name-suffix guess) | +| `greptime.semantic.metric.original_name` | Pre-translation OTel name when the table name was Prometheus-ised | + +`metadata_quality = inferred` is the load-bearing field for confidence-aware tooling: an inferred counter should be re-checked before betting on `rate()`-style semantics. + +**Log**: `greptime.semantic.log.severity_scheme` (`otlp` / `syslog` / `custom`), `greptime.semantic.log.body_format` (`string` / `json` / `mixed`). + +**Resource / scope preservation**: `greptime.semantic.resource.attributes_preserved` (JSON array string of attrs promoted to columns), `greptime.semantic.resource.attributes_dropped` (boolean), `greptime.semantic.scope.preserved` (boolean). These answer the most common downstream question: "is this data missing because it was dropped, or because it lives on a different column than I think?" List-shaped values use JSON array strings rather than comma-separated text to avoid escaping and ordering ambiguity. + +## Conflict and update semantics + +Two design decisions worth pinning down up front, because they constrain everything else: + +- **Conflict.** Some table-level keys (`trace.conventions` lifted from `schema_url`, `metric.temporality`, ...) cannot represent the truth when a long-lived table sees rows from multiple sources. 
v1 records `mixed` or `unknown` rather than a fictitious single value. Downstream consumers must treat any single-valued semantic key as best-effort, not strong evidence. +- **Update.** Semantic options are stamped at table creation. v1 does not specify an update path; promoting `metadata_quality` from `inferred` to `declared`, refreshing `resource.attributes_preserved`, or revising `trace.conventions` on later writes is deferred. If real usage shows update is needed, it lands as a separate RFC. + +## `information_schema.semantic_tables` + +A consumer's first SQL on connect: + +```sql +SELECT table_catalog, table_schema, table_name, signal_type, source, pipeline +FROM information_schema.semantic_tables; +``` + +returns one row per semantic-tagged table. The view exposes a stable set of core columns (`table_catalog`, `table_schema`, `table_name`, `signal_type`, `source`, `source_version`, `pipeline`) plus a `semantic_options` JSON column carrying the rest of the `greptime.semantic.*` keys verbatim. Future keys appear inside `semantic_options` without forcing a view-schema change; only widely-used keys are ever promoted to first-class columns. + +# Implementation Plan + +Four phases, each independently shippable. + +1. **Identity.** Stamp `signal_type` and `source` on every auto-create path. The OTLP paths already have natural injection points; Prom remote write is the one non-trivial path because metric-engine logical tables share physical storage (see Open Question 2). +2. **Metric specifics.** Add type / unit / temporality / monotonic / metadata_quality / original_name at OTel metric and Prom RW ingestion sites; the data is already at hand inside the OTel translator. +3. **Resource / scope lineage.** Record what the OTel-to-Prometheus translation kept and dropped. +4. **`information_schema.semantic_tables` view + documentation** as a stable user-facing contract. 
+ +# Relationship to OpenTelemetry standardisation + +OTel today standardises what producers emit and how data collectors are managed; the read side — what a backend exposes back to clients — is deliberately vendor turf. OTLP is one-way; OpAMP is agent management; OTEP-0243 (App Telemetry Schema) is producer-side; `schema_url` is producer-stated with no reverse. Adjacent precedents — Prometheus `/api/v1/metadata`, Loki labels API, Tempo tags, Jaeger services, ad-hoc MCP servers — are all vendor-specific. + +This is a real gap. The shape we propose locally (signal-agnostic, `schema_url`-aware, structured around a small vocabulary) is deliberately close to what a future upstream OTEP for a backend-catalog read API could look like, with Weaver's *Resolved Telemetry Schema* as the natural data model. We do not commit to driving such an OTEP here; we do commit to keeping the local shape close enough that a future upstream proposal does not force a breaking migration. + +# Alternatives + +- **New DDL syntax (`SEMANTIC trace WITH (...)`).** Cleaner-looking but non-standard and forces every client to learn it. The metadata is not interesting enough to justify a new keyword. +- **Dedicated `_semantic` system table.** Doubles the storage path for what is static per-table KV and adds lifecycle questions (drop, backfill). A view over `table_options` covers the same access pattern. +- **Column comments only.** Discovery (`WHERE signal_type = 'trace'`) becomes a full-text problem. Comments are good for column-level supplements, not for identity. +- **Encode everything into the table name.** What we do today. Every new field becomes a new naming convention. + +# Open Questions + +1. **Namespace prefix.** `greptime.semantic.*` vs. bare `semantic.*`. v1 picks the vendored prefix; alias or migrate if a community standard later emerges. +2. 
**Prom RW injection point.** Metric-engine logical tables share physical storage, so per-logical-table options need a hook that does not exist as cleanly as the OTLP trace branch. A short spike is needed before Phase 1 lands for Prom RW. +3. **Mixed-type metric tables.** When ingestion modes that pack multiple metric types into one table appear, `metric.type` migrates from table-level to row-level. v1 leaves a `metric.type = 'mixed'` marker and punts. +4. **Stability surface.** Top-level keys (`signal_type`, `source`) are stable; sub-namespaces (`metric.*`, ...) are evolving until v1.0 of the layer is declared. + +# Future Work + +- **Cross-table relationships.** Paired trace/services tables, metric/info pairing, JOIN hints. Its own RFC. +- **Producer SDK/client identity.** An optional `greptime.semantic.source.sdk` key recording the emitting client (e.g. `opentelemetry-go`, `opentelemetry-java`, `opentelemetry-collector`). Because a single table can receive data from multiple SDKs (a shared trace table is the common case), mixed producers collapse to `mixed`, following the same conflict rule as the table-level keys above. +- **Backfill** for tables created before this feature shipped. +- **Upstream proposal.** Carry the shape into a community proposal — likely an OTEP for an OTLP-Catalog read API plus an MCP binding — informed by Greptime's local usage data.
+ +# References + +OpenTelemetry: +- [OTLP specification](https://opentelemetry.io/docs/specs/otlp/) +- [OTel Schemas (`schema_url`)](https://opentelemetry.io/docs/specs/otel/schemas/) +- [Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/) +- [OTEP-0243: App Telemetry Schema](https://github.com/open-telemetry/oteps/blob/main/text/0243-app-telemetry-schema-vision-roadmap.md) +- [OpAMP specification](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md) +- [Weaver: Resolved Telemetry Schema](https://github.com/open-telemetry/weaver) +- [2025 Stability Proposal](https://opentelemetry.io/blog/2025/stability-proposal-announcement/) + +Prometheus / OpenMetrics: +- [Prometheus Remote Write 1.0](https://prometheus.io/docs/specs/prw/remote_write_spec/) +- [Prometheus Remote Write 2.0](https://prometheus.io/docs/specs/prw/remote_write_spec_2_0/) +- [Prometheus exposition formats](https://prometheus.io/docs/instrumenting/exposition_formats/) +- [Prometheus HTTP API: `/api/v1/metadata`](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata) + +Units and conventions: +- [UCUM — Unified Code for Units of Measure](https://ucum.org/) + +GreptimeDB: +- [OTLP ingestion guide](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/) +- [Trace data model](https://docs.greptime.com/user-guide/traces/data-model/) diff --git a/src/catalog/src/kvbackend/table_cache.rs b/src/catalog/src/kvbackend/table_cache.rs index 42b3fbc74b..13f74a48c9 100644 --- a/src/catalog/src/kvbackend/table_cache.rs +++ b/src/catalog/src/kvbackend/table_cache.rs @@ -14,7 +14,9 @@ use std::sync::Arc; -use common_meta::cache::{CacheContainer, Initializer, TableInfoCacheRef, TableNameCacheRef}; +use common_meta::cache::{ + CacheContainer, InitStrategy, Initializer, TableInfoCacheRef, TableNameCacheRef, +}; use common_meta::error::{Result as MetaResult, ValueNotExistSnafu}; use common_meta::instruction::CacheIdent; use 
futures::future::BoxFuture; @@ -38,7 +40,14 @@ pub fn new_table_cache( ) -> TableCache { let init = init_factory(table_info_cache, table_name_cache); - CacheContainer::new(name, cache, Box::new(invalidator), init, filter) + CacheContainer::with_strategy( + name, + cache, + Box::new(invalidator), + init, + filter, + InitStrategy::VersionChecked, + ) } fn init_factory( diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index 3c106bd43f..7f2057bc97 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -79,7 +79,7 @@ impl App for Instance { } async fn start(&mut self) -> Result<()> { - plugins::start_datanode_plugins(self.datanode.plugins()) + plugins::start_datanode_plugins(&self.datanode) .await .context(StartDatanodeSnafu)?; diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs index 6228cbd3f3..32d9070ec6 100644 --- a/src/cmd/src/flownode.rs +++ b/src/cmd/src/flownode.rs @@ -90,7 +90,7 @@ impl App for Instance { } async fn start(&mut self) -> Result<()> { - plugins::start_flownode_plugins(self.flownode.flow_engine().plugins().clone()) + plugins::start_flownode_plugins(&self.flownode) .await .context(StartFlownodeSnafu)?; diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index da2c111e7c..cbc07d10e9 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -95,8 +95,7 @@ impl App for Instance { } async fn start(&mut self) -> Result<()> { - let plugins = self.frontend.instance.plugins().clone(); - plugins::start_frontend_plugins(plugins) + plugins::start_frontend_plugins(&self.frontend.instance) .await .context(error::StartFrontendSnafu)?; diff --git a/src/cmd/src/metasrv.rs b/src/cmd/src/metasrv.rs index bf3cb2f5e7..e30b115ada 100644 --- a/src/cmd/src/metasrv.rs +++ b/src/cmd/src/metasrv.rs @@ -68,7 +68,7 @@ impl App for Instance { } async fn start(&mut self) -> Result<()> { - plugins::start_metasrv_plugins(self.instance.plugins()) + plugins::start_metasrv_plugins(&self.instance) .await 
.context(StartMetaServerSnafu)?; diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index e0f2c673ff..7d99e99554 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -164,7 +164,7 @@ impl App for Instance { .start(self.leader_services_context.clone()) .await?; - plugins::start_frontend_plugins(self.frontend.instance.plugins().clone()) + plugins::start_frontend_plugins(&self.frontend.instance) .await .context(error::StartFrontendSnafu)?; diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs index 6cffcd67c2..cee29e4456 100644 --- a/src/cmd/tests/load_config_test.rs +++ b/src/cmd/tests/load_config_test.rs @@ -114,6 +114,7 @@ fn test_load_frontend_example_config() { component: FrontendOptions { default_timezone: Some("UTC".to_string()), default_column_prefix: Some("greptime".to_string()), + auto_create_table: true, meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), @@ -267,6 +268,7 @@ fn test_load_standalone_example_config() { component: StandaloneOptions { default_timezone: Some("UTC".to_string()), default_column_prefix: Some("greptime".to_string()), + auto_create_table: true, wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig { dir: Some(format!("{}/{}", DEFAULT_DATA_HOME, WAL_DIR)), sync_period: Some(Duration::from_secs(10)), diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index 470b5371f7..8b4053db2f 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -33,6 +33,7 @@ datatypes.workspace = true futures.workspace = true lazy_static.workspace = true object-store.workspace = true +object_store_opendal.workspace = true orc-rust = { version = "0.8", default-features = false, features = ["async"] } parquet.workspace = true paste.workspace = true diff --git a/src/common/datasource/src/file_format.rs b/src/common/datasource/src/file_format.rs index a6a358c9e4..e36f94c0d2 
100644 --- a/src/common/datasource/src/file_format.rs +++ b/src/common/datasource/src/file_format.rs @@ -316,7 +316,7 @@ pub async fn file_to_stream( .with_file_compression_type(df_compression) .build(); - let store = Arc::new(object_store::compat::OpendalStore::new(store.clone())); + let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone())); let file_opener = config.file_source().create_file_opener(store, &config, 0)?; let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new())?; diff --git a/src/common/datasource/src/file_format/tests.rs b/src/common/datasource/src/file_format/tests.rs index 93ab3b4409..a925f73d48 100644 --- a/src/common/datasource/src/file_format/tests.rs +++ b/src/common/datasource/src/file_format/tests.rs @@ -44,7 +44,7 @@ struct Test<'a> { impl Test<'_> { async fn run(self, store: &ObjectStore) { - let store = Arc::new(object_store::compat::OpendalStore::new(store.clone())); + let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone())); let file_opener = self .file_source .create_file_opener(store, &self.config, 0) diff --git a/src/common/datasource/src/object_store/oss.rs b/src/common/datasource/src/object_store/oss.rs index aded3eca2c..aacc17ac5e 100644 --- a/src/common/datasource/src/object_store/oss.rs +++ b/src/common/datasource/src/object_store/oss.rs @@ -27,12 +27,14 @@ const ACCESS_KEY_ID: &str = "access_key_id"; const ACCESS_KEY_SECRET: &str = "access_key_secret"; const ROOT: &str = "root"; const ALLOW_ANONYMOUS: &str = "allow_anonymous"; +const SKIP_SIGNATURE: &str = "skip_signature"; /// Check if the key is supported in OSS configuration. 
pub fn is_supported_in_oss(key: &str) -> bool { [ ROOT, ALLOW_ANONYMOUS, + SKIP_SIGNATURE, BUCKET, ENDPOINT, ACCESS_KEY_ID, @@ -61,18 +63,23 @@ pub fn build_oss_backend( builder = builder.access_key_secret(access_key_secret); } - if let Some(allow_anonymous) = connection.get(ALLOW_ANONYMOUS) { - let allow = allow_anonymous.as_str().parse::().map_err(|e| { + if let Some((key, value)) = connection + .get(SKIP_SIGNATURE) + .map(|value| (SKIP_SIGNATURE, value)) + .or_else(|| { + connection + .get(ALLOW_ANONYMOUS) + .map(|value| (ALLOW_ANONYMOUS, value)) + }) + { + let skip_signature = value.as_str().parse::().map_err(|e| { error::InvalidConnectionSnafu { - msg: format!( - "failed to parse the option {}={}, {}", - ALLOW_ANONYMOUS, allow_anonymous, e - ), + msg: format!("failed to parse the option {}={}, {}", key, value, e), } .build() })?; - if allow { - builder = builder.allow_anonymous(); + if skip_signature { + builder = builder.skip_signature(); } } @@ -93,6 +100,7 @@ mod tests { fn test_is_supported_in_oss() { assert!(is_supported_in_oss(ROOT)); assert!(is_supported_in_oss(ALLOW_ANONYMOUS)); + assert!(is_supported_in_oss(SKIP_SIGNATURE)); assert!(is_supported_in_oss(BUCKET)); assert!(is_supported_in_oss(ENDPOINT)); assert!(is_supported_in_oss(ACCESS_KEY_ID)); diff --git a/src/common/datasource/src/test_util.rs b/src/common/datasource/src/test_util.rs index 0a13d9c6e8..ea2b0c768c 100644 --- a/src/common/datasource/src/test_util.rs +++ b/src/common/datasource/src/test_util.rs @@ -103,7 +103,7 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi test_util::TEST_BATCH_SIZE, schema.clone(), FileCompressionType::UNCOMPRESSED, - Arc::new(object_store::compat::OpendalStore::new(store.clone())), + Arc::new(object_store_opendal::OpendalStore::new(store.clone())), true, ); @@ -157,7 +157,7 @@ pub async fn setup_stream_to_csv_test( let csv_opener = csv_source .create_file_opener( - Arc::new(object_store::compat::OpendalStore::new(store.clone())), 
+ Arc::new(object_store_opendal::OpendalStore::new(store.clone())), &config, 0, ) diff --git a/src/common/meta/src/cache.rs b/src/common/meta/src/cache.rs index f16290937a..c26a0fab76 100644 --- a/src/common/meta/src/cache.rs +++ b/src/common/meta/src/cache.rs @@ -17,7 +17,7 @@ mod flow; mod registry; mod table; -pub use container::{CacheContainer, Initializer, Invalidator, TokenFilter}; +pub use container::{CacheContainer, InitStrategy, Initializer, Invalidator, TokenFilter}; pub use flow::{TableFlownodeSetCache, TableFlownodeSetCacheRef, new_table_flownode_set_cache}; pub use registry::{ CacheRegistry, CacheRegistryBuilder, CacheRegistryRef, LayeredCacheRegistry, diff --git a/src/common/meta/src/ddl/create_flow.rs b/src/common/meta/src/ddl/create_flow.rs index ddfb0c0759..8a419176c9 100644 --- a/src/common/meta/src/ddl/create_flow.rs +++ b/src/common/meta/src/ddl/create_flow.rs @@ -437,11 +437,13 @@ pub fn defer_on_missing_source(flow_task: &CreateFlowTask) -> Result { pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> { for key in flow_task.flow_options.keys() { match key.as_str() { - DEFER_ON_MISSING_SOURCE_KEY | FlowType::FLOW_TYPE_KEY => {} + DEFER_ON_MISSING_SOURCE_KEY + | FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY + | FlowType::FLOW_TYPE_KEY => {} unknown => { return UnexpectedSnafu { err_msg: format!( - "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}" + "Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}, {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}" ), } .fail(); @@ -487,6 +489,9 @@ pub enum FlowType { Streaming, } +pub const FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY: &str = + "experimental_enable_incremental_read"; + impl FlowType { pub const BATCHING: &str = "batching"; pub const STREAMING: &str = "streaming"; diff --git a/src/common/meta/src/ddl/tests/create_flow.rs b/src/common/meta/src/ddl/tests/create_flow.rs index a1a6c040f1..7150be39cb 100644 
--- a/src/common/meta/src/ddl/tests/create_flow.rs +++ b/src/common/meta/src/ddl/tests/create_flow.rs @@ -24,8 +24,9 @@ use table::table_name::TableName; use crate::ddl::DdlContext; use crate::ddl::create_flow::{ - CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType, - defer_on_missing_source, + CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, + FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType, defer_on_missing_source, + validate_flow_options, }; use crate::ddl::test_util::create_table::test_create_table_task; use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler; @@ -275,6 +276,22 @@ fn test_defer_on_missing_source_invalid_value() { ); } +#[test] +fn test_validate_flow_options_allows_incremental_read_option() { + let mut task = test_create_flow_task( + "my_flow", + vec![], + TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table"), + false, + ); + task.flow_options.insert( + FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(), + "true".to_string(), + ); + + validate_flow_options(&task).unwrap(); +} + #[tokio::test] async fn test_create_flow_rejects_unknown_option_in_meta_task() { let mut task = test_create_flow_task( diff --git a/src/file-engine/Cargo.toml b/src/file-engine/Cargo.toml index 6c8c9e887d..9d031cb279 100644 --- a/src/file-engine/Cargo.toml +++ b/src/file-engine/Cargo.toml @@ -29,6 +29,7 @@ datafusion-expr.workspace = true datatypes.workspace = true futures.workspace = true object-store.workspace = true +object_store_opendal.workspace = true serde = { version = "1.0", features = ["derive"] } serde_json.workspace = true snafu.workspace = true diff --git a/src/file-engine/src/query/file_stream.rs b/src/file-engine/src/query/file_stream.rs index eec8f8961d..a480a50374 100644 --- a/src/file-engine/src/query/file_stream.rs +++ b/src/file-engine/src/query/file_stream.rs @@ -61,7 +61,7 @@ fn build_record_batch_stream( 
.with_file_group(FileGroup::new(files)) .build(); - let store = Arc::new(object_store::compat::OpendalStore::new( + let store = Arc::new(object_store_opendal::OpendalStore::new( scan_plan_config.store.clone(), )); diff --git a/src/flow/src/batching_mode.rs b/src/flow/src/batching_mode.rs index 580762a142..a8bd139d98 100644 --- a/src/flow/src/batching_mode.rs +++ b/src/flow/src/batching_mode.rs @@ -23,7 +23,6 @@ use session::ReadPreference; mod checkpoint; pub(crate) mod engine; pub(crate) mod frontend_client; -mod incremental_filter; mod state; mod table_creator; mod task; @@ -55,6 +54,10 @@ pub struct BatchingModeOptions { pub experimental_max_filter_num_per_query: usize, /// Time window merge distance pub experimental_time_window_merge_threshold: usize, + /// Whether to enable experimental flow incremental source reads. + /// + /// When disabled, batching flows always execute full-snapshot queries. + pub experimental_enable_incremental_read: bool, /// Read preference of the Frontend client. pub read_preference: ReadPreference, /// TLS option for client connections to frontends. 
@@ -72,6 +75,7 @@ impl Default for BatchingModeOptions { experimental_frontend_scan_timeout: Duration::from_secs(30), experimental_max_filter_num_per_query: 20, experimental_time_window_merge_threshold: 3, + experimental_enable_incremental_read: false, read_preference: Default::default(), frontend_tls: None, } diff --git a/src/flow/src/batching_mode/engine.rs b/src/flow/src/batching_mode/engine.rs index f37e54d80b..68fb3793e4 100644 --- a/src/flow/src/batching_mode/engine.rs +++ b/src/flow/src/batching_mode/engine.rs @@ -21,7 +21,7 @@ use std::time::Duration; use api::v1::flow::DirtyWindowRequests; use catalog::CatalogManagerRef; use common_error::ext::BoxedError; -use common_meta::ddl::create_flow::FlowType; +use common_meta::ddl::create_flow::{FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType}; use common_meta::key::TableMetadataManagerRef; use common_meta::key::flow::FlowMetadataManagerRef; use common_meta::key::flow::flow_state::FlowStat; @@ -38,6 +38,7 @@ use session::context::QueryContext; use snafu::{OptionExt, ResultExt, ensure}; use sql::parsers::utils::is_tql; use store_api::metric_engine_consts::is_metric_engine_internal_column; +use store_api::mito_engine_options::APPEND_MODE_KEY; use store_api::storage::{RegionId, TableId}; use table::table_reference::TableReference; use tokio::sync::{RwLock, oneshot}; @@ -428,6 +429,55 @@ async fn get_table_info( } impl BatchingEngine { + fn batch_opts_for_flow_options( + &self, + flow_options: &HashMap, + ) -> Result, Error> { + let mut batch_opts = (*self.batch_opts).clone(); + if let Some(enable_incremental_read) = + flow_options.get(FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY) + { + batch_opts.experimental_enable_incremental_read = enable_incremental_read + .parse::() + .map_err(|_| { + InvalidQuerySnafu { + reason: format!( + "Invalid flow option {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}: {enable_incremental_read}" + ), + } + .build() + })?; + } + + Ok(Arc::new(batch_opts)) + } + + fn 
table_options_enable_append_mode(extra_options: &HashMap) -> bool { + extra_options + .get(APPEND_MODE_KEY) + .is_some_and(|value| value.eq_ignore_ascii_case("true")) + } + + fn ensure_incremental_source_append_only( + batch_opts: &BatchingModeOptions, + table_name: &[String; 3], + extra_options: &HashMap, + ) -> Result<(), Error> { + if batch_opts.experimental_enable_incremental_read { + ensure!( + Self::table_options_enable_append_mode(extra_options), + UnsupportedSnafu { + reason: format!( + "Flow incremental read requires append-only source table, but source table `{}` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read", + table_name.join(".") + ), + } + ); + } + + Ok(()) + } + pub async fn create_flow_inner(&self, args: CreateFlowArgs) -> Result, Error> { let CreateFlowArgs { flow_id, @@ -494,6 +544,8 @@ impl BatchingEngine { } ); + let batch_opts = self.batch_opts_for_flow_options(&flow_options)?; + let mut source_table_names = Vec::with_capacity(2); for src_id in source_table_ids { // also check table option to see if ttl!=instant @@ -509,6 +561,11 @@ impl BatchingEngine { ), } ); + Self::ensure_incremental_source_append_only( + &batch_opts, + &table_name, + &table_info.table_info.meta.options.extra_options, + )?; source_table_names.push(table_name); } @@ -563,7 +620,7 @@ impl BatchingEngine { query_ctx, catalog_manager: self.catalog_manager.clone(), shutdown_rx: rx, - batch_opts: self.batch_opts.clone(), + batch_opts, flow_eval_interval: eval_interval.map(|secs| Duration::from_secs(secs as u64)), }; @@ -808,7 +865,7 @@ impl BatchingEngine { }); let res = task - .gen_exec_once( + .execute_once_serialized( &self.query_engine, &self.frontend_client, cur_dirty_window_cnt, @@ -946,6 +1003,76 @@ mod tests { ) } + #[tokio::test] + async fn test_flow_option_overrides_incremental_read_switch() { + let engine = new_test_engine().await; + + let default_opts = 
engine.batch_opts_for_flow_options(&HashMap::new()).unwrap(); + assert!(!default_opts.experimental_enable_incremental_read); + + let enabled_opts = engine + .batch_opts_for_flow_options(&HashMap::from([( + FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(), + "true".to_string(), + )])) + .unwrap(); + assert!(enabled_opts.experimental_enable_incremental_read); + } + + #[test] + fn test_table_options_enable_append_mode() { + assert!(!BatchingEngine::table_options_enable_append_mode( + &HashMap::new() + )); + assert!(!BatchingEngine::table_options_enable_append_mode( + &HashMap::from([(APPEND_MODE_KEY.to_string(), "false".to_string())]) + )); + assert!(BatchingEngine::table_options_enable_append_mode( + &HashMap::from([(APPEND_MODE_KEY.to_string(), "TRUE".to_string())]) + )); + } + + #[test] + fn test_incremental_source_append_only_enforcement() { + let table_name = [ + "greptime".to_string(), + "public".to_string(), + "numbers".to_string(), + ]; + let disabled_opts = BatchingModeOptions::default(); + let enabled_opts = BatchingModeOptions { + experimental_enable_incremental_read: true, + ..Default::default() + }; + let non_append_options = HashMap::new(); + let append_options = HashMap::from([(APPEND_MODE_KEY.to_string(), "true".to_string())]); + + BatchingEngine::ensure_incremental_source_append_only( + &disabled_opts, + &table_name, + &non_append_options, + ) + .expect("disabled incremental read should not require append-only source"); + BatchingEngine::ensure_incremental_source_append_only( + &enabled_opts, + &table_name, + &append_options, + ) + .expect("append-only source should be accepted when incremental read is enabled"); + + let err = BatchingEngine::ensure_incremental_source_append_only( + &enabled_opts, + &table_name, + &non_append_options, + ) + .expect_err("non-append source should be rejected when incremental read is enabled"); + assert!( + err.to_string() + .contains("Flow incremental read requires append-only source table"), + "{err}" + ); + } 
+ async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) { let query_engine = create_test_query_engine(); let ctx = QueryContext::arc(); diff --git a/src/flow/src/batching_mode/incremental_filter.rs b/src/flow/src/batching_mode/incremental_filter.rs deleted file mode 100644 index ddc58d0378..0000000000 --- a/src/flow/src/batching_mode/incremental_filter.rs +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use common_telemetry::tracing::debug; -use datafusion_expr::Expr; -use datatypes::schema::Schema; - -use crate::batching_mode::state::FilterExprInfo; -use crate::batching_mode::utils::IncrementalAggregateAnalysis; -use crate::{Error, FlowId}; - -pub(super) fn build_sink_dirty_time_window_filter_expr( - flow_id: FlowId, - analysis: &IncrementalAggregateAnalysis, - sink_schema: &Schema, - dirty_filter: Option<&FilterExprInfo>, -) -> Result, Error> { - let Some(dirty_filter) = dirty_filter else { - return Ok(None); - }; - - let Some(sink_filter_col) = - infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, dirty_filter) - else { - return Ok(None); - }; - - dirty_filter.predicate_for_col(&sink_filter_col) -} - -fn infer_sink_time_window_filter_col( - flow_id: FlowId, - analysis: &IncrementalAggregateAnalysis, - sink_schema: &Schema, - dirty_filter: &FilterExprInfo, -) -> Option { - if analysis.group_key_names.is_empty() { - return None; - } - - let is_timestamp_group_key = |name: &str| { - analysis.group_key_names.iter().any(|key| key == name) - && sink_schema - .column_schema_by_name(name) - .is_some_and(|col| col.data_type.is_timestamp()) - }; - - if is_timestamp_group_key(&dirty_filter.col_name) { - return Some(dirty_filter.col_name.clone()); - } - - let candidates = analysis - .group_key_names - .iter() - .filter(|name| is_timestamp_group_key(name)) - .cloned() - .collect::>(); - - match candidates.as_slice() { - [name] => Some(name.clone()), - [] => { - debug!( - "Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}", - flow_id, analysis.group_key_names - ); - None - } - _ => { - debug!( - "Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}", - flow_id, candidates - ); - None - } - } -} - -#[cfg(test)] -mod test { - use datatypes::prelude::ConcreteDataType; - use datatypes::schema::ColumnSchema; - use pretty_assertions::assert_eq; - - use super::*; - use 
crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL; - use crate::batching_mode::state::FilterExprInfo; - use crate::batching_mode::utils::IncrementalAggregateAnalysis; - - fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis { - IncrementalAggregateAnalysis { - group_key_names: group_key_names - .into_iter() - .map(|name| name.to_string()) - .collect(), - merge_columns: vec![], - literal_columns: vec![], - output_field_names: vec![], - unsupported_exprs: vec![], - } - } - - fn test_dirty_filter(col_name: &str) -> FilterExprInfo { - FilterExprInfo { - expr: datafusion_expr::col(col_name), - col_name: col_name.to_string(), - time_ranges: vec![], - window_size: chrono::Duration::seconds(1), - } - } - - fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema { - Schema::new( - columns - .into_iter() - .map(|(name, data_type)| ColumnSchema::new(name, data_type, true)) - .collect(), - ) - } - - #[test] - fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() { - let analysis = test_analysis_with_group_keys(vec!["ts", "host"]); - let sink_schema = test_sink_schema(vec![ - ("ts", ConcreteDataType::timestamp_millisecond_datatype()), - ("host", ConcreteDataType::string_datatype()), - ]); - let dirty_filter = test_dirty_filter("ts"); - - assert_eq!( - Some("ts".to_string()), - infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter) - ); - } - - #[test] - fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() { - let analysis = test_analysis_with_group_keys(vec!["host", "time_window"]); - let sink_schema = test_sink_schema(vec![ - ("host", ConcreteDataType::string_datatype()), - ( - "time_window", - ConcreteDataType::timestamp_millisecond_datatype(), - ), - ( - AUTO_CREATED_UPDATE_AT_TS_COL, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - ]); - let dirty_filter = test_dirty_filter("ts"); - - assert_eq!( - Some("time_window".to_string()), - 
infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter) - ); - } - - #[test] - fn test_infer_sink_time_window_filter_col_skips_global_aggregate() { - let analysis = test_analysis_with_group_keys(vec![]); - let sink_schema = test_sink_schema(vec![ - ("number", ConcreteDataType::uint32_datatype()), - ( - "time_window", - ConcreteDataType::timestamp_millisecond_datatype(), - ), - ]); - let dirty_filter = test_dirty_filter("ts"); - - assert_eq!( - None, - infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter) - ); - } - - #[test] - fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() { - let analysis = test_analysis_with_group_keys(vec!["host", "device"]); - let sink_schema = test_sink_schema(vec![ - ("host", ConcreteDataType::string_datatype()), - ("device", ConcreteDataType::string_datatype()), - ( - AUTO_CREATED_UPDATE_AT_TS_COL, - ConcreteDataType::timestamp_millisecond_datatype(), - ), - ]); - let dirty_filter = test_dirty_filter("ts"); - - assert_eq!( - None, - infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter) - ); - } - - #[test] - fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() { - let analysis = test_analysis_with_group_keys(vec!["ts", "time_window"]); - let sink_schema = test_sink_schema(vec![ - ("ts", ConcreteDataType::timestamp_millisecond_datatype()), - ( - "time_window", - ConcreteDataType::timestamp_millisecond_datatype(), - ), - ]); - let dirty_filter = test_dirty_filter("source_ts"); - - assert_eq!( - None, - infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter) - ); - } -} diff --git a/src/flow/src/batching_mode/state.rs b/src/flow/src/batching_mode/state.rs index 42b71a4ec7..c5fcc74143 100644 --- a/src/flow/src/batching_mode/state.rs +++ b/src/flow/src/batching_mode/state.rs @@ -66,12 +66,20 @@ pub struct TaskState { } impl TaskState { pub fn new(query_ctx: QueryContextRef, shutdown_rx: 
oneshot::Receiver<()>) -> Self { + Self::with_dirty_time_windows(query_ctx, shutdown_rx, DirtyTimeWindows::default()) + } + + pub fn with_dirty_time_windows( + query_ctx: QueryContextRef, + shutdown_rx: oneshot::Receiver<()>, + dirty_time_windows: DirtyTimeWindows, + ) -> Self { Self { query_ctx, last_update_time: Instant::now(), last_query_duration: Duration::from_secs(0), last_exec_time_millis: None, - dirty_time_windows: Default::default(), + dirty_time_windows, checkpoint_mode: CheckpointMode::FullSnapshot, checkpoints: Default::default(), incremental_disabled: false, @@ -264,6 +272,16 @@ impl DirtyTimeWindows { time_window_merge_threshold, } } + + #[cfg(test)] + pub(crate) fn max_filter_num_per_query(&self) -> usize { + self.max_filter_num_per_query + } + + #[cfg(test)] + pub(crate) fn time_window_merge_threshold(&self) -> usize { + self.time_window_merge_threshold + } } impl Default for DirtyTimeWindows { @@ -681,7 +699,7 @@ impl DirtyTimeWindows { } } -fn to_df_literal(value: Timestamp) -> Result { +pub(crate) fn to_df_literal(value: Timestamp) -> Result { let value = Value::from(value); let value = value .try_to_scalar_value(&value.data_type()) diff --git a/src/flow/src/batching_mode/task.rs b/src/flow/src/batching_mode/task.rs index 3cdf7899a6..cbd6a05cc2 100644 --- a/src/flow/src/batching_mode/task.rs +++ b/src/flow/src/batching_mode/task.rs @@ -27,7 +27,7 @@ use datafusion::datasource::DefaultTableSource; use datafusion::sql::unparser::expr_to_sql; use datafusion_common::DFSchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp}; +use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp, col, lit}; use datatypes::schema::Schema; use query::QueryEngineRef; use query::options::FLOW_INCREMENTAL_MODE; @@ -38,14 +38,16 @@ use sql::parsers::utils::is_tql; use store_api::mito_engine_options::MERGE_MODE_KEY; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; use 
table::table::adapter::DfTableProviderAdapter; -use tokio::sync::oneshot; use tokio::sync::oneshot::error::TryRecvError; +use tokio::sync::{Mutex, oneshot}; use tokio::time::Instant; use crate::batching_mode::BatchingModeOptions; use crate::batching_mode::checkpoint::checkpoint_mode_label; use crate::batching_mode::frontend_client::{FrontendClient, PeerDesc}; -use crate::batching_mode::state::{CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState}; +use crate::batching_mode::state::{ + CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState, to_df_literal, +}; use crate::batching_mode::table_creator::{QueryType, create_table_with_expr}; use crate::batching_mode::time_window::TimeWindowExpr; use crate::batching_mode::utils::{ @@ -67,12 +69,6 @@ use crate::{Error, FlowId}; mod ckpt; mod inc; -/// Maximum number of dirty time-window predicates attached to one incremental -/// SQL query. This keeps generated OR filters bounded so Substrait encoding and -/// downstream planning remain predictable; if the backlog is larger, the flow -/// drains one capped batch and postpones checkpoint advancement to a later run. -const MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS: usize = 4096; - /// The task's config, immutable once created #[derive(Clone)] pub struct TaskConfig { @@ -113,6 +109,10 @@ fn is_merge_mode_last_non_null(options: &HashMap) -> bool { pub struct BatchingTask { pub config: Arc, pub state: Arc>, + /// Serializes plan generation, execution, checkpoint advancement, and dirty + /// window restoration for this flow. Without this, a manual flush and the + /// background loop can process the same checkpoint range concurrently. + execution_lock: Arc>, } /// Arguments for creating batching task @@ -150,6 +150,16 @@ pub enum DirtyRestore { Unscoped(DirtyTimeWindows), } +struct ExecuteOnceOutcome { + new_query: Option, + /// Execution result of the generated insert plan. + /// + /// `Ok(Some((affected_rows, elapsed)))` means a query was executed. 
+ /// `Ok(None)` means no query was generated because there was no dirty signal. + /// `Err(_)` means plan generation or execution failed. + result: Result, Error>, +} + impl BatchingTask { #[allow(clippy::too_many_arguments)] pub fn try_new( @@ -168,6 +178,18 @@ impl BatchingTask { flow_eval_interval, }: TaskArgs<'_>, ) -> Result { + let mut state = TaskState::with_dirty_time_windows( + query_ctx.clone(), + shutdown_rx, + DirtyTimeWindows::new( + batch_opts.experimental_max_filter_num_per_query, + batch_opts.experimental_time_window_merge_threshold, + ), + ); + if !batch_opts.experimental_enable_incremental_read { + state.disable_incremental(); + } + Ok(Self { config: Arc::new(TaskConfig { flow_id, @@ -182,7 +204,8 @@ impl BatchingTask { batch_opts, flow_eval_interval, }), - state: Arc::new(RwLock::new(TaskState::new(query_ctx, shutdown_rx))), + state: Arc::new(RwLock::new(state)), + execution_lock: Arc::new(Mutex::new(())), }) } @@ -251,40 +274,75 @@ impl BatchingTask { .context(ExternalSnafu) } - pub async fn gen_exec_once( + pub(crate) async fn execute_once_serialized( &self, engine: &QueryEngineRef, frontend_client: &Arc, max_window_cnt: Option, ) -> Result, Error> { - if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? { + let outcome = self + .execute_once_serialized_with_outcome(engine, frontend_client, max_window_cnt) + .await; + outcome.result + } + + /// Executes one flow evaluation under `execution_lock` and keeps the + /// generated query context for the background loop's error logging/backoff. + async fn execute_once_serialized_with_outcome( + &self, + engine: &QueryEngineRef, + frontend_client: &Arc, + max_window_cnt: Option, + ) -> ExecuteOnceOutcome { + let _execution_guard = self.execution_lock.lock().await; + self.execute_once_unlocked(engine, frontend_client, max_window_cnt) + .await + } + + /// Executes one flow evaluation. Caller must hold `execution_lock`. 
+ async fn execute_once_unlocked( + &self, + engine: &QueryEngineRef, + frontend_client: &Arc, + max_window_cnt: Option, + ) -> ExecuteOnceOutcome { + let new_query = match self.gen_insert_plan_unlocked(engine, max_window_cnt).await { + Ok(new_query) => new_query, + Err(err) => { + return ExecuteOnceOutcome { + new_query: None, + result: Err(err), + }; + } + }; + + if let Some(new_query) = new_query { debug!("Generate new query: {}", new_query.plan); - let dirty_filter = match &new_query.dirty_restore { - DirtyRestore::Scoped(f) => Some(f), - _ => None, - }; - match self - .execute_logical_plan( + let res = self + .execute_logical_plan_unlocked( frontend_client, &new_query.plan, - dirty_filter, new_query.can_advance_checkpoints, ) - .await - { - Ok(result) => Ok(result), - Err(err) => { - self.handle_executed_query_failure(Some(&new_query)); - Err(err) - } + .await; + if res.is_err() { + self.handle_executed_query_failure(Some(&new_query)); + } + ExecuteOnceOutcome { + new_query: Some(new_query), + result: res, } } else { debug!("Generate no query"); - Ok(None) + ExecuteOnceOutcome { + new_query: None, + result: Ok(None), + } } } - pub async fn gen_insert_plan( + /// Generates the insert plan. Caller must reach this through the serialized path. + async fn gen_insert_plan_unlocked( &self, engine: &QueryEngineRef, max_window_cnt: Option, @@ -388,11 +446,11 @@ impl BatchingTask { Ok(()) } - pub async fn execute_logical_plan( + /// Executes the insert plan. Caller must reach this through the serialized path. + async fn execute_logical_plan_unlocked( &self, frontend_client: &Arc, plan: &LogicalPlan, - dirty_filter: Option<&FilterExprInfo>, can_advance_checkpoints: bool, ) -> Result, Error> { let instant = Instant::now(); @@ -426,8 +484,7 @@ impl BatchingTask { // For incremental-mode SQL queries, attempt to rewrite the delta aggregate // plan into a safe delta-LEFT-JOIN-sink form before deciding on extensions. 
let incremental_plan = if can_advance_checkpoints { - self.prepare_plan_for_incremental(&plan, dirty_filter) - .await? + self.prepare_plan_for_incremental(&plan).await? } else { None }; @@ -580,6 +637,112 @@ impl BatchingTask { }) } + fn restore_unscoped_dirty_windows(&self, dirty_windows: &DirtyTimeWindows) { + self.state + .write() + .unwrap() + .dirty_time_windows + .add_dirty_windows(dirty_windows); + } + + fn restore_unscoped_dirty_windows_on_err( + &self, + dirty_windows: &DirtyTimeWindows, + result: Result, + ) -> Result { + result.inspect_err(|_| { + self.restore_unscoped_dirty_windows(dirty_windows); + }) + } + + fn drain_dirty_windows_signal(&self) -> (bool, DirtyTimeWindows) { + let mut state = self.state.write().unwrap(); + let dirty_windows_to_restore = state.dirty_time_windows.clone(); + let is_dirty = !dirty_windows_to_restore.is_empty(); + state.dirty_time_windows.clean(); + (is_dirty, dirty_windows_to_restore) + } + + #[allow(clippy::too_many_arguments)] + async fn gen_unfiltered_plan_info( + &self, + engine: QueryEngineRef, + query_ctx: QueryContextRef, + sink_table_schema: Arc, + primary_key_indices: &[usize], + allow_partial: bool, + dirty_windows_to_restore: DirtyTimeWindows, + retention_filter: Option<(&str, Timestamp, &'static str)>, + ) -> Result { + let mut plan = self.restore_unscoped_dirty_windows_on_err( + &dirty_windows_to_restore, + gen_plan_with_matching_schema( + &self.config.query, + query_ctx, + engine, + sink_table_schema, + primary_key_indices, + allow_partial, + ) + .await, + )?; + + if let Some((col_name, lower_bound, context)) = retention_filter { + let lower = self.restore_unscoped_dirty_windows_on_err( + &dirty_windows_to_restore, + to_df_literal(lower_bound), + )?; + let retention_filter = col(col_name).gt_eq(lit(lower)); + let mut add_filter = AddFilterRewriter::new(retention_filter); + plan = self.restore_unscoped_dirty_windows_on_err( + &dirty_windows_to_restore, + plan.clone() + .rewrite(&mut add_filter) + 
.with_context(|_| DatafusionSnafu { + context: format!( + "Failed to apply {context} expire_after filter to plan:\n {}\n", + plan + ), + }) + .map(|rewrite| rewrite.data), + )?; + } + + Ok(PlanInfo { + plan, + dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore), + can_advance_checkpoints: true, + }) + } + + async fn gen_unfiltered_plan_info_if_dirty( + &self, + engine: QueryEngineRef, + query_ctx: QueryContextRef, + sink_table_schema: Arc, + primary_key_indices: &[usize], + allow_partial: bool, + retention_filter: Option<(&str, Timestamp, &'static str)>, + ) -> Result, Error> { + let (is_dirty, dirty_windows_to_restore) = self.drain_dirty_windows_signal(); + if !is_dirty { + debug!("Flow id={:?}, no new data, not update", self.config.flow_id); + return Ok(None); + } + + self.gen_unfiltered_plan_info( + engine, + query_ctx, + sink_table_schema, + primary_key_indices, + allow_partial, + dirty_windows_to_restore, + retention_filter, + ) + .await + .map(Some) + } + fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) { if let Some(query) = query { self.restore_dirty_windows_after_failure(query); @@ -626,33 +789,11 @@ impl BatchingTask { let min_refresh = self.config.batch_opts.experimental_min_refresh_duration; - let new_query = match self.gen_insert_plan(&engine, max_window_cnt).await { - Ok(new_query) => new_query, - Err(err) => { - common_telemetry::error!(err; "Failed to generate query for flow={}", self.config.flow_id); - // also sleep for a little while before try again to prevent flooding logs - tokio::time::sleep(min_refresh).await; - continue; - } - }; + let outcome = self + .execute_once_serialized_with_outcome(&engine, &frontend_client, max_window_cnt) + .await; - let res = if let Some(new_query) = &new_query { - let dirty_filter = match &new_query.dirty_restore { - DirtyRestore::Scoped(f) => Some(f), - _ => None, - }; - self.execute_logical_plan( - &frontend_client, - &new_query.plan, - dirty_filter, - 
new_query.can_advance_checkpoints, - ) - .await - } else { - Ok(None) - }; - - match res { + match outcome.result { // normal execute, sleep for some time before doing next query Ok(Some(_)) => { // can increase max_window_cnt to query more windows next time @@ -703,11 +844,10 @@ impl BatchingTask { } // TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed Err(err) => { - self.handle_executed_query_failure(new_query.as_ref()); METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT .with_label_values(&[&flow_id_str]) .inc(); - match new_query { + match outcome.new_query { Some(query) => { common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan); // TODO(discord9): add some backoff here? half the query time window or what @@ -743,6 +883,20 @@ impl BatchingTask { create_table_with_expr(&plan, &self.config.sink_table_name, &self.config.query_type) } + fn should_use_unfiltered_incremental_delta(&self) -> bool { + let state = self.state.read().unwrap(); + state.checkpoint_mode() == CheckpointMode::Incremental + && !state.is_incremental_disabled() + && matches!(self.config.query_type, QueryType::Sql) + } + + fn should_use_unfiltered_full_snapshot_seeding(&self) -> bool { + let state = self.state.read().unwrap(); + state.checkpoint_mode() == CheckpointMode::FullSnapshot + && !state.is_incremental_disabled() + && matches!(self.config.query_type, QueryType::Sql) + } + /// will merge and use the first ten time window in query async fn gen_query_with_time_window( &self, @@ -783,83 +937,35 @@ impl BatchingTask { self.config.flow_id ); // clean dirty time window too, this could be from create flow's check_execute - let (is_dirty, dirty_windows_to_restore) = { - let mut state = self.state.write().unwrap(); - let dirty_windows_to_restore = state.dirty_time_windows.clone(); - let is_dirty = !dirty_windows_to_restore.is_empty(); - state.dirty_time_windows.clean(); - (is_dirty, 
dirty_windows_to_restore) - }; - - if !is_dirty { - // no dirty data, hence no need to update - debug!("Flow id={:?}, no new data, not update", self.config.flow_id); - return Ok(None); - } - - let plan = match gen_plan_with_matching_schema( - &self.config.query, - query_ctx, - engine, - sink_table_schema.clone(), - primary_key_indices, - allow_partial, - ) - .await - { - Ok(plan) => plan, - Err(err) => { - self.state - .write() - .unwrap() - .dirty_time_windows - .add_dirty_windows(&dirty_windows_to_restore); - return Err(err); - } - }; - - return Ok(Some(PlanInfo { - plan, - dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore), - can_advance_checkpoints: true, - })); + return self + .gen_unfiltered_plan_info_if_dirty( + engine, + query_ctx, + sink_table_schema.clone(), + primary_key_indices, + allow_partial, + None, + ) + .await; } _ => { // Clean dirty windows for full-query/non-scoped paths, // such as TQL, that cannot use a time-window filter. - let dirty_windows_to_restore = { - let mut state = self.state.write().unwrap(); - let dirty_windows_to_restore = state.dirty_time_windows.clone(); - state.dirty_time_windows.clean(); - dirty_windows_to_restore - }; + let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal(); - let plan = match gen_plan_with_matching_schema( - &self.config.query, - query_ctx, - engine, - sink_table_schema.clone(), - primary_key_indices, - allow_partial, - ) - .await - { - Ok(plan) => plan, - Err(err) => { - self.state - .write() - .unwrap() - .dirty_time_windows - .add_dirty_windows(&dirty_windows_to_restore); - return Err(err); - } - }; + let plan_info = self + .gen_unfiltered_plan_info( + engine, + query_ctx, + sink_table_schema.clone(), + primary_key_indices, + allow_partial, + dirty_windows_to_restore, + None, + ) + .await?; - return Ok(Some(PlanInfo { - plan, - dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore), - can_advance_checkpoints: true, - })); + return Ok(Some(plan_info)); } }; @@ -889,22 
+995,61 @@ impl BatchingTask { ), })?; + if self.should_use_unfiltered_full_snapshot_seeding() { + // A full-snapshot query that can seed/refresh incremental + // checkpoints must not use dirty-window predicates. Rows can be + // written after dirty windows are drained but before the source scan + // snapshot opens; a stale dirty-window filter could exclude those + // rows while the returned watermark includes them, causing the next + // incremental read to skip them forever. Execute an unfiltered full + // snapshot instead, and keep dirty windows only as the scheduling and + // failure-restoration signal. + let retention_filter = self + .config + .expire_after + .map(|_| (col_name.as_str(), expire_lower_bound, "full-snapshot")); + return self + .gen_unfiltered_plan_info_if_dirty( + engine, + query_ctx, + sink_table_schema.clone(), + primary_key_indices, + allow_partial, + retention_filter, + ) + .await; + } + + if self.should_use_unfiltered_incremental_delta() { + // In incremental mode, source correctness is defined by the + // per-region sequence range `(checkpoint, scan-open snapshot]`, not + // by dirty-window predicates. Dirty windows are only a scheduling + // signal here. Applying a stale dirty-window filter to the source can + // exclude rows that are inside the returned watermark and make a + // checkpoint advance skip them forever. The sink side is also left + // unfiltered by dirty windows; the incremental rewrite joins the + // delta groups with the full sink state for correctness. Future + // dynamic filters can prune sink reads as a pure optimization. 
+ let retention_filter = self + .config + .expire_after + .map(|_| (col_name.as_str(), expire_lower_bound, "incremental")); + return self + .gen_unfiltered_plan_info_if_dirty( + engine, + query_ctx, + sink_table_schema.clone(), + primary_key_indices, + allow_partial, + retention_filter, + ) + .await; + } + let (expr, can_advance_checkpoints) = { let mut state = self.state.write().unwrap(); - let window_cnt = if state.checkpoint_mode() == CheckpointMode::Incremental - && !state.is_incremental_disabled() - && matches!(self.config.query_type, QueryType::Sql) - { - // Incremental scans are bounded by region sequence checkpoints, - // so the dirty-window filter only narrows sink-side/time-window - // work. Drain more windows than normal, but keep a hard cap to - // avoid building a huge OR filter after a long downtime. If - // windows remain, checkpoints won't advance this round. - MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS - } else { - max_window_cnt - .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query) - }; + let window_cnt = max_window_cnt + .unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query); let expr = state.dirty_time_windows.gen_filter_exprs( &col_name, Some(expire_lower_bound), diff --git a/src/flow/src/batching_mode/task/inc.rs b/src/flow/src/batching_mode/task/inc.rs index 4fb64a676e..9af54c1ba7 100644 --- a/src/flow/src/batching_mode/task/inc.rs +++ b/src/flow/src/batching_mode/task/inc.rs @@ -26,8 +26,7 @@ use snafu::ResultExt; use table::metadata::TableId; use crate::Error; -use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr; -use crate::batching_mode::state::{CheckpointMode, FilterExprInfo}; +use crate::batching_mode::state::CheckpointMode; use crate::batching_mode::table_creator::QueryType; use crate::batching_mode::task::BatchingTask; use crate::batching_mode::utils::{ @@ -74,7 +73,6 @@ impl BatchingTask { pub(super) async fn prepare_plan_for_incremental( &self, plan: &LogicalPlan, - 
dirty_filter: Option<&FilterExprInfo>, ) -> Result, Error> { let is_incremental_sql = { let state = self.state.read().unwrap(); @@ -152,31 +150,12 @@ impl BatchingTask { return Ok(None); } }; - let sink_schema = sink_table.table_info().meta.schema.clone(); - let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr( - self.config.flow_id, - &analysis, - &sink_schema, - dirty_filter, - ) { - Ok(filter) => filter, - Err(err) => { - warn!( - "Flow {} failed to build sink dirty time window filter; \ - falling back to full snapshot for this round: {:?}", - self.config.flow_id, err - ); - self.state.write().unwrap().mark_full_snapshot(); - return Ok(None); - } - }; - let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge( &inner_plan, &analysis, sink_table, &self.config.sink_table_name, - sink_dirty_filter, + None, ) .await { diff --git a/src/flow/src/batching_mode/task/test.rs b/src/flow/src/batching_mode/task/test.rs index 959aeb00c9..d64b4ef1b9 100644 --- a/src/flow/src/batching_mode/task/test.rs +++ b/src/flow/src/batching_mode/task/test.rs @@ -25,7 +25,9 @@ use datatypes::data_type::ConcreteDataType as CDT; use datatypes::schema::ColumnSchema; use datatypes::vectors::{TimestampMillisecondVector, UInt32Vector, VectorRef}; use pretty_assertions::assert_eq; -use query::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY}; +use query::options::{ + FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY, QueryOptions, +}; use session::context::QueryContext; use table::test_util::MemTable; @@ -38,6 +40,13 @@ use crate::batching_mode::state::CheckpointMode; use crate::batching_mode::time_window::find_time_window_expr; use crate::test_utils::create_test_query_engine; +fn incremental_batch_opts() -> Arc { + Arc::new(BatchingModeOptions { + experimental_enable_incremental_read: true, + ..Default::default() + }) +} + async fn new_test_task_and_plan_with_missing_sink() -> (BatchingTask, LogicalPlan) { 
new_test_task_engine_and_plan_with_query( "SELECT number, ts FROM numbers_with_ts", @@ -60,6 +69,15 @@ impl TestTaskParts { } async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) -> TestTaskParts { + new_test_task_engine_and_plan_with_query_and_opts(query, sink_table, incremental_batch_opts()) + .await +} + +async fn new_test_task_engine_and_plan_with_query_and_opts( + query: &str, + sink_table: &str, + batch_opts: Arc, +) -> TestTaskParts { let query_engine = create_test_query_engine(); let ctx = QueryContext::arc(); let plan = sql_to_df_plan( @@ -91,7 +109,7 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) query_ctx: ctx, catalog_manager: query_engine.engine_state().catalog_manager().clone(), shutdown_rx: rx, - batch_opts: Arc::new(BatchingModeOptions::default()), + batch_opts, flow_eval_interval: None, }) .unwrap(); @@ -103,6 +121,75 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) } } +#[tokio::test] +async fn test_incremental_read_is_disabled_by_default() { + let task = new_test_task_engine_and_plan_with_query_and_opts( + "SELECT number, ts FROM numbers_with_ts", + "numbers_with_ts", + Arc::new(BatchingModeOptions::default()), + ) + .await + .task; + + assert!(task.state.read().unwrap().is_incremental_disabled()); +} + +#[tokio::test] +async fn test_dirty_time_windows_uses_batch_opts() { + let task = new_test_task_engine_and_plan_with_query_and_opts( + "SELECT number, ts FROM numbers_with_ts", + "numbers_with_ts", + Arc::new(BatchingModeOptions { + experimental_max_filter_num_per_query: 7, + experimental_time_window_merge_threshold: 11, + ..Default::default() + }), + ) + .await + .task; + + let state = task.state.read().unwrap(); + assert_eq!(7, state.dirty_time_windows.max_filter_num_per_query()); + assert_eq!(11, state.dirty_time_windows.time_window_merge_threshold()); +} + +#[tokio::test] +async fn test_execute_once_serialized_waits_for_execution_lock() { + let 
TestTaskParts { + task, query_engine, .. + } = new_test_task_engine_and_plan_with_query( + "SELECT number, ts FROM numbers_with_ts", + "missing_sink", + ) + .await; + let (frontend_client, _handler) = + FrontendClient::from_empty_grpc_handler(QueryOptions::default()); + let frontend_client = Arc::new(frontend_client); + + let guard = task.execution_lock.clone().lock_owned().await; + let task_to_run = task.clone(); + let query_engine_to_run = query_engine.clone(); + let frontend_client_to_run = frontend_client.clone(); + let exec = tokio::spawn(async move { + task_to_run + .execute_once_serialized(&query_engine_to_run, &frontend_client_to_run, None) + .await + }); + + tokio::time::sleep(Duration::from_millis(20)).await; + assert!( + !exec.is_finished(), + "execute_once_serialized should wait for execution_lock" + ); + + drop(guard); + tokio::time::timeout(Duration::from_secs(1), exec) + .await + .expect("execute_once_serialized should finish once execution_lock is released") + .expect("execute_once_serialized task should not panic") + .expect_err("missing sink should fail after acquiring execution_lock"); +} + async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts { let query_engine = create_test_query_engine(); let ctx = QueryContext::arc(); @@ -147,7 +234,7 @@ async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts { query_ctx: ctx, catalog_manager: query_engine.engine_state().catalog_manager().clone(), shutdown_rx: rx, - batch_opts: Arc::new(BatchingModeOptions::default()), + batch_opts: incremental_batch_opts(), flow_eval_interval: None, }) .unwrap(); @@ -226,6 +313,14 @@ fn dirty_range(start: i64, end: i64) -> DirtyTimeWindows { dirty } +fn expire_after_for_retention_filter_test() -> i64 { + let now_secs = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_secs(); + (now_secs - 10) as i64 +} + async fn assert_unscoped_failure_restore( consumed_dirty_windows: DirtyTimeWindows, 
current_dirty_windows: DirtyTimeWindows, @@ -626,6 +721,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after .await; { let mut state = task.state.write().unwrap(); + state.disable_incremental(); state .dirty_time_windows .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5))); @@ -657,7 +753,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after } #[tokio::test] -async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_safety() { +async fn test_incremental_plan_consumes_dirty_signal_for_checkpoint_safety() { let TestTaskParts { task, query_engine, @@ -692,6 +788,192 @@ async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_ assert!(task.state.read().unwrap().dirty_time_windows.is_empty()); } +#[tokio::test] +async fn test_full_snapshot_seeding_for_incremental_does_not_add_dirty_window_filter() { + let TestTaskParts { + task, + query_engine, + .. + } = new_time_window_test_task_with_query( + "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window", + ) + .await; + { + let mut state = task.state.write().unwrap(); + assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot); + assert!(!state.is_incremental_disabled()); + state + .dirty_time_windows + .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5))); + state + .dirty_time_windows + .add_window(Timestamp::new_second(30), Some(Timestamp::new_second(35))); + } + let sink_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("number", CDT::uint32_datatype(), false), + ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false) + .with_time_index(true), + ])); + + let plan = task + .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1)) + .await + .unwrap() + .unwrap(); + + let plan_text = plan.plan.to_string(); + assert!(plan.can_advance_checkpoints); + 
assert!(task.state.read().unwrap().dirty_time_windows.is_empty()); + assert!(!plan_text.contains("Filter:"), "{plan_text}"); +} + +#[tokio::test] +async fn test_full_snapshot_seeding_applies_expire_after_retention_filter() { + let TestTaskParts { + mut task, + query_engine, + .. + } = new_time_window_test_task_with_query( + "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window", + ) + .await; + { + let mut state = task.state.write().unwrap(); + assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot); + assert!(!state.is_incremental_disabled()); + state + .dirty_time_windows + .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5))); + } + let sink_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("number", CDT::uint32_datatype(), false), + ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false) + .with_time_index(true), + ])); + + Arc::get_mut(&mut task.config) + .expect("test task config should be uniquely owned") + .expire_after = Some(expire_after_for_retention_filter_test()); + let plan = task + .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1)) + .await + .unwrap() + .unwrap(); + + assert!(plan.can_advance_checkpoints); + assert!(task.state.read().unwrap().dirty_time_windows.is_empty()); + let plan_text = plan.plan.to_string(); + assert!( + plan_text.contains("Filter: ts >= TimestampMillisecond("), + "{plan_text}" + ); +} + +#[tokio::test] +async fn test_incremental_plan_does_not_add_dirty_window_filter() { + let TestTaskParts { + task, + query_engine, + .. 
+ } = new_time_window_test_task_with_query( + "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window", + ) + .await; + { + let mut state = task.state.write().unwrap(); + state.advance_checkpoints(HashMap::from([(1_u64, 10_u64)])); + state + .dirty_time_windows + .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5))); + } + let sink_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("number", CDT::uint32_datatype(), false), + ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false) + .with_time_index(true), + ])); + + let plan = task + .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1)) + .await + .unwrap() + .unwrap(); + + let plan_text = plan.plan.to_string(); + assert!(plan.can_advance_checkpoints); + assert!(!plan_text.contains("Filter:"), "{plan_text}"); +} + +#[tokio::test] +async fn test_incremental_delta_applies_expire_after_retention_filter() { + let TestTaskParts { + mut task, + query_engine, + .. 
+ } = new_time_window_test_task_with_query( + "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window", + ) + .await; + { + let mut state = task.state.write().unwrap(); + state.advance_checkpoints(HashMap::from([(1_u64, 10_u64)])); + state + .dirty_time_windows + .add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5))); + } + let sink_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("number", CDT::uint32_datatype(), false), + ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false) + .with_time_index(true), + ])); + + Arc::get_mut(&mut task.config) + .expect("test task config should be uniquely owned") + .expire_after = Some(expire_after_for_retention_filter_test()); + let plan = task + .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1)) + .await + .unwrap() + .unwrap(); + + assert!(plan.can_advance_checkpoints); + assert!(task.state.read().unwrap().dirty_time_windows.is_empty()); + let plan_text = plan.plan.to_string(); + assert!( + plan_text.contains("Filter: ts >= TimestampMillisecond("), + "{plan_text}" + ); +} + +#[tokio::test] +async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() { + let TestTaskParts { + mut task, + query_engine, + .. 
+ } = new_test_task_engine_and_plan_with_query( + "SELECT number, ts FROM numbers_with_ts", + "missing_sink", + ) + .await; + Arc::get_mut(&mut task.config) + .expect("test task config should be uniquely owned") + .query_type = QueryType::Tql; + task.state.write().unwrap().dirty_time_windows.clean(); + let sink_schema = Arc::new(Schema::new(vec![ + ColumnSchema::new("number", CDT::uint32_datatype(), false), + ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true), + ])); + + let plan = task + .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None) + .await + .unwrap() + .expect("non-scoped path should generate a plan even with an empty dirty signal"); + + assert!(plan.can_advance_checkpoints); + assert!(task.state.read().unwrap().dirty_time_windows.is_empty()); +} + #[tokio::test] async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() { let (task, plan) = new_test_task_and_plan_with_missing_sink().await; @@ -773,7 +1055,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() { query_ctx: ctx, catalog_manager: query_engine.engine_state().catalog_manager().clone(), shutdown_rx: rx, - batch_opts: Arc::new(BatchingModeOptions::default()), + batch_opts: incremental_batch_opts(), flow_eval_interval: None, }) .unwrap(); @@ -788,10 +1070,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() { CheckpointMode::Incremental ); - let incremental_plan = task - .prepare_plan_for_incremental(&dml_plan, None) - .await - .unwrap(); + let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap(); assert!(incremental_plan.is_none()); let state = task.state.read().unwrap(); assert!(state.is_incremental_disabled()); @@ -852,7 +1131,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite query_ctx: ctx, catalog_manager: query_engine.engine_state().catalog_manager().clone(), shutdown_rx: rx, - batch_opts: 
Arc::new(BatchingModeOptions::default()), + batch_opts: incremental_batch_opts(), flow_eval_interval: None, }) .unwrap(); @@ -866,10 +1145,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite CheckpointMode::Incremental ); - let incremental_plan = task - .prepare_plan_for_incremental(&dml_plan, None) - .await - .unwrap(); + let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap(); assert!(incremental_plan.is_none()); let state = task.state.read().unwrap(); assert!(!state.is_incremental_disabled()); @@ -928,7 +1204,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o query_ctx: ctx, catalog_manager: query_engine.engine_state().catalog_manager().clone(), shutdown_rx: rx, - batch_opts: Arc::new(BatchingModeOptions::default()), + batch_opts: incremental_batch_opts(), flow_eval_interval: None, }) .unwrap(); @@ -939,7 +1215,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o .advance_checkpoints(HashMap::from([(1_u64, 10_u64)])); let incremental_plan = task - .prepare_plan_for_incremental(&dml_plan, None) + .prepare_plan_for_incremental(&dml_plan) .await .unwrap() .expect("plain GROUP BY is incremental-safe without a rewrite"); @@ -962,7 +1238,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() { task.state.write().unwrap().dirty_time_windows.set_dirty(); let plan_info = task - .gen_insert_plan(&query_engine, None) + .gen_insert_plan_unlocked(&query_engine, None) .await .unwrap() .unwrap(); @@ -973,7 +1249,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() { .unwrap() .advance_checkpoints(HashMap::from([(1_u64, 10_u64)])); let incremental_plan = task - .prepare_plan_for_incremental(&plan_info.plan, None) + .prepare_plan_for_incremental(&plan_info.plan) .await .unwrap(); let incremental_safe = incremental_plan.is_some(); @@ -1078,11 +1354,11 @@ async fn 
test_insert_plan_matching_failure_restores_consumed_dirty_marker() { register_number_only_sink(&query_engine, sink_table); task.state.write().unwrap().dirty_time_windows.set_dirty(); - let result = task.gen_insert_plan(&query_engine, None).await; + let result = task.gen_insert_plan_unlocked(&query_engine, None).await; assert!(result.is_err()); let _err = match result { - Ok(_) => panic!("gen_insert_plan should fail with a sink column mismatch"), + Ok(_) => panic!("gen_insert_plan_unlocked should fail with a sink column mismatch"), Err(err) => err, }; let state = task.state.read().unwrap(); diff --git a/src/flow/src/batching_mode/utils/test.rs b/src/flow/src/batching_mode/utils/test.rs index 5b9cf7f507..317b0a5475 100644 --- a/src/flow/src/batching_mode/utils/test.rs +++ b/src/flow/src/batching_mode/utils/test.rs @@ -1288,9 +1288,10 @@ async fn test_rewrite_incremental_aggregate_with_left_join() { #[tokio::test] async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() { - // This verifies the rewrite placement when callers supply an already - // inferred sink dirty-window predicate. The task-level inference rules are - // covered by `infer_sink_time_window_filter_col` tests in task.rs. + // This verifies the rewrite placement when callers supply a sink predicate. + // The production incremental flow path currently leaves sink scans + // unfiltered for correctness and relies on future dynamic filters for + // pruning. 
let query_engine = create_test_query_engine(); let ctx = QueryContext::arc(); let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window"; diff --git a/src/flow/src/server.rs b/src/flow/src/server.rs index 913128f386..c2d986f645 100644 --- a/src/flow/src/server.rs +++ b/src/flow/src/server.rs @@ -566,11 +566,15 @@ impl FrontendInvoker { name: TABLE_FLOWNODE_SET_CACHE_NAME, })?; + // TODO(auto_create_table): flow sink tables are created through a controlled + // `CREATE FLOW` path, not client writes, so they are intentionally exempt from + // the frontend's global auto-create switch. Revisit if flow should honor it. let inserter = Arc::new(Inserter::new( catalog_manager.clone(), partition_manager.clone(), node_manager.clone(), table_flownode_cache, + true, )); let deleter = Arc::new(Deleter::new( diff --git a/src/frontend/src/frontend.rs b/src/frontend/src/frontend.rs index fb3b096f06..918185cb8f 100644 --- a/src/frontend/src/frontend.rs +++ b/src/frontend/src/frontend.rs @@ -44,6 +44,11 @@ pub struct FrontendOptions { pub node_id: Option, pub default_timezone: Option, pub default_column_prefix: Option, + /// Server-side global switch for auto table creation on write. + /// Acts as an upper bound: when `false`, missing tables are never auto-created + /// even if a request sets the `auto_create_table` hint to `true`. When `true` + /// (default), the per-request hint still applies. Default: `true`. + pub auto_create_table: bool, /// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight). /// Set to 0 to disable the limit. 
Default: "0" (unlimited) pub max_in_flight_write_bytes: ReadableSize, @@ -82,6 +87,7 @@ impl Default for FrontendOptions { node_id: None, default_timezone: None, default_column_prefix: None, + auto_create_table: true, max_in_flight_write_bytes: ReadableSize(0), write_bytes_exhausted_policy: OnExhaustedPolicy::default(), http: HttpOptions::default(), diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs index ff857ed768..526d8aac73 100644 --- a/src/frontend/src/instance/builder.rs +++ b/src/frontend/src/instance/builder.rs @@ -185,6 +185,7 @@ impl FrontendBuilder { partition_manager.clone(), node_manager.clone(), table_flownode_cache, + self.options.auto_create_table, )); let deleter = Arc::new(Deleter::new( self.catalog_manager.clone(), diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 434a413fed..757f657c1e 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -43,7 +43,12 @@ use servers::query_handler::{ }; use session::context::QueryContextRef; use snafu::{IntoError, ResultExt}; -use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM}; +use table::requests::{ + OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE, + SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_TRACE_HAS_EVENTS, + SEMANTIC_TRACE_HAS_LINKS, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC, + SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1, +}; use crate::instance::Instance; use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type; @@ -131,12 +136,14 @@ impl OpenTelemetryProtocolHandler for Instance { let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?; OTLP_METRICS_ROWS.inc_by(rows as u64); - let ctx = if !is_legacy { + let ctx = { let mut c = (*ctx).clone(); - c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string()); + 
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC); + c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY); + if !is_legacy { + c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string()); + } Arc::new(c) - } else { - ctx }; // If the user uses the legacy path, it is by default without metric engine. @@ -211,6 +218,15 @@ impl OpenTelemetryProtocolHandler for Instance { .get::>(); interceptor_ref.pre_execute(ctx.clone())?; + // `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here + // reaches the context that drives table auto-create. + let ctx = { + let mut c = (*ctx).clone(); + c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG); + c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY); + Arc::new(c) + }; + let opt_req = otlp::logs::to_grpc_insert_requests( request, pipeline, @@ -256,6 +272,23 @@ impl Instance { ctx: QueryContextRef, ) -> ServerResult { let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1); + + // Only the main span table gets the identity; the derived `_services` / + // `_operations` lookup tables keep the unstamped `ctx`. + let main_ctx = { + let mut c = (*ctx).clone(); + c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE); + c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY); + if is_trace_v1_model { + c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1); + c.set_extension(SEMANTIC_TRACE_HAS_EVENTS, "true"); + c.set_extension(SEMANTIC_TRACE_HAS_LINKS, "true"); + // schema_url is row-level, so conventions is unknown at table level. 
+ c.set_extension(SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN); + } + Arc::new(c) + }; + let ingest_ctx = TraceChunkIngestContext { pipeline_handler, pipeline, @@ -278,7 +311,7 @@ impl Instance { .map(|chunk| chunk.collect::>()) .collect::>(); for chunk in chunks { - self.ingest_trace_chunk(&ingest_ctx, chunk, ctx.clone(), &mut ingest_state) + self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state) .await?; } } diff --git a/src/meta-srv/src/procedure/repartition.rs b/src/meta-srv/src/procedure/repartition.rs index c1819cb364..8c7c1dfeff 100644 --- a/src/meta-srv/src/procedure/repartition.rs +++ b/src/meta-srv/src/procedure/repartition.rs @@ -440,7 +440,17 @@ impl Context { }; let _ = self .cache_invalidator - .invalidate(&ctx, &[CacheIdent::TableId(table_id)]) + .invalidate( + &ctx, + &[ + CacheIdent::TableId(table_id), + CacheIdent::TableName(TableName { + catalog_name: self.persistent_ctx.catalog_name.clone(), + schema_name: self.persistent_ctx.schema_name.clone(), + table_name: self.persistent_ctx.table_name.clone(), + }), + ], + ) .await; Ok(()) } diff --git a/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs b/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs index cc9ca1c9bb..8d68a6e8fe 100644 --- a/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs +++ b/src/meta-srv/src/procedure/repartition/update_partition_metadata.rs @@ -95,10 +95,19 @@ impl State for UpdatePartitionMetadata { let mut new_table_info = table_info_value.table_info.clone(); new_table_info.meta.partition_key_indices = partition_key_indices; + common_telemetry::info!( + "Update table partition metadata, table_id: {}, partition_key_indices: {:?}, partition_columns: {:?}", + table_id, + new_table_info.meta.partition_key_indices, + new_table_info + .meta + .partition_column_names() + .cloned() + .collect::>(), + ); ctx.update_table_info(&table_info_value, table_info_value.update(new_table_info)) .await?; - // We 
don't invalidate cache here because the subsequent AllocateRegion step - // will update the table route and invalidate the cache accordingly. + ctx.invalidate_table_cache().await?; Ok(( Box::new(AllocateRegion::new(self.plan_entries.clone())), diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 3e3a18a24d..99e3439879 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -50,6 +50,7 @@ datafusion-common.workspace = true datafusion-expr.workspace = true datatypes.workspace = true dashmap.workspace = true +derive_more.workspace = true dotenv.workspace = true either.workspace = true futures.workspace = true diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 4c83064371..8c98395292 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -150,6 +150,7 @@ impl CompactionScheduler { } /// Schedules a compaction for the region. + /// Returns whether a compaction is scheduled. #[allow(clippy::too_many_arguments)] pub(crate) async fn schedule_compaction( &mut self, @@ -161,7 +162,7 @@ impl CompactionScheduler { manifest_ctx: &ManifestContextRef, schema_metadata_manager: SchemaMetadataManagerRef, max_parallelism: usize, - ) -> Result<()> { + ) -> Result { // skip compaction if region is in staging state let current_state = manifest_ctx.current_state(); if current_state == RegionRoleState::Leader(RegionLeaderState::Staging) { @@ -170,7 +171,7 @@ impl CompactionScheduler { region_id, compact_options ); waiter.send(Ok(0)); - return Ok(()); + return Ok(false); } if let Some(status) = self.region_status.get_mut(®ion_id) { @@ -192,7 +193,7 @@ impl CompactionScheduler { ); } } - return Ok(()); + return Ok(false); } // The region can compact directly. 
@@ -209,7 +210,7 @@ impl CompactionScheduler { max_parallelism, ); - let result = match self + match self .schedule_compaction_request(request, compact_options) .await { @@ -220,14 +221,12 @@ impl CompactionScheduler { status.active_compaction = Some(active_compaction); self.region_status.insert(region_id, status); - Ok(()) + self.listener.on_compaction_scheduled(region_id); + Ok(true) } - Ok(None) => Ok(()), + Ok(None) => Ok(false), Err(e) => Err(e), - }; - - self.listener.on_compaction_scheduled(region_id); - result + } } // Handle pending manual compaction request for the region. @@ -334,6 +333,27 @@ impl CompactionScheduler { // And skip try to schedule next compaction task. return pending_ddl_requests; } + Vec::new() + } + + pub(crate) fn is_compacting(&self, region_id: RegionId) -> bool { + self.region_status + .get(®ion_id) + .map(|status| status.active_compaction.is_some()) + .unwrap_or(false) + } + + /// Schedules next compaction upon a finished compaction. + /// Returns whether the compaction is scheduled. + pub(crate) async fn schedule_next_compaction( + &mut self, + region_id: RegionId, + manifest_ctx: &ManifestContextRef, + schema_metadata_manager: SchemaMetadataManagerRef, + ) -> bool { + let Some(status) = self.region_status.get_mut(®ion_id) else { + return false; + }; // We should always try to compact the region until picker returns None. let request = status.new_compaction_request( @@ -364,20 +384,21 @@ impl CompactionScheduler { "Successfully scheduled next compaction for region id: {}", region_id ); + true } Ok(None) => { // No further compaction tasks can be scheduled; cleanup the `CompactionStatus` for this region. // All DDL requests and pending compaction requests have already been processed. // Safe to remove the region from status tracking. 
self.region_status.remove(®ion_id); + false } Err(e) => { error!(e; "Failed to schedule next compaction for region {}", region_id); self.remove_region_on_failure(region_id, Arc::new(e)); + false } } - - Vec::new() } /// Notifies the scheduler that the compaction job is cancelled cooperatively. @@ -1435,7 +1456,7 @@ mod tests { let manifest_ctx = env .mock_manifest_context(version_control.current().version.metadata.clone()) .await; - scheduler + let scheduled = scheduler .schedule_compaction( builder.region_id(), compact_request::Options::Regular(Default::default()), @@ -1448,6 +1469,7 @@ mod tests { ) .await .unwrap(); + assert!(!scheduled); let output = output_rx.await.unwrap().unwrap(); assert_eq!(output, 0); assert!(scheduler.region_status.is_empty()); @@ -1456,7 +1478,7 @@ mod tests { let version_control = Arc::new(builder.push_l0_file(0, 1000).build()); let (output_tx, output_rx) = oneshot::channel(); let waiter = OptionOutputTx::from(output_tx); - scheduler + let scheduled = scheduler .schedule_compaction( builder.region_id(), compact_request::Options::Regular(Default::default()), @@ -1469,11 +1491,67 @@ mod tests { ) .await .unwrap(); + assert!(!scheduled); let output = output_rx.await.unwrap().unwrap(); assert_eq!(output, 0); assert!(scheduler.region_status.is_empty()); } + #[tokio::test] + async fn test_schedule_compaction_returns_true_when_task_scheduled() { + let job_scheduler = Arc::new(VecScheduler::default()); + let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone()); + let (tx, _rx) = mpsc::channel(4); + let mut scheduler = env.mock_compaction_scheduler(tx); + let mut builder = VersionControlBuilder::new(); + let region_id = builder.region_id(); + let end = 1000 * 1000; + // Five overlapping L0 files are enough for the regular picker to create a task. 
+ let version_control = Arc::new( + builder + .push_l0_file(0, end) + .push_l0_file(10, end) + .push_l0_file(50, end) + .push_l0_file(80, end) + .push_l0_file(90, end) + .build(), + ); + let manifest_ctx = env + .mock_manifest_context(version_control.current().version.metadata.clone()) + .await; + let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager(); + schema_metadata_manager + .register_region_table_info( + region_id.table_id(), + "test_table", + "test_catalog", + "test_schema", + None, + kv_backend, + ) + .await; + + let scheduled = scheduler + .schedule_compaction( + region_id, + Options::Regular(Default::default()), + &version_control, + &env.access_layer, + OptionOutputTx::none(), + &manifest_ctx, + schema_metadata_manager, + 1, + ) + .await + .unwrap(); + + // The boolean result is what the worker uses to decide whether to update + // last_schedule_compaction_millis. + assert!(scheduled); + assert_eq!(1, job_scheduler.num_jobs()); + assert!(scheduler.region_status.contains_key(®ion_id)); + } + #[tokio::test] async fn test_schedule_on_finished() { common_telemetry::init_default_ut_logging(); @@ -1511,7 +1589,7 @@ mod tests { let manifest_ctx = env .mock_manifest_context(version_control.current().version.metadata.clone()) .await; - scheduler + let scheduled = scheduler .schedule_compaction( region_id, compact_request::Options::Regular(Default::default()), @@ -1525,6 +1603,7 @@ mod tests { .await .unwrap(); // Should schedule 1 compaction. + assert!(scheduled); assert_eq!(1, scheduler.region_status.len()); assert_eq!(1, job_scheduler.num_jobs()); let data = version_control.current(); @@ -1543,7 +1622,7 @@ mod tests { ); // The task is pending. 
let (tx, _rx) = oneshot::channel(); - scheduler + let scheduled = scheduler .schedule_compaction( region_id, compact_request::Options::Regular(Default::default()), @@ -1556,6 +1635,7 @@ mod tests { ) .await .unwrap(); + assert!(!scheduled); assert_eq!(1, scheduler.region_status.len()); assert_eq!(1, job_scheduler.num_jobs()); assert!( @@ -1571,6 +1651,10 @@ mod tests { scheduler .on_compaction_finished(region_id, &manifest_ctx, schema_metadata_manager.clone()) .await; + let scheduled = scheduler + .schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager.clone()) + .await; + assert!(scheduled); assert_eq!(1, scheduler.region_status.len()); assert_eq!(2, job_scheduler.num_jobs()); @@ -1583,7 +1667,7 @@ mod tests { ); let (tx, _rx) = oneshot::channel(); // The task is pending. - scheduler + let scheduled = scheduler .schedule_compaction( region_id, compact_request::Options::Regular(Default::default()), @@ -1596,6 +1680,7 @@ mod tests { ) .await .unwrap(); + assert!(!scheduled); assert_eq!(2, job_scheduler.num_jobs()); assert!( !scheduler @@ -2329,6 +2414,15 @@ mod tests { .await; assert!(pending_ddls.is_empty()); + assert!(scheduler.region_status.contains_key(®ion_id)); + + let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager(); + // With no compactable files, next scheduling returns false and removes + // the status without creating a background task. + let scheduled = scheduler + .schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager) + .await; + assert!(!scheduled); assert!(!scheduler.region_status.contains_key(®ion_id)); } @@ -2371,6 +2465,14 @@ mod tests { .await; assert!(pending_ddls.is_empty()); + assert!(scheduler.region_status.contains_key(®ion_id)); + + let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager(); + // The failing scheduler simulates a submit error; callers must see false. 
+ let scheduled = scheduler + .schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager) + .await; + assert!(!scheduled); assert!(!scheduler.region_status.contains_key(®ion_id)); } diff --git a/src/mito2/src/compaction/run.rs b/src/mito2/src/compaction/run.rs index ecd19666e2..1ba4569021 100644 --- a/src/mito2/src/compaction/run.rs +++ b/src/mito2/src/compaction/run.rs @@ -15,6 +15,9 @@ //! This file contains code to find sorted runs in a set if ranged items and //! along with the best way to merge these items to satisfy the desired run count. +use std::cmp::Ordering; +use std::collections::BinaryHeap; + use bytes::{Buf, Bytes}; use common_base::BitVec; use common_base::readable_size::ReadableSize; @@ -423,6 +426,133 @@ where runs } +pub(crate) fn find_sorted_runs_by_time_range(items: &mut [T]) -> Vec> +where + T: Item, +{ + if items.is_empty() { + return vec![]; + } + sort_ranged_items(items); + + use derive_more::{Eq, PartialEq}; + + /// `SortedRun` with a creation sequence `i`. + #[derive(PartialEq, Eq)] + struct Run { + i: usize, + #[partial_eq(skip)] + run: SortedRun, + } + + impl Run { + fn new(i: usize, item: &T) -> Run { + let mut run = SortedRun::default(); + run.push_item(item.clone()); + Run { i, run } + } + + fn push_item(&mut self, item: &T) { + self.run.push_item(item.clone()); + } + } + + impl PartialOrd for Run { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + /// Sort by run's `end` desc then `start` asc. + impl Ord for Run { + fn cmp(&self, other: &Self) -> Ordering { + let l_run = &self.run; + let r_run = &other.run; + + // Safety: `start` and `end` must both exist because it's guaranteed that whenever a + // `Run` is created, an item is pushed into it immediately (see its `new` method above). + // And there are no other ways to create a `Run` beyond its `new` method in this + // function's scope. 
+ let l_end = l_run.end.unwrap(); + let r_end = r_run.end.unwrap(); + r_end + .cmp(&l_end) + .then_with(|| { + let l_start = l_run.start.unwrap(); + let r_start = r_run.start.unwrap(); + l_start.cmp(&r_start) + }) + .then_with(|| self.i.cmp(&other.i)) + } + } + + /// Wrapper around the `Run` above, to support sorting them by their creation sequence `i`. + #[derive(PartialEq, Eq)] + struct Wrapper(Run); + + impl PartialOrd for Wrapper { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl Ord for Wrapper { + fn cmp(&self, other: &Self) -> Ordering { + other.0.i.cmp(&self.0.i) + } + } + + // Two heaps for finding a run that is both: + // 1. not overlapping with item's range, + // 2. and is created earliest, + // when iterating the items. + // + // Heap 1 (`runs_sorted_by_end`) is for storing the runs of which top has the minimal "end" + // just about to overlap with the current selected item. + // + // Heap 2 (`runs_sort_by_index`) is for storing the runs that all have "end"s non-overlap with + // the current selected item, and of which top is the earliest created run. + // + // The finding of a suitable run basically works like this: + // 1. moves the runs in heap 1 to heap 2, until the top is overlapping with the current item; + // 2. now heap 2 has all the runs that can accept the current item, pop its top; + // 3. the top is the earliest created run, push the current item; + // 4. because the run has changed, push it back to heap 1; + // 5. check the next item. Important: we don't need to push the runs in heap 2 to 1, because + // the items are sorted by "start". When checking the next item, heap 2's runs must all have + // "end"s smaller than next item's "start". + // + // Actually the heap 2 is only for aligning with the runs selection outcomes in the original + // `find_sorted_runs` implementation. 
If we just need the invariant that each run has the + // non-overlapping items, we can get rid of heap 2 and make the codes simpler. + + let mut runs_sort_by_end = BinaryHeap::>::new(); + let mut runs_sort_by_index = BinaryHeap::>::new(); + let mut i = 0; + + for item in items { + let (start, _) = item.range(); + + while let Some(run) = runs_sort_by_end.pop_if(|x| x.run.end.unwrap() <= start) { + runs_sort_by_index.push(Wrapper(run)); + } + + let Some(mut run) = runs_sort_by_index.pop() else { + i += 1; + runs_sort_by_end.push(Run::new(i, item)); + continue; + }; + + run.0.push_item(item); + runs_sort_by_end.push(run.0); + } + + let mut runs = runs_sort_by_end.into_vec(); + runs.extend(runs_sort_by_index.into_vec().into_iter().map(|x| x.0)); + runs.sort_unstable_by_key(|run| run.i); + runs.into_iter().map(|x| x.run).collect() +} + /// Finds a set of files with minimum penalty to merge that can reduce the total num of runs. /// The penalty of merging is defined as the size of all overlapping files between two runs. 
pub fn reduce_runs(mut runs: Vec>) -> Vec { @@ -599,6 +729,8 @@ mod tests { expected_runs: &[Vec<(i64, i64)>], ) -> Vec> { let mut files = build_items(ranges); + let mut files_clone = files.clone(); + let runs = find_sorted_runs(&mut files); let result_file_ranges: Vec> = runs @@ -606,6 +738,13 @@ mod tests { .map(|r| r.items.iter().map(|f| f.range()).collect()) .collect(); assert_eq!(&expected_runs, &result_file_ranges); + + let runs_by_time_range = find_sorted_runs_by_time_range(&mut files_clone); + let results: Vec> = runs_by_time_range + .iter() + .map(|r| r.items.iter().map(|f| f.range()).collect()) + .collect(); + assert_eq!(&expected_runs, &results); runs } diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index 87dcac7279..cfa44d5045 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -22,14 +22,15 @@ use common_telemetry::{debug, info}; use common_time::Timestamp; use common_time::timestamp::TimeUnit; use common_time::timestamp_millis::BucketAligned; +use rayon::prelude::*; use store_api::storage::RegionId; use crate::compaction::buckets::infer_time_bucket; use crate::compaction::compactor::CompactionRegion; use crate::compaction::picker::{Picker, PickerOutput}; use crate::compaction::run::{ - FileGroup, Item, Ranged, find_sorted_runs, merge_primary_key_ranges, merge_seq_files, - primary_key_ranges_overlap, reduce_runs, + FileGroup, Item, Ranged, find_sorted_runs, find_sorted_runs_by_time_range, + merge_primary_key_ranges, merge_seq_files, primary_key_ranges_overlap, reduce_runs, }; use crate::compaction::{CompactionOutput, get_expired_ssts}; use crate::sst::file::{FileHandle, Level, overlaps}; @@ -64,11 +65,10 @@ impl TwcsPicker { time_windows: &mut BTreeMap, active_window: Option, ) -> Vec { - let mut output = vec![]; - for (window, files) in time_windows { - if files.files.is_empty() { - continue; - } + let find_inputs = |files: &Window, + windows: &BTreeMap| + -> (Vec, bool) { + let 
window = &files.time_window; let mut files_to_merge: Vec<_> = files.files().cloned().collect(); // Filter out large files in append mode - they won't benefit from compaction @@ -88,13 +88,18 @@ impl TwcsPicker { ); } - let sorted_runs = find_sorted_runs(&mut files_to_merge); + let sorted_runs = if files_to_merge.len() < 1024 { + find_sorted_runs(&mut files_to_merge) + } else { + find_sorted_runs_by_time_range(&mut files_to_merge) + }; let found_runs = sorted_runs.len(); // We only remove deletion markers if we found less than 2 runs and not in append mode. // because after compaction there will be no overlapping files. - let filter_deleted = !files.overlapping && found_runs <= 2 && !self.append_mode; + let filter_deleted = + found_runs <= 2 && !self.append_mode && !window_has_overlap(files, windows); if found_runs == 0 { - continue; + return (vec![], filter_deleted); } let mut inputs = if found_runs > 1 { @@ -102,7 +107,7 @@ impl TwcsPicker { } else { let run = sorted_runs.last().unwrap(); if run.items().len() < self.trigger_file_num { - continue; + return (vec![], filter_deleted); } // no overlapping files, try merge small files merge_seq_files(run.items(), self.max_output_file_size) @@ -144,6 +149,26 @@ impl TwcsPicker { filter_deleted, &inputs, ); + } + (inputs, filter_deleted) + }; + + let mut output = vec![]; + let windows = time_windows + .values() + .filter(|w| !w.files.is_empty()) + .collect::>(); + let chunk_size = self.max_background_tasks.unwrap_or(windows.len()).max(1); + 'chunks: for chunk in windows.chunks(chunk_size) { + for (inputs, filter_deleted) in chunk + .par_iter() // parallelly calculate the inputs + .map(|window| find_inputs(window, time_windows)) + .collect::>() + { + if inputs.is_empty() { + continue; + } + output.push(CompactionOutput { output_level: LEVEL_COMPACTED, // always compact to l1 inputs: inputs.into_iter().flat_map(|fg| fg.into_files()).collect(), @@ -158,7 +183,7 @@ impl TwcsPicker { "Region ({:?}) compaction task size larger 
than max background tasks({}), remaining tasks discarded", region_id, max_background_tasks ); - break; + break 'chunks; } } } @@ -268,7 +293,6 @@ struct Window { // created from the same compaction task. files: HashMap, FileGroup>, time_window: i64, - overlapping: bool, primary_key_range: Option<(bytes::Bytes, bytes::Bytes)>, } @@ -283,7 +307,6 @@ impl Window { end, files, time_window: 0, - overlapping: false, primary_key_range, } } @@ -346,37 +369,21 @@ fn assign_to_windows<'a>( } } } - if windows.is_empty() { - return BTreeMap::new(); - } + windows.into_iter().collect() +} - let mut windows = windows.into_values().collect::>(); - windows.sort_unstable_by(|l, r| l.start.cmp(&r.start).then(l.end.cmp(&r.end).reverse())); - - for idx in 0..windows.len() { - let lhs_range = windows[idx].range(); - for next_idx in idx + 1..windows.len() { - let rhs_range = windows[next_idx].range(); - if rhs_range.0 > lhs_range.1 { - break; - } - - let windows_overlap = overlaps(&lhs_range, &rhs_range) - && match ( - &windows[idx].primary_key_range, - &windows[next_idx].primary_key_range, - ) { - (Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs), +fn window_has_overlap(this: &Window, windows: &BTreeMap) -> bool { + windows + .values() + .filter(|that| this.time_window != that.time_window) + .any(|that| { + overlaps(&this.range(), &that.range()) && { + match (&this.primary_key_range, &that.primary_key_range) { + (Some(l), Some(r)) => primary_key_ranges_overlap(l, r), _ => true, - }; - if windows_overlap { - windows[idx].overlapping = true; - windows[next_idx].overlapping = true; + } } - } - } - - windows.into_iter().map(|w| (w.time_window, w)).collect() + }) } /// Finds the latest active writing window among all files. 
@@ -606,7 +613,8 @@ mod tests { for (expected_window, overlapping, window_files) in expected_files { let actual_window = windows.get(expected_window).unwrap(); - assert_eq!(*overlapping, actual_window.overlapping); + let actual_overlapping = window_has_overlap(actual_window, &windows); + assert_eq!(*overlapping, actual_overlapping); let mut file_ranges = actual_window .files .values() @@ -744,7 +752,8 @@ mod tests { let windows = assign_to_windows(files.iter(), 2); - assert!(!windows.get(&2).unwrap().overlapping); + let overlapping = window_has_overlap(windows.get(&2).unwrap(), &windows); + assert!(!overlapping); } #[test] @@ -773,7 +782,8 @@ mod tests { let windows = assign_to_windows(files.iter(), 2); - assert!(!windows.get(&4).unwrap().overlapping); + let overlapping = window_has_overlap(windows.get(&4).unwrap(), &windows); + assert!(!overlapping); } struct CompactionPickerTestCase { diff --git a/src/mito2/src/engine/edit_region_test.rs b/src/mito2/src/engine/edit_region_test.rs index e05e1a847a..1d9bf2101b 100644 --- a/src/mito2/src/engine/edit_region_test.rs +++ b/src/mito2/src/engine/edit_region_test.rs @@ -21,6 +21,7 @@ use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_recordbatch::DfRecordBatch; use common_test_util::flight::encode_to_flight_data; +use common_time::Timestamp; use common_time::util::current_time_millis; use datatypes::arrow::array::{ArrayRef, Float64Array, StringArray, TimestampMillisecondArray}; use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; @@ -67,7 +68,8 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) { default_flat_format: flat_format, ..Default::default() }; - let time_provider = Arc::new(MockTimeProvider::new(current_time_millis())); + let initial_time = current_time_millis(); + let time_provider = Arc::new(MockTimeProvider::new(initial_time)); let engine = env .create_engine_with_time( config.clone(), @@ -99,14 +101,22 @@ async fn 
test_edit_region_schedule_compaction_with_format(flat_format: bool) { .await .unwrap(); let region = engine.get_region(region_id).unwrap(); + let initial_schedule_time = region.last_schedule_compaction_millis(); + assert_eq!(initial_time, initial_schedule_time); - let new_edit = || RegionEdit { - files_to_add: vec![FileMeta { - region_id: region.region_id, - file_id: FileId::random(), - level: 0, - ..Default::default() - }], + let new_edit = |file_starts: &[i64]| RegionEdit { + files_to_add: file_starts + .iter() + .map(|start| FileMeta { + region_id: region.region_id, + file_id: FileId::random(), + time_range: ( + Timestamp::new_millisecond(*start), + Timestamp::new_millisecond(1000 * 1000), + ), + ..Default::default() + }) + .collect(), files_to_remove: vec![], timestamp_ms: None, compaction_time_window: None, @@ -115,19 +125,23 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) { committed_sequence: None, }; engine - .edit_region(region.region_id, new_edit()) + .edit_region(region.region_id, new_edit(&[0, 10, 50, 80])) .await .unwrap(); // Asserts that the compaction of the region is not scheduled, // because the minimum time interval between two compactions is not passed. assert_eq!(rx.try_recv(), Err(oneshot::error::TryRecvError::Empty)); + assert_eq!( + initial_schedule_time, + region.last_schedule_compaction_millis() + ); // Simulates the time has passed the min compaction interval, - time_provider - .set_now(current_time_millis() + config.min_compaction_interval.as_millis() as i64); + let next_schedule_time = initial_time + config.min_compaction_interval.as_millis() as i64; + time_provider.set_now(next_schedule_time); // ... then edits the region again, engine - .edit_region(region.region_id, new_edit()) + .edit_region(region.region_id, new_edit(&[90])) .await .unwrap(); // ... finally asserts that the compaction of the region is scheduled. 
@@ -136,6 +150,9 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) { .unwrap() .unwrap(); assert_eq!(region_id, actual); + // Wait for the `last_schedule_compaction_millis` to update. + tokio::time::sleep(Duration::from_millis(100)).await; + assert_eq!(next_schedule_time, region.last_schedule_compaction_millis()); } #[tokio::test] diff --git a/src/mito2/src/lib.rs b/src/mito2/src/lib.rs index 7d43685ded..a452e106c7 100644 --- a/src/mito2/src/lib.rs +++ b/src/mito2/src/lib.rs @@ -18,6 +18,7 @@ #![feature(debug_closure_helpers)] #![feature(duration_constructors)] +#![feature(binary_heap_pop_if)] #[cfg(any(test, feature = "test"))] #[cfg_attr(feature = "test", allow(unused))] diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index 9d214caed3..4acec5a893 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -157,8 +157,8 @@ pub struct MitoRegion { pub(crate) provider: Provider, /// Last flush time in millis. last_flush_millis: AtomicI64, - /// Last compaction time in millis. - last_compaction_millis: AtomicI64, + /// Last schedule compaction time in millis. + last_schedule_compaction_millis: AtomicI64, /// Provider to get current time. time_provider: TimeProviderRef, /// The topic's latest entry id since the region's last flushing. @@ -251,15 +251,16 @@ impl MitoRegion { self.last_flush_millis.store(now, Ordering::Relaxed); } - /// Returns last compaction timestamp in millis. - pub(crate) fn last_compaction_millis(&self) -> i64 { - self.last_compaction_millis.load(Ordering::Relaxed) + /// Returns last schedule compaction timestamp in millis. + pub(crate) fn last_schedule_compaction_millis(&self) -> i64 { + self.last_schedule_compaction_millis.load(Ordering::Relaxed) } - /// Update compaction time to current time. - pub(crate) fn update_compaction_millis(&self) { + /// Update schedule compaction time to current time. 
+ pub(crate) fn update_schedule_compaction_millis(&self) { let now = self.time_provider.current_time_millis(); - self.last_compaction_millis.store(now, Ordering::Relaxed); + self.last_schedule_compaction_millis + .store(now, Ordering::Relaxed); } /// Returns the table dir. @@ -1727,7 +1728,7 @@ mod tests { file_purger: crate::test_util::new_noop_file_purger(), provider: Provider::noop_provider(), last_flush_millis: Default::default(), - last_compaction_millis: Default::default(), + last_schedule_compaction_millis: Default::default(), time_provider: Arc::new(StdTimeProvider), topic_latest_entry_id: Default::default(), written_bytes: Arc::new(AtomicU64::new(0)), @@ -2084,7 +2085,7 @@ mod tests { file_purger: crate::test_util::new_noop_file_purger(), provider: Provider::noop_provider(), last_flush_millis: Default::default(), - last_compaction_millis: Default::default(), + last_schedule_compaction_millis: Default::default(), time_provider: Arc::new(StdTimeProvider), topic_latest_entry_id: Default::default(), written_bytes: Arc::new(AtomicU64::new(0)), diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 6785181b48..3142a87c38 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -345,7 +345,7 @@ impl RegionOpener { ), provider, last_flush_millis: AtomicI64::new(now), - last_compaction_millis: AtomicI64::new(now), + last_schedule_compaction_millis: AtomicI64::new(now), time_provider: self.time_provider.clone(), topic_latest_entry_id: AtomicU64::new(0), written_bytes: Arc::new(AtomicU64::new(0)), @@ -581,7 +581,7 @@ impl RegionOpener { file_purger, provider: provider.clone(), last_flush_millis: AtomicI64::new(now), - last_compaction_millis: AtomicI64::new(now), + last_schedule_compaction_millis: AtomicI64::new(now), time_provider: self.time_provider.clone(), topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id), written_bytes: Arc::new(AtomicU64::new(0)), diff --git 
a/src/mito2/src/worker/handle_compaction.rs b/src/mito2/src/worker/handle_compaction.rs index fd021771b1..3abdf0e349 100644 --- a/src/mito2/src/worker/handle_compaction.rs +++ b/src/mito2/src/worker/handle_compaction.rs @@ -13,7 +13,7 @@ // limitations under the License. use api::v1::region::compact_request; -use common_telemetry::{error, info, warn}; +use common_telemetry::{debug, error, info}; use store_api::logstore::LogStore; use store_api::region_request::RegionCompactRequest; use store_api::storage::RegionId; @@ -80,7 +80,6 @@ impl RegionWorkerLoop { return; } }; - region.update_compaction_millis(); region.version_control.apply_edit( Some(request.edit.clone()), @@ -118,6 +117,31 @@ impl RegionWorkerLoop { ) .await; self.handle_ddl_requests(&mut pending_ddls).await; + + if self.compaction_scheduler.is_compacting(region_id) { + return; + } + + let now = self.time_provider.current_time_millis(); + if now - region.last_schedule_compaction_millis() + >= self.config.min_compaction_interval.as_millis() as i64 + { + debug!( + "minimal compaction interval time {:?} has passed, scheduling next compaction", + self.config.min_compaction_interval + ); + if self + .compaction_scheduler + .schedule_next_compaction( + region_id, + ®ion.manifest_ctx, + self.schema_metadata_manager.clone(), + ) + .await + { + region.update_schedule_compaction_millis(); + } + } } pub(crate) async fn handle_compaction_cancelled( @@ -160,9 +184,14 @@ impl RegionWorkerLoop { return; } let now = self.time_provider.current_time_millis(); - if now - region.last_compaction_millis() + if now - region.last_schedule_compaction_millis() >= self.config.min_compaction_interval.as_millis() as i64 - && let Err(e) = self + { + debug!( + "minimal compaction interval time {:?} has passed, scheduling next compaction", + self.config.min_compaction_interval + ); + match self .compaction_scheduler .schedule_compaction( region.region_id, @@ -175,11 +204,13 @@ impl RegionWorkerLoop { 1, // Default for automatic 
compaction ) .await - { - warn!( - "Failed to schedule compaction for region: {}, err: {}", - region.region_id, e - ); + { + Ok(true) => region.update_schedule_compaction_millis(), + Ok(false) => {} + Err(e) => { + error!(e; "Failed to schedule compaction for region: {}", region.region_id) + } + } } } } diff --git a/src/object-store/Cargo.toml b/src/object-store/Cargo.toml index 0815f066ba..9ca68bb780 100644 --- a/src/object-store/Cargo.toml +++ b/src/object-store/Cargo.toml @@ -24,7 +24,7 @@ derive_builder = { workspace = true, optional = true } futures.workspace = true humantime-serde.workspace = true lazy_static.workspace = true -opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7", features = [ +opendal = { version = "0.57", features = [ "layers-tracing", "layers-prometheus", "services-azblob", diff --git a/src/object-store/src/compat.rs b/src/object-store/src/compat.rs deleted file mode 100644 index 4498f8f3be..0000000000 --- a/src/object-store/src/compat.rs +++ /dev/null @@ -1,1045 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::collections::HashMap; -use std::fmt::{self, Debug, Display, Formatter}; -use std::future::IntoFuture; -use std::io; -use std::ops::Range; -use std::sync::Arc; - -use async_trait::async_trait; -use bytes::Bytes; -use datafusion_object_store::path::Path; -use datafusion_object_store::{ - Attribute, Attributes, CopyMode, CopyOptions, GetOptions, GetRange, GetResult, - GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore as ArrowObjectStore, - PutMode, PutMultipartOptions, PutOptions, PutPayload, PutResult, UploadPart, -}; -use futures::stream::BoxStream; -use futures::{FutureExt, StreamExt, TryStreamExt}; -use opendal::options::CopyOptions as OpendalCopyOptions; -use opendal::raw::percent_decode_path; -use opendal::{Buffer, Operator, OperatorInfo, Writer}; -use tokio::sync::{Mutex, oneshot}; - -/// OpendalStore implements ObjectStore trait by using opendal. -/// -/// This allows users to use opendal as an object store without extra cost. -/// -/// Visit [`opendal::services`] for more information about supported services. -/// -/// ```no_run -/// use std::sync::Arc; -/// -/// use bytes::Bytes; -/// use object_store::path::Path; -/// use object_store::ObjectStore; -/// use object_store_opendal::OpendalStore; -/// use opendal::services::S3; -/// use opendal::{Builder, Operator}; -/// -/// #[tokio::main] -/// async fn main() { -/// let builder = S3::default() -/// .access_key_id("my_access_key") -/// .secret_access_key("my_secret_key") -/// .endpoint("my_endpoint") -/// .region("my_region"); -/// -/// // Create a new operator -/// let operator = Operator::new(builder).unwrap().finish(); -/// -/// // Create a new object store -/// let object_store = Arc::new(OpendalStore::new(operator)); -/// -/// let path = Path::from("data/nested/test.txt"); -/// let bytes = Bytes::from_static(b"hello, world! 
I am nested."); -/// -/// object_store.put(&path, bytes.clone().into()).await.unwrap(); -/// -/// let content = object_store -/// .get(&path) -/// .await -/// .unwrap() -/// .bytes() -/// .await -/// .unwrap(); -/// -/// assert_eq!(content, bytes); -/// } -/// ``` -#[derive(Clone)] -pub struct OpendalStore { - info: Arc, - inner: Operator, -} - -impl OpendalStore { - /// Create OpendalStore by given Operator. - pub fn new(op: Operator) -> Self { - Self { - info: op.info().into(), - inner: op, - } - } - - /// Get the Operator info. - pub fn info(&self) -> &OperatorInfo { - self.info.as_ref() - } - - /// Copy a file from one location to another. - async fn copy_request( - &self, - from: &Path, - to: &Path, - if_not_exists: bool, - ) -> datafusion_object_store::Result<()> { - let mut copy_options = OpendalCopyOptions::default(); - if if_not_exists { - copy_options.if_not_exists = true; - } - - // Perform the copy operation - self.inner - .copy_options( - &percent_decode_path(from.as_ref()), - &percent_decode_path(to.as_ref()), - copy_options, - ) - .await - .map_err(|err| { - if if_not_exists && err.kind() == opendal::ErrorKind::AlreadyExists { - datafusion_object_store::Error::AlreadyExists { - path: to.to_string(), - source: Box::new(err), - } - } else { - format_object_store_error(err, from.as_ref()) - } - })?; - - Ok(()) - } -} - -impl Debug for OpendalStore { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.debug_struct("OpendalStore") - .field("scheme", &self.info.scheme()) - .field("name", &self.info.name()) - .field("root", &self.info.root()) - .field("capability", &self.info.full_capability()) - .finish() - } -} - -impl Display for OpendalStore { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - let info = self.inner.info(); - write!( - f, - "Opendal({}, bucket={}, root={})", - info.scheme(), - info.name(), - info.root() - ) - } -} - -impl From for OpendalStore { - fn from(value: Operator) -> Self { - Self::new(value) - } -} - 
-#[async_trait] -impl ArrowObjectStore for OpendalStore { - async fn put_opts( - &self, - location: &Path, - bytes: PutPayload, - opts: PutOptions, - ) -> datafusion_object_store::Result { - let decoded_location = percent_decode_path(location.as_ref()); - let mut future_write = self - .inner - .write_with(&decoded_location, Buffer::from_iter(bytes)); - let opts_mode = opts.mode.clone(); - match opts.mode { - PutMode::Overwrite => {} - PutMode::Create => { - future_write = future_write.if_not_exists(true); - } - PutMode::Update(update_version) => { - let Some(etag) = update_version.e_tag else { - return Err(datafusion_object_store::Error::NotSupported { - source: Box::new(opendal::Error::new( - opendal::ErrorKind::Unsupported, - "etag is required for conditional put", - )), - }); - }; - future_write = future_write.if_match(etag.as_str()); - } - } - let rp = future_write.await.map_err(|err| { - match format_object_store_error(err, location.as_ref()) { - datafusion_object_store::Error::Precondition { path, source } - if opts_mode == PutMode::Create => - { - datafusion_object_store::Error::AlreadyExists { path, source } - } - e => e, - } - })?; - - let e_tag = rp.etag().map(|s| s.to_string()); - let version = rp.version().map(|s| s.to_string()); - - Ok(PutResult { e_tag, version }) - } - - async fn put_multipart_opts( - &self, - location: &Path, - opts: PutMultipartOptions, - ) -> datafusion_object_store::Result> { - const DEFAULT_CONCURRENT: usize = 8; - - let mut options = opendal::options::WriteOptions { - concurrent: DEFAULT_CONCURRENT, - ..Default::default() - }; - - let mut user_metadata = HashMap::new(); - - for (key, value) in opts.attributes.iter() { - match key { - Attribute::CacheControl => { - options.cache_control = Some(value.to_string()); - } - Attribute::ContentDisposition => { - options.content_disposition = Some(value.to_string()); - } - Attribute::ContentEncoding => { - options.content_encoding = Some(value.to_string()); - } - 
Attribute::ContentLanguage => continue, - Attribute::ContentType => { - options.content_type = Some(value.to_string()); - } - Attribute::Metadata(k) => { - user_metadata.insert(k.to_string(), value.to_string()); - } - _ => {} - } - } - - if !user_metadata.is_empty() { - options.user_metadata = Some(user_metadata); - } - - let decoded_location = percent_decode_path(location.as_ref()); - let writer = self - .inner - .writer_options(&decoded_location, options) - .await - .map_err(|err| format_object_store_error(err, location.as_ref()))?; - let upload = OpendalMultipartUpload::new(writer, location.clone()); - - Ok(Box::new(upload)) - } - - async fn get_opts( - &self, - location: &Path, - options: GetOptions, - ) -> datafusion_object_store::Result { - let raw_location = percent_decode_path(location.as_ref()); - let meta = { - let mut s = self.inner.stat_with(&raw_location); - if let Some(version) = &options.version { - s = s.version(version.as_str()) - } - if let Some(if_match) = &options.if_match { - s = s.if_match(if_match.as_str()); - } - if let Some(if_none_match) = &options.if_none_match { - s = s.if_none_match(if_none_match.as_str()); - } - if let Some(if_modified_since) = - options.if_modified_since.and_then(datetime_to_timestamp) - { - s = s.if_modified_since(if_modified_since); - } - if let Some(if_unmodified_since) = - options.if_unmodified_since.and_then(datetime_to_timestamp) - { - s = s.if_unmodified_since(if_unmodified_since); - } - s.await - .map_err(|err| format_object_store_error(err, location.as_ref()))? 
- }; - - let mut attributes = Attributes::new(); - if let Some(user_meta) = meta.user_metadata() { - for (key, value) in user_meta { - attributes.insert( - Attribute::Metadata(key.clone().into()), - value.clone().into(), - ); - } - } - - let meta = ObjectMeta { - location: location.clone(), - last_modified: meta - .last_modified() - .and_then(timestamp_to_datetime) - .unwrap_or_default(), - size: meta.content_length(), - e_tag: meta.etag().map(|x| x.to_string()), - version: meta.version().map(|x| x.to_string()), - }; - - if options.head { - return Ok(GetResult { - payload: GetResultPayload::Stream(Box::pin(futures::stream::empty())), - range: 0..0, - meta, - attributes, - }); - } - - let reader = { - let mut r = self.inner.reader_with(raw_location.as_ref()); - if let Some(version) = options.version { - r = r.version(version.as_str()); - } - if let Some(if_match) = options.if_match { - r = r.if_match(if_match.as_str()); - } - if let Some(if_none_match) = options.if_none_match { - r = r.if_none_match(if_none_match.as_str()); - } - if let Some(if_modified_since) = - options.if_modified_since.and_then(datetime_to_timestamp) - { - r = r.if_modified_since(if_modified_since); - } - if let Some(if_unmodified_since) = - options.if_unmodified_since.and_then(datetime_to_timestamp) - { - r = r.if_unmodified_since(if_unmodified_since); - } - r.await - .map_err(|err| format_object_store_error(err, location.as_ref()))? - }; - - let read_range = match options.range { - Some(GetRange::Bounded(r)) => { - if r.start >= r.end || r.start >= meta.size { - 0..0 - } else { - let end = r.end.min(meta.size); - r.start..end - } - } - Some(GetRange::Offset(r)) => { - if r < meta.size { - r..meta.size - } else { - 0..0 - } - } - Some(GetRange::Suffix(r)) if r < meta.size => (meta.size - r)..meta.size, - _ => 0..meta.size, - }; - - let stream = reader - .into_bytes_stream(read_range.start..read_range.end) - .await - .map_err(|err| format_object_store_error(err, location.as_ref()))? 
- .map_ok(|buf| buf) - .map_err(|err: io::Error| datafusion_object_store::Error::Generic { - store: "IoError", - source: Box::new(err), - }); - - Ok(GetResult { - payload: GetResultPayload::Stream(Box::pin(stream)), - range: read_range.start..read_range.end, - meta, - attributes, - }) - } - - async fn get_ranges( - &self, - location: &Path, - ranges: &[Range], - ) -> datafusion_object_store::Result> { - if ranges.is_empty() { - return Ok(Vec::new()); - } - - let raw_location = percent_decode_path(location.as_ref()); - let reader = self - .inner - .reader_with(raw_location.as_ref()) - .await - .map_err(|err| format_object_store_error(err, location.as_ref()))?; - let buffers = reader - .fetch(ranges.to_vec()) - .await - .map_err(|err| format_object_store_error(err, location.as_ref()))?; - - Ok(buffers.into_iter().map(|buf| buf.to_bytes()).collect()) - } - - fn delete_stream( - &self, - locations: BoxStream<'static, datafusion_object_store::Result>, - ) -> BoxStream<'static, datafusion_object_store::Result> { - let this = self.clone(); - locations - .map(move |location| { - let this = this.clone(); - async move { - let location = location?; - let decoded_location = percent_decode_path(location.as_ref()); - this.inner - .delete(&decoded_location) - .await - .map_err(|err| format_object_store_error(err, location.as_ref()))?; - Ok(location) - } - }) - .buffered(10) - .boxed() - } - - fn list( - &self, - prefix: Option<&Path>, - ) -> BoxStream<'static, datafusion_object_store::Result> { - // object_store `Path` always removes trailing slash - // need to add it back - let path = prefix.map_or("".into(), |x| { - format!("{}/", percent_decode_path(x.as_ref())) - }); - - let this = self.clone(); - let fut = async move { - let stream = this - .inner - .lister_with(&path) - .recursive(true) - .await - .map_err(|err| format_object_store_error(err, &path))?; - - let stream = stream.then(|res| async { - let entry = res.map_err(|err| format_object_store_error(err, ""))?; - let meta 
= entry.metadata(); - - Ok(format_object_meta(entry.path(), meta)) - }); - Ok::<_, datafusion_object_store::Error>(stream) - }; - - fut.into_stream().try_flatten().boxed() - } - - fn list_with_offset( - &self, - prefix: Option<&Path>, - offset: &Path, - ) -> BoxStream<'static, datafusion_object_store::Result> { - let path = prefix.map_or("".into(), |x| { - format!("{}/", percent_decode_path(x.as_ref())) - }); - let offset = offset.clone(); - - // clone self for 'static lifetime - // clone self is cheap - let this = self.clone(); - - let fut = async move { - let list_with_start_after = this.inner.info().full_capability().list_with_start_after; - let mut fut = this.inner.lister_with(&path).recursive(true); - - // Use native start_after support if possible. - if list_with_start_after { - fut = fut.start_after(offset.as_ref()); - } - - let lister = fut - .await - .map_err(|err| format_object_store_error(err, &path))? - .then(move |entry| { - let path = path.clone(); - let this = this.clone(); - async move { - let entry = entry.map_err(|err| format_object_store_error(err, &path))?; - let (path, metadata) = entry.into_parts(); - - // If it's a dir or last_modified is present, we can use it directly. 
- if metadata.is_dir() || metadata.last_modified().is_some() { - let object_meta = format_object_meta(&path, &metadata); - return Ok(object_meta); - } - - let metadata = this - .inner - .stat(&path) - .await - .map_err(|err| format_object_store_error(err, &path))?; - let object_meta = format_object_meta(&path, &metadata); - Ok::<_, datafusion_object_store::Error>(object_meta) - } - }) - .boxed(); - - let stream = if list_with_start_after { - lister - } else { - lister - .try_filter(move |entry| futures::future::ready(entry.location > offset)) - .boxed() - }; - - Ok::<_, datafusion_object_store::Error>(stream) - }; - - fut.into_stream().try_flatten().boxed() - } - - async fn list_with_delimiter( - &self, - prefix: Option<&Path>, - ) -> datafusion_object_store::Result { - let path = prefix.map_or("".into(), |x| { - format!("{}/", percent_decode_path(x.as_ref())) - }); - let mut stream = self - .inner - .lister_with(&path) - .into_future() - .await - .map_err(|err| format_object_store_error(err, &path))?; - - let mut common_prefixes = Vec::new(); - let mut objects = Vec::new(); - - while let Some(res) = stream.next().await { - let entry = res.map_err(|err| format_object_store_error(err, ""))?; - let meta = entry.metadata(); - - if meta.is_dir() { - common_prefixes.push(entry.path().into()); - } else if meta.last_modified().is_some() { - objects.push(format_object_meta(entry.path(), meta)); - } else { - let meta = self - .inner - .stat(entry.path()) - .await - .map_err(|err| format_object_store_error(err, entry.path()))?; - objects.push(format_object_meta(entry.path(), &meta)); - } - } - - Ok(ListResult { - common_prefixes, - objects, - }) - } - - async fn copy_opts( - &self, - from: &Path, - to: &Path, - options: CopyOptions, - ) -> datafusion_object_store::Result<()> { - let if_not_exists = options.mode == CopyMode::Create; - self.copy_request(from, to, if_not_exists).await - } -} - -/// `MultipartUpload` implementation based on `Writer` in opendal. 
-/// -/// # Notes -/// -/// OpenDAL writer can handle concurrent internally we don't generate real `UploadPart` like existing -/// implementation do. Instead, we just write the part and notify the next task to be written. -/// -/// The lock here doesn't really involve the write process, it's just for the notify mechanism. -struct OpendalMultipartUpload { - writer: Arc>, - location: Path, - next_notify: oneshot::Receiver<()>, -} - -impl OpendalMultipartUpload { - fn new(writer: Writer, location: Path) -> Self { - // an immediately dropped sender for the first part to write without waiting - let (_, rx) = oneshot::channel(); - - Self { - writer: Arc::new(Mutex::new(writer)), - location, - next_notify: rx, - } - } -} - -#[async_trait] -impl MultipartUpload for OpendalMultipartUpload { - fn put_part(&mut self, data: PutPayload) -> UploadPart { - let writer = self.writer.clone(); - let location = self.location.clone(); - - // Generate next notify which will be notified after the current part is written. - let (tx, rx) = oneshot::channel(); - // Fetch the notify for current part to wait for it to be written. 
- let last_rx = std::mem::replace(&mut self.next_notify, rx); - - async move { - // Wait for the previous part to be written - let _ = last_rx.await; - - let mut writer = writer.lock().await; - let result = writer - .write(Buffer::from_iter(data)) - .await - .map_err(|err| format_object_store_error(err, location.as_ref())); - - // Notify the next part to be written - drop(tx); - - result - } - .boxed() - } - - async fn complete(&mut self) -> datafusion_object_store::Result { - let mut writer = self.writer.lock().await; - let metadata = writer - .close() - .await - .map_err(|err| format_object_store_error(err, self.location.as_ref()))?; - - let e_tag = metadata.etag().map(|s| s.to_string()); - let version = metadata.version().map(|s| s.to_string()); - - Ok(PutResult { e_tag, version }) - } - - async fn abort(&mut self) -> datafusion_object_store::Result<()> { - let mut writer = self.writer.lock().await; - writer - .abort() - .await - .map_err(|err| format_object_store_error(err, self.location.as_ref())) - } -} - -impl Debug for OpendalMultipartUpload { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.debug_struct("OpendalMultipartUpload") - .field("location", &self.location) - .finish() - } -} - -fn format_object_store_error(err: opendal::Error, path: &str) -> datafusion_object_store::Error { - match err.kind() { - opendal::ErrorKind::NotFound => datafusion_object_store::Error::NotFound { - path: path.to_string(), - source: Box::new(err), - }, - opendal::ErrorKind::Unsupported => datafusion_object_store::Error::NotSupported { - source: Box::new(err), - }, - opendal::ErrorKind::AlreadyExists => datafusion_object_store::Error::AlreadyExists { - path: path.to_string(), - source: Box::new(err), - }, - opendal::ErrorKind::ConditionNotMatch => datafusion_object_store::Error::Precondition { - path: path.to_string(), - source: Box::new(err), - }, - kind => datafusion_object_store::Error::Generic { - store: kind.into_static(), - source: Box::new(err), - }, - } -} 
- -fn format_object_meta(path: &str, meta: &opendal::Metadata) -> ObjectMeta { - ObjectMeta { - location: path.into(), - last_modified: meta - .last_modified() - .and_then(timestamp_to_datetime) - .unwrap_or_default(), - size: meta.content_length(), - e_tag: meta.etag().map(|x| x.to_string()), - version: meta.version().map(|x| x.to_string()), - } -} - -fn timestamp_to_datetime(ts: opendal::raw::Timestamp) -> Option> { - let ts = ts.into_inner(); - chrono::DateTime::::from_timestamp(ts.as_second(), ts.subsec_nanosecond() as u32) -} - -fn datetime_to_timestamp(dt: chrono::DateTime) -> Option { - opendal::raw::Timestamp::new(dt.timestamp(), dt.timestamp_subsec_nanos() as i32).ok() -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use bytes::Bytes; - use datafusion_object_store::path::Path; - use datafusion_object_store::{ - ObjectStore as ArrowObjectStore, ObjectStoreExt, WriteMultipart, - }; - use opendal::{Operator, services}; - use rand::{Rng, RngCore}; - - use super::*; - - async fn create_test_object_store() -> Arc { - let op = Operator::new(services::Memory::default()).unwrap().finish(); - let object_store = Arc::new(OpendalStore::new(op)); - - let path: Path = "data/test.txt".into(); - let bytes = Bytes::from_static(b"hello, world!"); - object_store.put(&path, bytes.into()).await.unwrap(); - - let path: Path = "data/nested/test.txt".into(); - let bytes = Bytes::from_static(b"hello, world! 
I am nested."); - object_store.put(&path, bytes.into()).await.unwrap(); - - object_store - } - - #[tokio::test] - async fn test_basic() { - let op = Operator::new(services::Memory::default()).unwrap().finish(); - let object_store: Arc = Arc::new(OpendalStore::new(op)); - - // Retrieve a specific file - let path: Path = "data/test.txt".into(); - - let bytes = Bytes::from_static(b"hello, world!"); - object_store.put(&path, bytes.clone().into()).await.unwrap(); - - let meta = object_store.head(&path).await.unwrap(); - - assert_eq!(meta.size, 13); - - assert_eq!( - object_store - .get(&path) - .await - .unwrap() - .bytes() - .await - .unwrap(), - bytes - ); - } - - #[tokio::test] - async fn test_put_multipart() { - let op = Operator::new(services::Memory::default()).unwrap().finish(); - let object_store: Arc = Arc::new(OpendalStore::new(op)); - - let mut rng = rand::rng(); - - // Case complete - let path: Path = "data/test_complete.txt".into(); - let upload = object_store.put_multipart(&path).await.unwrap(); - - let mut write = WriteMultipart::new(upload); - - let mut all_bytes = vec![]; - let round = rng.random_range(1..=1024); - for _ in 0..round { - let size = rng.random_range(1..=1024); - let mut bytes = vec![0; size]; - rng.fill_bytes(&mut bytes); - - all_bytes.extend_from_slice(&bytes); - write.put(bytes.into()); - } - - let _ = write.finish().await.unwrap(); - - let meta = object_store.head(&path).await.unwrap(); - - assert_eq!(meta.size, all_bytes.len() as u64); - - assert_eq!( - object_store - .get(&path) - .await - .unwrap() - .bytes() - .await - .unwrap(), - Bytes::from(all_bytes) - ); - - // Case abort - let path: Path = "data/test_abort.txt".into(); - let mut upload = object_store.put_multipart(&path).await.unwrap(); - upload.put_part(vec![1; 1024].into()).await.unwrap(); - upload.abort().await.unwrap(); - - let res = object_store.head(&path).await; - let err = res.unwrap_err(); - - assert!(matches!( - err, - datafusion_object_store::Error::NotFound { .. 
} - )) - } - - #[tokio::test] - async fn test_list() { - let object_store = create_test_object_store().await; - let path: Path = "data/".into(); - let results = object_store.list(Some(&path)).collect::>().await; - assert_eq!(results.len(), 2); - let mut locations = results - .iter() - .map(|x| x.as_ref().unwrap().location.as_ref()) - .collect::>(); - - let expected_files = vec![ - ( - "data/nested/test.txt", - Bytes::from_static(b"hello, world! I am nested."), - ), - ("data/test.txt", Bytes::from_static(b"hello, world!")), - ]; - - let expected_locations = expected_files.iter().map(|x| x.0).collect::>(); - - locations.sort(); - assert_eq!(locations, expected_locations); - - for (location, bytes) in expected_files { - let path: Path = location.into(); - assert_eq!( - object_store - .get(&path) - .await - .unwrap() - .bytes() - .await - .unwrap(), - bytes - ); - } - } - - #[tokio::test] - async fn test_list_with_delimiter() { - let object_store = create_test_object_store().await; - let path: Path = "data/".into(); - let result = object_store.list_with_delimiter(Some(&path)).await.unwrap(); - assert_eq!(result.objects.len(), 1); - assert_eq!(result.common_prefixes.len(), 1); - assert_eq!(result.objects[0].location.as_ref(), "data/test.txt"); - assert_eq!(result.common_prefixes[0].as_ref(), "data/nested"); - } - - #[tokio::test] - async fn test_list_with_offset() { - let object_store = create_test_object_store().await; - let path: Path = "data/".into(); - let offset: Path = "data/nested/test.txt".into(); - let result = object_store - .list_with_offset(Some(&path), &offset) - .collect::>() - .await; - assert_eq!(result.len(), 1); - assert_eq!( - result[0].as_ref().unwrap().location.as_ref(), - "data/test.txt" - ); - } - - mod stat_counter { - use std::sync::atomic::{AtomicUsize, Ordering}; - - use super::*; - - #[derive(Debug, Clone)] - pub struct StatCounterLayer { - count: Arc, - } - - impl StatCounterLayer { - pub fn new(count: Arc) -> Self { - Self { count } - } - } 
- - impl opendal::raw::Layer for StatCounterLayer { - type LayeredAccess = StatCounterAccessor; - - fn layer(&self, inner: A) -> Self::LayeredAccess { - StatCounterAccessor { - inner, - count: self.count.clone(), - } - } - } - - #[derive(Debug, Clone)] - pub struct StatCounterAccessor { - inner: A, - count: Arc, - } - - impl opendal::raw::LayeredAccess for StatCounterAccessor { - type Inner = A; - type Reader = A::Reader; - type Writer = A::Writer; - type Lister = A::Lister; - type Deleter = A::Deleter; - - fn inner(&self) -> &Self::Inner { - &self.inner - } - - async fn stat( - &self, - path: &str, - args: opendal::raw::OpStat, - ) -> opendal::Result { - self.count.fetch_add(1, Ordering::SeqCst); - self.inner.stat(path, args).await - } - - async fn read( - &self, - path: &str, - args: opendal::raw::OpRead, - ) -> opendal::Result<(opendal::raw::RpRead, Self::Reader)> { - self.inner.read(path, args).await - } - - async fn write( - &self, - path: &str, - args: opendal::raw::OpWrite, - ) -> opendal::Result<(opendal::raw::RpWrite, Self::Writer)> { - self.inner.write(path, args).await - } - - async fn delete(&self) -> opendal::Result<(opendal::raw::RpDelete, Self::Deleter)> { - self.inner.delete().await - } - - async fn list( - &self, - path: &str, - args: opendal::raw::OpList, - ) -> opendal::Result<(opendal::raw::RpList, Self::Lister)> { - self.inner.list(path, args).await - } - - async fn copy( - &self, - from: &str, - to: &str, - args: opendal::raw::OpCopy, - ) -> opendal::Result { - self.inner.copy(from, to, args).await - } - - async fn rename( - &self, - from: &str, - to: &str, - args: opendal::raw::OpRename, - ) -> opendal::Result { - self.inner.rename(from, to, args).await - } - } - } - - #[tokio::test] - async fn test_get_ranges_no_stat() { - use std::sync::atomic::{AtomicUsize, Ordering}; - - // Create a stat counter and operator with tracking layer - let stat_count = Arc::new(AtomicUsize::new(0)); - let op = Operator::new(opendal::services::Memory::default()) 
- .unwrap() - .layer(stat_counter::StatCounterLayer::new(stat_count.clone())) - .finish(); - let store = OpendalStore::new(op); - - // Create a test file - let location = "test_get_range.txt".into(); - let value = Bytes::from_static(b"Hello, world!"); - store.put(&location, value.clone().into()).await.unwrap(); - - // Reset counter after put - stat_count.store(0, Ordering::SeqCst); - - // Test 1: get_ranges should NOT call stat() - let range = 0..5; - let ret = store - .get_ranges(&location, std::slice::from_ref(&range)) - .await - .unwrap(); - assert_eq!(vec![Bytes::from_static(b"Hello")], ret); - assert_eq!( - stat_count.load(Ordering::SeqCst), - 0, - "get_ranges should not call stat()" - ); - - // Reset counter - stat_count.store(0, Ordering::SeqCst); - - // Test 2: get_opts SHOULD call stat() to get metadata - let opts = datafusion_object_store::GetOptions { - range: Some(datafusion_object_store::GetRange::Bounded(0..5)), - ..Default::default() - }; - let ret = store.get_opts(&location, opts).await.unwrap(); - let data = ret.bytes().await.unwrap(); - assert_eq!(Bytes::from_static(b"Hello"), data); - assert!( - stat_count.load(Ordering::SeqCst) > 0, - "get_opts should call stat() to get metadata" - ); - - // Cleanup - store.delete(&location).await.unwrap(); - } -} diff --git a/src/object-store/src/layers/mock.rs b/src/object-store/src/layers/mock.rs index 3df8aae535..f4d3df54d7 100644 --- a/src/object-store/src/layers/mock.rs +++ b/src/object-store/src/layers/mock.rs @@ -21,7 +21,7 @@ pub use opendal::raw::{ Access, Layer, LayeredAccess, OpDelete, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead, RpWrite, oio, }; -use opendal::raw::{OpCopy, RpCopy}; +use opendal::raw::{OpCopier, OpCopy, RpCopy}; pub use opendal::{Buffer, Error, ErrorKind, Metadata, Result}; pub type MockWriterFactory = Arc oio::Writer + Send + Sync>; @@ -146,6 +146,7 @@ impl LayeredAccess for MockAccessor { type Writer = MockWriter; type Lister = MockLister; type Deleter = MockDeleter; + type 
Copier = oio::Copier; fn inner(&self) -> &Self::Inner { &self.inner @@ -222,15 +223,24 @@ impl LayeredAccess for MockAccessor { } } - async fn copy(&self, from: &str, to: &str, args: OpCopy) -> Result { - let Some(copy_interceptor) = self.copy_interceptor.as_ref() else { - return self.inner.copy(from, to, args).await; - }; + async fn copy( + &self, + from: &str, + to: &str, + args: OpCopy, + opts: OpCopier, + ) -> Result<(RpCopy, Self::Copier)> { + if let Some(result) = self + .copy_interceptor + .as_ref() + .and_then(|copy_interceptor| copy_interceptor(from, to, args.clone())) + { + return result.map(|rp_copy| (rp_copy, Box::new(()) as oio::Copier)); + } - let Some(result) = copy_interceptor(from, to, args.clone()) else { - return self.inner.copy(from, to, args).await; - }; - - result + self.inner + .copy(from, to, args, opts) + .await + .map(|(rp_copy, copier)| (rp_copy, Box::new(copier) as oio::Copier)) } } diff --git a/src/object-store/src/lib.rs b/src/object-store/src/lib.rs index 3a5f72c5ce..f1f8b59082 100644 --- a/src/object-store/src/lib.rs +++ b/src/object-store/src/lib.rs @@ -18,7 +18,6 @@ pub use opendal::{ FuturesAsyncWriter, Lister, Operator as ObjectStore, Reader, Result, Writer, services, }; -pub mod compat; pub mod config; pub mod error; pub mod factory; diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index ff8ed2b78b..317c324429 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -63,6 +63,7 @@ use table::metadata::TableInfo; use table::requests::{ AUTO_CREATE_TABLE_KEY, InsertRequest as TableInsertRequest, TABLE_DATA_MODEL, TABLE_DATA_MODEL_TRACE_V1, TRACE_TABLE_PARTITIONS_HINT_KEY, VALID_TABLE_OPTION_KEYS, + is_semantic_option_key, }; use table::table_reference::TableReference; @@ -83,6 +84,10 @@ pub struct Inserter { pub(crate) partition_manager: PartitionRuleManagerRef, pub(crate) node_manager: NodeManagerRef, pub(crate) table_flownode_set_cache: TableFlownodeSetCacheRef, + /// Server-side upper 
bound for auto table creation on write. + /// When `false`, missing tables are never auto-created regardless of the + /// per-request `auto_create_table` hint. When `true`, the hint still applies. + auto_create_table: bool, } pub type InserterRef = Arc; @@ -135,12 +140,14 @@ impl Inserter { partition_manager: PartitionRuleManagerRef, node_manager: NodeManagerRef, table_flownode_set_cache: TableFlownodeSetCacheRef, + auto_create_table: bool, ) -> Self { Self { catalog_manager, partition_manager, node_manager, table_flownode_set_cache, + auto_create_table, } } @@ -469,6 +476,30 @@ impl Inserter { Ok(inserts) } + /// Returns `None` if auto table creation is allowed, or `Some(reason)` if + /// disabled by either the global config or the request hint. The reason tells + /// which one, for a clearer error. + fn auto_create_disabled_reason(&self, ctx: &QueryContextRef) -> Result> { + let auto_create_table_hint = ctx + .extension(AUTO_CREATE_TABLE_KEY) + .map(|v| v.parse::()) + .transpose() + .map_err(|_| { + InvalidInsertRequestSnafu { + reason: "`auto_create_table` hint must be a boolean", + } + .build() + })? + .unwrap_or(true); + Ok(if !self.auto_create_table { + Some("auto-create table is disabled by frontend config") + } else if !auto_create_table_hint { + Some("`auto_create_table` hint is disabled") + } else { + None + }) + } + /// Creates or alter tables on demand: /// - if table does not exist, create table by inferred CreateExpr /// - if table exist, check if schema matches. If any new column found, alter table by inferred `AlterExpr` @@ -498,19 +529,7 @@ impl Inserter { let schema = ctx.current_schema(); let mut table_infos = HashMap::new(); - // If `auto_create_table` hint is disabled, skip creating/altering tables. - let auto_create_table_hint = ctx - .extension(AUTO_CREATE_TABLE_KEY) - .map(|v| v.parse::()) - .transpose() - .map_err(|_| { - InvalidInsertRequestSnafu { - reason: "`auto_create_table` hint must be a boolean", - } - .build() - })? 
- .unwrap_or(true); - if !auto_create_table_hint { + if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? { let mut instant_table_ids = HashSet::new(); for req in &requests.inserts { let table = self @@ -518,8 +537,8 @@ impl Inserter { .await? .context(InvalidInsertRequestSnafu { reason: format!( - "Table `{}` does not exist, and `auto_create_table` hint is disabled", - req.table_name + "Table `{}` does not exist, and {}", + req.table_name, disabled_reason ), })?; let table_info = table.table_info(); @@ -767,6 +786,16 @@ impl Inserter { return Ok(()); } + // Gate here too, otherwise a disabled switch would still leak the physical table. + if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? { + return InvalidInsertRequestSnafu { + reason: format!( + "Physical table `{physical_table}` does not exist, and {disabled_reason}" + ), + } + .fail(); + } + let table_reference = TableReference::full(catalog_name, &schema_name, &physical_table); info!("Physical metric table `{table_reference}` does not exist, try creating table"); @@ -1061,6 +1090,13 @@ pub fn fill_table_options_for_create( } } + // Semantic keys are prefix-matched, not in the fixed allowlist above. 
+ for (key, value) in ctx.extensions() { + if is_semantic_option_key(&key) { + table_options.insert(key, value); + } + } + match create_type { AutoCreateTableType::Logical(physical_table) => { table_options.insert( @@ -1333,6 +1369,7 @@ mod tests { Cache::new(100), kv_backend.clone(), )), + true, ); let alter_expr = inserter .get_alter_table_expr_on_demand(&mut req, &table, &ctx, true, true) @@ -1362,6 +1399,34 @@ mod tests { assert!(!table_options.contains_key(APPEND_MODE_KEY)); } + #[test] + fn test_fill_table_options_copies_semantic_extensions() { + use table::requests::{ + SEMANTIC_PER_TABLE_INDEX_KEY, SEMANTIC_SIGNAL_TYPE, SEMANTIC_SOURCE, + SIGNAL_TYPE_METRIC, SOURCE_OPENTELEMETRY, + }; + + let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME); + ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC); + ctx.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY); + // The internal transport key must NOT be copied into table options. + ctx.set_extension(SEMANTIC_PER_TABLE_INDEX_KEY, "{}"); + let ctx = Arc::new(ctx); + let mut table_options = Default::default(); + + fill_table_options_for_create(&mut table_options, &AutoCreateTableType::Physical, &ctx); + + assert_eq!( + Some(SIGNAL_TYPE_METRIC), + table_options.get(SEMANTIC_SIGNAL_TYPE).map(String::as_str) + ); + assert_eq!( + Some(SOURCE_OPENTELEMETRY), + table_options.get(SEMANTIC_SOURCE).map(String::as_str) + ); + assert!(!table_options.contains_key(SEMANTIC_PER_TABLE_INDEX_KEY)); + } + #[test] fn test_last_non_null_create_options_preserve_default_with_append_mode_false() { let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME); diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs index aef164c6bb..cadb1cde66 100644 --- a/src/operator/src/statement/ddl.rs +++ b/src/operator/src/statement/ddl.rs @@ -35,7 +35,9 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_reado use 
common_catalog::{format_full_flow_name, format_full_table_name}; use common_error::ext::BoxedError; use common_meta::cache_invalidator::Context; -use common_meta::ddl::create_flow::{DEFER_ON_MISSING_SOURCE_KEY, FlowType}; +use common_meta::ddl::create_flow::{ + DEFER_ON_MISSING_SOURCE_KEY, FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType, +}; use common_meta::instruction::CacheIdent; use common_meta::key::schema_name::{SchemaName, SchemaNameKey}; use common_meta::procedure_executor::ExecutorContext; @@ -114,7 +116,10 @@ struct DdlSubmitOptions { timeout: Duration, } -const ALLOWED_FLOW_OPTIONS: [&str; 1] = [DEFER_ON_MISSING_SOURCE_KEY]; +const ALLOWED_FLOW_OPTIONS: [&str; 2] = [ + DEFER_ON_MISSING_SOURCE_KEY, + FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, +]; fn build_procedure_id_output(procedure_id: Vec) -> Result { let procedure_id = String::from_utf8_lossy(&procedure_id).to_string(); @@ -187,7 +192,9 @@ fn validate_and_normalize_flow_options( } let normalized_value = match key.as_str() { - DEFER_ON_MISSING_SOURCE_KEY => normalize_flow_bool_option(&key, &value)?, + DEFER_ON_MISSING_SOURCE_KEY | FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY => { + normalize_flow_bool_option(&key, &value)? 
+ } _ => { return InvalidSqlSnafu { err_msg: format!( @@ -2478,12 +2485,23 @@ mod test { #[test] fn test_validate_and_normalize_flow_options_valid() { - let options = - HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string())]); + let options = HashMap::from([ + (DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string()), + ( + FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(), + "FALSE".to_string(), + ), + ]); assert_eq!( validate_and_normalize_flow_options(options).unwrap(), - HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),)]) + HashMap::from([ + (DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),), + ( + FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(), + "false".to_string(), + ) + ]) ); } @@ -2497,7 +2515,7 @@ mod test { assert!( err.to_string() - .contains("unknown flow option 'foo', supported options: defer_on_missing_source") + .contains("unknown flow option 'foo', supported options: defer_on_missing_source, experimental_enable_incremental_read") ); } diff --git a/src/plugins/src/datanode.rs b/src/plugins/src/datanode.rs index 60640f05f1..321265a35d 100644 --- a/src/plugins/src/datanode.rs +++ b/src/plugins/src/datanode.rs @@ -14,6 +14,7 @@ use common_base::Plugins; use datanode::config::DatanodeOptions; +use datanode::datanode::Datanode; use datanode::error::Result; use crate::options::PluginOptions; @@ -28,6 +29,6 @@ pub async fn setup_datanode_plugins( Ok(()) } -pub async fn start_datanode_plugins(_plugins: Plugins) -> Result<()> { +pub async fn start_datanode_plugins(_instance: &Datanode) -> Result<()> { Ok(()) } diff --git a/src/plugins/src/flownode.rs b/src/plugins/src/flownode.rs index 9fbb018030..566051d84d 100644 --- a/src/plugins/src/flownode.rs +++ b/src/plugins/src/flownode.rs @@ -13,8 +13,8 @@ // limitations under the License. 
use common_base::Plugins; -use flow::FlownodeOptions; use flow::error::Result; +use flow::{FlownodeInstance, FlownodeOptions}; use crate::options::PluginOptions; @@ -27,7 +27,7 @@ pub async fn setup_flownode_plugins( Ok(()) } -pub async fn start_flownode_plugins(_plugins: Plugins) -> Result<()> { +pub async fn start_flownode_plugins(_instance: &FlownodeInstance) -> Result<()> { Ok(()) } diff --git a/src/plugins/src/frontend.rs b/src/plugins/src/frontend.rs index df7ec4fcb9..9986132d63 100644 --- a/src/plugins/src/frontend.rs +++ b/src/plugins/src/frontend.rs @@ -17,6 +17,7 @@ use common_base::Plugins; use common_meta::cache::CacheRegistryBuilder; use frontend::error::{IllegalAuthConfigSnafu, Result}; use frontend::frontend::FrontendOptions; +use frontend::instance::Instance; use snafu::ResultExt; use crate::options::PluginOptions; @@ -51,7 +52,7 @@ pub async fn setup_frontend_dynamic_plugins( Ok(()) } -pub async fn start_frontend_plugins(_plugins: Plugins) -> Result<()> { +pub async fn start_frontend_plugins(_instance: &Instance) -> Result<()> { Ok(()) } diff --git a/src/plugins/src/lib.rs b/src/plugins/src/lib.rs index ba33cb9825..a1b9b5e889 100644 --- a/src/plugins/src/lib.rs +++ b/src/plugins/src/lib.rs @@ -26,4 +26,4 @@ pub use flownode::{setup_flownode_plugins, start_flownode_plugins}; pub use frontend::{setup_frontend_plugins, start_frontend_plugins}; pub use meta_srv::{setup_metasrv_plugins, start_metasrv_plugins}; pub use options::PluginOptions; -pub use standalone::{setup_standalone_plugins, start_standalone_plugins}; +pub use standalone::setup_standalone_plugins; diff --git a/src/plugins/src/meta_srv.rs b/src/plugins/src/meta_srv.rs index 282ac241c5..6d862fdfbc 100644 --- a/src/plugins/src/meta_srv.rs +++ b/src/plugins/src/meta_srv.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use common_base::Plugins; +use meta_srv::bootstrap::MetasrvInstance; use meta_srv::error::Result; use meta_srv::metasrv::MetasrvOptions; @@ -27,6 +28,6 @@ pub async fn setup_metasrv_plugins( Ok(()) } -pub async fn start_metasrv_plugins(_plugins: Plugins) -> Result<()> { +pub async fn start_metasrv_plugins(_instance: &MetasrvInstance) -> Result<()> { Ok(()) } diff --git a/src/plugins/src/standalone.rs b/src/plugins/src/standalone.rs index 510b84106b..a946fc7e91 100644 --- a/src/plugins/src/standalone.rs +++ b/src/plugins/src/standalone.rs @@ -31,10 +31,6 @@ pub async fn setup_standalone_plugins( Ok(()) } -pub async fn start_standalone_plugins(_plugins: Plugins) -> Result<()> { - Ok(()) -} - /// Allows standalone plugins to add cache invalidators to the layered registry. pub fn configure_cache_registry(_plugins: &Plugins) -> Option { None diff --git a/src/servers/src/http/prom_store.rs b/src/servers/src/http/prom_store.rs index bfc072e84e..280c0655d7 100644 --- a/src/servers/src/http/prom_store.rs +++ b/src/servers/src/http/prom_store.rs @@ -31,6 +31,10 @@ use prost::Message; use serde::{Deserialize, Serialize}; use session::context::{Channel, QueryContext}; use snafu::prelude::*; +use table::requests::{ + METADATA_QUALITY_INFERRED, SEMANTIC_METRIC_METADATA_QUALITY, SEMANTIC_SIGNAL_TYPE, + SEMANTIC_SOURCE, SIGNAL_TYPE_METRIC, SOURCE_PROMETHEUS, +}; use crate::error::{self, InternalSnafu, PipelineSnafu, Result}; use crate::http::extractor::PipelineInfo; @@ -108,6 +112,13 @@ pub async fn remote_write( .clone() .unwrap_or_else(|| GREPTIME_PHYSICAL_TABLE.to_string()); query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table.clone()); + // Stamp the Prometheus metric identity here, before `as_req_iter` splits into the + // batched and direct write paths, so both inherit it (the batched path bypasses + // `PromStoreProtocolHandler::write`). Prom RW v1 metadata is weak, so the type is + // inferred from naming. 
+ query_ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC); + query_ctx.set_extension(SEMANTIC_SOURCE, SOURCE_PROMETHEUS); + query_ctx.set_extension(SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_INFERRED); let query_ctx = Arc::new(query_ctx); let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED .with_label_values(&[db.as_str()]) diff --git a/src/sql/src/parsers/utils.rs b/src/sql/src/parsers/utils.rs index 0306bd859d..239f3155ca 100644 --- a/src/sql/src/parsers/utils.rs +++ b/src/sql/src/parsers/utils.rs @@ -40,7 +40,7 @@ use snafu::{ResultExt, ensure}; use sqlparser::dialect::Dialect; use sqlparser::keywords::Keyword; use sqlparser::parser::Parser; -use table::requests::validate_table_option; +use table::requests::{SEMANTIC_PREFIX, validate_semantic_option, validate_table_option}; use crate::error::{ ConvertToLogicalExpressionSnafu, InvalidSqlSnafu, InvalidTableOptionSnafu, ParseSqlValueSnafu, @@ -395,8 +395,18 @@ pub fn parse_with_options(parser: &mut Parser) -> Result { .into_iter() .map(parse_option_string) .collect::>>()?; - for key in options.keys() { - ensure!(validate_table_option(key), InvalidTableOptionSnafu { key }); + for (key, value) in &options { + if key.starts_with(SEMANTIC_PREFIX) { + // Semantic keys are whitelisted and value-checked against their domain, + // so a user cannot set an unknown key or an out-of-range value. 
+ let value = value.as_string().unwrap_or_default(); + ensure!( + validate_semantic_option(key, value), + InvalidTableOptionSnafu { key } + ); + } else { + ensure!(validate_table_option(key), InvalidTableOptionSnafu { key }); + } } Ok(OptionMap::new(options)) } diff --git a/src/sql/src/statements/create.rs b/src/sql/src/statements/create.rs index 74ab8aee18..67742b853d 100644 --- a/src/sql/src/statements/create.rs +++ b/src/sql/src/statements/create.rs @@ -868,7 +868,25 @@ ENGINE=mito "; let result = ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()); - assert_matches!(result, Err(Error::InvalidTableOption { .. })) + assert_matches!(result, Err(Error::InvalidTableOption { .. })); + + // A whitelisted semantic key with an in-domain value is accepted. + let semantic = |with: &str| { + let sql = + format!("create table demo(host string, ts timestamp time index) with({with});"); + ParserContext::create_with_dialect(&sql, &GreptimeDbDialect {}, ParseOptions::default()) + }; + assert!(semantic("'greptime.semantic.signal_type'='metric'").is_ok()); + // An out-of-domain value is rejected. + assert_matches!( + semantic("'greptime.semantic.signal_type'='spans'"), + Err(Error::InvalidTableOption { .. }) + ); + // An unknown key under the semantic prefix is rejected. + assert_matches!( + semantic("'greptime.semantic.bogus'='x'"), + Err(Error::InvalidTableOption { .. }) + ); } #[test] diff --git a/src/standalone/src/options.rs b/src/standalone/src/options.rs index dece6389f0..bf1000fd58 100644 --- a/src/standalone/src/options.rs +++ b/src/standalone/src/options.rs @@ -38,6 +38,10 @@ pub struct StandaloneOptions { pub enable_telemetry: bool, pub default_timezone: Option, pub default_column_prefix: Option, + /// Server-side global switch for auto table creation on write. + /// Upper bound: when `false`, missing tables are never auto-created even if a + /// request sets the `auto_create_table` hint to `true`. Default: `true`. 
+ pub auto_create_table: bool, /// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight). /// Set to 0 to disable the limit. Default: "0" (unlimited) pub max_in_flight_write_bytes: ReadableSize, @@ -77,6 +81,7 @@ impl Default for StandaloneOptions { enable_telemetry: true, default_timezone: None, default_column_prefix: None, + auto_create_table: true, max_in_flight_write_bytes: ReadableSize(0), write_bytes_exhausted_policy: OnExhaustedPolicy::default(), http: HttpOptions::default(), @@ -130,6 +135,7 @@ impl StandaloneOptions { let cloned_opts = self.clone(); FrontendOptions { default_timezone: cloned_opts.default_timezone, + auto_create_table: cloned_opts.auto_create_table, max_in_flight_write_bytes: cloned_opts.max_in_flight_write_bytes, write_bytes_exhausted_policy: cloned_opts.write_bytes_exhausted_policy, http: cloned_opts.http, diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs index 4506c6ae65..6f27d5a20b 100644 --- a/src/table/src/requests.rs +++ b/src/table/src/requests.rs @@ -48,6 +48,9 @@ use crate::error::{ParseTableOptionSnafu, Result}; use crate::metadata::{TableId, TableVersion}; use crate::table_reference::TableReference; +mod semantic; +pub use semantic::*; + pub const FILE_TABLE_META_KEY: &str = "__private.file_table_meta"; pub const FILE_TABLE_LOCATION_KEY: &str = "location"; pub const FILE_TABLE_PATTERN_KEY: &str = "pattern"; @@ -129,6 +132,12 @@ pub fn validate_table_option(key: &str) -> bool { return true; } + // Semantic-layer keys share a reserved prefix instead of a fixed allowlist so + // the vocabulary can grow without touching this gate. See `semantic` module. 
+ if is_semantic_option_key(key) { + return true; + } + VALID_TABLE_OPTION_KEYS.contains(&key) || VALID_DDL_OPTION_KEYS.contains(&key) } @@ -490,6 +499,14 @@ mod tests { assert!(validate_table_option(STORAGE_KEY)); assert!(validate_table_option(MEMTABLE_BULK_MERGE_THRESHOLD)); assert!(!validate_table_option("foo")); + + // Only whitelisted semantic keys are accepted. + assert!(validate_table_option(SEMANTIC_SIGNAL_TYPE)); + assert!(validate_table_option(SEMANTIC_METRIC_TYPE)); + // Unknown semantic key, near-miss, and the internal transport key are rejected. + assert!(!validate_table_option("greptime.semantic.future.key")); + assert!(!validate_table_option("greptime.semanticx")); + assert!(!validate_table_option(SEMANTIC_PER_TABLE_INDEX_KEY)); } #[test] diff --git a/src/table/src/requests/semantic.rs b/src/table/src/requests/semantic.rs new file mode 100644 index 0000000000..66d5096293 --- /dev/null +++ b/src/table/src/requests/semantic.rs @@ -0,0 +1,280 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Table semantic layer vocabulary. +//! +//! A thin layer of semantic metadata attached to a table via `table_options`, so +//! machine consumers (LLM agents, alert/dashboard builders, MCP servers, ETL) can +//! align a table with the observability concept it stands for without guessing +//! from column names. See `docs/rfcs/2026-05-28-table-semantic-layer.md`. +//! +//! 
All public table-option keys share the [`SEMANTIC_PREFIX`] namespace and are +//! string-valued. [`is_semantic_option_key`] gates them through +//! [`crate::requests::validate_table_option`], so they are accepted both on the +//! ingestion auto-create path and on explicit `CREATE TABLE ... WITH (...)` DDL. + +/// Reserved prefix for every public semantic table-option key. +pub const SEMANTIC_PREFIX: &str = "greptime.semantic."; + +/// Internal `QueryContext` extension key carrying the per-table semantic index +/// (a `{table_name -> {semantic_key: value}}` JSON blob) from the ingestion +/// encode path to the auto-create site. Deliberately OUTSIDE [`SEMANTIC_PREFIX`] +/// so it is not a valid table option and never leaks into a table's options. +pub const SEMANTIC_PER_TABLE_INDEX_KEY: &str = "greptime.internal.semantic.per_table_index"; + +// ---- Common keys (all signals) ---- + +/// Signal kind: one of [`SIGNAL_TYPE_TRACE`] / [`SIGNAL_TYPE_LOG`] / +/// [`SIGNAL_TYPE_METRIC`] / [`SIGNAL_TYPE_EVENT`]. +pub const SEMANTIC_SIGNAL_TYPE: &str = "greptime.semantic.signal_type"; +/// Ingestion ecosystem, e.g. [`SOURCE_OPENTELEMETRY`] / [`SOURCE_PROMETHEUS`]. +pub const SEMANTIC_SOURCE: &str = "greptime.semantic.source"; +/// Optional protocol or SDK version string, e.g. `v2` (Prom remote write), `1.30.0`. +pub const SEMANTIC_SOURCE_VERSION: &str = "greptime.semantic.source_version"; +/// Internal ingestion pipeline / data model, e.g. `greptime_trace_v1`. +pub const SEMANTIC_PIPELINE: &str = "greptime.semantic.pipeline"; + +// ---- Trace keys ---- + +/// Semantic-conventions version the rows conform to (e.g. `otel-semconv-1.27`), +/// or [`SEMANTIC_VALUE_UNKNOWN`] / [`SEMANTIC_VALUE_MIXED`] when not single-valued. +pub const SEMANTIC_TRACE_CONVENTIONS: &str = "greptime.semantic.trace.conventions"; +/// Whether `span_events` are preserved on the table. 
+pub const SEMANTIC_TRACE_HAS_EVENTS: &str = "greptime.semantic.trace.has_events"; +/// Whether `span_links` are preserved on the table. +pub const SEMANTIC_TRACE_HAS_LINKS: &str = "greptime.semantic.trace.has_links"; + +// ---- Metric keys (populated in Phase 2) ---- + +/// Instrument kind: `counter` / `gauge` / `histogram` / `summary` / +/// `updown_counter` / `gauge_histogram` / `info` / `stateset`. +pub const SEMANTIC_METRIC_TYPE: &str = "greptime.semantic.metric.type"; +/// UCUM unit, e.g. `s`, `By`, `{request}`. +pub const SEMANTIC_METRIC_UNIT: &str = "greptime.semantic.metric.unit"; +/// `cumulative` / `delta` (OTel only). +pub const SEMANTIC_METRIC_TEMPORALITY: &str = "greptime.semantic.metric.temporality"; +/// `true` / `false` for sum / counter typed data. +pub const SEMANTIC_METRIC_MONOTONIC: &str = "greptime.semantic.metric.monotonic"; +/// [`METADATA_QUALITY_DECLARED`] when the protocol stated the type, or +/// [`METADATA_QUALITY_INFERRED`] when guessed from a name suffix. +pub const SEMANTIC_METRIC_METADATA_QUALITY: &str = "greptime.semantic.metric.metadata_quality"; +/// Pre-translation OTel metric name when the table name was Prometheus-ised. +pub const SEMANTIC_METRIC_ORIGINAL_NAME: &str = "greptime.semantic.metric.original_name"; + +// ---- Log keys (populated in Phase 3) ---- + +/// `otlp` / `syslog` / `custom` — which mapping to use for `severity_number`. +pub const SEMANTIC_LOG_SEVERITY_SCHEME: &str = "greptime.semantic.log.severity_scheme"; +/// `string` / `json` / `mixed` — how to parse `body`. +pub const SEMANTIC_LOG_BODY_FORMAT: &str = "greptime.semantic.log.body_format"; + +// ---- Resource / scope preservation keys (populated in Phase 3) ---- + +/// JSON array string of resource attributes promoted to first-class columns. +pub const SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED: &str = + "greptime.semantic.resource.attributes_preserved"; +/// `true` / `false` — whether any resource attribute was dropped at ingest. 
+pub const SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED: &str = + "greptime.semantic.resource.attributes_dropped"; +/// `true` / `false` — whether `scope.name` / `scope.version` survive on the row. +pub const SEMANTIC_SCOPE_PRESERVED: &str = "greptime.semantic.scope.preserved"; + +// ---- Value constants ---- + +pub const SIGNAL_TYPE_TRACE: &str = "trace"; +pub const SIGNAL_TYPE_LOG: &str = "log"; +pub const SIGNAL_TYPE_METRIC: &str = "metric"; +pub const SIGNAL_TYPE_EVENT: &str = "event"; + +pub const SOURCE_OPENTELEMETRY: &str = "opentelemetry"; +pub const SOURCE_PROMETHEUS: &str = "prometheus"; + +pub const METADATA_QUALITY_DECLARED: &str = "declared"; +pub const METADATA_QUALITY_INFERRED: &str = "inferred"; + +/// Sentinel for a key that cannot be determined at stamp time. +pub const SEMANTIC_VALUE_UNKNOWN: &str = "unknown"; +/// Sentinel for a single-valued key that saw conflicting sources. +pub const SEMANTIC_VALUE_MIXED: &str = "mixed"; + +/// Every recognised public semantic table-option key. The set is a closed +/// whitelist: keys under [`SEMANTIC_PREFIX`] that are not listed here are rejected, +/// so an unknown key like `greptime.semantic.unknown_key` does not silently land +/// in a table's options. Adding a key to the vocabulary means adding it here. +pub const SEMANTIC_OPTION_KEYS: &[&str] = &[ + SEMANTIC_SIGNAL_TYPE, + SEMANTIC_SOURCE, + SEMANTIC_SOURCE_VERSION, + SEMANTIC_PIPELINE, + SEMANTIC_TRACE_CONVENTIONS, + SEMANTIC_TRACE_HAS_EVENTS, + SEMANTIC_TRACE_HAS_LINKS, + SEMANTIC_METRIC_TYPE, + SEMANTIC_METRIC_UNIT, + SEMANTIC_METRIC_TEMPORALITY, + SEMANTIC_METRIC_MONOTONIC, + SEMANTIC_METRIC_METADATA_QUALITY, + SEMANTIC_METRIC_ORIGINAL_NAME, + SEMANTIC_LOG_SEVERITY_SCHEME, + SEMANTIC_LOG_BODY_FORMAT, + SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED, + SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED, + SEMANTIC_SCOPE_PRESERVED, +]; + +/// Returns true if `key` is a recognised semantic table-option key (whitelist). 
+/// +/// Note this is membership, not a prefix test: unknown keys under +/// [`SEMANTIC_PREFIX`] are rejected, and the internal +/// [`SEMANTIC_PER_TABLE_INDEX_KEY`] (outside the prefix) never matches. +pub fn is_semantic_option_key(key: &str) -> bool { + SEMANTIC_OPTION_KEYS.contains(&key) +} + +/// Validates a `greptime.semantic.*` option's `value` against its allowed domain. +/// +/// Open-value keys (unit, original_name, version, pipeline, conventions, the +/// preserved-attributes list) accept any non-empty string. Closed-domain keys +/// accept a fixed set, plus the `unknown` sentinel, plus `mixed` for the keys +/// where one long-lived table can legitimately see multiple values. Keys not in +/// [`SEMANTIC_OPTION_KEYS`] are rejected. +pub fn validate_semantic_option(key: &str, value: &str) -> bool { + match key { + SEMANTIC_SOURCE_VERSION + | SEMANTIC_PIPELINE + | SEMANTIC_METRIC_UNIT + | SEMANTIC_METRIC_ORIGINAL_NAME + | SEMANTIC_TRACE_CONVENTIONS + | SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED => !value.is_empty(), + + SEMANTIC_SIGNAL_TYPE => matches!(value, "trace" | "log" | "metric" | "event" | "unknown"), + SEMANTIC_SOURCE => matches!( + value, + "opentelemetry" + | "prometheus" + | "elasticsearch" + | "loki" + | "custom" + | "mixed" + | "unknown" + ), + SEMANTIC_METRIC_TYPE => matches!( + value, + "counter" + | "gauge" + | "histogram" + | "summary" + | "updown_counter" + | "gauge_histogram" + | "info" + | "stateset" + | "mixed" + | "unknown" + ), + SEMANTIC_METRIC_TEMPORALITY => { + matches!(value, "cumulative" | "delta" | "mixed" | "unknown") + } + SEMANTIC_METRIC_MONOTONIC + | SEMANTIC_TRACE_HAS_EVENTS + | SEMANTIC_TRACE_HAS_LINKS + | SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED + | SEMANTIC_SCOPE_PRESERVED => matches!(value, "true" | "false" | "unknown"), + SEMANTIC_METRIC_METADATA_QUALITY => matches!(value, "declared" | "inferred" | "unknown"), + SEMANTIC_LOG_SEVERITY_SCHEME => matches!(value, "otlp" | "syslog" | "custom" | "unknown"), + 
SEMANTIC_LOG_BODY_FORMAT => matches!(value, "string" | "json" | "mixed" | "unknown"), + + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_semantic_option_key() { + assert!(is_semantic_option_key(SEMANTIC_SIGNAL_TYPE)); + assert!(is_semantic_option_key(SEMANTIC_METRIC_TYPE)); + + // Unknown keys under the prefix are not whitelisted. + assert!(!is_semantic_option_key("greptime.semantic.future.key")); + assert!(!is_semantic_option_key("greptime.semantic.unknown_key")); + // Near-misses must not match. + assert!(!is_semantic_option_key("greptime.semanticx")); + assert!(!is_semantic_option_key("semantic.signal_type")); + assert!(!is_semantic_option_key("table_data_model")); + // The internal transport key must never be treated as a table option. + assert!(!is_semantic_option_key(SEMANTIC_PER_TABLE_INDEX_KEY)); + } + + #[test] + fn test_validate_semantic_option() { + // Enum keys reject out-of-domain values. + assert!(validate_semantic_option(SEMANTIC_SIGNAL_TYPE, "metric")); + assert!(!validate_semantic_option(SEMANTIC_SIGNAL_TYPE, "spans")); + assert!(validate_semantic_option(SEMANTIC_METRIC_TYPE, "counter")); + assert!(validate_semantic_option(SEMANTIC_METRIC_TYPE, "mixed")); + assert!(!validate_semantic_option(SEMANTIC_METRIC_TYPE, "bogus")); + + // Booleans, sentinels, open values. + assert!(validate_semantic_option(SEMANTIC_TRACE_HAS_EVENTS, "true")); + assert!(!validate_semantic_option(SEMANTIC_TRACE_HAS_EVENTS, "yes")); + assert!(validate_semantic_option( + SEMANTIC_METRIC_TEMPORALITY, + "unknown" + )); + assert!(validate_semantic_option(SEMANTIC_METRIC_UNIT, "By")); + assert!(!validate_semantic_option(SEMANTIC_METRIC_UNIT, "")); + + // Unknown key is rejected regardless of value. + assert!(!validate_semantic_option( + "greptime.semantic.future.key", + "x" + )); + + // Drift guard: every value stamped by the ingestion path must validate. 
+ assert!(validate_semantic_option( + SEMANTIC_SIGNAL_TYPE, + SIGNAL_TYPE_TRACE + )); + assert!(validate_semantic_option( + SEMANTIC_SIGNAL_TYPE, + SIGNAL_TYPE_METRIC + )); + assert!(validate_semantic_option( + SEMANTIC_SIGNAL_TYPE, + SIGNAL_TYPE_LOG + )); + assert!(validate_semantic_option( + SEMANTIC_SOURCE, + SOURCE_OPENTELEMETRY + )); + assert!(validate_semantic_option(SEMANTIC_SOURCE, SOURCE_PROMETHEUS)); + assert!(validate_semantic_option( + SEMANTIC_METRIC_METADATA_QUALITY, + METADATA_QUALITY_INFERRED + )); + assert!(validate_semantic_option( + SEMANTIC_TRACE_CONVENTIONS, + SEMANTIC_VALUE_UNKNOWN + )); + // An empty value never validates, for any whitelisted key. + for key in SEMANTIC_OPTION_KEYS { + assert!( + !validate_semantic_option(key, ""), + "empty value should never validate for {key}" + ); + } + } +} diff --git a/tests-fuzz/src/context.rs b/tests-fuzz/src/context.rs index 2f65ad56aa..8043e166fd 100644 --- a/tests-fuzz/src/context.rs +++ b/tests-fuzz/src/context.rs @@ -200,6 +200,15 @@ impl TableContext { partitions.remove_bound(removed_idx)?; partition_def.exprs = partitions.generate()?; } + RepartitionExpr::AlterPartitions(partition) => { + ensure!( + self.partition.is_none(), + error::UnexpectedSnafu { + violated: format!("Table {} already has partition", self.name), + } + ); + self.partition = Some(partition.partition); + } } Ok(self) diff --git a/tests-fuzz/src/generator/create_expr.rs b/tests-fuzz/src/generator/create_expr.rs index 261a310db2..401c0fa9ff 100644 --- a/tests-fuzz/src/generator/create_expr.rs +++ b/tests-fuzz/src/generator/create_expr.rs @@ -44,6 +44,7 @@ pub struct CreateTableExprGenerator { #[builder(setter(into))] engine: String, partition: usize, + partition_column: bool, if_not_exists: bool, #[builder(setter(into))] name: Ident, @@ -67,6 +68,7 @@ impl Default for CreateTableExprGenerator { engine: DEFAULT_ENGINE.to_string(), if_not_exists: false, partition: 0, + partition_column: false, name: Ident::new(""), with_clause: 
HashMap::default(), name_generator: Box::new(MappedGenerator::new(WordGenerator, random_capitalize_map)), @@ -95,7 +97,7 @@ impl Generator for CreateTableExprGenerato let mut builder = CreateTableExprBuilder::default(); let mut columns = Vec::with_capacity(self.columns); let mut primary_keys = vec![]; - let need_partible_column = self.partition > 1; + let need_partible_column = self.partition > 1 || self.partition_column; let mut column_names = self.name_generator.choose(rng, self.columns); if self.columns == 1 { @@ -123,13 +125,15 @@ impl Generator for CreateTableExprGenerato ) .remove(0); - // Generates partition bounds. - let partition_def = generate_partition_def( - self.partition, - column.column_type.clone(), - name.clone(), - ); - builder.partition(partition_def); + if self.partition > 1 { + // Generates partition bounds. + let partition_def = generate_partition_def( + self.partition, + column.column_type.clone(), + name.clone(), + ); + builder.partition(partition_def); + } columns.push(column); } // Generates the ts column. 
@@ -178,11 +182,12 @@ impl Generator for CreateTableExprGenerato } } -fn generate_partition_def( +pub fn generate_partition_def( partitions: usize, column_type: ConcreteDataType, column_name: Ident, ) -> PartitionDef { + assert!(partitions > 1, "partitions must be greater than 1"); let bounds = generate_partition_bounds(&column_type, partitions - 1); let partitions = SimplePartitions::new(column_name.clone(), bounds); let partition_exprs = partitions.generate().unwrap(); @@ -193,24 +198,23 @@ fn generate_partition_def( } } -fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> { - if partitions <= 1 { - return None; - } - - let partition_column = Column { +fn metric_partition_column() -> Column { + Column { name: Ident::new("host"), column_type: ConcreteDataType::string_datatype(), options: vec![ColumnOption::PrimaryKey], - }; + } +} + +pub fn generate_metric_partition_def(partitions: usize) -> PartitionDef { + assert!(partitions > 1, "partitions must be greater than 1"); + let partition_column = metric_partition_column(); let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1); let partitions = SimplePartitions::new(partition_column.name.clone(), bounds); - let partition_def = PartitionDef { + PartitionDef { columns: vec![partitions.column_name.clone()], exprs: partitions.generate().unwrap(), - }; - - Some((partition_column, partition_def)) + } } /// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type. 
@@ -223,6 +227,8 @@ pub struct CreatePhysicalTableExprGenerator { if_not_exists: bool, #[builder(default = "0")] partition: usize, + #[builder(default = "false")] + partition_column: bool, #[builder(default, setter(into))] with_clause: HashMap, } @@ -252,11 +258,13 @@ impl Generator for CreatePhysicalTableExpr let mut partition = None; let mut primary_keys = vec![]; - if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) { - columns.push(partition_column); - partition = Some(partition_def); + if self.partition > 1 || self.partition_column { + columns.push(metric_partition_column()); primary_keys.push(columns.len() - 1); } + if self.partition > 1 { + partition = Some(generate_metric_partition_def(self.partition)); + } Ok(CreateTableExpr { table_name: self.name_generator.generate(rng), @@ -387,6 +395,7 @@ mod tests { use super::*; use crate::context::TableContext; + use crate::ir::PARTIBLE_DATA_TYPES; #[test] fn test_float64() { @@ -423,6 +432,18 @@ mod tests { .unwrap(); assert_eq!(expr.columns.len(), 10); assert!(expr.partition.is_none()); + + let expr = CreateTableExprGeneratorBuilder::default() + .columns(10) + .partition(1) + .partition_column(true) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + assert_eq!(expr.columns.len(), 10); + assert!(expr.partition.is_none()); + assert!(PARTIBLE_DATA_TYPES.contains(&expr.columns[0].column_type)); } #[test] @@ -516,6 +537,25 @@ mod tests { assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3); } + #[test] + fn test_create_physical_table_expr_generator_with_partition_column() { + let mut rng = rand::rng(); + let physical_table_expr = CreatePhysicalTableExprGeneratorBuilder::default() + .partition(1) + .partition_column(true) + .if_not_exists(false) + .build() + .unwrap() + .generate(&mut rng) + .unwrap(); + + assert_eq!(physical_table_expr.engine, "metric"); + assert!(physical_table_expr.partition.is_none()); + assert_eq!(physical_table_expr.columns.len(), 3); + 
assert_eq!(physical_table_expr.columns[2].name, Ident::new("host")); + assert_eq!(physical_table_expr.primary_keys, vec![2]); + } + #[test] fn test_create_logical_table_expr_generator_without_partition_column() { let mut rng = rand::rng(); diff --git a/tests-fuzz/src/ir.rs b/tests-fuzz/src/ir.rs index ce1628cd61..e0b57ad0d1 100644 --- a/tests-fuzz/src/ir.rs +++ b/tests-fuzz/src/ir.rs @@ -30,7 +30,7 @@ use std::time::Duration; pub use alter_expr::{AlterTableExpr, AlterTableOption}; use common_time::timestamp::TimeUnit; use common_time::{Date, Timestamp}; -pub use create_expr::{CreateDatabaseExpr, CreateTableExpr}; +pub use create_expr::{CreateDatabaseExpr, CreateTableExpr, PartitionDef}; use datatypes::data_type::ConcreteDataType; use datatypes::types::TimestampType; use datatypes::value::Value; @@ -40,7 +40,7 @@ use lazy_static::lazy_static; pub use partition_expr::SimplePartitions; use rand::Rng; use rand::seq::{IndexedRandom, SliceRandom}; -pub use repartition_expr::RepartitionExpr; +pub use repartition_expr::{AlterTablePartitionsExpr, RepartitionExpr}; use serde::{Deserialize, Serialize}; use self::insert_expr::RowValues; diff --git a/tests-fuzz/src/ir/repartition_expr.rs b/tests-fuzz/src/ir/repartition_expr.rs index 5c8b401c8d..d17ed90d2c 100644 --- a/tests-fuzz/src/ir/repartition_expr.rs +++ b/tests-fuzz/src/ir/repartition_expr.rs @@ -16,6 +16,7 @@ use partition::expr::PartitionExpr; use serde::{Deserialize, Serialize}; use crate::ir::Ident; +use crate::ir::create_expr::PartitionDef; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SplitPartitionExpr { @@ -34,10 +35,19 @@ pub struct MergePartitionExpr { pub wait: bool, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AlterTablePartitionsExpr { + pub table_name: Ident, + pub partition: PartitionDef, + #[serde(default = "default_wait")] + pub wait: bool, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub enum RepartitionExpr { Split(SplitPartitionExpr), Merge(MergePartitionExpr), + 
AlterPartitions(AlterTablePartitionsExpr), } const fn default_wait() -> bool { diff --git a/tests-fuzz/src/translator/mysql/repartition_expr.rs b/tests-fuzz/src/translator/mysql/repartition_expr.rs index 56c27594b3..43ca8e57ef 100644 --- a/tests-fuzz/src/translator/mysql/repartition_expr.rs +++ b/tests-fuzz/src/translator/mysql/repartition_expr.rs @@ -15,7 +15,10 @@ use partition::expr::PartitionExpr; use crate::error::Result; -use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr}; +use crate::ir::create_expr::PartitionDef; +use crate::ir::repartition_expr::{ + AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr, +}; use crate::translator::DslTranslator; pub struct RepartitionExprTranslator; @@ -59,10 +62,38 @@ impl DslTranslator for RepartitionExprTranslator { table_name, merge_exprs, wait_clause )) } + RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr { + table_name, + partition, + wait, + }) => { + let partition_clause = format_partition_clause(partition); + let wait_clause = format_wait_clause(*wait); + Ok(format!( + "ALTER TABLE {} {}{};", + table_name, partition_clause, wait_clause + )) + } } } } +fn format_partition_clause(partition: &PartitionDef) -> String { + let columns = partition + .columns + .iter() + .map(|column| column.to_string()) + .collect::>() + .join(", "); + let exprs = partition + .exprs + .iter() + .map(format_partition_expr_sql) + .collect::>() + .join(",\n "); + format!("PARTITION ON COLUMNS ({columns}) (\n {exprs}\n)") +} + fn format_partition_expr_sql(expr: &PartitionExpr) -> String { expr.to_parser_expr().to_string() } @@ -79,9 +110,15 @@ fn format_wait_clause(wait: bool) -> String { mod tests { use datatypes::value::Value; use partition::expr::col; + use sql::dialect::GreptimeDbDialect; + use sql::parser::{ParseOptions, ParserContext}; use super::RepartitionExprTranslator; - use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, 
SplitPartitionExpr}; + use crate::ir::Ident; + use crate::ir::create_expr::PartitionDef; + use crate::ir::repartition_expr::{ + AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr, + }; use crate::translator::DslTranslator; #[test] @@ -149,4 +186,61 @@ mod tests { );"#; assert_eq!(sql, expected); } + + #[test] + fn test_translate_alter_table_partitions_expr() { + let expr = RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr { + table_name: "demo".into(), + partition: PartitionDef { + columns: vec![Ident::new("id")], + exprs: vec![ + col("id").lt(Value::Int32(10)), + col("id") + .gt_eq(Value::Int32(10)) + .and(col("id").lt(Value::Int32(20))), + col("id").gt_eq(Value::Int32(20)), + ], + }, + wait: true, + }); + let sql = RepartitionExprTranslator.translate(&expr).unwrap(); + let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (id) ( + id < 10, + id >= 10 AND id < 20, + id >= 20 +);"#; + assert_eq!(sql, expected); + assert_repartition_sql_parseable(&sql); + } + + #[test] + fn test_translate_alter_table_partitions_expr_wait_false() { + let expr = RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr { + table_name: "demo".into(), + partition: PartitionDef { + columns: vec![Ident::new("host")], + exprs: vec![ + col("host").lt(Value::from("m")), + col("host").gt_eq(Value::from("m")), + ], + }, + wait: false, + }); + let sql = RepartitionExprTranslator.translate(&expr).unwrap(); + let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (host) ( + host < 'm', + host >= 'm' +) WITH ( + WAIT = false +);"#; + assert_eq!(sql, expected); + assert_repartition_sql_parseable(&sql); + } + + fn assert_repartition_sql_parseable(sql: &str) { + let statements = + ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default()) + .unwrap(); + assert_eq!(statements.len(), 1); + } } diff --git a/tests-fuzz/src/validator/partition.rs b/tests-fuzz/src/validator/partition.rs index 3fbc8e7f86..581fb8a635 100644 --- 
a/tests-fuzz/src/validator/partition.rs +++ b/tests-fuzz/src/validator/partition.rs @@ -21,7 +21,8 @@ use crate::ir::Ident; use crate::ir::create_expr::PartitionDef; const PARTITIONS_INFO_SCHEMA_SQL: &str = "SELECT table_catalog, table_schema, table_name, \ -partition_name, partition_expression, partition_description, greptime_partition_id, \ +partition_name, COALESCE(partition_expression, '') AS partition_expression, \ +COALESCE(partition_description, '') AS partition_description, greptime_partition_id, \ partition_ordinal_position FROM information_schema.partitions WHERE table_name = ? \ ORDER BY partition_ordinal_position;"; @@ -91,3 +92,20 @@ pub fn assert_partitions(expected: &PartitionDef, actual: &[PartitionInfo]) -> R Ok(()) } + +/// Asserts that the table has no partition metadata in information schema. +pub fn assert_unpartitioned(actual: &[PartitionInfo]) -> Result<()> { + let has_no_partition_metadata = actual.is_empty() + || (actual.len() == 1 + && actual[0].partition_expression.is_empty() + && actual[0].partition_description.is_empty()); + + ensure!( + has_no_partition_metadata, + error::AssertSnafu { + reason: format!("Expected unpartitioned table, got partitions: {actual:?}"), + } + ); + + Ok(()) +} diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs index 7932bc7759..8a6bd81fa8 100644 --- a/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs +++ b/tests-fuzz/targets/ddl/fuzz_repartition_metric_table.rs @@ -36,14 +36,15 @@ use tests_fuzz::fake::{ use tests_fuzz::generator::Generator; use tests_fuzz::generator::create_expr::{ CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder, + generate_metric_partition_def, }; use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder; use tests_fuzz::generator::repartition_expr::{ MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder, }; use tests_fuzz::ir::{ - CreateTableExpr, 
Ident, InsertIntoExpr, RepartitionExpr, generate_random_value, - generate_unique_timestamp_for_mysql_with_clock, + AlterTablePartitionsExpr, CreateTableExpr, Ident, InsertIntoExpr, PartitionDef, + RepartitionExpr, generate_random_value, generate_unique_timestamp_for_mysql_with_clock, }; use tests_fuzz::translator::DslTranslator; use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator; @@ -94,6 +95,7 @@ fn generate_create_physical_table_expr( )))) .if_not_exists(rng.random_bool(0.5)) .partition(partitions) + .partition_column(partitions <= 1) .build() .unwrap() .generate(rng) @@ -158,12 +160,6 @@ async fn create_metric_tables( })?; info!("Create physical table: {create_physical_sql}, result: {result:?}"); let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr)); - ensure!( - physical_table_ctx.partition.is_some(), - error::AssertSnafu { - reason: "Physical metric table must have partition".to_string() - } - ); let mut logical_tables = BTreeMap::new(); let mut create_logical_sqls = HashMap::new(); @@ -436,6 +432,11 @@ fn repartition_operation( table_ctx: &TableContextRef, rng: &mut R, ) -> Result { + if table_ctx.partition.is_none() { + let partition = generate_metric_partition_def(rng.random_range(2..8)); + return Ok(alter_table_partitions_expr(table_ctx, partition, true)); + } + let split = rng.random_bool(0.5); if table_ctx.partition.as_ref().unwrap().exprs.len() <= 2 || split { let expr = SplitPartitionExprGeneratorBuilder::default() @@ -454,19 +455,35 @@ fn repartition_operation( } } +fn alter_table_partitions_expr( + table_ctx: &TableContextRef, + partition: PartitionDef, + wait: bool, +) -> RepartitionExpr { + RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr { + table_name: table_ctx.name.clone(), + partition, + wait, + }) +} + impl Arbitrary<'_> for FuzzInput { fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result { let seed = get_fuzz_override::("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?); let mut rng 
= ChaChaRng::seed_from_u64(seed); - let partitions = - get_fuzz_override::("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8)); + let partitions = get_fuzz_override::("PARTITIONS").unwrap_or_else(|| { + if rng.random_bool(0.5) { + 1 + } else { + rng.random_range(2..8) + } + }); let max_tables = get_gt_fuzz_input_max_tables(); let tables = get_fuzz_override::("TABLES") .unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables))); - let max_actions = get_gt_fuzz_input_max_alter_actions(); + let max_actions = std::cmp::min(128, get_gt_fuzz_input_max_alter_actions()); let actions = get_fuzz_override::("ACTIONS") .unwrap_or_else(|| rng.random_range(1..max_actions)); - Ok(FuzzInput { seed, actions, @@ -536,7 +553,11 @@ async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) -> tokio::time::sleep(Duration::from_millis(100)).await; for i in 0..input.actions { - let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len(); + let partition_num = physical_table_ctx + .partition + .as_ref() + .map(|partition| partition.exprs.len()) + .unwrap_or_default(); info!( "partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}", i + 1, diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_table.rs b/tests-fuzz/targets/ddl/fuzz_repartition_table.rs index d4b9e9fd7a..4f6a014f2e 100644 --- a/tests-fuzz/targets/ddl/fuzz_repartition_table.rs +++ b/tests-fuzz/targets/ddl/fuzz_repartition_table.rs @@ -33,14 +33,15 @@ use tests_fuzz::fake::{ uppercase_and_keyword_backtick_map, }; use tests_fuzz::generator::Generator; -use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder; +use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def}; use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder; use tests_fuzz::generator::repartition_expr::{ MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder, }; use tests_fuzz::ir::{ - CreateTableExpr, 
InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue, - SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock, + AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, + PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value, + generate_unique_timestamp_for_mysql_with_clock, }; use tests_fuzz::translator::DslTranslator; use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator; @@ -75,8 +76,13 @@ impl Arbitrary<'_> for FuzzInput { fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result { let seed = get_fuzz_override::("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?); let mut rng = ChaChaRng::seed_from_u64(seed); - let partitions = - get_fuzz_override::("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8)); + let partitions = get_fuzz_override::("PARTITIONS").unwrap_or_else(|| { + if rng.random_bool(0.5) { + 1 + } else { + rng.random_range(2..8) + } + }); let max_actions = get_gt_fuzz_input_max_alter_actions(); let actions = get_fuzz_override::("ACTIONS") .unwrap_or_else(|| rng.random_range(1..max_actions)); @@ -99,6 +105,7 @@ fn generate_create_expr( ))) .columns(5) .partition(input.partitions) + .partition_column(input.partitions <= 1) .engine("mito") .ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator)) .build() @@ -122,7 +129,7 @@ fn build_insert_expr( let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone()); let counter = Arc::new(AtomicUsize::new(0)); let counter_clone = counter.clone(); - let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len(); + let partition_len = partitions.bounds.len() + 1; let row = rng.random_range(partition_len..partition_len * 2); let moved_partitions = partitions.clone(); @@ -150,6 +157,28 @@ fn build_insert_expr( insert_generator.generate(rng).unwrap() } +fn alter_table_partitions_expr( + table_ctx: &TableContextRef, + partition: 
PartitionDef, + wait: bool, +) -> RepartitionExpr { + RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr { + table_name: table_ctx.name.clone(), + partition, + wait, + }) +} + +fn alter_table_partitions_expr_from_table_ctx( + table_ctx: &TableContextRef, + rng: &mut R, + wait: bool, +) -> RepartitionExpr { + let column = table_ctx.columns[0].clone(); + let partition = generate_partition_def(rng.random_range(2..8), column.column_type, column.name); + alter_table_partitions_expr(table_ctx, partition, wait) +} + async fn execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> { let mut delay = Duration::from_millis(100); let mut attempt = 0; @@ -236,9 +265,36 @@ async fn execute_repartition_table(ctx: FuzzContext, input: FuzzInput) -> Result inserted_rows: 0, })); + let mut action_start = 0; + if table_ctx.partition.is_none() { + let expr = alter_table_partitions_expr_from_table_ctx(&table_ctx, &mut rng, true); + let translator = RepartitionExprTranslator; + let sql = translator.translate(&expr)?; + info!("Initial partition sql: {sql}"); + let result = sqlx::query(&sql) + .execute(&ctx.greptime) + .await + .context(error::ExecuteQuerySnafu { sql: &sql })?; + info!("Initial partition result: {result:?}"); + table_ctx = Arc::new(Arc::unwrap_or_clone(table_ctx).repartition(expr).unwrap()); + shared_state.lock().unwrap().table_ctx = table_ctx.clone(); + + let partition_entries = validator::partition::fetch_partitions_info_schema( + &ctx.greptime, + "public".into(), + &table_ctx.name, + ) + .await?; + validator::partition::assert_partitions( + table_ctx.partition.as_ref().unwrap(), + &partition_entries, + )?; + action_start = 1; + } + let writer_rng = ChaChaRng::seed_from_u64(input.seed); let writer_task = tokio::spawn(write_loop(writer_rng, ctx.clone(), shared_state.clone())); - for i in 0..input.actions { + for i in action_start..input.actions { let partition_num = table_ctx.partition.as_ref().unwrap().exprs.len(); info!( "partition_num: 
{partition_num}, action: {}/{}", diff --git a/tests-fuzz/targets/ddl/fuzz_repartition_table_chaos.rs b/tests-fuzz/targets/ddl/fuzz_repartition_table_chaos.rs index d3789b696c..9d8faeebf5 100644 --- a/tests-fuzz/targets/ddl/fuzz_repartition_table_chaos.rs +++ b/tests-fuzz/targets/ddl/fuzz_repartition_table_chaos.rs @@ -34,14 +34,15 @@ use tests_fuzz::fake::{ uppercase_and_keyword_backtick_map, }; use tests_fuzz::generator::Generator; -use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder; +use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def}; use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder; use tests_fuzz::generator::repartition_expr::{ MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder, }; use tests_fuzz::ir::{ - CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue, - SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock, + AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, + PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value, + generate_unique_timestamp_for_mysql_with_clock, }; use tests_fuzz::translator::DslTranslator; use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator; @@ -93,13 +94,17 @@ impl Arbitrary<'_> for FuzzInput { let mut rng = ChaChaRng::seed_from_u64(seed); let rows = get_fuzz_override::("ROWS") .unwrap_or_else(|| rng.random_range(2..get_gt_fuzz_input_max_rows())); - let partitions = - get_fuzz_override::("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8)); + let partitions = get_fuzz_override::("PARTITIONS").unwrap_or_else(|| { + if rng.random_bool(0.5) { + 1 + } else { + rng.random_range(2..8) + } + }); let chaos_delay_ms = get_fuzz_override::("CHAOS_DELAY_MS").unwrap_or_else(|| rng.random_range(0..5000)); let chaos_hold_secs = get_fuzz_override::("CHAOS_HOLD_SECS").unwrap_or_else(|| 
rng.random_range(10..20)); - Ok(FuzzInput { seed, rows, @@ -127,6 +132,7 @@ fn generate_create_expr( ))) .columns(5) .partition(input.partitions) + .partition_column(input.partitions <= 1) .engine("mito") .ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator)) .build() @@ -144,7 +150,7 @@ fn build_insert_expr( let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone()); let counter = Arc::new(AtomicUsize::new(0)); let counter_clone = counter.clone(); - let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len(); + let partition_len = partitions.bounds.len() + 1; let moved_partitions = partitions.clone(); let insert_generator = InsertExprGeneratorBuilder::default() .table_ctx(table_ctx.clone()) @@ -202,10 +208,12 @@ async fn create_table(ctx: &FuzzContext, expr: &CreateTableExpr) -> Result( ctx: &FuzzContext, table_ctx: &TableContextRef, + partition_def: &PartitionDef, rng: &mut R, rows: usize, ) -> Result { - let partitions = SimplePartitions::from_table_ctx(table_ctx).unwrap(); + let partitions = + SimplePartitions::from_exprs(partition_def.columns[0].clone(), &partition_def.exprs)?; let clock = Arc::new(Mutex::new(Timestamp::current_millis())); let insert_expr = build_insert_expr(table_ctx, rng, &partitions, &clock, rows); let inserted_rows = insert_expr.values_list.len() as u64; @@ -260,6 +268,28 @@ fn repartition_operation( } } +fn alter_table_partitions_expr( + table_ctx: &TableContextRef, + partition: PartitionDef, + wait: bool, +) -> RepartitionExpr { + RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr { + table_name: table_ctx.name.clone(), + partition, + wait, + }) +} + +fn alter_table_partitions_expr_from_table_ctx( + table_ctx: &TableContextRef, + rng: &mut R, + wait: bool, +) -> RepartitionExpr { + let column = table_ctx.columns[0].clone(); + let partition = generate_partition_def(rng.random_range(2..8), column.column_type, column.name); + alter_table_partitions_expr(table_ctx, partition, wait) +} + 
async fn submit_repartition_procedure(ctx: &FuzzContext, expr: &RepartitionExpr) -> Result { let translator = RepartitionExprTranslator; let sql = translator.translate(expr)?; @@ -334,10 +364,13 @@ async fn validate_terminal_metadata( after_table_ctx.partition.as_ref().unwrap(), &partition_entries, )?, - ProcedureTerminalState::Failed => validator::partition::assert_partitions( - before_table_ctx.partition.as_ref().unwrap(), - &partition_entries, - )?, + ProcedureTerminalState::Failed => { + if let Some(partition) = before_table_ctx.partition.as_ref() { + validator::partition::assert_partitions(partition, &partition_entries)?; + } else { + validator::partition::assert_unpartitioned(&partition_entries)?; + } + } } Ok(()) @@ -359,7 +392,21 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result let create_expr = generate_create_expr(&input, &mut rng)?; let before_table_ctx = create_table(&ctx, &create_expr).await?; - let inserted_rows = insert_initial_rows(&ctx, &before_table_ctx, &mut rng, input.rows).await?; + let insert_partition = create_expr.partition.clone().unwrap_or_else(|| { + generate_partition_def( + 2, + before_table_ctx.columns[0].column_type.clone(), + before_table_ctx.columns[0].name.clone(), + ) + }); + let inserted_rows = insert_initial_rows( + &ctx, + &before_table_ctx, + &insert_partition, + &mut rng, + input.rows, + ) + .await?; validate_table_rows(&ctx, &before_table_ctx, inserted_rows).await?; let before_entries = validator::partition::fetch_partitions_info_schema( @@ -370,7 +417,11 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result .await?; info!("Before repartition partition entries: {before_entries:?}"); - let repartition_expr = repartition_operation(&before_table_ctx, &mut rng, false)?; + let repartition_expr = if before_table_ctx.partition.is_some() { + repartition_operation(&before_table_ctx, &mut rng, false)? 
+ } else { + alter_table_partitions_expr_from_table_ctx(&before_table_ctx, &mut rng, false) + }; let after_table_ctx = Arc::new( Arc::unwrap_or_clone(before_table_ctx.clone()) .repartition(repartition_expr.clone()) diff --git a/tests-integration/src/standalone.rs b/tests-integration/src/standalone.rs index b013b8b0d4..74a1501207 100644 --- a/tests-integration/src/standalone.rs +++ b/tests-integration/src/standalone.rs @@ -80,6 +80,7 @@ pub struct GreptimeDbStandaloneBuilder { default_store: Option, plugin: Option, slow_query_options: SlowQueryOptions, + auto_create_table: bool, } impl GreptimeDbStandaloneBuilder { @@ -97,9 +98,16 @@ impl GreptimeDbStandaloneBuilder { threshold: Duration::from_secs(1), ..Default::default() }, + auto_create_table: true, } } + #[must_use] + pub fn with_auto_create_table(mut self, auto_create_table: bool) -> Self { + self.auto_create_table = auto_create_table; + self + } + #[must_use] pub fn with_default_store_type(self, store_type: StorageType) -> Self { Self { @@ -347,6 +355,7 @@ impl GreptimeDbStandaloneBuilder { wal: self.metasrv_wal_config.clone().into(), grpc: GrpcOptions::default().with_server_addr("127.0.0.1:4001"), slow_query: self.slow_query_options.clone(), + auto_create_table: self.auto_create_table, ..StandaloneOptions::default() }; diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index 15d65c34ea..ff231f6054 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -16,6 +16,7 @@ use std::env; use std::fmt::Display; use std::net::SocketAddr; use std::sync::Arc; +use std::time::Duration; use auth::{DefaultPermissionChecker, PermissionCheckerRef, UserProviderRef}; use axum::Router; @@ -49,6 +50,7 @@ use servers::http::{HttpOptions, HttpServerBuilder}; use servers::metrics_handler::MetricsHandler; use servers::mysql::server::{MysqlServer, MysqlSpawnConfig, MysqlSpawnRef}; use servers::otel_arrow::OtelArrowServiceHandler; +use 
servers::pending_rows_batcher::PendingRowsBatcher; use servers::postgres::PostgresServer; use servers::prom_remote_write::validation::PromValidationMode; use servers::query_handler::sql::SqlQueryHandler; @@ -564,6 +566,24 @@ async fn run_sql(sql: &str, instance: &GreptimeDbStandalone) { pub async fn setup_test_prom_app_with_frontend( store_type: StorageType, name: &str, +) -> (Router, TestGuard) { + setup_test_prom_app_with_frontend_inner(store_type, name, false).await +} + +/// Like [`setup_test_prom_app_with_frontend`] but enables the pending-rows batcher, +/// so Prometheus remote write goes through the batched (metric-engine) path instead +/// of the direct `PromStoreProtocolHandler::write` path. +pub async fn setup_test_prom_app_with_frontend_batched( + store_type: StorageType, + name: &str, +) -> (Router, TestGuard) { + setup_test_prom_app_with_frontend_inner(store_type, name, true).await +} + +async fn setup_test_prom_app_with_frontend_inner( + store_type: StorageType, + name: &str, + enable_batcher: bool, ) -> (Router, TestGuard) { unsafe { std::env::set_var("TZ", "UTC"); @@ -617,6 +637,24 @@ pub async fn setup_test_prom_app_with_frontend( ..Default::default() }; let frontend_ref = instance.fe_instance().clone(); + // Mirror the production wiring at `frontend::server`: build the batcher from the + // instance's managers. A short flush interval keeps the test responsive. 
+ let pending_rows_batcher = if enable_batcher { + PendingRowsBatcher::try_new( + frontend_ref.partition_manager().clone(), + frontend_ref.node_manager().clone(), + frontend_ref.catalog_manager().clone(), + true, + frontend_ref.clone(), + Duration::from_millis(50), + 1000, + 4, + 64, + 64, + ) + } else { + None + }; let http_server = HttpServerBuilder::new(http_opts) .with_sql_handler(frontend_ref.clone()) .with_logs_handler(instance.fe_instance().clone()) @@ -625,7 +663,7 @@ pub async fn setup_test_prom_app_with_frontend( Some(frontend_ref.clone()), true, PromValidationMode::Strict, - None, + pending_rows_batcher, ) .with_prometheus_handler(frontend_ref) .with_greptime_config_options(instance.opts.datanode_options().to_toml().unwrap()) @@ -649,6 +687,20 @@ pub async fn setup_grpc_server_with_user_provider( setup_grpc_server_with(store_type, name, user_provider, None, None).await } +/// Sets up a gRPC server backed by a standalone instance whose frontend has auto +/// table creation disabled, for testing the server-side global switch. +pub async fn setup_grpc_server_with_auto_create_table_disabled( + store_type: StorageType, + name: &str, +) -> (GreptimeDbStandalone, Arc) { + let instance = GreptimeDbStandaloneBuilder::new(name) + .with_default_store_type(store_type) + .with_auto_create_table(false) + .build() + .await; + setup_grpc_server_for_instance(instance, None, None, None).await +} + pub async fn setup_grpc_server_with( store_type: StorageType, name: &str, @@ -657,7 +709,17 @@ pub async fn setup_grpc_server_with( memory_limiter: Option, ) -> (GreptimeDbStandalone, Arc) { let instance = setup_standalone_instance(name, store_type).await; + setup_grpc_server_for_instance(instance, user_provider, grpc_config, memory_limiter).await +} +/// Builds and starts a gRPC server on top of an already-constructed standalone +/// instance. This is the shared core behind the `setup_grpc_server_*` helpers. 
+async fn setup_grpc_server_for_instance( + instance: GreptimeDbStandalone, + user_provider: Option, + grpc_config: Option, + memory_limiter: Option, +) -> (GreptimeDbStandalone, Arc) { let runtime: Runtime = RuntimeBuilder::default() .worker_threads(2) .thread_name("grpc-handlers") diff --git a/tests-integration/tests/grpc.rs b/tests-integration/tests/grpc.rs index 0f1112ff4a..d6b96484af 100644 --- a/tests-integration/tests/grpc.rs +++ b/tests-integration/tests/grpc.rs @@ -44,7 +44,8 @@ use servers::request_memory_limiter::ServerMemoryLimiter; use servers::server::Server; use servers::tls::{TlsMode, TlsOption}; use tests_integration::test_util::{ - StorageType, setup_grpc_server, setup_grpc_server_with, setup_grpc_server_with_user_provider, + StorageType, setup_grpc_server, setup_grpc_server_with, + setup_grpc_server_with_auto_create_table_disabled, setup_grpc_server_with_user_provider, }; use tonic::Request; use tonic::metadata::MetadataValue; @@ -82,6 +83,7 @@ macro_rules! grpc_tests { test_invalid_dbname, test_auto_create_table, test_auto_create_table_with_hints, + test_auto_create_table_disabled_by_config, test_otel_arrow_auth, test_insert_and_select, test_dbname, @@ -405,6 +407,81 @@ pub async fn test_auto_create_table_with_hints(store_type: StorageType) { let _ = fe_grpc_server.shutdown().await; } +/// When the frontend global switch disables auto table creation, a write to a +/// missing table must fail even if the request sets `auto_create_table=true`, +/// proving the global config is an upper bound that hints cannot bypass. 
+pub async fn test_auto_create_table_disabled_by_config(store_type: StorageType) { + let (_db, fe_grpc_server) = setup_grpc_server_with_auto_create_table_disabled( + store_type, + "test_auto_create_table_disabled_by_config", + ) + .await; + let addr = fe_grpc_server.bind_addr().unwrap().to_string(); + + let grpc_client = Client::with_urls(vec![addr]); + let db = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, grpc_client); + + // Plain row insert to a missing table: must fail even with `auto_create_table=true`. + let (host, cpu, mem, ts) = expect_data(); + let request = InsertRequest { + table_name: "demo".to_string(), + columns: vec![host, cpu, mem, ts], + row_count: 4, + }; + let result = db + .insert_with_hints( + InsertRequests { + inserts: vec![request], + }, + &[("auto_create_table", "true")], + ) + .await; + let err = result.unwrap_err().to_string(); + assert!( + err.contains("does not exist") && err.contains("disabled by frontend config"), + "unexpected error: {err}" + ); + + // Metric path (via `physical_table` hint): must also fail without leaking the physical table. + let (host, cpu, mem, ts) = expect_data(); + let request = InsertRequest { + table_name: "demo_metric".to_string(), + columns: vec![host, cpu, mem, ts], + row_count: 4, + }; + let result = db + .insert_with_hints( + InsertRequests { + inserts: vec![request], + }, + &[ + ("auto_create_table", "true"), + ("physical_table", "greptime_physical_table"), + ], + ) + .await; + let err = result.unwrap_err().to_string(); + assert!( + err.contains("does not exist") && err.contains("disabled by frontend config"), + "unexpected error: {err}" + ); + + // The physical table must not have been created before the failure. 
+ let output = db.sql("SHOW TABLES").await.unwrap(); + let record_batches = match output.data { + OutputData::RecordBatches(record_batches) => record_batches, + OutputData::Stream(stream) => RecordBatches::try_collect(stream).await.unwrap(), + OutputData::AffectedRows(_) => unreachable!(), + }; + let tables = record_batches.pretty_print().unwrap(); + assert!( + !tables.contains("greptime_physical_table"), + "physical table leaked despite disabled auto-create:\n{tables}" + ); + + let _ = fe_grpc_server.shutdown().await; +} + fn expect_data() -> (Column, Column, Column, Column) { // testing data: let expected_host_col = Column { diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 7f411cdec2..9e757a743c 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -71,6 +71,7 @@ use tests_integration::test_util::{ StorageType, setup_test_http_app, setup_test_http_app_with_frontend, setup_test_http_app_with_frontend_and_slow_query_threshold, setup_test_http_app_with_frontend_and_user_provider, setup_test_prom_app_with_frontend, + setup_test_prom_app_with_frontend_batched, }; use urlencoding::encode; use yaml_rust::YamlLoader; @@ -117,6 +118,7 @@ macro_rules! 
http_tests { test_dashboard_path, test_dashboard_api, test_prometheus_remote_write, + test_prometheus_remote_write_batched, test_prometheus_remote_special_labels, test_prometheus_remote_schema_labels, test_prometheus_remote_write_with_pipeline, @@ -1491,6 +1493,7 @@ mem_threshold_on_create = "auto" let expected_toml_str = format!( r#" enable_telemetry = true +auto_create_table = true max_in_flight_write_bytes = "0KiB" write_bytes_exhausted_policy = "wait" init_regions_in_background = false @@ -1601,6 +1604,7 @@ experimental_grpc_max_retries = 3 experimental_frontend_scan_timeout = "30s" experimental_max_filter_num_per_query = 20 experimental_time_window_merge_threshold = 3 +experimental_enable_incremental_read = false read_preference = "Leader" [logging] @@ -1954,6 +1958,18 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) { ) .await; + // Prom RW tables carry the metric identity; type is inferred from naming. + validate_data( + "prometheus_remote_write_semantic_identity", + &client, + "select count(*) from information_schema.tables where table_name = 'metric2' \ + and create_options like '%greptime.semantic.signal_type=metric%' \ + and create_options like '%greptime.semantic.source=prometheus%' \ + and create_options like '%greptime.semantic.metric.metadata_quality=inferred%';", + "[[1]]", + ) + .await; + // Write snappy encoded data with new labels let write_request = WriteRequest { timeseries: mock_timeseries_new_label(), @@ -1975,6 +1991,48 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) { guard.remove_all().await; } +/// Covers the batched (pending-rows-batcher) Prometheus remote write path, which +/// bypasses `PromStoreProtocolHandler::write`. Verifies the metric table is created +/// asynchronously and still carries the Prometheus semantic identity stamped on the +/// shared request context. 
+pub async fn test_prometheus_remote_write_batched(store_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_prom_app_with_frontend_batched(store_type, "prometheus_remote_write_batched") + .await; + let client = TestClient::new(app).await; + + let write_request = WriteRequest { + timeseries: prom_store::mock_timeseries(), + ..Default::default() + }; + let serialized_request = write_request.encode_to_vec(); + let compressed_request = + prom_store::snappy_compress(&serialized_request).expect("failed to encode snappy"); + + let res = client + .post("/v1/prometheus/write") + .header("Content-Encoding", "snappy") + .body(compressed_request) + .send() + .await; + assert_eq!(res.status(), StatusCode::NO_CONTENT); + + // The batcher flushes asynchronously, so poll until the table exists and carries + // the semantic identity (signal_type/source/metadata_quality). + wait_for_data( + &client, + "select count(*) from information_schema.tables where table_name = 'metric2' \ + and create_options like '%greptime.semantic.signal_type=metric%' \ + and create_options like '%greptime.semantic.source=prometheus%' \ + and create_options like '%greptime.semantic.metric.metadata_quality=inferred%'", + "[[1]]", + ) + .await; + + guard.remove_all().await; +} + pub async fn test_prometheus_remote_special_labels(store_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = @@ -2023,7 +2081,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) { expected, ) .await; - let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f1'\\n)\"]]"; + let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS 
\\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.metric.metadata_quality' = 'inferred',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'prometheus',\\n on_physical_table = 'f1'\\n)\"]]"; validate_data( "test_prometheus_remote_special_labels_idc3_show_create_table", &client, @@ -2049,7 +2107,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) { expected, ) .await; - let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f2'\\n)\"]]"; + let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.metric.metadata_quality' = 'inferred',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'prometheus',\\n on_physical_table = 'f2'\\n)\"]]"; validate_data( "test_prometheus_remote_special_labels_idc4_show_create_table", &client, @@ -5025,6 +5083,28 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { let expected = "[[\"claude_code_cost_usage_USD_total\"],[\"claude_code_token_usage_tokens_total\"],[\"demo\"],[\"greptime_physical_table\"],[\"numbers\"]]"; validate_data("otlp_metrics_all_tables", &client, "show tables;", expected).await; + // Metric-engine logical table carries the semantic identity. 
Match substrings + // because extra_options ordering is not stable. + validate_data( + "otlp_metrics_semantic_identity", + &client, + "select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \ + and create_options like '%greptime.semantic.signal_type=metric%' \ + and create_options like '%greptime.semantic.source=opentelemetry%';", + "[[1]]", + ) + .await; + // OTLP metric type is declared, so Phase 1 must not stamp `metadata_quality` + // here (Phase 2 adds it as `declared`). + validate_data( + "otlp_metrics_no_metadata_quality", + &client, + "select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \ + and create_options like '%metadata_quality%';", + "[[0]]", + ) + .await; + // CREATE TABLE IF NOT EXISTS "claude_code_cost_usage_USD_total" ( // "greptime_timestamp" TIMESTAMP(3) NOT NULL, // "greptime_value" DOUBLE NULL, @@ -5049,7 +5129,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { // on_physical_table = 'greptime_physical_table', // otlp_metric_compat = 'prom' // ) - let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", 
\\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; + let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; validate_data( "otlp_metrics_all_show_create_table", &client, @@ -5122,7 +5202,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { // on_physical_table = 'greptime_physical_table', // otlp_metric_compat = 'prom' // ) - let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n 
\\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; + let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; validate_data( "otlp_metrics_show_create_table", &client, @@ -5186,7 +5266,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) { // on_physical_table = 'greptime_physical_table', // otlp_metric_compat = 'prom' // ) - let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS 
\\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; + let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]"; validate_data( "otlp_metrics_show_create_table_none", &client, @@ -5493,7 +5573,22 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { let expected = 
r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#; validate_data("otlp_traces", &client, "select * from mytable;", expected).await; - let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON 
NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; + // The trace v1 main table carries the trace identity (events/links preserved as + // JSON columns by the v1 model). + validate_data( + "otlp_traces_semantic_identity", + &client, + "select count(*) from information_schema.tables where table_name = 'mytable' \ + and create_options like '%greptime.semantic.signal_type=trace%' \ + and create_options like '%greptime.semantic.source=opentelemetry%' \ + and create_options like '%greptime.semantic.pipeline=greptime_trace_v1%' \ + and create_options like '%greptime.semantic.trace.has_events=true%' \ + and create_options like '%greptime.semantic.trace.has_links=true%';", + "[[1]]", + ) + .await; + + let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n 
\"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; validate_data( "otlp_traces", &client, @@ -5546,7 +5641,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; assert_eq!(StatusCode::OK, res.status()); - let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL 
SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; + let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY 
(\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; validate_data( "otlp_traces", &client, @@ -5583,7 +5678,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; assert_eq!(StatusCode::OK, res.status()); - let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '4',\n trace_id >= '4' AND trace_id < '8',\n trace_id >= '8' AND trace_id < 'c',\n trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; + let expected_ddl = 
r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '4',\n trace_id >= '4' AND trace_id < '8',\n trace_id >= '8' AND trace_id < 'c',\n trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; validate_data( "otlp_traces", &client, @@ -5620,7 +5715,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) { ) .await; assert_eq!(StatusCode::OK, res.status()); - let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED 
NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '08',\n trace_id >= '08' AND trace_id < '10',\n trace_id >= '10' AND trace_id < '18',\n trace_id >= '18' AND trace_id < '20',\n trace_id >= '20' AND trace_id < '28',\n trace_id >= '28' AND trace_id < '30',\n trace_id >= '30' AND trace_id < '38',\n trace_id >= '38' AND trace_id < '40',\n trace_id >= '40' AND trace_id < '48',\n trace_id >= '48' AND trace_id < '50',\n trace_id >= '50' AND trace_id < '58',\n trace_id >= '58' AND trace_id < '60',\n trace_id >= '60' AND trace_id < '68',\n trace_id >= '68' AND trace_id < '70',\n trace_id >= '70' AND trace_id < '78',\n trace_id >= '78' AND trace_id < '80',\n trace_id >= '80' AND trace_id < '88',\n trace_id >= '88' AND trace_id < '90',\n trace_id >= '90' AND trace_id < '98',\n trace_id >= '98' AND trace_id < 'a0',\n trace_id >= 'a0' AND trace_id < 'a8',\n trace_id >= 'a8' AND trace_id < 'b0',\n trace_id >= 'b0' AND trace_id < 'b8',\n trace_id >= 'b8' AND trace_id < 'c0',\n trace_id >= 'c0' AND trace_id < 'c8',\n trace_id >= 'c8' AND trace_id < 'd0',\n trace_id >= 'd0' AND trace_id < 'd8',\n trace_id >= 'd8' AND trace_id < 'e0',\n trace_id >= 'e0' AND 
trace_id < 'e8',\n trace_id >= 'e8' AND trace_id < 'f0',\n trace_id >= 'f0' AND trace_id < 'f8',\n trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; + let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '08',\n trace_id >= '08' AND trace_id < '10',\n trace_id >= '10' AND trace_id < '18',\n trace_id >= '18' AND trace_id < '20',\n trace_id >= '20' AND trace_id < '28',\n trace_id >= '28' AND trace_id < '30',\n trace_id >= '30' AND trace_id < '38',\n trace_id >= '38' AND trace_id < '40',\n trace_id >= '40' AND trace_id < '48',\n trace_id >= '48' AND trace_id < '50',\n trace_id >= '50' AND trace_id < '58',\n trace_id >= '58' AND trace_id < '60',\n trace_id >= '60' AND trace_id < '68',\n trace_id >= '68' AND trace_id < '70',\n trace_id >= '70' AND trace_id < '78',\n trace_id >= '78' AND trace_id < '80',\n trace_id >= '80' AND trace_id < 
'88',\n trace_id >= '88' AND trace_id < '90',\n trace_id >= '90' AND trace_id < '98',\n trace_id >= '98' AND trace_id < 'a0',\n trace_id >= 'a0' AND trace_id < 'a8',\n trace_id >= 'a8' AND trace_id < 'b0',\n trace_id >= 'b0' AND trace_id < 'b8',\n trace_id >= 'b8' AND trace_id < 'c0',\n trace_id >= 'c0' AND trace_id < 'c8',\n trace_id >= 'c8' AND trace_id < 'd0',\n trace_id >= 'd0' AND trace_id < 'd8',\n trace_id >= 'd8' AND trace_id < 'e0',\n trace_id >= 'e0' AND trace_id < 'e8',\n trace_id >= 'e8' AND trace_id < 'f0',\n trace_id >= 'f0' AND trace_id < 'f8',\n trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#; validate_data( "otlp_traces", &client, @@ -6283,6 +6378,17 @@ pub async fn test_otlp_logs(store_type: StorageType) { expected, ) .await; + + // The auto-created log table carries the log identity. 
+ validate_data( + "otlp_logs_semantic_identity", + &client, + "select count(*) from information_schema.tables where table_name = 'opentelemetry_logs' \ + and create_options like '%greptime.semantic.signal_type=log%' \ + and create_options like '%greptime.semantic.source=opentelemetry%';", + "[[1]]", + ) + .await; } { @@ -7718,7 +7824,7 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) { .await; assert_eq!(StatusCode::OK, res.status()); - let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < 
'8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]"; + let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id 
>= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\\n 'greptime.semantic.signal_type' = 'trace',\\n 'greptime.semantic.source' = 'opentelemetry',\\n 'greptime.semantic.trace.conventions' = 'unknown',\\n 'greptime.semantic.trace.has_events' = 'true',\\n 'greptime.semantic.trace.has_links' = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]"; validate_data( "trace_v1_create_table", &client, diff --git a/tests/cases/standalone/common/flow/flow_incremental_aggr.result b/tests/cases/standalone/common/flow/flow_incremental_aggr.result index bb66d5362c..2273e0e821 100644 --- a/tests/cases/standalone/common/flow/flow_incremental_aggr.result +++ b/tests/cases/standalone/common/flow/flow_incremental_aggr.result @@ -1,3 +1,31 @@ +-- Incremental aggregate reads only support append-only source tables because +-- update/upsert sources need old-value compensation. +CREATE TABLE incremental_non_append_input ( + host_id INT, + n INT, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY(host_id) +); + +Affected Rows: 0 + +CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink +WITH (experimental_enable_incremental_read = 'true') +AS +SELECT + sum(n) AS total, + date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window +FROM + incremental_non_append_input +GROUP BY + time_window; + +Error: 3001(EngineExecuteQuery), Unsupported: Flow incremental read requires append-only source table, but source table `greptime.public.incremental_non_append_input` is not append-only. 
Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read + +DROP TABLE incremental_non_append_input; + +Affected Rows: 0 + CREATE TABLE incremental_aggr_input ( host_id INT, n INT, @@ -9,7 +37,9 @@ CREATE TABLE incremental_aggr_input ( Affected Rows: 0 -CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS +CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink +WITH (experimental_enable_incremental_read = 'true') +AS SELECT sum(n) AS total, min(n) AS min_n, diff --git a/tests/cases/standalone/common/flow/flow_incremental_aggr.sql b/tests/cases/standalone/common/flow/flow_incremental_aggr.sql index 51dd431fef..4c012aef23 100644 --- a/tests/cases/standalone/common/flow/flow_incremental_aggr.sql +++ b/tests/cases/standalone/common/flow/flow_incremental_aggr.sql @@ -1,3 +1,25 @@ +-- Incremental aggregate reads only support append-only source tables because +-- update/upsert sources need old-value compensation. +CREATE TABLE incremental_non_append_input ( + host_id INT, + n INT, + ts TIMESTAMP TIME INDEX, + PRIMARY KEY(host_id) +); + +CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink +WITH (experimental_enable_incremental_read = 'true') +AS +SELECT + sum(n) AS total, + date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window +FROM + incremental_non_append_input +GROUP BY + time_window; + +DROP TABLE incremental_non_append_input; + CREATE TABLE incremental_aggr_input ( host_id INT, n INT, @@ -7,7 +29,9 @@ CREATE TABLE incremental_aggr_input ( append_mode = 'true' ); -CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS +CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink +WITH (experimental_enable_incremental_read = 'true') +AS SELECT sum(n) AS total, min(n) AS min_n, diff --git a/tests/cases/standalone/common/flow/flow_incremental_memtable.result b/tests/cases/standalone/common/flow/flow_incremental_memtable.result index 
1e452b21ad..67326e1261 100644 --- a/tests/cases/standalone/common/flow/flow_incremental_memtable.result +++ b/tests/cases/standalone/common/flow/flow_incremental_memtable.result @@ -12,7 +12,9 @@ CREATE TABLE flow_incr_memtable_input ( Affected Rows: 0 -CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS +CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink +WITH (experimental_enable_incremental_read = 'true') +AS SELECT sum(n) AS total, min(n) AS min_n, diff --git a/tests/cases/standalone/common/flow/flow_incremental_memtable.sql b/tests/cases/standalone/common/flow/flow_incremental_memtable.sql index 66dccbb8b3..6dbbf6064f 100644 --- a/tests/cases/standalone/common/flow/flow_incremental_memtable.sql +++ b/tests/cases/standalone/common/flow/flow_incremental_memtable.sql @@ -10,7 +10,9 @@ CREATE TABLE flow_incr_memtable_input ( append_mode = 'true' ); -CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS +CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink +WITH (experimental_enable_incremental_read = 'true') +AS SELECT sum(n) AS total, min(n) AS min_n, diff --git a/tests/cases/standalone/common/flow/flow_incremental_partitioned.result b/tests/cases/standalone/common/flow/flow_incremental_partitioned.result index b56b390abd..0899d4acb0 100644 --- a/tests/cases/standalone/common/flow/flow_incremental_partitioned.result +++ b/tests/cases/standalone/common/flow/flow_incremental_partitioned.result @@ -17,7 +17,9 @@ WITH ( Affected Rows: 0 -CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS +CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink +WITH (experimental_enable_incremental_read = 'true') +AS SELECT sum(n) AS total, min(n) AS min_n, diff --git a/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql b/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql index 234c9b9085..18cece1889 100644 --- a/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql +++ 
b/tests/cases/standalone/common/flow/flow_incremental_partitioned.sql @@ -15,7 +15,9 @@ WITH ( append_mode = 'true' ); -CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS +CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink +WITH (experimental_enable_incremental_read = 'true') +AS SELECT sum(n) AS total, min(n) AS min_n, diff --git a/tests/cases/standalone/common/flow/show_create_flow.result b/tests/cases/standalone/common/flow/show_create_flow.result index 113822cd18..431d1dfbb5 100644 --- a/tests/cases/standalone/common/flow/show_create_flow.result +++ b/tests/cases/standalone/common/flow/show_create_flow.result @@ -476,7 +476,7 @@ SINK TO out_num_cnt_show WITH (access_key_id = [true]) AS SELECT number AS n1 FROM numbers_input_show where number > 10; -Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source +Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source, experimental_enable_incremental_read DROP FLOW filter_numbers_show;