mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-06-03 05:40:40 +00:00
Merge branch 'main' into feat/flush-hook-extension-point
This commit is contained in:
92
Cargo.lock
generated
92
Cargo.lock
generated
@@ -2278,6 +2278,7 @@ dependencies = [
|
||||
"futures",
|
||||
"lazy_static",
|
||||
"object-store",
|
||||
"object_store_opendal",
|
||||
"orc-rust",
|
||||
"parquet",
|
||||
"paste",
|
||||
@@ -5102,6 +5103,7 @@ dependencies = [
|
||||
"datatypes",
|
||||
"futures",
|
||||
"object-store",
|
||||
"object_store_opendal",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"snafu 0.8.6",
|
||||
@@ -8320,6 +8322,7 @@ dependencies = [
|
||||
"datafusion-common",
|
||||
"datafusion-expr",
|
||||
"datatypes",
|
||||
"derive_more",
|
||||
"dotenv",
|
||||
"either",
|
||||
"futures",
|
||||
@@ -9074,8 +9077,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "object_store_opendal"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eb12a624a41fce745838d0ef3701ff6c47797c13cd18ad3612fd2a3134fdbd8"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -9162,8 +9166,9 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
|
||||
|
||||
[[package]]
|
||||
name = "opendal"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96c9c85ce253ff87225e7669979d877a20c98a06604ec9d6dd5f4473e08f1ae1"
|
||||
dependencies = [
|
||||
"ctor",
|
||||
"opendal-core",
|
||||
@@ -9183,8 +9188,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-core"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4f8607c90e2c963a91467f50fb49fbc7fb3d573f88cea219ca59ccd3740b309"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"base64 0.22.1",
|
||||
@@ -9210,8 +9216,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-concurrent-limit"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"http 1.3.1",
|
||||
@@ -9221,8 +9228,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-logging"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "58ada45c6d81d1aa4c9305d0c7d4bc317c59c85866a0908a2d75a7a978aa5ee2"
|
||||
dependencies = [
|
||||
"log",
|
||||
"opendal-core",
|
||||
@@ -9230,8 +9238,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-observe-metrics-common"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "628b0228fdbd13c3d9d50eee4341f2eb82ca5b44991e4c68f07c84cc823e2d12"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"http 1.3.1",
|
||||
@@ -9240,8 +9249,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-prometheus"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0487bdb1357097ec8654781bad03ef310282517738e2864ebde69e27aaafc5ec"
|
||||
dependencies = [
|
||||
"opendal-core",
|
||||
"opendal-layer-observe-metrics-common",
|
||||
@@ -9250,8 +9260,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-retry"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b2a25a718afb81fad81cb9a0580a1cb989221fa2317f888c6a37f8dad408eb7"
|
||||
dependencies = [
|
||||
"backon",
|
||||
"log",
|
||||
@@ -9260,8 +9271,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-timeout"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e91f731724c213af81e9d03517859c8fc47b4578e64ad61ae4f099f10fe36e3"
|
||||
dependencies = [
|
||||
"opendal-core",
|
||||
"tokio",
|
||||
@@ -9269,8 +9281,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-layer-tracing"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90c6fc9df6da1f0dafbdf55fa48525f1643aefbe7da8f46936e869e2a5b8a34f"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"http 1.3.1",
|
||||
@@ -9280,8 +9293,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-azblob"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
@@ -9300,8 +9314,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-azure-common"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051"
|
||||
dependencies = [
|
||||
"http 1.3.1",
|
||||
"opendal-core",
|
||||
@@ -9309,8 +9324,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-fs"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22e89a665fef0e6bd249cf5ea47fc174b7ba892159bee4b9382528b1ca873a2c"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"log",
|
||||
@@ -9322,8 +9338,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-gcs"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"bytes",
|
||||
@@ -9342,8 +9359,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-http"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb6af628a0bf14075b957179444927e1df40dc7addef382b585a05ef015a077b"
|
||||
dependencies = [
|
||||
"http 1.3.1",
|
||||
"log",
|
||||
@@ -9353,8 +9371,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-oss"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http 1.3.1",
|
||||
@@ -9369,8 +9388,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "opendal-service-s3"
|
||||
version = "0.56.0"
|
||||
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
|
||||
version = "0.57.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "313d46c9f5ae70bca26b7c3e3fbb9b639292625f28af73aa016f47e788af9deb"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
@@ -14102,9 +14122,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.45"
|
||||
version = "0.4.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
|
||||
checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
|
||||
@@ -178,7 +178,7 @@ nalgebra = "0.33"
|
||||
nix = { version = "0.30.1", default-features = false, features = ["event", "fs", "process"] }
|
||||
notify = "8.0"
|
||||
num_cpus = "1.16"
|
||||
object_store_opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7" }
|
||||
object_store_opendal = "0.57"
|
||||
once_cell = "1.18"
|
||||
opentelemetry-proto = { version = "0.31", features = [
|
||||
"gen-tonic",
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
| --- | -----| ------- | ----------- |
|
||||
| `default_timezone` | String | Unset | The default timezone of the server. |
|
||||
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
|
||||
| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.<br/>When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. |
|
||||
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
|
||||
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
|
||||
@@ -230,6 +231,7 @@
|
||||
| --- | -----| ------- | ----------- |
|
||||
| `default_timezone` | String | Unset | The default timezone of the server. |
|
||||
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
|
||||
| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.<br/>When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. |
|
||||
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
|
||||
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
|
||||
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
|
||||
@@ -628,6 +630,7 @@
|
||||
| `flow.batching_mode.experimental_frontend_scan_timeout` | String | `30s` | Flow wait for available frontend timeout,<br/>if failed to find available frontend after frontend_scan_timeout elapsed, return error<br/>which prevent flownode from starting |
|
||||
| `flow.batching_mode.experimental_max_filter_num_per_query` | Integer | `20` | Maximum number of filters allowed in a single query |
|
||||
| `flow.batching_mode.experimental_time_window_merge_threshold` | Integer | `3` | Time window merge distance |
|
||||
| `flow.batching_mode.experimental_enable_incremental_read` | Bool | `false` | Whether to enable experimental flow incremental source reads.<br/>When disabled, batching flows always execute full-snapshot queries.<br/>Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true'). |
|
||||
| `flow.batching_mode.read_preference` | String | `Leader` | Read preference of the Frontend client. |
|
||||
| `flow.batching_mode.frontend_tls` | -- | -- | -- |
|
||||
| `flow.batching_mode.frontend_tls.enabled` | Bool | `false` | Whether to enable TLS for client. |
|
||||
|
||||
@@ -31,6 +31,10 @@ node_id = 14
|
||||
#+experimental_max_filter_num_per_query=20
|
||||
## Time window merge distance
|
||||
#+experimental_time_window_merge_threshold=3
|
||||
## Whether to enable experimental flow incremental source reads.
|
||||
## When disabled, batching flows always execute full-snapshot queries.
|
||||
## Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true').
|
||||
#+experimental_enable_incremental_read=false
|
||||
## Read preference of the Frontend client.
|
||||
#+read_preference="Leader"
|
||||
[flow.batching_mode.frontend_tls]
|
||||
|
||||
@@ -6,6 +6,10 @@ default_timezone = "UTC"
|
||||
## @toml2docs:none-default
|
||||
default_column_prefix = "greptime"
|
||||
|
||||
## Server-side global switch for auto table creation on write.
|
||||
## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`.
|
||||
#+ auto_create_table = true
|
||||
|
||||
## The user provider for authentication.
|
||||
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
|
||||
## @toml2docs:none-default
|
||||
|
||||
@@ -6,6 +6,10 @@ default_timezone = "UTC"
|
||||
## @toml2docs:none-default
|
||||
default_column_prefix = "greptime"
|
||||
|
||||
## Server-side global switch for auto table creation on write.
|
||||
## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`.
|
||||
#+ auto_create_table = true
|
||||
|
||||
## The user provider for authentication.
|
||||
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
|
||||
## @toml2docs:none-default
|
||||
|
||||
157
docs/rfcs/2026-05-28-table-semantic-layer.md
Normal file
157
docs/rfcs/2026-05-28-table-semantic-layer.md
Normal file
@@ -0,0 +1,157 @@
|
||||
---
|
||||
Feature Name: Table Semantic Layer
|
||||
Tracking Issue: TBD
|
||||
Date: 2026-05-28
|
||||
Author: "Dennis Zhuang <killme2008@gmail.com>"
|
||||
---
|
||||
|
||||
# Summary
|
||||
|
||||
Attach a thin layer of semantic metadata to each table so machine consumers — LLM agents, alert generators, dashboard builders, MCP servers, ETL pipelines — can align it with the observability concepts they already know (OTel instrument kinds, Prometheus naming conventions, UCUM units, semantic conventions, severity numbers, OTel ↔ Prometheus translation rules).
|
||||
|
||||
The mechanism reuses what already exists in `table_options` (the same slot that today carries `table_data_model` and `otlp_metric_compat`): a reserved `greptime.semantic.*` namespace, plus standard SQL column `COMMENT` for field-level supplements, plus an `information_schema.semantic_tables` view as the discovery entry point. No new protocol, no new DDL keyword.
|
||||
|
||||
Per-table identity only. Cross-table relationships are deferred.
|
||||
|
||||
# Motivation
|
||||
|
||||
GreptimeDB already ingests OTLP metrics / traces / logs and Prometheus remote write. Each protocol carries rich metadata on the wire (instrument kind, temporality, unit, scope, resource, semantic-conventions version), and most of it is dropped when rows land in a table:
|
||||
|
||||
- An `opentelemetry_traces` table looks like any wide table; signal type, source, and field provenance must be guessed from naming.
|
||||
- The OTel-to-Prometheus translation in v0.16+ actively drops scope attributes and most resource attributes; the table never records *what was dropped*.
|
||||
- Prometheus remote write v1 metadata is unreliable by protocol, but downstream tables do not flag whether `counter` typing was *declared* or *inferred* from the `_total` suffix.
|
||||
- Mixed-temporality data (OTel delta + Prometheus cumulative in the same table) is unrecoverable from schema alone.
|
||||
|
||||
The audience is broader than LLM agents. Alert generators need to choose between `rate()` and absolute thresholds, and need units to pick sensible bounds. Dashboard builders pick visualisations by signal type. MCP servers surface a structured tool catalog instead of free-text descriptions. ETL pipelines need lineage to know whether a `service_name` column is `resource.service.name` or a free-form label. All of them currently guess from column names; the metadata to remove the guess already exists at ingest time, we just do not preserve it.
|
||||
|
||||
# Goals
|
||||
|
||||
1. Tag every ingested table with a stable identity using existing SQL surfaces — no new protocol, no new DDL keyword.
|
||||
2. Record the lossy transformations the ingestion path performs (dropped attributes, scope handling, type inference vs. declaration).
|
||||
3. Expose one `information_schema` view as the consumer-facing discovery entry point.
|
||||
4. Keep the layer optional and additive — tables without these options keep working unchanged.
|
||||
|
||||
# Non-Goals
|
||||
|
||||
- Cross-table relationship modelling. Deferred to a follow-up RFC.
|
||||
- Bespoke storage. Reuse `table_options` and column `COMMENT`.
|
||||
- Semantic enforcement at query time. The layer is descriptive, not coercive.
|
||||
- New wire protocol. Upstream standardisation is mentioned only as a future direction.
|
||||
|
||||
# Proposal
|
||||
|
||||
## Three mechanisms
|
||||
|
||||
1. **`greptime.semantic.*` table options** — table-level identity and lineage. Carried inside the existing `table_options` blob. This is the same slot that today carries `table_data_model = 'greptime_trace_v1'` and `otlp_metric_compat = 'prom'`, so the mechanism is generalising what the OTLP trace auto-create path already does.
|
||||
2. **Column `COMMENT`** — column-level supplements ("this column is `resource.service.name`"; "this column carries delta values"). Standard SQL.
|
||||
3. **`information_schema.semantic_tables` view** — a denormalised projection of the options, registered through the existing `with_extra_table_factories()` hook. Tables without a `greptime.semantic.*` option do not appear in the view.
|
||||
|
||||
## Vocabulary
|
||||
|
||||
All keys are flat strings under the `greptime.semantic.` prefix; values are strings; unknown keys are tolerated so the vocabulary can grow without coordinated rollouts.
|
||||
|
||||
**Common (all signals)**
|
||||
|
||||
| Key | Example |
|
||||
| --- | --- |
|
||||
| `greptime.semantic.signal_type` | `trace` / `log` / `metric` / `event` |
|
||||
| `greptime.semantic.source` | `opentelemetry` / `prometheus` / `elasticsearch` / `loki` / `custom` |
|
||||
| `greptime.semantic.source_version` | protocol or SDK version, e.g. `v2` (Prom remote write), `1.30.0` (optional) |
|
||||
| `greptime.semantic.pipeline` | `greptime_trace_v1` (subsumes the existing `table_data_model` value) |
|
||||
|
||||
**Trace**: `greptime.semantic.trace.conventions` (the version of the OpenTelemetry semantic conventions used by the data in this table, e.g. `otel-semconv-1.27`, lifted from `schema_url`), `greptime.semantic.trace.has_events`, `greptime.semantic.trace.has_links`.
|
||||
|
||||
**Metric** — v1 assumes one metric type per table, which is how both Prom RW and the post-v0.16 OTel ingestion path land data today; mixed-type tables are a follow-up.
|
||||
|
||||
| Key | Example |
|
||||
| --- | --- |
|
||||
| `greptime.semantic.metric.type` | `counter` / `gauge` / `histogram` / `summary` / `updown_counter` / `gauge_histogram` / `info` / `stateset` |
|
||||
| `greptime.semantic.metric.unit` | UCUM, e.g. `s`, `By`, `{request}` |
|
||||
| `greptime.semantic.metric.temporality` | `cumulative` / `delta` (OTel only) |
|
||||
| `greptime.semantic.metric.monotonic` | `true` / `false` |
|
||||
| `greptime.semantic.metric.metadata_quality` | `declared` (OTLP / Prom RW v2 / exposition) or `inferred` (Prom RW v1, name-suffix guess) |
|
||||
| `greptime.semantic.metric.original_name` | Pre-translation OTel name when the table name was Prometheus-ised |
|
||||
|
||||
`metadata_quality` is the load-bearing field for confidence-aware tooling: a counter whose type is marked `inferred` should be re-checked before betting on `rate()`-style semantics.
|
||||
|
||||
**Log**: `greptime.semantic.log.severity_scheme` (`otlp` / `syslog` / `custom`), `greptime.semantic.log.body_format` (`string` / `json` / `mixed`).
|
||||
|
||||
**Resource / scope preservation**: `greptime.semantic.resource.attributes_preserved` (JSON array string of the resource attributes promoted to columns), `greptime.semantic.resource.attributes_dropped` (`true` / `false`), `greptime.semantic.scope.preserved` (`true` / `false`). These answer the most common downstream question: "is this data missing because it was dropped, or because it lives in a different column than I think?" List-shaped values use JSON array strings rather than comma-separated text to avoid escaping and ordering ambiguity.
|
||||
|
||||
## Conflict and update semantics
|
||||
|
||||
Two design decisions worth pinning down up front, because they constrain everything else:
|
||||
|
||||
- **Conflict.** Some table-level keys (`trace.conventions` lifted from `schema_url`, `metric.temporality`, ...) cannot represent the truth when a long-lived table sees rows from multiple sources. v1 records `mixed` or `unknown` rather than a fictitious single value. Downstream consumers must treat any single-valued semantic key as best-effort, not strong evidence.
|
||||
- **Update.** Semantic options are stamped at table creation. v1 does not specify an update path; promoting `metadata_quality` from `inferred` to `declared`, refreshing `resource.attributes_preserved`, or revising `trace.conventions` on later writes is deferred. If real usage shows update is needed, it lands as a separate RFC.
|
||||
|
||||
## `information_schema.semantic_tables`
|
||||
|
||||
A consumer's first SQL on connect:
|
||||
|
||||
```sql
|
||||
SELECT table_catalog, table_schema, table_name, signal_type, source, pipeline
|
||||
FROM information_schema.semantic_tables;
|
||||
```
|
||||
|
||||
returns one row per semantic-tagged table. The view exposes a stable set of core columns (`table_catalog`, `table_schema`, `table_name`, `signal_type`, `source`, `source_version`, `pipeline`) plus a `semantic_options` JSON column carrying the rest of the `greptime.semantic.*` keys verbatim. Future keys appear inside `semantic_options` without forcing a view-schema change; only widely-used keys are ever promoted to first-class columns.
|
||||
|
||||
# Implementation Plan
|
||||
|
||||
Four phases, each independently shippable.
|
||||
|
||||
1. **Identity.** Stamp `signal_type` and `source` on every auto-create path. The OTLP paths already have natural injection points; Prom remote write is the one non-trivial path because metric-engine logical tables share physical storage (see Open Question 2).
|
||||
2. **Metric specifics.** Add type / unit / temporality / monotonic / metadata_quality / original_name at OTel metric and Prom RW ingestion sites; the data is already at hand inside the OTel translator.
|
||||
3. **Resource / scope lineage.** Record what the OTel-to-Prometheus translation kept and dropped.
|
||||
4. **`information_schema.semantic_tables` view + documentation** as a stable user-facing contract.
|
||||
|
||||
# Relationship to OpenTelemetry standardisation
|
||||
|
||||
OTel today standardises what producers emit and how data collectors are managed; the read side — what a backend exposes back to clients — is deliberately left to vendors. OTLP is one-way; OpAMP is agent management; OTEP-0243 (App Telemetry Schema) is producer-side; `schema_url` is stated by the producer, with no read-side counterpart. Adjacent precedents — Prometheus `/api/v1/metadata`, Loki labels API, Tempo tags, Jaeger services, ad-hoc MCP servers — are all vendor-specific.
|
||||
|
||||
This is a real gap. The shape we propose locally (signal-agnostic, `schema_url`-aware, structured around a small vocabulary) is deliberately close to what a future upstream OTEP for a backend-catalog read API could look like, with Weaver's *Resolved Telemetry Schema* as the natural data model. We do not commit to driving such an OTEP here; we do commit to keeping the local shape close enough that a future upstream proposal does not force a breaking migration.
|
||||
|
||||
# Alternatives
|
||||
|
||||
- **New DDL syntax (`SEMANTIC trace WITH (...)`).** Cleaner-looking but non-standard and forces every client to learn it. The metadata is not interesting enough to justify a new keyword.
|
||||
- **Dedicated `_semantic` system table.** Doubles the storage path for what is static per-table KV and adds lifecycle questions (drop, backfill). A view over `table_options` covers the same access pattern.
|
||||
- **Column comments only.** Discovery (`WHERE signal_type = 'trace'`) becomes a full-text problem. Comments are good for column-level supplements, not for identity.
|
||||
- **Encode everything into the table name.** What we do today. Every new field becomes a new naming convention.
|
||||
|
||||
# Open Questions
|
||||
|
||||
1. **Namespace prefix.** `greptime.semantic.*` vs. bare `semantic.*`. v1 picks the vendored prefix; alias or migrate if a community standard later emerges.
|
||||
2. **Prom RW injection point.** Metric-engine logical tables share physical storage, so per-logical-table options need a hook that does not exist as cleanly as the OTLP trace branch. A short spike is needed before Phase 1 lands for Prom RW.
|
||||
3. **Mixed-type metric tables.** When ingestion modes that pack multiple metric types into one table appear, `metric.type` migrates from table-level to row-level. v1 leaves a `metric.type = 'mixed'` marker and punts.
|
||||
4. **Stability surface.** Top-level keys (`signal_type`, `source`) are stable; sub-namespaces (`metric.*`, ...) are evolving until v1.0 of the layer is declared.
|
||||
|
||||
# Future Work
|
||||
|
||||
- **Cross-table relationships.** Paired trace/services tables, metric/info pairing, JOIN hints. Its own RFC.
|
||||
- **Producer SDK/client identity.** An optional `greptime.semantic.source.sdk` key recording the emitting client (e.g. `opentelemetry-go`, `opentelemetry-java`, `opentelemetry-collector`). Because a single table can receive data from multiple SDKs (a shared trace table is the common case), mixed producers collapse to `mixed`, following the same conflict rule as the table-level keys above.
|
||||
- **Backfill** for tables created before this feature shipped.
|
||||
- **Upstream proposal.** Carry the shape into a community proposal — likely an OTEP for an OTLP-Catalog read API plus an MCP binding — informed by Greptime's local usage data.
|
||||
|
||||
# References
|
||||
|
||||
OpenTelemetry:
|
||||
- [OTLP specification](https://opentelemetry.io/docs/specs/otlp/)
|
||||
- [OTel Schemas (`schema_url`)](https://opentelemetry.io/docs/specs/otel/schemas/)
|
||||
- [Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/)
|
||||
- [OTEP-0243: App Telemetry Schema](https://github.com/open-telemetry/oteps/blob/main/text/0243-app-telemetry-schema-vision-roadmap.md)
|
||||
- [OpAMP specification](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md)
|
||||
- [Weaver: Resolved Telemetry Schema](https://github.com/open-telemetry/weaver)
|
||||
- [2025 Stability Proposal](https://opentelemetry.io/blog/2025/stability-proposal-announcement/)
|
||||
|
||||
Prometheus / OpenMetrics:
|
||||
- [Prometheus Remote Write 1.0](https://prometheus.io/docs/specs/prw/remote_write_spec/)
|
||||
- [Prometheus Remote Write 2.0](https://prometheus.io/docs/specs/prw/remote_write_spec_2_0/)
|
||||
- [Prometheus exposition formats](https://prometheus.io/docs/instrumenting/exposition_formats/)
|
||||
- [Prometheus HTTP API: `/api/v1/metadata`](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata)
|
||||
|
||||
Units and conventions:
|
||||
- [UCUM — Unified Code for Units of Measure](https://ucum.org/)
|
||||
|
||||
GreptimeDB:
|
||||
- [OTLP ingestion guide](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/)
|
||||
- [Trace data model](https://docs.greptime.com/user-guide/traces/data-model/)
|
||||
@@ -14,7 +14,9 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use common_meta::cache::{CacheContainer, Initializer, TableInfoCacheRef, TableNameCacheRef};
|
||||
use common_meta::cache::{
|
||||
CacheContainer, InitStrategy, Initializer, TableInfoCacheRef, TableNameCacheRef,
|
||||
};
|
||||
use common_meta::error::{Result as MetaResult, ValueNotExistSnafu};
|
||||
use common_meta::instruction::CacheIdent;
|
||||
use futures::future::BoxFuture;
|
||||
@@ -38,7 +40,14 @@ pub fn new_table_cache(
|
||||
) -> TableCache {
|
||||
let init = init_factory(table_info_cache, table_name_cache);
|
||||
|
||||
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
|
||||
CacheContainer::with_strategy(
|
||||
name,
|
||||
cache,
|
||||
Box::new(invalidator),
|
||||
init,
|
||||
filter,
|
||||
InitStrategy::VersionChecked,
|
||||
)
|
||||
}
|
||||
|
||||
fn init_factory(
|
||||
|
||||
@@ -79,7 +79,7 @@ impl App for Instance {
|
||||
}
|
||||
|
||||
async fn start(&mut self) -> Result<()> {
|
||||
plugins::start_datanode_plugins(self.datanode.plugins())
|
||||
plugins::start_datanode_plugins(&self.datanode)
|
||||
.await
|
||||
.context(StartDatanodeSnafu)?;
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ impl App for Instance {
|
||||
}
|
||||
|
||||
async fn start(&mut self) -> Result<()> {
|
||||
plugins::start_flownode_plugins(self.flownode.flow_engine().plugins().clone())
|
||||
plugins::start_flownode_plugins(&self.flownode)
|
||||
.await
|
||||
.context(StartFlownodeSnafu)?;
|
||||
|
||||
|
||||
@@ -95,8 +95,7 @@ impl App for Instance {
|
||||
}
|
||||
|
||||
async fn start(&mut self) -> Result<()> {
|
||||
let plugins = self.frontend.instance.plugins().clone();
|
||||
plugins::start_frontend_plugins(plugins)
|
||||
plugins::start_frontend_plugins(&self.frontend.instance)
|
||||
.await
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ impl App for Instance {
|
||||
}
|
||||
|
||||
async fn start(&mut self) -> Result<()> {
|
||||
plugins::start_metasrv_plugins(self.instance.plugins())
|
||||
plugins::start_metasrv_plugins(&self.instance)
|
||||
.await
|
||||
.context(StartMetaServerSnafu)?;
|
||||
|
||||
|
||||
@@ -164,7 +164,7 @@ impl App for Instance {
|
||||
.start(self.leader_services_context.clone())
|
||||
.await?;
|
||||
|
||||
plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
|
||||
plugins::start_frontend_plugins(&self.frontend.instance)
|
||||
.await
|
||||
.context(error::StartFrontendSnafu)?;
|
||||
|
||||
|
||||
@@ -114,6 +114,7 @@ fn test_load_frontend_example_config() {
|
||||
component: FrontendOptions {
|
||||
default_timezone: Some("UTC".to_string()),
|
||||
default_column_prefix: Some("greptime".to_string()),
|
||||
auto_create_table: true,
|
||||
meta_client: Some(MetaClientOptions {
|
||||
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
|
||||
timeout: Duration::from_secs(3),
|
||||
@@ -267,6 +268,7 @@ fn test_load_standalone_example_config() {
|
||||
component: StandaloneOptions {
|
||||
default_timezone: Some("UTC".to_string()),
|
||||
default_column_prefix: Some("greptime".to_string()),
|
||||
auto_create_table: true,
|
||||
wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig {
|
||||
dir: Some(format!("{}/{}", DEFAULT_DATA_HOME, WAL_DIR)),
|
||||
sync_period: Some(Duration::from_secs(10)),
|
||||
|
||||
@@ -33,6 +33,7 @@ datatypes.workspace = true
|
||||
futures.workspace = true
|
||||
lazy_static.workspace = true
|
||||
object-store.workspace = true
|
||||
object_store_opendal.workspace = true
|
||||
orc-rust = { version = "0.8", default-features = false, features = ["async"] }
|
||||
parquet.workspace = true
|
||||
paste.workspace = true
|
||||
|
||||
@@ -316,7 +316,7 @@ pub async fn file_to_stream(
|
||||
.with_file_compression_type(df_compression)
|
||||
.build();
|
||||
|
||||
let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
|
||||
let file_opener = config.file_source().create_file_opener(store, &config, 0)?;
|
||||
let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new())?;
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ struct Test<'a> {
|
||||
|
||||
impl Test<'_> {
|
||||
async fn run(self, store: &ObjectStore) {
|
||||
let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
|
||||
let file_opener = self
|
||||
.file_source
|
||||
.create_file_opener(store, &self.config, 0)
|
||||
|
||||
@@ -27,12 +27,14 @@ const ACCESS_KEY_ID: &str = "access_key_id";
|
||||
const ACCESS_KEY_SECRET: &str = "access_key_secret";
|
||||
const ROOT: &str = "root";
|
||||
const ALLOW_ANONYMOUS: &str = "allow_anonymous";
|
||||
const SKIP_SIGNATURE: &str = "skip_signature";
|
||||
|
||||
/// Check if the key is supported in OSS configuration.
|
||||
pub fn is_supported_in_oss(key: &str) -> bool {
|
||||
[
|
||||
ROOT,
|
||||
ALLOW_ANONYMOUS,
|
||||
SKIP_SIGNATURE,
|
||||
BUCKET,
|
||||
ENDPOINT,
|
||||
ACCESS_KEY_ID,
|
||||
@@ -61,18 +63,23 @@ pub fn build_oss_backend(
|
||||
builder = builder.access_key_secret(access_key_secret);
|
||||
}
|
||||
|
||||
if let Some(allow_anonymous) = connection.get(ALLOW_ANONYMOUS) {
|
||||
let allow = allow_anonymous.as_str().parse::<bool>().map_err(|e| {
|
||||
if let Some((key, value)) = connection
|
||||
.get(SKIP_SIGNATURE)
|
||||
.map(|value| (SKIP_SIGNATURE, value))
|
||||
.or_else(|| {
|
||||
connection
|
||||
.get(ALLOW_ANONYMOUS)
|
||||
.map(|value| (ALLOW_ANONYMOUS, value))
|
||||
})
|
||||
{
|
||||
let skip_signature = value.as_str().parse::<bool>().map_err(|e| {
|
||||
error::InvalidConnectionSnafu {
|
||||
msg: format!(
|
||||
"failed to parse the option {}={}, {}",
|
||||
ALLOW_ANONYMOUS, allow_anonymous, e
|
||||
),
|
||||
msg: format!("failed to parse the option {}={}, {}", key, value, e),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
if allow {
|
||||
builder = builder.allow_anonymous();
|
||||
if skip_signature {
|
||||
builder = builder.skip_signature();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,6 +100,7 @@ mod tests {
|
||||
fn test_is_supported_in_oss() {
|
||||
assert!(is_supported_in_oss(ROOT));
|
||||
assert!(is_supported_in_oss(ALLOW_ANONYMOUS));
|
||||
assert!(is_supported_in_oss(SKIP_SIGNATURE));
|
||||
assert!(is_supported_in_oss(BUCKET));
|
||||
assert!(is_supported_in_oss(ENDPOINT));
|
||||
assert!(is_supported_in_oss(ACCESS_KEY_ID));
|
||||
|
||||
@@ -103,7 +103,7 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
|
||||
test_util::TEST_BATCH_SIZE,
|
||||
schema.clone(),
|
||||
FileCompressionType::UNCOMPRESSED,
|
||||
Arc::new(object_store::compat::OpendalStore::new(store.clone())),
|
||||
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
|
||||
true,
|
||||
);
|
||||
|
||||
@@ -157,7 +157,7 @@ pub async fn setup_stream_to_csv_test(
|
||||
|
||||
let csv_opener = csv_source
|
||||
.create_file_opener(
|
||||
Arc::new(object_store::compat::OpendalStore::new(store.clone())),
|
||||
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
|
||||
&config,
|
||||
0,
|
||||
)
|
||||
|
||||
@@ -17,7 +17,7 @@ mod flow;
|
||||
mod registry;
|
||||
mod table;
|
||||
|
||||
pub use container::{CacheContainer, Initializer, Invalidator, TokenFilter};
|
||||
pub use container::{CacheContainer, InitStrategy, Initializer, Invalidator, TokenFilter};
|
||||
pub use flow::{TableFlownodeSetCache, TableFlownodeSetCacheRef, new_table_flownode_set_cache};
|
||||
pub use registry::{
|
||||
CacheRegistry, CacheRegistryBuilder, CacheRegistryRef, LayeredCacheRegistry,
|
||||
|
||||
@@ -437,11 +437,13 @@ pub fn defer_on_missing_source(flow_task: &CreateFlowTask) -> Result<bool> {
|
||||
pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> {
|
||||
for key in flow_task.flow_options.keys() {
|
||||
match key.as_str() {
|
||||
DEFER_ON_MISSING_SOURCE_KEY | FlowType::FLOW_TYPE_KEY => {}
|
||||
DEFER_ON_MISSING_SOURCE_KEY
|
||||
| FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY
|
||||
| FlowType::FLOW_TYPE_KEY => {}
|
||||
unknown => {
|
||||
return UnexpectedSnafu {
|
||||
err_msg: format!(
|
||||
"Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}"
|
||||
"Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}, {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}"
|
||||
),
|
||||
}
|
||||
.fail();
|
||||
@@ -487,6 +489,9 @@ pub enum FlowType {
|
||||
Streaming,
|
||||
}
|
||||
|
||||
/// Flow option key for toggling experimental incremental source reads on a
/// single flow. `validate_flow_options` accepts it as a known user option.
pub const FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY: &str =
|
||||
"experimental_enable_incremental_read";
|
||||
|
||||
impl FlowType {
|
||||
pub const BATCHING: &str = "batching";
|
||||
pub const STREAMING: &str = "streaming";
|
||||
|
||||
@@ -24,8 +24,9 @@ use table::table_name::TableName;
|
||||
|
||||
use crate::ddl::DdlContext;
|
||||
use crate::ddl::create_flow::{
|
||||
CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType,
|
||||
defer_on_missing_source,
|
||||
CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY,
|
||||
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType, defer_on_missing_source,
|
||||
validate_flow_options,
|
||||
};
|
||||
use crate::ddl::test_util::create_table::test_create_table_task;
|
||||
use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
|
||||
@@ -275,6 +276,22 @@ fn test_defer_on_missing_source_invalid_value() {
|
||||
);
|
||||
}
|
||||
|
||||
/// The incremental-read flow option must pass option validation.
#[test]
fn test_validate_flow_options_allows_incremental_read_option() {
    let sink_table = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
    let mut flow_task = test_create_flow_task("my_flow", vec![], sink_table, false);
    flow_task.flow_options.insert(
        FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_owned(),
        String::from("true"),
    );

    let validation = validate_flow_options(&flow_task);
    validation.unwrap();
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_create_flow_rejects_unknown_option_in_meta_task() {
|
||||
let mut task = test_create_flow_task(
|
||||
|
||||
@@ -29,6 +29,7 @@ datafusion-expr.workspace = true
|
||||
datatypes.workspace = true
|
||||
futures.workspace = true
|
||||
object-store.workspace = true
|
||||
object_store_opendal.workspace = true
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json.workspace = true
|
||||
snafu.workspace = true
|
||||
|
||||
@@ -61,7 +61,7 @@ fn build_record_batch_stream(
|
||||
.with_file_group(FileGroup::new(files))
|
||||
.build();
|
||||
|
||||
let store = Arc::new(object_store::compat::OpendalStore::new(
|
||||
let store = Arc::new(object_store_opendal::OpendalStore::new(
|
||||
scan_plan_config.store.clone(),
|
||||
));
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ use session::ReadPreference;
|
||||
mod checkpoint;
|
||||
pub(crate) mod engine;
|
||||
pub(crate) mod frontend_client;
|
||||
mod incremental_filter;
|
||||
mod state;
|
||||
mod table_creator;
|
||||
mod task;
|
||||
@@ -55,6 +54,10 @@ pub struct BatchingModeOptions {
|
||||
pub experimental_max_filter_num_per_query: usize,
|
||||
/// Time window merge distance
|
||||
pub experimental_time_window_merge_threshold: usize,
|
||||
/// Whether to enable experimental flow incremental source reads.
|
||||
///
|
||||
/// When disabled, batching flows always execute full-snapshot queries.
|
||||
pub experimental_enable_incremental_read: bool,
|
||||
/// Read preference of the Frontend client.
|
||||
pub read_preference: ReadPreference,
|
||||
/// TLS option for client connections to frontends.
|
||||
@@ -72,6 +75,7 @@ impl Default for BatchingModeOptions {
|
||||
experimental_frontend_scan_timeout: Duration::from_secs(30),
|
||||
experimental_max_filter_num_per_query: 20,
|
||||
experimental_time_window_merge_threshold: 3,
|
||||
experimental_enable_incremental_read: false,
|
||||
read_preference: Default::default(),
|
||||
frontend_tls: None,
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ use std::time::Duration;
|
||||
use api::v1::flow::DirtyWindowRequests;
|
||||
use catalog::CatalogManagerRef;
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::ddl::create_flow::FlowType;
|
||||
use common_meta::ddl::create_flow::{FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType};
|
||||
use common_meta::key::TableMetadataManagerRef;
|
||||
use common_meta::key::flow::FlowMetadataManagerRef;
|
||||
use common_meta::key::flow::flow_state::FlowStat;
|
||||
@@ -38,6 +38,7 @@ use session::context::QueryContext;
|
||||
use snafu::{OptionExt, ResultExt, ensure};
|
||||
use sql::parsers::utils::is_tql;
|
||||
use store_api::metric_engine_consts::is_metric_engine_internal_column;
|
||||
use store_api::mito_engine_options::APPEND_MODE_KEY;
|
||||
use store_api::storage::{RegionId, TableId};
|
||||
use table::table_reference::TableReference;
|
||||
use tokio::sync::{RwLock, oneshot};
|
||||
@@ -428,6 +429,55 @@ async fn get_table_info(
|
||||
}
|
||||
|
||||
impl BatchingEngine {
|
||||
fn batch_opts_for_flow_options(
|
||||
&self,
|
||||
flow_options: &HashMap<String, String>,
|
||||
) -> Result<Arc<BatchingModeOptions>, Error> {
|
||||
let mut batch_opts = (*self.batch_opts).clone();
|
||||
if let Some(enable_incremental_read) =
|
||||
flow_options.get(FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY)
|
||||
{
|
||||
batch_opts.experimental_enable_incremental_read = enable_incremental_read
|
||||
.parse::<bool>()
|
||||
.map_err(|_| {
|
||||
InvalidQuerySnafu {
|
||||
reason: format!(
|
||||
"Invalid flow option {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}: {enable_incremental_read}"
|
||||
),
|
||||
}
|
||||
.build()
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Arc::new(batch_opts))
|
||||
}
|
||||
|
||||
fn table_options_enable_append_mode(extra_options: &HashMap<String, String>) -> bool {
|
||||
extra_options
|
||||
.get(APPEND_MODE_KEY)
|
||||
.is_some_and(|value| value.eq_ignore_ascii_case("true"))
|
||||
}
|
||||
|
||||
fn ensure_incremental_source_append_only(
|
||||
batch_opts: &BatchingModeOptions,
|
||||
table_name: &[String; 3],
|
||||
extra_options: &HashMap<String, String>,
|
||||
) -> Result<(), Error> {
|
||||
if batch_opts.experimental_enable_incremental_read {
|
||||
ensure!(
|
||||
Self::table_options_enable_append_mode(extra_options),
|
||||
UnsupportedSnafu {
|
||||
reason: format!(
|
||||
"Flow incremental read requires append-only source table, but source table `{}` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read",
|
||||
table_name.join(".")
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn create_flow_inner(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
|
||||
let CreateFlowArgs {
|
||||
flow_id,
|
||||
@@ -494,6 +544,8 @@ impl BatchingEngine {
|
||||
}
|
||||
);
|
||||
|
||||
let batch_opts = self.batch_opts_for_flow_options(&flow_options)?;
|
||||
|
||||
let mut source_table_names = Vec::with_capacity(2);
|
||||
for src_id in source_table_ids {
|
||||
// also check table option to see if ttl!=instant
|
||||
@@ -509,6 +561,11 @@ impl BatchingEngine {
|
||||
),
|
||||
}
|
||||
);
|
||||
Self::ensure_incremental_source_append_only(
|
||||
&batch_opts,
|
||||
&table_name,
|
||||
&table_info.table_info.meta.options.extra_options,
|
||||
)?;
|
||||
|
||||
source_table_names.push(table_name);
|
||||
}
|
||||
@@ -563,7 +620,7 @@ impl BatchingEngine {
|
||||
query_ctx,
|
||||
catalog_manager: self.catalog_manager.clone(),
|
||||
shutdown_rx: rx,
|
||||
batch_opts: self.batch_opts.clone(),
|
||||
batch_opts,
|
||||
flow_eval_interval: eval_interval.map(|secs| Duration::from_secs(secs as u64)),
|
||||
};
|
||||
|
||||
@@ -808,7 +865,7 @@ impl BatchingEngine {
|
||||
});
|
||||
|
||||
let res = task
|
||||
.gen_exec_once(
|
||||
.execute_once_serialized(
|
||||
&self.query_engine,
|
||||
&self.frontend_client,
|
||||
cur_dirty_window_cnt,
|
||||
@@ -946,6 +1003,76 @@ mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flow_option_overrides_incremental_read_switch() {
|
||||
let engine = new_test_engine().await;
|
||||
|
||||
let default_opts = engine.batch_opts_for_flow_options(&HashMap::new()).unwrap();
|
||||
assert!(!default_opts.experimental_enable_incremental_read);
|
||||
|
||||
let enabled_opts = engine
|
||||
.batch_opts_for_flow_options(&HashMap::from([(
|
||||
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
|
||||
"true".to_string(),
|
||||
)]))
|
||||
.unwrap();
|
||||
assert!(enabled_opts.experimental_enable_incremental_read);
|
||||
}
|
||||
|
||||
/// Append mode is detected only for an explicit, case-insensitive `true`.
#[test]
fn test_table_options_enable_append_mode() {
    // Builds table options that carry just the append-mode key.
    let with_append_mode =
        |value: &str| HashMap::from([(APPEND_MODE_KEY.to_string(), value.to_string())]);

    let missing = HashMap::new();
    assert!(!BatchingEngine::table_options_enable_append_mode(&missing));

    let disabled = with_append_mode("false");
    assert!(!BatchingEngine::table_options_enable_append_mode(&disabled));

    let enabled_upper = with_append_mode("TRUE");
    assert!(BatchingEngine::table_options_enable_append_mode(&enabled_upper));
}
|
||||
|
||||
/// The append-only requirement applies only when incremental read is enabled.
#[test]
fn test_incremental_source_append_only_enforcement() {
    let table_name = ["greptime", "public", "numbers"].map(String::from);
    let incremental_off = BatchingModeOptions::default();
    let incremental_on = BatchingModeOptions {
        experimental_enable_incremental_read: true,
        ..Default::default()
    };
    let plain_table = HashMap::new();
    let append_only_table = HashMap::from([(APPEND_MODE_KEY.to_string(), "true".to_string())]);

    let check = |opts: &BatchingModeOptions, table_options: &HashMap<String, String>| {
        BatchingEngine::ensure_incremental_source_append_only(opts, &table_name, table_options)
    };

    check(&incremental_off, &plain_table)
        .expect("disabled incremental read should not require append-only source");
    check(&incremental_on, &append_only_table)
        .expect("append-only source should be accepted when incremental read is enabled");

    let err = check(&incremental_on, &plain_table)
        .expect_err("non-append source should be rejected when incremental read is enabled");
    let message = err.to_string();
    assert!(
        message.contains("Flow incremental read requires append-only source table"),
        "{err}"
    );
}
|
||||
|
||||
async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
|
||||
@@ -1,222 +0,0 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use common_telemetry::tracing::debug;
|
||||
use datafusion_expr::Expr;
|
||||
use datatypes::schema::Schema;
|
||||
|
||||
use crate::batching_mode::state::FilterExprInfo;
|
||||
use crate::batching_mode::utils::IncrementalAggregateAnalysis;
|
||||
use crate::{Error, FlowId};
|
||||
|
||||
pub(super) fn build_sink_dirty_time_window_filter_expr(
|
||||
flow_id: FlowId,
|
||||
analysis: &IncrementalAggregateAnalysis,
|
||||
sink_schema: &Schema,
|
||||
dirty_filter: Option<&FilterExprInfo>,
|
||||
) -> Result<Option<Expr>, Error> {
|
||||
let Some(dirty_filter) = dirty_filter else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let Some(sink_filter_col) =
|
||||
infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, dirty_filter)
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
dirty_filter.predicate_for_col(&sink_filter_col)
|
||||
}
|
||||
|
||||
fn infer_sink_time_window_filter_col(
|
||||
flow_id: FlowId,
|
||||
analysis: &IncrementalAggregateAnalysis,
|
||||
sink_schema: &Schema,
|
||||
dirty_filter: &FilterExprInfo,
|
||||
) -> Option<String> {
|
||||
if analysis.group_key_names.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let is_timestamp_group_key = |name: &str| {
|
||||
analysis.group_key_names.iter().any(|key| key == name)
|
||||
&& sink_schema
|
||||
.column_schema_by_name(name)
|
||||
.is_some_and(|col| col.data_type.is_timestamp())
|
||||
};
|
||||
|
||||
if is_timestamp_group_key(&dirty_filter.col_name) {
|
||||
return Some(dirty_filter.col_name.clone());
|
||||
}
|
||||
|
||||
let candidates = analysis
|
||||
.group_key_names
|
||||
.iter()
|
||||
.filter(|name| is_timestamp_group_key(name))
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
match candidates.as_slice() {
|
||||
[name] => Some(name.clone()),
|
||||
[] => {
|
||||
debug!(
|
||||
"Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}",
|
||||
flow_id, analysis.group_key_names
|
||||
);
|
||||
None
|
||||
}
|
||||
_ => {
|
||||
debug!(
|
||||
"Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}",
|
||||
flow_id, candidates
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use datatypes::prelude::ConcreteDataType;
    use datatypes::schema::ColumnSchema;
    use pretty_assertions::assert_eq;

    use super::*;
    use crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL;
    use crate::batching_mode::state::FilterExprInfo;
    use crate::batching_mode::utils::IncrementalAggregateAnalysis;

    /// Builds an analysis that carries only the given group-by key names.
    fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis {
        let group_key_names = group_key_names.into_iter().map(String::from).collect();
        IncrementalAggregateAnalysis {
            group_key_names,
            merge_columns: vec![],
            literal_columns: vec![],
            output_field_names: vec![],
            unsupported_exprs: vec![],
        }
    }

    /// Builds a dirty filter on `col_name` with no time ranges.
    fn test_dirty_filter(col_name: &str) -> FilterExprInfo {
        FilterExprInfo {
            expr: datafusion_expr::col(col_name),
            col_name: col_name.to_owned(),
            time_ranges: Vec::new(),
            window_size: chrono::Duration::seconds(1),
        }
    }

    /// Builds a sink schema of nullable columns from `(name, type)` pairs.
    fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema {
        let mut column_schemas = Vec::with_capacity(columns.len());
        for (name, data_type) in columns {
            column_schemas.push(ColumnSchema::new(name, data_type, true));
        }
        Schema::new(column_schemas)
    }

    /// Shorthand for the millisecond timestamp type used by every case.
    fn ts_ms() -> ConcreteDataType {
        ConcreteDataType::timestamp_millisecond_datatype()
    }

    /// Runs the inference for flow id `1` over the described fixture.
    fn infer_col(
        group_keys: Vec<&str>,
        sink_columns: Vec<(&str, ConcreteDataType)>,
        dirty_col: &str,
    ) -> Option<String> {
        let analysis = test_analysis_with_group_keys(group_keys);
        let sink_schema = test_sink_schema(sink_columns);
        let dirty_filter = test_dirty_filter(dirty_col);
        infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
    }

    #[test]
    fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() {
        let inferred = infer_col(
            vec!["ts", "host"],
            vec![("ts", ts_ms()), ("host", ConcreteDataType::string_datatype())],
            "ts",
        );

        assert_eq!(Some("ts".to_string()), inferred);
    }

    #[test]
    fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() {
        let inferred = infer_col(
            vec!["host", "time_window"],
            vec![
                ("host", ConcreteDataType::string_datatype()),
                ("time_window", ts_ms()),
                (AUTO_CREATED_UPDATE_AT_TS_COL, ts_ms()),
            ],
            "ts",
        );

        assert_eq!(Some("time_window".to_string()), inferred);
    }

    #[test]
    fn test_infer_sink_time_window_filter_col_skips_global_aggregate() {
        let inferred = infer_col(
            vec![],
            vec![
                ("number", ConcreteDataType::uint32_datatype()),
                ("time_window", ts_ms()),
            ],
            "ts",
        );

        assert_eq!(None, inferred);
    }

    #[test]
    fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() {
        let inferred = infer_col(
            vec!["host", "device"],
            vec![
                ("host", ConcreteDataType::string_datatype()),
                ("device", ConcreteDataType::string_datatype()),
                (AUTO_CREATED_UPDATE_AT_TS_COL, ts_ms()),
            ],
            "ts",
        );

        assert_eq!(None, inferred);
    }

    #[test]
    fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() {
        let inferred = infer_col(
            vec!["ts", "time_window"],
            vec![("ts", ts_ms()), ("time_window", ts_ms())],
            "source_ts",
        );

        assert_eq!(None, inferred);
    }
}
|
||||
@@ -66,12 +66,20 @@ pub struct TaskState {
|
||||
}
|
||||
impl TaskState {
|
||||
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
|
||||
Self::with_dirty_time_windows(query_ctx, shutdown_rx, DirtyTimeWindows::default())
|
||||
}
|
||||
|
||||
pub fn with_dirty_time_windows(
|
||||
query_ctx: QueryContextRef,
|
||||
shutdown_rx: oneshot::Receiver<()>,
|
||||
dirty_time_windows: DirtyTimeWindows,
|
||||
) -> Self {
|
||||
Self {
|
||||
query_ctx,
|
||||
last_update_time: Instant::now(),
|
||||
last_query_duration: Duration::from_secs(0),
|
||||
last_exec_time_millis: None,
|
||||
dirty_time_windows: Default::default(),
|
||||
dirty_time_windows,
|
||||
checkpoint_mode: CheckpointMode::FullSnapshot,
|
||||
checkpoints: Default::default(),
|
||||
incremental_disabled: false,
|
||||
@@ -264,6 +272,16 @@ impl DirtyTimeWindows {
|
||||
time_window_merge_threshold,
|
||||
}
|
||||
}
|
||||
|
||||
/// Test-only accessor returning the configured `max_filter_num_per_query`.
#[cfg(test)]
|
||||
pub(crate) fn max_filter_num_per_query(&self) -> usize {
|
||||
self.max_filter_num_per_query
|
||||
}
|
||||
|
||||
/// Test-only accessor returning the configured `time_window_merge_threshold`.
#[cfg(test)]
|
||||
pub(crate) fn time_window_merge_threshold(&self) -> usize {
|
||||
self.time_window_merge_threshold
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DirtyTimeWindows {
|
||||
@@ -681,7 +699,7 @@ impl DirtyTimeWindows {
|
||||
}
|
||||
}
|
||||
|
||||
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
|
||||
pub(crate) fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
|
||||
let value = Value::from(value);
|
||||
let value = value
|
||||
.try_to_scalar_value(&value.data_type())
|
||||
|
||||
@@ -27,7 +27,7 @@ use datafusion::datasource::DefaultTableSource;
|
||||
use datafusion::sql::unparser::expr_to_sql;
|
||||
use datafusion_common::DFSchemaRef;
|
||||
use datafusion_common::tree_node::{Transformed, TreeNode};
|
||||
use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp};
|
||||
use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp, col, lit};
|
||||
use datatypes::schema::Schema;
|
||||
use query::QueryEngineRef;
|
||||
use query::options::FLOW_INCREMENTAL_MODE;
|
||||
@@ -38,14 +38,16 @@ use sql::parsers::utils::is_tql;
|
||||
use store_api::mito_engine_options::MERGE_MODE_KEY;
|
||||
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
|
||||
use table::table::adapter::DfTableProviderAdapter;
|
||||
use tokio::sync::oneshot;
|
||||
use tokio::sync::oneshot::error::TryRecvError;
|
||||
use tokio::sync::{Mutex, oneshot};
|
||||
use tokio::time::Instant;
|
||||
|
||||
use crate::batching_mode::BatchingModeOptions;
|
||||
use crate::batching_mode::checkpoint::checkpoint_mode_label;
|
||||
use crate::batching_mode::frontend_client::{FrontendClient, PeerDesc};
|
||||
use crate::batching_mode::state::{CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState};
|
||||
use crate::batching_mode::state::{
|
||||
CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState, to_df_literal,
|
||||
};
|
||||
use crate::batching_mode::table_creator::{QueryType, create_table_with_expr};
|
||||
use crate::batching_mode::time_window::TimeWindowExpr;
|
||||
use crate::batching_mode::utils::{
|
||||
@@ -67,12 +69,6 @@ use crate::{Error, FlowId};
|
||||
mod ckpt;
|
||||
mod inc;
|
||||
|
||||
/// Maximum number of dirty time-window predicates attached to one incremental
|
||||
/// SQL query. This keeps generated OR filters bounded so Substrait encoding and
|
||||
/// downstream planning remain predictable; if the backlog is larger, the flow
|
||||
/// drains one capped batch and postpones checkpoint advancement to a later run.
|
||||
const MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS: usize = 4096;
|
||||
|
||||
/// The task's config, immutable once created
|
||||
#[derive(Clone)]
|
||||
pub struct TaskConfig {
|
||||
@@ -113,6 +109,10 @@ fn is_merge_mode_last_non_null(options: &HashMap<String, String>) -> bool {
|
||||
pub struct BatchingTask {
|
||||
/// Task configuration; `TaskConfig` is immutable once created.
pub config: Arc<TaskConfig>,
|
||||
/// Runtime state of the task, shared behind an `RwLock`.
pub state: Arc<RwLock<TaskState>>,
|
||||
/// Serializes plan generation, execution, checkpoint advancement, and dirty
|
||||
/// window restoration for this flow. Without this, a manual flush and the
|
||||
/// background loop can process the same checkpoint range concurrently.
|
||||
execution_lock: Arc<Mutex<()>>,
|
||||
}
|
||||
|
||||
/// Arguments for creating batching task
|
||||
@@ -150,6 +150,16 @@ pub enum DirtyRestore {
|
||||
Unscoped(DirtyTimeWindows),
|
||||
}
|
||||
|
||||
/// Outcome of one flow evaluation: the generated query, if any, together
/// with the result of generating and executing it.
struct ExecuteOnceOutcome {
|
||||
/// The generated insert plan. `None` when plan generation failed or when
/// no query was generated.
new_query: Option<PlanInfo>,
|
||||
/// Execution result of the generated insert plan.
|
||||
///
|
||||
/// `Ok(Some((affected_rows, elapsed)))` means a query was executed.
|
||||
/// `Ok(None)` means no query was generated because there was no dirty signal.
|
||||
/// `Err(_)` means plan generation or execution failed.
|
||||
result: Result<Option<(usize, Duration)>, Error>,
|
||||
}
|
||||
|
||||
impl BatchingTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn try_new(
|
||||
@@ -168,6 +178,18 @@ impl BatchingTask {
|
||||
flow_eval_interval,
|
||||
}: TaskArgs<'_>,
|
||||
) -> Result<Self, Error> {
|
||||
let mut state = TaskState::with_dirty_time_windows(
|
||||
query_ctx.clone(),
|
||||
shutdown_rx,
|
||||
DirtyTimeWindows::new(
|
||||
batch_opts.experimental_max_filter_num_per_query,
|
||||
batch_opts.experimental_time_window_merge_threshold,
|
||||
),
|
||||
);
|
||||
if !batch_opts.experimental_enable_incremental_read {
|
||||
state.disable_incremental();
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
config: Arc::new(TaskConfig {
|
||||
flow_id,
|
||||
@@ -182,7 +204,8 @@ impl BatchingTask {
|
||||
batch_opts,
|
||||
flow_eval_interval,
|
||||
}),
|
||||
state: Arc::new(RwLock::new(TaskState::new(query_ctx, shutdown_rx))),
|
||||
state: Arc::new(RwLock::new(state)),
|
||||
execution_lock: Arc::new(Mutex::new(())),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -251,40 +274,75 @@ impl BatchingTask {
|
||||
.context(ExternalSnafu)
|
||||
}
|
||||
|
||||
pub async fn gen_exec_once(
|
||||
pub(crate) async fn execute_once_serialized(
|
||||
&self,
|
||||
engine: &QueryEngineRef,
|
||||
frontend_client: &Arc<FrontendClient>,
|
||||
max_window_cnt: Option<usize>,
|
||||
) -> Result<Option<(usize, Duration)>, Error> {
|
||||
if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? {
|
||||
let outcome = self
|
||||
.execute_once_serialized_with_outcome(engine, frontend_client, max_window_cnt)
|
||||
.await;
|
||||
outcome.result
|
||||
}
|
||||
|
||||
/// Executes one flow evaluation under `execution_lock` and keeps the
|
||||
/// generated query context for the background loop's error logging/backoff.
|
||||
async fn execute_once_serialized_with_outcome(
|
||||
&self,
|
||||
engine: &QueryEngineRef,
|
||||
frontend_client: &Arc<FrontendClient>,
|
||||
max_window_cnt: Option<usize>,
|
||||
) -> ExecuteOnceOutcome {
|
||||
let _execution_guard = self.execution_lock.lock().await;
|
||||
self.execute_once_unlocked(engine, frontend_client, max_window_cnt)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Executes one flow evaluation. Caller must hold `execution_lock`.
|
||||
async fn execute_once_unlocked(
|
||||
&self,
|
||||
engine: &QueryEngineRef,
|
||||
frontend_client: &Arc<FrontendClient>,
|
||||
max_window_cnt: Option<usize>,
|
||||
) -> ExecuteOnceOutcome {
|
||||
let new_query = match self.gen_insert_plan_unlocked(engine, max_window_cnt).await {
|
||||
Ok(new_query) => new_query,
|
||||
Err(err) => {
|
||||
return ExecuteOnceOutcome {
|
||||
new_query: None,
|
||||
result: Err(err),
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(new_query) = new_query {
|
||||
debug!("Generate new query: {}", new_query.plan);
|
||||
let dirty_filter = match &new_query.dirty_restore {
|
||||
DirtyRestore::Scoped(f) => Some(f),
|
||||
_ => None,
|
||||
};
|
||||
match self
|
||||
.execute_logical_plan(
|
||||
let res = self
|
||||
.execute_logical_plan_unlocked(
|
||||
frontend_client,
|
||||
&new_query.plan,
|
||||
dirty_filter,
|
||||
new_query.can_advance_checkpoints,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => Ok(result),
|
||||
Err(err) => {
|
||||
self.handle_executed_query_failure(Some(&new_query));
|
||||
Err(err)
|
||||
}
|
||||
.await;
|
||||
if res.is_err() {
|
||||
self.handle_executed_query_failure(Some(&new_query));
|
||||
}
|
||||
ExecuteOnceOutcome {
|
||||
new_query: Some(new_query),
|
||||
result: res,
|
||||
}
|
||||
} else {
|
||||
debug!("Generate no query");
|
||||
Ok(None)
|
||||
ExecuteOnceOutcome {
|
||||
new_query: None,
|
||||
result: Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn gen_insert_plan(
|
||||
/// Generates the insert plan. Caller must reach this through the serialized path.
|
||||
async fn gen_insert_plan_unlocked(
|
||||
&self,
|
||||
engine: &QueryEngineRef,
|
||||
max_window_cnt: Option<usize>,
|
||||
@@ -388,11 +446,11 @@ impl BatchingTask {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn execute_logical_plan(
|
||||
/// Executes the insert plan. Caller must reach this through the serialized path.
|
||||
async fn execute_logical_plan_unlocked(
|
||||
&self,
|
||||
frontend_client: &Arc<FrontendClient>,
|
||||
plan: &LogicalPlan,
|
||||
dirty_filter: Option<&FilterExprInfo>,
|
||||
can_advance_checkpoints: bool,
|
||||
) -> Result<Option<(usize, Duration)>, Error> {
|
||||
let instant = Instant::now();
|
||||
@@ -426,8 +484,7 @@ impl BatchingTask {
|
||||
// For incremental-mode SQL queries, attempt to rewrite the delta aggregate
|
||||
// plan into a safe delta-LEFT-JOIN-sink form before deciding on extensions.
|
||||
let incremental_plan = if can_advance_checkpoints {
|
||||
self.prepare_plan_for_incremental(&plan, dirty_filter)
|
||||
.await?
|
||||
self.prepare_plan_for_incremental(&plan).await?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -580,6 +637,112 @@ impl BatchingTask {
|
||||
})
|
||||
}
|
||||
|
||||
fn restore_unscoped_dirty_windows(&self, dirty_windows: &DirtyTimeWindows) {
|
||||
self.state
|
||||
.write()
|
||||
.unwrap()
|
||||
.dirty_time_windows
|
||||
.add_dirty_windows(dirty_windows);
|
||||
}
|
||||
|
||||
fn restore_unscoped_dirty_windows_on_err<T>(
|
||||
&self,
|
||||
dirty_windows: &DirtyTimeWindows,
|
||||
result: Result<T, Error>,
|
||||
) -> Result<T, Error> {
|
||||
result.inspect_err(|_| {
|
||||
self.restore_unscoped_dirty_windows(dirty_windows);
|
||||
})
|
||||
}
|
||||
|
||||
fn drain_dirty_windows_signal(&self) -> (bool, DirtyTimeWindows) {
|
||||
let mut state = self.state.write().unwrap();
|
||||
let dirty_windows_to_restore = state.dirty_time_windows.clone();
|
||||
let is_dirty = !dirty_windows_to_restore.is_empty();
|
||||
state.dirty_time_windows.clean();
|
||||
(is_dirty, dirty_windows_to_restore)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn gen_unfiltered_plan_info(
|
||||
&self,
|
||||
engine: QueryEngineRef,
|
||||
query_ctx: QueryContextRef,
|
||||
sink_table_schema: Arc<Schema>,
|
||||
primary_key_indices: &[usize],
|
||||
allow_partial: bool,
|
||||
dirty_windows_to_restore: DirtyTimeWindows,
|
||||
retention_filter: Option<(&str, Timestamp, &'static str)>,
|
||||
) -> Result<PlanInfo, Error> {
|
||||
let mut plan = self.restore_unscoped_dirty_windows_on_err(
|
||||
&dirty_windows_to_restore,
|
||||
gen_plan_with_matching_schema(
|
||||
&self.config.query,
|
||||
query_ctx,
|
||||
engine,
|
||||
sink_table_schema,
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
)
|
||||
.await,
|
||||
)?;
|
||||
|
||||
if let Some((col_name, lower_bound, context)) = retention_filter {
|
||||
let lower = self.restore_unscoped_dirty_windows_on_err(
|
||||
&dirty_windows_to_restore,
|
||||
to_df_literal(lower_bound),
|
||||
)?;
|
||||
let retention_filter = col(col_name).gt_eq(lit(lower));
|
||||
let mut add_filter = AddFilterRewriter::new(retention_filter);
|
||||
plan = self.restore_unscoped_dirty_windows_on_err(
|
||||
&dirty_windows_to_restore,
|
||||
plan.clone()
|
||||
.rewrite(&mut add_filter)
|
||||
.with_context(|_| DatafusionSnafu {
|
||||
context: format!(
|
||||
"Failed to apply {context} expire_after filter to plan:\n {}\n",
|
||||
plan
|
||||
),
|
||||
})
|
||||
.map(|rewrite| rewrite.data),
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(PlanInfo {
|
||||
plan,
|
||||
dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
|
||||
can_advance_checkpoints: true,
|
||||
})
|
||||
}
|
||||
|
||||
async fn gen_unfiltered_plan_info_if_dirty(
|
||||
&self,
|
||||
engine: QueryEngineRef,
|
||||
query_ctx: QueryContextRef,
|
||||
sink_table_schema: Arc<Schema>,
|
||||
primary_key_indices: &[usize],
|
||||
allow_partial: bool,
|
||||
retention_filter: Option<(&str, Timestamp, &'static str)>,
|
||||
) -> Result<Option<PlanInfo>, Error> {
|
||||
let (is_dirty, dirty_windows_to_restore) = self.drain_dirty_windows_signal();
|
||||
if !is_dirty {
|
||||
debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
self.gen_unfiltered_plan_info(
|
||||
engine,
|
||||
query_ctx,
|
||||
sink_table_schema,
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
dirty_windows_to_restore,
|
||||
retention_filter,
|
||||
)
|
||||
.await
|
||||
.map(Some)
|
||||
}
|
||||
|
||||
fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) {
|
||||
if let Some(query) = query {
|
||||
self.restore_dirty_windows_after_failure(query);
|
||||
@@ -626,33 +789,11 @@ impl BatchingTask {
|
||||
|
||||
let min_refresh = self.config.batch_opts.experimental_min_refresh_duration;
|
||||
|
||||
let new_query = match self.gen_insert_plan(&engine, max_window_cnt).await {
|
||||
Ok(new_query) => new_query,
|
||||
Err(err) => {
|
||||
common_telemetry::error!(err; "Failed to generate query for flow={}", self.config.flow_id);
|
||||
// also sleep for a little while before try again to prevent flooding logs
|
||||
tokio::time::sleep(min_refresh).await;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let outcome = self
|
||||
.execute_once_serialized_with_outcome(&engine, &frontend_client, max_window_cnt)
|
||||
.await;
|
||||
|
||||
let res = if let Some(new_query) = &new_query {
|
||||
let dirty_filter = match &new_query.dirty_restore {
|
||||
DirtyRestore::Scoped(f) => Some(f),
|
||||
_ => None,
|
||||
};
|
||||
self.execute_logical_plan(
|
||||
&frontend_client,
|
||||
&new_query.plan,
|
||||
dirty_filter,
|
||||
new_query.can_advance_checkpoints,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
Ok(None)
|
||||
};
|
||||
|
||||
match res {
|
||||
match outcome.result {
|
||||
// normal execute, sleep for some time before doing next query
|
||||
Ok(Some(_)) => {
|
||||
// can increase max_window_cnt to query more windows next time
|
||||
@@ -703,11 +844,10 @@ impl BatchingTask {
|
||||
}
|
||||
// TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed
|
||||
Err(err) => {
|
||||
self.handle_executed_query_failure(new_query.as_ref());
|
||||
METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT
|
||||
.with_label_values(&[&flow_id_str])
|
||||
.inc();
|
||||
match new_query {
|
||||
match outcome.new_query {
|
||||
Some(query) => {
|
||||
common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan);
|
||||
// TODO(discord9): add some backoff here? half the query time window or what
|
||||
@@ -743,6 +883,20 @@ impl BatchingTask {
|
||||
create_table_with_expr(&plan, &self.config.sink_table_name, &self.config.query_type)
|
||||
}
|
||||
|
||||
fn should_use_unfiltered_incremental_delta(&self) -> bool {
|
||||
let state = self.state.read().unwrap();
|
||||
state.checkpoint_mode() == CheckpointMode::Incremental
|
||||
&& !state.is_incremental_disabled()
|
||||
&& matches!(self.config.query_type, QueryType::Sql)
|
||||
}
|
||||
|
||||
fn should_use_unfiltered_full_snapshot_seeding(&self) -> bool {
|
||||
let state = self.state.read().unwrap();
|
||||
state.checkpoint_mode() == CheckpointMode::FullSnapshot
|
||||
&& !state.is_incremental_disabled()
|
||||
&& matches!(self.config.query_type, QueryType::Sql)
|
||||
}
|
||||
|
||||
/// will merge and use the first ten time window in query
|
||||
async fn gen_query_with_time_window(
|
||||
&self,
|
||||
@@ -783,83 +937,35 @@ impl BatchingTask {
|
||||
self.config.flow_id
|
||||
);
|
||||
// clean dirty time window too, this could be from create flow's check_execute
|
||||
let (is_dirty, dirty_windows_to_restore) = {
|
||||
let mut state = self.state.write().unwrap();
|
||||
let dirty_windows_to_restore = state.dirty_time_windows.clone();
|
||||
let is_dirty = !dirty_windows_to_restore.is_empty();
|
||||
state.dirty_time_windows.clean();
|
||||
(is_dirty, dirty_windows_to_restore)
|
||||
};
|
||||
|
||||
if !is_dirty {
|
||||
// no dirty data, hence no need to update
|
||||
debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let plan = match gen_plan_with_matching_schema(
|
||||
&self.config.query,
|
||||
query_ctx,
|
||||
engine,
|
||||
sink_table_schema.clone(),
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(plan) => plan,
|
||||
Err(err) => {
|
||||
self.state
|
||||
.write()
|
||||
.unwrap()
|
||||
.dirty_time_windows
|
||||
.add_dirty_windows(&dirty_windows_to_restore);
|
||||
return Err(err);
|
||||
}
|
||||
};
|
||||
|
||||
return Ok(Some(PlanInfo {
|
||||
plan,
|
||||
dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
|
||||
can_advance_checkpoints: true,
|
||||
}));
|
||||
return self
|
||||
.gen_unfiltered_plan_info_if_dirty(
|
||||
engine,
|
||||
query_ctx,
|
||||
sink_table_schema.clone(),
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
_ => {
|
||||
// Clean dirty windows for full-query/non-scoped paths,
|
||||
// such as TQL, that cannot use a time-window filter.
|
||||
let dirty_windows_to_restore = {
|
||||
let mut state = self.state.write().unwrap();
|
||||
let dirty_windows_to_restore = state.dirty_time_windows.clone();
|
||||
state.dirty_time_windows.clean();
|
||||
dirty_windows_to_restore
|
||||
};
|
||||
let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal();
|
||||
|
||||
let plan = match gen_plan_with_matching_schema(
|
||||
&self.config.query,
|
||||
query_ctx,
|
||||
engine,
|
||||
sink_table_schema.clone(),
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(plan) => plan,
|
||||
Err(err) => {
|
||||
self.state
|
||||
.write()
|
||||
.unwrap()
|
||||
.dirty_time_windows
|
||||
.add_dirty_windows(&dirty_windows_to_restore);
|
||||
return Err(err);
|
||||
}
|
||||
};
|
||||
let plan_info = self
|
||||
.gen_unfiltered_plan_info(
|
||||
engine,
|
||||
query_ctx,
|
||||
sink_table_schema.clone(),
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
dirty_windows_to_restore,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
return Ok(Some(PlanInfo {
|
||||
plan,
|
||||
dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
|
||||
can_advance_checkpoints: true,
|
||||
}));
|
||||
return Ok(Some(plan_info));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -889,22 +995,61 @@ impl BatchingTask {
|
||||
),
|
||||
})?;
|
||||
|
||||
if self.should_use_unfiltered_full_snapshot_seeding() {
|
||||
// A full-snapshot query that can seed/refresh incremental
|
||||
// checkpoints must not use dirty-window predicates. Rows can be
|
||||
// written after dirty windows are drained but before the source scan
|
||||
// snapshot opens; a stale dirty-window filter could exclude those
|
||||
// rows while the returned watermark includes them, causing the next
|
||||
// incremental read to skip them forever. Execute an unfiltered full
|
||||
// snapshot instead, and keep dirty windows only as the scheduling and
|
||||
// failure-restoration signal.
|
||||
let retention_filter = self
|
||||
.config
|
||||
.expire_after
|
||||
.map(|_| (col_name.as_str(), expire_lower_bound, "full-snapshot"));
|
||||
return self
|
||||
.gen_unfiltered_plan_info_if_dirty(
|
||||
engine,
|
||||
query_ctx,
|
||||
sink_table_schema.clone(),
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
retention_filter,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
if self.should_use_unfiltered_incremental_delta() {
|
||||
// In incremental mode, source correctness is defined by the
|
||||
// per-region sequence range `(checkpoint, scan-open snapshot]`, not
|
||||
// by dirty-window predicates. Dirty windows are only a scheduling
|
||||
// signal here. Applying a stale dirty-window filter to the source can
|
||||
// exclude rows that are inside the returned watermark and make a
|
||||
// checkpoint advance skip them forever. The sink side is also left
|
||||
// unfiltered by dirty windows; the incremental rewrite joins the
|
||||
// delta groups with the full sink state for correctness. Future
|
||||
// dynamic filters can prune sink reads as a pure optimization.
|
||||
let retention_filter = self
|
||||
.config
|
||||
.expire_after
|
||||
.map(|_| (col_name.as_str(), expire_lower_bound, "incremental"));
|
||||
return self
|
||||
.gen_unfiltered_plan_info_if_dirty(
|
||||
engine,
|
||||
query_ctx,
|
||||
sink_table_schema.clone(),
|
||||
primary_key_indices,
|
||||
allow_partial,
|
||||
retention_filter,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
let (expr, can_advance_checkpoints) = {
|
||||
let mut state = self.state.write().unwrap();
|
||||
let window_cnt = if state.checkpoint_mode() == CheckpointMode::Incremental
|
||||
&& !state.is_incremental_disabled()
|
||||
&& matches!(self.config.query_type, QueryType::Sql)
|
||||
{
|
||||
// Incremental scans are bounded by region sequence checkpoints,
|
||||
// so the dirty-window filter only narrows sink-side/time-window
|
||||
// work. Drain more windows than normal, but keep a hard cap to
|
||||
// avoid building a huge OR filter after a long downtime. If
|
||||
// windows remain, checkpoints won't advance this round.
|
||||
MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS
|
||||
} else {
|
||||
max_window_cnt
|
||||
.unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query)
|
||||
};
|
||||
let window_cnt = max_window_cnt
|
||||
.unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query);
|
||||
let expr = state.dirty_time_windows.gen_filter_exprs(
|
||||
&col_name,
|
||||
Some(expire_lower_bound),
|
||||
|
||||
@@ -26,8 +26,7 @@ use snafu::ResultExt;
|
||||
use table::metadata::TableId;
|
||||
|
||||
use crate::Error;
|
||||
use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr;
|
||||
use crate::batching_mode::state::{CheckpointMode, FilterExprInfo};
|
||||
use crate::batching_mode::state::CheckpointMode;
|
||||
use crate::batching_mode::table_creator::QueryType;
|
||||
use crate::batching_mode::task::BatchingTask;
|
||||
use crate::batching_mode::utils::{
|
||||
@@ -74,7 +73,6 @@ impl BatchingTask {
|
||||
pub(super) async fn prepare_plan_for_incremental(
|
||||
&self,
|
||||
plan: &LogicalPlan,
|
||||
dirty_filter: Option<&FilterExprInfo>,
|
||||
) -> Result<Option<LogicalPlan>, Error> {
|
||||
let is_incremental_sql = {
|
||||
let state = self.state.read().unwrap();
|
||||
@@ -152,31 +150,12 @@ impl BatchingTask {
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
let sink_schema = sink_table.table_info().meta.schema.clone();
|
||||
let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr(
|
||||
self.config.flow_id,
|
||||
&analysis,
|
||||
&sink_schema,
|
||||
dirty_filter,
|
||||
) {
|
||||
Ok(filter) => filter,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
"Flow {} failed to build sink dirty time window filter; \
|
||||
falling back to full snapshot for this round: {:?}",
|
||||
self.config.flow_id, err
|
||||
);
|
||||
self.state.write().unwrap().mark_full_snapshot();
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge(
|
||||
&inner_plan,
|
||||
&analysis,
|
||||
sink_table,
|
||||
&self.config.sink_table_name,
|
||||
sink_dirty_filter,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
{
|
||||
|
||||
@@ -25,7 +25,9 @@ use datatypes::data_type::ConcreteDataType as CDT;
|
||||
use datatypes::schema::ColumnSchema;
|
||||
use datatypes::vectors::{TimestampMillisecondVector, UInt32Vector, VectorRef};
|
||||
use pretty_assertions::assert_eq;
|
||||
use query::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY};
|
||||
use query::options::{
|
||||
FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY, QueryOptions,
|
||||
};
|
||||
use session::context::QueryContext;
|
||||
use table::test_util::MemTable;
|
||||
|
||||
@@ -38,6 +40,13 @@ use crate::batching_mode::state::CheckpointMode;
|
||||
use crate::batching_mode::time_window::find_time_window_expr;
|
||||
use crate::test_utils::create_test_query_engine;
|
||||
|
||||
fn incremental_batch_opts() -> Arc<BatchingModeOptions> {
|
||||
Arc::new(BatchingModeOptions {
|
||||
experimental_enable_incremental_read: true,
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
async fn new_test_task_and_plan_with_missing_sink() -> (BatchingTask, LogicalPlan) {
|
||||
new_test_task_engine_and_plan_with_query(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
@@ -60,6 +69,15 @@ impl TestTaskParts {
|
||||
}
|
||||
|
||||
async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) -> TestTaskParts {
|
||||
new_test_task_engine_and_plan_with_query_and_opts(query, sink_table, incremental_batch_opts())
|
||||
.await
|
||||
}
|
||||
|
||||
async fn new_test_task_engine_and_plan_with_query_and_opts(
|
||||
query: &str,
|
||||
sink_table: &str,
|
||||
batch_opts: Arc<BatchingModeOptions>,
|
||||
) -> TestTaskParts {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let plan = sql_to_df_plan(
|
||||
@@ -91,7 +109,7 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str)
|
||||
query_ctx: ctx,
|
||||
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
|
||||
shutdown_rx: rx,
|
||||
batch_opts: Arc::new(BatchingModeOptions::default()),
|
||||
batch_opts,
|
||||
flow_eval_interval: None,
|
||||
})
|
||||
.unwrap();
|
||||
@@ -103,6 +121,75 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_incremental_read_is_disabled_by_default() {
|
||||
let task = new_test_task_engine_and_plan_with_query_and_opts(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
"numbers_with_ts",
|
||||
Arc::new(BatchingModeOptions::default()),
|
||||
)
|
||||
.await
|
||||
.task;
|
||||
|
||||
assert!(task.state.read().unwrap().is_incremental_disabled());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_dirty_time_windows_uses_batch_opts() {
|
||||
let task = new_test_task_engine_and_plan_with_query_and_opts(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
"numbers_with_ts",
|
||||
Arc::new(BatchingModeOptions {
|
||||
experimental_max_filter_num_per_query: 7,
|
||||
experimental_time_window_merge_threshold: 11,
|
||||
..Default::default()
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.task;
|
||||
|
||||
let state = task.state.read().unwrap();
|
||||
assert_eq!(7, state.dirty_time_windows.max_filter_num_per_query());
|
||||
assert_eq!(11, state.dirty_time_windows.time_window_merge_threshold());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_execute_once_serialized_waits_for_execution_lock() {
|
||||
let TestTaskParts {
|
||||
task, query_engine, ..
|
||||
} = new_test_task_engine_and_plan_with_query(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
"missing_sink",
|
||||
)
|
||||
.await;
|
||||
let (frontend_client, _handler) =
|
||||
FrontendClient::from_empty_grpc_handler(QueryOptions::default());
|
||||
let frontend_client = Arc::new(frontend_client);
|
||||
|
||||
let guard = task.execution_lock.clone().lock_owned().await;
|
||||
let task_to_run = task.clone();
|
||||
let query_engine_to_run = query_engine.clone();
|
||||
let frontend_client_to_run = frontend_client.clone();
|
||||
let exec = tokio::spawn(async move {
|
||||
task_to_run
|
||||
.execute_once_serialized(&query_engine_to_run, &frontend_client_to_run, None)
|
||||
.await
|
||||
});
|
||||
|
||||
tokio::time::sleep(Duration::from_millis(20)).await;
|
||||
assert!(
|
||||
!exec.is_finished(),
|
||||
"execute_once_serialized should wait for execution_lock"
|
||||
);
|
||||
|
||||
drop(guard);
|
||||
tokio::time::timeout(Duration::from_secs(1), exec)
|
||||
.await
|
||||
.expect("execute_once_serialized should finish once execution_lock is released")
|
||||
.expect("execute_once_serialized task should not panic")
|
||||
.expect_err("missing sink should fail after acquiring execution_lock");
|
||||
}
|
||||
|
||||
async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
@@ -147,7 +234,7 @@ async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
|
||||
query_ctx: ctx,
|
||||
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
|
||||
shutdown_rx: rx,
|
||||
batch_opts: Arc::new(BatchingModeOptions::default()),
|
||||
batch_opts: incremental_batch_opts(),
|
||||
flow_eval_interval: None,
|
||||
})
|
||||
.unwrap();
|
||||
@@ -226,6 +313,14 @@ fn dirty_range(start: i64, end: i64) -> DirtyTimeWindows {
|
||||
dirty
|
||||
}
|
||||
|
||||
fn expire_after_for_retention_filter_test() -> i64 {
|
||||
let now_secs = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.expect("Time went backwards")
|
||||
.as_secs();
|
||||
(now_secs - 10) as i64
|
||||
}
|
||||
|
||||
async fn assert_unscoped_failure_restore(
|
||||
consumed_dirty_windows: DirtyTimeWindows,
|
||||
current_dirty_windows: DirtyTimeWindows,
|
||||
@@ -626,6 +721,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after
|
||||
.await;
|
||||
{
|
||||
let mut state = task.state.write().unwrap();
|
||||
state.disable_incremental();
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
|
||||
@@ -657,7 +753,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_safety() {
|
||||
async fn test_incremental_plan_consumes_dirty_signal_for_checkpoint_safety() {
|
||||
let TestTaskParts {
|
||||
task,
|
||||
query_engine,
|
||||
@@ -692,6 +788,192 @@ async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_snapshot_seeding_for_incremental_does_not_add_dirty_window_filter() {
|
||||
let TestTaskParts {
|
||||
task,
|
||||
query_engine,
|
||||
..
|
||||
} = new_time_window_test_task_with_query(
|
||||
"SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
|
||||
)
|
||||
.await;
|
||||
{
|
||||
let mut state = task.state.write().unwrap();
|
||||
assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
|
||||
assert!(!state.is_incremental_disabled());
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_window(Timestamp::new_second(30), Some(Timestamp::new_second(35)));
|
||||
}
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
]));
|
||||
|
||||
let plan = task
|
||||
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
let plan_text = plan.plan.to_string();
|
||||
assert!(plan.can_advance_checkpoints);
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
assert!(!plan_text.contains("Filter:"), "{plan_text}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_full_snapshot_seeding_applies_expire_after_retention_filter() {
|
||||
let TestTaskParts {
|
||||
mut task,
|
||||
query_engine,
|
||||
..
|
||||
} = new_time_window_test_task_with_query(
|
||||
"SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
|
||||
)
|
||||
.await;
|
||||
{
|
||||
let mut state = task.state.write().unwrap();
|
||||
assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
|
||||
assert!(!state.is_incremental_disabled());
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
|
||||
}
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
]));
|
||||
|
||||
Arc::get_mut(&mut task.config)
|
||||
.expect("test task config should be uniquely owned")
|
||||
.expire_after = Some(expire_after_for_retention_filter_test());
|
||||
let plan = task
|
||||
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert!(plan.can_advance_checkpoints);
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
let plan_text = plan.plan.to_string();
|
||||
assert!(
|
||||
plan_text.contains("Filter: ts >= TimestampMillisecond("),
|
||||
"{plan_text}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_incremental_plan_does_not_add_dirty_window_filter() {
|
||||
let TestTaskParts {
|
||||
task,
|
||||
query_engine,
|
||||
..
|
||||
} = new_time_window_test_task_with_query(
|
||||
"SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
|
||||
)
|
||||
.await;
|
||||
{
|
||||
let mut state = task.state.write().unwrap();
|
||||
state.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
|
||||
}
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
]));
|
||||
|
||||
let plan = task
|
||||
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
let plan_text = plan.plan.to_string();
|
||||
assert!(plan.can_advance_checkpoints);
|
||||
assert!(!plan_text.contains("Filter:"), "{plan_text}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_incremental_delta_applies_expire_after_retention_filter() {
|
||||
let TestTaskParts {
|
||||
mut task,
|
||||
query_engine,
|
||||
..
|
||||
} = new_time_window_test_task_with_query(
|
||||
"SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window",
|
||||
)
|
||||
.await;
|
||||
{
|
||||
let mut state = task.state.write().unwrap();
|
||||
state.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
|
||||
state
|
||||
.dirty_time_windows
|
||||
.add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
|
||||
}
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
|
||||
.with_time_index(true),
|
||||
]));
|
||||
|
||||
Arc::get_mut(&mut task.config)
|
||||
.expect("test task config should be uniquely owned")
|
||||
.expire_after = Some(expire_after_for_retention_filter_test());
|
||||
let plan = task
|
||||
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert!(plan.can_advance_checkpoints);
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
let plan_text = plan.plan.to_string();
|
||||
assert!(
|
||||
plan_text.contains("Filter: ts >= TimestampMillisecond("),
|
||||
"{plan_text}"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() {
|
||||
let TestTaskParts {
|
||||
mut task,
|
||||
query_engine,
|
||||
..
|
||||
} = new_test_task_engine_and_plan_with_query(
|
||||
"SELECT number, ts FROM numbers_with_ts",
|
||||
"missing_sink",
|
||||
)
|
||||
.await;
|
||||
Arc::get_mut(&mut task.config)
|
||||
.expect("test task config should be uniquely owned")
|
||||
.query_type = QueryType::Tql;
|
||||
task.state.write().unwrap().dirty_time_windows.clean();
|
||||
let sink_schema = Arc::new(Schema::new(vec![
|
||||
ColumnSchema::new("number", CDT::uint32_datatype(), false),
|
||||
ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true),
|
||||
]));
|
||||
|
||||
let plan = task
|
||||
.gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("non-scoped path should generate a plan even with an empty dirty signal");
|
||||
|
||||
assert!(plan.can_advance_checkpoints);
|
||||
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
|
||||
let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
|
||||
@@ -773,7 +1055,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
|
||||
query_ctx: ctx,
|
||||
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
|
||||
shutdown_rx: rx,
|
||||
batch_opts: Arc::new(BatchingModeOptions::default()),
|
||||
batch_opts: incremental_batch_opts(),
|
||||
flow_eval_interval: None,
|
||||
})
|
||||
.unwrap();
|
||||
@@ -788,10 +1070,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
|
||||
CheckpointMode::Incremental
|
||||
);
|
||||
|
||||
let incremental_plan = task
|
||||
.prepare_plan_for_incremental(&dml_plan, None)
|
||||
.await
|
||||
.unwrap();
|
||||
let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap();
|
||||
assert!(incremental_plan.is_none());
|
||||
let state = task.state.read().unwrap();
|
||||
assert!(state.is_incremental_disabled());
|
||||
@@ -852,7 +1131,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite
|
||||
query_ctx: ctx,
|
||||
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
|
||||
shutdown_rx: rx,
|
||||
batch_opts: Arc::new(BatchingModeOptions::default()),
|
||||
batch_opts: incremental_batch_opts(),
|
||||
flow_eval_interval: None,
|
||||
})
|
||||
.unwrap();
|
||||
@@ -866,10 +1145,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite
|
||||
CheckpointMode::Incremental
|
||||
);
|
||||
|
||||
let incremental_plan = task
|
||||
.prepare_plan_for_incremental(&dml_plan, None)
|
||||
.await
|
||||
.unwrap();
|
||||
let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap();
|
||||
assert!(incremental_plan.is_none());
|
||||
let state = task.state.read().unwrap();
|
||||
assert!(!state.is_incremental_disabled());
|
||||
@@ -928,7 +1204,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o
|
||||
query_ctx: ctx,
|
||||
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
|
||||
shutdown_rx: rx,
|
||||
batch_opts: Arc::new(BatchingModeOptions::default()),
|
||||
batch_opts: incremental_batch_opts(),
|
||||
flow_eval_interval: None,
|
||||
})
|
||||
.unwrap();
|
||||
@@ -939,7 +1215,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o
|
||||
.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
|
||||
|
||||
let incremental_plan = task
|
||||
.prepare_plan_for_incremental(&dml_plan, None)
|
||||
.prepare_plan_for_incremental(&dml_plan)
|
||||
.await
|
||||
.unwrap()
|
||||
.expect("plain GROUP BY is incremental-safe without a rewrite");
|
||||
@@ -962,7 +1238,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
|
||||
task.state.write().unwrap().dirty_time_windows.set_dirty();
|
||||
|
||||
let plan_info = task
|
||||
.gen_insert_plan(&query_engine, None)
|
||||
.gen_insert_plan_unlocked(&query_engine, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
@@ -973,7 +1249,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
|
||||
.unwrap()
|
||||
.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
|
||||
let incremental_plan = task
|
||||
.prepare_plan_for_incremental(&plan_info.plan, None)
|
||||
.prepare_plan_for_incremental(&plan_info.plan)
|
||||
.await
|
||||
.unwrap();
|
||||
let incremental_safe = incremental_plan.is_some();
|
||||
@@ -1078,11 +1354,11 @@ async fn test_insert_plan_matching_failure_restores_consumed_dirty_marker() {
|
||||
register_number_only_sink(&query_engine, sink_table);
|
||||
task.state.write().unwrap().dirty_time_windows.set_dirty();
|
||||
|
||||
let result = task.gen_insert_plan(&query_engine, None).await;
|
||||
let result = task.gen_insert_plan_unlocked(&query_engine, None).await;
|
||||
|
||||
assert!(result.is_err());
|
||||
let _err = match result {
|
||||
Ok(_) => panic!("gen_insert_plan should fail with a sink column mismatch"),
|
||||
Ok(_) => panic!("gen_insert_plan_unlocked should fail with a sink column mismatch"),
|
||||
Err(err) => err,
|
||||
};
|
||||
let state = task.state.read().unwrap();
|
||||
|
||||
@@ -1288,9 +1288,10 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() {
|
||||
// This verifies the rewrite placement when callers supply an already
|
||||
// inferred sink dirty-window predicate. The task-level inference rules are
|
||||
// covered by `infer_sink_time_window_filter_col` tests in task.rs.
|
||||
// This verifies the rewrite placement when callers supply a sink predicate.
|
||||
// The production incremental flow path currently leaves sink scans
|
||||
// unfiltered for correctness and relies on future dynamic filters for
|
||||
// pruning.
|
||||
let query_engine = create_test_query_engine();
|
||||
let ctx = QueryContext::arc();
|
||||
let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
|
||||
|
||||
@@ -566,11 +566,15 @@ impl FrontendInvoker {
|
||||
name: TABLE_FLOWNODE_SET_CACHE_NAME,
|
||||
})?;
|
||||
|
||||
// TODO(auto_create_table): flow sink tables are created through a controlled
|
||||
// `CREATE FLOW` path, not client writes, so they are intentionally exempt from
|
||||
// the frontend's global auto-create switch. Revisit if flow should honor it.
|
||||
let inserter = Arc::new(Inserter::new(
|
||||
catalog_manager.clone(),
|
||||
partition_manager.clone(),
|
||||
node_manager.clone(),
|
||||
table_flownode_cache,
|
||||
true,
|
||||
));
|
||||
|
||||
let deleter = Arc::new(Deleter::new(
|
||||
|
||||
@@ -44,6 +44,11 @@ pub struct FrontendOptions {
|
||||
pub node_id: Option<String>,
|
||||
pub default_timezone: Option<String>,
|
||||
pub default_column_prefix: Option<String>,
|
||||
/// Server-side global switch for auto table creation on write.
|
||||
/// Acts as an upper bound: when `false`, missing tables are never auto-created
|
||||
/// even if a request sets the `auto_create_table` hint to `true`. When `true`
|
||||
/// (default), the per-request hint still applies. Default: `true`.
|
||||
pub auto_create_table: bool,
|
||||
/// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
|
||||
/// Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
pub max_in_flight_write_bytes: ReadableSize,
|
||||
@@ -82,6 +87,7 @@ impl Default for FrontendOptions {
|
||||
node_id: None,
|
||||
default_timezone: None,
|
||||
default_column_prefix: None,
|
||||
auto_create_table: true,
|
||||
max_in_flight_write_bytes: ReadableSize(0),
|
||||
write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
|
||||
http: HttpOptions::default(),
|
||||
|
||||
@@ -185,6 +185,7 @@ impl FrontendBuilder {
|
||||
partition_manager.clone(),
|
||||
node_manager.clone(),
|
||||
table_flownode_cache,
|
||||
self.options.auto_create_table,
|
||||
));
|
||||
let deleter = Arc::new(Deleter::new(
|
||||
self.catalog_manager.clone(),
|
||||
|
||||
@@ -43,7 +43,12 @@ use servers::query_handler::{
|
||||
};
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{IntoError, ResultExt};
|
||||
use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM};
|
||||
use table::requests::{
|
||||
OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE,
|
||||
SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_TRACE_HAS_EVENTS,
|
||||
SEMANTIC_TRACE_HAS_LINKS, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC,
|
||||
SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1,
|
||||
};
|
||||
|
||||
use crate::instance::Instance;
|
||||
use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type;
|
||||
@@ -131,12 +136,14 @@ impl OpenTelemetryProtocolHandler for Instance {
|
||||
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
|
||||
OTLP_METRICS_ROWS.inc_by(rows as u64);
|
||||
|
||||
let ctx = if !is_legacy {
|
||||
let ctx = {
|
||||
let mut c = (*ctx).clone();
|
||||
c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
|
||||
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
|
||||
c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
|
||||
if !is_legacy {
|
||||
c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
|
||||
}
|
||||
Arc::new(c)
|
||||
} else {
|
||||
ctx
|
||||
};
|
||||
|
||||
// If the user uses the legacy path, it is by default without metric engine.
|
||||
@@ -211,6 +218,15 @@ impl OpenTelemetryProtocolHandler for Instance {
|
||||
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
|
||||
interceptor_ref.pre_execute(ctx.clone())?;
|
||||
|
||||
// `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here
|
||||
// reaches the context that drives table auto-create.
|
||||
let ctx = {
|
||||
let mut c = (*ctx).clone();
|
||||
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG);
|
||||
c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
|
||||
Arc::new(c)
|
||||
};
|
||||
|
||||
let opt_req = otlp::logs::to_grpc_insert_requests(
|
||||
request,
|
||||
pipeline,
|
||||
@@ -256,6 +272,23 @@ impl Instance {
|
||||
ctx: QueryContextRef,
|
||||
) -> ServerResult<TraceIngestOutcome> {
|
||||
let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
|
||||
|
||||
// Only the main span table gets the identity; the derived `_services` /
|
||||
// `_operations` lookup tables keep the unstamped `ctx`.
|
||||
let main_ctx = {
|
||||
let mut c = (*ctx).clone();
|
||||
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE);
|
||||
c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
|
||||
if is_trace_v1_model {
|
||||
c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1);
|
||||
c.set_extension(SEMANTIC_TRACE_HAS_EVENTS, "true");
|
||||
c.set_extension(SEMANTIC_TRACE_HAS_LINKS, "true");
|
||||
// schema_url is row-level, so conventions is unknown at table level.
|
||||
c.set_extension(SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN);
|
||||
}
|
||||
Arc::new(c)
|
||||
};
|
||||
|
||||
let ingest_ctx = TraceChunkIngestContext {
|
||||
pipeline_handler,
|
||||
pipeline,
|
||||
@@ -278,7 +311,7 @@ impl Instance {
|
||||
.map(|chunk| chunk.collect::<Vec<_>>())
|
||||
.collect::<Vec<_>>();
|
||||
for chunk in chunks {
|
||||
self.ingest_trace_chunk(&ingest_ctx, chunk, ctx.clone(), &mut ingest_state)
|
||||
self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,7 +440,17 @@ impl Context {
|
||||
};
|
||||
let _ = self
|
||||
.cache_invalidator
|
||||
.invalidate(&ctx, &[CacheIdent::TableId(table_id)])
|
||||
.invalidate(
|
||||
&ctx,
|
||||
&[
|
||||
CacheIdent::TableId(table_id),
|
||||
CacheIdent::TableName(TableName {
|
||||
catalog_name: self.persistent_ctx.catalog_name.clone(),
|
||||
schema_name: self.persistent_ctx.schema_name.clone(),
|
||||
table_name: self.persistent_ctx.table_name.clone(),
|
||||
}),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -95,10 +95,19 @@ impl State for UpdatePartitionMetadata {
|
||||
|
||||
let mut new_table_info = table_info_value.table_info.clone();
|
||||
new_table_info.meta.partition_key_indices = partition_key_indices;
|
||||
common_telemetry::info!(
|
||||
"Update table partition metadata, table_id: {}, partition_key_indices: {:?}, partition_columns: {:?}",
|
||||
table_id,
|
||||
new_table_info.meta.partition_key_indices,
|
||||
new_table_info
|
||||
.meta
|
||||
.partition_column_names()
|
||||
.cloned()
|
||||
.collect::<Vec<_>>(),
|
||||
);
|
||||
ctx.update_table_info(&table_info_value, table_info_value.update(new_table_info))
|
||||
.await?;
|
||||
// We don't invalidate cache here because the subsequent AllocateRegion step
|
||||
// will update the table route and invalidate the cache accordingly.
|
||||
ctx.invalidate_table_cache().await?;
|
||||
|
||||
Ok((
|
||||
Box::new(AllocateRegion::new(self.plan_entries.clone())),
|
||||
|
||||
@@ -50,6 +50,7 @@ datafusion-common.workspace = true
|
||||
datafusion-expr.workspace = true
|
||||
datatypes.workspace = true
|
||||
dashmap.workspace = true
|
||||
derive_more.workspace = true
|
||||
dotenv.workspace = true
|
||||
either.workspace = true
|
||||
futures.workspace = true
|
||||
|
||||
@@ -150,6 +150,7 @@ impl CompactionScheduler {
|
||||
}
|
||||
|
||||
/// Schedules a compaction for the region.
|
||||
/// Returns whether a compaction is scheduled.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) async fn schedule_compaction(
|
||||
&mut self,
|
||||
@@ -161,7 +162,7 @@ impl CompactionScheduler {
|
||||
manifest_ctx: &ManifestContextRef,
|
||||
schema_metadata_manager: SchemaMetadataManagerRef,
|
||||
max_parallelism: usize,
|
||||
) -> Result<()> {
|
||||
) -> Result<bool> {
|
||||
// skip compaction if region is in staging state
|
||||
let current_state = manifest_ctx.current_state();
|
||||
if current_state == RegionRoleState::Leader(RegionLeaderState::Staging) {
|
||||
@@ -170,7 +171,7 @@ impl CompactionScheduler {
|
||||
region_id, compact_options
|
||||
);
|
||||
waiter.send(Ok(0));
|
||||
return Ok(());
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
if let Some(status) = self.region_status.get_mut(®ion_id) {
|
||||
@@ -192,7 +193,7 @@ impl CompactionScheduler {
|
||||
);
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
// The region can compact directly.
|
||||
@@ -209,7 +210,7 @@ impl CompactionScheduler {
|
||||
max_parallelism,
|
||||
);
|
||||
|
||||
let result = match self
|
||||
match self
|
||||
.schedule_compaction_request(request, compact_options)
|
||||
.await
|
||||
{
|
||||
@@ -220,14 +221,12 @@ impl CompactionScheduler {
|
||||
status.active_compaction = Some(active_compaction);
|
||||
self.region_status.insert(region_id, status);
|
||||
|
||||
Ok(())
|
||||
self.listener.on_compaction_scheduled(region_id);
|
||||
Ok(true)
|
||||
}
|
||||
Ok(None) => Ok(()),
|
||||
Ok(None) => Ok(false),
|
||||
Err(e) => Err(e),
|
||||
};
|
||||
|
||||
self.listener.on_compaction_scheduled(region_id);
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
// Handle pending manual compaction request for the region.
|
||||
@@ -334,6 +333,27 @@ impl CompactionScheduler {
|
||||
// And skip trying to schedule the next compaction task.
|
||||
return pending_ddl_requests;
|
||||
}
|
||||
Vec::new()
|
||||
}
|
||||
|
||||
/// Returns whether the region currently has an active compaction.
///
/// Yields `false` when the region has no entry in `region_status`, or when its
/// entry has no `active_compaction` set.
pub(crate) fn is_compacting(&self, region_id: RegionId) -> bool {
    self.region_status
        .get(&region_id)
        .map(|status| status.active_compaction.is_some())
        .unwrap_or(false)
}
|
||||
|
||||
/// Schedules next compaction upon a finished compaction.
|
||||
/// Returns whether the compaction is scheduled.
|
||||
pub(crate) async fn schedule_next_compaction(
|
||||
&mut self,
|
||||
region_id: RegionId,
|
||||
manifest_ctx: &ManifestContextRef,
|
||||
schema_metadata_manager: SchemaMetadataManagerRef,
|
||||
) -> bool {
|
||||
let Some(status) = self.region_status.get_mut(®ion_id) else {
|
||||
return false;
|
||||
};
|
||||
|
||||
// We should always try to compact the region until picker returns None.
|
||||
let request = status.new_compaction_request(
|
||||
@@ -364,20 +384,21 @@ impl CompactionScheduler {
|
||||
"Successfully scheduled next compaction for region id: {}",
|
||||
region_id
|
||||
);
|
||||
true
|
||||
}
|
||||
Ok(None) => {
|
||||
// No further compaction tasks can be scheduled; cleanup the `CompactionStatus` for this region.
|
||||
// All DDL requests and pending compaction requests have already been processed.
|
||||
// Safe to remove the region from status tracking.
|
||||
self.region_status.remove(®ion_id);
|
||||
false
|
||||
}
|
||||
Err(e) => {
|
||||
error!(e; "Failed to schedule next compaction for region {}", region_id);
|
||||
self.remove_region_on_failure(region_id, Arc::new(e));
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
Vec::new()
|
||||
}
|
||||
|
||||
/// Notifies the scheduler that the compaction job is cancelled cooperatively.
|
||||
@@ -1435,7 +1456,7 @@ mod tests {
|
||||
let manifest_ctx = env
|
||||
.mock_manifest_context(version_control.current().version.metadata.clone())
|
||||
.await;
|
||||
scheduler
|
||||
let scheduled = scheduler
|
||||
.schedule_compaction(
|
||||
builder.region_id(),
|
||||
compact_request::Options::Regular(Default::default()),
|
||||
@@ -1448,6 +1469,7 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!scheduled);
|
||||
let output = output_rx.await.unwrap().unwrap();
|
||||
assert_eq!(output, 0);
|
||||
assert!(scheduler.region_status.is_empty());
|
||||
@@ -1456,7 +1478,7 @@ mod tests {
|
||||
let version_control = Arc::new(builder.push_l0_file(0, 1000).build());
|
||||
let (output_tx, output_rx) = oneshot::channel();
|
||||
let waiter = OptionOutputTx::from(output_tx);
|
||||
scheduler
|
||||
let scheduled = scheduler
|
||||
.schedule_compaction(
|
||||
builder.region_id(),
|
||||
compact_request::Options::Regular(Default::default()),
|
||||
@@ -1469,11 +1491,67 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!scheduled);
|
||||
let output = output_rx.await.unwrap().unwrap();
|
||||
assert_eq!(output, 0);
|
||||
assert!(scheduler.region_status.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_schedule_compaction_returns_true_when_task_scheduled() {
|
||||
let job_scheduler = Arc::new(VecScheduler::default());
|
||||
let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone());
|
||||
let (tx, _rx) = mpsc::channel(4);
|
||||
let mut scheduler = env.mock_compaction_scheduler(tx);
|
||||
let mut builder = VersionControlBuilder::new();
|
||||
let region_id = builder.region_id();
|
||||
let end = 1000 * 1000;
|
||||
// Five overlapping L0 files are enough for the regular picker to create a task.
|
||||
let version_control = Arc::new(
|
||||
builder
|
||||
.push_l0_file(0, end)
|
||||
.push_l0_file(10, end)
|
||||
.push_l0_file(50, end)
|
||||
.push_l0_file(80, end)
|
||||
.push_l0_file(90, end)
|
||||
.build(),
|
||||
);
|
||||
let manifest_ctx = env
|
||||
.mock_manifest_context(version_control.current().version.metadata.clone())
|
||||
.await;
|
||||
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
|
||||
schema_metadata_manager
|
||||
.register_region_table_info(
|
||||
region_id.table_id(),
|
||||
"test_table",
|
||||
"test_catalog",
|
||||
"test_schema",
|
||||
None,
|
||||
kv_backend,
|
||||
)
|
||||
.await;
|
||||
|
||||
let scheduled = scheduler
|
||||
.schedule_compaction(
|
||||
region_id,
|
||||
Options::Regular(Default::default()),
|
||||
&version_control,
|
||||
&env.access_layer,
|
||||
OptionOutputTx::none(),
|
||||
&manifest_ctx,
|
||||
schema_metadata_manager,
|
||||
1,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// The boolean result is what the worker uses to decide whether to update
|
||||
// last_schedule_compaction_millis.
|
||||
assert!(scheduled);
|
||||
assert_eq!(1, job_scheduler.num_jobs());
|
||||
assert!(scheduler.region_status.contains_key(®ion_id));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_schedule_on_finished() {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
@@ -1511,7 +1589,7 @@ mod tests {
|
||||
let manifest_ctx = env
|
||||
.mock_manifest_context(version_control.current().version.metadata.clone())
|
||||
.await;
|
||||
scheduler
|
||||
let scheduled = scheduler
|
||||
.schedule_compaction(
|
||||
region_id,
|
||||
compact_request::Options::Regular(Default::default()),
|
||||
@@ -1525,6 +1603,7 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
// Should schedule 1 compaction.
|
||||
assert!(scheduled);
|
||||
assert_eq!(1, scheduler.region_status.len());
|
||||
assert_eq!(1, job_scheduler.num_jobs());
|
||||
let data = version_control.current();
|
||||
@@ -1543,7 +1622,7 @@ mod tests {
|
||||
);
|
||||
// The task is pending.
|
||||
let (tx, _rx) = oneshot::channel();
|
||||
scheduler
|
||||
let scheduled = scheduler
|
||||
.schedule_compaction(
|
||||
region_id,
|
||||
compact_request::Options::Regular(Default::default()),
|
||||
@@ -1556,6 +1635,7 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!scheduled);
|
||||
assert_eq!(1, scheduler.region_status.len());
|
||||
assert_eq!(1, job_scheduler.num_jobs());
|
||||
assert!(
|
||||
@@ -1571,6 +1651,10 @@ mod tests {
|
||||
scheduler
|
||||
.on_compaction_finished(region_id, &manifest_ctx, schema_metadata_manager.clone())
|
||||
.await;
|
||||
let scheduled = scheduler
|
||||
.schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager.clone())
|
||||
.await;
|
||||
assert!(scheduled);
|
||||
assert_eq!(1, scheduler.region_status.len());
|
||||
assert_eq!(2, job_scheduler.num_jobs());
|
||||
|
||||
@@ -1583,7 +1667,7 @@ mod tests {
|
||||
);
|
||||
let (tx, _rx) = oneshot::channel();
|
||||
// The task is pending.
|
||||
scheduler
|
||||
let scheduled = scheduler
|
||||
.schedule_compaction(
|
||||
region_id,
|
||||
compact_request::Options::Regular(Default::default()),
|
||||
@@ -1596,6 +1680,7 @@ mod tests {
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!scheduled);
|
||||
assert_eq!(2, job_scheduler.num_jobs());
|
||||
assert!(
|
||||
!scheduler
|
||||
@@ -2329,6 +2414,15 @@ mod tests {
|
||||
.await;
|
||||
|
||||
assert!(pending_ddls.is_empty());
|
||||
assert!(scheduler.region_status.contains_key(®ion_id));
|
||||
|
||||
let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager();
|
||||
// With no compactable files, next scheduling returns false and removes
|
||||
// the status without creating a background task.
|
||||
let scheduled = scheduler
|
||||
.schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager)
|
||||
.await;
|
||||
assert!(!scheduled);
|
||||
assert!(!scheduler.region_status.contains_key(®ion_id));
|
||||
}
|
||||
|
||||
@@ -2371,6 +2465,14 @@ mod tests {
|
||||
.await;
|
||||
|
||||
assert!(pending_ddls.is_empty());
|
||||
assert!(scheduler.region_status.contains_key(®ion_id));
|
||||
|
||||
let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager();
|
||||
// The failing scheduler simulates a submit error; callers must see false.
|
||||
let scheduled = scheduler
|
||||
.schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager)
|
||||
.await;
|
||||
assert!(!scheduled);
|
||||
assert!(!scheduler.region_status.contains_key(®ion_id));
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,9 @@
|
||||
//! This file contains code to find sorted runs in a set of ranged items,
|
||||
//! along with the best way to merge these items to satisfy the desired run count.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use common_base::BitVec;
|
||||
use common_base::readable_size::ReadableSize;
|
||||
@@ -423,6 +426,133 @@ where
|
||||
runs
|
||||
}
|
||||
|
||||
pub(crate) fn find_sorted_runs_by_time_range<T>(items: &mut [T]) -> Vec<SortedRun<T>>
|
||||
where
|
||||
T: Item,
|
||||
{
|
||||
if items.is_empty() {
|
||||
return vec![];
|
||||
}
|
||||
sort_ranged_items(items);
|
||||
|
||||
use derive_more::{Eq, PartialEq};
|
||||
|
||||
/// `SortedRun` with a creation sequence `i`.
|
||||
#[derive(PartialEq, Eq)]
|
||||
struct Run<T: Item> {
|
||||
i: usize,
|
||||
#[partial_eq(skip)]
|
||||
run: SortedRun<T>,
|
||||
}
|
||||
|
||||
impl<T: Item> Run<T> {
|
||||
fn new(i: usize, item: &T) -> Run<T> {
|
||||
let mut run = SortedRun::default();
|
||||
run.push_item(item.clone());
|
||||
Run { i, run }
|
||||
}
|
||||
|
||||
fn push_item(&mut self, item: &T) {
|
||||
self.run.push_item(item.clone());
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Item> PartialOrd for Run<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
/// Sort by run's `end` desc then `start` asc.
|
||||
impl<T: Item> Ord for Run<T> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let l_run = &self.run;
|
||||
let r_run = &other.run;
|
||||
|
||||
// Safety: `start` and `end` must both exist because it's guaranteed that whenever a
|
||||
// `Run` is created, an item is pushed into it immediately (see its `new` method above).
|
||||
// And there are no other ways to create a `Run` beyond its `new` method in this
|
||||
// function's scope.
|
||||
let l_end = l_run.end.unwrap();
|
||||
let r_end = r_run.end.unwrap();
|
||||
r_end
|
||||
.cmp(&l_end)
|
||||
.then_with(|| {
|
||||
let l_start = l_run.start.unwrap();
|
||||
let r_start = r_run.start.unwrap();
|
||||
l_start.cmp(&r_start)
|
||||
})
|
||||
.then_with(|| self.i.cmp(&other.i))
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper around the `Run` above, to support sorting them by their creation sequence `i`.
|
||||
#[derive(PartialEq, Eq)]
|
||||
struct Wrapper<T: Item>(Run<T>);
|
||||
|
||||
impl<T: Item> PartialOrd for Wrapper<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Item> Ord for Wrapper<T> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.0.i.cmp(&self.0.i)
|
||||
}
|
||||
}
|
||||
|
||||
// Two heaps for finding a run that is both:
|
||||
// 1. not overlapping with item's range,
|
||||
// 2. and is created earliest,
|
||||
// when iterating the items.
|
||||
//
|
||||
// Heap 1 (`runs_sort_by_end`) is for storing the runs of which top has the minimal "end"
|
||||
// just about to overlap with the current selected item.
|
||||
//
|
||||
// Heap 2 (`runs_sort_by_index`) is for storing the runs that all have "end"s non-overlap with
|
||||
// the current selected item, and of which top is the earliest created run.
|
||||
//
|
||||
// The finding of a suitable run basically works like this:
|
||||
// 1. moves the runs in heap 1 to heap 2, until the top is overlapping with the current item;
|
||||
// 2. now heap 2 has all the runs that can accept the current item, pop its top;
|
||||
// 3. the top is the earliest created run, push the current item;
|
||||
// 4. because the run has changed, push it back to heap 1;
|
||||
// 5. check the next item. Important: we don't need to push the runs in heap 2 to 1, because
|
||||
// the items are sorted by "start". When checking the next item, heap 2's runs must all have
|
||||
// "end"s smaller than next item's "start".
|
||||
//
|
||||
// Actually the heap 2 is only for aligning with the runs selection outcomes in the original
|
||||
// `find_sorted_runs` implementation. If we just need the invariant that each run has the
|
||||
// non-overlapping items, we can get rid of heap 2 and make the code simpler.
|
||||
|
||||
let mut runs_sort_by_end = BinaryHeap::<Run<T>>::new();
|
||||
let mut runs_sort_by_index = BinaryHeap::<Wrapper<T>>::new();
|
||||
let mut i = 0;
|
||||
|
||||
for item in items {
|
||||
let (start, _) = item.range();
|
||||
|
||||
while let Some(run) = runs_sort_by_end.pop_if(|x| x.run.end.unwrap() <= start) {
|
||||
runs_sort_by_index.push(Wrapper(run));
|
||||
}
|
||||
|
||||
let Some(mut run) = runs_sort_by_index.pop() else {
|
||||
i += 1;
|
||||
runs_sort_by_end.push(Run::new(i, item));
|
||||
continue;
|
||||
};
|
||||
|
||||
run.0.push_item(item);
|
||||
runs_sort_by_end.push(run.0);
|
||||
}
|
||||
|
||||
let mut runs = runs_sort_by_end.into_vec();
|
||||
runs.extend(runs_sort_by_index.into_vec().into_iter().map(|x| x.0));
|
||||
runs.sort_unstable_by_key(|run| run.i);
|
||||
runs.into_iter().map(|x| x.run).collect()
|
||||
}
|
||||
|
||||
/// Finds a set of files with minimum penalty to merge that can reduce the total num of runs.
|
||||
/// The penalty of merging is defined as the size of all overlapping files between two runs.
|
||||
pub fn reduce_runs<T: Item>(mut runs: Vec<SortedRun<T>>) -> Vec<T> {
|
||||
@@ -599,6 +729,8 @@ mod tests {
|
||||
expected_runs: &[Vec<(i64, i64)>],
|
||||
) -> Vec<SortedRun<MockFile>> {
|
||||
let mut files = build_items(ranges);
|
||||
let mut files_clone = files.clone();
|
||||
|
||||
let runs = find_sorted_runs(&mut files);
|
||||
|
||||
let result_file_ranges: Vec<Vec<_>> = runs
|
||||
@@ -606,6 +738,13 @@ mod tests {
|
||||
.map(|r| r.items.iter().map(|f| f.range()).collect())
|
||||
.collect();
|
||||
assert_eq!(&expected_runs, &result_file_ranges);
|
||||
|
||||
let runs_by_time_range = find_sorted_runs_by_time_range(&mut files_clone);
|
||||
let results: Vec<Vec<_>> = runs_by_time_range
|
||||
.iter()
|
||||
.map(|r| r.items.iter().map(|f| f.range()).collect())
|
||||
.collect();
|
||||
assert_eq!(&expected_runs, &results);
|
||||
runs
|
||||
}
|
||||
|
||||
|
||||
@@ -22,14 +22,15 @@ use common_telemetry::{debug, info};
|
||||
use common_time::Timestamp;
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::timestamp_millis::BucketAligned;
|
||||
use rayon::prelude::*;
|
||||
use store_api::storage::RegionId;
|
||||
|
||||
use crate::compaction::buckets::infer_time_bucket;
|
||||
use crate::compaction::compactor::CompactionRegion;
|
||||
use crate::compaction::picker::{Picker, PickerOutput};
|
||||
use crate::compaction::run::{
|
||||
FileGroup, Item, Ranged, find_sorted_runs, merge_primary_key_ranges, merge_seq_files,
|
||||
primary_key_ranges_overlap, reduce_runs,
|
||||
FileGroup, Item, Ranged, find_sorted_runs, find_sorted_runs_by_time_range,
|
||||
merge_primary_key_ranges, merge_seq_files, primary_key_ranges_overlap, reduce_runs,
|
||||
};
|
||||
use crate::compaction::{CompactionOutput, get_expired_ssts};
|
||||
use crate::sst::file::{FileHandle, Level, overlaps};
|
||||
@@ -64,11 +65,10 @@ impl TwcsPicker {
|
||||
time_windows: &mut BTreeMap<i64, Window>,
|
||||
active_window: Option<i64>,
|
||||
) -> Vec<CompactionOutput> {
|
||||
let mut output = vec![];
|
||||
for (window, files) in time_windows {
|
||||
if files.files.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let find_inputs = |files: &Window,
|
||||
windows: &BTreeMap<i64, Window>|
|
||||
-> (Vec<FileGroup>, bool) {
|
||||
let window = &files.time_window;
|
||||
let mut files_to_merge: Vec<_> = files.files().cloned().collect();
|
||||
|
||||
// Filter out large files in append mode - they won't benefit from compaction
|
||||
@@ -88,13 +88,18 @@ impl TwcsPicker {
|
||||
);
|
||||
}
|
||||
|
||||
let sorted_runs = find_sorted_runs(&mut files_to_merge);
|
||||
let sorted_runs = if files_to_merge.len() < 1024 {
|
||||
find_sorted_runs(&mut files_to_merge)
|
||||
} else {
|
||||
find_sorted_runs_by_time_range(&mut files_to_merge)
|
||||
};
|
||||
let found_runs = sorted_runs.len();
|
||||
// We only remove deletion markers if we found at most 2 runs and not in append mode,
|
||||
// because after compaction there will be no overlapping files.
|
||||
let filter_deleted = !files.overlapping && found_runs <= 2 && !self.append_mode;
|
||||
let filter_deleted =
|
||||
found_runs <= 2 && !self.append_mode && !window_has_overlap(files, windows);
|
||||
if found_runs == 0 {
|
||||
continue;
|
||||
return (vec![], filter_deleted);
|
||||
}
|
||||
|
||||
let mut inputs = if found_runs > 1 {
|
||||
@@ -102,7 +107,7 @@ impl TwcsPicker {
|
||||
} else {
|
||||
let run = sorted_runs.last().unwrap();
|
||||
if run.items().len() < self.trigger_file_num {
|
||||
continue;
|
||||
return (vec![], filter_deleted);
|
||||
}
|
||||
// no overlapping files, try merge small files
|
||||
merge_seq_files(run.items(), self.max_output_file_size)
|
||||
@@ -144,6 +149,26 @@ impl TwcsPicker {
|
||||
filter_deleted,
|
||||
&inputs,
|
||||
);
|
||||
}
|
||||
(inputs, filter_deleted)
|
||||
};
|
||||
|
||||
let mut output = vec![];
|
||||
let windows = time_windows
|
||||
.values()
|
||||
.filter(|w| !w.files.is_empty())
|
||||
.collect::<Vec<_>>();
|
||||
let chunk_size = self.max_background_tasks.unwrap_or(windows.len()).max(1);
|
||||
'chunks: for chunk in windows.chunks(chunk_size) {
|
||||
for (inputs, filter_deleted) in chunk
|
||||
.par_iter() // parallelly calculate the inputs
|
||||
.map(|window| find_inputs(window, time_windows))
|
||||
.collect::<Vec<_>>()
|
||||
{
|
||||
if inputs.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
output.push(CompactionOutput {
|
||||
output_level: LEVEL_COMPACTED, // always compact to l1
|
||||
inputs: inputs.into_iter().flat_map(|fg| fg.into_files()).collect(),
|
||||
@@ -158,7 +183,7 @@ impl TwcsPicker {
|
||||
"Region ({:?}) compaction task size larger than max background tasks({}), remaining tasks discarded",
|
||||
region_id, max_background_tasks
|
||||
);
|
||||
break;
|
||||
break 'chunks;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -268,7 +293,6 @@ struct Window {
|
||||
// created from the same compaction task.
|
||||
files: HashMap<Option<NonZeroU64>, FileGroup>,
|
||||
time_window: i64,
|
||||
overlapping: bool,
|
||||
primary_key_range: Option<(bytes::Bytes, bytes::Bytes)>,
|
||||
}
|
||||
|
||||
@@ -283,7 +307,6 @@ impl Window {
|
||||
end,
|
||||
files,
|
||||
time_window: 0,
|
||||
overlapping: false,
|
||||
primary_key_range,
|
||||
}
|
||||
}
|
||||
@@ -346,37 +369,21 @@ fn assign_to_windows<'a>(
|
||||
}
|
||||
}
|
||||
}
|
||||
if windows.is_empty() {
|
||||
return BTreeMap::new();
|
||||
}
|
||||
windows.into_iter().collect()
|
||||
}
|
||||
|
||||
let mut windows = windows.into_values().collect::<Vec<_>>();
|
||||
windows.sort_unstable_by(|l, r| l.start.cmp(&r.start).then(l.end.cmp(&r.end).reverse()));
|
||||
|
||||
for idx in 0..windows.len() {
|
||||
let lhs_range = windows[idx].range();
|
||||
for next_idx in idx + 1..windows.len() {
|
||||
let rhs_range = windows[next_idx].range();
|
||||
if rhs_range.0 > lhs_range.1 {
|
||||
break;
|
||||
}
|
||||
|
||||
let windows_overlap = overlaps(&lhs_range, &rhs_range)
|
||||
&& match (
|
||||
&windows[idx].primary_key_range,
|
||||
&windows[next_idx].primary_key_range,
|
||||
) {
|
||||
(Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
|
||||
fn window_has_overlap(this: &Window, windows: &BTreeMap<i64, Window>) -> bool {
|
||||
windows
|
||||
.values()
|
||||
.filter(|that| this.time_window != that.time_window)
|
||||
.any(|that| {
|
||||
overlaps(&this.range(), &that.range()) && {
|
||||
match (&this.primary_key_range, &that.primary_key_range) {
|
||||
(Some(l), Some(r)) => primary_key_ranges_overlap(l, r),
|
||||
_ => true,
|
||||
};
|
||||
if windows_overlap {
|
||||
windows[idx].overlapping = true;
|
||||
windows[next_idx].overlapping = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
windows.into_iter().map(|w| (w.time_window, w)).collect()
|
||||
})
|
||||
}
|
||||
|
||||
/// Finds the latest active writing window among all files.
|
||||
@@ -606,7 +613,8 @@ mod tests {
|
||||
|
||||
for (expected_window, overlapping, window_files) in expected_files {
|
||||
let actual_window = windows.get(expected_window).unwrap();
|
||||
assert_eq!(*overlapping, actual_window.overlapping);
|
||||
let actual_overlapping = window_has_overlap(actual_window, &windows);
|
||||
assert_eq!(*overlapping, actual_overlapping);
|
||||
let mut file_ranges = actual_window
|
||||
.files
|
||||
.values()
|
||||
@@ -744,7 +752,8 @@ mod tests {
|
||||
|
||||
let windows = assign_to_windows(files.iter(), 2);
|
||||
|
||||
assert!(!windows.get(&2).unwrap().overlapping);
|
||||
let overlapping = window_has_overlap(windows.get(&2).unwrap(), &windows);
|
||||
assert!(!overlapping);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -773,7 +782,8 @@ mod tests {
|
||||
|
||||
let windows = assign_to_windows(files.iter(), 2);
|
||||
|
||||
assert!(!windows.get(&4).unwrap().overlapping);
|
||||
let overlapping = window_has_overlap(windows.get(&4).unwrap(), &windows);
|
||||
assert!(!overlapping);
|
||||
}
|
||||
|
||||
struct CompactionPickerTestCase {
|
||||
|
||||
@@ -21,6 +21,7 @@ use common_error::ext::ErrorExt;
|
||||
use common_error::status_code::StatusCode;
|
||||
use common_recordbatch::DfRecordBatch;
|
||||
use common_test_util::flight::encode_to_flight_data;
|
||||
use common_time::Timestamp;
|
||||
use common_time::util::current_time_millis;
|
||||
use datatypes::arrow::array::{ArrayRef, Float64Array, StringArray, TimestampMillisecondArray};
|
||||
use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
|
||||
@@ -67,7 +68,8 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
|
||||
default_flat_format: flat_format,
|
||||
..Default::default()
|
||||
};
|
||||
let time_provider = Arc::new(MockTimeProvider::new(current_time_millis()));
|
||||
let initial_time = current_time_millis();
|
||||
let time_provider = Arc::new(MockTimeProvider::new(initial_time));
|
||||
let engine = env
|
||||
.create_engine_with_time(
|
||||
config.clone(),
|
||||
@@ -99,14 +101,22 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
|
||||
.await
|
||||
.unwrap();
|
||||
let region = engine.get_region(region_id).unwrap();
|
||||
let initial_schedule_time = region.last_schedule_compaction_millis();
|
||||
assert_eq!(initial_time, initial_schedule_time);
|
||||
|
||||
let new_edit = || RegionEdit {
|
||||
files_to_add: vec![FileMeta {
|
||||
region_id: region.region_id,
|
||||
file_id: FileId::random(),
|
||||
level: 0,
|
||||
..Default::default()
|
||||
}],
|
||||
let new_edit = |file_starts: &[i64]| RegionEdit {
|
||||
files_to_add: file_starts
|
||||
.iter()
|
||||
.map(|start| FileMeta {
|
||||
region_id: region.region_id,
|
||||
file_id: FileId::random(),
|
||||
time_range: (
|
||||
Timestamp::new_millisecond(*start),
|
||||
Timestamp::new_millisecond(1000 * 1000),
|
||||
),
|
||||
..Default::default()
|
||||
})
|
||||
.collect(),
|
||||
files_to_remove: vec![],
|
||||
timestamp_ms: None,
|
||||
compaction_time_window: None,
|
||||
@@ -115,19 +125,23 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
|
||||
committed_sequence: None,
|
||||
};
|
||||
engine
|
||||
.edit_region(region.region_id, new_edit())
|
||||
.edit_region(region.region_id, new_edit(&[0, 10, 50, 80]))
|
||||
.await
|
||||
.unwrap();
|
||||
// Asserts that the compaction of the region is not scheduled,
|
||||
// because the minimum time interval between two compactions is not passed.
|
||||
assert_eq!(rx.try_recv(), Err(oneshot::error::TryRecvError::Empty));
|
||||
assert_eq!(
|
||||
initial_schedule_time,
|
||||
region.last_schedule_compaction_millis()
|
||||
);
|
||||
|
||||
// Simulates the time has passed the min compaction interval,
|
||||
time_provider
|
||||
.set_now(current_time_millis() + config.min_compaction_interval.as_millis() as i64);
|
||||
let next_schedule_time = initial_time + config.min_compaction_interval.as_millis() as i64;
|
||||
time_provider.set_now(next_schedule_time);
|
||||
// ... then edits the region again,
|
||||
engine
|
||||
.edit_region(region.region_id, new_edit())
|
||||
.edit_region(region.region_id, new_edit(&[90]))
|
||||
.await
|
||||
.unwrap();
|
||||
// ... finally asserts that the compaction of the region is scheduled.
|
||||
@@ -136,6 +150,9 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert_eq!(region_id, actual);
|
||||
// Wait for the `last_schedule_compaction_millis` to update.
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
assert_eq!(next_schedule_time, region.last_schedule_compaction_millis());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
#![feature(debug_closure_helpers)]
|
||||
#![feature(duration_constructors)]
|
||||
#![feature(binary_heap_pop_if)]
|
||||
|
||||
#[cfg(any(test, feature = "test"))]
|
||||
#[cfg_attr(feature = "test", allow(unused))]
|
||||
|
||||
@@ -157,8 +157,8 @@ pub struct MitoRegion {
|
||||
pub(crate) provider: Provider,
|
||||
/// Last flush time in millis.
|
||||
last_flush_millis: AtomicI64,
|
||||
/// Last compaction time in millis.
|
||||
last_compaction_millis: AtomicI64,
|
||||
/// Last schedule compaction time in millis.
|
||||
last_schedule_compaction_millis: AtomicI64,
|
||||
/// Provider to get current time.
|
||||
time_provider: TimeProviderRef,
|
||||
/// The topic's latest entry id since the region's last flushing.
|
||||
@@ -251,15 +251,16 @@ impl MitoRegion {
|
||||
self.last_flush_millis.store(now, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns last compaction timestamp in millis.
|
||||
pub(crate) fn last_compaction_millis(&self) -> i64 {
|
||||
self.last_compaction_millis.load(Ordering::Relaxed)
|
||||
/// Returns last schedule compaction timestamp in millis.
|
||||
pub(crate) fn last_schedule_compaction_millis(&self) -> i64 {
|
||||
self.last_schedule_compaction_millis.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Update compaction time to current time.
|
||||
pub(crate) fn update_compaction_millis(&self) {
|
||||
/// Update schedule compaction time to current time.
|
||||
pub(crate) fn update_schedule_compaction_millis(&self) {
|
||||
let now = self.time_provider.current_time_millis();
|
||||
self.last_compaction_millis.store(now, Ordering::Relaxed);
|
||||
self.last_schedule_compaction_millis
|
||||
.store(now, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Returns the table dir.
|
||||
@@ -1727,7 +1728,7 @@ mod tests {
|
||||
file_purger: crate::test_util::new_noop_file_purger(),
|
||||
provider: Provider::noop_provider(),
|
||||
last_flush_millis: Default::default(),
|
||||
last_compaction_millis: Default::default(),
|
||||
last_schedule_compaction_millis: Default::default(),
|
||||
time_provider: Arc::new(StdTimeProvider),
|
||||
topic_latest_entry_id: Default::default(),
|
||||
written_bytes: Arc::new(AtomicU64::new(0)),
|
||||
@@ -2084,7 +2085,7 @@ mod tests {
|
||||
file_purger: crate::test_util::new_noop_file_purger(),
|
||||
provider: Provider::noop_provider(),
|
||||
last_flush_millis: Default::default(),
|
||||
last_compaction_millis: Default::default(),
|
||||
last_schedule_compaction_millis: Default::default(),
|
||||
time_provider: Arc::new(StdTimeProvider),
|
||||
topic_latest_entry_id: Default::default(),
|
||||
written_bytes: Arc::new(AtomicU64::new(0)),
|
||||
|
||||
@@ -345,7 +345,7 @@ impl RegionOpener {
|
||||
),
|
||||
provider,
|
||||
last_flush_millis: AtomicI64::new(now),
|
||||
last_compaction_millis: AtomicI64::new(now),
|
||||
last_schedule_compaction_millis: AtomicI64::new(now),
|
||||
time_provider: self.time_provider.clone(),
|
||||
topic_latest_entry_id: AtomicU64::new(0),
|
||||
written_bytes: Arc::new(AtomicU64::new(0)),
|
||||
@@ -581,7 +581,7 @@ impl RegionOpener {
|
||||
file_purger,
|
||||
provider: provider.clone(),
|
||||
last_flush_millis: AtomicI64::new(now),
|
||||
last_compaction_millis: AtomicI64::new(now),
|
||||
last_schedule_compaction_millis: AtomicI64::new(now),
|
||||
time_provider: self.time_provider.clone(),
|
||||
topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id),
|
||||
written_bytes: Arc::new(AtomicU64::new(0)),
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use api::v1::region::compact_request;
|
||||
use common_telemetry::{error, info, warn};
|
||||
use common_telemetry::{debug, error, info};
|
||||
use store_api::logstore::LogStore;
|
||||
use store_api::region_request::RegionCompactRequest;
|
||||
use store_api::storage::RegionId;
|
||||
@@ -80,7 +80,6 @@ impl<S> RegionWorkerLoop<S> {
|
||||
return;
|
||||
}
|
||||
};
|
||||
region.update_compaction_millis();
|
||||
|
||||
region.version_control.apply_edit(
|
||||
Some(request.edit.clone()),
|
||||
@@ -118,6 +117,31 @@ impl<S> RegionWorkerLoop<S> {
|
||||
)
|
||||
.await;
|
||||
self.handle_ddl_requests(&mut pending_ddls).await;
|
||||
|
||||
if self.compaction_scheduler.is_compacting(region_id) {
|
||||
return;
|
||||
}
|
||||
|
||||
let now = self.time_provider.current_time_millis();
|
||||
if now - region.last_schedule_compaction_millis()
|
||||
>= self.config.min_compaction_interval.as_millis() as i64
|
||||
{
|
||||
debug!(
|
||||
"minimal compaction interval time {:?} has passed, scheduling next compaction",
|
||||
self.config.min_compaction_interval
|
||||
);
|
||||
if self
|
||||
.compaction_scheduler
|
||||
.schedule_next_compaction(
|
||||
region_id,
|
||||
&region.manifest_ctx,
|
||||
self.schema_metadata_manager.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
region.update_schedule_compaction_millis();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn handle_compaction_cancelled(
|
||||
@@ -160,9 +184,14 @@ impl<S> RegionWorkerLoop<S> {
|
||||
return;
|
||||
}
|
||||
let now = self.time_provider.current_time_millis();
|
||||
if now - region.last_compaction_millis()
|
||||
if now - region.last_schedule_compaction_millis()
|
||||
>= self.config.min_compaction_interval.as_millis() as i64
|
||||
&& let Err(e) = self
|
||||
{
|
||||
debug!(
|
||||
"minimal compaction interval time {:?} has passed, scheduling next compaction",
|
||||
self.config.min_compaction_interval
|
||||
);
|
||||
match self
|
||||
.compaction_scheduler
|
||||
.schedule_compaction(
|
||||
region.region_id,
|
||||
@@ -175,11 +204,13 @@ impl<S> RegionWorkerLoop<S> {
|
||||
1, // Default for automatic compaction
|
||||
)
|
||||
.await
|
||||
{
|
||||
warn!(
|
||||
"Failed to schedule compaction for region: {}, err: {}",
|
||||
region.region_id, e
|
||||
);
|
||||
{
|
||||
Ok(true) => region.update_schedule_compaction_millis(),
|
||||
Ok(false) => {}
|
||||
Err(e) => {
|
||||
error!(e; "Failed to schedule compaction for region: {}", region.region_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ derive_builder = { workspace = true, optional = true }
|
||||
futures.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
lazy_static.workspace = true
|
||||
opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7", features = [
|
||||
opendal = { version = "0.57", features = [
|
||||
"layers-tracing",
|
||||
"layers-prometheus",
|
||||
"services-azblob",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -21,7 +21,7 @@ pub use opendal::raw::{
|
||||
Access, Layer, LayeredAccess, OpDelete, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead,
|
||||
RpWrite, oio,
|
||||
};
|
||||
use opendal::raw::{OpCopy, RpCopy};
|
||||
use opendal::raw::{OpCopier, OpCopy, RpCopy};
|
||||
pub use opendal::{Buffer, Error, ErrorKind, Metadata, Result};
|
||||
|
||||
pub type MockWriterFactory = Arc<dyn Fn(&str, OpWrite, oio::Writer) -> oio::Writer + Send + Sync>;
|
||||
@@ -146,6 +146,7 @@ impl<A: Access> LayeredAccess for MockAccessor<A> {
|
||||
type Writer = MockWriter;
|
||||
type Lister = MockLister;
|
||||
type Deleter = MockDeleter;
|
||||
type Copier = oio::Copier;
|
||||
|
||||
fn inner(&self) -> &Self::Inner {
|
||||
&self.inner
|
||||
@@ -222,15 +223,24 @@ impl<A: Access> LayeredAccess for MockAccessor<A> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn copy(&self, from: &str, to: &str, args: OpCopy) -> Result<RpCopy> {
|
||||
let Some(copy_interceptor) = self.copy_interceptor.as_ref() else {
|
||||
return self.inner.copy(from, to, args).await;
|
||||
};
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &str,
|
||||
to: &str,
|
||||
args: OpCopy,
|
||||
opts: OpCopier,
|
||||
) -> Result<(RpCopy, Self::Copier)> {
|
||||
if let Some(result) = self
|
||||
.copy_interceptor
|
||||
.as_ref()
|
||||
.and_then(|copy_interceptor| copy_interceptor(from, to, args.clone()))
|
||||
{
|
||||
return result.map(|rp_copy| (rp_copy, Box::new(()) as oio::Copier));
|
||||
}
|
||||
|
||||
let Some(result) = copy_interceptor(from, to, args.clone()) else {
|
||||
return self.inner.copy(from, to, args).await;
|
||||
};
|
||||
|
||||
result
|
||||
self.inner
|
||||
.copy(from, to, args, opts)
|
||||
.await
|
||||
.map(|(rp_copy, copier)| (rp_copy, Box::new(copier) as oio::Copier))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,6 @@ pub use opendal::{
|
||||
FuturesAsyncWriter, Lister, Operator as ObjectStore, Reader, Result, Writer, services,
|
||||
};
|
||||
|
||||
pub mod compat;
|
||||
pub mod config;
|
||||
pub mod error;
|
||||
pub mod factory;
|
||||
|
||||
@@ -63,6 +63,7 @@ use table::metadata::TableInfo;
|
||||
use table::requests::{
|
||||
AUTO_CREATE_TABLE_KEY, InsertRequest as TableInsertRequest, TABLE_DATA_MODEL,
|
||||
TABLE_DATA_MODEL_TRACE_V1, TRACE_TABLE_PARTITIONS_HINT_KEY, VALID_TABLE_OPTION_KEYS,
|
||||
is_semantic_option_key,
|
||||
};
|
||||
use table::table_reference::TableReference;
|
||||
|
||||
@@ -83,6 +84,10 @@ pub struct Inserter {
|
||||
pub(crate) partition_manager: PartitionRuleManagerRef,
|
||||
pub(crate) node_manager: NodeManagerRef,
|
||||
pub(crate) table_flownode_set_cache: TableFlownodeSetCacheRef,
|
||||
/// Server-side upper bound for auto table creation on write.
|
||||
/// When `false`, missing tables are never auto-created regardless of the
|
||||
/// per-request `auto_create_table` hint. When `true`, the hint still applies.
|
||||
auto_create_table: bool,
|
||||
}
|
||||
|
||||
pub type InserterRef = Arc<Inserter>;
|
||||
@@ -135,12 +140,14 @@ impl Inserter {
|
||||
partition_manager: PartitionRuleManagerRef,
|
||||
node_manager: NodeManagerRef,
|
||||
table_flownode_set_cache: TableFlownodeSetCacheRef,
|
||||
auto_create_table: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
catalog_manager,
|
||||
partition_manager,
|
||||
node_manager,
|
||||
table_flownode_set_cache,
|
||||
auto_create_table,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -469,6 +476,30 @@ impl Inserter {
|
||||
Ok(inserts)
|
||||
}
|
||||
|
||||
/// Returns `None` if auto table creation is allowed, or `Some(reason)` if
|
||||
/// disabled by either the global config or the request hint. The reason tells
|
||||
/// which one, for a clearer error.
|
||||
fn auto_create_disabled_reason(&self, ctx: &QueryContextRef) -> Result<Option<&'static str>> {
|
||||
let auto_create_table_hint = ctx
|
||||
.extension(AUTO_CREATE_TABLE_KEY)
|
||||
.map(|v| v.parse::<bool>())
|
||||
.transpose()
|
||||
.map_err(|_| {
|
||||
InvalidInsertRequestSnafu {
|
||||
reason: "`auto_create_table` hint must be a boolean",
|
||||
}
|
||||
.build()
|
||||
})?
|
||||
.unwrap_or(true);
|
||||
Ok(if !self.auto_create_table {
|
||||
Some("auto-create table is disabled by frontend config")
|
||||
} else if !auto_create_table_hint {
|
||||
Some("`auto_create_table` hint is disabled")
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates or alter tables on demand:
|
||||
/// - if table does not exist, create table by inferred CreateExpr
|
||||
/// - if table exist, check if schema matches. If any new column found, alter table by inferred `AlterExpr`
|
||||
@@ -498,19 +529,7 @@ impl Inserter {
|
||||
let schema = ctx.current_schema();
|
||||
|
||||
let mut table_infos = HashMap::new();
|
||||
// If `auto_create_table` hint is disabled, skip creating/altering tables.
|
||||
let auto_create_table_hint = ctx
|
||||
.extension(AUTO_CREATE_TABLE_KEY)
|
||||
.map(|v| v.parse::<bool>())
|
||||
.transpose()
|
||||
.map_err(|_| {
|
||||
InvalidInsertRequestSnafu {
|
||||
reason: "`auto_create_table` hint must be a boolean",
|
||||
}
|
||||
.build()
|
||||
})?
|
||||
.unwrap_or(true);
|
||||
if !auto_create_table_hint {
|
||||
if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? {
|
||||
let mut instant_table_ids = HashSet::new();
|
||||
for req in &requests.inserts {
|
||||
let table = self
|
||||
@@ -518,8 +537,8 @@ impl Inserter {
|
||||
.await?
|
||||
.context(InvalidInsertRequestSnafu {
|
||||
reason: format!(
|
||||
"Table `{}` does not exist, and `auto_create_table` hint is disabled",
|
||||
req.table_name
|
||||
"Table `{}` does not exist, and {}",
|
||||
req.table_name, disabled_reason
|
||||
),
|
||||
})?;
|
||||
let table_info = table.table_info();
|
||||
@@ -767,6 +786,16 @@ impl Inserter {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Gate here too, otherwise a disabled switch would still leak the physical table.
|
||||
if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? {
|
||||
return InvalidInsertRequestSnafu {
|
||||
reason: format!(
|
||||
"Physical table `{physical_table}` does not exist, and {disabled_reason}"
|
||||
),
|
||||
}
|
||||
.fail();
|
||||
}
|
||||
|
||||
let table_reference = TableReference::full(catalog_name, &schema_name, &physical_table);
|
||||
info!("Physical metric table `{table_reference}` does not exist, try creating table");
|
||||
|
||||
@@ -1061,6 +1090,13 @@ pub fn fill_table_options_for_create(
|
||||
}
|
||||
}
|
||||
|
||||
// Semantic keys are prefix-matched, not in the fixed allowlist above.
|
||||
for (key, value) in ctx.extensions() {
|
||||
if is_semantic_option_key(&key) {
|
||||
table_options.insert(key, value);
|
||||
}
|
||||
}
|
||||
|
||||
match create_type {
|
||||
AutoCreateTableType::Logical(physical_table) => {
|
||||
table_options.insert(
|
||||
@@ -1333,6 +1369,7 @@ mod tests {
|
||||
Cache::new(100),
|
||||
kv_backend.clone(),
|
||||
)),
|
||||
true,
|
||||
);
|
||||
let alter_expr = inserter
|
||||
.get_alter_table_expr_on_demand(&mut req, &table, &ctx, true, true)
|
||||
@@ -1362,6 +1399,34 @@ mod tests {
|
||||
assert!(!table_options.contains_key(APPEND_MODE_KEY));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fill_table_options_copies_semantic_extensions() {
|
||||
use table::requests::{
|
||||
SEMANTIC_PER_TABLE_INDEX_KEY, SEMANTIC_SIGNAL_TYPE, SEMANTIC_SOURCE,
|
||||
SIGNAL_TYPE_METRIC, SOURCE_OPENTELEMETRY,
|
||||
};
|
||||
|
||||
let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
|
||||
ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
|
||||
ctx.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
|
||||
// The internal transport key must NOT be copied into table options.
|
||||
ctx.set_extension(SEMANTIC_PER_TABLE_INDEX_KEY, "{}");
|
||||
let ctx = Arc::new(ctx);
|
||||
let mut table_options = Default::default();
|
||||
|
||||
fill_table_options_for_create(&mut table_options, &AutoCreateTableType::Physical, &ctx);
|
||||
|
||||
assert_eq!(
|
||||
Some(SIGNAL_TYPE_METRIC),
|
||||
table_options.get(SEMANTIC_SIGNAL_TYPE).map(String::as_str)
|
||||
);
|
||||
assert_eq!(
|
||||
Some(SOURCE_OPENTELEMETRY),
|
||||
table_options.get(SEMANTIC_SOURCE).map(String::as_str)
|
||||
);
|
||||
assert!(!table_options.contains_key(SEMANTIC_PER_TABLE_INDEX_KEY));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_last_non_null_create_options_preserve_default_with_append_mode_false() {
|
||||
let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
|
||||
|
||||
@@ -35,7 +35,9 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_reado
|
||||
use common_catalog::{format_full_flow_name, format_full_table_name};
|
||||
use common_error::ext::BoxedError;
|
||||
use common_meta::cache_invalidator::Context;
|
||||
use common_meta::ddl::create_flow::{DEFER_ON_MISSING_SOURCE_KEY, FlowType};
|
||||
use common_meta::ddl::create_flow::{
|
||||
DEFER_ON_MISSING_SOURCE_KEY, FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType,
|
||||
};
|
||||
use common_meta::instruction::CacheIdent;
|
||||
use common_meta::key::schema_name::{SchemaName, SchemaNameKey};
|
||||
use common_meta::procedure_executor::ExecutorContext;
|
||||
@@ -114,7 +116,10 @@ struct DdlSubmitOptions {
|
||||
timeout: Duration,
|
||||
}
|
||||
|
||||
const ALLOWED_FLOW_OPTIONS: [&str; 1] = [DEFER_ON_MISSING_SOURCE_KEY];
|
||||
const ALLOWED_FLOW_OPTIONS: [&str; 2] = [
|
||||
DEFER_ON_MISSING_SOURCE_KEY,
|
||||
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY,
|
||||
];
|
||||
|
||||
fn build_procedure_id_output(procedure_id: Vec<u8>) -> Result<Output> {
|
||||
let procedure_id = String::from_utf8_lossy(&procedure_id).to_string();
|
||||
@@ -187,7 +192,9 @@ fn validate_and_normalize_flow_options(
|
||||
}
|
||||
|
||||
let normalized_value = match key.as_str() {
|
||||
DEFER_ON_MISSING_SOURCE_KEY => normalize_flow_bool_option(&key, &value)?,
|
||||
DEFER_ON_MISSING_SOURCE_KEY | FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY => {
|
||||
normalize_flow_bool_option(&key, &value)?
|
||||
}
|
||||
_ => {
|
||||
return InvalidSqlSnafu {
|
||||
err_msg: format!(
|
||||
@@ -2478,12 +2485,23 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_validate_and_normalize_flow_options_valid() {
|
||||
let options =
|
||||
HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string())]);
|
||||
let options = HashMap::from([
|
||||
(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string()),
|
||||
(
|
||||
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
|
||||
"FALSE".to_string(),
|
||||
),
|
||||
]);
|
||||
|
||||
assert_eq!(
|
||||
validate_and_normalize_flow_options(options).unwrap(),
|
||||
HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),)])
|
||||
HashMap::from([
|
||||
(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),),
|
||||
(
|
||||
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
|
||||
"false".to_string(),
|
||||
)
|
||||
])
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2497,7 +2515,7 @@ mod test {
|
||||
|
||||
assert!(
|
||||
err.to_string()
|
||||
.contains("unknown flow option 'foo', supported options: defer_on_missing_source")
|
||||
.contains("unknown flow option 'foo', supported options: defer_on_missing_source, experimental_enable_incremental_read")
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
use common_base::Plugins;
|
||||
use datanode::config::DatanodeOptions;
|
||||
use datanode::datanode::Datanode;
|
||||
use datanode::error::Result;
|
||||
|
||||
use crate::options::PluginOptions;
|
||||
@@ -28,6 +29,6 @@ pub async fn setup_datanode_plugins(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn start_datanode_plugins(_plugins: Plugins) -> Result<()> {
|
||||
pub async fn start_datanode_plugins(_instance: &Datanode) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,8 +13,8 @@
|
||||
// limitations under the License.
|
||||
|
||||
use common_base::Plugins;
|
||||
use flow::FlownodeOptions;
|
||||
use flow::error::Result;
|
||||
use flow::{FlownodeInstance, FlownodeOptions};
|
||||
|
||||
use crate::options::PluginOptions;
|
||||
|
||||
@@ -27,7 +27,7 @@ pub async fn setup_flownode_plugins(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn start_flownode_plugins(_plugins: Plugins) -> Result<()> {
|
||||
pub async fn start_flownode_plugins(_instance: &FlownodeInstance) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ use common_base::Plugins;
|
||||
use common_meta::cache::CacheRegistryBuilder;
|
||||
use frontend::error::{IllegalAuthConfigSnafu, Result};
|
||||
use frontend::frontend::FrontendOptions;
|
||||
use frontend::instance::Instance;
|
||||
use snafu::ResultExt;
|
||||
|
||||
use crate::options::PluginOptions;
|
||||
@@ -51,7 +52,7 @@ pub async fn setup_frontend_dynamic_plugins(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn start_frontend_plugins(_plugins: Plugins) -> Result<()> {
|
||||
pub async fn start_frontend_plugins(_instance: &Instance) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -26,4 +26,4 @@ pub use flownode::{setup_flownode_plugins, start_flownode_plugins};
|
||||
pub use frontend::{setup_frontend_plugins, start_frontend_plugins};
|
||||
pub use meta_srv::{setup_metasrv_plugins, start_metasrv_plugins};
|
||||
pub use options::PluginOptions;
|
||||
pub use standalone::{setup_standalone_plugins, start_standalone_plugins};
|
||||
pub use standalone::setup_standalone_plugins;
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
use common_base::Plugins;
|
||||
use meta_srv::bootstrap::MetasrvInstance;
|
||||
use meta_srv::error::Result;
|
||||
use meta_srv::metasrv::MetasrvOptions;
|
||||
|
||||
@@ -27,6 +28,6 @@ pub async fn setup_metasrv_plugins(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn start_metasrv_plugins(_plugins: Plugins) -> Result<()> {
|
||||
pub async fn start_metasrv_plugins(_instance: &MetasrvInstance) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -31,10 +31,6 @@ pub async fn setup_standalone_plugins(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn start_standalone_plugins(_plugins: Plugins) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Allows standalone plugins to add cache invalidators to the layered registry.
|
||||
pub fn configure_cache_registry(_plugins: &Plugins) -> Option<CacheRegistryBuilder> {
|
||||
None
|
||||
|
||||
@@ -31,6 +31,10 @@ use prost::Message;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use session::context::{Channel, QueryContext};
|
||||
use snafu::prelude::*;
|
||||
use table::requests::{
|
||||
METADATA_QUALITY_INFERRED, SEMANTIC_METRIC_METADATA_QUALITY, SEMANTIC_SIGNAL_TYPE,
|
||||
SEMANTIC_SOURCE, SIGNAL_TYPE_METRIC, SOURCE_PROMETHEUS,
|
||||
};
|
||||
|
||||
use crate::error::{self, InternalSnafu, PipelineSnafu, Result};
|
||||
use crate::http::extractor::PipelineInfo;
|
||||
@@ -108,6 +112,13 @@ pub async fn remote_write(
|
||||
.clone()
|
||||
.unwrap_or_else(|| GREPTIME_PHYSICAL_TABLE.to_string());
|
||||
query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table.clone());
|
||||
// Stamp the Prometheus metric identity here, before `as_req_iter` splits into the
|
||||
// batched and direct write paths, so both inherit it (the batched path bypasses
|
||||
// `PromStoreProtocolHandler::write`). Prom RW v1 metadata is weak, so the type is
|
||||
// inferred from naming.
|
||||
query_ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
|
||||
query_ctx.set_extension(SEMANTIC_SOURCE, SOURCE_PROMETHEUS);
|
||||
query_ctx.set_extension(SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_INFERRED);
|
||||
let query_ctx = Arc::new(query_ctx);
|
||||
let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED
|
||||
.with_label_values(&[db.as_str()])
|
||||
|
||||
@@ -40,7 +40,7 @@ use snafu::{ResultExt, ensure};
|
||||
use sqlparser::dialect::Dialect;
|
||||
use sqlparser::keywords::Keyword;
|
||||
use sqlparser::parser::Parser;
|
||||
use table::requests::validate_table_option;
|
||||
use table::requests::{SEMANTIC_PREFIX, validate_semantic_option, validate_table_option};
|
||||
|
||||
use crate::error::{
|
||||
ConvertToLogicalExpressionSnafu, InvalidSqlSnafu, InvalidTableOptionSnafu, ParseSqlValueSnafu,
|
||||
@@ -395,8 +395,18 @@ pub fn parse_with_options(parser: &mut Parser) -> Result<OptionMap> {
|
||||
.into_iter()
|
||||
.map(parse_option_string)
|
||||
.collect::<Result<HashMap<String, OptionValue>>>()?;
|
||||
for key in options.keys() {
|
||||
ensure!(validate_table_option(key), InvalidTableOptionSnafu { key });
|
||||
for (key, value) in &options {
|
||||
if key.starts_with(SEMANTIC_PREFIX) {
|
||||
// Semantic keys are whitelisted and value-checked against their domain,
|
||||
// so a user cannot set an unknown key or an out-of-range value.
|
||||
let value = value.as_string().unwrap_or_default();
|
||||
ensure!(
|
||||
validate_semantic_option(key, value),
|
||||
InvalidTableOptionSnafu { key }
|
||||
);
|
||||
} else {
|
||||
ensure!(validate_table_option(key), InvalidTableOptionSnafu { key });
|
||||
}
|
||||
}
|
||||
Ok(OptionMap::new(options))
|
||||
}
|
||||
|
||||
@@ -868,7 +868,25 @@ ENGINE=mito
|
||||
";
|
||||
let result =
|
||||
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
|
||||
assert_matches!(result, Err(Error::InvalidTableOption { .. }))
|
||||
assert_matches!(result, Err(Error::InvalidTableOption { .. }));
|
||||
|
||||
// A whitelisted semantic key with an in-domain value is accepted.
|
||||
let semantic = |with: &str| {
|
||||
let sql =
|
||||
format!("create table demo(host string, ts timestamp time index) with({with});");
|
||||
ParserContext::create_with_dialect(&sql, &GreptimeDbDialect {}, ParseOptions::default())
|
||||
};
|
||||
assert!(semantic("'greptime.semantic.signal_type'='metric'").is_ok());
|
||||
// An out-of-domain value is rejected.
|
||||
assert_matches!(
|
||||
semantic("'greptime.semantic.signal_type'='spans'"),
|
||||
Err(Error::InvalidTableOption { .. })
|
||||
);
|
||||
// An unknown key under the semantic prefix is rejected.
|
||||
assert_matches!(
|
||||
semantic("'greptime.semantic.bogus'='x'"),
|
||||
Err(Error::InvalidTableOption { .. })
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -38,6 +38,10 @@ pub struct StandaloneOptions {
|
||||
pub enable_telemetry: bool,
|
||||
pub default_timezone: Option<String>,
|
||||
pub default_column_prefix: Option<String>,
|
||||
/// Server-side global switch for auto table creation on write.
|
||||
/// Upper bound: when `false`, missing tables are never auto-created even if a
|
||||
/// request sets the `auto_create_table` hint to `true`. Default: `true`.
|
||||
pub auto_create_table: bool,
|
||||
/// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
|
||||
/// Set to 0 to disable the limit. Default: "0" (unlimited)
|
||||
pub max_in_flight_write_bytes: ReadableSize,
|
||||
@@ -77,6 +81,7 @@ impl Default for StandaloneOptions {
|
||||
enable_telemetry: true,
|
||||
default_timezone: None,
|
||||
default_column_prefix: None,
|
||||
auto_create_table: true,
|
||||
max_in_flight_write_bytes: ReadableSize(0),
|
||||
write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
|
||||
http: HttpOptions::default(),
|
||||
@@ -130,6 +135,7 @@ impl StandaloneOptions {
|
||||
let cloned_opts = self.clone();
|
||||
FrontendOptions {
|
||||
default_timezone: cloned_opts.default_timezone,
|
||||
auto_create_table: cloned_opts.auto_create_table,
|
||||
max_in_flight_write_bytes: cloned_opts.max_in_flight_write_bytes,
|
||||
write_bytes_exhausted_policy: cloned_opts.write_bytes_exhausted_policy,
|
||||
http: cloned_opts.http,
|
||||
|
||||
@@ -48,6 +48,9 @@ use crate::error::{ParseTableOptionSnafu, Result};
|
||||
use crate::metadata::{TableId, TableVersion};
|
||||
use crate::table_reference::TableReference;
|
||||
|
||||
mod semantic;
|
||||
pub use semantic::*;
|
||||
|
||||
pub const FILE_TABLE_META_KEY: &str = "__private.file_table_meta";
|
||||
pub const FILE_TABLE_LOCATION_KEY: &str = "location";
|
||||
pub const FILE_TABLE_PATTERN_KEY: &str = "pattern";
|
||||
@@ -129,6 +132,12 @@ pub fn validate_table_option(key: &str) -> bool {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Semantic-layer keys are admitted through a closed whitelist owned by the
|
||||
// `semantic` module, so new keys are added there rather than in this gate.
|
||||
if is_semantic_option_key(key) {
|
||||
return true;
|
||||
}
|
||||
|
||||
VALID_TABLE_OPTION_KEYS.contains(&key) || VALID_DDL_OPTION_KEYS.contains(&key)
|
||||
}
|
||||
|
||||
@@ -490,6 +499,14 @@ mod tests {
|
||||
assert!(validate_table_option(STORAGE_KEY));
|
||||
assert!(validate_table_option(MEMTABLE_BULK_MERGE_THRESHOLD));
|
||||
assert!(!validate_table_option("foo"));
|
||||
|
||||
// Only whitelisted semantic keys are accepted.
|
||||
assert!(validate_table_option(SEMANTIC_SIGNAL_TYPE));
|
||||
assert!(validate_table_option(SEMANTIC_METRIC_TYPE));
|
||||
// Unknown semantic key, near-miss, and the internal transport key are rejected.
|
||||
assert!(!validate_table_option("greptime.semantic.future.key"));
|
||||
assert!(!validate_table_option("greptime.semanticx"));
|
||||
assert!(!validate_table_option(SEMANTIC_PER_TABLE_INDEX_KEY));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
280
src/table/src/requests/semantic.rs
Normal file
280
src/table/src/requests/semantic.rs
Normal file
@@ -0,0 +1,280 @@
|
||||
// Copyright 2023 Greptime Team
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Table semantic layer vocabulary.
|
||||
//!
|
||||
//! A thin layer of semantic metadata attached to a table via `table_options`, so
|
||||
//! machine consumers (LLM agents, alert/dashboard builders, MCP servers, ETL) can
|
||||
//! align a table with the observability concept it stands for without guessing
|
||||
//! from column names. See `docs/rfcs/2026-05-28-table-semantic-layer.md`.
|
||||
//!
|
||||
//! All public table-option keys share the [`SEMANTIC_PREFIX`] namespace and are
|
||||
//! string-valued. [`is_semantic_option_key`] gates them through
|
||||
//! [`crate::requests::validate_table_option`], so they are accepted both on the
|
||||
//! ingestion auto-create path and on explicit `CREATE TABLE ... WITH (...)` DDL.
|
||||
|
||||
/// Reserved prefix for every public semantic table-option key.
|
||||
pub const SEMANTIC_PREFIX: &str = "greptime.semantic.";
|
||||
|
||||
/// Internal `QueryContext` extension key carrying the per-table semantic index
|
||||
/// (a `{table_name -> {semantic_key: value}}` JSON blob) from the ingestion
|
||||
/// encode path to the auto-create site. Deliberately OUTSIDE [`SEMANTIC_PREFIX`]
|
||||
/// so it is not a valid table option and never leaks into a table's options.
|
||||
pub const SEMANTIC_PER_TABLE_INDEX_KEY: &str = "greptime.internal.semantic.per_table_index";
|
||||
|
||||
// ---- Common keys (all signals) ----
|
||||
|
||||
/// Signal kind: one of [`SIGNAL_TYPE_TRACE`] / [`SIGNAL_TYPE_LOG`] /
|
||||
/// [`SIGNAL_TYPE_METRIC`] / [`SIGNAL_TYPE_EVENT`].
|
||||
pub const SEMANTIC_SIGNAL_TYPE: &str = "greptime.semantic.signal_type";
|
||||
/// Ingestion ecosystem, e.g. [`SOURCE_OPENTELEMETRY`] / [`SOURCE_PROMETHEUS`].
|
||||
pub const SEMANTIC_SOURCE: &str = "greptime.semantic.source";
|
||||
/// Optional protocol or SDK version string, e.g. `v2` (Prom remote write), `1.30.0`.
|
||||
pub const SEMANTIC_SOURCE_VERSION: &str = "greptime.semantic.source_version";
|
||||
/// Internal ingestion pipeline / data model, e.g. `greptime_trace_v1`.
|
||||
pub const SEMANTIC_PIPELINE: &str = "greptime.semantic.pipeline";
|
||||
|
||||
// ---- Trace keys ----
|
||||
|
||||
/// Semantic-conventions version the rows conform to (e.g. `otel-semconv-1.27`),
|
||||
/// or [`SEMANTIC_VALUE_UNKNOWN`] / [`SEMANTIC_VALUE_MIXED`] when not single-valued.
|
||||
pub const SEMANTIC_TRACE_CONVENTIONS: &str = "greptime.semantic.trace.conventions";
|
||||
/// Whether `span_events` are preserved on the table.
|
||||
pub const SEMANTIC_TRACE_HAS_EVENTS: &str = "greptime.semantic.trace.has_events";
|
||||
/// Whether `span_links` are preserved on the table.
|
||||
pub const SEMANTIC_TRACE_HAS_LINKS: &str = "greptime.semantic.trace.has_links";
|
||||
|
||||
// ---- Metric keys (populated in Phase 2) ----
|
||||
|
||||
/// Instrument kind: `counter` / `gauge` / `histogram` / `summary` /
|
||||
/// `updown_counter` / `gauge_histogram` / `info` / `stateset`.
|
||||
pub const SEMANTIC_METRIC_TYPE: &str = "greptime.semantic.metric.type";
|
||||
/// UCUM unit, e.g. `s`, `By`, `{request}`.
|
||||
pub const SEMANTIC_METRIC_UNIT: &str = "greptime.semantic.metric.unit";
|
||||
/// `cumulative` / `delta` (OTel only).
|
||||
pub const SEMANTIC_METRIC_TEMPORALITY: &str = "greptime.semantic.metric.temporality";
|
||||
/// `true` / `false` for sum / counter typed data.
|
||||
pub const SEMANTIC_METRIC_MONOTONIC: &str = "greptime.semantic.metric.monotonic";
|
||||
/// [`METADATA_QUALITY_DECLARED`] when the protocol stated the type, or
|
||||
/// [`METADATA_QUALITY_INFERRED`] when guessed from a name suffix.
|
||||
pub const SEMANTIC_METRIC_METADATA_QUALITY: &str = "greptime.semantic.metric.metadata_quality";
|
||||
/// Pre-translation OTel metric name when the table name was Prometheus-ised.
|
||||
pub const SEMANTIC_METRIC_ORIGINAL_NAME: &str = "greptime.semantic.metric.original_name";
|
||||
|
||||
// ---- Log keys (populated in Phase 3) ----
|
||||
|
||||
/// `otlp` / `syslog` / `custom` — which mapping to use for `severity_number`.
|
||||
pub const SEMANTIC_LOG_SEVERITY_SCHEME: &str = "greptime.semantic.log.severity_scheme";
|
||||
/// `string` / `json` / `mixed` — how to parse `body`.
|
||||
pub const SEMANTIC_LOG_BODY_FORMAT: &str = "greptime.semantic.log.body_format";
|
||||
|
||||
// ---- Resource / scope preservation keys (populated in Phase 3) ----
|
||||
|
||||
/// JSON array string of resource attributes promoted to first-class columns.
|
||||
pub const SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED: &str =
|
||||
"greptime.semantic.resource.attributes_preserved";
|
||||
/// `true` / `false` — whether any resource attribute was dropped at ingest.
|
||||
pub const SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED: &str =
|
||||
"greptime.semantic.resource.attributes_dropped";
|
||||
/// `true` / `false` — whether `scope.name` / `scope.version` survive on the row.
|
||||
pub const SEMANTIC_SCOPE_PRESERVED: &str = "greptime.semantic.scope.preserved";
|
||||
|
||||
// ---- Value constants ----
|
||||
|
||||
/// [`SEMANTIC_SIGNAL_TYPE`] value naming the trace signal.
pub const SIGNAL_TYPE_TRACE: &str = "trace";
|
||||
/// [`SEMANTIC_SIGNAL_TYPE`] value naming the log signal.
pub const SIGNAL_TYPE_LOG: &str = "log";
|
||||
/// [`SEMANTIC_SIGNAL_TYPE`] value naming the metric signal.
pub const SIGNAL_TYPE_METRIC: &str = "metric";
|
||||
/// [`SEMANTIC_SIGNAL_TYPE`] value naming the event signal.
pub const SIGNAL_TYPE_EVENT: &str = "event";
|
||||
|
||||
/// [`SEMANTIC_SOURCE`] value for the OpenTelemetry ingestion ecosystem.
pub const SOURCE_OPENTELEMETRY: &str = "opentelemetry";
|
||||
/// [`SEMANTIC_SOURCE`] value for the Prometheus ingestion ecosystem.
pub const SOURCE_PROMETHEUS: &str = "prometheus";
|
||||
|
||||
/// [`SEMANTIC_METRIC_METADATA_QUALITY`] value: the protocol stated the metric type.
pub const METADATA_QUALITY_DECLARED: &str = "declared";
|
||||
/// [`SEMANTIC_METRIC_METADATA_QUALITY`] value: the metric type was guessed from a name suffix.
pub const METADATA_QUALITY_INFERRED: &str = "inferred";
|
||||
|
||||
/// Sentinel for a key that cannot be determined at stamp time.
|
||||
pub const SEMANTIC_VALUE_UNKNOWN: &str = "unknown";
|
||||
/// Sentinel for a single-valued key that saw conflicting sources.
|
||||
pub const SEMANTIC_VALUE_MIXED: &str = "mixed";
|
||||
|
||||
/// Every recognised public semantic table-option key. The set is a closed
|
||||
/// whitelist: keys under [`SEMANTIC_PREFIX`] that are not listed here are rejected,
|
||||
/// so an unknown key like `greptime.semantic.unknown_key` does not silently land
|
||||
/// in a table's options. Adding a key to the vocabulary means adding it here.
|
||||
pub const SEMANTIC_OPTION_KEYS: &[&str] = &[
|
||||
SEMANTIC_SIGNAL_TYPE,
|
||||
SEMANTIC_SOURCE,
|
||||
SEMANTIC_SOURCE_VERSION,
|
||||
SEMANTIC_PIPELINE,
|
||||
SEMANTIC_TRACE_CONVENTIONS,
|
||||
SEMANTIC_TRACE_HAS_EVENTS,
|
||||
SEMANTIC_TRACE_HAS_LINKS,
|
||||
SEMANTIC_METRIC_TYPE,
|
||||
SEMANTIC_METRIC_UNIT,
|
||||
SEMANTIC_METRIC_TEMPORALITY,
|
||||
SEMANTIC_METRIC_MONOTONIC,
|
||||
SEMANTIC_METRIC_METADATA_QUALITY,
|
||||
SEMANTIC_METRIC_ORIGINAL_NAME,
|
||||
SEMANTIC_LOG_SEVERITY_SCHEME,
|
||||
SEMANTIC_LOG_BODY_FORMAT,
|
||||
SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED,
|
||||
SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED,
|
||||
SEMANTIC_SCOPE_PRESERVED,
|
||||
];
|
||||
|
||||
/// Returns true if `key` is a recognised semantic table-option key (whitelist).
|
||||
///
|
||||
/// Note this is membership, not a prefix test: unknown keys under
|
||||
/// [`SEMANTIC_PREFIX`] are rejected, and the internal
|
||||
/// [`SEMANTIC_PER_TABLE_INDEX_KEY`] (outside the prefix) never matches.
|
||||
pub fn is_semantic_option_key(key: &str) -> bool {
|
||||
SEMANTIC_OPTION_KEYS.contains(&key)
|
||||
}
|
||||
|
||||
/// Validates a `greptime.semantic.*` option's `value` against its allowed domain.
|
||||
///
|
||||
/// Open-value keys (unit, original_name, version, pipeline, conventions, the
|
||||
/// preserved-attributes list) accept any non-empty string. Closed-domain keys
|
||||
/// accept a fixed set, plus the `unknown` sentinel, plus `mixed` for the keys
|
||||
/// where one long-lived table can legitimately see multiple values. Keys not in
|
||||
/// [`SEMANTIC_OPTION_KEYS`] are rejected.
|
||||
pub fn validate_semantic_option(key: &str, value: &str) -> bool {
|
||||
match key {
|
||||
SEMANTIC_SOURCE_VERSION
|
||||
| SEMANTIC_PIPELINE
|
||||
| SEMANTIC_METRIC_UNIT
|
||||
| SEMANTIC_METRIC_ORIGINAL_NAME
|
||||
| SEMANTIC_TRACE_CONVENTIONS
|
||||
| SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED => !value.is_empty(),
|
||||
|
||||
SEMANTIC_SIGNAL_TYPE => matches!(value, "trace" | "log" | "metric" | "event" | "unknown"),
|
||||
SEMANTIC_SOURCE => matches!(
|
||||
value,
|
||||
"opentelemetry"
|
||||
| "prometheus"
|
||||
| "elasticsearch"
|
||||
| "loki"
|
||||
| "custom"
|
||||
| "mixed"
|
||||
| "unknown"
|
||||
),
|
||||
SEMANTIC_METRIC_TYPE => matches!(
|
||||
value,
|
||||
"counter"
|
||||
| "gauge"
|
||||
| "histogram"
|
||||
| "summary"
|
||||
| "updown_counter"
|
||||
| "gauge_histogram"
|
||||
| "info"
|
||||
| "stateset"
|
||||
| "mixed"
|
||||
| "unknown"
|
||||
),
|
||||
SEMANTIC_METRIC_TEMPORALITY => {
|
||||
matches!(value, "cumulative" | "delta" | "mixed" | "unknown")
|
||||
}
|
||||
SEMANTIC_METRIC_MONOTONIC
|
||||
| SEMANTIC_TRACE_HAS_EVENTS
|
||||
| SEMANTIC_TRACE_HAS_LINKS
|
||||
| SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED
|
||||
| SEMANTIC_SCOPE_PRESERVED => matches!(value, "true" | "false" | "unknown"),
|
||||
SEMANTIC_METRIC_METADATA_QUALITY => matches!(value, "declared" | "inferred" | "unknown"),
|
||||
SEMANTIC_LOG_SEVERITY_SCHEME => matches!(value, "otlp" | "syslog" | "custom" | "unknown"),
|
||||
SEMANTIC_LOG_BODY_FORMAT => matches!(value, "string" | "json" | "mixed" | "unknown"),
|
||||
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Key membership: whitelisted keys pass, everything else fails.
    #[test]
    fn test_is_semantic_option_key() {
        assert!(is_semantic_option_key(SEMANTIC_SIGNAL_TYPE));
        assert!(is_semantic_option_key(SEMANTIC_METRIC_TYPE));

        // Whitelist invariant: every listed key is accepted and lives under the
        // reserved prefix, as the module documentation promises.
        for key in SEMANTIC_OPTION_KEYS {
            assert!(is_semantic_option_key(key), "{key} should be whitelisted");
            assert!(
                key.starts_with(SEMANTIC_PREFIX),
                "{key} should start with {}",
                SEMANTIC_PREFIX
            );
        }

        let rejected = [
            // Unknown keys under the prefix are not whitelisted.
            "greptime.semantic.future.key",
            "greptime.semantic.unknown_key",
            // Near-misses must not match.
            "greptime.semanticx",
            "semantic.signal_type",
            "table_data_model",
            // The internal transport key must never be treated as a table option.
            SEMANTIC_PER_TABLE_INDEX_KEY,
        ];
        for key in rejected {
            assert!(
                !is_semantic_option_key(key),
                "{key} must not be whitelisted"
            );
        }
    }

    /// Value domains: each `(key, value)` pair is checked against the validator.
    #[test]
    fn test_validate_semantic_option() {
        // Enum keys, booleans, sentinels and open values that must be accepted.
        let accepted: &[(&str, &str)] = &[
            (SEMANTIC_SIGNAL_TYPE, "metric"),
            (SEMANTIC_METRIC_TYPE, "counter"),
            (SEMANTIC_METRIC_TYPE, "mixed"),
            (SEMANTIC_TRACE_HAS_EVENTS, "true"),
            (SEMANTIC_METRIC_TEMPORALITY, "unknown"),
            (SEMANTIC_METRIC_UNIT, "By"),
        ];
        for &(key, value) in accepted {
            assert!(
                validate_semantic_option(key, value),
                "{key}={value:?} should validate"
            );
        }

        // Out-of-domain values, an empty open value, and an unknown key.
        let rejected: &[(&str, &str)] = &[
            (SEMANTIC_SIGNAL_TYPE, "spans"),
            (SEMANTIC_METRIC_TYPE, "bogus"),
            (SEMANTIC_TRACE_HAS_EVENTS, "yes"),
            (SEMANTIC_METRIC_UNIT, ""),
            ("greptime.semantic.future.key", "x"),
        ];
        for &(key, value) in rejected {
            assert!(
                !validate_semantic_option(key, value),
                "{key}={value:?} should be rejected"
            );
        }

        // Drift guard: every exported value constant must validate for the key
        // it is documented to belong to.
        let exported: &[(&str, &str)] = &[
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE),
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC),
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG),
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_EVENT),
            (SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY),
            (SEMANTIC_SOURCE, SOURCE_PROMETHEUS),
            (SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_DECLARED),
            (SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_INFERRED),
            (SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN),
            (SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_MIXED),
        ];
        for &(key, value) in exported {
            assert!(
                validate_semantic_option(key, value),
                "exported value {value:?} should validate for {key}"
            );
        }

        // An empty value never validates, for any whitelisted key.
        for key in SEMANTIC_OPTION_KEYS {
            assert!(
                !validate_semantic_option(key, ""),
                "empty value should never validate for {key}"
            );
        }
    }
}
|
||||
@@ -200,6 +200,15 @@ impl TableContext {
|
||||
partitions.remove_bound(removed_idx)?;
|
||||
partition_def.exprs = partitions.generate()?;
|
||||
}
|
||||
RepartitionExpr::AlterPartitions(partition) => {
|
||||
ensure!(
|
||||
self.partition.is_none(),
|
||||
error::UnexpectedSnafu {
|
||||
violated: format!("Table {} already has partition", self.name),
|
||||
}
|
||||
);
|
||||
self.partition = Some(partition.partition);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
|
||||
@@ -44,6 +44,7 @@ pub struct CreateTableExprGenerator<R: Rng + 'static> {
|
||||
#[builder(setter(into))]
|
||||
engine: String,
|
||||
partition: usize,
|
||||
partition_column: bool,
|
||||
if_not_exists: bool,
|
||||
#[builder(setter(into))]
|
||||
name: Ident,
|
||||
@@ -67,6 +68,7 @@ impl<R: Rng + 'static> Default for CreateTableExprGenerator<R> {
|
||||
engine: DEFAULT_ENGINE.to_string(),
|
||||
if_not_exists: false,
|
||||
partition: 0,
|
||||
partition_column: false,
|
||||
name: Ident::new(""),
|
||||
with_clause: HashMap::default(),
|
||||
name_generator: Box::new(MappedGenerator::new(WordGenerator, random_capitalize_map)),
|
||||
@@ -95,7 +97,7 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
|
||||
let mut builder = CreateTableExprBuilder::default();
|
||||
let mut columns = Vec::with_capacity(self.columns);
|
||||
let mut primary_keys = vec![];
|
||||
let need_partible_column = self.partition > 1;
|
||||
let need_partible_column = self.partition > 1 || self.partition_column;
|
||||
let mut column_names = self.name_generator.choose(rng, self.columns);
|
||||
|
||||
if self.columns == 1 {
|
||||
@@ -123,13 +125,15 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
|
||||
)
|
||||
.remove(0);
|
||||
|
||||
// Generates partition bounds.
|
||||
let partition_def = generate_partition_def(
|
||||
self.partition,
|
||||
column.column_type.clone(),
|
||||
name.clone(),
|
||||
);
|
||||
builder.partition(partition_def);
|
||||
if self.partition > 1 {
|
||||
// Generates partition bounds.
|
||||
let partition_def = generate_partition_def(
|
||||
self.partition,
|
||||
column.column_type.clone(),
|
||||
name.clone(),
|
||||
);
|
||||
builder.partition(partition_def);
|
||||
}
|
||||
columns.push(column);
|
||||
}
|
||||
// Generates the ts column.
|
||||
@@ -178,11 +182,12 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_partition_def(
|
||||
pub fn generate_partition_def(
|
||||
partitions: usize,
|
||||
column_type: ConcreteDataType,
|
||||
column_name: Ident,
|
||||
) -> PartitionDef {
|
||||
assert!(partitions > 1, "partitions must be greater than 1");
|
||||
let bounds = generate_partition_bounds(&column_type, partitions - 1);
|
||||
let partitions = SimplePartitions::new(column_name.clone(), bounds);
|
||||
let partition_exprs = partitions.generate().unwrap();
|
||||
@@ -193,24 +198,23 @@ fn generate_partition_def(
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> {
|
||||
if partitions <= 1 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let partition_column = Column {
|
||||
fn metric_partition_column() -> Column {
|
||||
Column {
|
||||
name: Ident::new("host"),
|
||||
column_type: ConcreteDataType::string_datatype(),
|
||||
options: vec![ColumnOption::PrimaryKey],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_metric_partition_def(partitions: usize) -> PartitionDef {
|
||||
assert!(partitions > 1, "partitions must be greater than 1");
|
||||
let partition_column = metric_partition_column();
|
||||
let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1);
|
||||
let partitions = SimplePartitions::new(partition_column.name.clone(), bounds);
|
||||
let partition_def = PartitionDef {
|
||||
PartitionDef {
|
||||
columns: vec![partitions.column_name.clone()],
|
||||
exprs: partitions.generate().unwrap(),
|
||||
};
|
||||
|
||||
Some((partition_column, partition_def))
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type.
|
||||
@@ -223,6 +227,8 @@ pub struct CreatePhysicalTableExprGenerator<R: Rng + 'static> {
|
||||
if_not_exists: bool,
|
||||
#[builder(default = "0")]
|
||||
partition: usize,
|
||||
#[builder(default = "false")]
|
||||
partition_column: bool,
|
||||
#[builder(default, setter(into))]
|
||||
with_clause: HashMap<String, String>,
|
||||
}
|
||||
@@ -252,11 +258,13 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreatePhysicalTableExpr
|
||||
|
||||
let mut partition = None;
|
||||
let mut primary_keys = vec![];
|
||||
if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) {
|
||||
columns.push(partition_column);
|
||||
partition = Some(partition_def);
|
||||
if self.partition > 1 || self.partition_column {
|
||||
columns.push(metric_partition_column());
|
||||
primary_keys.push(columns.len() - 1);
|
||||
}
|
||||
if self.partition > 1 {
|
||||
partition = Some(generate_metric_partition_def(self.partition));
|
||||
}
|
||||
|
||||
Ok(CreateTableExpr {
|
||||
table_name: self.name_generator.generate(rng),
|
||||
@@ -387,6 +395,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::context::TableContext;
|
||||
use crate::ir::PARTIBLE_DATA_TYPES;
|
||||
|
||||
#[test]
|
||||
fn test_float64() {
|
||||
@@ -423,6 +432,18 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(expr.columns.len(), 10);
|
||||
assert!(expr.partition.is_none());
|
||||
|
||||
let expr = CreateTableExprGeneratorBuilder::default()
|
||||
.columns(10)
|
||||
.partition(1)
|
||||
.partition_column(true)
|
||||
.build()
|
||||
.unwrap()
|
||||
.generate(&mut rng)
|
||||
.unwrap();
|
||||
assert_eq!(expr.columns.len(), 10);
|
||||
assert!(expr.partition.is_none());
|
||||
assert!(PARTIBLE_DATA_TYPES.contains(&expr.columns[0].column_type));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -516,6 +537,25 @@ mod tests {
|
||||
assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3);
|
||||
}
|
||||
|
||||
/// With `partition(1)` and `partition_column(true)` the generator must add the
/// `host` primary-key column without emitting any partition rule.
#[test]
fn test_create_physical_table_expr_generator_with_partition_column() {
    let mut rng = rand::rng();
    let generator = CreatePhysicalTableExprGeneratorBuilder::default()
        .partition(1)
        .partition_column(true)
        .if_not_exists(false)
        .build()
        .unwrap();
    let expr = generator.generate(&mut rng).unwrap();

    assert_eq!(expr.engine, "metric");
    // A single partition means no partition definition at all.
    assert!(expr.partition.is_none());
    // The `host` column is appended last and is the only primary key.
    assert_eq!(expr.columns.len(), 3);
    assert_eq!(expr.columns[2].name, Ident::new("host"));
    assert_eq!(expr.primary_keys, vec![2]);
}
|
||||
|
||||
#[test]
|
||||
fn test_create_logical_table_expr_generator_without_partition_column() {
|
||||
let mut rng = rand::rng();
|
||||
|
||||
@@ -30,7 +30,7 @@ use std::time::Duration;
|
||||
pub use alter_expr::{AlterTableExpr, AlterTableOption};
|
||||
use common_time::timestamp::TimeUnit;
|
||||
use common_time::{Date, Timestamp};
|
||||
pub use create_expr::{CreateDatabaseExpr, CreateTableExpr};
|
||||
pub use create_expr::{CreateDatabaseExpr, CreateTableExpr, PartitionDef};
|
||||
use datatypes::data_type::ConcreteDataType;
|
||||
use datatypes::types::TimestampType;
|
||||
use datatypes::value::Value;
|
||||
@@ -40,7 +40,7 @@ use lazy_static::lazy_static;
|
||||
pub use partition_expr::SimplePartitions;
|
||||
use rand::Rng;
|
||||
use rand::seq::{IndexedRandom, SliceRandom};
|
||||
pub use repartition_expr::RepartitionExpr;
|
||||
pub use repartition_expr::{AlterTablePartitionsExpr, RepartitionExpr};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use self::insert_expr::RowValues;
|
||||
|
||||
@@ -16,6 +16,7 @@ use partition::expr::PartitionExpr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::ir::Ident;
|
||||
use crate::ir::create_expr::PartitionDef;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SplitPartitionExpr {
|
||||
@@ -34,10 +35,19 @@ pub struct MergePartitionExpr {
|
||||
pub wait: bool,
|
||||
}
|
||||
|
||||
/// Describes an `ALTER TABLE <table> PARTITION ON COLUMNS (...) (...)` statement
/// that installs a full partition definition on a table.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AlterTablePartitionsExpr {
|
||||
/// Name of the table being altered.
pub table_name: Ident,
|
||||
/// Partition columns and partition expressions to apply.
pub partition: PartitionDef,
|
||||
/// Value of the statement's `WAIT` option; when the field is missing from
/// serialized input it is filled in by `default_wait`.
#[serde(default = "default_wait")]
|
||||
pub wait: bool,
|
||||
}
|
||||
|
||||
/// A repartition statement, one variant per supported kind of change.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum RepartitionExpr {
|
||||
/// Carries a [`SplitPartitionExpr`].
Split(SplitPartitionExpr),
|
||||
/// Carries a [`MergePartitionExpr`].
Merge(MergePartitionExpr),
|
||||
/// Carries an [`AlterTablePartitionsExpr`], i.e. a whole new partition definition.
AlterPartitions(AlterTablePartitionsExpr),
|
||||
}
|
||||
|
||||
const fn default_wait() -> bool {
|
||||
|
||||
@@ -15,7 +15,10 @@
|
||||
use partition::expr::PartitionExpr;
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr};
|
||||
use crate::ir::create_expr::PartitionDef;
|
||||
use crate::ir::repartition_expr::{
|
||||
AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr,
|
||||
};
|
||||
use crate::translator::DslTranslator;
|
||||
|
||||
pub struct RepartitionExprTranslator;
|
||||
@@ -59,10 +62,38 @@ impl DslTranslator<RepartitionExpr, String> for RepartitionExprTranslator {
|
||||
table_name, merge_exprs, wait_clause
|
||||
))
|
||||
}
|
||||
RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
|
||||
table_name,
|
||||
partition,
|
||||
wait,
|
||||
}) => {
|
||||
let partition_clause = format_partition_clause(partition);
|
||||
let wait_clause = format_wait_clause(*wait);
|
||||
Ok(format!(
|
||||
"ALTER TABLE {} {}{};",
|
||||
table_name, partition_clause, wait_clause
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn format_partition_clause(partition: &PartitionDef) -> String {
|
||||
let columns = partition
|
||||
.columns
|
||||
.iter()
|
||||
.map(|column| column.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ");
|
||||
let exprs = partition
|
||||
.exprs
|
||||
.iter()
|
||||
.map(format_partition_expr_sql)
|
||||
.collect::<Vec<_>>()
|
||||
.join(",\n ");
|
||||
format!("PARTITION ON COLUMNS ({columns}) (\n {exprs}\n)")
|
||||
}
|
||||
|
||||
fn format_partition_expr_sql(expr: &PartitionExpr) -> String {
|
||||
expr.to_parser_expr().to_string()
|
||||
}
|
||||
@@ -79,9 +110,15 @@ fn format_wait_clause(wait: bool) -> String {
|
||||
mod tests {
|
||||
use datatypes::value::Value;
|
||||
use partition::expr::col;
|
||||
use sql::dialect::GreptimeDbDialect;
|
||||
use sql::parser::{ParseOptions, ParserContext};
|
||||
|
||||
use super::RepartitionExprTranslator;
|
||||
use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr};
|
||||
use crate::ir::Ident;
|
||||
use crate::ir::create_expr::PartitionDef;
|
||||
use crate::ir::repartition_expr::{
|
||||
AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr,
|
||||
};
|
||||
use crate::translator::DslTranslator;
|
||||
|
||||
#[test]
|
||||
@@ -149,4 +186,61 @@ mod tests {
|
||||
);"#;
|
||||
assert_eq!(sql, expected);
|
||||
}
|
||||
|
||||
// Translating an `AlterPartitions` expression with `wait: true` must produce
// an `ALTER TABLE ... PARTITION ON COLUMNS` statement with no `WITH` clause,
// and the result must be accepted by the SQL parser.
#[test]
|
||||
fn test_translate_alter_table_partitions_expr() {
|
||||
// Three ranges over the `id` column: below 10, 10 up to 20, and 20 or above.
let expr = RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
|
||||
table_name: "demo".into(),
|
||||
partition: PartitionDef {
|
||||
columns: vec![Ident::new("id")],
|
||||
exprs: vec![
|
||||
col("id").lt(Value::Int32(10)),
|
||||
col("id")
|
||||
.gt_eq(Value::Int32(10))
|
||||
.and(col("id").lt(Value::Int32(20))),
|
||||
col("id").gt_eq(Value::Int32(20)),
|
||||
],
|
||||
},
|
||||
wait: true,
|
||||
});
|
||||
let sql = RepartitionExprTranslator.translate(&expr).unwrap();
|
||||
let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (id) (
|
||||
id < 10,
|
||||
id >= 10 AND id < 20,
|
||||
id >= 20
|
||||
);"#;
|
||||
assert_eq!(sql, expected);
|
||||
// The generated text must also round-trip through the real parser.
assert_repartition_sql_parseable(&sql);
|
||||
}
|
||||
|
||||
// Translating an `AlterPartitions` expression with `wait: false` must append a
// `WITH (WAIT = false)` clause, and the result must be accepted by the SQL parser.
#[test]
|
||||
fn test_translate_alter_table_partitions_expr_wait_false() {
|
||||
// Two ranges over the string column `host`, split at 'm'.
let expr = RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
|
||||
table_name: "demo".into(),
|
||||
partition: PartitionDef {
|
||||
columns: vec![Ident::new("host")],
|
||||
exprs: vec![
|
||||
col("host").lt(Value::from("m")),
|
||||
col("host").gt_eq(Value::from("m")),
|
||||
],
|
||||
},
|
||||
wait: false,
|
||||
});
|
||||
let sql = RepartitionExprTranslator.translate(&expr).unwrap();
|
||||
let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (host) (
|
||||
host < 'm',
|
||||
host >= 'm'
|
||||
) WITH (
|
||||
WAIT = false
|
||||
);"#;
|
||||
assert_eq!(sql, expected);
|
||||
// The generated text must also round-trip through the real parser.
assert_repartition_sql_parseable(&sql);
|
||||
}
|
||||
|
||||
fn assert_repartition_sql_parseable(sql: &str) {
|
||||
let statements =
|
||||
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
|
||||
.unwrap();
|
||||
assert_eq!(statements.len(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,8 @@ use crate::ir::Ident;
|
||||
use crate::ir::create_expr::PartitionDef;
|
||||
|
||||
const PARTITIONS_INFO_SCHEMA_SQL: &str = "SELECT table_catalog, table_schema, table_name, \
|
||||
partition_name, partition_expression, partition_description, greptime_partition_id, \
|
||||
partition_name, COALESCE(partition_expression, '') AS partition_expression, \
|
||||
COALESCE(partition_description, '') AS partition_description, greptime_partition_id, \
|
||||
partition_ordinal_position FROM information_schema.partitions WHERE table_name = ? \
|
||||
ORDER BY partition_ordinal_position;";
|
||||
|
||||
@@ -91,3 +92,20 @@ pub fn assert_partitions(expected: &PartitionDef, actual: &[PartitionInfo]) -> R
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Asserts that the table has no partition metadata in information schema.
|
||||
pub fn assert_unpartitioned(actual: &[PartitionInfo]) -> Result<()> {
|
||||
let has_no_partition_metadata = actual.is_empty()
|
||||
|| (actual.len() == 1
|
||||
&& actual[0].partition_expression.is_empty()
|
||||
&& actual[0].partition_description.is_empty());
|
||||
|
||||
ensure!(
|
||||
has_no_partition_metadata,
|
||||
error::AssertSnafu {
|
||||
reason: format!("Expected unpartitioned table, got partitions: {actual:?}"),
|
||||
}
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -36,14 +36,15 @@ use tests_fuzz::fake::{
|
||||
use tests_fuzz::generator::Generator;
|
||||
use tests_fuzz::generator::create_expr::{
|
||||
CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder,
|
||||
generate_metric_partition_def,
|
||||
};
|
||||
use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
|
||||
use tests_fuzz::generator::repartition_expr::{
|
||||
MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
|
||||
};
|
||||
use tests_fuzz::ir::{
|
||||
CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value,
|
||||
generate_unique_timestamp_for_mysql_with_clock,
|
||||
AlterTablePartitionsExpr, CreateTableExpr, Ident, InsertIntoExpr, PartitionDef,
|
||||
RepartitionExpr, generate_random_value, generate_unique_timestamp_for_mysql_with_clock,
|
||||
};
|
||||
use tests_fuzz::translator::DslTranslator;
|
||||
use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator;
|
||||
@@ -94,6 +95,7 @@ fn generate_create_physical_table_expr<R: Rng + 'static>(
|
||||
))))
|
||||
.if_not_exists(rng.random_bool(0.5))
|
||||
.partition(partitions)
|
||||
.partition_column(partitions <= 1)
|
||||
.build()
|
||||
.unwrap()
|
||||
.generate(rng)
|
||||
@@ -158,12 +160,6 @@ async fn create_metric_tables<R: Rng + 'static>(
|
||||
})?;
|
||||
info!("Create physical table: {create_physical_sql}, result: {result:?}");
|
||||
let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr));
|
||||
ensure!(
|
||||
physical_table_ctx.partition.is_some(),
|
||||
error::AssertSnafu {
|
||||
reason: "Physical metric table must have partition".to_string()
|
||||
}
|
||||
);
|
||||
|
||||
let mut logical_tables = BTreeMap::new();
|
||||
let mut create_logical_sqls = HashMap::new();
|
||||
@@ -436,6 +432,11 @@ fn repartition_operation<R: Rng + 'static>(
|
||||
table_ctx: &TableContextRef,
|
||||
rng: &mut R,
|
||||
) -> Result<RepartitionExpr> {
|
||||
if table_ctx.partition.is_none() {
|
||||
let partition = generate_metric_partition_def(rng.random_range(2..8));
|
||||
return Ok(alter_table_partitions_expr(table_ctx, partition, true));
|
||||
}
|
||||
|
||||
let split = rng.random_bool(0.5);
|
||||
if table_ctx.partition.as_ref().unwrap().exprs.len() <= 2 || split {
|
||||
let expr = SplitPartitionExprGeneratorBuilder::default()
|
||||
@@ -454,19 +455,35 @@ fn repartition_operation<R: Rng + 'static>(
|
||||
}
|
||||
}
|
||||
|
||||
fn alter_table_partitions_expr(
|
||||
table_ctx: &TableContextRef,
|
||||
partition: PartitionDef,
|
||||
wait: bool,
|
||||
) -> RepartitionExpr {
|
||||
RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
|
||||
table_name: table_ctx.name.clone(),
|
||||
partition,
|
||||
wait,
|
||||
})
|
||||
}
|
||||
|
||||
impl Arbitrary<'_> for FuzzInput {
|
||||
fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
|
||||
let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
|
||||
let mut rng = ChaChaRng::seed_from_u64(seed);
|
||||
let partitions =
|
||||
get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
|
||||
let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
|
||||
if rng.random_bool(0.5) {
|
||||
1
|
||||
} else {
|
||||
rng.random_range(2..8)
|
||||
}
|
||||
});
|
||||
let max_tables = get_gt_fuzz_input_max_tables();
|
||||
let tables = get_fuzz_override::<usize>("TABLES")
|
||||
.unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables)));
|
||||
let max_actions = get_gt_fuzz_input_max_alter_actions();
|
||||
let max_actions = std::cmp::min(128, get_gt_fuzz_input_max_alter_actions());
|
||||
let actions = get_fuzz_override::<usize>("ACTIONS")
|
||||
.unwrap_or_else(|| rng.random_range(1..max_actions));
|
||||
|
||||
Ok(FuzzInput {
|
||||
seed,
|
||||
actions,
|
||||
@@ -536,7 +553,11 @@ async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) ->
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
|
||||
for i in 0..input.actions {
|
||||
let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len();
|
||||
let partition_num = physical_table_ctx
|
||||
.partition
|
||||
.as_ref()
|
||||
.map(|partition| partition.exprs.len())
|
||||
.unwrap_or_default();
|
||||
info!(
|
||||
"partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}",
|
||||
i + 1,
|
||||
|
||||
@@ -33,14 +33,15 @@ use tests_fuzz::fake::{
|
||||
uppercase_and_keyword_backtick_map,
|
||||
};
|
||||
use tests_fuzz::generator::Generator;
|
||||
use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
|
||||
use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def};
|
||||
use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
|
||||
use tests_fuzz::generator::repartition_expr::{
|
||||
MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
|
||||
};
|
||||
use tests_fuzz::ir::{
|
||||
CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue,
|
||||
SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock,
|
||||
AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator,
|
||||
PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value,
|
||||
generate_unique_timestamp_for_mysql_with_clock,
|
||||
};
|
||||
use tests_fuzz::translator::DslTranslator;
|
||||
use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
|
||||
@@ -75,8 +76,13 @@ impl Arbitrary<'_> for FuzzInput {
|
||||
fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
|
||||
let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
|
||||
let mut rng = ChaChaRng::seed_from_u64(seed);
|
||||
let partitions =
|
||||
get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
|
||||
let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
|
||||
if rng.random_bool(0.5) {
|
||||
1
|
||||
} else {
|
||||
rng.random_range(2..8)
|
||||
}
|
||||
});
|
||||
let max_actions = get_gt_fuzz_input_max_alter_actions();
|
||||
let actions = get_fuzz_override::<usize>("ACTIONS")
|
||||
.unwrap_or_else(|| rng.random_range(1..max_actions));
|
||||
@@ -99,6 +105,7 @@ fn generate_create_expr<R: Rng + 'static>(
|
||||
)))
|
||||
.columns(5)
|
||||
.partition(input.partitions)
|
||||
.partition_column(input.partitions <= 1)
|
||||
.engine("mito")
|
||||
.ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator))
|
||||
.build()
|
||||
@@ -122,7 +129,7 @@ fn build_insert_expr<R: Rng + 'static>(
|
||||
let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone());
|
||||
let counter = Arc::new(AtomicUsize::new(0));
|
||||
let counter_clone = counter.clone();
|
||||
let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len();
|
||||
let partition_len = partitions.bounds.len() + 1;
|
||||
let row = rng.random_range(partition_len..partition_len * 2);
|
||||
|
||||
let moved_partitions = partitions.clone();
|
||||
@@ -150,6 +157,28 @@ fn build_insert_expr<R: Rng + 'static>(
|
||||
insert_generator.generate(rng).unwrap()
|
||||
}
|
||||
|
||||
fn alter_table_partitions_expr(
|
||||
table_ctx: &TableContextRef,
|
||||
partition: PartitionDef,
|
||||
wait: bool,
|
||||
) -> RepartitionExpr {
|
||||
RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
|
||||
table_name: table_ctx.name.clone(),
|
||||
partition,
|
||||
wait,
|
||||
})
|
||||
}
|
||||
|
||||
fn alter_table_partitions_expr_from_table_ctx<R: Rng + 'static>(
|
||||
table_ctx: &TableContextRef,
|
||||
rng: &mut R,
|
||||
wait: bool,
|
||||
) -> RepartitionExpr {
|
||||
let column = table_ctx.columns[0].clone();
|
||||
let partition = generate_partition_def(rng.random_range(2..8), column.column_type, column.name);
|
||||
alter_table_partitions_expr(table_ctx, partition, wait)
|
||||
}
|
||||
|
||||
async fn execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> {
|
||||
let mut delay = Duration::from_millis(100);
|
||||
let mut attempt = 0;
|
||||
@@ -236,9 +265,36 @@ async fn execute_repartition_table(ctx: FuzzContext, input: FuzzInput) -> Result
|
||||
inserted_rows: 0,
|
||||
}));
|
||||
|
||||
let mut action_start = 0;
|
||||
if table_ctx.partition.is_none() {
|
||||
let expr = alter_table_partitions_expr_from_table_ctx(&table_ctx, &mut rng, true);
|
||||
let translator = RepartitionExprTranslator;
|
||||
let sql = translator.translate(&expr)?;
|
||||
info!("Initial partition sql: {sql}");
|
||||
let result = sqlx::query(&sql)
|
||||
.execute(&ctx.greptime)
|
||||
.await
|
||||
.context(error::ExecuteQuerySnafu { sql: &sql })?;
|
||||
info!("Initial partition result: {result:?}");
|
||||
table_ctx = Arc::new(Arc::unwrap_or_clone(table_ctx).repartition(expr).unwrap());
|
||||
shared_state.lock().unwrap().table_ctx = table_ctx.clone();
|
||||
|
||||
let partition_entries = validator::partition::fetch_partitions_info_schema(
|
||||
&ctx.greptime,
|
||||
"public".into(),
|
||||
&table_ctx.name,
|
||||
)
|
||||
.await?;
|
||||
validator::partition::assert_partitions(
|
||||
table_ctx.partition.as_ref().unwrap(),
|
||||
&partition_entries,
|
||||
)?;
|
||||
action_start = 1;
|
||||
}
|
||||
|
||||
let writer_rng = ChaChaRng::seed_from_u64(input.seed);
|
||||
let writer_task = tokio::spawn(write_loop(writer_rng, ctx.clone(), shared_state.clone()));
|
||||
for i in 0..input.actions {
|
||||
for i in action_start..input.actions {
|
||||
let partition_num = table_ctx.partition.as_ref().unwrap().exprs.len();
|
||||
info!(
|
||||
"partition_num: {partition_num}, action: {}/{}",
|
||||
|
||||
@@ -34,14 +34,15 @@ use tests_fuzz::fake::{
|
||||
uppercase_and_keyword_backtick_map,
|
||||
};
|
||||
use tests_fuzz::generator::Generator;
|
||||
use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
|
||||
use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def};
|
||||
use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
|
||||
use tests_fuzz::generator::repartition_expr::{
|
||||
MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
|
||||
};
|
||||
use tests_fuzz::ir::{
|
||||
CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue,
|
||||
SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock,
|
||||
AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator,
|
||||
PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value,
|
||||
generate_unique_timestamp_for_mysql_with_clock,
|
||||
};
|
||||
use tests_fuzz::translator::DslTranslator;
|
||||
use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
|
||||
@@ -93,13 +94,17 @@ impl Arbitrary<'_> for FuzzInput {
|
||||
let mut rng = ChaChaRng::seed_from_u64(seed);
|
||||
let rows = get_fuzz_override::<usize>("ROWS")
|
||||
.unwrap_or_else(|| rng.random_range(2..get_gt_fuzz_input_max_rows()));
|
||||
let partitions =
|
||||
get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
|
||||
let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
|
||||
if rng.random_bool(0.5) {
|
||||
1
|
||||
} else {
|
||||
rng.random_range(2..8)
|
||||
}
|
||||
});
|
||||
let chaos_delay_ms =
|
||||
get_fuzz_override::<u64>("CHAOS_DELAY_MS").unwrap_or_else(|| rng.random_range(0..5000));
|
||||
let chaos_hold_secs =
|
||||
get_fuzz_override::<u64>("CHAOS_HOLD_SECS").unwrap_or_else(|| rng.random_range(10..20));
|
||||
|
||||
Ok(FuzzInput {
|
||||
seed,
|
||||
rows,
|
||||
@@ -127,6 +132,7 @@ fn generate_create_expr<R: Rng + 'static>(
|
||||
)))
|
||||
.columns(5)
|
||||
.partition(input.partitions)
|
||||
.partition_column(input.partitions <= 1)
|
||||
.engine("mito")
|
||||
.ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator))
|
||||
.build()
|
||||
@@ -144,7 +150,7 @@ fn build_insert_expr<R: Rng + 'static>(
|
||||
let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone());
|
||||
let counter = Arc::new(AtomicUsize::new(0));
|
||||
let counter_clone = counter.clone();
|
||||
let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len();
|
||||
let partition_len = partitions.bounds.len() + 1;
|
||||
let moved_partitions = partitions.clone();
|
||||
let insert_generator = InsertExprGeneratorBuilder::default()
|
||||
.table_ctx(table_ctx.clone())
|
||||
@@ -202,10 +208,12 @@ async fn create_table(ctx: &FuzzContext, expr: &CreateTableExpr) -> Result<Table
|
||||
async fn insert_initial_rows<R: Rng + 'static>(
|
||||
ctx: &FuzzContext,
|
||||
table_ctx: &TableContextRef,
|
||||
partition_def: &PartitionDef,
|
||||
rng: &mut R,
|
||||
rows: usize,
|
||||
) -> Result<u64> {
|
||||
let partitions = SimplePartitions::from_table_ctx(table_ctx).unwrap();
|
||||
let partitions =
|
||||
SimplePartitions::from_exprs(partition_def.columns[0].clone(), &partition_def.exprs)?;
|
||||
let clock = Arc::new(Mutex::new(Timestamp::current_millis()));
|
||||
let insert_expr = build_insert_expr(table_ctx, rng, &partitions, &clock, rows);
|
||||
let inserted_rows = insert_expr.values_list.len() as u64;
|
||||
@@ -260,6 +268,28 @@ fn repartition_operation<R: Rng + 'static>(
|
||||
}
|
||||
}
|
||||
|
||||
fn alter_table_partitions_expr(
|
||||
table_ctx: &TableContextRef,
|
||||
partition: PartitionDef,
|
||||
wait: bool,
|
||||
) -> RepartitionExpr {
|
||||
RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
|
||||
table_name: table_ctx.name.clone(),
|
||||
partition,
|
||||
wait,
|
||||
})
|
||||
}
|
||||
|
||||
fn alter_table_partitions_expr_from_table_ctx<R: Rng + 'static>(
|
||||
table_ctx: &TableContextRef,
|
||||
rng: &mut R,
|
||||
wait: bool,
|
||||
) -> RepartitionExpr {
|
||||
let column = table_ctx.columns[0].clone();
|
||||
let partition = generate_partition_def(rng.random_range(2..8), column.column_type, column.name);
|
||||
alter_table_partitions_expr(table_ctx, partition, wait)
|
||||
}
|
||||
|
||||
async fn submit_repartition_procedure(ctx: &FuzzContext, expr: &RepartitionExpr) -> Result<String> {
|
||||
let translator = RepartitionExprTranslator;
|
||||
let sql = translator.translate(expr)?;
|
||||
@@ -334,10 +364,13 @@ async fn validate_terminal_metadata(
|
||||
after_table_ctx.partition.as_ref().unwrap(),
|
||||
&partition_entries,
|
||||
)?,
|
||||
ProcedureTerminalState::Failed => validator::partition::assert_partitions(
|
||||
before_table_ctx.partition.as_ref().unwrap(),
|
||||
&partition_entries,
|
||||
)?,
|
||||
ProcedureTerminalState::Failed => {
|
||||
if let Some(partition) = before_table_ctx.partition.as_ref() {
|
||||
validator::partition::assert_partitions(partition, &partition_entries)?;
|
||||
} else {
|
||||
validator::partition::assert_unpartitioned(&partition_entries)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -359,7 +392,21 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result
|
||||
|
||||
let create_expr = generate_create_expr(&input, &mut rng)?;
|
||||
let before_table_ctx = create_table(&ctx, &create_expr).await?;
|
||||
let inserted_rows = insert_initial_rows(&ctx, &before_table_ctx, &mut rng, input.rows).await?;
|
||||
let insert_partition = create_expr.partition.clone().unwrap_or_else(|| {
|
||||
generate_partition_def(
|
||||
2,
|
||||
before_table_ctx.columns[0].column_type.clone(),
|
||||
before_table_ctx.columns[0].name.clone(),
|
||||
)
|
||||
});
|
||||
let inserted_rows = insert_initial_rows(
|
||||
&ctx,
|
||||
&before_table_ctx,
|
||||
&insert_partition,
|
||||
&mut rng,
|
||||
input.rows,
|
||||
)
|
||||
.await?;
|
||||
validate_table_rows(&ctx, &before_table_ctx, inserted_rows).await?;
|
||||
|
||||
let before_entries = validator::partition::fetch_partitions_info_schema(
|
||||
@@ -370,7 +417,11 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result
|
||||
.await?;
|
||||
info!("Before repartition partition entries: {before_entries:?}");
|
||||
|
||||
let repartition_expr = repartition_operation(&before_table_ctx, &mut rng, false)?;
|
||||
let repartition_expr = if before_table_ctx.partition.is_some() {
|
||||
repartition_operation(&before_table_ctx, &mut rng, false)?
|
||||
} else {
|
||||
alter_table_partitions_expr_from_table_ctx(&before_table_ctx, &mut rng, false)
|
||||
};
|
||||
let after_table_ctx = Arc::new(
|
||||
Arc::unwrap_or_clone(before_table_ctx.clone())
|
||||
.repartition(repartition_expr.clone())
|
||||
|
||||
@@ -80,6 +80,7 @@ pub struct GreptimeDbStandaloneBuilder {
|
||||
default_store: Option<StorageType>,
|
||||
plugin: Option<Plugins>,
|
||||
slow_query_options: SlowQueryOptions,
|
||||
auto_create_table: bool,
|
||||
}
|
||||
|
||||
impl GreptimeDbStandaloneBuilder {
|
||||
@@ -97,9 +98,16 @@ impl GreptimeDbStandaloneBuilder {
|
||||
threshold: Duration::from_secs(1),
|
||||
..Default::default()
|
||||
},
|
||||
auto_create_table: true,
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_auto_create_table(mut self, auto_create_table: bool) -> Self {
|
||||
self.auto_create_table = auto_create_table;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn with_default_store_type(self, store_type: StorageType) -> Self {
|
||||
Self {
|
||||
@@ -347,6 +355,7 @@ impl GreptimeDbStandaloneBuilder {
|
||||
wal: self.metasrv_wal_config.clone().into(),
|
||||
grpc: GrpcOptions::default().with_server_addr("127.0.0.1:4001"),
|
||||
slow_query: self.slow_query_options.clone(),
|
||||
auto_create_table: self.auto_create_table,
|
||||
..StandaloneOptions::default()
|
||||
};
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use auth::{DefaultPermissionChecker, PermissionCheckerRef, UserProviderRef};
|
||||
use axum::Router;
|
||||
@@ -49,6 +50,7 @@ use servers::http::{HttpOptions, HttpServerBuilder};
|
||||
use servers::metrics_handler::MetricsHandler;
|
||||
use servers::mysql::server::{MysqlServer, MysqlSpawnConfig, MysqlSpawnRef};
|
||||
use servers::otel_arrow::OtelArrowServiceHandler;
|
||||
use servers::pending_rows_batcher::PendingRowsBatcher;
|
||||
use servers::postgres::PostgresServer;
|
||||
use servers::prom_remote_write::validation::PromValidationMode;
|
||||
use servers::query_handler::sql::SqlQueryHandler;
|
||||
@@ -564,6 +566,24 @@ async fn run_sql(sql: &str, instance: &GreptimeDbStandalone) {
|
||||
pub async fn setup_test_prom_app_with_frontend(
|
||||
store_type: StorageType,
|
||||
name: &str,
|
||||
) -> (Router, TestGuard) {
|
||||
setup_test_prom_app_with_frontend_inner(store_type, name, false).await
|
||||
}
|
||||
|
||||
/// Like [`setup_test_prom_app_with_frontend`] but enables the pending-rows batcher,
|
||||
/// so Prometheus remote write goes through the batched (metric-engine) path instead
|
||||
/// of the direct `PromStoreProtocolHandler::write` path.
|
||||
pub async fn setup_test_prom_app_with_frontend_batched(
|
||||
store_type: StorageType,
|
||||
name: &str,
|
||||
) -> (Router, TestGuard) {
|
||||
setup_test_prom_app_with_frontend_inner(store_type, name, true).await
|
||||
}
|
||||
|
||||
async fn setup_test_prom_app_with_frontend_inner(
|
||||
store_type: StorageType,
|
||||
name: &str,
|
||||
enable_batcher: bool,
|
||||
) -> (Router, TestGuard) {
|
||||
unsafe {
|
||||
std::env::set_var("TZ", "UTC");
|
||||
@@ -617,6 +637,24 @@ pub async fn setup_test_prom_app_with_frontend(
|
||||
..Default::default()
|
||||
};
|
||||
let frontend_ref = instance.fe_instance().clone();
|
||||
// Mirror the production wiring at `frontend::server`: build the batcher from the
|
||||
// instance's managers. A short flush interval keeps the test responsive.
|
||||
let pending_rows_batcher = if enable_batcher {
|
||||
PendingRowsBatcher::try_new(
|
||||
frontend_ref.partition_manager().clone(),
|
||||
frontend_ref.node_manager().clone(),
|
||||
frontend_ref.catalog_manager().clone(),
|
||||
true,
|
||||
frontend_ref.clone(),
|
||||
Duration::from_millis(50),
|
||||
1000,
|
||||
4,
|
||||
64,
|
||||
64,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let http_server = HttpServerBuilder::new(http_opts)
|
||||
.with_sql_handler(frontend_ref.clone())
|
||||
.with_logs_handler(instance.fe_instance().clone())
|
||||
@@ -625,7 +663,7 @@ pub async fn setup_test_prom_app_with_frontend(
|
||||
Some(frontend_ref.clone()),
|
||||
true,
|
||||
PromValidationMode::Strict,
|
||||
None,
|
||||
pending_rows_batcher,
|
||||
)
|
||||
.with_prometheus_handler(frontend_ref)
|
||||
.with_greptime_config_options(instance.opts.datanode_options().to_toml().unwrap())
|
||||
@@ -649,6 +687,20 @@ pub async fn setup_grpc_server_with_user_provider(
|
||||
setup_grpc_server_with(store_type, name, user_provider, None, None).await
|
||||
}
|
||||
|
||||
/// Sets up a gRPC server backed by a standalone instance whose frontend has auto
|
||||
/// table creation disabled, for testing the server-side global switch.
|
||||
pub async fn setup_grpc_server_with_auto_create_table_disabled(
|
||||
store_type: StorageType,
|
||||
name: &str,
|
||||
) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
|
||||
let instance = GreptimeDbStandaloneBuilder::new(name)
|
||||
.with_default_store_type(store_type)
|
||||
.with_auto_create_table(false)
|
||||
.build()
|
||||
.await;
|
||||
setup_grpc_server_for_instance(instance, None, None, None).await
|
||||
}
|
||||
|
||||
pub async fn setup_grpc_server_with(
|
||||
store_type: StorageType,
|
||||
name: &str,
|
||||
@@ -657,7 +709,17 @@ pub async fn setup_grpc_server_with(
|
||||
memory_limiter: Option<servers::request_memory_limiter::ServerMemoryLimiter>,
|
||||
) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
|
||||
let instance = setup_standalone_instance(name, store_type).await;
|
||||
setup_grpc_server_for_instance(instance, user_provider, grpc_config, memory_limiter).await
|
||||
}
|
||||
|
||||
/// Builds and starts a gRPC server on top of an already-constructed standalone
|
||||
/// instance. This is the shared core behind the `setup_grpc_server_*` helpers.
|
||||
async fn setup_grpc_server_for_instance(
|
||||
instance: GreptimeDbStandalone,
|
||||
user_provider: Option<UserProviderRef>,
|
||||
grpc_config: Option<GrpcServerConfig>,
|
||||
memory_limiter: Option<servers::request_memory_limiter::ServerMemoryLimiter>,
|
||||
) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
|
||||
let runtime: Runtime = RuntimeBuilder::default()
|
||||
.worker_threads(2)
|
||||
.thread_name("grpc-handlers")
|
||||
|
||||
@@ -44,7 +44,8 @@ use servers::request_memory_limiter::ServerMemoryLimiter;
|
||||
use servers::server::Server;
|
||||
use servers::tls::{TlsMode, TlsOption};
|
||||
use tests_integration::test_util::{
|
||||
StorageType, setup_grpc_server, setup_grpc_server_with, setup_grpc_server_with_user_provider,
|
||||
StorageType, setup_grpc_server, setup_grpc_server_with,
|
||||
setup_grpc_server_with_auto_create_table_disabled, setup_grpc_server_with_user_provider,
|
||||
};
|
||||
use tonic::Request;
|
||||
use tonic::metadata::MetadataValue;
|
||||
@@ -82,6 +83,7 @@ macro_rules! grpc_tests {
|
||||
test_invalid_dbname,
|
||||
test_auto_create_table,
|
||||
test_auto_create_table_with_hints,
|
||||
test_auto_create_table_disabled_by_config,
|
||||
test_otel_arrow_auth,
|
||||
test_insert_and_select,
|
||||
test_dbname,
|
||||
@@ -405,6 +407,81 @@ pub async fn test_auto_create_table_with_hints(store_type: StorageType) {
|
||||
let _ = fe_grpc_server.shutdown().await;
|
||||
}
|
||||
|
||||
/// When the frontend global switch disables auto table creation, a write to a
|
||||
/// missing table must fail even if the request sets `auto_create_table=true`,
|
||||
/// proving the global config is an upper bound that hints cannot bypass.
|
||||
pub async fn test_auto_create_table_disabled_by_config(store_type: StorageType) {
|
||||
let (_db, fe_grpc_server) = setup_grpc_server_with_auto_create_table_disabled(
|
||||
store_type,
|
||||
"test_auto_create_table_disabled_by_config",
|
||||
)
|
||||
.await;
|
||||
let addr = fe_grpc_server.bind_addr().unwrap().to_string();
|
||||
|
||||
let grpc_client = Client::with_urls(vec![addr]);
|
||||
let db = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, grpc_client);
|
||||
|
||||
// Plain row insert to a missing table: must fail even with `auto_create_table=true`.
|
||||
let (host, cpu, mem, ts) = expect_data();
|
||||
let request = InsertRequest {
|
||||
table_name: "demo".to_string(),
|
||||
columns: vec![host, cpu, mem, ts],
|
||||
row_count: 4,
|
||||
};
|
||||
let result = db
|
||||
.insert_with_hints(
|
||||
InsertRequests {
|
||||
inserts: vec![request],
|
||||
},
|
||||
&[("auto_create_table", "true")],
|
||||
)
|
||||
.await;
|
||||
let err = result.unwrap_err().to_string();
|
||||
assert!(
|
||||
err.contains("does not exist") && err.contains("disabled by frontend config"),
|
||||
"unexpected error: {err}"
|
||||
);
|
||||
|
||||
// Metric path (via `physical_table` hint): must also fail without leaking the physical table.
|
||||
let (host, cpu, mem, ts) = expect_data();
|
||||
let request = InsertRequest {
|
||||
table_name: "demo_metric".to_string(),
|
||||
columns: vec![host, cpu, mem, ts],
|
||||
row_count: 4,
|
||||
};
|
||||
let result = db
|
||||
.insert_with_hints(
|
||||
InsertRequests {
|
||||
inserts: vec![request],
|
||||
},
|
||||
&[
|
||||
("auto_create_table", "true"),
|
||||
("physical_table", "greptime_physical_table"),
|
||||
],
|
||||
)
|
||||
.await;
|
||||
let err = result.unwrap_err().to_string();
|
||||
assert!(
|
||||
err.contains("does not exist") && err.contains("disabled by frontend config"),
|
||||
"unexpected error: {err}"
|
||||
);
|
||||
|
||||
// The physical table must not have been created before the failure.
|
||||
let output = db.sql("SHOW TABLES").await.unwrap();
|
||||
let record_batches = match output.data {
|
||||
OutputData::RecordBatches(record_batches) => record_batches,
|
||||
OutputData::Stream(stream) => RecordBatches::try_collect(stream).await.unwrap(),
|
||||
OutputData::AffectedRows(_) => unreachable!(),
|
||||
};
|
||||
let tables = record_batches.pretty_print().unwrap();
|
||||
assert!(
|
||||
!tables.contains("greptime_physical_table"),
|
||||
"physical table leaked despite disabled auto-create:\n{tables}"
|
||||
);
|
||||
|
||||
let _ = fe_grpc_server.shutdown().await;
|
||||
}
|
||||
|
||||
fn expect_data() -> (Column, Column, Column, Column) {
|
||||
// testing data:
|
||||
let expected_host_col = Column {
|
||||
|
||||
@@ -71,6 +71,7 @@ use tests_integration::test_util::{
|
||||
StorageType, setup_test_http_app, setup_test_http_app_with_frontend,
|
||||
setup_test_http_app_with_frontend_and_slow_query_threshold,
|
||||
setup_test_http_app_with_frontend_and_user_provider, setup_test_prom_app_with_frontend,
|
||||
setup_test_prom_app_with_frontend_batched,
|
||||
};
|
||||
use urlencoding::encode;
|
||||
use yaml_rust::YamlLoader;
|
||||
@@ -117,6 +118,7 @@ macro_rules! http_tests {
|
||||
test_dashboard_path,
|
||||
test_dashboard_api,
|
||||
test_prometheus_remote_write,
|
||||
test_prometheus_remote_write_batched,
|
||||
test_prometheus_remote_special_labels,
|
||||
test_prometheus_remote_schema_labels,
|
||||
test_prometheus_remote_write_with_pipeline,
|
||||
@@ -1491,6 +1493,7 @@ mem_threshold_on_create = "auto"
|
||||
let expected_toml_str = format!(
|
||||
r#"
|
||||
enable_telemetry = true
|
||||
auto_create_table = true
|
||||
max_in_flight_write_bytes = "0KiB"
|
||||
write_bytes_exhausted_policy = "wait"
|
||||
init_regions_in_background = false
|
||||
@@ -1601,6 +1604,7 @@ experimental_grpc_max_retries = 3
|
||||
experimental_frontend_scan_timeout = "30s"
|
||||
experimental_max_filter_num_per_query = 20
|
||||
experimental_time_window_merge_threshold = 3
|
||||
experimental_enable_incremental_read = false
|
||||
read_preference = "Leader"
|
||||
|
||||
[logging]
|
||||
@@ -1954,6 +1958,18 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) {
|
||||
)
|
||||
.await;
|
||||
|
||||
// Prom RW tables carry the metric identity; type is inferred from naming.
|
||||
validate_data(
|
||||
"prometheus_remote_write_semantic_identity",
|
||||
&client,
|
||||
"select count(*) from information_schema.tables where table_name = 'metric2' \
|
||||
and create_options like '%greptime.semantic.signal_type=metric%' \
|
||||
and create_options like '%greptime.semantic.source=prometheus%' \
|
||||
and create_options like '%greptime.semantic.metric.metadata_quality=inferred%';",
|
||||
"[[1]]",
|
||||
)
|
||||
.await;
|
||||
|
||||
// Write snappy encoded data with new labels
|
||||
let write_request = WriteRequest {
|
||||
timeseries: mock_timeseries_new_label(),
|
||||
@@ -1975,6 +1991,48 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) {
|
||||
guard.remove_all().await;
|
||||
}
|
||||
|
||||
/// Covers the batched (pending-rows-batcher) Prometheus remote write path, which
|
||||
/// bypasses `PromStoreProtocolHandler::write`. Verifies the metric table is created
|
||||
/// asynchronously and still carries the Prometheus semantic identity stamped on the
|
||||
/// shared request context.
|
||||
pub async fn test_prometheus_remote_write_batched(store_type: StorageType) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let (app, mut guard) =
|
||||
setup_test_prom_app_with_frontend_batched(store_type, "prometheus_remote_write_batched")
|
||||
.await;
|
||||
let client = TestClient::new(app).await;
|
||||
|
||||
let write_request = WriteRequest {
|
||||
timeseries: prom_store::mock_timeseries(),
|
||||
..Default::default()
|
||||
};
|
||||
let serialized_request = write_request.encode_to_vec();
|
||||
let compressed_request =
|
||||
prom_store::snappy_compress(&serialized_request).expect("failed to encode snappy");
|
||||
|
||||
let res = client
|
||||
.post("/v1/prometheus/write")
|
||||
.header("Content-Encoding", "snappy")
|
||||
.body(compressed_request)
|
||||
.send()
|
||||
.await;
|
||||
assert_eq!(res.status(), StatusCode::NO_CONTENT);
|
||||
|
||||
// The batcher flushes asynchronously, so poll until the table exists and carries
|
||||
// the semantic identity (signal_type/source/metadata_quality).
|
||||
wait_for_data(
|
||||
&client,
|
||||
"select count(*) from information_schema.tables where table_name = 'metric2' \
|
||||
and create_options like '%greptime.semantic.signal_type=metric%' \
|
||||
and create_options like '%greptime.semantic.source=prometheus%' \
|
||||
and create_options like '%greptime.semantic.metric.metadata_quality=inferred%'",
|
||||
"[[1]]",
|
||||
)
|
||||
.await;
|
||||
|
||||
guard.remove_all().await;
|
||||
}
|
||||
|
||||
pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let (app, mut guard) =
|
||||
@@ -2023,7 +2081,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
|
||||
expected,
|
||||
)
|
||||
.await;
|
||||
let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f1'\\n)\"]]";
|
||||
let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.metric.metadata_quality' = 'inferred',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'prometheus',\\n on_physical_table = 'f1'\\n)\"]]";
|
||||
validate_data(
|
||||
"test_prometheus_remote_special_labels_idc3_show_create_table",
|
||||
&client,
|
||||
@@ -2049,7 +2107,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
|
||||
expected,
|
||||
)
|
||||
.await;
|
||||
let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f2'\\n)\"]]";
|
||||
let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.metric.metadata_quality' = 'inferred',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'prometheus',\\n on_physical_table = 'f2'\\n)\"]]";
|
||||
validate_data(
|
||||
"test_prometheus_remote_special_labels_idc4_show_create_table",
|
||||
&client,
|
||||
@@ -5025,6 +5083,28 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\"],[\"claude_code_token_usage_tokens_total\"],[\"demo\"],[\"greptime_physical_table\"],[\"numbers\"]]";
|
||||
validate_data("otlp_metrics_all_tables", &client, "show tables;", expected).await;
|
||||
|
||||
// Metric-engine logical table carries the semantic identity. Match substrings
|
||||
// because extra_options ordering is not stable.
|
||||
validate_data(
|
||||
"otlp_metrics_semantic_identity",
|
||||
&client,
|
||||
"select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \
|
||||
and create_options like '%greptime.semantic.signal_type=metric%' \
|
||||
and create_options like '%greptime.semantic.source=opentelemetry%';",
|
||||
"[[1]]",
|
||||
)
|
||||
.await;
|
||||
// OTLP metric type is declared, so Phase 1 must not stamp `metadata_quality`
|
||||
// here (Phase 2 adds it as `declared`).
|
||||
validate_data(
|
||||
"otlp_metrics_no_metadata_quality",
|
||||
&client,
|
||||
"select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \
|
||||
and create_options like '%metadata_quality%';",
|
||||
"[[0]]",
|
||||
)
|
||||
.await;
|
||||
|
||||
// CREATE TABLE IF NOT EXISTS "claude_code_cost_usage_USD_total" (
|
||||
// "greptime_timestamp" TIMESTAMP(3) NOT NULL,
|
||||
// "greptime_value" DOUBLE NULL,
|
||||
@@ -5049,7 +5129,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
|
||||
// on_physical_table = 'greptime_physical_table',
|
||||
// otlp_metric_compat = 'prom'
|
||||
// )
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
|
||||
validate_data(
|
||||
"otlp_metrics_all_show_create_table",
|
||||
&client,
|
||||
@@ -5122,7 +5202,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
|
||||
// on_physical_table = 'greptime_physical_table',
|
||||
// otlp_metric_compat = 'prom'
|
||||
// )
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
|
||||
validate_data(
|
||||
"otlp_metrics_show_create_table",
|
||||
&client,
|
||||
@@ -5186,7 +5266,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
|
||||
// on_physical_table = 'greptime_physical_table',
|
||||
// otlp_metric_compat = 'prom'
|
||||
// )
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
|
||||
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
|
||||
validate_data(
|
||||
"otlp_metrics_show_create_table_none",
|
||||
&client,
|
||||
@@ -5493,7 +5573,22 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
|
||||
let expected = r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#;
|
||||
validate_data("otlp_traces", &client, "select * from mytable;", expected).await;
|
||||
|
||||
let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
// The trace v1 main table carries the trace identity (events/links preserved as
|
||||
// JSON columns by the v1 model).
|
||||
validate_data(
|
||||
"otlp_traces_semantic_identity",
|
||||
&client,
|
||||
"select count(*) from information_schema.tables where table_name = 'mytable' \
|
||||
and create_options like '%greptime.semantic.signal_type=trace%' \
|
||||
and create_options like '%greptime.semantic.source=opentelemetry%' \
|
||||
and create_options like '%greptime.semantic.pipeline=greptime_trace_v1%' \
|
||||
and create_options like '%greptime.semantic.trace.has_events=true%' \
|
||||
and create_options like '%greptime.semantic.trace.has_links=true%';",
|
||||
"[[1]]",
|
||||
)
|
||||
.await;
|
||||
|
||||
let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 
'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
validate_data(
|
||||
"otlp_traces",
|
||||
&client,
|
||||
@@ -5546,7 +5641,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
|
||||
)
|
||||
.await;
|
||||
assert_eq!(StatusCode::OK, res.status());
|
||||
let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
validate_data(
|
||||
"otlp_traces",
|
||||
&client,
|
||||
@@ -5583,7 +5678,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
|
||||
)
|
||||
.await;
|
||||
assert_eq!(StatusCode::OK, res.status());
|
||||
let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '4',\n trace_id >= '4' AND trace_id < '8',\n trace_id >= '8' AND trace_id < 'c',\n trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '4',\n trace_id >= '4' AND trace_id < '8',\n trace_id >= '8' AND trace_id < 'c',\n trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
validate_data(
|
||||
"otlp_traces",
|
||||
&client,
|
||||
@@ -5620,7 +5715,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
|
||||
)
|
||||
.await;
|
||||
assert_eq!(StatusCode::OK, res.status());
|
||||
let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '08',\n trace_id >= '08' AND trace_id < '10',\n trace_id >= '10' AND trace_id < '18',\n trace_id >= '18' AND trace_id < '20',\n trace_id >= '20' AND trace_id < '28',\n trace_id >= '28' AND trace_id < '30',\n trace_id >= '30' AND trace_id < '38',\n trace_id >= '38' AND trace_id < '40',\n trace_id >= '40' AND trace_id < '48',\n trace_id >= '48' AND trace_id < '50',\n trace_id >= '50' AND trace_id < '58',\n trace_id >= '58' AND trace_id < '60',\n trace_id >= '60' AND trace_id < '68',\n trace_id >= '68' AND trace_id < '70',\n trace_id >= '70' AND trace_id < '78',\n trace_id >= '78' AND trace_id < '80',\n trace_id >= '80' AND trace_id < '88',\n trace_id >= '88' AND trace_id < '90',\n trace_id >= '90' AND trace_id < '98',\n trace_id >= '98' AND trace_id < 'a0',\n trace_id >= 'a0' AND trace_id < 'a8',\n trace_id >= 'a8' AND trace_id < 'b0',\n trace_id >= 'b0' AND trace_id < 'b8',\n trace_id 
>= 'b8' AND trace_id < 'c0',\n trace_id >= 'c0' AND trace_id < 'c8',\n trace_id >= 'c8' AND trace_id < 'd0',\n trace_id >= 'd0' AND trace_id < 'd8',\n trace_id >= 'd8' AND trace_id < 'e0',\n trace_id >= 'e0' AND trace_id < 'e8',\n trace_id >= 'e8' AND trace_id < 'f0',\n trace_id >= 'f0' AND trace_id < 'f8',\n trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '08',\n trace_id >= '08' AND trace_id < '10',\n trace_id >= '10' AND trace_id < '18',\n trace_id >= '18' AND trace_id < '20',\n trace_id >= '20' AND trace_id < '28',\n trace_id >= '28' AND trace_id < '30',\n trace_id >= '30' AND trace_id < '38',\n trace_id >= '38' AND trace_id < '40',\n trace_id >= '40' AND trace_id < '48',\n trace_id >= '48' AND trace_id < '50',\n trace_id >= '50' AND trace_id < '58',\n trace_id >= '58' AND trace_id < '60',\n trace_id >= '60' AND trace_id < '68',\n trace_id >= '68' AND trace_id < '70',\n trace_id >= '70' AND trace_id < '78',\n trace_id >= '78' AND trace_id < '80',\n trace_id >= '80' AND trace_id < '88',\n trace_id >= '88' AND trace_id < '90',\n trace_id >= '90' AND trace_id < '98',\n trace_id >= '98' AND trace_id < 'a0',\n trace_id >= 'a0' AND trace_id < 'a8',\n trace_id >= 'a8' AND trace_id < 'b0',\n trace_id >= 'b0' AND trace_id < 'b8',\n trace_id 
>= 'b8' AND trace_id < 'c0',\n trace_id >= 'c0' AND trace_id < 'c8',\n trace_id >= 'c8' AND trace_id < 'd0',\n trace_id >= 'd0' AND trace_id < 'd8',\n trace_id >= 'd8' AND trace_id < 'e0',\n trace_id >= 'e0' AND trace_id < 'e8',\n trace_id >= 'e8' AND trace_id < 'f0',\n trace_id >= 'f0' AND trace_id < 'f8',\n trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
|
||||
validate_data(
|
||||
"otlp_traces",
|
||||
&client,
|
||||
@@ -6283,6 +6378,17 @@ pub async fn test_otlp_logs(store_type: StorageType) {
|
||||
expected,
|
||||
)
|
||||
.await;
|
||||
|
||||
// The auto-created log table carries the log identity.
|
||||
validate_data(
|
||||
"otlp_logs_semantic_identity",
|
||||
&client,
|
||||
"select count(*) from information_schema.tables where table_name = 'opentelemetry_logs' \
|
||||
and create_options like '%greptime.semantic.signal_type=log%' \
|
||||
and create_options like '%greptime.semantic.source=opentelemetry%';",
|
||||
"[[1]]",
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
{
|
||||
@@ -7718,7 +7824,7 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) {
|
||||
.await;
|
||||
assert_eq!(StatusCode::OK, res.status());
|
||||
|
||||
let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]";
|
||||
let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\\n 
'greptime.semantic.signal_type' = 'trace',\\n 'greptime.semantic.source' = 'opentelemetry',\\n 'greptime.semantic.trace.conventions' = 'unknown',\\n 'greptime.semantic.trace.has_events' = 'true',\\n 'greptime.semantic.trace.has_links' = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]";
|
||||
validate_data(
|
||||
"trace_v1_create_table",
|
||||
&client,
|
||||
|
||||
@@ -1,3 +1,31 @@
|
||||
-- Incremental aggregate reads only support append-only source tables because
|
||||
-- update/upsert sources need old-value compensation.
|
||||
CREATE TABLE incremental_non_append_input (
|
||||
host_id INT,
|
||||
n INT,
|
||||
ts TIMESTAMP TIME INDEX,
|
||||
PRIMARY KEY(host_id)
|
||||
);
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
|
||||
FROM
|
||||
incremental_non_append_input
|
||||
GROUP BY
|
||||
time_window;
|
||||
|
||||
Error: 3001(EngineExecuteQuery), Unsupported: Flow incremental read requires append-only source table, but source table `greptime.public.incremental_non_append_input` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read
|
||||
|
||||
DROP TABLE incremental_non_append_input;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE TABLE incremental_aggr_input (
|
||||
host_id INT,
|
||||
n INT,
|
||||
@@ -9,7 +37,9 @@ CREATE TABLE incremental_aggr_input (
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
|
||||
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
min(n) AS min_n,
|
||||
|
||||
@@ -1,3 +1,25 @@
|
||||
-- Incremental aggregate reads only support append-only source tables because
|
||||
-- update/upsert sources need old-value compensation.
|
||||
CREATE TABLE incremental_non_append_input (
|
||||
host_id INT,
|
||||
n INT,
|
||||
ts TIMESTAMP TIME INDEX,
|
||||
PRIMARY KEY(host_id)
|
||||
);
|
||||
|
||||
CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
|
||||
FROM
|
||||
incremental_non_append_input
|
||||
GROUP BY
|
||||
time_window;
|
||||
|
||||
DROP TABLE incremental_non_append_input;
|
||||
|
||||
CREATE TABLE incremental_aggr_input (
|
||||
host_id INT,
|
||||
n INT,
|
||||
@@ -7,7 +29,9 @@ CREATE TABLE incremental_aggr_input (
|
||||
append_mode = 'true'
|
||||
);
|
||||
|
||||
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
|
||||
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
min(n) AS min_n,
|
||||
|
||||
@@ -12,7 +12,9 @@ CREATE TABLE flow_incr_memtable_input (
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
|
||||
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
min(n) AS min_n,
|
||||
|
||||
@@ -10,7 +10,9 @@ CREATE TABLE flow_incr_memtable_input (
|
||||
append_mode = 'true'
|
||||
);
|
||||
|
||||
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
|
||||
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
min(n) AS min_n,
|
||||
|
||||
@@ -17,7 +17,9 @@ WITH (
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
|
||||
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
min(n) AS min_n,
|
||||
|
||||
@@ -15,7 +15,9 @@ WITH (
|
||||
append_mode = 'true'
|
||||
);
|
||||
|
||||
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
|
||||
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink
|
||||
WITH (experimental_enable_incremental_read = 'true')
|
||||
AS
|
||||
SELECT
|
||||
sum(n) AS total,
|
||||
min(n) AS min_n,
|
||||
|
||||
@@ -476,7 +476,7 @@ SINK TO out_num_cnt_show
|
||||
WITH (access_key_id = [true])
|
||||
AS SELECT number AS n1 FROM numbers_input_show where number > 10;
|
||||
|
||||
Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source
|
||||
Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source, experimental_enable_incremental_read
|
||||
|
||||
DROP FLOW filter_numbers_show;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user