Merge branch 'main' into feat/flush-hook-extension-point

This commit is contained in:
Ning Sun
2026-06-01 22:12:09 -07:00
committed by GitHub
85 changed files with 2690 additions and 1760 deletions

92
Cargo.lock generated
View File

@@ -2278,6 +2278,7 @@ dependencies = [
"futures",
"lazy_static",
"object-store",
"object_store_opendal",
"orc-rust",
"parquet",
"paste",
@@ -5102,6 +5103,7 @@ dependencies = [
"datatypes",
"futures",
"object-store",
"object_store_opendal",
"serde",
"serde_json",
"snafu 0.8.6",
@@ -8320,6 +8322,7 @@ dependencies = [
"datafusion-common",
"datafusion-expr",
"datatypes",
"derive_more",
"dotenv",
"either",
"futures",
@@ -9074,8 +9077,9 @@ dependencies = [
[[package]]
name = "object_store_opendal"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eb12a624a41fce745838d0ef3701ff6c47797c13cd18ad3612fd2a3134fdbd8"
dependencies = [
"async-trait",
"bytes",
@@ -9162,8 +9166,9 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
[[package]]
name = "opendal"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96c9c85ce253ff87225e7669979d877a20c98a06604ec9d6dd5f4473e08f1ae1"
dependencies = [
"ctor",
"opendal-core",
@@ -9183,8 +9188,9 @@ dependencies = [
[[package]]
name = "opendal-core"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4f8607c90e2c963a91467f50fb49fbc7fb3d573f88cea219ca59ccd3740b309"
dependencies = [
"anyhow",
"base64 0.22.1",
@@ -9210,8 +9216,9 @@ dependencies = [
[[package]]
name = "opendal-layer-concurrent-limit"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d6f81ba6960e3fae1882f253b114b21d7e444e1534f209c7737a79f6243eb6f"
dependencies = [
"futures",
"http 1.3.1",
@@ -9221,8 +9228,9 @@ dependencies = [
[[package]]
name = "opendal-layer-logging"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58ada45c6d81d1aa4c9305d0c7d4bc317c59c85866a0908a2d75a7a978aa5ee2"
dependencies = [
"log",
"opendal-core",
@@ -9230,8 +9238,9 @@ dependencies = [
[[package]]
name = "opendal-layer-observe-metrics-common"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "628b0228fdbd13c3d9d50eee4341f2eb82ca5b44991e4c68f07c84cc823e2d12"
dependencies = [
"futures",
"http 1.3.1",
@@ -9240,8 +9249,9 @@ dependencies = [
[[package]]
name = "opendal-layer-prometheus"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0487bdb1357097ec8654781bad03ef310282517738e2864ebde69e27aaafc5ec"
dependencies = [
"opendal-core",
"opendal-layer-observe-metrics-common",
@@ -9250,8 +9260,9 @@ dependencies = [
[[package]]
name = "opendal-layer-retry"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b2a25a718afb81fad81cb9a0580a1cb989221fa2317f888c6a37f8dad408eb7"
dependencies = [
"backon",
"log",
@@ -9260,8 +9271,9 @@ dependencies = [
[[package]]
name = "opendal-layer-timeout"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e91f731724c213af81e9d03517859c8fc47b4578e64ad61ae4f099f10fe36e3"
dependencies = [
"opendal-core",
"tokio",
@@ -9269,8 +9281,9 @@ dependencies = [
[[package]]
name = "opendal-layer-tracing"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90c6fc9df6da1f0dafbdf55fa48525f1643aefbe7da8f46936e869e2a5b8a34f"
dependencies = [
"futures",
"http 1.3.1",
@@ -9280,8 +9293,9 @@ dependencies = [
[[package]]
name = "opendal-service-azblob"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0030644366ef5d8cbe3a4a5822bf99a4aafddc1666e9d24b44d158d9062fc76a"
dependencies = [
"base64 0.22.1",
"bytes",
@@ -9300,8 +9314,9 @@ dependencies = [
[[package]]
name = "opendal-service-azure-common"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b489f13c42e69d69bdd72952b634356ec43a7881a20259b38b540fcecdf4051"
dependencies = [
"http 1.3.1",
"opendal-core",
@@ -9309,8 +9324,9 @@ dependencies = [
[[package]]
name = "opendal-service-fs"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22e89a665fef0e6bd249cf5ea47fc174b7ba892159bee4b9382528b1ca873a2c"
dependencies = [
"bytes",
"log",
@@ -9322,8 +9338,9 @@ dependencies = [
[[package]]
name = "opendal-service-gcs"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48de101aac565ed06af4b47903c24eafd249075553ec1fb18256751c45148d47"
dependencies = [
"async-trait",
"bytes",
@@ -9342,8 +9359,9 @@ dependencies = [
[[package]]
name = "opendal-service-http"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb6af628a0bf14075b957179444927e1df40dc7addef382b585a05ef015a077b"
dependencies = [
"http 1.3.1",
"log",
@@ -9353,8 +9371,9 @@ dependencies = [
[[package]]
name = "opendal-service-oss"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "328fa55e8888cbdfe00826bfea2a79042422b720e8369e9e021e46121dea5ace"
dependencies = [
"bytes",
"http 1.3.1",
@@ -9369,8 +9388,9 @@ dependencies = [
[[package]]
name = "opendal-service-s3"
version = "0.56.0"
source = "git+https://github.com/apache/opendal.git?rev=4ad2d85296ffa6fdc2882f97d3c760ee243913f7#4ad2d85296ffa6fdc2882f97d3c760ee243913f7"
version = "0.57.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "313d46c9f5ae70bca26b7c3e3fbb9b639292625f28af73aa016f47e788af9deb"
dependencies = [
"base64 0.22.1",
"bytes",
@@ -14102,9 +14122,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "tar"
version = "0.4.45"
version = "0.4.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840"
dependencies = [
"filetime",
"libc",

View File

@@ -178,7 +178,7 @@ nalgebra = "0.33"
nix = { version = "0.30.1", default-features = false, features = ["event", "fs", "process"] }
notify = "8.0"
num_cpus = "1.16"
object_store_opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7" }
object_store_opendal = "0.57"
once_cell = "1.18"
opentelemetry-proto = { version = "0.31", features = [
"gen-tonic",

View File

@@ -14,6 +14,7 @@
| --- | -----| ------- | ----------- |
| `default_timezone` | String | Unset | The default timezone of the server. |
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.<br/>When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. |
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
@@ -230,6 +231,7 @@
| --- | -----| ------- | ----------- |
| `default_timezone` | String | Unset | The default timezone of the server. |
| `default_column_prefix` | String | Unset | The default column prefix for auto-created time index and value columns. |
| `auto_create_table` | Bool | `true` | Server-side global switch for auto table creation on write.<br/>When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`. |
| `user_provider` | String | Unset | The user provider for authentication.<br/>Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd" |
| `max_in_flight_write_bytes` | String | Unset | Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).<br/>Set to 0 to disable the limit. Default: "0" (unlimited) |
| `write_bytes_exhausted_policy` | String | Unset | Policy when write bytes quota is exhausted.<br/>Options: "wait" (default, 10s timeout), "wait(<duration>)" (e.g., "wait(30s)"), "fail" |
@@ -628,6 +630,7 @@
| `flow.batching_mode.experimental_frontend_scan_timeout` | String | `30s` | Flow wait for available frontend timeout,<br/>if failed to find available frontend after frontend_scan_timeout elapsed, return error<br/>which prevent flownode from starting |
| `flow.batching_mode.experimental_max_filter_num_per_query` | Integer | `20` | Maximum number of filters allowed in a single query |
| `flow.batching_mode.experimental_time_window_merge_threshold` | Integer | `3` | Time window merge distance |
| `flow.batching_mode.experimental_enable_incremental_read` | Bool | `false` | Whether to enable experimental flow incremental source reads.<br/>When disabled, batching flows always execute full-snapshot queries.<br/>Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true'). |
| `flow.batching_mode.read_preference` | String | `Leader` | Read preference of the Frontend client. |
| `flow.batching_mode.frontend_tls` | -- | -- | -- |
| `flow.batching_mode.frontend_tls.enabled` | Bool | `false` | Whether to enable TLS for client. |

View File

@@ -31,6 +31,10 @@ node_id = 14
#+experimental_max_filter_num_per_query=20
## Time window merge distance
#+experimental_time_window_merge_threshold=3
## Whether to enable experimental flow incremental source reads.
## When disabled, batching flows always execute full-snapshot queries.
## Can be overridden per flow with WITH (experimental_enable_incremental_read = 'true').
#+experimental_enable_incremental_read=false
## Read preference of the Frontend client.
#+read_preference="Leader"
[flow.batching_mode.frontend_tls]

View File

@@ -6,6 +6,10 @@ default_timezone = "UTC"
## @toml2docs:none-default
default_column_prefix = "greptime"
## Server-side global switch for auto table creation on write.
## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`.
#+ auto_create_table = true
## The user provider for authentication.
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
## @toml2docs:none-default

View File

@@ -6,6 +6,10 @@ default_timezone = "UTC"
## @toml2docs:none-default
default_column_prefix = "greptime"
## Server-side global switch for auto table creation on write.
## When `false`, a missing table is never auto-created even if the request sets the `auto_create_table` hint to `true`. Default: `true`.
#+ auto_create_table = true
## The user provider for authentication.
## Examples: "static_user_provider:file:/path/to/users", "static_user_provider:cmd:greptime_user=greptime_pwd"
## @toml2docs:none-default

View File

@@ -0,0 +1,157 @@
---
Feature Name: Table Semantic Layer
Tracking Issue: TBD
Date: 2026-05-28
Author: "Dennis Zhuang <killme2008@gmail.com>"
---
# Summary
Attach a thin layer of semantic metadata to each table so machine consumers — LLM agents, alert generators, dashboard builders, MCP servers, ETL pipelines — can align it with the observability concepts they already know (OTel instrument kinds, Prometheus naming conventions, UCUM units, semantic conventions, severity numbers, OTel ↔ Prometheus translation rules).
The mechanism reuses what already exists in `table_options` (the same slot that today carries `table_data_model` and `otlp_metric_compat`): a reserved `greptime.semantic.*` namespace, plus standard SQL column `COMMENT` for field-level supplements, plus an `information_schema.semantic_tables` view as the discovery entry point. No new protocol, no new DDL keyword.
Per-table identity only. Cross-table relationships are deferred.
# Motivation
GreptimeDB already ingests OTLP metrics / traces / logs and Prometheus remote write. Each protocol carries rich metadata on the wire (instrument kind, temporality, unit, scope, resource, semantic-conventions version), and most of it is dropped when rows land in a table:
- An `opentelemetry_traces` table looks like any wide table; signal type, source, and field provenance must be guessed from naming.
- The OTel-to-Prometheus translation in v0.16+ actively drops scope attributes and most resource attributes; the table never records *what was dropped*.
- Prometheus remote write v1 metadata is unreliable by protocol, but downstream tables do not flag whether `counter` typing was *declared* or *inferred* from the `_total` suffix.
- Mixed-temporality data (OTel delta + Prometheus cumulative in the same table) is unrecoverable from schema alone.
The audience is broader than LLM agents. Alert generators need to choose between `rate()` and absolute thresholds, and need units to pick sensible bounds. Dashboard builders pick visualisations by signal type. MCP servers surface a structured tool catalog instead of free-text descriptions. ETL pipelines need lineage to know whether a `service_name` column is `resource.service.name` or a free-form label. All of them currently guess from column names. The metadata that would remove the guesswork already exists at ingest time; we just do not preserve it.
# Goals
1. Tag every ingested table with a stable identity using existing SQL surfaces — no new protocol, no new DDL keyword.
2. Record the lossy transformations the ingestion path performs (dropped attributes, scope handling, type inference vs. declaration).
3. Expose one `information_schema` view as the consumer-facing discovery entry point.
4. Keep the layer optional and additive — tables without these options keep working unchanged.
# Non-Goals
- Cross-table relationship modelling. Deferred to a follow-up RFC.
- Bespoke storage. Reuse `table_options` and column `COMMENT`.
- Semantic enforcement at query time. The layer is descriptive, not coercive.
- New wire protocol. Upstream standardisation is mentioned only as a future direction.
# Proposal
## Three mechanisms
1. **`greptime.semantic.*` table options** — table-level identity and lineage. Carried inside the existing `table_options` blob. This is the same slot that today carries `table_data_model = 'greptime_trace_v1'` and `otlp_metric_compat = 'prom'`, so the mechanism is generalising what the OTLP trace auto-create path already does.
2. **Column `COMMENT`** — column-level supplements ("this column is `resource.service.name`"; "this column carries delta values"). Standard SQL.
3. **`information_schema.semantic_tables` view** — a denormalised projection of the options, registered through the existing `with_extra_table_factories()` hook. Tables without a `greptime.semantic.*` option do not appear in the view.
## Vocabulary
All keys are flat strings under the `greptime.semantic.` prefix; values are strings; unknown keys are tolerated so the vocabulary can grow without coordinated rollouts.
**Common (all signals)**
| Key | Example |
| --- | --- |
| `greptime.semantic.signal_type` | `trace` / `log` / `metric` / `event` |
| `greptime.semantic.source` | `opentelemetry` / `prometheus` / `elasticsearch` / `loki` / `custom` |
| `greptime.semantic.source_version` | Optional. Protocol or SDK version, e.g. `v2` (Prom remote write) or `1.30.0` |
| `greptime.semantic.pipeline` | `greptime_trace_v1` (subsumes the existing `table_data_model` value) |
**Trace**: `greptime.semantic.trace.conventions` (the version of the OpenTelemetry semantic conventions used in this table, lifted from `schema_url`, e.g. `otel-semconv-1.27`), `greptime.semantic.trace.has_events`, `greptime.semantic.trace.has_links`.
**Metric** — v1 assumes one metric type per table, which is how both Prom RW and the post-v0.16 OTel ingestion path land data today; mixed-type tables are a follow-up.
| Key | Example |
| --- | --- |
| `greptime.semantic.metric.type` | `counter` / `gauge` / `histogram` / `summary` / `updown_counter` / `gauge_histogram` / `info` / `stateset` |
| `greptime.semantic.metric.unit` | UCUM, e.g. `s`, `By`, `{request}` |
| `greptime.semantic.metric.temporality` | `cumulative` / `delta` (OTel only) |
| `greptime.semantic.metric.monotonic` | `true` / `false` |
| `greptime.semantic.metric.metadata_quality` | `declared` (OTLP / Prom RW v2 / exposition) or `inferred` (Prom RW v1, name-suffix guess) |
| `greptime.semantic.metric.original_name` | Pre-translation OTel name when the table name was Prometheus-ised |
`metadata_quality = inferred` is the load-bearing field for confidence-aware tooling: an inferred counter should be re-checked before betting on `rate()`-style semantics.
**Log**: `greptime.semantic.log.severity_scheme` (`otlp` / `syslog` / `custom`), `greptime.semantic.log.body_format` (`string` / `json` / `mixed`).
**Resource / scope preservation**: `greptime.semantic.resource.attributes_preserved` (JSON array string of attrs promoted to columns), `greptime.semantic.resource.attributes_dropped` (boolean), `greptime.semantic.scope.preserved` (boolean). These answer the most common downstream question: "is this data missing because it was dropped, or because it lives on a different column than I think?" List-shaped values use JSON array strings rather than comma-separated text to avoid escaping and ordering ambiguity.
## Conflict and update semantics
Two design decisions are worth pinning down up front, because they constrain everything else:
- **Conflict.** Some table-level keys (`trace.conventions` lifted from `schema_url`, `metric.temporality`, ...) cannot represent the truth when a long-lived table sees rows from multiple sources. v1 records `mixed` or `unknown` rather than a fictitious single value. Downstream consumers must treat any single-valued semantic key as best-effort, not strong evidence.
- **Update.** Semantic options are stamped at table creation. v1 does not specify an update path; promoting `metadata_quality` from `inferred` to `declared`, refreshing `resource.attributes_preserved`, or revising `trace.conventions` on later writes is deferred. If real usage shows update is needed, it lands as a separate RFC.
## `information_schema.semantic_tables`
A consumer's first SQL on connect:
```sql
SELECT table_catalog, table_schema, table_name, signal_type, source, pipeline
FROM information_schema.semantic_tables;
```
returns one row per semantic-tagged table. The view exposes a stable set of core columns (`table_catalog`, `table_schema`, `table_name`, `signal_type`, `source`, `source_version`, `pipeline`) plus a `semantic_options` JSON column carrying the rest of the `greptime.semantic.*` keys verbatim. Future keys appear inside `semantic_options` without forcing a view-schema change; only widely-used keys are ever promoted to first-class columns.
# Implementation Plan
Four phases, each independently shippable.
1. **Identity.** Stamp `signal_type` and `source` on every auto-create path. The OTLP paths already have natural injection points; Prom remote write is the one non-trivial path because metric-engine logical tables share physical storage (see Open Question 2).
2. **Metric specifics.** Add type / unit / temporality / monotonic / metadata_quality / original_name at OTel metric and Prom RW ingestion sites; the data is already at hand inside the OTel translator.
3. **Resource / scope lineage.** Record what the OTel-to-Prometheus translation kept and dropped.
4. **`information_schema.semantic_tables` view + documentation** as a stable user-facing contract.
# Relationship to OpenTelemetry standardisation
OTel today standardises what producers emit and how data collectors are managed; the read side — what a backend exposes back to clients — is deliberately vendor turf. OTLP is one-way; OpAMP is agent management; OTEP-0243 (App Telemetry Schema) is producer-side; `schema_url` is producer-stated, with no reverse (backend-to-client) direction. Adjacent precedents — Prometheus `/api/v1/metadata`, Loki labels API, Tempo tags, Jaeger services, ad-hoc MCP servers — are all vendor-specific.
This is a real gap. The shape we propose locally (signal-agnostic, `schema_url`-aware, structured around a small vocabulary) is deliberately close to what a future upstream OTEP for a backend-catalog read API could look like, with Weaver's *Resolved Telemetry Schema* as the natural data model. We do not commit to driving such an OTEP here; we do commit to keeping the local shape close enough that a future upstream proposal does not force a breaking migration.
# Alternatives
- **New DDL syntax (`SEMANTIC trace WITH (...)`).** Cleaner-looking but non-standard and forces every client to learn it. The metadata is not interesting enough to justify a new keyword.
- **Dedicated `_semantic` system table.** Doubles the storage path for what is static per-table KV and adds lifecycle questions (drop, backfill). A view over `table_options` covers the same access pattern.
- **Column comments only.** Discovery (`WHERE signal_type = 'trace'`) becomes a full-text problem. Comments are good for column-level supplements, not for identity.
- **Encode everything into the table name.** What we do today. Every new field becomes a new naming convention.
# Open Questions
1. **Namespace prefix.** `greptime.semantic.*` vs. bare `semantic.*`. v1 picks the vendored prefix; alias or migrate if a community standard later emerges.
2. **Prom RW injection point.** Metric-engine logical tables share physical storage, so per-logical-table options need a hook that does not exist as cleanly as the OTLP trace branch. A short spike is needed before Phase 1 lands for Prom RW.
3. **Mixed-type metric tables.** When ingestion modes that pack multiple metric types into one table appear, `metric.type` migrates from table-level to row-level. v1 leaves a `metric.type = 'mixed'` marker and punts.
4. **Stability surface.** Top-level keys (`signal_type`, `source`) are stable; sub-namespaces (`metric.*`, ...) are evolving until v1.0 of the layer is declared.
# Future Work
- **Cross-table relationships.** Paired trace/services tables, metric/info pairing, JOIN hints. Its own RFC.
- **Producer SDK/client identity.** An optional `greptime.semantic.source.sdk` key recording the emitting client (e.g. `opentelemetry-go`, `opentelemetry-java`, `opentelemetry-collector`). Because a single table can receive data from multiple SDKs (a shared trace table is the common case), mixed producers collapse to `mixed`, following the same conflict rule as the table-level keys above.
- **Backfill** for tables created before this feature shipped.
- **Upstream proposal.** Carry the shape into a community proposal — likely an OTEP for an OTLP-Catalog read API plus an MCP binding — informed by Greptime's local usage data.
# References
OpenTelemetry:
- [OTLP specification](https://opentelemetry.io/docs/specs/otlp/)
- [OTel Schemas (`schema_url`)](https://opentelemetry.io/docs/specs/otel/schemas/)
- [Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/)
- [OTEP-0243: App Telemetry Schema](https://github.com/open-telemetry/oteps/blob/main/text/0243-app-telemetry-schema-vision-roadmap.md)
- [OpAMP specification](https://github.com/open-telemetry/opamp-spec/blob/main/specification.md)
- [Weaver: Resolved Telemetry Schema](https://github.com/open-telemetry/weaver)
- [2025 Stability Proposal](https://opentelemetry.io/blog/2025/stability-proposal-announcement/)
Prometheus / OpenMetrics:
- [Prometheus Remote Write 1.0](https://prometheus.io/docs/specs/prw/remote_write_spec/)
- [Prometheus Remote Write 2.0](https://prometheus.io/docs/specs/prw/remote_write_spec_2_0/)
- [Prometheus exposition formats](https://prometheus.io/docs/instrumenting/exposition_formats/)
- [Prometheus HTTP API: `/api/v1/metadata`](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata)
Units and conventions:
- [UCUM — Unified Code for Units of Measure](https://ucum.org/)
GreptimeDB:
- [OTLP ingestion guide](https://docs.greptime.com/user-guide/ingest-data/for-observability/opentelemetry/)
- [Trace data model](https://docs.greptime.com/user-guide/traces/data-model/)

View File

@@ -14,7 +14,9 @@
use std::sync::Arc;
use common_meta::cache::{CacheContainer, Initializer, TableInfoCacheRef, TableNameCacheRef};
use common_meta::cache::{
CacheContainer, InitStrategy, Initializer, TableInfoCacheRef, TableNameCacheRef,
};
use common_meta::error::{Result as MetaResult, ValueNotExistSnafu};
use common_meta::instruction::CacheIdent;
use futures::future::BoxFuture;
@@ -38,7 +40,14 @@ pub fn new_table_cache(
) -> TableCache {
let init = init_factory(table_info_cache, table_name_cache);
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
CacheContainer::with_strategy(
name,
cache,
Box::new(invalidator),
init,
filter,
InitStrategy::VersionChecked,
)
}
fn init_factory(

View File

@@ -79,7 +79,7 @@ impl App for Instance {
}
async fn start(&mut self) -> Result<()> {
plugins::start_datanode_plugins(self.datanode.plugins())
plugins::start_datanode_plugins(&self.datanode)
.await
.context(StartDatanodeSnafu)?;

View File

@@ -90,7 +90,7 @@ impl App for Instance {
}
async fn start(&mut self) -> Result<()> {
plugins::start_flownode_plugins(self.flownode.flow_engine().plugins().clone())
plugins::start_flownode_plugins(&self.flownode)
.await
.context(StartFlownodeSnafu)?;

View File

@@ -95,8 +95,7 @@ impl App for Instance {
}
async fn start(&mut self) -> Result<()> {
let plugins = self.frontend.instance.plugins().clone();
plugins::start_frontend_plugins(plugins)
plugins::start_frontend_plugins(&self.frontend.instance)
.await
.context(error::StartFrontendSnafu)?;

View File

@@ -68,7 +68,7 @@ impl App for Instance {
}
async fn start(&mut self) -> Result<()> {
plugins::start_metasrv_plugins(self.instance.plugins())
plugins::start_metasrv_plugins(&self.instance)
.await
.context(StartMetaServerSnafu)?;

View File

@@ -164,7 +164,7 @@ impl App for Instance {
.start(self.leader_services_context.clone())
.await?;
plugins::start_frontend_plugins(self.frontend.instance.plugins().clone())
plugins::start_frontend_plugins(&self.frontend.instance)
.await
.context(error::StartFrontendSnafu)?;

View File

@@ -114,6 +114,7 @@ fn test_load_frontend_example_config() {
component: FrontendOptions {
default_timezone: Some("UTC".to_string()),
default_column_prefix: Some("greptime".to_string()),
auto_create_table: true,
meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3),
@@ -267,6 +268,7 @@ fn test_load_standalone_example_config() {
component: StandaloneOptions {
default_timezone: Some("UTC".to_string()),
default_column_prefix: Some("greptime".to_string()),
auto_create_table: true,
wal: DatanodeWalConfig::RaftEngine(RaftEngineConfig {
dir: Some(format!("{}/{}", DEFAULT_DATA_HOME, WAL_DIR)),
sync_period: Some(Duration::from_secs(10)),

View File

@@ -33,6 +33,7 @@ datatypes.workspace = true
futures.workspace = true
lazy_static.workspace = true
object-store.workspace = true
object_store_opendal.workspace = true
orc-rust = { version = "0.8", default-features = false, features = ["async"] }
parquet.workspace = true
paste.workspace = true

View File

@@ -316,7 +316,7 @@ pub async fn file_to_stream(
.with_file_compression_type(df_compression)
.build();
let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
let file_opener = config.file_source().create_file_opener(store, &config, 0)?;
let stream = FileStream::new(&config, 0, file_opener, &ExecutionPlanMetricsSet::new())?;

View File

@@ -44,7 +44,7 @@ struct Test<'a> {
impl Test<'_> {
async fn run(self, store: &ObjectStore) {
let store = Arc::new(object_store::compat::OpendalStore::new(store.clone()));
let store = Arc::new(object_store_opendal::OpendalStore::new(store.clone()));
let file_opener = self
.file_source
.create_file_opener(store, &self.config, 0)

View File

@@ -27,12 +27,14 @@ const ACCESS_KEY_ID: &str = "access_key_id";
const ACCESS_KEY_SECRET: &str = "access_key_secret";
const ROOT: &str = "root";
const ALLOW_ANONYMOUS: &str = "allow_anonymous";
const SKIP_SIGNATURE: &str = "skip_signature";
/// Check if the key is supported in OSS configuration.
pub fn is_supported_in_oss(key: &str) -> bool {
[
ROOT,
ALLOW_ANONYMOUS,
SKIP_SIGNATURE,
BUCKET,
ENDPOINT,
ACCESS_KEY_ID,
@@ -61,18 +63,23 @@ pub fn build_oss_backend(
builder = builder.access_key_secret(access_key_secret);
}
if let Some(allow_anonymous) = connection.get(ALLOW_ANONYMOUS) {
let allow = allow_anonymous.as_str().parse::<bool>().map_err(|e| {
if let Some((key, value)) = connection
.get(SKIP_SIGNATURE)
.map(|value| (SKIP_SIGNATURE, value))
.or_else(|| {
connection
.get(ALLOW_ANONYMOUS)
.map(|value| (ALLOW_ANONYMOUS, value))
})
{
let skip_signature = value.as_str().parse::<bool>().map_err(|e| {
error::InvalidConnectionSnafu {
msg: format!(
"failed to parse the option {}={}, {}",
ALLOW_ANONYMOUS, allow_anonymous, e
),
msg: format!("failed to parse the option {}={}, {}", key, value, e),
}
.build()
})?;
if allow {
builder = builder.allow_anonymous();
if skip_signature {
builder = builder.skip_signature();
}
}
@@ -93,6 +100,7 @@ mod tests {
fn test_is_supported_in_oss() {
assert!(is_supported_in_oss(ROOT));
assert!(is_supported_in_oss(ALLOW_ANONYMOUS));
assert!(is_supported_in_oss(SKIP_SIGNATURE));
assert!(is_supported_in_oss(BUCKET));
assert!(is_supported_in_oss(ENDPOINT));
assert!(is_supported_in_oss(ACCESS_KEY_ID));

View File

@@ -103,7 +103,7 @@ pub async fn setup_stream_to_json_test(origin_path: &str, threshold: impl Fn(usi
test_util::TEST_BATCH_SIZE,
schema.clone(),
FileCompressionType::UNCOMPRESSED,
Arc::new(object_store::compat::OpendalStore::new(store.clone())),
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
true,
);
@@ -157,7 +157,7 @@ pub async fn setup_stream_to_csv_test(
let csv_opener = csv_source
.create_file_opener(
Arc::new(object_store::compat::OpendalStore::new(store.clone())),
Arc::new(object_store_opendal::OpendalStore::new(store.clone())),
&config,
0,
)

View File

@@ -17,7 +17,7 @@ mod flow;
mod registry;
mod table;
pub use container::{CacheContainer, Initializer, Invalidator, TokenFilter};
pub use container::{CacheContainer, InitStrategy, Initializer, Invalidator, TokenFilter};
pub use flow::{TableFlownodeSetCache, TableFlownodeSetCacheRef, new_table_flownode_set_cache};
pub use registry::{
CacheRegistry, CacheRegistryBuilder, CacheRegistryRef, LayeredCacheRegistry,

View File

@@ -437,11 +437,13 @@ pub fn defer_on_missing_source(flow_task: &CreateFlowTask) -> Result<bool> {
pub fn validate_flow_options(flow_task: &CreateFlowTask) -> Result<()> {
for key in flow_task.flow_options.keys() {
match key.as_str() {
DEFER_ON_MISSING_SOURCE_KEY | FlowType::FLOW_TYPE_KEY => {}
DEFER_ON_MISSING_SOURCE_KEY
| FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY
| FlowType::FLOW_TYPE_KEY => {}
unknown => {
return UnexpectedSnafu {
err_msg: format!(
"Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}"
"Unknown flow option '{unknown}', supported user options: {DEFER_ON_MISSING_SOURCE_KEY}, {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}"
),
}
.fail();
@@ -487,6 +489,9 @@ pub enum FlowType {
Streaming,
}
/// Flow option key for the experimental incremental-read switch.
///
/// `validate_flow_options` accepts this key as a supported user option.
pub const FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY: &str =
"experimental_enable_incremental_read";
impl FlowType {
pub const BATCHING: &str = "batching";
pub const STREAMING: &str = "streaming";

View File

@@ -24,8 +24,9 @@ use table::table_name::TableName;
use crate::ddl::DdlContext;
use crate::ddl::create_flow::{
CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY, FlowType,
defer_on_missing_source,
CreateFlowData, CreateFlowProcedure, CreateFlowState, DEFER_ON_MISSING_SOURCE_KEY,
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType, defer_on_missing_source,
validate_flow_options,
};
use crate::ddl::test_util::create_table::test_create_table_task;
use crate::ddl::test_util::flownode_handler::NaiveFlownodeHandler;
@@ -275,6 +276,22 @@ fn test_defer_on_missing_source_invalid_value() {
);
}
#[test]
fn test_validate_flow_options_allows_incremental_read_option() {
    // Build a flow task without source tables and attach the experimental
    // incremental-read option to it.
    let sink_table = TableName::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "my_sink_table");
    let mut flow_task = test_create_flow_task("my_flow", vec![], sink_table, false);
    flow_task.flow_options.insert(
        FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
        "true".to_string(),
    );

    // Validation must accept the option instead of reporting it as unknown.
    validate_flow_options(&flow_task).unwrap();
}
#[tokio::test]
async fn test_create_flow_rejects_unknown_option_in_meta_task() {
let mut task = test_create_flow_task(

View File

@@ -29,6 +29,7 @@ datafusion-expr.workspace = true
datatypes.workspace = true
futures.workspace = true
object-store.workspace = true
object_store_opendal.workspace = true
serde = { version = "1.0", features = ["derive"] }
serde_json.workspace = true
snafu.workspace = true

View File

@@ -61,7 +61,7 @@ fn build_record_batch_stream(
.with_file_group(FileGroup::new(files))
.build();
let store = Arc::new(object_store::compat::OpendalStore::new(
let store = Arc::new(object_store_opendal::OpendalStore::new(
scan_plan_config.store.clone(),
));

View File

@@ -23,7 +23,6 @@ use session::ReadPreference;
mod checkpoint;
pub(crate) mod engine;
pub(crate) mod frontend_client;
mod incremental_filter;
mod state;
mod table_creator;
mod task;
@@ -55,6 +54,10 @@ pub struct BatchingModeOptions {
pub experimental_max_filter_num_per_query: usize,
/// Time window merge distance
pub experimental_time_window_merge_threshold: usize,
/// Whether to enable experimental flow incremental source reads.
///
/// When disabled, batching flows always execute full-snapshot queries.
pub experimental_enable_incremental_read: bool,
/// Read preference of the Frontend client.
pub read_preference: ReadPreference,
/// TLS option for client connections to frontends.
@@ -72,6 +75,7 @@ impl Default for BatchingModeOptions {
experimental_frontend_scan_timeout: Duration::from_secs(30),
experimental_max_filter_num_per_query: 20,
experimental_time_window_merge_threshold: 3,
experimental_enable_incremental_read: false,
read_preference: Default::default(),
frontend_tls: None,
}

View File

@@ -21,7 +21,7 @@ use std::time::Duration;
use api::v1::flow::DirtyWindowRequests;
use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::ddl::create_flow::{FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType};
use common_meta::key::TableMetadataManagerRef;
use common_meta::key::flow::FlowMetadataManagerRef;
use common_meta::key::flow::flow_state::FlowStat;
@@ -38,6 +38,7 @@ use session::context::QueryContext;
use snafu::{OptionExt, ResultExt, ensure};
use sql::parsers::utils::is_tql;
use store_api::metric_engine_consts::is_metric_engine_internal_column;
use store_api::mito_engine_options::APPEND_MODE_KEY;
use store_api::storage::{RegionId, TableId};
use table::table_reference::TableReference;
use tokio::sync::{RwLock, oneshot};
@@ -428,6 +429,55 @@ async fn get_table_info(
}
impl BatchingEngine {
fn batch_opts_for_flow_options(
&self,
flow_options: &HashMap<String, String>,
) -> Result<Arc<BatchingModeOptions>, Error> {
let mut batch_opts = (*self.batch_opts).clone();
if let Some(enable_incremental_read) =
flow_options.get(FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY)
{
batch_opts.experimental_enable_incremental_read = enable_incremental_read
.parse::<bool>()
.map_err(|_| {
InvalidQuerySnafu {
reason: format!(
"Invalid flow option {FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY}: {enable_incremental_read}"
),
}
.build()
})?;
}
Ok(Arc::new(batch_opts))
}
/// Returns `true` when the table's extra options set `APPEND_MODE_KEY` to
/// `"true"` (compared ASCII case-insensitively); a missing key counts as
/// `false`.
fn table_options_enable_append_mode(extra_options: &HashMap<String, String>) -> bool {
    match extra_options.get(APPEND_MODE_KEY) {
        Some(raw_value) => raw_value.eq_ignore_ascii_case("true"),
        None => false,
    }
}
fn ensure_incremental_source_append_only(
batch_opts: &BatchingModeOptions,
table_name: &[String; 3],
extra_options: &HashMap<String, String>,
) -> Result<(), Error> {
if batch_opts.experimental_enable_incremental_read {
ensure!(
Self::table_options_enable_append_mode(extra_options),
UnsupportedSnafu {
reason: format!(
"Flow incremental read requires append-only source table, but source table `{}` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read",
table_name.join(".")
),
}
);
}
Ok(())
}
pub async fn create_flow_inner(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
@@ -494,6 +544,8 @@ impl BatchingEngine {
}
);
let batch_opts = self.batch_opts_for_flow_options(&flow_options)?;
let mut source_table_names = Vec::with_capacity(2);
for src_id in source_table_ids {
// also check table option to see if ttl!=instant
@@ -509,6 +561,11 @@ impl BatchingEngine {
),
}
);
Self::ensure_incremental_source_append_only(
&batch_opts,
&table_name,
&table_info.table_info.meta.options.extra_options,
)?;
source_table_names.push(table_name);
}
@@ -563,7 +620,7 @@ impl BatchingEngine {
query_ctx,
catalog_manager: self.catalog_manager.clone(),
shutdown_rx: rx,
batch_opts: self.batch_opts.clone(),
batch_opts,
flow_eval_interval: eval_interval.map(|secs| Duration::from_secs(secs as u64)),
};
@@ -808,7 +865,7 @@ impl BatchingEngine {
});
let res = task
.gen_exec_once(
.execute_once_serialized(
&self.query_engine,
&self.frontend_client,
cur_dirty_window_cnt,
@@ -946,6 +1003,76 @@ mod tests {
)
}
#[tokio::test]
async fn test_flow_option_overrides_incremental_read_switch() {
    let engine = new_test_engine().await;

    // Without a flow-level override the switch is expected to stay off.
    let no_overrides = HashMap::new();
    let baseline = engine.batch_opts_for_flow_options(&no_overrides).unwrap();
    assert!(!baseline.experimental_enable_incremental_read);

    // An explicit "true" flow option turns the switch on for this flow.
    let mut overrides = HashMap::new();
    overrides.insert(
        FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
        "true".to_string(),
    );
    let overridden = engine.batch_opts_for_flow_options(&overrides).unwrap();
    assert!(overridden.experimental_enable_incremental_read);
}
#[test]
fn test_table_options_enable_append_mode() {
    // (append_mode option value, expected detection result)
    let cases: [(Option<&str>, bool); 3] = [
        // Option absent.
        (None, false),
        // Option explicitly disabled.
        (Some("false"), false),
        // Matching is ASCII case-insensitive.
        (Some("TRUE"), true),
    ];

    for (raw_value, expected) in cases {
        let mut extra_options: HashMap<String, String> = HashMap::new();
        if let Some(raw_value) = raw_value {
            extra_options.insert(APPEND_MODE_KEY.to_string(), raw_value.to_string());
        }
        let detected = BatchingEngine::table_options_enable_append_mode(&extra_options);
        assert!(detected == expected);
    }
}
#[test]
fn test_incremental_source_append_only_enforcement() {
    let source_table = ["greptime", "public", "numbers"].map(String::from);

    let incremental_off = BatchingModeOptions::default();
    let incremental_on = BatchingModeOptions {
        experimental_enable_incremental_read: true,
        ..Default::default()
    };

    let plain_table_options: HashMap<String, String> = HashMap::new();
    let append_table_options =
        HashMap::from([(APPEND_MODE_KEY.to_string(), "true".to_string())]);

    // Incremental read disabled: any source table is acceptable.
    let disabled_check = BatchingEngine::ensure_incremental_source_append_only(
        &incremental_off,
        &source_table,
        &plain_table_options,
    );
    disabled_check.expect("disabled incremental read should not require append-only source");

    // Incremental read enabled and the source is append-only: accepted.
    let append_check = BatchingEngine::ensure_incremental_source_append_only(
        &incremental_on,
        &source_table,
        &append_table_options,
    );
    append_check.expect("append-only source should be accepted when incremental read is enabled");

    // Incremental read enabled but the source is not append-only: rejected.
    let rejected_check = BatchingEngine::ensure_incremental_source_append_only(
        &incremental_on,
        &source_table,
        &plain_table_options,
    );
    let err = rejected_check
        .expect_err("non-append source should be rejected when incremental read is enabled");
    let message = err.to_string();
    assert!(
        message.contains("Flow incremental read requires append-only source table"),
        "{err}"
    );
}
async fn new_test_task(flow_id: FlowId) -> (BatchingTask, oneshot::Sender<()>) {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();

View File

@@ -1,222 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_telemetry::tracing::debug;
use datafusion_expr::Expr;
use datatypes::schema::Schema;
use crate::batching_mode::state::FilterExprInfo;
use crate::batching_mode::utils::IncrementalAggregateAnalysis;
use crate::{Error, FlowId};
/// Builds the dirty-time-window predicate to apply on the sink table.
///
/// Returns `Ok(None)` when there is no dirty filter or when no sink column
/// can be inferred for it; otherwise delegates to
/// `FilterExprInfo::predicate_for_col` with the inferred sink column.
pub(super) fn build_sink_dirty_time_window_filter_expr(
    flow_id: FlowId,
    analysis: &IncrementalAggregateAnalysis,
    sink_schema: &Schema,
    dirty_filter: Option<&FilterExprInfo>,
) -> Result<Option<Expr>, Error> {
    // Pair the filter with its inferred sink column; either one missing
    // means there is nothing to build.
    let filter_and_col = dirty_filter.and_then(|filter| {
        infer_sink_time_window_filter_col(flow_id, analysis, sink_schema, filter)
            .map(|sink_col| (filter, sink_col))
    });

    match filter_and_col {
        Some((filter, sink_col)) => filter.predicate_for_col(&sink_col),
        None => Ok(None),
    }
}
fn infer_sink_time_window_filter_col(
flow_id: FlowId,
analysis: &IncrementalAggregateAnalysis,
sink_schema: &Schema,
dirty_filter: &FilterExprInfo,
) -> Option<String> {
if analysis.group_key_names.is_empty() {
return None;
}
let is_timestamp_group_key = |name: &str| {
analysis.group_key_names.iter().any(|key| key == name)
&& sink_schema
.column_schema_by_name(name)
.is_some_and(|col| col.data_type.is_timestamp())
};
if is_timestamp_group_key(&dirty_filter.col_name) {
return Some(dirty_filter.col_name.clone());
}
let candidates = analysis
.group_key_names
.iter()
.filter(|name| is_timestamp_group_key(name))
.cloned()
.collect::<Vec<_>>();
match candidates.as_slice() {
[name] => Some(name.clone()),
[] => {
debug!(
"Flow {} cannot infer sink dirty-window filter column: no timestamp group key in {:?}",
flow_id, analysis.group_key_names
);
None
}
_ => {
debug!(
"Flow {} cannot infer sink dirty-window filter column: ambiguous timestamp group keys {:?}",
flow_id, candidates
);
None
}
}
}
// Unit tests for `infer_sink_time_window_filter_col`.
#[cfg(test)]
mod test {
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use pretty_assertions::assert_eq;
use super::*;
use crate::adapter::AUTO_CREATED_UPDATE_AT_TS_COL;
use crate::batching_mode::state::FilterExprInfo;
use crate::batching_mode::utils::IncrementalAggregateAnalysis;
/// Builds an analysis that only carries the given group key names; every
/// other field is left empty.
fn test_analysis_with_group_keys(group_key_names: Vec<&str>) -> IncrementalAggregateAnalysis {
IncrementalAggregateAnalysis {
group_key_names: group_key_names
.into_iter()
.map(|name| name.to_string())
.collect(),
merge_columns: vec![],
literal_columns: vec![],
output_field_names: vec![],
unsupported_exprs: vec![],
}
}
/// Builds a dirty filter on `col_name` with no time ranges and a one-second
/// window size.
fn test_dirty_filter(col_name: &str) -> FilterExprInfo {
FilterExprInfo {
expr: datafusion_expr::col(col_name),
col_name: col_name.to_string(),
time_ranges: vec![],
window_size: chrono::Duration::seconds(1),
}
}
/// Builds a sink schema from `(name, data type)` pairs; the third
/// `ColumnSchema::new` argument is `true` for every column.
fn test_sink_schema(columns: Vec<(&str, ConcreteDataType)>) -> Schema {
Schema::new(
columns
.into_iter()
.map(|(name, data_type)| ColumnSchema::new(name, data_type, true))
.collect(),
)
}
// The dirty filter column itself is a timestamp group key: use it directly.
#[test]
fn test_infer_sink_time_window_filter_col_uses_matching_source_group_key() {
let analysis = test_analysis_with_group_keys(vec!["ts", "host"]);
let sink_schema = test_sink_schema(vec![
("ts", ConcreteDataType::timestamp_millisecond_datatype()),
("host", ConcreteDataType::string_datatype()),
]);
let dirty_filter = test_dirty_filter("ts");
assert_eq!(
Some("ts".to_string()),
infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
);
}
// The dirty filter column is not a group key, but exactly one group key is
// timestamp-typed in the sink; the auto-created update-at column is not a
// group key and must not be picked.
#[test]
fn test_infer_sink_time_window_filter_col_uses_unique_timestamp_group_key() {
let analysis = test_analysis_with_group_keys(vec!["host", "time_window"]);
let sink_schema = test_sink_schema(vec![
("host", ConcreteDataType::string_datatype()),
(
"time_window",
ConcreteDataType::timestamp_millisecond_datatype(),
),
(
AUTO_CREATED_UPDATE_AT_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
),
]);
let dirty_filter = test_dirty_filter("ts");
assert_eq!(
Some("time_window".to_string()),
infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
);
}
// No group keys (global aggregate): nothing can be inferred.
#[test]
fn test_infer_sink_time_window_filter_col_skips_global_aggregate() {
let analysis = test_analysis_with_group_keys(vec![]);
let sink_schema = test_sink_schema(vec![
("number", ConcreteDataType::uint32_datatype()),
(
"time_window",
ConcreteDataType::timestamp_millisecond_datatype(),
),
]);
let dirty_filter = test_dirty_filter("ts");
assert_eq!(
None,
infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
);
}
// Group keys exist but none of them is timestamp-typed in the sink.
#[test]
fn test_infer_sink_time_window_filter_col_skips_without_timestamp_group_key() {
let analysis = test_analysis_with_group_keys(vec!["host", "device"]);
let sink_schema = test_sink_schema(vec![
("host", ConcreteDataType::string_datatype()),
("device", ConcreteDataType::string_datatype()),
(
AUTO_CREATED_UPDATE_AT_TS_COL,
ConcreteDataType::timestamp_millisecond_datatype(),
),
]);
let dirty_filter = test_dirty_filter("ts");
assert_eq!(
None,
infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
);
}
// Two timestamp group keys and a dirty filter column matching neither: the
// choice is ambiguous, so no column is inferred.
#[test]
fn test_infer_sink_time_window_filter_col_skips_ambiguous_timestamp_group_keys() {
let analysis = test_analysis_with_group_keys(vec!["ts", "time_window"]);
let sink_schema = test_sink_schema(vec![
("ts", ConcreteDataType::timestamp_millisecond_datatype()),
(
"time_window",
ConcreteDataType::timestamp_millisecond_datatype(),
),
]);
let dirty_filter = test_dirty_filter("source_ts");
assert_eq!(
None,
infer_sink_time_window_filter_col(1, &analysis, &sink_schema, &dirty_filter)
);
}
}

View File

@@ -66,12 +66,20 @@ pub struct TaskState {
}
impl TaskState {
pub fn new(query_ctx: QueryContextRef, shutdown_rx: oneshot::Receiver<()>) -> Self {
Self::with_dirty_time_windows(query_ctx, shutdown_rx, DirtyTimeWindows::default())
}
pub fn with_dirty_time_windows(
query_ctx: QueryContextRef,
shutdown_rx: oneshot::Receiver<()>,
dirty_time_windows: DirtyTimeWindows,
) -> Self {
Self {
query_ctx,
last_update_time: Instant::now(),
last_query_duration: Duration::from_secs(0),
last_exec_time_millis: None,
dirty_time_windows: Default::default(),
dirty_time_windows,
checkpoint_mode: CheckpointMode::FullSnapshot,
checkpoints: Default::default(),
incremental_disabled: false,
@@ -264,6 +272,16 @@ impl DirtyTimeWindows {
time_window_merge_threshold,
}
}
/// Test-only accessor returning the configured `max_filter_num_per_query`.
#[cfg(test)]
pub(crate) fn max_filter_num_per_query(&self) -> usize {
self.max_filter_num_per_query
}
/// Test-only accessor returning the configured `time_window_merge_threshold`.
#[cfg(test)]
pub(crate) fn time_window_merge_threshold(&self) -> usize {
self.time_window_merge_threshold
}
}
impl Default for DirtyTimeWindows {
@@ -681,7 +699,7 @@ impl DirtyTimeWindows {
}
}
fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
pub(crate) fn to_df_literal(value: Timestamp) -> Result<datafusion_common::ScalarValue, Error> {
let value = Value::from(value);
let value = value
.try_to_scalar_value(&value.data_type())

View File

@@ -27,7 +27,7 @@ use datafusion::datasource::DefaultTableSource;
use datafusion::sql::unparser::expr_to_sql;
use datafusion_common::DFSchemaRef;
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp};
use datafusion_expr::{DmlStatement, LogicalPlan, WriteOp, col, lit};
use datatypes::schema::Schema;
use query::QueryEngineRef;
use query::options::FLOW_INCREMENTAL_MODE;
@@ -38,14 +38,16 @@ use sql::parsers::utils::is_tql;
use store_api::mito_engine_options::MERGE_MODE_KEY;
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use table::table::adapter::DfTableProviderAdapter;
use tokio::sync::oneshot;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::sync::{Mutex, oneshot};
use tokio::time::Instant;
use crate::batching_mode::BatchingModeOptions;
use crate::batching_mode::checkpoint::checkpoint_mode_label;
use crate::batching_mode::frontend_client::{FrontendClient, PeerDesc};
use crate::batching_mode::state::{CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState};
use crate::batching_mode::state::{
CheckpointMode, DirtyTimeWindows, FilterExprInfo, TaskState, to_df_literal,
};
use crate::batching_mode::table_creator::{QueryType, create_table_with_expr};
use crate::batching_mode::time_window::TimeWindowExpr;
use crate::batching_mode::utils::{
@@ -67,12 +69,6 @@ use crate::{Error, FlowId};
mod ckpt;
mod inc;
/// Maximum number of dirty time-window predicates attached to one incremental
/// SQL query. This keeps generated OR filters bounded so Substrait encoding and
/// downstream planning remain predictable; if the backlog is larger, the flow
/// drains one capped batch and postpones checkpoint advancement to a later run.
const MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS: usize = 4096;
/// The task's config, immutable once created
#[derive(Clone)]
pub struct TaskConfig {
@@ -113,6 +109,10 @@ fn is_merge_mode_last_non_null(options: &HashMap<String, String>) -> bool {
pub struct BatchingTask {
pub config: Arc<TaskConfig>,
pub state: Arc<RwLock<TaskState>>,
/// Serializes plan generation, execution, checkpoint advancement, and dirty
/// window restoration for this flow. Without this, a manual flush and the
/// background loop can process the same checkpoint range concurrently.
execution_lock: Arc<Mutex<()>>,
}
/// Arguments for creating batching task
@@ -150,6 +150,16 @@ pub enum DirtyRestore {
Unscoped(DirtyTimeWindows),
}
/// Outcome of one flow evaluation: the generated query (if any) together
/// with the result of generating and executing it.
struct ExecuteOnceOutcome {
/// The generated insert plan, kept so the caller can still refer to it after
/// execution. `None` when plan generation failed or produced no query.
new_query: Option<PlanInfo>,
/// Execution result of the generated insert plan.
///
/// `Ok(Some((affected_rows, elapsed)))` means a query was executed.
/// `Ok(None)` means no query was generated because there was no dirty signal.
/// `Err(_)` means plan generation or execution failed.
result: Result<Option<(usize, Duration)>, Error>,
}
impl BatchingTask {
#[allow(clippy::too_many_arguments)]
pub fn try_new(
@@ -168,6 +178,18 @@ impl BatchingTask {
flow_eval_interval,
}: TaskArgs<'_>,
) -> Result<Self, Error> {
let mut state = TaskState::with_dirty_time_windows(
query_ctx.clone(),
shutdown_rx,
DirtyTimeWindows::new(
batch_opts.experimental_max_filter_num_per_query,
batch_opts.experimental_time_window_merge_threshold,
),
);
if !batch_opts.experimental_enable_incremental_read {
state.disable_incremental();
}
Ok(Self {
config: Arc::new(TaskConfig {
flow_id,
@@ -182,7 +204,8 @@ impl BatchingTask {
batch_opts,
flow_eval_interval,
}),
state: Arc::new(RwLock::new(TaskState::new(query_ctx, shutdown_rx))),
state: Arc::new(RwLock::new(state)),
execution_lock: Arc::new(Mutex::new(())),
})
}
@@ -251,40 +274,75 @@ impl BatchingTask {
.context(ExternalSnafu)
}
pub async fn gen_exec_once(
pub(crate) async fn execute_once_serialized(
&self,
engine: &QueryEngineRef,
frontend_client: &Arc<FrontendClient>,
max_window_cnt: Option<usize>,
) -> Result<Option<(usize, Duration)>, Error> {
if let Some(new_query) = self.gen_insert_plan(engine, max_window_cnt).await? {
let outcome = self
.execute_once_serialized_with_outcome(engine, frontend_client, max_window_cnt)
.await;
outcome.result
}
/// Executes one flow evaluation under `execution_lock` and keeps the
/// generated query context for the background loop's error logging/backoff.
async fn execute_once_serialized_with_outcome(
&self,
engine: &QueryEngineRef,
frontend_client: &Arc<FrontendClient>,
max_window_cnt: Option<usize>,
) -> ExecuteOnceOutcome {
let _execution_guard = self.execution_lock.lock().await;
self.execute_once_unlocked(engine, frontend_client, max_window_cnt)
.await
}
/// Executes one flow evaluation. Caller must hold `execution_lock`.
async fn execute_once_unlocked(
&self,
engine: &QueryEngineRef,
frontend_client: &Arc<FrontendClient>,
max_window_cnt: Option<usize>,
) -> ExecuteOnceOutcome {
let new_query = match self.gen_insert_plan_unlocked(engine, max_window_cnt).await {
Ok(new_query) => new_query,
Err(err) => {
return ExecuteOnceOutcome {
new_query: None,
result: Err(err),
};
}
};
if let Some(new_query) = new_query {
debug!("Generate new query: {}", new_query.plan);
let dirty_filter = match &new_query.dirty_restore {
DirtyRestore::Scoped(f) => Some(f),
_ => None,
};
match self
.execute_logical_plan(
let res = self
.execute_logical_plan_unlocked(
frontend_client,
&new_query.plan,
dirty_filter,
new_query.can_advance_checkpoints,
)
.await
{
Ok(result) => Ok(result),
Err(err) => {
self.handle_executed_query_failure(Some(&new_query));
Err(err)
}
.await;
if res.is_err() {
self.handle_executed_query_failure(Some(&new_query));
}
ExecuteOnceOutcome {
new_query: Some(new_query),
result: res,
}
} else {
debug!("Generate no query");
Ok(None)
ExecuteOnceOutcome {
new_query: None,
result: Ok(None),
}
}
}
pub async fn gen_insert_plan(
/// Generates the insert plan. Caller must reach this through the serialized path.
async fn gen_insert_plan_unlocked(
&self,
engine: &QueryEngineRef,
max_window_cnt: Option<usize>,
@@ -388,11 +446,11 @@ impl BatchingTask {
Ok(())
}
pub async fn execute_logical_plan(
/// Executes the insert plan. Caller must reach this through the serialized path.
async fn execute_logical_plan_unlocked(
&self,
frontend_client: &Arc<FrontendClient>,
plan: &LogicalPlan,
dirty_filter: Option<&FilterExprInfo>,
can_advance_checkpoints: bool,
) -> Result<Option<(usize, Duration)>, Error> {
let instant = Instant::now();
@@ -426,8 +484,7 @@ impl BatchingTask {
// For incremental-mode SQL queries, attempt to rewrite the delta aggregate
// plan into a safe delta-LEFT-JOIN-sink form before deciding on extensions.
let incremental_plan = if can_advance_checkpoints {
self.prepare_plan_for_incremental(&plan, dirty_filter)
.await?
self.prepare_plan_for_incremental(&plan).await?
} else {
None
};
@@ -580,6 +637,112 @@ impl BatchingTask {
})
}
/// Adds `dirty_windows` back into the task state's dirty time windows.
fn restore_unscoped_dirty_windows(&self, dirty_windows: &DirtyTimeWindows) {
    let mut task_state = self.state.write().unwrap();
    task_state
        .dirty_time_windows
        .add_dirty_windows(dirty_windows);
}
/// Passes `result` through unchanged; when it is an error, `dirty_windows`
/// is first added back into the task state.
fn restore_unscoped_dirty_windows_on_err<T>(
    &self,
    dirty_windows: &DirtyTimeWindows,
    result: Result<T, Error>,
) -> Result<T, Error> {
    if result.is_err() {
        self.restore_unscoped_dirty_windows(dirty_windows);
    }
    result
}
/// Takes a copy of the current dirty time windows and cleans the ones held
/// in task state, all under one write lock.
///
/// Returns `(is_dirty, drained_windows)`, where `is_dirty` tells whether the
/// copy was non-empty and `drained_windows` is the copy itself, so the caller
/// can restore it later.
fn drain_dirty_windows_signal(&self) -> (bool, DirtyTimeWindows) {
    let mut task_state = self.state.write().unwrap();
    let drained_windows = task_state.dirty_time_windows.clone();
    task_state.dirty_time_windows.clean();

    // The copy is independent of the state that was just cleaned.
    let had_dirty_windows = !drained_windows.is_empty();
    (had_dirty_windows, drained_windows)
}
/// Builds a [`PlanInfo`] for the flow query without any dirty-window
/// predicate.
///
/// The plan is generated from `self.config.query` via
/// `gen_plan_with_matching_schema`. When `retention_filter` is
/// `Some((col_name, lower_bound, context))`, a `col_name >= lower_bound`
/// filter is additionally applied to the plan; `context` is only used in the
/// error message of that rewrite.
///
/// The returned plan info carries `dirty_windows_to_restore` as
/// `DirtyRestore::Unscoped` and has `can_advance_checkpoints` set to `true`.
///
/// # Errors
///
/// Any failure (plan generation, literal conversion, filter rewrite) is
/// returned after `dirty_windows_to_restore` has been added back into the
/// task state.
#[allow(clippy::too_many_arguments)]
async fn gen_unfiltered_plan_info(
&self,
engine: QueryEngineRef,
query_ctx: QueryContextRef,
sink_table_schema: Arc<Schema>,
primary_key_indices: &[usize],
allow_partial: bool,
dirty_windows_to_restore: DirtyTimeWindows,
retention_filter: Option<(&str, Timestamp, &'static str)>,
) -> Result<PlanInfo, Error> {
// Every fallible step below is wrapped so the drained dirty windows are
// put back before the error propagates.
let mut plan = self.restore_unscoped_dirty_windows_on_err(
&dirty_windows_to_restore,
gen_plan_with_matching_schema(
&self.config.query,
query_ctx,
engine,
sink_table_schema,
primary_key_indices,
allow_partial,
)
.await,
)?;
if let Some((col_name, lower_bound, context)) = retention_filter {
let lower = self.restore_unscoped_dirty_windows_on_err(
&dirty_windows_to_restore,
to_df_literal(lower_bound),
)?;
// Keep only rows whose `col_name` value is at or after the lower bound.
let retention_filter = col(col_name).gt_eq(lit(lower));
let mut add_filter = AddFilterRewriter::new(retention_filter);
plan = self.restore_unscoped_dirty_windows_on_err(
&dirty_windows_to_restore,
plan.clone()
.rewrite(&mut add_filter)
.with_context(|_| DatafusionSnafu {
context: format!(
"Failed to apply {context} expire_after filter to plan:\n {}\n",
plan
),
})
.map(|rewrite| rewrite.data),
)?;
}
Ok(PlanInfo {
plan,
dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
can_advance_checkpoints: true,
})
}
/// Drains the dirty-window signal and, only if there was one, builds an
/// unfiltered plan through `gen_unfiltered_plan_info`.
///
/// Returns `Ok(None)` when no dirty windows were pending. The drained
/// windows are handed to `gen_unfiltered_plan_info` as the windows to
/// restore.
async fn gen_unfiltered_plan_info_if_dirty(
    &self,
    engine: QueryEngineRef,
    query_ctx: QueryContextRef,
    sink_table_schema: Arc<Schema>,
    primary_key_indices: &[usize],
    allow_partial: bool,
    retention_filter: Option<(&str, Timestamp, &'static str)>,
) -> Result<Option<PlanInfo>, Error> {
    let (has_dirty_signal, drained_windows) = self.drain_dirty_windows_signal();
    if !has_dirty_signal {
        debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
        return Ok(None);
    }

    let plan_info = self
        .gen_unfiltered_plan_info(
            engine,
            query_ctx,
            sink_table_schema,
            primary_key_indices,
            allow_partial,
            drained_windows,
            retention_filter,
        )
        .await?;
    Ok(Some(plan_info))
}
fn handle_executed_query_failure(&self, query: Option<&PlanInfo>) {
if let Some(query) = query {
self.restore_dirty_windows_after_failure(query);
@@ -626,33 +789,11 @@ impl BatchingTask {
let min_refresh = self.config.batch_opts.experimental_min_refresh_duration;
let new_query = match self.gen_insert_plan(&engine, max_window_cnt).await {
Ok(new_query) => new_query,
Err(err) => {
common_telemetry::error!(err; "Failed to generate query for flow={}", self.config.flow_id);
// also sleep for a little while before try again to prevent flooding logs
tokio::time::sleep(min_refresh).await;
continue;
}
};
let outcome = self
.execute_once_serialized_with_outcome(&engine, &frontend_client, max_window_cnt)
.await;
let res = if let Some(new_query) = &new_query {
let dirty_filter = match &new_query.dirty_restore {
DirtyRestore::Scoped(f) => Some(f),
_ => None,
};
self.execute_logical_plan(
&frontend_client,
&new_query.plan,
dirty_filter,
new_query.can_advance_checkpoints,
)
.await
} else {
Ok(None)
};
match res {
match outcome.result {
// normal execute, sleep for some time before doing next query
Ok(Some(_)) => {
// can increase max_window_cnt to query more windows next time
@@ -703,11 +844,10 @@ impl BatchingTask {
}
// TODO(discord9): this error should have better place to go, but for now just print error, also more context is needed
Err(err) => {
self.handle_executed_query_failure(new_query.as_ref());
METRIC_FLOW_BATCHING_ENGINE_ERROR_CNT
.with_label_values(&[&flow_id_str])
.inc();
match new_query {
match outcome.new_query {
Some(query) => {
common_telemetry::error!(err; "Failed to execute query for flow={} with query: {}", self.config.flow_id, query.plan);
// TODO(discord9): add some backoff here? half the query time window or what
@@ -743,6 +883,20 @@ impl BatchingTask {
create_table_with_expr(&plan, &self.config.sink_table_name, &self.config.query_type)
}
/// Returns `true` when the task is in `CheckpointMode::Incremental`,
/// incremental execution has not been disabled, and the flow query is SQL.
fn should_use_unfiltered_incremental_delta(&self) -> bool {
    let task_state = self.state.read().unwrap();
    if task_state.checkpoint_mode() != CheckpointMode::Incremental {
        return false;
    }
    if task_state.is_incremental_disabled() {
        return false;
    }
    matches!(self.config.query_type, QueryType::Sql)
}
/// Returns `true` when the task is in `CheckpointMode::FullSnapshot`,
/// incremental execution has not been disabled, and the flow query is SQL.
fn should_use_unfiltered_full_snapshot_seeding(&self) -> bool {
    let task_state = self.state.read().unwrap();
    if task_state.checkpoint_mode() != CheckpointMode::FullSnapshot {
        return false;
    }
    if task_state.is_incremental_disabled() {
        return false;
    }
    matches!(self.config.query_type, QueryType::Sql)
}
/// Will merge and use the first ten time windows in the query.
async fn gen_query_with_time_window(
&self,
@@ -783,83 +937,35 @@ impl BatchingTask {
self.config.flow_id
);
// clean dirty time window too, this could be from create flow's check_execute
let (is_dirty, dirty_windows_to_restore) = {
let mut state = self.state.write().unwrap();
let dirty_windows_to_restore = state.dirty_time_windows.clone();
let is_dirty = !dirty_windows_to_restore.is_empty();
state.dirty_time_windows.clean();
(is_dirty, dirty_windows_to_restore)
};
if !is_dirty {
// no dirty data, hence no need to update
debug!("Flow id={:?}, no new data, not update", self.config.flow_id);
return Ok(None);
}
let plan = match gen_plan_with_matching_schema(
&self.config.query,
query_ctx,
engine,
sink_table_schema.clone(),
primary_key_indices,
allow_partial,
)
.await
{
Ok(plan) => plan,
Err(err) => {
self.state
.write()
.unwrap()
.dirty_time_windows
.add_dirty_windows(&dirty_windows_to_restore);
return Err(err);
}
};
return Ok(Some(PlanInfo {
plan,
dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
can_advance_checkpoints: true,
}));
return self
.gen_unfiltered_plan_info_if_dirty(
engine,
query_ctx,
sink_table_schema.clone(),
primary_key_indices,
allow_partial,
None,
)
.await;
}
_ => {
// Clean dirty windows for full-query/non-scoped paths,
// such as TQL, that cannot use a time-window filter.
let dirty_windows_to_restore = {
let mut state = self.state.write().unwrap();
let dirty_windows_to_restore = state.dirty_time_windows.clone();
state.dirty_time_windows.clean();
dirty_windows_to_restore
};
let (_, dirty_windows_to_restore) = self.drain_dirty_windows_signal();
let plan = match gen_plan_with_matching_schema(
&self.config.query,
query_ctx,
engine,
sink_table_schema.clone(),
primary_key_indices,
allow_partial,
)
.await
{
Ok(plan) => plan,
Err(err) => {
self.state
.write()
.unwrap()
.dirty_time_windows
.add_dirty_windows(&dirty_windows_to_restore);
return Err(err);
}
};
let plan_info = self
.gen_unfiltered_plan_info(
engine,
query_ctx,
sink_table_schema.clone(),
primary_key_indices,
allow_partial,
dirty_windows_to_restore,
None,
)
.await?;
return Ok(Some(PlanInfo {
plan,
dirty_restore: DirtyRestore::Unscoped(dirty_windows_to_restore),
can_advance_checkpoints: true,
}));
return Ok(Some(plan_info));
}
};
@@ -889,22 +995,61 @@ impl BatchingTask {
),
})?;
if self.should_use_unfiltered_full_snapshot_seeding() {
// A full-snapshot query that can seed/refresh incremental
// checkpoints must not use dirty-window predicates. Rows can be
// written after dirty windows are drained but before the source scan
// snapshot opens; a stale dirty-window filter could exclude those
// rows while the returned watermark includes them, causing the next
// incremental read to skip them forever. Execute an unfiltered full
// snapshot instead, and keep dirty windows only as the scheduling and
// failure-restoration signal.
let retention_filter = self
.config
.expire_after
.map(|_| (col_name.as_str(), expire_lower_bound, "full-snapshot"));
return self
.gen_unfiltered_plan_info_if_dirty(
engine,
query_ctx,
sink_table_schema.clone(),
primary_key_indices,
allow_partial,
retention_filter,
)
.await;
}
if self.should_use_unfiltered_incremental_delta() {
// In incremental mode, source correctness is defined by the
// per-region sequence range `(checkpoint, scan-open snapshot]`, not
// by dirty-window predicates. Dirty windows are only a scheduling
// signal here. Applying a stale dirty-window filter to the source can
// exclude rows that are inside the returned watermark and make a
// checkpoint advance skip them forever. The sink side is also left
// unfiltered by dirty windows; the incremental rewrite joins the
// delta groups with the full sink state for correctness. Future
// dynamic filters can prune sink reads as a pure optimization.
let retention_filter = self
.config
.expire_after
.map(|_| (col_name.as_str(), expire_lower_bound, "incremental"));
return self
.gen_unfiltered_plan_info_if_dirty(
engine,
query_ctx,
sink_table_schema.clone(),
primary_key_indices,
allow_partial,
retention_filter,
)
.await;
}
let (expr, can_advance_checkpoints) = {
let mut state = self.state.write().unwrap();
let window_cnt = if state.checkpoint_mode() == CheckpointMode::Incremental
&& !state.is_incremental_disabled()
&& matches!(self.config.query_type, QueryType::Sql)
{
// Incremental scans are bounded by region sequence checkpoints,
// so the dirty-window filter only narrows sink-side/time-window
// work. Drain more windows than normal, but keep a hard cap to
// avoid building a huge OR filter after a long downtime. If
// windows remain, checkpoints won't advance this round.
MAX_INCREMENTAL_DIRTY_WINDOW_FILTERS
} else {
max_window_cnt
.unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query)
};
let window_cnt = max_window_cnt
.unwrap_or(self.config.batch_opts.experimental_max_filter_num_per_query);
let expr = state.dirty_time_windows.gen_filter_exprs(
&col_name,
Some(expire_lower_bound),

View File

@@ -26,8 +26,7 @@ use snafu::ResultExt;
use table::metadata::TableId;
use crate::Error;
use crate::batching_mode::incremental_filter::build_sink_dirty_time_window_filter_expr;
use crate::batching_mode::state::{CheckpointMode, FilterExprInfo};
use crate::batching_mode::state::CheckpointMode;
use crate::batching_mode::table_creator::QueryType;
use crate::batching_mode::task::BatchingTask;
use crate::batching_mode::utils::{
@@ -74,7 +73,6 @@ impl BatchingTask {
pub(super) async fn prepare_plan_for_incremental(
&self,
plan: &LogicalPlan,
dirty_filter: Option<&FilterExprInfo>,
) -> Result<Option<LogicalPlan>, Error> {
let is_incremental_sql = {
let state = self.state.read().unwrap();
@@ -152,31 +150,12 @@ impl BatchingTask {
return Ok(None);
}
};
let sink_schema = sink_table.table_info().meta.schema.clone();
let sink_dirty_filter = match build_sink_dirty_time_window_filter_expr(
self.config.flow_id,
&analysis,
&sink_schema,
dirty_filter,
) {
Ok(filter) => filter,
Err(err) => {
warn!(
"Flow {} failed to build sink dirty time window filter; \
falling back to full snapshot for this round: {:?}",
self.config.flow_id, err
);
self.state.write().unwrap().mark_full_snapshot();
return Ok(None);
}
};
let rewritten_inner = match rewrite_incremental_aggregate_with_sink_merge(
&inner_plan,
&analysis,
sink_table,
&self.config.sink_table_name,
sink_dirty_filter,
None,
)
.await
{

View File

@@ -25,7 +25,9 @@ use datatypes::data_type::ConcreteDataType as CDT;
use datatypes::schema::ColumnSchema;
use datatypes::vectors::{TimestampMillisecondVector, UInt32Vector, VectorRef};
use pretty_assertions::assert_eq;
use query::options::{FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY};
use query::options::{
FLOW_INCREMENTAL_AFTER_SEQS, FLOW_INCREMENTAL_MODE_MEMTABLE_ONLY, QueryOptions,
};
use session::context::QueryContext;
use table::test_util::MemTable;
@@ -38,6 +40,13 @@ use crate::batching_mode::state::CheckpointMode;
use crate::batching_mode::time_window::find_time_window_expr;
use crate::test_utils::create_test_query_engine;
fn incremental_batch_opts() -> Arc<BatchingModeOptions> {
Arc::new(BatchingModeOptions {
experimental_enable_incremental_read: true,
..Default::default()
})
}
async fn new_test_task_and_plan_with_missing_sink() -> (BatchingTask, LogicalPlan) {
new_test_task_engine_and_plan_with_query(
"SELECT number, ts FROM numbers_with_ts",
@@ -60,6 +69,15 @@ impl TestTaskParts {
}
/// Convenience wrapper around
/// `new_test_task_engine_and_plan_with_query_and_opts` that always supplies
/// the options returned by `incremental_batch_opts()`.
async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str) -> TestTaskParts {
    let batch_opts = incremental_batch_opts();
    new_test_task_engine_and_plan_with_query_and_opts(query, sink_table, batch_opts).await
}
async fn new_test_task_engine_and_plan_with_query_and_opts(
query: &str,
sink_table: &str,
batch_opts: Arc<BatchingModeOptions>,
) -> TestTaskParts {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let plan = sql_to_df_plan(
@@ -91,7 +109,7 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str)
query_ctx: ctx,
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
shutdown_rx: rx,
batch_opts: Arc::new(BatchingModeOptions::default()),
batch_opts,
flow_eval_interval: None,
})
.unwrap();
@@ -103,6 +121,75 @@ async fn new_test_task_engine_and_plan_with_query(query: &str, sink_table: &str)
}
}
/// A task built from plain `BatchingModeOptions::default()` must report
/// incremental reads as disabled.
#[tokio::test]
async fn test_incremental_read_is_disabled_by_default() {
    // Deliberately bypass `incremental_batch_opts()` so the defaults are tested.
    let default_opts = Arc::new(BatchingModeOptions::default());

    let task = new_test_task_engine_and_plan_with_query_and_opts(
        "SELECT number, ts FROM numbers_with_ts",
        "numbers_with_ts",
        default_opts,
    )
    .await
    .task;

    assert!(task.state.read().unwrap().is_incremental_disabled());
}
/// The dirty-time-window tracker of a freshly built task must reflect the
/// filter-count and merge-threshold values given in the batching options.
#[tokio::test]
async fn test_dirty_time_windows_uses_batch_opts() {
    // Non-default values so that a tracker ignoring the options is detected.
    let custom_opts = BatchingModeOptions {
        experimental_max_filter_num_per_query: 7,
        experimental_time_window_merge_threshold: 11,
        ..Default::default()
    };

    let task = new_test_task_engine_and_plan_with_query_and_opts(
        "SELECT number, ts FROM numbers_with_ts",
        "numbers_with_ts",
        Arc::new(custom_opts),
    )
    .await
    .task;

    let state = task.state.read().unwrap();
    let dirty_windows = &state.dirty_time_windows;
    assert_eq!(7, dirty_windows.max_filter_num_per_query());
    assert_eq!(11, dirty_windows.time_window_merge_threshold());
}
/// `execute_once_serialized` must not make progress while another holder owns
/// the task's `execution_lock`, and must run to completion (here: fail on the
/// missing sink) once the lock is released.
#[tokio::test]
async fn test_execute_once_serialized_waits_for_execution_lock() {
    let TestTaskParts {
        task, query_engine, ..
    } = new_test_task_engine_and_plan_with_query(
        "SELECT number, ts FROM numbers_with_ts",
        "missing_sink",
    )
    .await;

    // `_handler` stays bound for the whole test so it is not dropped early.
    let (frontend_client, _handler) =
        FrontendClient::from_empty_grpc_handler(QueryOptions::default());
    let frontend_client = Arc::new(frontend_client);

    // Take the lock first so the spawned execution has to queue behind it.
    let held_lock = task.execution_lock.clone().lock_owned().await;

    let runner = {
        let task = task.clone();
        let query_engine = query_engine.clone();
        let frontend_client = frontend_client.clone();
        tokio::spawn(async move {
            task.execute_once_serialized(&query_engine, &frontend_client, None)
                .await
        })
    };

    // Give the spawned task a chance to run; it must still be blocked.
    let grace_period = Duration::from_millis(20);
    tokio::time::sleep(grace_period).await;
    assert!(
        !runner.is_finished(),
        "execute_once_serialized should wait for execution_lock"
    );

    // Releasing the lock lets the execution proceed and hit the missing sink.
    drop(held_lock);
    let completion_deadline = Duration::from_secs(1);
    tokio::time::timeout(completion_deadline, runner)
        .await
        .expect("execute_once_serialized should finish once execution_lock is released")
        .expect("execute_once_serialized task should not panic")
        .expect_err("missing sink should fail after acquiring execution_lock");
}
async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
@@ -147,7 +234,7 @@ async fn new_time_window_test_task_with_query(query: &str) -> TestTaskParts {
query_ctx: ctx,
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
shutdown_rx: rx,
batch_opts: Arc::new(BatchingModeOptions::default()),
batch_opts: incremental_batch_opts(),
flow_eval_interval: None,
})
.unwrap();
@@ -226,6 +313,14 @@ fn dirty_range(start: i64, end: i64) -> DirtyTimeWindows {
dirty
}
/// Returns an `expire_after` value equal to the current Unix time in seconds
/// minus ten, for the retention-filter tests.
///
/// Panics with "Time went backwards" if the system clock is before the Unix
/// epoch.
fn expire_after_for_retention_filter_test() -> i64 {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("Time went backwards");
    let now_secs = since_epoch.as_secs();
    let expire_after = now_secs - 10;
    expire_after as i64
}
async fn assert_unscoped_failure_restore(
consumed_dirty_windows: DirtyTimeWindows,
current_dirty_windows: DirtyTimeWindows,
@@ -626,6 +721,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after
.await;
{
let mut state = task.state.write().unwrap();
state.disable_incremental();
state
.dirty_time_windows
.add_window(Timestamp::new_second(0), Some(Timestamp::new_second(5)));
@@ -657,7 +753,7 @@ async fn test_full_snapshot_scoped_plan_marks_checkpoint_advance_safe_only_after
}
#[tokio::test]
async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_safety() {
async fn test_incremental_plan_consumes_dirty_signal_for_checkpoint_safety() {
let TestTaskParts {
task,
query_engine,
@@ -692,6 +788,192 @@ async fn test_incremental_scoped_plan_consumes_all_dirty_windows_for_checkpoint_
assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
}
/// While the task is in `FullSnapshot` checkpoint mode with incremental reads
/// still enabled, the generated plan must drain every dirty window, allow
/// checkpoint advancement, and contain no `Filter:` node.
#[tokio::test]
async fn test_full_snapshot_seeding_for_incremental_does_not_add_dirty_window_filter() {
    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
    let TestTaskParts {
        task, query_engine, ..
    } = new_time_window_test_task_with_query(sql).await;

    // Scoped so the write guard is released before the plan is generated.
    {
        let mut state = task.state.write().unwrap();
        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
        assert!(!state.is_incremental_disabled());
        // Two disjoint dirty windows: [0s, 5s) and [30s, 35s).
        for (start, end) in [(0, 5), (30, 35)] {
            state.dirty_time_windows.add_window(
                Timestamp::new_second(start),
                Some(Timestamp::new_second(end)),
            );
        }
    }

    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
    let time_window_col =
        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
            .with_time_index(true);
    let sink_schema = Arc::new(Schema::new(vec![number_col, time_window_col]));

    // `Some(1)` caps the window count below the two dirty windows added above.
    let plan = task
        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
        .await
        .unwrap()
        .unwrap();
    let plan_text = plan.plan.to_string();

    assert!(plan.can_advance_checkpoints);
    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
    assert!(!plan_text.contains("Filter:"), "{plan_text}");
}
/// With `expire_after` configured, a plan generated in `FullSnapshot` mode
/// (incremental still enabled) must carry a `ts >= ...` retention filter while
/// still draining the dirty windows and allowing checkpoint advancement.
#[tokio::test]
async fn test_full_snapshot_seeding_applies_expire_after_retention_filter() {
    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
    let TestTaskParts {
        mut task,
        query_engine,
        ..
    } = new_time_window_test_task_with_query(sql).await;

    // Scoped so the write guard is released before the plan is generated.
    {
        let mut state = task.state.write().unwrap();
        assert_eq!(state.checkpoint_mode(), CheckpointMode::FullSnapshot);
        assert!(!state.is_incremental_disabled());
        let window_start = Timestamp::new_second(0);
        let window_end = Timestamp::new_second(5);
        state
            .dirty_time_windows
            .add_window(window_start, Some(window_end));
    }

    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
    let time_window_col =
        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
            .with_time_index(true);
    let sink_schema = Arc::new(Schema::new(vec![number_col, time_window_col]));

    // Turn on retention; the config `Arc` must not be shared at this point.
    let expire_after = expire_after_for_retention_filter_test();
    let config = Arc::get_mut(&mut task.config).expect("test task config should be uniquely owned");
    config.expire_after = Some(expire_after);

    let plan = task
        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
        .await
        .unwrap()
        .unwrap();

    assert!(plan.can_advance_checkpoints);
    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());

    let plan_text = plan.plan.to_string();
    assert!(
        plan_text.contains("Filter: ts >= TimestampMillisecond("),
        "{plan_text}"
    );
}
/// After checkpoints have been advanced, the generated plan must allow
/// checkpoint advancement and contain no `Filter:` node, even though a dirty
/// window is pending.
#[tokio::test]
async fn test_incremental_plan_does_not_add_dirty_window_filter() {
    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
    let TestTaskParts {
        task, query_engine, ..
    } = new_time_window_test_task_with_query(sql).await;

    // Scoped so the write guard is released before the plan is generated.
    {
        let mut state = task.state.write().unwrap();
        // NOTE(review): presumably this moves the task into incremental
        // checkpoint mode; confirm against `advance_checkpoints`.
        let checkpoints = HashMap::from([(1_u64, 10_u64)]);
        state.advance_checkpoints(checkpoints);
        let window_start = Timestamp::new_second(0);
        let window_end = Timestamp::new_second(5);
        state
            .dirty_time_windows
            .add_window(window_start, Some(window_end));
    }

    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
    let time_window_col =
        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
            .with_time_index(true);
    let sink_schema = Arc::new(Schema::new(vec![number_col, time_window_col]));

    let plan = task
        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
        .await
        .unwrap()
        .unwrap();
    let plan_text = plan.plan.to_string();

    assert!(plan.can_advance_checkpoints);
    assert!(!plan_text.contains("Filter:"), "{plan_text}");
}
/// With `expire_after` configured and checkpoints already advanced, the
/// generated plan must carry a `ts >= ...` retention filter, drain the dirty
/// windows, and allow checkpoint advancement.
#[tokio::test]
async fn test_incremental_delta_applies_expire_after_retention_filter() {
    let sql = "SELECT max(number) AS number, date_bin(INTERVAL '5 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";
    let TestTaskParts {
        mut task,
        query_engine,
        ..
    } = new_time_window_test_task_with_query(sql).await;

    // Scoped so the write guard is released before the plan is generated.
    {
        let mut state = task.state.write().unwrap();
        // NOTE(review): presumably this moves the task into incremental
        // checkpoint mode; confirm against `advance_checkpoints`.
        let checkpoints = HashMap::from([(1_u64, 10_u64)]);
        state.advance_checkpoints(checkpoints);
        let window_start = Timestamp::new_second(0);
        let window_end = Timestamp::new_second(5);
        state
            .dirty_time_windows
            .add_window(window_start, Some(window_end));
    }

    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
    let time_window_col =
        ColumnSchema::new("time_window", CDT::timestamp_millisecond_datatype(), false)
            .with_time_index(true);
    let sink_schema = Arc::new(Schema::new(vec![number_col, time_window_col]));

    // Turn on retention; the config `Arc` must not be shared at this point.
    let expire_after = expire_after_for_retention_filter_test();
    let config = Arc::get_mut(&mut task.config).expect("test task config should be uniquely owned");
    config.expire_after = Some(expire_after);

    let plan = task
        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, Some(1))
        .await
        .unwrap()
        .unwrap();

    assert!(plan.can_advance_checkpoints);
    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());

    let plan_text = plan.plan.to_string();
    assert!(
        plan_text.contains("Filter: ts >= TimestampMillisecond("),
        "{plan_text}"
    );
}
/// For a task whose query type is `QueryType::Tql`, plan generation must
/// still return a plan (with checkpoint advancement allowed) when the
/// dirty-window set is empty.
#[tokio::test]
async fn test_non_scoped_path_generates_plan_with_empty_dirty_signal() {
    let TestTaskParts {
        mut task,
        query_engine,
        ..
    } = new_test_task_engine_and_plan_with_query(
        "SELECT number, ts FROM numbers_with_ts",
        "missing_sink",
    )
    .await;

    // Switch the task to the TQL query type.
    let config = Arc::get_mut(&mut task.config).expect("test task config should be uniquely owned");
    config.query_type = QueryType::Tql;

    // Start from an explicitly empty dirty-window set.
    task.state.write().unwrap().dirty_time_windows.clean();

    let number_col = ColumnSchema::new("number", CDT::uint32_datatype(), false);
    let ts_col =
        ColumnSchema::new("ts", CDT::timestamp_millisecond_datatype(), false).with_time_index(true);
    let sink_schema = Arc::new(Schema::new(vec![number_col, ts_col]));

    let plan = task
        .gen_query_with_time_window(query_engine, &sink_schema, &[], false, None)
        .await
        .unwrap()
        .expect("non-scoped path should generate a plan even with an empty dirty signal");

    assert!(plan.can_advance_checkpoints);
    assert!(task.state.read().unwrap().dirty_time_windows.is_empty());
}
#[tokio::test]
async fn test_executed_query_failure_restores_scoped_dirty_windows_for_flush_path() {
let (task, plan) = new_test_task_and_plan_with_missing_sink().await;
@@ -773,7 +1055,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
query_ctx: ctx,
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
shutdown_rx: rx,
batch_opts: Arc::new(BatchingModeOptions::default()),
batch_opts: incremental_batch_opts(),
flow_eval_interval: None,
})
.unwrap();
@@ -788,10 +1070,7 @@ async fn test_prepare_plan_for_incremental_disables_on_non_aggregate() {
CheckpointMode::Incremental
);
let incremental_plan = task
.prepare_plan_for_incremental(&dml_plan, None)
.await
.unwrap();
let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap();
assert!(incremental_plan.is_none());
let state = task.state.read().unwrap();
assert!(state.is_incremental_disabled());
@@ -852,7 +1131,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite
query_ctx: ctx,
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
shutdown_rx: rx,
batch_opts: Arc::new(BatchingModeOptions::default()),
batch_opts: incremental_batch_opts(),
flow_eval_interval: None,
})
.unwrap();
@@ -866,10 +1145,7 @@ async fn test_prepare_plan_for_incremental_falls_back_without_disable_on_rewrite
CheckpointMode::Incremental
);
let incremental_plan = task
.prepare_plan_for_incremental(&dml_plan, None)
.await
.unwrap();
let incremental_plan = task.prepare_plan_for_incremental(&dml_plan).await.unwrap();
assert!(incremental_plan.is_none());
let state = task.state.read().unwrap();
assert!(!state.is_incremental_disabled());
@@ -928,7 +1204,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o
query_ctx: ctx,
catalog_manager: query_engine.engine_state().catalog_manager().clone(),
shutdown_rx: rx,
batch_opts: Arc::new(BatchingModeOptions::default()),
batch_opts: incremental_batch_opts(),
flow_eval_interval: None,
})
.unwrap();
@@ -939,7 +1215,7 @@ async fn test_prepare_plan_for_incremental_group_by_without_merge_columns_uses_o
.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
let incremental_plan = task
.prepare_plan_for_incremental(&dml_plan, None)
.prepare_plan_for_incremental(&dml_plan)
.await
.unwrap()
.expect("plain GROUP BY is incremental-safe without a rewrite");
@@ -962,7 +1238,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
task.state.write().unwrap().dirty_time_windows.set_dirty();
let plan_info = task
.gen_insert_plan(&query_engine, None)
.gen_insert_plan_unlocked(&query_engine, None)
.await
.unwrap()
.unwrap();
@@ -973,7 +1249,7 @@ async fn test_auto_created_sql_aggregate_sink_reaches_incremental_safe() {
.unwrap()
.advance_checkpoints(HashMap::from([(1_u64, 10_u64)]));
let incremental_plan = task
.prepare_plan_for_incremental(&plan_info.plan, None)
.prepare_plan_for_incremental(&plan_info.plan)
.await
.unwrap();
let incremental_safe = incremental_plan.is_some();
@@ -1078,11 +1354,11 @@ async fn test_insert_plan_matching_failure_restores_consumed_dirty_marker() {
register_number_only_sink(&query_engine, sink_table);
task.state.write().unwrap().dirty_time_windows.set_dirty();
let result = task.gen_insert_plan(&query_engine, None).await;
let result = task.gen_insert_plan_unlocked(&query_engine, None).await;
assert!(result.is_err());
let _err = match result {
Ok(_) => panic!("gen_insert_plan should fail with a sink column mismatch"),
Ok(_) => panic!("gen_insert_plan_unlocked should fail with a sink column mismatch"),
Err(err) => err,
};
let state = task.state.read().unwrap();

View File

@@ -1288,9 +1288,10 @@ async fn test_rewrite_incremental_aggregate_with_left_join() {
#[tokio::test]
async fn test_rewrite_incremental_aggregate_filters_sink_dirty_time_window() {
// This verifies the rewrite placement when callers supply an already
// inferred sink dirty-window predicate. The task-level inference rules are
// covered by `infer_sink_time_window_filter_col` tests in task.rs.
// This verifies the rewrite placement when callers supply a sink predicate.
// The production incremental flow path currently leaves sink scans
// unfiltered for correctness and relies on future dynamic filters for
// pruning.
let query_engine = create_test_query_engine();
let ctx = QueryContext::arc();
let sql = "SELECT max(number) AS number, date_bin(INTERVAL '1 second', ts) AS time_window FROM numbers_with_ts GROUP BY time_window";

View File

@@ -566,11 +566,15 @@ impl FrontendInvoker {
name: TABLE_FLOWNODE_SET_CACHE_NAME,
})?;
// TODO(auto_create_table): flow sink tables are created through a controlled
// `CREATE FLOW` path, not client writes, so they are intentionally exempt from
// the frontend's global auto-create switch. Revisit if flow should honor it.
let inserter = Arc::new(Inserter::new(
catalog_manager.clone(),
partition_manager.clone(),
node_manager.clone(),
table_flownode_cache,
true,
));
let deleter = Arc::new(Deleter::new(

View File

@@ -44,6 +44,11 @@ pub struct FrontendOptions {
pub node_id: Option<String>,
pub default_timezone: Option<String>,
pub default_column_prefix: Option<String>,
/// Server-side global switch for auto table creation on write.
/// Acts as an upper bound: when `false`, missing tables are never auto-created
/// even if a request sets the `auto_create_table` hint to `true`. When `true`
/// (default), the per-request hint still applies. Default: `true`.
pub auto_create_table: bool,
/// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
/// Set to 0 to disable the limit. Default: "0" (unlimited)
pub max_in_flight_write_bytes: ReadableSize,
@@ -82,6 +87,7 @@ impl Default for FrontendOptions {
node_id: None,
default_timezone: None,
default_column_prefix: None,
auto_create_table: true,
max_in_flight_write_bytes: ReadableSize(0),
write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
http: HttpOptions::default(),

View File

@@ -185,6 +185,7 @@ impl FrontendBuilder {
partition_manager.clone(),
node_manager.clone(),
table_flownode_cache,
self.options.auto_create_table,
));
let deleter = Arc::new(Deleter::new(
self.catalog_manager.clone(),

View File

@@ -43,7 +43,12 @@ use servers::query_handler::{
};
use session::context::QueryContextRef;
use snafu::{IntoError, ResultExt};
use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM};
use table::requests::{
OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE,
SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_TRACE_HAS_EVENTS,
SEMANTIC_TRACE_HAS_LINKS, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC,
SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1,
};
use crate::instance::Instance;
use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type;
@@ -131,12 +136,14 @@ impl OpenTelemetryProtocolHandler for Instance {
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
OTLP_METRICS_ROWS.inc_by(rows as u64);
let ctx = if !is_legacy {
let ctx = {
let mut c = (*ctx).clone();
c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
if !is_legacy {
c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
}
Arc::new(c)
} else {
ctx
};
// If the user uses the legacy path, it is by default without metric engine.
@@ -211,6 +218,15 @@ impl OpenTelemetryProtocolHandler for Instance {
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
// `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here
// reaches the context that drives table auto-create.
let ctx = {
let mut c = (*ctx).clone();
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG);
c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
Arc::new(c)
};
let opt_req = otlp::logs::to_grpc_insert_requests(
request,
pipeline,
@@ -256,6 +272,23 @@ impl Instance {
ctx: QueryContextRef,
) -> ServerResult<TraceIngestOutcome> {
let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
// Only the main span table gets the identity; the derived `_services` /
// `_operations` lookup tables keep the unstamped `ctx`.
let main_ctx = {
let mut c = (*ctx).clone();
c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE);
c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
if is_trace_v1_model {
c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1);
c.set_extension(SEMANTIC_TRACE_HAS_EVENTS, "true");
c.set_extension(SEMANTIC_TRACE_HAS_LINKS, "true");
// schema_url is row-level, so conventions is unknown at table level.
c.set_extension(SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN);
}
Arc::new(c)
};
let ingest_ctx = TraceChunkIngestContext {
pipeline_handler,
pipeline,
@@ -278,7 +311,7 @@ impl Instance {
.map(|chunk| chunk.collect::<Vec<_>>())
.collect::<Vec<_>>();
for chunk in chunks {
self.ingest_trace_chunk(&ingest_ctx, chunk, ctx.clone(), &mut ingest_state)
self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state)
.await?;
}
}

View File

@@ -440,7 +440,17 @@ impl Context {
};
let _ = self
.cache_invalidator
.invalidate(&ctx, &[CacheIdent::TableId(table_id)])
.invalidate(
&ctx,
&[
CacheIdent::TableId(table_id),
CacheIdent::TableName(TableName {
catalog_name: self.persistent_ctx.catalog_name.clone(),
schema_name: self.persistent_ctx.schema_name.clone(),
table_name: self.persistent_ctx.table_name.clone(),
}),
],
)
.await;
Ok(())
}

View File

@@ -95,10 +95,19 @@ impl State for UpdatePartitionMetadata {
let mut new_table_info = table_info_value.table_info.clone();
new_table_info.meta.partition_key_indices = partition_key_indices;
common_telemetry::info!(
"Update table partition metadata, table_id: {}, partition_key_indices: {:?}, partition_columns: {:?}",
table_id,
new_table_info.meta.partition_key_indices,
new_table_info
.meta
.partition_column_names()
.cloned()
.collect::<Vec<_>>(),
);
ctx.update_table_info(&table_info_value, table_info_value.update(new_table_info))
.await?;
// We don't invalidate cache here because the subsequent AllocateRegion step
// will update the table route and invalidate the cache accordingly.
ctx.invalidate_table_cache().await?;
Ok((
Box::new(AllocateRegion::new(self.plan_entries.clone())),

View File

@@ -50,6 +50,7 @@ datafusion-common.workspace = true
datafusion-expr.workspace = true
datatypes.workspace = true
dashmap.workspace = true
derive_more.workspace = true
dotenv.workspace = true
either.workspace = true
futures.workspace = true

View File

@@ -150,6 +150,7 @@ impl CompactionScheduler {
}
/// Schedules a compaction for the region.
/// Returns whether a compaction is scheduled.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn schedule_compaction(
&mut self,
@@ -161,7 +162,7 @@ impl CompactionScheduler {
manifest_ctx: &ManifestContextRef,
schema_metadata_manager: SchemaMetadataManagerRef,
max_parallelism: usize,
) -> Result<()> {
) -> Result<bool> {
// skip compaction if region is in staging state
let current_state = manifest_ctx.current_state();
if current_state == RegionRoleState::Leader(RegionLeaderState::Staging) {
@@ -170,7 +171,7 @@ impl CompactionScheduler {
region_id, compact_options
);
waiter.send(Ok(0));
return Ok(());
return Ok(false);
}
if let Some(status) = self.region_status.get_mut(&region_id) {
@@ -192,7 +193,7 @@ impl CompactionScheduler {
);
}
}
return Ok(());
return Ok(false);
}
// The region can compact directly.
@@ -209,7 +210,7 @@ impl CompactionScheduler {
max_parallelism,
);
let result = match self
match self
.schedule_compaction_request(request, compact_options)
.await
{
@@ -220,14 +221,12 @@ impl CompactionScheduler {
status.active_compaction = Some(active_compaction);
self.region_status.insert(region_id, status);
Ok(())
self.listener.on_compaction_scheduled(region_id);
Ok(true)
}
Ok(None) => Ok(()),
Ok(None) => Ok(false),
Err(e) => Err(e),
};
self.listener.on_compaction_scheduled(region_id);
result
}
}
// Handle pending manual compaction request for the region.
@@ -334,6 +333,27 @@ impl CompactionScheduler {
// And skip trying to schedule the next compaction task.
return pending_ddl_requests;
}
Vec::new()
}
/// Returns `true` when the scheduler tracks a status entry for `region_id`
/// and that entry currently holds an active compaction; `false` otherwise,
/// including when the region is not tracked at all.
pub(crate) fn is_compacting(&self, region_id: RegionId) -> bool {
    self.region_status
        .get(&region_id)
        .is_some_and(|status| status.active_compaction.is_some())
}
/// Schedules next compaction upon a finished compaction.
/// Returns whether the compaction is scheduled.
pub(crate) async fn schedule_next_compaction(
&mut self,
region_id: RegionId,
manifest_ctx: &ManifestContextRef,
schema_metadata_manager: SchemaMetadataManagerRef,
) -> bool {
let Some(status) = self.region_status.get_mut(&region_id) else {
return false;
};
// We should always try to compact the region until picker returns None.
let request = status.new_compaction_request(
@@ -364,20 +384,21 @@ impl CompactionScheduler {
"Successfully scheduled next compaction for region id: {}",
region_id
);
true
}
Ok(None) => {
// No further compaction tasks can be scheduled; cleanup the `CompactionStatus` for this region.
// All DDL requests and pending compaction requests have already been processed.
// Safe to remove the region from status tracking.
self.region_status.remove(&region_id);
false
}
Err(e) => {
error!(e; "Failed to schedule next compaction for region {}", region_id);
self.remove_region_on_failure(region_id, Arc::new(e));
false
}
}
Vec::new()
}
/// Notifies the scheduler that the compaction job is cancelled cooperatively.
@@ -1435,7 +1456,7 @@ mod tests {
let manifest_ctx = env
.mock_manifest_context(version_control.current().version.metadata.clone())
.await;
scheduler
let scheduled = scheduler
.schedule_compaction(
builder.region_id(),
compact_request::Options::Regular(Default::default()),
@@ -1448,6 +1469,7 @@ mod tests {
)
.await
.unwrap();
assert!(!scheduled);
let output = output_rx.await.unwrap().unwrap();
assert_eq!(output, 0);
assert!(scheduler.region_status.is_empty());
@@ -1456,7 +1478,7 @@ mod tests {
let version_control = Arc::new(builder.push_l0_file(0, 1000).build());
let (output_tx, output_rx) = oneshot::channel();
let waiter = OptionOutputTx::from(output_tx);
scheduler
let scheduled = scheduler
.schedule_compaction(
builder.region_id(),
compact_request::Options::Regular(Default::default()),
@@ -1469,11 +1491,67 @@ mod tests {
)
.await
.unwrap();
assert!(!scheduled);
let output = output_rx.await.unwrap().unwrap();
assert_eq!(output, 0);
assert!(scheduler.region_status.is_empty());
}
/// `schedule_compaction` must return `true`, submit exactly one job, and
/// start tracking the region when the picker produces a task.
#[tokio::test]
async fn test_schedule_compaction_returns_true_when_task_scheduled() {
    let job_scheduler = Arc::new(VecScheduler::default());
    let env = SchedulerEnv::new().await.scheduler(job_scheduler.clone());
    // The receiver stays bound so the channel remains open during the test.
    let (request_tx, _request_rx) = mpsc::channel(4);
    let mut scheduler = env.mock_compaction_scheduler(request_tx);

    let mut builder = VersionControlBuilder::new();
    let region_id = builder.region_id();
    let end = 1000 * 1000;

    // Five overlapping L0 files are enough for the regular picker to create a task.
    let version = builder
        .push_l0_file(0, end)
        .push_l0_file(10, end)
        .push_l0_file(50, end)
        .push_l0_file(80, end)
        .push_l0_file(90, end)
        .build();
    let version_control = Arc::new(version);

    let region_metadata = version_control.current().version.metadata.clone();
    let manifest_ctx = env.mock_manifest_context(region_metadata).await;

    let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
    schema_metadata_manager
        .register_region_table_info(
            region_id.table_id(),
            "test_table",
            "test_catalog",
            "test_schema",
            None,
            kv_backend,
        )
        .await;

    let compact_options = Options::Regular(Default::default());
    let scheduled = scheduler
        .schedule_compaction(
            region_id,
            compact_options,
            &version_control,
            &env.access_layer,
            OptionOutputTx::none(),
            &manifest_ctx,
            schema_metadata_manager,
            1,
        )
        .await
        .unwrap();

    // The boolean result is what the worker uses to decide whether to update
    // last_schedule_compaction_millis.
    assert!(scheduled);
    assert_eq!(1, job_scheduler.num_jobs());
    assert!(scheduler.region_status.contains_key(&region_id));
}
#[tokio::test]
async fn test_schedule_on_finished() {
common_telemetry::init_default_ut_logging();
@@ -1511,7 +1589,7 @@ mod tests {
let manifest_ctx = env
.mock_manifest_context(version_control.current().version.metadata.clone())
.await;
scheduler
let scheduled = scheduler
.schedule_compaction(
region_id,
compact_request::Options::Regular(Default::default()),
@@ -1525,6 +1603,7 @@ mod tests {
.await
.unwrap();
// Should schedule 1 compaction.
assert!(scheduled);
assert_eq!(1, scheduler.region_status.len());
assert_eq!(1, job_scheduler.num_jobs());
let data = version_control.current();
@@ -1543,7 +1622,7 @@ mod tests {
);
// The task is pending.
let (tx, _rx) = oneshot::channel();
scheduler
let scheduled = scheduler
.schedule_compaction(
region_id,
compact_request::Options::Regular(Default::default()),
@@ -1556,6 +1635,7 @@ mod tests {
)
.await
.unwrap();
assert!(!scheduled);
assert_eq!(1, scheduler.region_status.len());
assert_eq!(1, job_scheduler.num_jobs());
assert!(
@@ -1571,6 +1651,10 @@ mod tests {
scheduler
.on_compaction_finished(region_id, &manifest_ctx, schema_metadata_manager.clone())
.await;
let scheduled = scheduler
.schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager.clone())
.await;
assert!(scheduled);
assert_eq!(1, scheduler.region_status.len());
assert_eq!(2, job_scheduler.num_jobs());
@@ -1583,7 +1667,7 @@ mod tests {
);
let (tx, _rx) = oneshot::channel();
// The task is pending.
scheduler
let scheduled = scheduler
.schedule_compaction(
region_id,
compact_request::Options::Regular(Default::default()),
@@ -1596,6 +1680,7 @@ mod tests {
)
.await
.unwrap();
assert!(!scheduled);
assert_eq!(2, job_scheduler.num_jobs());
assert!(
!scheduler
@@ -2329,6 +2414,15 @@ mod tests {
.await;
assert!(pending_ddls.is_empty());
assert!(scheduler.region_status.contains_key(&region_id));
let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager();
// With no compactable files, next scheduling returns false and removes
// the status without creating a background task.
let scheduled = scheduler
.schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager)
.await;
assert!(!scheduled);
assert!(!scheduler.region_status.contains_key(&region_id));
}
@@ -2371,6 +2465,14 @@ mod tests {
.await;
assert!(pending_ddls.is_empty());
assert!(scheduler.region_status.contains_key(&region_id));
let (schema_metadata_manager, _kv_backend) = mock_schema_metadata_manager();
// The failing scheduler simulates a submit error; callers must see false.
let scheduled = scheduler
.schedule_next_compaction(region_id, &manifest_ctx, schema_metadata_manager)
.await;
assert!(!scheduled);
assert!(!scheduler.region_status.contains_key(&region_id));
}

View File

@@ -15,6 +15,9 @@
//! This file contains code to find sorted runs in a set of ranged items,
//! along with the best way to merge these items to satisfy the desired run count.
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use bytes::{Buf, Bytes};
use common_base::BitVec;
use common_base::readable_size::ReadableSize;
@@ -423,6 +426,133 @@ where
runs
}
/// Partitions `items` into sorted runs such that the items inside one run do not overlap.
///
/// `items` is first sorted in place with `sort_ranged_items`. Each item is then appended
/// (as a clone) to the earliest-created run whose `end` is not greater than the item's
/// `start`; if no such run exists, a new run is created for it. Two binary heaps are used
/// to find that run instead of scanning all runs for every item.
///
/// Returns an empty vector when `items` is empty. The returned runs are ordered by the
/// sequence in which they were created.
///
/// The run selection is meant to produce the same outcome as `find_sorted_runs`.
pub(crate) fn find_sorted_runs_by_time_range<T>(items: &mut [T]) -> Vec<SortedRun<T>>
where
T: Item,
{
if items.is_empty() {
return vec![];
}
sort_ranged_items(items);
use derive_more::{Eq, PartialEq};
/// `SortedRun` with a creation sequence `i`.
#[derive(PartialEq, Eq)]
struct Run<T: Item> {
i: usize,
// Excluded from the derived equality: `i` is unique per run (it is incremented before
// every `Run::new` call below), so it alone identifies a run.
#[partial_eq(skip)]
run: SortedRun<T>,
}
impl<T: Item> Run<T> {
/// Creates a run with creation sequence `i` that already contains a clone of `item`.
fn new(i: usize, item: &T) -> Run<T> {
let mut run = SortedRun::default();
run.push_item(item.clone());
Run { i, run }
}
/// Appends a clone of `item` to the underlying `SortedRun`.
fn push_item(&mut self, item: &T) {
self.run.push_item(item.clone());
}
}
impl<T: Item> PartialOrd for Run<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// Sort by run's `end` desc then `start` asc.
///
/// The comparison on `end` is reversed on purpose: `BinaryHeap` is a max-heap, so the run
/// with the smallest `end` ends up at the top of the heap. Remaining ties are broken by
/// the creation sequence `i`.
impl<T: Item> Ord for Run<T> {
fn cmp(&self, other: &Self) -> Ordering {
let l_run = &self.run;
let r_run = &other.run;
// Safety: `start` and `end` must both exist because it's guaranteed that whenever a
// `Run` is created, an item is pushed into it immediately (see its `new` method above).
// And there are no other ways to create a `Run` beyond its `new` method in this
// function's scope.
let l_end = l_run.end.unwrap();
let r_end = r_run.end.unwrap();
r_end
.cmp(&l_end)
.then_with(|| {
let l_start = l_run.start.unwrap();
let r_start = r_run.start.unwrap();
l_start.cmp(&r_start)
})
.then_with(|| self.i.cmp(&other.i))
}
}
/// Wrapper around the `Run` above, to support sorting them by their creation sequence `i`.
#[derive(PartialEq, Eq)]
struct Wrapper<T: Item>(Run<T>);
impl<T: Item> PartialOrd for Wrapper<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: Item> Ord for Wrapper<T> {
// Reversed comparison, so that the max-heap yields the smallest `i` (the earliest
// created run) first.
fn cmp(&self, other: &Self) -> Ordering {
other.0.i.cmp(&self.0.i)
}
}
// Two heaps are used to find, while iterating the items, a run that is both:
// 1. not overlapping with the item's range,
// 2. and the earliest created one.
//
// Heap 1 (`runs_sort_by_end`) stores the runs that may still overlap with the current item;
// its top is the run with the minimal "end".
//
// Heap 2 (`runs_sort_by_index`) stores the runs whose "end"s do not overlap with the current
// item; its top is the earliest created run.
//
// Finding a suitable run works like this:
// 1. move runs from heap 1 to heap 2 until the top of heap 1 overlaps with the current item;
// 2. heap 2 now holds all the runs that can accept the current item, pop its top;
// 3. the top is the earliest created run, push the current item into it;
// 4. because the run has changed, push it back to heap 1;
// 5. check the next item. Important: the runs in heap 2 do not need to be pushed back to
// heap 1, because the items are sorted by "start". When checking the next item, the runs
// in heap 2 all have "end"s not greater than the next item's "start".
//
// Heap 2 exists only to align the run selection with the outcome of the original
// `find_sorted_runs` implementation. If the only required invariant were that each run
// holds non-overlapping items, heap 2 could be dropped and the code would be simpler.
let mut runs_sort_by_end = BinaryHeap::<Run<T>>::new();
let mut runs_sort_by_index = BinaryHeap::<Wrapper<T>>::new();
// Creation sequence counter; the first run gets `i == 1`.
let mut i = 0;
for item in items {
let (start, _) = item.range();
// Step 1: every run ending at or before `start` can accept this item.
while let Some(run) = runs_sort_by_end.pop_if(|x| x.run.end.unwrap() <= start) {
runs_sort_by_index.push(Wrapper(run));
}
// Steps 2-4: reuse the earliest created acceptable run, or start a new run when none
// is available.
let Some(mut run) = runs_sort_by_index.pop() else {
i += 1;
runs_sort_by_end.push(Run::new(i, item));
continue;
};
run.0.push_item(item);
runs_sort_by_end.push(run.0);
}
// Gather the runs from both heaps and restore their creation order.
let mut runs = runs_sort_by_end.into_vec();
runs.extend(runs_sort_by_index.into_vec().into_iter().map(|x| x.0));
runs.sort_unstable_by_key(|run| run.i);
runs.into_iter().map(|x| x.run).collect()
}
/// Finds a set of files with minimum penalty to merge that can reduce the total num of runs.
/// The penalty of merging is defined as the size of all overlapping files between two runs.
pub fn reduce_runs<T: Item>(mut runs: Vec<SortedRun<T>>) -> Vec<T> {
@@ -599,6 +729,8 @@ mod tests {
expected_runs: &[Vec<(i64, i64)>],
) -> Vec<SortedRun<MockFile>> {
let mut files = build_items(ranges);
let mut files_clone = files.clone();
let runs = find_sorted_runs(&mut files);
let result_file_ranges: Vec<Vec<_>> = runs
@@ -606,6 +738,13 @@ mod tests {
.map(|r| r.items.iter().map(|f| f.range()).collect())
.collect();
assert_eq!(&expected_runs, &result_file_ranges);
let runs_by_time_range = find_sorted_runs_by_time_range(&mut files_clone);
let results: Vec<Vec<_>> = runs_by_time_range
.iter()
.map(|r| r.items.iter().map(|f| f.range()).collect())
.collect();
assert_eq!(&expected_runs, &results);
runs
}

View File

@@ -22,14 +22,15 @@ use common_telemetry::{debug, info};
use common_time::Timestamp;
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use rayon::prelude::*;
use store_api::storage::RegionId;
use crate::compaction::buckets::infer_time_bucket;
use crate::compaction::compactor::CompactionRegion;
use crate::compaction::picker::{Picker, PickerOutput};
use crate::compaction::run::{
FileGroup, Item, Ranged, find_sorted_runs, merge_primary_key_ranges, merge_seq_files,
primary_key_ranges_overlap, reduce_runs,
FileGroup, Item, Ranged, find_sorted_runs, find_sorted_runs_by_time_range,
merge_primary_key_ranges, merge_seq_files, primary_key_ranges_overlap, reduce_runs,
};
use crate::compaction::{CompactionOutput, get_expired_ssts};
use crate::sst::file::{FileHandle, Level, overlaps};
@@ -64,11 +65,10 @@ impl TwcsPicker {
time_windows: &mut BTreeMap<i64, Window>,
active_window: Option<i64>,
) -> Vec<CompactionOutput> {
let mut output = vec![];
for (window, files) in time_windows {
if files.files.is_empty() {
continue;
}
let find_inputs = |files: &Window,
windows: &BTreeMap<i64, Window>|
-> (Vec<FileGroup>, bool) {
let window = &files.time_window;
let mut files_to_merge: Vec<_> = files.files().cloned().collect();
// Filter out large files in append mode - they won't benefit from compaction
@@ -88,13 +88,18 @@ impl TwcsPicker {
);
}
let sorted_runs = find_sorted_runs(&mut files_to_merge);
let sorted_runs = if files_to_merge.len() < 1024 {
find_sorted_runs(&mut files_to_merge)
} else {
find_sorted_runs_by_time_range(&mut files_to_merge)
};
let found_runs = sorted_runs.len();
// We only remove deletion markers if we found less than 2 runs and not in append mode.
// because after compaction there will be no overlapping files.
let filter_deleted = !files.overlapping && found_runs <= 2 && !self.append_mode;
let filter_deleted =
found_runs <= 2 && !self.append_mode && !window_has_overlap(files, windows);
if found_runs == 0 {
continue;
return (vec![], filter_deleted);
}
let mut inputs = if found_runs > 1 {
@@ -102,7 +107,7 @@ impl TwcsPicker {
} else {
let run = sorted_runs.last().unwrap();
if run.items().len() < self.trigger_file_num {
continue;
return (vec![], filter_deleted);
}
// no overlapping files, try merge small files
merge_seq_files(run.items(), self.max_output_file_size)
@@ -144,6 +149,26 @@ impl TwcsPicker {
filter_deleted,
&inputs,
);
}
(inputs, filter_deleted)
};
let mut output = vec![];
let windows = time_windows
.values()
.filter(|w| !w.files.is_empty())
.collect::<Vec<_>>();
let chunk_size = self.max_background_tasks.unwrap_or(windows.len()).max(1);
'chunks: for chunk in windows.chunks(chunk_size) {
for (inputs, filter_deleted) in chunk
.par_iter() // parallelly calculate the inputs
.map(|window| find_inputs(window, time_windows))
.collect::<Vec<_>>()
{
if inputs.is_empty() {
continue;
}
output.push(CompactionOutput {
output_level: LEVEL_COMPACTED, // always compact to l1
inputs: inputs.into_iter().flat_map(|fg| fg.into_files()).collect(),
@@ -158,7 +183,7 @@ impl TwcsPicker {
"Region ({:?}) compaction task size larger than max background tasks({}), remaining tasks discarded",
region_id, max_background_tasks
);
break;
break 'chunks;
}
}
}
@@ -268,7 +293,6 @@ struct Window {
// created from the same compaction task.
files: HashMap<Option<NonZeroU64>, FileGroup>,
time_window: i64,
overlapping: bool,
primary_key_range: Option<(bytes::Bytes, bytes::Bytes)>,
}
@@ -283,7 +307,6 @@ impl Window {
end,
files,
time_window: 0,
overlapping: false,
primary_key_range,
}
}
@@ -346,37 +369,21 @@ fn assign_to_windows<'a>(
}
}
}
if windows.is_empty() {
return BTreeMap::new();
}
windows.into_iter().collect()
}
let mut windows = windows.into_values().collect::<Vec<_>>();
windows.sort_unstable_by(|l, r| l.start.cmp(&r.start).then(l.end.cmp(&r.end).reverse()));
for idx in 0..windows.len() {
let lhs_range = windows[idx].range();
for next_idx in idx + 1..windows.len() {
let rhs_range = windows[next_idx].range();
if rhs_range.0 > lhs_range.1 {
break;
}
let windows_overlap = overlaps(&lhs_range, &rhs_range)
&& match (
&windows[idx].primary_key_range,
&windows[next_idx].primary_key_range,
) {
(Some(lhs), Some(rhs)) => primary_key_ranges_overlap(lhs, rhs),
fn window_has_overlap(this: &Window, windows: &BTreeMap<i64, Window>) -> bool {
windows
.values()
.filter(|that| this.time_window != that.time_window)
.any(|that| {
overlaps(&this.range(), &that.range()) && {
match (&this.primary_key_range, &that.primary_key_range) {
(Some(l), Some(r)) => primary_key_ranges_overlap(l, r),
_ => true,
};
if windows_overlap {
windows[idx].overlapping = true;
windows[next_idx].overlapping = true;
}
}
}
}
windows.into_iter().map(|w| (w.time_window, w)).collect()
})
}
/// Finds the latest active writing window among all files.
@@ -606,7 +613,8 @@ mod tests {
for (expected_window, overlapping, window_files) in expected_files {
let actual_window = windows.get(expected_window).unwrap();
assert_eq!(*overlapping, actual_window.overlapping);
let actual_overlapping = window_has_overlap(actual_window, &windows);
assert_eq!(*overlapping, actual_overlapping);
let mut file_ranges = actual_window
.files
.values()
@@ -744,7 +752,8 @@ mod tests {
let windows = assign_to_windows(files.iter(), 2);
assert!(!windows.get(&2).unwrap().overlapping);
let overlapping = window_has_overlap(windows.get(&2).unwrap(), &windows);
assert!(!overlapping);
}
#[test]
@@ -773,7 +782,8 @@ mod tests {
let windows = assign_to_windows(files.iter(), 2);
assert!(!windows.get(&4).unwrap().overlapping);
let overlapping = window_has_overlap(windows.get(&4).unwrap(), &windows);
assert!(!overlapping);
}
struct CompactionPickerTestCase {

View File

@@ -21,6 +21,7 @@ use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_recordbatch::DfRecordBatch;
use common_test_util::flight::encode_to_flight_data;
use common_time::Timestamp;
use common_time::util::current_time_millis;
use datatypes::arrow::array::{ArrayRef, Float64Array, StringArray, TimestampMillisecondArray};
use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
@@ -67,7 +68,8 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
default_flat_format: flat_format,
..Default::default()
};
let time_provider = Arc::new(MockTimeProvider::new(current_time_millis()));
let initial_time = current_time_millis();
let time_provider = Arc::new(MockTimeProvider::new(initial_time));
let engine = env
.create_engine_with_time(
config.clone(),
@@ -99,14 +101,22 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
.await
.unwrap();
let region = engine.get_region(region_id).unwrap();
let initial_schedule_time = region.last_schedule_compaction_millis();
assert_eq!(initial_time, initial_schedule_time);
let new_edit = || RegionEdit {
files_to_add: vec![FileMeta {
region_id: region.region_id,
file_id: FileId::random(),
level: 0,
..Default::default()
}],
let new_edit = |file_starts: &[i64]| RegionEdit {
files_to_add: file_starts
.iter()
.map(|start| FileMeta {
region_id: region.region_id,
file_id: FileId::random(),
time_range: (
Timestamp::new_millisecond(*start),
Timestamp::new_millisecond(1000 * 1000),
),
..Default::default()
})
.collect(),
files_to_remove: vec![],
timestamp_ms: None,
compaction_time_window: None,
@@ -115,19 +125,23 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
committed_sequence: None,
};
engine
.edit_region(region.region_id, new_edit())
.edit_region(region.region_id, new_edit(&[0, 10, 50, 80]))
.await
.unwrap();
// Asserts that the compaction of the region is not scheduled,
// because the minimum time interval between two compactions is not passed.
assert_eq!(rx.try_recv(), Err(oneshot::error::TryRecvError::Empty));
assert_eq!(
initial_schedule_time,
region.last_schedule_compaction_millis()
);
// Simulates the time has passed the min compaction interval,
time_provider
.set_now(current_time_millis() + config.min_compaction_interval.as_millis() as i64);
let next_schedule_time = initial_time + config.min_compaction_interval.as_millis() as i64;
time_provider.set_now(next_schedule_time);
// ... then edits the region again,
engine
.edit_region(region.region_id, new_edit())
.edit_region(region.region_id, new_edit(&[90]))
.await
.unwrap();
// ... finally asserts that the compaction of the region is scheduled.
@@ -136,6 +150,9 @@ async fn test_edit_region_schedule_compaction_with_format(flat_format: bool) {
.unwrap()
.unwrap();
assert_eq!(region_id, actual);
// Wait for the `last_schedule_compaction_millis` to update.
tokio::time::sleep(Duration::from_millis(100)).await;
assert_eq!(next_schedule_time, region.last_schedule_compaction_millis());
}
#[tokio::test]

View File

@@ -18,6 +18,7 @@
#![feature(debug_closure_helpers)]
#![feature(duration_constructors)]
#![feature(binary_heap_pop_if)]
#[cfg(any(test, feature = "test"))]
#[cfg_attr(feature = "test", allow(unused))]

View File

@@ -157,8 +157,8 @@ pub struct MitoRegion {
pub(crate) provider: Provider,
/// Last flush time in millis.
last_flush_millis: AtomicI64,
/// Last compaction time in millis.
last_compaction_millis: AtomicI64,
/// Last schedule compaction time in millis.
last_schedule_compaction_millis: AtomicI64,
/// Provider to get current time.
time_provider: TimeProviderRef,
/// The topic's latest entry id since the region's last flushing.
@@ -251,15 +251,16 @@ impl MitoRegion {
self.last_flush_millis.store(now, Ordering::Relaxed);
}
/// Returns last compaction timestamp in millis.
pub(crate) fn last_compaction_millis(&self) -> i64 {
self.last_compaction_millis.load(Ordering::Relaxed)
/// Returns last schedule compaction timestamp in millis.
pub(crate) fn last_schedule_compaction_millis(&self) -> i64 {
self.last_schedule_compaction_millis.load(Ordering::Relaxed)
}
/// Update compaction time to current time.
pub(crate) fn update_compaction_millis(&self) {
/// Update schedule compaction time to current time.
pub(crate) fn update_schedule_compaction_millis(&self) {
let now = self.time_provider.current_time_millis();
self.last_compaction_millis.store(now, Ordering::Relaxed);
self.last_schedule_compaction_millis
.store(now, Ordering::Relaxed);
}
/// Returns the table dir.
@@ -1727,7 +1728,7 @@ mod tests {
file_purger: crate::test_util::new_noop_file_purger(),
provider: Provider::noop_provider(),
last_flush_millis: Default::default(),
last_compaction_millis: Default::default(),
last_schedule_compaction_millis: Default::default(),
time_provider: Arc::new(StdTimeProvider),
topic_latest_entry_id: Default::default(),
written_bytes: Arc::new(AtomicU64::new(0)),
@@ -2084,7 +2085,7 @@ mod tests {
file_purger: crate::test_util::new_noop_file_purger(),
provider: Provider::noop_provider(),
last_flush_millis: Default::default(),
last_compaction_millis: Default::default(),
last_schedule_compaction_millis: Default::default(),
time_provider: Arc::new(StdTimeProvider),
topic_latest_entry_id: Default::default(),
written_bytes: Arc::new(AtomicU64::new(0)),

View File

@@ -345,7 +345,7 @@ impl RegionOpener {
),
provider,
last_flush_millis: AtomicI64::new(now),
last_compaction_millis: AtomicI64::new(now),
last_schedule_compaction_millis: AtomicI64::new(now),
time_provider: self.time_provider.clone(),
topic_latest_entry_id: AtomicU64::new(0),
written_bytes: Arc::new(AtomicU64::new(0)),
@@ -581,7 +581,7 @@ impl RegionOpener {
file_purger,
provider: provider.clone(),
last_flush_millis: AtomicI64::new(now),
last_compaction_millis: AtomicI64::new(now),
last_schedule_compaction_millis: AtomicI64::new(now),
time_provider: self.time_provider.clone(),
topic_latest_entry_id: AtomicU64::new(topic_latest_entry_id),
written_bytes: Arc::new(AtomicU64::new(0)),

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use api::v1::region::compact_request;
use common_telemetry::{error, info, warn};
use common_telemetry::{debug, error, info};
use store_api::logstore::LogStore;
use store_api::region_request::RegionCompactRequest;
use store_api::storage::RegionId;
@@ -80,7 +80,6 @@ impl<S> RegionWorkerLoop<S> {
return;
}
};
region.update_compaction_millis();
region.version_control.apply_edit(
Some(request.edit.clone()),
@@ -118,6 +117,31 @@ impl<S> RegionWorkerLoop<S> {
)
.await;
self.handle_ddl_requests(&mut pending_ddls).await;
if self.compaction_scheduler.is_compacting(region_id) {
return;
}
let now = self.time_provider.current_time_millis();
if now - region.last_schedule_compaction_millis()
>= self.config.min_compaction_interval.as_millis() as i64
{
debug!(
"minimal compaction interval time {:?} has passed, scheduling next compaction",
self.config.min_compaction_interval
);
if self
.compaction_scheduler
.schedule_next_compaction(
region_id,
&region.manifest_ctx,
self.schema_metadata_manager.clone(),
)
.await
{
region.update_schedule_compaction_millis();
}
}
}
pub(crate) async fn handle_compaction_cancelled(
@@ -160,9 +184,14 @@ impl<S> RegionWorkerLoop<S> {
return;
}
let now = self.time_provider.current_time_millis();
if now - region.last_compaction_millis()
if now - region.last_schedule_compaction_millis()
>= self.config.min_compaction_interval.as_millis() as i64
&& let Err(e) = self
{
debug!(
"minimal compaction interval time {:?} has passed, scheduling next compaction",
self.config.min_compaction_interval
);
match self
.compaction_scheduler
.schedule_compaction(
region.region_id,
@@ -175,11 +204,13 @@ impl<S> RegionWorkerLoop<S> {
1, // Default for automatic compaction
)
.await
{
warn!(
"Failed to schedule compaction for region: {}, err: {}",
region.region_id, e
);
{
Ok(true) => region.update_schedule_compaction_millis(),
Ok(false) => {}
Err(e) => {
error!(e; "Failed to schedule compaction for region: {}", region.region_id)
}
}
}
}
}

View File

@@ -24,7 +24,7 @@ derive_builder = { workspace = true, optional = true }
futures.workspace = true
humantime-serde.workspace = true
lazy_static.workspace = true
opendal = { git = "https://github.com/apache/opendal.git", rev = "4ad2d85296ffa6fdc2882f97d3c760ee243913f7", features = [
opendal = { version = "0.57", features = [
"layers-tracing",
"layers-prometheus",
"services-azblob",

File diff suppressed because it is too large Load Diff

View File

@@ -21,7 +21,7 @@ pub use opendal::raw::{
Access, Layer, LayeredAccess, OpDelete, OpList, OpRead, OpWrite, RpDelete, RpList, RpRead,
RpWrite, oio,
};
use opendal::raw::{OpCopy, RpCopy};
use opendal::raw::{OpCopier, OpCopy, RpCopy};
pub use opendal::{Buffer, Error, ErrorKind, Metadata, Result};
pub type MockWriterFactory = Arc<dyn Fn(&str, OpWrite, oio::Writer) -> oio::Writer + Send + Sync>;
@@ -146,6 +146,7 @@ impl<A: Access> LayeredAccess for MockAccessor<A> {
type Writer = MockWriter;
type Lister = MockLister;
type Deleter = MockDeleter;
type Copier = oio::Copier;
fn inner(&self) -> &Self::Inner {
&self.inner
@@ -222,15 +223,24 @@ impl<A: Access> LayeredAccess for MockAccessor<A> {
}
}
async fn copy(&self, from: &str, to: &str, args: OpCopy) -> Result<RpCopy> {
let Some(copy_interceptor) = self.copy_interceptor.as_ref() else {
return self.inner.copy(from, to, args).await;
};
async fn copy(
&self,
from: &str,
to: &str,
args: OpCopy,
opts: OpCopier,
) -> Result<(RpCopy, Self::Copier)> {
if let Some(result) = self
.copy_interceptor
.as_ref()
.and_then(|copy_interceptor| copy_interceptor(from, to, args.clone()))
{
return result.map(|rp_copy| (rp_copy, Box::new(()) as oio::Copier));
}
let Some(result) = copy_interceptor(from, to, args.clone()) else {
return self.inner.copy(from, to, args).await;
};
result
self.inner
.copy(from, to, args, opts)
.await
.map(|(rp_copy, copier)| (rp_copy, Box::new(copier) as oio::Copier))
}
}

View File

@@ -18,7 +18,6 @@ pub use opendal::{
FuturesAsyncWriter, Lister, Operator as ObjectStore, Reader, Result, Writer, services,
};
pub mod compat;
pub mod config;
pub mod error;
pub mod factory;

View File

@@ -63,6 +63,7 @@ use table::metadata::TableInfo;
use table::requests::{
AUTO_CREATE_TABLE_KEY, InsertRequest as TableInsertRequest, TABLE_DATA_MODEL,
TABLE_DATA_MODEL_TRACE_V1, TRACE_TABLE_PARTITIONS_HINT_KEY, VALID_TABLE_OPTION_KEYS,
is_semantic_option_key,
};
use table::table_reference::TableReference;
@@ -83,6 +84,10 @@ pub struct Inserter {
pub(crate) partition_manager: PartitionRuleManagerRef,
pub(crate) node_manager: NodeManagerRef,
pub(crate) table_flownode_set_cache: TableFlownodeSetCacheRef,
/// Server-side upper bound for auto table creation on write.
/// When `false`, missing tables are never auto-created regardless of the
/// per-request `auto_create_table` hint. When `true`, the hint still applies.
auto_create_table: bool,
}
pub type InserterRef = Arc<Inserter>;
@@ -135,12 +140,14 @@ impl Inserter {
partition_manager: PartitionRuleManagerRef,
node_manager: NodeManagerRef,
table_flownode_set_cache: TableFlownodeSetCacheRef,
auto_create_table: bool,
) -> Self {
Self {
catalog_manager,
partition_manager,
node_manager,
table_flownode_set_cache,
auto_create_table,
}
}
@@ -469,6 +476,30 @@ impl Inserter {
Ok(inserts)
}
/// Decides whether a missing table may be auto-created for this request.
///
/// Yields `Ok(None)` when creation is allowed. Otherwise yields `Ok(Some(reason))`,
/// where `reason` names the switch that forbids it, so callers can build a clearer
/// error: the server-side `auto_create_table` setting takes precedence over the
/// per-request `auto_create_table` hint (which defaults to `true` when absent).
///
/// Fails with an invalid-insert-request error when the hint is present but is not
/// a valid boolean. The hint is validated even when the server-side setting
/// already disables auto-creation.
fn auto_create_disabled_reason(&self, ctx: &QueryContextRef) -> Result<Option<&'static str>> {
    // Parse the request hint up front; a missing hint means "enabled".
    let hint_enabled = match ctx.extension(AUTO_CREATE_TABLE_KEY) {
        Some(raw) => raw.parse::<bool>().map_err(|_| {
            InvalidInsertRequestSnafu {
                reason: "`auto_create_table` hint must be a boolean",
            }
            .build()
        })?,
        None => true,
    };

    if !self.auto_create_table {
        return Ok(Some("auto-create table is disabled by frontend config"));
    }
    if !hint_enabled {
        return Ok(Some("`auto_create_table` hint is disabled"));
    }
    Ok(None)
}
/// Creates or alter tables on demand:
/// - if table does not exist, create table by inferred CreateExpr
/// - if table exist, check if schema matches. If any new column found, alter table by inferred `AlterExpr`
@@ -498,19 +529,7 @@ impl Inserter {
let schema = ctx.current_schema();
let mut table_infos = HashMap::new();
// If `auto_create_table` hint is disabled, skip creating/altering tables.
let auto_create_table_hint = ctx
.extension(AUTO_CREATE_TABLE_KEY)
.map(|v| v.parse::<bool>())
.transpose()
.map_err(|_| {
InvalidInsertRequestSnafu {
reason: "`auto_create_table` hint must be a boolean",
}
.build()
})?
.unwrap_or(true);
if !auto_create_table_hint {
if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? {
let mut instant_table_ids = HashSet::new();
for req in &requests.inserts {
let table = self
@@ -518,8 +537,8 @@ impl Inserter {
.await?
.context(InvalidInsertRequestSnafu {
reason: format!(
"Table `{}` does not exist, and `auto_create_table` hint is disabled",
req.table_name
"Table `{}` does not exist, and {}",
req.table_name, disabled_reason
),
})?;
let table_info = table.table_info();
@@ -767,6 +786,16 @@ impl Inserter {
return Ok(());
}
// Gate here too, otherwise a disabled switch would still leak the physical table.
if let Some(disabled_reason) = self.auto_create_disabled_reason(ctx)? {
return InvalidInsertRequestSnafu {
reason: format!(
"Physical table `{physical_table}` does not exist, and {disabled_reason}"
),
}
.fail();
}
let table_reference = TableReference::full(catalog_name, &schema_name, &physical_table);
info!("Physical metric table `{table_reference}` does not exist, try creating table");
@@ -1061,6 +1090,13 @@ pub fn fill_table_options_for_create(
}
}
// Semantic keys are prefix-matched, not in the fixed allowlist above.
for (key, value) in ctx.extensions() {
if is_semantic_option_key(&key) {
table_options.insert(key, value);
}
}
match create_type {
AutoCreateTableType::Logical(physical_table) => {
table_options.insert(
@@ -1333,6 +1369,7 @@ mod tests {
Cache::new(100),
kv_backend.clone(),
)),
true,
);
let alter_expr = inserter
.get_alter_table_expr_on_demand(&mut req, &table, &ctx, true, true)
@@ -1362,6 +1399,34 @@ mod tests {
assert!(!table_options.contains_key(APPEND_MODE_KEY));
}
#[test]
fn test_fill_table_options_copies_semantic_extensions() {
    use table::requests::{
        SEMANTIC_PER_TABLE_INDEX_KEY, SEMANTIC_SIGNAL_TYPE, SEMANTIC_SOURCE,
        SIGNAL_TYPE_METRIC, SOURCE_OPENTELEMETRY,
    };

    // Populate the query context with two semantic options plus the internal
    // transport key, which must NOT end up in the table options.
    let mut query_ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
    query_ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
    query_ctx.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
    query_ctx.set_extension(SEMANTIC_PER_TABLE_INDEX_KEY, "{}");
    let ctx = Arc::new(query_ctx);

    let mut table_options = Default::default();
    fill_table_options_for_create(&mut table_options, &AutoCreateTableType::Physical, &ctx);

    // Both semantic options are carried over verbatim.
    for (key, expected) in [
        (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC),
        (SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY),
    ] {
        assert_eq!(Some(expected), table_options.get(key).map(String::as_str));
    }
    // The internal transport key is filtered out.
    assert!(!table_options.contains_key(SEMANTIC_PER_TABLE_INDEX_KEY));
}
#[test]
fn test_last_non_null_create_options_preserve_default_with_append_mode_false() {
let mut ctx = QueryContext::with(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);

View File

@@ -35,7 +35,9 @@ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, is_reado
use common_catalog::{format_full_flow_name, format_full_table_name};
use common_error::ext::BoxedError;
use common_meta::cache_invalidator::Context;
use common_meta::ddl::create_flow::{DEFER_ON_MISSING_SOURCE_KEY, FlowType};
use common_meta::ddl::create_flow::{
DEFER_ON_MISSING_SOURCE_KEY, FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY, FlowType,
};
use common_meta::instruction::CacheIdent;
use common_meta::key::schema_name::{SchemaName, SchemaNameKey};
use common_meta::procedure_executor::ExecutorContext;
@@ -114,7 +116,10 @@ struct DdlSubmitOptions {
timeout: Duration,
}
const ALLOWED_FLOW_OPTIONS: [&str; 1] = [DEFER_ON_MISSING_SOURCE_KEY];
const ALLOWED_FLOW_OPTIONS: [&str; 2] = [
DEFER_ON_MISSING_SOURCE_KEY,
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY,
];
fn build_procedure_id_output(procedure_id: Vec<u8>) -> Result<Output> {
let procedure_id = String::from_utf8_lossy(&procedure_id).to_string();
@@ -187,7 +192,9 @@ fn validate_and_normalize_flow_options(
}
let normalized_value = match key.as_str() {
DEFER_ON_MISSING_SOURCE_KEY => normalize_flow_bool_option(&key, &value)?,
DEFER_ON_MISSING_SOURCE_KEY | FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY => {
normalize_flow_bool_option(&key, &value)?
}
_ => {
return InvalidSqlSnafu {
err_msg: format!(
@@ -2478,12 +2485,23 @@ mod test {
#[test]
fn test_validate_and_normalize_flow_options_valid() {
let options =
HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string())]);
let options = HashMap::from([
(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "TRUE".to_string()),
(
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
"FALSE".to_string(),
),
]);
assert_eq!(
validate_and_normalize_flow_options(options).unwrap(),
HashMap::from([(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),)])
HashMap::from([
(DEFER_ON_MISSING_SOURCE_KEY.to_string(), "true".to_string(),),
(
FLOW_EXPERIMENTAL_ENABLE_INCREMENTAL_READ_KEY.to_string(),
"false".to_string(),
)
])
);
}
@@ -2497,7 +2515,7 @@ mod test {
assert!(
err.to_string()
.contains("unknown flow option 'foo', supported options: defer_on_missing_source")
.contains("unknown flow option 'foo', supported options: defer_on_missing_source, experimental_enable_incremental_read")
);
}

View File

@@ -14,6 +14,7 @@
use common_base::Plugins;
use datanode::config::DatanodeOptions;
use datanode::datanode::Datanode;
use datanode::error::Result;
use crate::options::PluginOptions;
@@ -28,6 +29,6 @@ pub async fn setup_datanode_plugins(
Ok(())
}
pub async fn start_datanode_plugins(_plugins: Plugins) -> Result<()> {
pub async fn start_datanode_plugins(_instance: &Datanode) -> Result<()> {
Ok(())
}

View File

@@ -13,8 +13,8 @@
// limitations under the License.
use common_base::Plugins;
use flow::FlownodeOptions;
use flow::error::Result;
use flow::{FlownodeInstance, FlownodeOptions};
use crate::options::PluginOptions;
@@ -27,7 +27,7 @@ pub async fn setup_flownode_plugins(
Ok(())
}
pub async fn start_flownode_plugins(_plugins: Plugins) -> Result<()> {
pub async fn start_flownode_plugins(_instance: &FlownodeInstance) -> Result<()> {
Ok(())
}

View File

@@ -17,6 +17,7 @@ use common_base::Plugins;
use common_meta::cache::CacheRegistryBuilder;
use frontend::error::{IllegalAuthConfigSnafu, Result};
use frontend::frontend::FrontendOptions;
use frontend::instance::Instance;
use snafu::ResultExt;
use crate::options::PluginOptions;
@@ -51,7 +52,7 @@ pub async fn setup_frontend_dynamic_plugins(
Ok(())
}
pub async fn start_frontend_plugins(_plugins: Plugins) -> Result<()> {
pub async fn start_frontend_plugins(_instance: &Instance) -> Result<()> {
Ok(())
}

View File

@@ -26,4 +26,4 @@ pub use flownode::{setup_flownode_plugins, start_flownode_plugins};
pub use frontend::{setup_frontend_plugins, start_frontend_plugins};
pub use meta_srv::{setup_metasrv_plugins, start_metasrv_plugins};
pub use options::PluginOptions;
pub use standalone::{setup_standalone_plugins, start_standalone_plugins};
pub use standalone::setup_standalone_plugins;

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use common_base::Plugins;
use meta_srv::bootstrap::MetasrvInstance;
use meta_srv::error::Result;
use meta_srv::metasrv::MetasrvOptions;
@@ -27,6 +28,6 @@ pub async fn setup_metasrv_plugins(
Ok(())
}
pub async fn start_metasrv_plugins(_plugins: Plugins) -> Result<()> {
pub async fn start_metasrv_plugins(_instance: &MetasrvInstance) -> Result<()> {
Ok(())
}

View File

@@ -31,10 +31,6 @@ pub async fn setup_standalone_plugins(
Ok(())
}
pub async fn start_standalone_plugins(_plugins: Plugins) -> Result<()> {
Ok(())
}
/// Allows standalone plugins to add cache invalidators to the layered registry.
pub fn configure_cache_registry(_plugins: &Plugins) -> Option<CacheRegistryBuilder> {
None

View File

@@ -31,6 +31,10 @@ use prost::Message;
use serde::{Deserialize, Serialize};
use session::context::{Channel, QueryContext};
use snafu::prelude::*;
use table::requests::{
METADATA_QUALITY_INFERRED, SEMANTIC_METRIC_METADATA_QUALITY, SEMANTIC_SIGNAL_TYPE,
SEMANTIC_SOURCE, SIGNAL_TYPE_METRIC, SOURCE_PROMETHEUS,
};
use crate::error::{self, InternalSnafu, PipelineSnafu, Result};
use crate::http::extractor::PipelineInfo;
@@ -108,6 +112,13 @@ pub async fn remote_write(
.clone()
.unwrap_or_else(|| GREPTIME_PHYSICAL_TABLE.to_string());
query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table.clone());
// Stamp the Prometheus metric identity here, before `as_req_iter` splits into the
// batched and direct write paths, so both inherit it (the batched path bypasses
// `PromStoreProtocolHandler::write`). Prom RW v1 metadata is weak, so the type is
// inferred from naming.
query_ctx.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
query_ctx.set_extension(SEMANTIC_SOURCE, SOURCE_PROMETHEUS);
query_ctx.set_extension(SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_INFERRED);
let query_ctx = Arc::new(query_ctx);
let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_WRITE_ELAPSED
.with_label_values(&[db.as_str()])

View File

@@ -40,7 +40,7 @@ use snafu::{ResultExt, ensure};
use sqlparser::dialect::Dialect;
use sqlparser::keywords::Keyword;
use sqlparser::parser::Parser;
use table::requests::validate_table_option;
use table::requests::{SEMANTIC_PREFIX, validate_semantic_option, validate_table_option};
use crate::error::{
ConvertToLogicalExpressionSnafu, InvalidSqlSnafu, InvalidTableOptionSnafu, ParseSqlValueSnafu,
@@ -395,8 +395,18 @@ pub fn parse_with_options(parser: &mut Parser) -> Result<OptionMap> {
.into_iter()
.map(parse_option_string)
.collect::<Result<HashMap<String, OptionValue>>>()?;
for key in options.keys() {
ensure!(validate_table_option(key), InvalidTableOptionSnafu { key });
for (key, value) in &options {
if key.starts_with(SEMANTIC_PREFIX) {
// Semantic keys are whitelisted and value-checked against their domain,
// so a user cannot set an unknown key or an out-of-range value.
let value = value.as_string().unwrap_or_default();
ensure!(
validate_semantic_option(key, value),
InvalidTableOptionSnafu { key }
);
} else {
ensure!(validate_table_option(key), InvalidTableOptionSnafu { key });
}
}
Ok(OptionMap::new(options))
}

View File

@@ -868,7 +868,25 @@ ENGINE=mito
";
let result =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default());
assert_matches!(result, Err(Error::InvalidTableOption { .. }))
assert_matches!(result, Err(Error::InvalidTableOption { .. }));
// A whitelisted semantic key with an in-domain value is accepted.
let semantic = |with: &str| {
let sql =
format!("create table demo(host string, ts timestamp time index) with({with});");
ParserContext::create_with_dialect(&sql, &GreptimeDbDialect {}, ParseOptions::default())
};
assert!(semantic("'greptime.semantic.signal_type'='metric'").is_ok());
// An out-of-domain value is rejected.
assert_matches!(
semantic("'greptime.semantic.signal_type'='spans'"),
Err(Error::InvalidTableOption { .. })
);
// An unknown key under the semantic prefix is rejected.
assert_matches!(
semantic("'greptime.semantic.bogus'='x'"),
Err(Error::InvalidTableOption { .. })
);
}
#[test]

View File

@@ -38,6 +38,10 @@ pub struct StandaloneOptions {
pub enable_telemetry: bool,
pub default_timezone: Option<String>,
pub default_column_prefix: Option<String>,
/// Server-side global switch for auto table creation on write.
/// Upper bound: when `false`, missing tables are never auto-created even if a
/// request sets the `auto_create_table` hint to `true`. Default: `true`.
pub auto_create_table: bool,
/// Maximum total memory for all concurrent write request bodies and messages (HTTP, gRPC, Flight).
/// Set to 0 to disable the limit. Default: "0" (unlimited)
pub max_in_flight_write_bytes: ReadableSize,
@@ -77,6 +81,7 @@ impl Default for StandaloneOptions {
enable_telemetry: true,
default_timezone: None,
default_column_prefix: None,
auto_create_table: true,
max_in_flight_write_bytes: ReadableSize(0),
write_bytes_exhausted_policy: OnExhaustedPolicy::default(),
http: HttpOptions::default(),
@@ -130,6 +135,7 @@ impl StandaloneOptions {
let cloned_opts = self.clone();
FrontendOptions {
default_timezone: cloned_opts.default_timezone,
auto_create_table: cloned_opts.auto_create_table,
max_in_flight_write_bytes: cloned_opts.max_in_flight_write_bytes,
write_bytes_exhausted_policy: cloned_opts.write_bytes_exhausted_policy,
http: cloned_opts.http,

View File

@@ -48,6 +48,9 @@ use crate::error::{ParseTableOptionSnafu, Result};
use crate::metadata::{TableId, TableVersion};
use crate::table_reference::TableReference;
mod semantic;
pub use semantic::*;
pub const FILE_TABLE_META_KEY: &str = "__private.file_table_meta";
pub const FILE_TABLE_LOCATION_KEY: &str = "location";
pub const FILE_TABLE_PATTERN_KEY: &str = "pattern";
@@ -129,6 +132,12 @@ pub fn validate_table_option(key: &str) -> bool {
return true;
}
// Semantic-layer keys are accepted only when they appear in the closed whitelist
// `SEMANTIC_OPTION_KEYS`; unknown keys under the reserved prefix are rejected.
// See the `semantic` module.
if is_semantic_option_key(key) {
return true;
}
VALID_TABLE_OPTION_KEYS.contains(&key) || VALID_DDL_OPTION_KEYS.contains(&key)
}
@@ -490,6 +499,14 @@ mod tests {
assert!(validate_table_option(STORAGE_KEY));
assert!(validate_table_option(MEMTABLE_BULK_MERGE_THRESHOLD));
assert!(!validate_table_option("foo"));
// Only whitelisted semantic keys are accepted.
assert!(validate_table_option(SEMANTIC_SIGNAL_TYPE));
assert!(validate_table_option(SEMANTIC_METRIC_TYPE));
// Unknown semantic key, near-miss, and the internal transport key are rejected.
assert!(!validate_table_option("greptime.semantic.future.key"));
assert!(!validate_table_option("greptime.semanticx"));
assert!(!validate_table_option(SEMANTIC_PER_TABLE_INDEX_KEY));
}
#[test]

View File

@@ -0,0 +1,280 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Table semantic layer vocabulary.
//!
//! A thin layer of semantic metadata attached to a table via `table_options`, so
//! machine consumers (LLM agents, alert/dashboard builders, MCP servers, ETL) can
//! align a table with the observability concept it stands for without guessing
//! from column names. See `docs/rfcs/2026-05-28-table-semantic-layer.md`.
//!
//! All public table-option keys share the [`SEMANTIC_PREFIX`] namespace and are
//! string-valued. [`is_semantic_option_key`] gates them through
//! [`crate::requests::validate_table_option`], so they are accepted both on the
//! ingestion auto-create path and on explicit `CREATE TABLE ... WITH (...)` DDL.
/// Reserved prefix for every public semantic table-option key.
pub const SEMANTIC_PREFIX: &str = "greptime.semantic.";
/// Internal `QueryContext` extension key carrying the per-table semantic index
/// (a `{table_name -> {semantic_key: value}}` JSON blob) from the ingestion
/// encode path to the auto-create site. Deliberately OUTSIDE [`SEMANTIC_PREFIX`]
/// so it is not a valid table option and never leaks into a table's options.
pub const SEMANTIC_PER_TABLE_INDEX_KEY: &str = "greptime.internal.semantic.per_table_index";
// ---- Common keys (all signals) ----
/// Signal kind: one of [`SIGNAL_TYPE_TRACE`] / [`SIGNAL_TYPE_LOG`] /
/// [`SIGNAL_TYPE_METRIC`] / [`SIGNAL_TYPE_EVENT`].
pub const SEMANTIC_SIGNAL_TYPE: &str = "greptime.semantic.signal_type";
/// Ingestion ecosystem, e.g. [`SOURCE_OPENTELEMETRY`] / [`SOURCE_PROMETHEUS`].
pub const SEMANTIC_SOURCE: &str = "greptime.semantic.source";
/// Optional protocol or SDK version string, e.g. `v2` (Prom remote write), `1.30.0`.
pub const SEMANTIC_SOURCE_VERSION: &str = "greptime.semantic.source_version";
/// Internal ingestion pipeline / data model, e.g. `greptime_trace_v1`.
pub const SEMANTIC_PIPELINE: &str = "greptime.semantic.pipeline";
// ---- Trace keys ----
/// Semantic-conventions version the rows conform to (e.g. `otel-semconv-1.27`),
/// or [`SEMANTIC_VALUE_UNKNOWN`] / [`SEMANTIC_VALUE_MIXED`] when not single-valued.
pub const SEMANTIC_TRACE_CONVENTIONS: &str = "greptime.semantic.trace.conventions";
/// Whether `span_events` are preserved on the table.
pub const SEMANTIC_TRACE_HAS_EVENTS: &str = "greptime.semantic.trace.has_events";
/// Whether `span_links` are preserved on the table.
pub const SEMANTIC_TRACE_HAS_LINKS: &str = "greptime.semantic.trace.has_links";
// ---- Metric keys (populated in Phase 2) ----
/// Instrument kind: `counter` / `gauge` / `histogram` / `summary` /
/// `updown_counter` / `gauge_histogram` / `info` / `stateset`.
pub const SEMANTIC_METRIC_TYPE: &str = "greptime.semantic.metric.type";
/// UCUM unit, e.g. `s`, `By`, `{request}`.
pub const SEMANTIC_METRIC_UNIT: &str = "greptime.semantic.metric.unit";
/// `cumulative` / `delta` (OTel only).
pub const SEMANTIC_METRIC_TEMPORALITY: &str = "greptime.semantic.metric.temporality";
/// `true` / `false` for sum / counter typed data.
pub const SEMANTIC_METRIC_MONOTONIC: &str = "greptime.semantic.metric.monotonic";
/// [`METADATA_QUALITY_DECLARED`] when the protocol stated the type, or
/// [`METADATA_QUALITY_INFERRED`] when guessed from a name suffix.
pub const SEMANTIC_METRIC_METADATA_QUALITY: &str = "greptime.semantic.metric.metadata_quality";
/// Pre-translation OTel metric name when the table name was Prometheus-ised.
pub const SEMANTIC_METRIC_ORIGINAL_NAME: &str = "greptime.semantic.metric.original_name";
// ---- Log keys (populated in Phase 3) ----
/// `otlp` / `syslog` / `custom` — which mapping to use for `severity_number`.
pub const SEMANTIC_LOG_SEVERITY_SCHEME: &str = "greptime.semantic.log.severity_scheme";
/// `string` / `json` / `mixed` — how to parse `body`.
pub const SEMANTIC_LOG_BODY_FORMAT: &str = "greptime.semantic.log.body_format";
// ---- Resource / scope preservation keys (populated in Phase 3) ----
/// JSON array string of resource attributes promoted to first-class columns.
pub const SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED: &str =
"greptime.semantic.resource.attributes_preserved";
/// `true` / `false` — whether any resource attribute was dropped at ingest.
pub const SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED: &str =
"greptime.semantic.resource.attributes_dropped";
/// `true` / `false` — whether `scope.name` / `scope.version` survive on the row.
pub const SEMANTIC_SCOPE_PRESERVED: &str = "greptime.semantic.scope.preserved";
// ---- Value constants ----
pub const SIGNAL_TYPE_TRACE: &str = "trace";
pub const SIGNAL_TYPE_LOG: &str = "log";
pub const SIGNAL_TYPE_METRIC: &str = "metric";
pub const SIGNAL_TYPE_EVENT: &str = "event";
pub const SOURCE_OPENTELEMETRY: &str = "opentelemetry";
pub const SOURCE_PROMETHEUS: &str = "prometheus";
pub const METADATA_QUALITY_DECLARED: &str = "declared";
pub const METADATA_QUALITY_INFERRED: &str = "inferred";
/// Sentinel for a key that cannot be determined at stamp time.
pub const SEMANTIC_VALUE_UNKNOWN: &str = "unknown";
/// Sentinel for a single-valued key that saw conflicting sources.
pub const SEMANTIC_VALUE_MIXED: &str = "mixed";
/// Every recognised public semantic table-option key. The set is a closed
/// whitelist: keys under [`SEMANTIC_PREFIX`] that are not listed here are rejected,
/// so an unknown key like `greptime.semantic.unknown_key` does not silently land
/// in a table's options. Adding a key to the vocabulary means adding it here.
pub const SEMANTIC_OPTION_KEYS: &[&str] = &[
SEMANTIC_SIGNAL_TYPE,
SEMANTIC_SOURCE,
SEMANTIC_SOURCE_VERSION,
SEMANTIC_PIPELINE,
SEMANTIC_TRACE_CONVENTIONS,
SEMANTIC_TRACE_HAS_EVENTS,
SEMANTIC_TRACE_HAS_LINKS,
SEMANTIC_METRIC_TYPE,
SEMANTIC_METRIC_UNIT,
SEMANTIC_METRIC_TEMPORALITY,
SEMANTIC_METRIC_MONOTONIC,
SEMANTIC_METRIC_METADATA_QUALITY,
SEMANTIC_METRIC_ORIGINAL_NAME,
SEMANTIC_LOG_SEVERITY_SCHEME,
SEMANTIC_LOG_BODY_FORMAT,
SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED,
SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED,
SEMANTIC_SCOPE_PRESERVED,
];
/// Reports whether `key` is one of the public semantic table-option keys listed
/// in [`SEMANTIC_OPTION_KEYS`].
///
/// The check is exact membership in that list, not a [`SEMANTIC_PREFIX`] test:
/// a key under the prefix that is not listed does not match, and neither does
/// the internal [`SEMANTIC_PER_TABLE_INDEX_KEY`], which sits outside the prefix.
pub fn is_semantic_option_key(key: &str) -> bool {
    SEMANTIC_OPTION_KEYS.iter().any(|known| *known == key)
}
/// Checks whether `value` is acceptable for the semantic table option `key`.
///
/// * Free-form keys (source version, pipeline, metric unit, metric original
///   name, trace conventions, preserved resource attributes) accept any
///   non-empty string.
/// * Every other recognised key accepts only a fixed set of literals. Each set
///   contains `unknown`, and some also contain `mixed`.
/// * A key that has no arm below is rejected whatever the value is.
///
/// Returns `true` when the pair is valid. An empty `value` never validates.
pub fn validate_semantic_option(key: &str, value: &str) -> bool {
    // Domain shared by every boolean-like key.
    const TRI_STATE: &[&str] = &["true", "false", "unknown"];

    let allowed: &[&str] = match key {
        // Open-value keys: only emptiness is checked.
        SEMANTIC_SOURCE_VERSION
        | SEMANTIC_PIPELINE
        | SEMANTIC_METRIC_UNIT
        | SEMANTIC_METRIC_ORIGINAL_NAME
        | SEMANTIC_TRACE_CONVENTIONS
        | SEMANTIC_RESOURCE_ATTRIBUTES_PRESERVED => return !value.is_empty(),

        // Closed-domain keys: pick the list of accepted literals.
        SEMANTIC_SIGNAL_TYPE => &["trace", "log", "metric", "event", "unknown"],
        SEMANTIC_SOURCE => &[
            "opentelemetry",
            "prometheus",
            "elasticsearch",
            "loki",
            "custom",
            "mixed",
            "unknown",
        ],
        SEMANTIC_METRIC_TYPE => &[
            "counter",
            "gauge",
            "histogram",
            "summary",
            "updown_counter",
            "gauge_histogram",
            "info",
            "stateset",
            "mixed",
            "unknown",
        ],
        SEMANTIC_METRIC_TEMPORALITY => &["cumulative", "delta", "mixed", "unknown"],
        SEMANTIC_METRIC_MONOTONIC
        | SEMANTIC_TRACE_HAS_EVENTS
        | SEMANTIC_TRACE_HAS_LINKS
        | SEMANTIC_RESOURCE_ATTRIBUTES_DROPPED
        | SEMANTIC_SCOPE_PRESERVED => TRI_STATE,
        SEMANTIC_METRIC_METADATA_QUALITY => &["declared", "inferred", "unknown"],
        SEMANTIC_LOG_SEVERITY_SCHEME => &["otlp", "syslog", "custom", "unknown"],
        SEMANTIC_LOG_BODY_FORMAT => &["string", "json", "mixed", "unknown"],

        // Not a recognised semantic key.
        _ => return false,
    };

    allowed.contains(&value)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitelist membership: listed keys match; unknown keys, near-misses and
    /// the internal transport key do not.
    #[test]
    fn test_is_semantic_option_key() {
        let whitelisted: &[&str] = &[SEMANTIC_SIGNAL_TYPE, SEMANTIC_METRIC_TYPE];
        for key in whitelisted {
            assert!(is_semantic_option_key(key), "{key} should be whitelisted");
        }

        let rejected: &[&str] = &[
            // Unknown keys under the prefix are not whitelisted.
            "greptime.semantic.future.key",
            "greptime.semantic.unknown_key",
            // Near-misses must not match.
            "greptime.semanticx",
            "semantic.signal_type",
            "table_data_model",
            // The internal transport key must never be treated as a table option.
            SEMANTIC_PER_TABLE_INDEX_KEY,
        ];
        for key in rejected {
            assert!(!is_semantic_option_key(key), "{key} should be rejected");
        }
    }

    /// Value-domain validation, expressed as accepted / rejected `(key, value)` tables.
    #[test]
    fn test_validate_semantic_option() {
        let accepted: &[(&str, &str)] = &[
            // Enum keys with in-domain values.
            (SEMANTIC_SIGNAL_TYPE, "metric"),
            (SEMANTIC_METRIC_TYPE, "counter"),
            (SEMANTIC_METRIC_TYPE, "mixed"),
            // Booleans, sentinels, open values.
            (SEMANTIC_TRACE_HAS_EVENTS, "true"),
            (SEMANTIC_METRIC_TEMPORALITY, "unknown"),
            (SEMANTIC_METRIC_UNIT, "By"),
            // Drift guard: every value stamped by the ingestion path must validate.
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE),
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC),
            (SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG),
            (SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY),
            (SEMANTIC_SOURCE, SOURCE_PROMETHEUS),
            (SEMANTIC_METRIC_METADATA_QUALITY, METADATA_QUALITY_INFERRED),
            (SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN),
        ];
        for (key, value) in accepted {
            assert!(
                validate_semantic_option(key, value),
                "{key}={value} should validate"
            );
        }

        let rejected: &[(&str, &str)] = &[
            // Enum keys reject out-of-domain values.
            (SEMANTIC_SIGNAL_TYPE, "spans"),
            (SEMANTIC_METRIC_TYPE, "bogus"),
            (SEMANTIC_TRACE_HAS_EVENTS, "yes"),
            // Open-value keys still reject the empty string.
            (SEMANTIC_METRIC_UNIT, ""),
            // Unknown key is rejected regardless of value.
            ("greptime.semantic.future.key", "x"),
        ];
        for (key, value) in rejected {
            assert!(
                !validate_semantic_option(key, value),
                "{key}={value} should be rejected"
            );
        }

        // An empty value never validates, for any whitelisted key.
        for key in SEMANTIC_OPTION_KEYS {
            assert!(
                !validate_semantic_option(key, ""),
                "empty value should never validate for {key}"
            );
        }
    }
}

View File

@@ -200,6 +200,15 @@ impl TableContext {
partitions.remove_bound(removed_idx)?;
partition_def.exprs = partitions.generate()?;
}
RepartitionExpr::AlterPartitions(partition) => {
ensure!(
self.partition.is_none(),
error::UnexpectedSnafu {
violated: format!("Table {} already has partition", self.name),
}
);
self.partition = Some(partition.partition);
}
}
Ok(self)

View File

@@ -44,6 +44,7 @@ pub struct CreateTableExprGenerator<R: Rng + 'static> {
#[builder(setter(into))]
engine: String,
partition: usize,
partition_column: bool,
if_not_exists: bool,
#[builder(setter(into))]
name: Ident,
@@ -67,6 +68,7 @@ impl<R: Rng + 'static> Default for CreateTableExprGenerator<R> {
engine: DEFAULT_ENGINE.to_string(),
if_not_exists: false,
partition: 0,
partition_column: false,
name: Ident::new(""),
with_clause: HashMap::default(),
name_generator: Box::new(MappedGenerator::new(WordGenerator, random_capitalize_map)),
@@ -95,7 +97,7 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
let mut builder = CreateTableExprBuilder::default();
let mut columns = Vec::with_capacity(self.columns);
let mut primary_keys = vec![];
let need_partible_column = self.partition > 1;
let need_partible_column = self.partition > 1 || self.partition_column;
let mut column_names = self.name_generator.choose(rng, self.columns);
if self.columns == 1 {
@@ -123,13 +125,15 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
)
.remove(0);
// Generates partition bounds.
let partition_def = generate_partition_def(
self.partition,
column.column_type.clone(),
name.clone(),
);
builder.partition(partition_def);
if self.partition > 1 {
// Generates partition bounds.
let partition_def = generate_partition_def(
self.partition,
column.column_type.clone(),
name.clone(),
);
builder.partition(partition_def);
}
columns.push(column);
}
// Generates the ts column.
@@ -178,11 +182,12 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreateTableExprGenerato
}
}
fn generate_partition_def(
pub fn generate_partition_def(
partitions: usize,
column_type: ConcreteDataType,
column_name: Ident,
) -> PartitionDef {
assert!(partitions > 1, "partitions must be greater than 1");
let bounds = generate_partition_bounds(&column_type, partitions - 1);
let partitions = SimplePartitions::new(column_name.clone(), bounds);
let partition_exprs = partitions.generate().unwrap();
@@ -193,24 +198,23 @@ fn generate_partition_def(
}
}
fn generate_metric_partition(partitions: usize) -> Option<(Column, PartitionDef)> {
if partitions <= 1 {
return None;
}
let partition_column = Column {
fn metric_partition_column() -> Column {
Column {
name: Ident::new("host"),
column_type: ConcreteDataType::string_datatype(),
options: vec![ColumnOption::PrimaryKey],
};
}
}
pub fn generate_metric_partition_def(partitions: usize) -> PartitionDef {
assert!(partitions > 1, "partitions must be greater than 1");
let partition_column = metric_partition_column();
let bounds = generate_partition_bounds(&partition_column.column_type, partitions - 1);
let partitions = SimplePartitions::new(partition_column.name.clone(), bounds);
let partition_def = PartitionDef {
PartitionDef {
columns: vec![partitions.column_name.clone()],
exprs: partitions.generate().unwrap(),
};
Some((partition_column, partition_def))
}
}
/// Generate a physical table with 2 columns: ts of TimestampType::Millisecond as time index and val of Float64Type.
@@ -223,6 +227,8 @@ pub struct CreatePhysicalTableExprGenerator<R: Rng + 'static> {
if_not_exists: bool,
#[builder(default = "0")]
partition: usize,
#[builder(default = "false")]
partition_column: bool,
#[builder(default, setter(into))]
with_clause: HashMap<String, String>,
}
@@ -252,11 +258,13 @@ impl<R: Rng + 'static> Generator<CreateTableExpr, R> for CreatePhysicalTableExpr
let mut partition = None;
let mut primary_keys = vec![];
if let Some((partition_column, partition_def)) = generate_metric_partition(self.partition) {
columns.push(partition_column);
partition = Some(partition_def);
if self.partition > 1 || self.partition_column {
columns.push(metric_partition_column());
primary_keys.push(columns.len() - 1);
}
if self.partition > 1 {
partition = Some(generate_metric_partition_def(self.partition));
}
Ok(CreateTableExpr {
table_name: self.name_generator.generate(rng),
@@ -387,6 +395,7 @@ mod tests {
use super::*;
use crate::context::TableContext;
use crate::ir::PARTIBLE_DATA_TYPES;
#[test]
fn test_float64() {
@@ -423,6 +432,18 @@ mod tests {
.unwrap();
assert_eq!(expr.columns.len(), 10);
assert!(expr.partition.is_none());
let expr = CreateTableExprGeneratorBuilder::default()
.columns(10)
.partition(1)
.partition_column(true)
.build()
.unwrap()
.generate(&mut rng)
.unwrap();
assert_eq!(expr.columns.len(), 10);
assert!(expr.partition.is_none());
assert!(PARTIBLE_DATA_TYPES.contains(&expr.columns[0].column_type));
}
#[test]
@@ -516,6 +537,25 @@ mod tests {
assert_eq!(physical_table_expr.partition.unwrap().exprs.len(), 3);
}
/// With `partition(1)` and `partition_column(true)` the generated physical table
/// has no partition definition but still carries the `host` primary-key column.
#[test]
fn test_create_physical_table_expr_generator_with_partition_column() {
    let mut rng = rand::rng();
    let generator = CreatePhysicalTableExprGeneratorBuilder::default()
        .partition(1)
        .partition_column(true)
        .if_not_exists(false)
        .build()
        .unwrap();
    let expr = generator.generate(&mut rng).unwrap();

    assert_eq!(expr.engine, "metric");
    assert!(expr.partition.is_none());
    // The partition column is appended after the two default columns.
    assert_eq!(expr.columns.len(), 3);
    assert_eq!(expr.columns[2].name, Ident::new("host"));
    assert_eq!(expr.primary_keys, vec![2]);
}
#[test]
fn test_create_logical_table_expr_generator_without_partition_column() {
let mut rng = rand::rng();

View File

@@ -30,7 +30,7 @@ use std::time::Duration;
pub use alter_expr::{AlterTableExpr, AlterTableOption};
use common_time::timestamp::TimeUnit;
use common_time::{Date, Timestamp};
pub use create_expr::{CreateDatabaseExpr, CreateTableExpr};
pub use create_expr::{CreateDatabaseExpr, CreateTableExpr, PartitionDef};
use datatypes::data_type::ConcreteDataType;
use datatypes::types::TimestampType;
use datatypes::value::Value;
@@ -40,7 +40,7 @@ use lazy_static::lazy_static;
pub use partition_expr::SimplePartitions;
use rand::Rng;
use rand::seq::{IndexedRandom, SliceRandom};
pub use repartition_expr::RepartitionExpr;
pub use repartition_expr::{AlterTablePartitionsExpr, RepartitionExpr};
use serde::{Deserialize, Serialize};
use self::insert_expr::RowValues;

View File

@@ -16,6 +16,7 @@ use partition::expr::PartitionExpr;
use serde::{Deserialize, Serialize};
use crate::ir::Ident;
use crate::ir::create_expr::PartitionDef;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SplitPartitionExpr {
@@ -34,10 +35,19 @@ pub struct MergePartitionExpr {
pub wait: bool,
}
/// Expression describing an `ALTER TABLE ... PARTITION ON COLUMNS ...` statement.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlterTablePartitionsExpr {
/// Name of the table to alter.
pub table_name: Ident,
/// Partition definition (columns and partition expressions) to apply to the table.
pub partition: PartitionDef,
/// Wait flag used when rendering the statement's wait clause. When the field is
/// missing from serialized input it is filled from `default_wait()`.
#[serde(default = "default_wait")]
pub wait: bool,
}
/// A repartition operation on a table, one variant per supported operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RepartitionExpr {
/// Split operation, described by a [`SplitPartitionExpr`].
Split(SplitPartitionExpr),
/// Merge operation, described by a [`MergePartitionExpr`].
Merge(MergePartitionExpr),
/// Applies a full partition definition to a table, described by an
/// [`AlterTablePartitionsExpr`].
AlterPartitions(AlterTablePartitionsExpr),
}
const fn default_wait() -> bool {

View File

@@ -15,7 +15,10 @@
use partition::expr::PartitionExpr;
use crate::error::Result;
use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr};
use crate::ir::create_expr::PartitionDef;
use crate::ir::repartition_expr::{
AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr,
};
use crate::translator::DslTranslator;
pub struct RepartitionExprTranslator;
@@ -59,10 +62,38 @@ impl DslTranslator<RepartitionExpr, String> for RepartitionExprTranslator {
table_name, merge_exprs, wait_clause
))
}
RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
table_name,
partition,
wait,
}) => {
let partition_clause = format_partition_clause(partition);
let wait_clause = format_wait_clause(*wait);
Ok(format!(
"ALTER TABLE {} {}{};",
table_name, partition_clause, wait_clause
))
}
}
}
}
/// Renders `partition` as a `PARTITION ON COLUMNS (...) (...)` SQL clause.
///
/// Column names are joined with `", "`. Each partition expression is converted
/// with [`format_partition_expr_sql`] and placed on its own line, separated by commas.
fn format_partition_clause(partition: &PartitionDef) -> String {
    let column_names: Vec<String> = partition
        .columns
        .iter()
        .map(|name| name.to_string())
        .collect();
    let rendered_exprs: Vec<String> = partition
        .exprs
        .iter()
        .map(format_partition_expr_sql)
        .collect();

    format!(
        "PARTITION ON COLUMNS ({}) (\n {}\n)",
        column_names.join(", "),
        rendered_exprs.join(",\n "),
    )
}
fn format_partition_expr_sql(expr: &PartitionExpr) -> String {
expr.to_parser_expr().to_string()
}
@@ -79,9 +110,15 @@ fn format_wait_clause(wait: bool) -> String {
mod tests {
use datatypes::value::Value;
use partition::expr::col;
use sql::dialect::GreptimeDbDialect;
use sql::parser::{ParseOptions, ParserContext};
use super::RepartitionExprTranslator;
use crate::ir::repartition_expr::{MergePartitionExpr, RepartitionExpr, SplitPartitionExpr};
use crate::ir::Ident;
use crate::ir::create_expr::PartitionDef;
use crate::ir::repartition_expr::{
AlterTablePartitionsExpr, MergePartitionExpr, RepartitionExpr, SplitPartitionExpr,
};
use crate::translator::DslTranslator;
#[test]
@@ -149,4 +186,61 @@ mod tests {
);"#;
assert_eq!(sql, expected);
}
// Translates a `RepartitionExpr::AlterPartitions` with `wait: true` and three range
// expressions on `id`; the expected SQL contains no `WITH (WAIT = ...)` clause.
#[test]
fn test_translate_alter_table_partitions_expr() {
let expr = RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
table_name: "demo".into(),
partition: PartitionDef {
columns: vec![Ident::new("id")],
exprs: vec![
col("id").lt(Value::Int32(10)),
col("id")
.gt_eq(Value::Int32(10))
.and(col("id").lt(Value::Int32(20))),
col("id").gt_eq(Value::Int32(20)),
],
},
wait: true,
});
let sql = RepartitionExprTranslator.translate(&expr).unwrap();
let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (id) (
id < 10,
id >= 10 AND id < 20,
id >= 20
);"#;
assert_eq!(sql, expected);
// Beyond the textual match, the statement must be accepted by the SQL parser.
assert_repartition_sql_parseable(&sql);
}
// Translates a `RepartitionExpr::AlterPartitions` with `wait: false` on a string column;
// the expected SQL ends with an explicit `WITH (WAIT = false)` clause.
#[test]
fn test_translate_alter_table_partitions_expr_wait_false() {
let expr = RepartitionExpr::AlterPartitions(AlterTablePartitionsExpr {
table_name: "demo".into(),
partition: PartitionDef {
columns: vec![Ident::new("host")],
exprs: vec![
col("host").lt(Value::from("m")),
col("host").gt_eq(Value::from("m")),
],
},
wait: false,
});
let sql = RepartitionExprTranslator.translate(&expr).unwrap();
let expected = r#"ALTER TABLE demo PARTITION ON COLUMNS (host) (
host < 'm',
host >= 'm'
) WITH (
WAIT = false
);"#;
assert_eq!(sql, expected);
// Beyond the textual match, the statement must be accepted by the SQL parser.
assert_repartition_sql_parseable(&sql);
}
/// Test helper: parses `sql` with the GreptimeDB dialect and default parse options,
/// panicking if parsing fails or if the result is not exactly one statement.
fn assert_repartition_sql_parseable(sql: &str) {
    let dialect = GreptimeDbDialect {};
    let parsed = ParserContext::create_with_dialect(sql, &dialect, ParseOptions::default());
    let statements = parsed.unwrap();
    assert_eq!(statements.len(), 1);
}
}

View File

@@ -21,7 +21,8 @@ use crate::ir::Ident;
use crate::ir::create_expr::PartitionDef;
const PARTITIONS_INFO_SCHEMA_SQL: &str = "SELECT table_catalog, table_schema, table_name, \
partition_name, partition_expression, partition_description, greptime_partition_id, \
partition_name, COALESCE(partition_expression, '') AS partition_expression, \
COALESCE(partition_description, '') AS partition_description, greptime_partition_id, \
partition_ordinal_position FROM information_schema.partitions WHERE table_name = ? \
ORDER BY partition_ordinal_position;";
@@ -91,3 +92,20 @@ pub fn assert_partitions(expected: &PartitionDef, actual: &[PartitionInfo]) -> R
Ok(())
}
/// Asserts that `actual` describes a table without partition metadata.
///
/// Two shapes are accepted: no rows at all, or exactly one row whose
/// `partition_expression` and `partition_description` are both empty.
/// Anything else yields an `Assert` error that includes the offending rows.
pub fn assert_unpartitioned(actual: &[PartitionInfo]) -> Result<()> {
    let has_no_partition_metadata = match actual {
        [] => true,
        [only] => only.partition_expression.is_empty() && only.partition_description.is_empty(),
        _ => false,
    };

    ensure!(
        has_no_partition_metadata,
        error::AssertSnafu {
            reason: format!("Expected unpartitioned table, got partitions: {actual:?}"),
        }
    );

    Ok(())
}

View File

@@ -36,14 +36,15 @@ use tests_fuzz::fake::{
use tests_fuzz::generator::Generator;
use tests_fuzz::generator::create_expr::{
CreateLogicalTableExprGeneratorBuilder, CreatePhysicalTableExprGeneratorBuilder,
generate_metric_partition_def,
};
use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
use tests_fuzz::generator::repartition_expr::{
MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
};
use tests_fuzz::ir::{
CreateTableExpr, Ident, InsertIntoExpr, RepartitionExpr, generate_random_value,
generate_unique_timestamp_for_mysql_with_clock,
AlterTablePartitionsExpr, CreateTableExpr, Ident, InsertIntoExpr, PartitionDef,
RepartitionExpr, generate_random_value, generate_unique_timestamp_for_mysql_with_clock,
};
use tests_fuzz::translator::DslTranslator;
use tests_fuzz::translator::csv::InsertExprToCsvRecordsTranslator;
@@ -94,6 +95,7 @@ fn generate_create_physical_table_expr<R: Rng + 'static>(
))))
.if_not_exists(rng.random_bool(0.5))
.partition(partitions)
.partition_column(partitions <= 1)
.build()
.unwrap()
.generate(rng)
@@ -158,12 +160,6 @@ async fn create_metric_tables<R: Rng + 'static>(
})?;
info!("Create physical table: {create_physical_sql}, result: {result:?}");
let physical_table_ctx = Arc::new(TableContext::from(&create_physical_expr));
ensure!(
physical_table_ctx.partition.is_some(),
error::AssertSnafu {
reason: "Physical metric table must have partition".to_string()
}
);
let mut logical_tables = BTreeMap::new();
let mut create_logical_sqls = HashMap::new();
@@ -436,6 +432,11 @@ fn repartition_operation<R: Rng + 'static>(
table_ctx: &TableContextRef,
rng: &mut R,
) -> Result<RepartitionExpr> {
if table_ctx.partition.is_none() {
let partition = generate_metric_partition_def(rng.random_range(2..8));
return Ok(alter_table_partitions_expr(table_ctx, partition, true));
}
let split = rng.random_bool(0.5);
if table_ctx.partition.as_ref().unwrap().exprs.len() <= 2 || split {
let expr = SplitPartitionExprGeneratorBuilder::default()
@@ -454,19 +455,35 @@ fn repartition_operation<R: Rng + 'static>(
}
}
/// Builds a [`RepartitionExpr::AlterPartitions`] that targets the table named in
/// `table_ctx`, carrying the given `partition` definition and `wait` flag.
fn alter_table_partitions_expr(
    table_ctx: &TableContextRef,
    partition: PartitionDef,
    wait: bool,
) -> RepartitionExpr {
    let table_name = table_ctx.name.clone();
    let alter = AlterTablePartitionsExpr {
        table_name,
        partition,
        wait,
    };
    RepartitionExpr::AlterPartitions(alter)
}
impl Arbitrary<'_> for FuzzInput {
fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
let mut rng = ChaChaRng::seed_from_u64(seed);
let partitions =
get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
if rng.random_bool(0.5) {
1
} else {
rng.random_range(2..8)
}
});
let max_tables = get_gt_fuzz_input_max_tables();
let tables = get_fuzz_override::<usize>("TABLES")
.unwrap_or_else(|| rng.random_range(1..=std::cmp::max(1, max_tables)));
let max_actions = get_gt_fuzz_input_max_alter_actions();
let max_actions = std::cmp::min(128, get_gt_fuzz_input_max_alter_actions());
let actions = get_fuzz_override::<usize>("ACTIONS")
.unwrap_or_else(|| rng.random_range(1..max_actions));
Ok(FuzzInput {
seed,
actions,
@@ -536,7 +553,11 @@ async fn execute_repartition_metric_table(ctx: FuzzContext, input: FuzzInput) ->
tokio::time::sleep(Duration::from_millis(100)).await;
for i in 0..input.actions {
let partition_num = physical_table_ctx.partition.as_ref().unwrap().exprs.len();
let partition_num = physical_table_ctx
.partition
.as_ref()
.map(|partition| partition.exprs.len())
.unwrap_or_default();
info!(
"partition_num: {partition_num}, action: {}/{}, table: {}, logical table num: {}",
i + 1,

View File

@@ -33,14 +33,15 @@ use tests_fuzz::fake::{
uppercase_and_keyword_backtick_map,
};
use tests_fuzz::generator::Generator;
use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def};
use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
use tests_fuzz::generator::repartition_expr::{
MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
};
use tests_fuzz::ir::{
CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue,
SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock,
AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator,
PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value,
generate_unique_timestamp_for_mysql_with_clock,
};
use tests_fuzz::translator::DslTranslator;
use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
@@ -75,8 +76,13 @@ impl Arbitrary<'_> for FuzzInput {
fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result<Self> {
let seed = get_fuzz_override::<u64>("SEED").unwrap_or(u.int_in_range(u64::MIN..=u64::MAX)?);
let mut rng = ChaChaRng::seed_from_u64(seed);
let partitions =
get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
if rng.random_bool(0.5) {
1
} else {
rng.random_range(2..8)
}
});
let max_actions = get_gt_fuzz_input_max_alter_actions();
let actions = get_fuzz_override::<usize>("ACTIONS")
.unwrap_or_else(|| rng.random_range(1..max_actions));
@@ -99,6 +105,7 @@ fn generate_create_expr<R: Rng + 'static>(
)))
.columns(5)
.partition(input.partitions)
.partition_column(input.partitions <= 1)
.engine("mito")
.ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator))
.build()
@@ -122,7 +129,7 @@ fn build_insert_expr<R: Rng + 'static>(
let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone());
let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();
let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len();
let partition_len = partitions.bounds.len() + 1;
let row = rng.random_range(partition_len..partition_len * 2);
let moved_partitions = partitions.clone();
@@ -150,6 +157,28 @@ fn build_insert_expr<R: Rng + 'static>(
insert_generator.generate(rng).unwrap()
}
/// Wraps a partition definition into a [`RepartitionExpr::AlterPartitions`] targeting
/// the table described by `table_ctx`.
///
/// * `table_ctx` - only its `name` is read (and cloned) to identify the table.
/// * `partition` - the partition definition carried by the expression.
/// * `wait` - stored verbatim in the resulting [`AlterTablePartitionsExpr`].
fn alter_table_partitions_expr(
    table_ctx: &TableContextRef,
    partition: PartitionDef,
    wait: bool,
) -> RepartitionExpr {
    let table_name = table_ctx.name.clone();
    let alter_expr = AlterTablePartitionsExpr {
        table_name,
        partition,
        wait,
    };
    RepartitionExpr::AlterPartitions(alter_expr)
}
/// Builds an alter-partitions expression whose partition definition is generated
/// from the table's first column (`table_ctx.columns[0]`).
///
/// `generate_partition_def` receives a random value drawn from `2..8` as its first
/// argument (presumably the number of partitions to generate), followed by the
/// column's type and name. Panics if the table context has no columns.
fn alter_table_partitions_expr_from_table_ctx<R: Rng + 'static>(
    table_ctx: &TableContextRef,
    rng: &mut R,
    wait: bool,
) -> RepartitionExpr {
    let first_column = table_ctx.columns[0].clone();
    let random_count = rng.random_range(2..8);
    let partition_def =
        generate_partition_def(random_count, first_column.column_type, first_column.name);
    alter_table_partitions_expr(table_ctx, partition_def, wait)
}
async fn execute_insert_with_retry(ctx: &FuzzContext, sql: &str) -> Result<()> {
let mut delay = Duration::from_millis(100);
let mut attempt = 0;
@@ -236,9 +265,36 @@ async fn execute_repartition_table(ctx: FuzzContext, input: FuzzInput) -> Result
inserted_rows: 0,
}));
let mut action_start = 0;
if table_ctx.partition.is_none() {
let expr = alter_table_partitions_expr_from_table_ctx(&table_ctx, &mut rng, true);
let translator = RepartitionExprTranslator;
let sql = translator.translate(&expr)?;
info!("Initial partition sql: {sql}");
let result = sqlx::query(&sql)
.execute(&ctx.greptime)
.await
.context(error::ExecuteQuerySnafu { sql: &sql })?;
info!("Initial partition result: {result:?}");
table_ctx = Arc::new(Arc::unwrap_or_clone(table_ctx).repartition(expr).unwrap());
shared_state.lock().unwrap().table_ctx = table_ctx.clone();
let partition_entries = validator::partition::fetch_partitions_info_schema(
&ctx.greptime,
"public".into(),
&table_ctx.name,
)
.await?;
validator::partition::assert_partitions(
table_ctx.partition.as_ref().unwrap(),
&partition_entries,
)?;
action_start = 1;
}
let writer_rng = ChaChaRng::seed_from_u64(input.seed);
let writer_task = tokio::spawn(write_loop(writer_rng, ctx.clone(), shared_state.clone()));
for i in 0..input.actions {
for i in action_start..input.actions {
let partition_num = table_ctx.partition.as_ref().unwrap().exprs.len();
info!(
"partition_num: {partition_num}, action: {}/{}",

View File

@@ -34,14 +34,15 @@ use tests_fuzz::fake::{
uppercase_and_keyword_backtick_map,
};
use tests_fuzz::generator::Generator;
use tests_fuzz::generator::create_expr::CreateTableExprGeneratorBuilder;
use tests_fuzz::generator::create_expr::{CreateTableExprGeneratorBuilder, generate_partition_def};
use tests_fuzz::generator::insert_expr::InsertExprGeneratorBuilder;
use tests_fuzz::generator::repartition_expr::{
MergePartitionExprGeneratorBuilder, SplitPartitionExprGeneratorBuilder,
};
use tests_fuzz::ir::{
CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator, RepartitionExpr, RowValue,
SimplePartitions, generate_partition_value, generate_unique_timestamp_for_mysql_with_clock,
AlterTablePartitionsExpr, CreateTableExpr, InsertIntoExpr, MySQLTsColumnTypeGenerator,
PartitionDef, RepartitionExpr, RowValue, SimplePartitions, generate_partition_value,
generate_unique_timestamp_for_mysql_with_clock,
};
use tests_fuzz::translator::DslTranslator;
use tests_fuzz::translator::mysql::create_expr::CreateTableExprTranslator;
@@ -93,13 +94,17 @@ impl Arbitrary<'_> for FuzzInput {
let mut rng = ChaChaRng::seed_from_u64(seed);
let rows = get_fuzz_override::<usize>("ROWS")
.unwrap_or_else(|| rng.random_range(2..get_gt_fuzz_input_max_rows()));
let partitions =
get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| rng.random_range(2..8));
let partitions = get_fuzz_override::<usize>("PARTITIONS").unwrap_or_else(|| {
if rng.random_bool(0.5) {
1
} else {
rng.random_range(2..8)
}
});
let chaos_delay_ms =
get_fuzz_override::<u64>("CHAOS_DELAY_MS").unwrap_or_else(|| rng.random_range(0..5000));
let chaos_hold_secs =
get_fuzz_override::<u64>("CHAOS_HOLD_SECS").unwrap_or_else(|| rng.random_range(10..20));
Ok(FuzzInput {
seed,
rows,
@@ -127,6 +132,7 @@ fn generate_create_expr<R: Rng + 'static>(
)))
.columns(5)
.partition(input.partitions)
.partition_column(input.partitions <= 1)
.engine("mito")
.ts_column_type_generator(Box::new(MySQLTsColumnTypeGenerator))
.build()
@@ -144,7 +150,7 @@ fn build_insert_expr<R: Rng + 'static>(
let ts_value_generator = generate_unique_timestamp_for_mysql_with_clock(clock.clone());
let counter = Arc::new(AtomicUsize::new(0));
let counter_clone = counter.clone();
let partition_len = table_ctx.partition.as_ref().unwrap().exprs.len();
let partition_len = partitions.bounds.len() + 1;
let moved_partitions = partitions.clone();
let insert_generator = InsertExprGeneratorBuilder::default()
.table_ctx(table_ctx.clone())
@@ -202,10 +208,12 @@ async fn create_table(ctx: &FuzzContext, expr: &CreateTableExpr) -> Result<Table
async fn insert_initial_rows<R: Rng + 'static>(
ctx: &FuzzContext,
table_ctx: &TableContextRef,
partition_def: &PartitionDef,
rng: &mut R,
rows: usize,
) -> Result<u64> {
let partitions = SimplePartitions::from_table_ctx(table_ctx).unwrap();
let partitions =
SimplePartitions::from_exprs(partition_def.columns[0].clone(), &partition_def.exprs)?;
let clock = Arc::new(Mutex::new(Timestamp::current_millis()));
let insert_expr = build_insert_expr(table_ctx, rng, &partitions, &clock, rows);
let inserted_rows = insert_expr.values_list.len() as u64;
@@ -260,6 +268,28 @@ fn repartition_operation<R: Rng + 'static>(
}
}
/// Wraps a partition definition into a [`RepartitionExpr::AlterPartitions`] targeting
/// the table described by `table_ctx`.
///
/// * `table_ctx` - only its `name` is read (and cloned) to identify the table.
/// * `partition` - the partition definition carried by the expression.
/// * `wait` - stored verbatim in the resulting [`AlterTablePartitionsExpr`].
fn alter_table_partitions_expr(
    table_ctx: &TableContextRef,
    partition: PartitionDef,
    wait: bool,
) -> RepartitionExpr {
    let table_name = table_ctx.name.clone();
    let alter_expr = AlterTablePartitionsExpr {
        table_name,
        partition,
        wait,
    };
    RepartitionExpr::AlterPartitions(alter_expr)
}
/// Builds an alter-partitions expression whose partition definition is generated
/// from the table's first column (`table_ctx.columns[0]`).
///
/// `generate_partition_def` receives a random value drawn from `2..8` as its first
/// argument (presumably the number of partitions to generate), followed by the
/// column's type and name. Panics if the table context has no columns.
fn alter_table_partitions_expr_from_table_ctx<R: Rng + 'static>(
    table_ctx: &TableContextRef,
    rng: &mut R,
    wait: bool,
) -> RepartitionExpr {
    let first_column = table_ctx.columns[0].clone();
    let random_count = rng.random_range(2..8);
    let partition_def =
        generate_partition_def(random_count, first_column.column_type, first_column.name);
    alter_table_partitions_expr(table_ctx, partition_def, wait)
}
async fn submit_repartition_procedure(ctx: &FuzzContext, expr: &RepartitionExpr) -> Result<String> {
let translator = RepartitionExprTranslator;
let sql = translator.translate(expr)?;
@@ -334,10 +364,13 @@ async fn validate_terminal_metadata(
after_table_ctx.partition.as_ref().unwrap(),
&partition_entries,
)?,
ProcedureTerminalState::Failed => validator::partition::assert_partitions(
before_table_ctx.partition.as_ref().unwrap(),
&partition_entries,
)?,
ProcedureTerminalState::Failed => {
if let Some(partition) = before_table_ctx.partition.as_ref() {
validator::partition::assert_partitions(partition, &partition_entries)?;
} else {
validator::partition::assert_unpartitioned(&partition_entries)?;
}
}
}
Ok(())
@@ -359,7 +392,21 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result
let create_expr = generate_create_expr(&input, &mut rng)?;
let before_table_ctx = create_table(&ctx, &create_expr).await?;
let inserted_rows = insert_initial_rows(&ctx, &before_table_ctx, &mut rng, input.rows).await?;
let insert_partition = create_expr.partition.clone().unwrap_or_else(|| {
generate_partition_def(
2,
before_table_ctx.columns[0].column_type.clone(),
before_table_ctx.columns[0].name.clone(),
)
});
let inserted_rows = insert_initial_rows(
&ctx,
&before_table_ctx,
&insert_partition,
&mut rng,
input.rows,
)
.await?;
validate_table_rows(&ctx, &before_table_ctx, inserted_rows).await?;
let before_entries = validator::partition::fetch_partitions_info_schema(
@@ -370,7 +417,11 @@ async fn execute_repartition_chaos(ctx: FuzzContext, input: FuzzInput) -> Result
.await?;
info!("Before repartition partition entries: {before_entries:?}");
let repartition_expr = repartition_operation(&before_table_ctx, &mut rng, false)?;
let repartition_expr = if before_table_ctx.partition.is_some() {
repartition_operation(&before_table_ctx, &mut rng, false)?
} else {
alter_table_partitions_expr_from_table_ctx(&before_table_ctx, &mut rng, false)
};
let after_table_ctx = Arc::new(
Arc::unwrap_or_clone(before_table_ctx.clone())
.repartition(repartition_expr.clone())

View File

@@ -80,6 +80,7 @@ pub struct GreptimeDbStandaloneBuilder {
default_store: Option<StorageType>,
plugin: Option<Plugins>,
slow_query_options: SlowQueryOptions,
auto_create_table: bool,
}
impl GreptimeDbStandaloneBuilder {
@@ -97,9 +98,16 @@ impl GreptimeDbStandaloneBuilder {
threshold: Duration::from_secs(1),
..Default::default()
},
auto_create_table: true,
}
}
#[must_use]
pub fn with_auto_create_table(mut self, auto_create_table: bool) -> Self {
self.auto_create_table = auto_create_table;
self
}
#[must_use]
pub fn with_default_store_type(self, store_type: StorageType) -> Self {
Self {
@@ -347,6 +355,7 @@ impl GreptimeDbStandaloneBuilder {
wal: self.metasrv_wal_config.clone().into(),
grpc: GrpcOptions::default().with_server_addr("127.0.0.1:4001"),
slow_query: self.slow_query_options.clone(),
auto_create_table: self.auto_create_table,
..StandaloneOptions::default()
};

View File

@@ -16,6 +16,7 @@ use std::env;
use std::fmt::Display;
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use auth::{DefaultPermissionChecker, PermissionCheckerRef, UserProviderRef};
use axum::Router;
@@ -49,6 +50,7 @@ use servers::http::{HttpOptions, HttpServerBuilder};
use servers::metrics_handler::MetricsHandler;
use servers::mysql::server::{MysqlServer, MysqlSpawnConfig, MysqlSpawnRef};
use servers::otel_arrow::OtelArrowServiceHandler;
use servers::pending_rows_batcher::PendingRowsBatcher;
use servers::postgres::PostgresServer;
use servers::prom_remote_write::validation::PromValidationMode;
use servers::query_handler::sql::SqlQueryHandler;
@@ -564,6 +566,24 @@ async fn run_sql(sql: &str, instance: &GreptimeDbStandalone) {
pub async fn setup_test_prom_app_with_frontend(
    store_type: StorageType,
    name: &str,
) -> (Router, TestGuard) {
    // This entry point delegates to the shared setup with the pending-rows
    // batcher switched off.
    let enable_batcher = false;
    setup_test_prom_app_with_frontend_inner(store_type, name, enable_batcher).await
}
/// Variant of [`setup_test_prom_app_with_frontend`] that turns the pending-rows
/// batcher on, so that Prometheus remote write is served by the batched
/// (metric-engine) path rather than the direct `PromStoreProtocolHandler::write`
/// path.
pub async fn setup_test_prom_app_with_frontend_batched(
    store_type: StorageType,
    name: &str,
) -> (Router, TestGuard) {
    let enable_batcher = true;
    setup_test_prom_app_with_frontend_inner(store_type, name, enable_batcher).await
}
async fn setup_test_prom_app_with_frontend_inner(
store_type: StorageType,
name: &str,
enable_batcher: bool,
) -> (Router, TestGuard) {
unsafe {
std::env::set_var("TZ", "UTC");
@@ -617,6 +637,24 @@ pub async fn setup_test_prom_app_with_frontend(
..Default::default()
};
let frontend_ref = instance.fe_instance().clone();
// Mirror the production wiring at `frontend::server`: build the batcher from the
// instance's managers. A short flush interval keeps the test responsive.
let pending_rows_batcher = if enable_batcher {
PendingRowsBatcher::try_new(
frontend_ref.partition_manager().clone(),
frontend_ref.node_manager().clone(),
frontend_ref.catalog_manager().clone(),
true,
frontend_ref.clone(),
Duration::from_millis(50),
1000,
4,
64,
64,
)
} else {
None
};
let http_server = HttpServerBuilder::new(http_opts)
.with_sql_handler(frontend_ref.clone())
.with_logs_handler(instance.fe_instance().clone())
@@ -625,7 +663,7 @@ pub async fn setup_test_prom_app_with_frontend(
Some(frontend_ref.clone()),
true,
PromValidationMode::Strict,
None,
pending_rows_batcher,
)
.with_prometheus_handler(frontend_ref)
.with_greptime_config_options(instance.opts.datanode_options().to_toml().unwrap())
@@ -649,6 +687,20 @@ pub async fn setup_grpc_server_with_user_provider(
setup_grpc_server_with(store_type, name, user_provider, None, None).await
}
/// Starts a gRPC server on top of a standalone instance built with
/// `with_auto_create_table(false)`, used to exercise the server-side global
/// switch for auto table creation.
///
/// No user provider, gRPC config or memory limiter is supplied to the server.
pub async fn setup_grpc_server_with_auto_create_table_disabled(
    store_type: StorageType,
    name: &str,
) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
    let builder = GreptimeDbStandaloneBuilder::new(name)
        .with_default_store_type(store_type)
        .with_auto_create_table(false);
    let standalone = builder.build().await;
    setup_grpc_server_for_instance(standalone, None, None, None).await
}
pub async fn setup_grpc_server_with(
store_type: StorageType,
name: &str,
@@ -657,7 +709,17 @@ pub async fn setup_grpc_server_with(
memory_limiter: Option<servers::request_memory_limiter::ServerMemoryLimiter>,
) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
let instance = setup_standalone_instance(name, store_type).await;
setup_grpc_server_for_instance(instance, user_provider, grpc_config, memory_limiter).await
}
/// Builds and starts a gRPC server on top of an already-constructed standalone
/// instance. This is the shared core behind the `setup_grpc_server_*` helpers.
async fn setup_grpc_server_for_instance(
instance: GreptimeDbStandalone,
user_provider: Option<UserProviderRef>,
grpc_config: Option<GrpcServerConfig>,
memory_limiter: Option<servers::request_memory_limiter::ServerMemoryLimiter>,
) -> (GreptimeDbStandalone, Arc<GrpcServer>) {
let runtime: Runtime = RuntimeBuilder::default()
.worker_threads(2)
.thread_name("grpc-handlers")

View File

@@ -44,7 +44,8 @@ use servers::request_memory_limiter::ServerMemoryLimiter;
use servers::server::Server;
use servers::tls::{TlsMode, TlsOption};
use tests_integration::test_util::{
StorageType, setup_grpc_server, setup_grpc_server_with, setup_grpc_server_with_user_provider,
StorageType, setup_grpc_server, setup_grpc_server_with,
setup_grpc_server_with_auto_create_table_disabled, setup_grpc_server_with_user_provider,
};
use tonic::Request;
use tonic::metadata::MetadataValue;
@@ -82,6 +83,7 @@ macro_rules! grpc_tests {
test_invalid_dbname,
test_auto_create_table,
test_auto_create_table_with_hints,
test_auto_create_table_disabled_by_config,
test_otel_arrow_auth,
test_insert_and_select,
test_dbname,
@@ -405,6 +407,81 @@ pub async fn test_auto_create_table_with_hints(store_type: StorageType) {
let _ = fe_grpc_server.shutdown().await;
}
/// Checks that the frontend-wide switch is an upper bound that request hints
/// cannot bypass: with auto table creation disabled in the server config, a write
/// to a missing table is rejected even though the request itself carries
/// `auto_create_table=true`.
pub async fn test_auto_create_table_disabled_by_config(store_type: StorageType) {
    let (_db, fe_grpc_server) = setup_grpc_server_with_auto_create_table_disabled(
        store_type,
        "test_auto_create_table_disabled_by_config",
    )
    .await;

    let server_addr = fe_grpc_server.bind_addr().unwrap().to_string();
    let db = Database::new(
        DEFAULT_CATALOG_NAME,
        DEFAULT_SCHEMA_NAME,
        Client::with_urls(vec![server_addr]),
    );

    // Plain row insert to a missing table: must fail even with `auto_create_table=true`.
    let plain_hints: &[(&str, &str)] = &[("auto_create_table", "true")];
    // Metric path (via `physical_table` hint): must also fail without leaking the physical table.
    let metric_hints: &[(&str, &str)] = &[
        ("auto_create_table", "true"),
        ("physical_table", "greptime_physical_table"),
    ];

    // Both write paths are driven the same way and must report the same rejection.
    for (table_name, hints) in [("demo", plain_hints), ("demo_metric", metric_hints)] {
        let (host, cpu, mem, ts) = expect_data();
        let inserts = vec![InsertRequest {
            table_name: table_name.to_string(),
            columns: vec![host, cpu, mem, ts],
            row_count: 4,
        }];
        let result = db
            .insert_with_hints(InsertRequests { inserts }, hints)
            .await;
        let err = result.unwrap_err().to_string();
        assert!(
            err.contains("does not exist") && err.contains("disabled by frontend config"),
            "unexpected error: {err}"
        );
    }

    // The physical table must not have been created before the failure.
    let show_tables = db.sql("SHOW TABLES").await.unwrap();
    let batches = match show_tables.data {
        OutputData::Stream(stream) => RecordBatches::try_collect(stream).await.unwrap(),
        OutputData::RecordBatches(collected) => collected,
        OutputData::AffectedRows(_) => unreachable!(),
    };
    let tables = batches.pretty_print().unwrap();
    assert!(
        !tables.contains("greptime_physical_table"),
        "physical table leaked despite disabled auto-create:\n{tables}"
    );

    let _ = fe_grpc_server.shutdown().await;
}
fn expect_data() -> (Column, Column, Column, Column) {
// testing data:
let expected_host_col = Column {

View File

@@ -71,6 +71,7 @@ use tests_integration::test_util::{
StorageType, setup_test_http_app, setup_test_http_app_with_frontend,
setup_test_http_app_with_frontend_and_slow_query_threshold,
setup_test_http_app_with_frontend_and_user_provider, setup_test_prom_app_with_frontend,
setup_test_prom_app_with_frontend_batched,
};
use urlencoding::encode;
use yaml_rust::YamlLoader;
@@ -117,6 +118,7 @@ macro_rules! http_tests {
test_dashboard_path,
test_dashboard_api,
test_prometheus_remote_write,
test_prometheus_remote_write_batched,
test_prometheus_remote_special_labels,
test_prometheus_remote_schema_labels,
test_prometheus_remote_write_with_pipeline,
@@ -1491,6 +1493,7 @@ mem_threshold_on_create = "auto"
let expected_toml_str = format!(
r#"
enable_telemetry = true
auto_create_table = true
max_in_flight_write_bytes = "0KiB"
write_bytes_exhausted_policy = "wait"
init_regions_in_background = false
@@ -1601,6 +1604,7 @@ experimental_grpc_max_retries = 3
experimental_frontend_scan_timeout = "30s"
experimental_max_filter_num_per_query = 20
experimental_time_window_merge_threshold = 3
experimental_enable_incremental_read = false
read_preference = "Leader"
[logging]
@@ -1954,6 +1958,18 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) {
)
.await;
// Prom RW tables carry the metric identity; type is inferred from naming.
validate_data(
"prometheus_remote_write_semantic_identity",
&client,
"select count(*) from information_schema.tables where table_name = 'metric2' \
and create_options like '%greptime.semantic.signal_type=metric%' \
and create_options like '%greptime.semantic.source=prometheus%' \
and create_options like '%greptime.semantic.metric.metadata_quality=inferred%';",
"[[1]]",
)
.await;
// Write snappy encoded data with new labels
let write_request = WriteRequest {
timeseries: mock_timeseries_new_label(),
@@ -1975,6 +1991,48 @@ pub async fn test_prometheus_remote_write(store_type: StorageType) {
guard.remove_all().await;
}
/// Exercises Prometheus remote write through the pending-rows batcher, a path
/// that does not go through `PromStoreProtocolHandler::write`. The test posts one
/// snappy-compressed write request and then polls until the `metric2` table shows
/// up with the Prometheus semantic identity in its create options.
pub async fn test_prometheus_remote_write_batched(store_type: StorageType) {
    common_telemetry::init_default_ut_logging();
    let (router, mut guard) =
        setup_test_prom_app_with_frontend_batched(store_type, "prometheus_remote_write_batched")
            .await;
    let client = TestClient::new(router).await;

    // Encode the mock time series as protobuf, then snappy-compress the payload.
    let payload = WriteRequest {
        timeseries: prom_store::mock_timeseries(),
        ..Default::default()
    }
    .encode_to_vec();
    let body = prom_store::snappy_compress(&payload).expect("failed to encode snappy");

    let response = client
        .post("/v1/prometheus/write")
        .header("Content-Encoding", "snappy")
        .body(body)
        .send()
        .await;
    assert_eq!(response.status(), StatusCode::NO_CONTENT);

    // The batcher flushes asynchronously, so the table may not exist yet when the
    // write returns; poll until it is present and carries the semantic identity
    // (signal_type/source/metadata_quality).
    let semantic_identity_sql =
        "select count(*) from information_schema.tables where table_name = 'metric2' \
         and create_options like '%greptime.semantic.signal_type=metric%' \
         and create_options like '%greptime.semantic.source=prometheus%' \
         and create_options like '%greptime.semantic.metric.metadata_quality=inferred%'";
    wait_for_data(&client, semantic_identity_sql, "[[1]]").await;

    guard.remove_all().await;
}
pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
@@ -2023,7 +2081,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
expected,
)
.await;
let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f1'\\n)\"]]";
let expected = "[[\"idc3_lo_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc3_lo_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.metric.metadata_quality' = 'inferred',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'prometheus',\\n on_physical_table = 'f1'\\n)\"]]";
validate_data(
"test_prometheus_remote_special_labels_idc3_show_create_table",
&client,
@@ -2049,7 +2107,7 @@ pub async fn test_prometheus_remote_special_labels(store_type: StorageType) {
expected,
)
.await;
let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'f2'\\n)\"]]";
let expected = "[[\"idc4_local_table\",\"CREATE TABLE IF NOT EXISTS \\\"idc4_local_table\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.metric.metadata_quality' = 'inferred',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'prometheus',\\n on_physical_table = 'f2'\\n)\"]]";
validate_data(
"test_prometheus_remote_special_labels_idc4_show_create_table",
&client,
@@ -5025,6 +5083,28 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
let expected = "[[\"claude_code_cost_usage_USD_total\"],[\"claude_code_token_usage_tokens_total\"],[\"demo\"],[\"greptime_physical_table\"],[\"numbers\"]]";
validate_data("otlp_metrics_all_tables", &client, "show tables;", expected).await;
// Metric-engine logical table carries the semantic identity. Match substrings
// because extra_options ordering is not stable.
validate_data(
"otlp_metrics_semantic_identity",
&client,
"select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \
and create_options like '%greptime.semantic.signal_type=metric%' \
and create_options like '%greptime.semantic.source=opentelemetry%';",
"[[1]]",
)
.await;
// OTLP metric type is declared, so Phase 1 must not stamp `metadata_quality`
// here (Phase 2 adds it as `declared`).
validate_data(
"otlp_metrics_no_metadata_quality",
&client,
"select count(*) from information_schema.tables where table_name = 'claude_code_cost_usage_USD_total' \
and create_options like '%metadata_quality%';",
"[[0]]",
)
.await;
// CREATE TABLE IF NOT EXISTS "claude_code_cost_usage_USD_total" (
// "greptime_timestamp" TIMESTAMP(3) NOT NULL,
// "greptime_value" DOUBLE NULL,
@@ -5049,7 +5129,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
// on_physical_table = 'greptime_physical_table',
// otlp_metric_compat = 'prom'
// )
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"host_arch\\\" STRING NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"otel_scope_name\\\" STRING NULL,\\n \\\"otel_scope_schema_url\\\" STRING NULL,\\n \\\"otel_scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"host_arch\\\", \\\"job\\\", \\\"model\\\", \\\"os_version\\\", \\\"otel_scope_name\\\", \\\"otel_scope_schema_url\\\", \\\"otel_scope_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n \'comment\' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
validate_data(
"otlp_metrics_all_show_create_table",
&client,
@@ -5122,7 +5202,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
// on_physical_table = 'greptime_physical_table',
// otlp_metric_compat = 'prom'
// )
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"os_type\\\" STRING NULL,\\n \\\"os_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"os_type\\\", \\\"os_version\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
validate_data(
"otlp_metrics_show_create_table",
&client,
@@ -5186,7 +5266,7 @@ pub async fn test_otlp_metrics_new(store_type: StorageType) {
// on_physical_table = 'greptime_physical_table',
// otlp_metric_compat = 'prom'
// )
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
let expected = "[[\"claude_code_cost_usage_USD_total\",\"CREATE TABLE IF NOT EXISTS \\\"claude_code_cost_usage_USD_total\\\" (\\n \\\"greptime_timestamp\\\" TIMESTAMP(3) NOT NULL,\\n \\\"greptime_value\\\" DOUBLE NULL,\\n \\\"job\\\" STRING NULL,\\n \\\"model\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL,\\n \\\"service_version\\\" STRING NULL,\\n \\\"session_id\\\" STRING NULL,\\n \\\"terminal_type\\\" STRING NULL,\\n \\\"user_id\\\" STRING NULL,\\n TIME INDEX (\\\"greptime_timestamp\\\"),\\n PRIMARY KEY (\\\"job\\\", \\\"model\\\", \\\"service_name\\\", \\\"service_version\\\", \\\"session_id\\\", \\\"terminal_type\\\", \\\"user_id\\\")\\n)\\n\\nENGINE=metric\\nWITH(\\n 'comment' = 'Created on insertion',\\n 'greptime.semantic.signal_type' = 'metric',\\n 'greptime.semantic.source' = 'opentelemetry',\\n on_physical_table = 'greptime_physical_table',\\n otlp_metric_compat = 'prom'\\n)\"]]";
validate_data(
"otlp_metrics_show_create_table_none",
&client,
@@ -5493,7 +5573,22 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
let expected = r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#;
validate_data("otlp_traces", &client, "select * from mytable;", expected).await;
let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
// The trace v1 main table carries the trace identity (events/links preserved as
// JSON columns by the v1 model).
validate_data(
"otlp_traces_semantic_identity",
&client,
"select count(*) from information_schema.tables where table_name = 'mytable' \
and create_options like '%greptime.semantic.signal_type=trace%' \
and create_options like '%greptime.semantic.source=opentelemetry%' \
and create_options like '%greptime.semantic.pipeline=greptime_trace_v1%' \
and create_options like '%greptime.semantic.trace.has_events=true%' \
and create_options like '%greptime.semantic.trace.has_links=true%';",
"[[1]]",
)
.await;
let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 
'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
validate_data(
"otlp_traces",
&client,
@@ -5546,7 +5641,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
)
.await;
assert_eq!(StatusCode::OK, res.status());
let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
let expected_ddl = r#"[["trace_table_part1","CREATE TABLE IF NOT EXISTS \"trace_table_part1\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\n\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
validate_data(
"otlp_traces",
&client,
@@ -5583,7 +5678,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
)
.await;
assert_eq!(StatusCode::OK, res.status());
let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '4',\n trace_id >= '4' AND trace_id < '8',\n trace_id >= '8' AND trace_id < 'c',\n trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
let expected_ddl = r#"[["trace_table_part4","CREATE TABLE IF NOT EXISTS \"trace_table_part4\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '4',\n trace_id >= '4' AND trace_id < '8',\n trace_id >= '8' AND trace_id < 'c',\n trace_id >= 'c'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
validate_data(
"otlp_traces",
&client,
@@ -5620,7 +5715,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
)
.await;
assert_eq!(StatusCode::OK, res.status());
let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '08',\n trace_id >= '08' AND trace_id < '10',\n trace_id >= '10' AND trace_id < '18',\n trace_id >= '18' AND trace_id < '20',\n trace_id >= '20' AND trace_id < '28',\n trace_id >= '28' AND trace_id < '30',\n trace_id >= '30' AND trace_id < '38',\n trace_id >= '38' AND trace_id < '40',\n trace_id >= '40' AND trace_id < '48',\n trace_id >= '48' AND trace_id < '50',\n trace_id >= '50' AND trace_id < '58',\n trace_id >= '58' AND trace_id < '60',\n trace_id >= '60' AND trace_id < '68',\n trace_id >= '68' AND trace_id < '70',\n trace_id >= '70' AND trace_id < '78',\n trace_id >= '78' AND trace_id < '80',\n trace_id >= '80' AND trace_id < '88',\n trace_id >= '88' AND trace_id < '90',\n trace_id >= '90' AND trace_id < '98',\n trace_id >= '98' AND trace_id < 'a0',\n trace_id >= 'a0' AND trace_id < 'a8',\n trace_id >= 'a8' AND trace_id < 'b0',\n trace_id >= 'b0' AND trace_id < 'b8',\n trace_id 
>= 'b8' AND trace_id < 'c0',\n trace_id >= 'c0' AND trace_id < 'c8',\n trace_id >= 'c8' AND trace_id < 'd0',\n trace_id >= 'd0' AND trace_id < 'd8',\n trace_id >= 'd8' AND trace_id < 'e0',\n trace_id >= 'e0' AND trace_id < 'e8',\n trace_id >= 'e8' AND trace_id < 'f0',\n trace_id >= 'f0' AND trace_id < 'f8',\n trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
let expected_ddl = r#"[["trace_table_part32","CREATE TABLE IF NOT EXISTS \"trace_table_part32\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '08',\n trace_id >= '08' AND trace_id < '10',\n trace_id >= '10' AND trace_id < '18',\n trace_id >= '18' AND trace_id < '20',\n trace_id >= '20' AND trace_id < '28',\n trace_id >= '28' AND trace_id < '30',\n trace_id >= '30' AND trace_id < '38',\n trace_id >= '38' AND trace_id < '40',\n trace_id >= '40' AND trace_id < '48',\n trace_id >= '48' AND trace_id < '50',\n trace_id >= '50' AND trace_id < '58',\n trace_id >= '58' AND trace_id < '60',\n trace_id >= '60' AND trace_id < '68',\n trace_id >= '68' AND trace_id < '70',\n trace_id >= '70' AND trace_id < '78',\n trace_id >= '78' AND trace_id < '80',\n trace_id >= '80' AND trace_id < '88',\n trace_id >= '88' AND trace_id < '90',\n trace_id >= '90' AND trace_id < '98',\n trace_id >= '98' AND trace_id < 'a0',\n trace_id >= 'a0' AND trace_id < 'a8',\n trace_id >= 'a8' AND trace_id < 'b0',\n trace_id >= 'b0' AND trace_id < 'b8',\n trace_id 
>= 'b8' AND trace_id < 'c0',\n trace_id >= 'c0' AND trace_id < 'c8',\n trace_id >= 'c8' AND trace_id < 'd0',\n trace_id >= 'd0' AND trace_id < 'd8',\n trace_id >= 'd8' AND trace_id < 'e0',\n trace_id >= 'e0' AND trace_id < 'e8',\n trace_id >= 'e8' AND trace_id < 'f0',\n trace_id >= 'f0' AND trace_id < 'f8',\n trace_id >= 'f8'\n)\nENGINE=mito\nWITH(\n 'comment' = 'Created on insertion',\n append_mode = 'true',\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\n 'greptime.semantic.signal_type' = 'trace',\n 'greptime.semantic.source' = 'opentelemetry',\n 'greptime.semantic.trace.conventions' = 'unknown',\n 'greptime.semantic.trace.has_events' = 'true',\n 'greptime.semantic.trace.has_links' = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
validate_data(
"otlp_traces",
&client,
@@ -6283,6 +6378,17 @@ pub async fn test_otlp_logs(store_type: StorageType) {
expected,
)
.await;
// The auto-created log table carries the log identity.
validate_data(
"otlp_logs_semantic_identity",
&client,
"select count(*) from information_schema.tables where table_name = 'opentelemetry_logs' \
and create_options like '%greptime.semantic.signal_type=log%' \
and create_options like '%greptime.semantic.source=opentelemetry%';",
"[[1]]",
)
.await;
}
{
@@ -7718,7 +7824,7 @@ pub async fn test_jaeger_query_api_for_trace_v1(store_type: StorageType) {
.await;
assert_eq!(StatusCode::OK, res.status());
let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]";
let trace_table_sql = "[[\"mytable\",\"CREATE TABLE IF NOT EXISTS \\\"mytable\\\" (\\n \\\"timestamp\\\" TIMESTAMP(9) NOT NULL,\\n \\\"timestamp_end\\\" TIMESTAMP(9) NULL,\\n \\\"duration_nano\\\" BIGINT UNSIGNED NULL,\\n \\\"parent_span_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"trace_id\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_id\\\" STRING NULL,\\n \\\"span_kind\\\" STRING NULL,\\n \\\"span_name\\\" STRING NULL,\\n \\\"span_status_code\\\" STRING NULL,\\n \\\"span_status_message\\\" STRING NULL,\\n \\\"trace_state\\\" STRING NULL,\\n \\\"scope_name\\\" STRING NULL,\\n \\\"scope_version\\\" STRING NULL,\\n \\\"service_name\\\" STRING NULL SKIPPING INDEX WITH(false_positive_rate = '0.01', granularity = '10240', type = 'BLOOM'),\\n \\\"span_attributes.operation.type\\\" STRING NULL,\\n \\\"span_attributes.net.peer.ip\\\" STRING NULL,\\n \\\"span_attributes.peer.service\\\" STRING NULL,\\n \\\"span_events\\\" JSON NULL,\\n \\\"span_links\\\" JSON NULL,\\n TIME INDEX (\\\"timestamp\\\"),\\n PRIMARY KEY (\\\"service_name\\\")\\n)\\nPARTITION ON COLUMNS (\\\"trace_id\\\") (\\n trace_id < '1',\\n trace_id >= '1' AND trace_id < '2',\\n trace_id >= '2' AND trace_id < '3',\\n trace_id >= '3' AND trace_id < '4',\\n trace_id >= '4' AND trace_id < '5',\\n trace_id >= '5' AND trace_id < '6',\\n trace_id >= '6' AND trace_id < '7',\\n trace_id >= '7' AND trace_id < '8',\\n trace_id >= '8' AND trace_id < '9',\\n trace_id >= '9' AND trace_id < 'a',\\n trace_id >= 'a' AND trace_id < 'b',\\n trace_id >= 'b' AND trace_id < 'c',\\n trace_id >= 'c' AND trace_id < 'd',\\n trace_id >= 'd' AND trace_id < 'e',\\n trace_id >= 'e' AND trace_id < 'f',\\n trace_id >= 'f'\\n)\\nENGINE=mito\\nWITH(\\n 'comment' = 'Created on insertion',\\n append_mode = 'true',\\n 'greptime.semantic.pipeline' = 'greptime_trace_v1',\\n 
'greptime.semantic.signal_type' = 'trace',\\n 'greptime.semantic.source' = 'opentelemetry',\\n 'greptime.semantic.trace.conventions' = 'unknown',\\n 'greptime.semantic.trace.has_events' = 'true',\\n 'greptime.semantic.trace.has_links' = 'true',\\n table_data_model = 'greptime_trace_v1',\\n ttl = '7days'\\n)\"]]";
validate_data(
"trace_v1_create_table",
&client,

View File

@@ -1,3 +1,31 @@
-- Incremental aggregate reads only support append-only source tables because
-- update/upsert sources need old-value compensation.
CREATE TABLE incremental_non_append_input (
host_id INT,
n INT,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY(host_id)
);
Affected Rows: 0
CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
FROM
incremental_non_append_input
GROUP BY
time_window;
Error: 3001(EngineExecuteQuery), Unsupported: Flow incremental read requires append-only source table, but source table `greptime.public.incremental_non_append_input` is not append-only. Consider setting append_mode='true' on the source table or disabling experimental_enable_incremental_read
DROP TABLE incremental_non_append_input;
Affected Rows: 0
CREATE TABLE incremental_aggr_input (
host_id INT,
n INT,
@@ -9,7 +37,9 @@ CREATE TABLE incremental_aggr_input (
Affected Rows: 0
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
min(n) AS min_n,

View File

@@ -1,3 +1,25 @@
-- Incremental aggregate reads only support append-only source tables because
-- update/upsert sources need old-value compensation.
CREATE TABLE incremental_non_append_input (
host_id INT,
n INT,
ts TIMESTAMP TIME INDEX,
PRIMARY KEY(host_id)
);
CREATE FLOW incremental_non_append_flow SINK TO incremental_non_append_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
date_bin(INTERVAL '1 minute', ts, '2024-01-01 00:00:00') AS time_window
FROM
incremental_non_append_input
GROUP BY
time_window;
DROP TABLE incremental_non_append_input;
CREATE TABLE incremental_aggr_input (
host_id INT,
n INT,
@@ -7,7 +29,9 @@ CREATE TABLE incremental_aggr_input (
append_mode = 'true'
);
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink AS
CREATE FLOW incremental_aggr_flow SINK TO incremental_aggr_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
min(n) AS min_n,

View File

@@ -12,7 +12,9 @@ CREATE TABLE flow_incr_memtable_input (
Affected Rows: 0
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
min(n) AS min_n,

View File

@@ -10,7 +10,9 @@ CREATE TABLE flow_incr_memtable_input (
append_mode = 'true'
);
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink AS
CREATE FLOW flow_incr_memtable SINK TO flow_incr_memtable_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
min(n) AS min_n,

View File

@@ -17,7 +17,9 @@ WITH (
Affected Rows: 0
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
min(n) AS min_n,

View File

@@ -15,7 +15,9 @@ WITH (
append_mode = 'true'
);
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink AS
CREATE FLOW flow_incr_part SINK TO flow_incr_part_sink
WITH (experimental_enable_incremental_read = 'true')
AS
SELECT
sum(n) AS total,
min(n) AS min_n,

View File

@@ -476,7 +476,7 @@ SINK TO out_num_cnt_show
WITH (access_key_id = [true])
AS SELECT number AS n1 FROM numbers_input_show where number > 10;
Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source
Error: 1004(InvalidArguments), Invalid SQL, error: unknown flow option 'access_key_id', supported options: defer_on_missing_source, experimental_enable_incremental_read
DROP FLOW filter_numbers_show;