Compare commits


11 Commits

Author SHA1 Message Date
Yingwen
5d644c0b7f chore: bump version to v0.7.0 (#3433) 2024-03-05 12:07:37 +00:00
Ruihang Xia
020635063c feat: implement multi-dim partition rule (#3409)
* generate expr rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement show create for new partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement row splitter

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: fix failed tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fix lint issues

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: ignore tests for deprecated partition rule

* chore: remove unused partition rule tests setup

* test(sqlness): add basic partition tests

* test(multi_dim): add basic find region test

* address CR comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: WenyXu <wenymedia@gmail.com>
Co-authored-by: WenyXu <wenymedia@gmail.com>
2024-03-05 11:39:15 +00:00
dependabot[bot]
97cbfcfe23 build(deps): bump mio from 0.8.10 to 0.8.11 (#3434)
Bumps [mio](https://github.com/tokio-rs/mio) from 0.8.10 to 0.8.11.
- [Release notes](https://github.com/tokio-rs/mio/releases)
- [Changelog](https://github.com/tokio-rs/mio/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tokio-rs/mio/compare/v0.8.10...v0.8.11)

---
updated-dependencies:
- dependency-name: mio
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-03-05 11:04:14 +00:00
Lei, HUANG
7183fa198c refactor: make MergeTreeMemtable the default choice (#3430)
* refactor: make MergeTreeMemtable the default choice

* refactor: reformat

* chore: add doc to config
2024-03-05 10:00:08 +00:00
Lei, HUANG
02b18fbca1 feat: decode prom requests to grpc (#3425)
* hack: inline decode

* move to servers

* fix: samples lost

* add bench

* remove useless functions

* wip

* feat: remove object pools

* fix: minor issues

* fix: remove useless dep

* chore: rebase main

* format

* finish

* fix: format

* feat: introduce request pool

* try to fix license issue

* fix: clippy

* resolve comments

* fix:typo

* remove useless comments
2024-03-05 09:47:32 +00:00
shuiyisong
7b1c3503d0 fix: complete interceptors for all frontend entry (#3428) 2024-03-05 09:38:47 +00:00
liyang
6fd2ff49d5 ci: refine windows output env (#3431) 2024-03-05 08:38:28 +00:00
WU Jingdi
53f2a5846c feat: support tracing rule sampler (#3405)
* feat: support tracing rule sampler

* chore: simplify code
2024-03-05 15:40:02 +08:00
Yingwen
49157868f9 feat: Correct server metrics and add more metrics for scan (#3426)
* feat: drop timer on stream terminated

* refactor: combine metrics into a histogram vec

* refactor: frontend grpc metrics

* feat: add metrics middleware layer to grpc server

* refactor: move http metrics layer to metrics mod

* feat: bucket for grpc/http elapsed

* feat: remove duplicate metrics

* style: fix clippy

* fix: incorrect bucket of promql series

* feat: more metrics for mito

* feat: convert cost

* test: fix metrics test
2024-03-04 10:15:10 +00:00
Ruihang Xia
ae2c18e1cf docs(rfcs): multi-dimension partition rule (#3350)
* docs(rfcs): multi-dimension partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change math block type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update tracking issue

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update discussion

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-04 08:10:54 +00:00
dennis zhuang
e6819412c5 refactor: show tables and show databases (#3423)
* refactor: show tables and show databases

* chore: clean code
2024-03-04 06:15:17 +00:00
92 changed files with 3491 additions and 934 deletions

View File

@@ -91,7 +91,7 @@ env:
# The scheduled version is '${{ env.NEXT_RELEASE_VERSION }}-nightly-YYYYMMDD', like v0.2.0-nightly-20230313;
NIGHTLY_RELEASE_PREFIX: nightly
# Note: The NEXT_RELEASE_VERSION should be modified manually by every formal release.
NEXT_RELEASE_VERSION: v0.7.0
NEXT_RELEASE_VERSION: v0.8.0
jobs:
allocate-runners:
@@ -288,7 +288,7 @@ jobs:
- name: Set build windows result
id: set-build-windows-result
run: |
echo "build-windows-result=success" >> $GITHUB_OUTPUT
echo "build-windows-result=success" >> $Env:GITHUB_OUTPUT
release-images-to-dockerhub:
name: Build and push images to DockerHub

Cargo.lock generated (148 changed lines)
View File

@@ -196,7 +196,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"common-decimal",
@@ -675,7 +675,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -861,7 +861,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"chrono",
@@ -1219,7 +1219,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1510,7 +1510,7 @@ checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "client"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1546,7 +1546,7 @@ dependencies = [
"session",
"snafu",
"substrait 0.17.1",
"substrait 0.6.0",
"substrait 0.7.0",
"tokio",
"tokio-stream",
"tonic 0.10.2",
@@ -1576,7 +1576,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"async-trait",
@@ -1629,7 +1629,7 @@ dependencies = [
"session",
"snafu",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"temp-env",
"tikv-jemallocator",
@@ -1672,7 +1672,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"bitvec",
@@ -1687,7 +1687,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"chrono",
"common-error",
@@ -1698,7 +1698,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"humantime-serde",
@@ -1709,7 +1709,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"arrow-schema",
@@ -1741,7 +1741,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"bigdecimal",
@@ -1755,7 +1755,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"snafu",
"strum 0.25.0",
@@ -1763,7 +1763,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1798,7 +1798,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -1817,7 +1817,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -1847,7 +1847,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -1866,7 +1866,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arc-swap",
"common-query",
@@ -1881,7 +1881,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-error",
"common-macro",
@@ -1894,7 +1894,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-recursion",
@@ -1944,11 +1944,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.6.0"
version = "0.7.0"
[[package]]
name = "common-procedure"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-stream",
"async-trait",
@@ -1972,7 +1972,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-procedure",
@@ -1980,7 +1980,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -2003,7 +2003,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arc-swap",
"common-base",
@@ -2023,7 +2023,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -2043,7 +2043,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"atty",
"backtrace",
@@ -2071,7 +2071,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"client",
"common-query",
@@ -2083,7 +2083,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"chrono",
@@ -2099,14 +2099,14 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"build-data",
]
[[package]]
name = "common-wal"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"common-error",
@@ -2754,7 +2754,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -2812,7 +2812,7 @@ dependencies = [
"snafu",
"sql",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tokio-stream",
@@ -2826,7 +2826,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"arrow-array",
@@ -3302,7 +3302,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -3403,7 +3403,7 @@ dependencies = [
[[package]]
name = "flow"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"bimap",
@@ -3458,7 +3458,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -3522,7 +3522,7 @@ dependencies = [
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"strfmt",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"toml 0.8.8",
@@ -4291,7 +4291,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -4848,7 +4848,7 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "log-store"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-stream",
"async-trait",
@@ -5137,7 +5137,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -5167,7 +5167,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"api",
@@ -5247,7 +5247,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"aquamarine",
@@ -5307,9 +5307,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.10"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [
"libc",
"log",
@@ -5319,7 +5319,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"api",
@@ -5380,6 +5380,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"toml 0.8.8",
"uuid",
]
@@ -5921,9 +5922,18 @@ dependencies = [
"memchr",
]
[[package]]
name = "object-pool"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee9a3e7196d09ec86002b939f1576e8e446d58def8fd48fe578e2c72d5328d68"
dependencies = [
"parking_lot 0.11.2",
]
[[package]]
name = "object-store"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anyhow",
"async-trait",
@@ -6166,7 +6176,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -6213,7 +6223,7 @@ dependencies = [
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tonic 0.10.2",
@@ -6444,7 +6454,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -6466,6 +6476,8 @@ dependencies = [
"serde",
"serde_json",
"snafu",
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"table",
]
@@ -6769,7 +6781,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"auth",
"common-base",
@@ -7036,7 +7048,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"ahash 0.8.6",
"async-recursion",
@@ -7247,7 +7259,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"bitflags 2.4.1",
@@ -7368,7 +7380,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"ahash 0.8.6",
"api",
@@ -7429,7 +7441,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tokio-stream",
@@ -8747,7 +8759,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -9020,7 +9032,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"aide",
"api",
@@ -9054,6 +9066,7 @@ dependencies = [
"common-test-util",
"common-time",
"common-version",
"criterion",
"datafusion",
"datafusion-common",
"datafusion-expr",
@@ -9073,6 +9086,7 @@ dependencies = [
"mime_guess",
"mysql_async",
"notify",
"object-pool",
"once_cell",
"openmetrics-parser",
"opensrv-mysql",
@@ -9122,7 +9136,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -9392,7 +9406,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"common-base",
@@ -9444,7 +9458,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"clap 4.4.11",
@@ -9651,7 +9665,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"aquamarine",
@@ -9791,7 +9805,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-recursion",
"async-trait",
@@ -9964,7 +9978,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"async-trait",
@@ -10076,7 +10090,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -10101,7 +10115,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -10158,7 +10172,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tempfile",
"time",

View File

@@ -62,7 +62,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.6.0"
version = "0.7.0"
edition = "2021"
license = "Apache-2.0"

View File

@@ -138,6 +138,18 @@ mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
data_freeze_threshold = 32768
# Max dictionary bytes.
fork_dictionary_bytes = "1GiB"
# Log options, see `standalone.example.toml`
# [logging]
# dir = "/tmp/greptimedb/logs"
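The `[region_engine.mito.memtable]` block added above makes the merge-tree memtable (`type = "experimental"`) the default. As a hedged sketch (not taken from the shipped example files), switching back to the deprecated time-series memtable should only require overriding the `type` key; the shard-related keys shown above apply to the experimental memtable only.

```toml
[region_engine.mito.memtable]
# Fall back to the previous (deprecated) time-series memtable.
# The experimental-only keys (index_max_keys_per_shard, data_freeze_threshold,
# fork_dictionary_bytes) are not needed for this type.
type = "time_series"
```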

View File

@@ -244,6 +244,18 @@ mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
data_freeze_threshold = 32768
# Max dictionary bytes.
fork_dictionary_bytes = "1GiB"
# Log options
# [logging]
# Specify logs directory.
@@ -254,10 +266,11 @@ intermediate_path = ""
# enable_otlp_tracing = false
# tracing exporter endpoint with format `ip:port`, we use grpc oltp as exporter, default endpoint is `localhost:4317`
# otlp_endpoint = "localhost:4317"
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# tracing_sample_ratio = 1.0
# Whether to append logs to stdout. Defaults to true.
# append_stdout = true
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# [logging.tracing_sample_ratio]
# default_ratio = 0.0
# Standalone export the metrics generated by itself
# encoded to Prometheus remote-write format
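The new `[logging.tracing_sample_ratio]` table replaces the plain `tracing_sample_ratio` float and, per the `TracingSampleOptions` struct introduced in this change, also accepts a `rules` list with `protocol`, `request_types`, and `ratio` fields. Below is a hedged sketch of a rule-based configuration; the protocol and request-type values are illustrative, not an exhaustive list of what spans actually report.

```toml
[logging.tracing_sample_ratio]
# Fallback ratio for spans that match no rule.
default_ratio = 0.1

# Sample all spans whose `protocol` attribute is "http" and whose
# `request_type` is one of the listed values (illustrative values).
[[logging.tracing_sample_ratio.rules]]
protocol = "http"
request_types = ["promql"]
ratio = 1.0
```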

Binary file not shown (new image, 65 KiB).

View File

@@ -0,0 +1,101 @@
---
Feature Name: Multi-dimension Partition Rule
Tracking Issue: https://github.com/GreptimeTeam/greptimedb/issues/3351
Date: 2024-02-21
Author: "Ruihang Xia <waynestxia@gmail.com>"
---
# Summary
A new region partition scheme that runs on multiple dimensions of the key space. The partition rule is defined by a set of simple expressions on the partition key columns.
# Motivation
The current partition rule comes from MySQL's [`RANGE Partition`](https://dev.mysql.com/doc/refman/8.0/en/partitioning-range.html), which is based on a single dimension. It works somewhat like a [Hilbert Curve](https://en.wikipedia.org/wiki/Hilbert_curve): it picks several points on the curve to divide the space. It is neither easy to understand how the data gets partitioned nor flexible enough to handle complex partitioning requirements.
Considering future requirements like region repartitioning and autonomous rebalancing, where both the workload and the partitions may change frequently, this RFC proposes a new region partition scheme that uses a set of simple expressions on the partition key columns to divide the key space.
# Details
## Partition rule
First, we define the simple expressions used to build a partition rule. A simple expression is a binary expression on the partition key columns that evaluates to a boolean value. The binary operator is limited to comparison operators (`=`, `!=`, `>`, `>=`, `<`, `<=`), and each operand is either a literal value or a partition column.
Examples of valid simple expressions are $`col_A = 10`$, $`col_A \gt 10 \& col_B \gt 20`$ or $`col_A \ne 10`$.
Those expressions can be used as predicates to divide the key space into different regions. The following example has two partition columns `Col A` and `Col B`, and four partitioned regions.
```math
\left\{\begin{aligned}
&col_A \le 10 &Region_1 \\
&10 \lt col_A \& col_A \le 20 &Region_2 \\
&20 \lt col_A \space \& \space col_B \lt 100 &Region_3 \\
&20 \lt col_A \space \& \space col_B \ge 100 &Region_4
\end{aligned}\right\}
```
An advantage of this scheme is that it is easy to understand how the data gets partitioned. The above example can be visualized in a 2D space (two partition columns are involved in the example).
![example](2d-example.png)
Here each expression draws a line in the 2D space. Managing data partitioning becomes a matter of drawing lines in the key space.
To make it easy to use, there is a "default region" which catches all the data that doesn't match any of the previous expressions. The default region exists implicitly and does not need to be specified. It is also possible for the database to remove this default region if it finds it unnecessary.
## SQL interface
The SQL interface is responsible for two parts: specifying the partition columns and specifying the partition rule. Though we are targeting an autonomous system, it is still allowed to give some bootstrap rules or hints when creating a table.
Partition columns are specified by the `PARTITION ON COLUMNS` sub-clause in `CREATE TABLE`:
```sql
CREATE TABLE t (...)
PARTITION ON COLUMNS (...) ();
```
The two following brackets are for the partition columns and the partition rule, respectively.
Columns provided here are only used as an allow-list of how the partition rule can be defined, which means (a) the order of the columns doesn't matter, and (b) the columns provided here are not necessarily all used in the partition rule.
The partition rule part is a list of comma-separated simple expressions. Expressions here do not correspond one-to-one to regions, as they might be changed by the system to fit various workloads.
A full example of `CREATE TABLE` with partition rule is:
```sql
CREATE TABLE IF NOT EXISTS demo (
a STRING,
b STRING,
c STRING,
d STRING,
ts TIMESTAMP,
memory DOUBLE,
TIME INDEX (ts),
PRIMARY KEY (a, b, c, d)
)
PARTITION ON COLUMNS (c, b, a) (
a < 10,
a >= 10 AND a < 20,
a >= 20 AND b < 100,
a >= 20 AND b >= 100
)
```
## Combine with storage
Examining columns separately suits our columnar storage very well in two aspects.
1. The simple expressions can be pushed down to the storage and file-format layers, and are likely to hit existing indexes, which makes pruning very efficient.
2. Columns in columnar storage are not tightly coupled as in traditional row storage, which means we can easily add or remove columns from the partition rule without much impact on the data (such as a global reshuffle).
The data file itself can be "projected" onto the key space as a polyhedron, and it is guaranteed that each face is parallel to some coordinate plane (in a 2D scenario, this means every file can be projected to a rectangle). Thus partitioning or repartitioning only needs to consider the related columns.
![sst-project](sst-project.png)
An additional limitation is that, considering how the index works and how we organize primary keys at present, the partition columns are limited to a subset of the primary key columns for better performance.
# Drawbacks
This is a breaking change.

Binary file not shown (new image, 71 KiB).

View File

@@ -19,9 +19,9 @@ mod partitions;
mod predicate;
mod region_peers;
mod runtime_metrics;
mod schemata;
pub mod schemata;
mod table_names;
mod tables;
pub mod tables;
use std::collections::HashMap;
use std::sync::{Arc, Weak};

View File

@@ -37,8 +37,8 @@ use crate::error::{
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const CATALOG_NAME: &str = "catalog_name";
const SCHEMA_NAME: &str = "schema_name";
pub const CATALOG_NAME: &str = "catalog_name";
pub const SCHEMA_NAME: &str = "schema_name";
const DEFAULT_CHARACTER_SET_NAME: &str = "default_character_set_name";
const DEFAULT_COLLATION_NAME: &str = "default_collation_name";
const INIT_CAPACITY: usize = 42;

View File

@@ -39,10 +39,10 @@ use crate::error::{
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const TABLE_CATALOG: &str = "table_catalog";
const TABLE_SCHEMA: &str = "table_schema";
const TABLE_NAME: &str = "table_name";
const TABLE_TYPE: &str = "table_type";
pub const TABLE_CATALOG: &str = "table_catalog";
pub const TABLE_SCHEMA: &str = "table_schema";
pub const TABLE_NAME: &str = "table_name";
pub const TABLE_TYPE: &str = "table_type";
const TABLE_ID: &str = "table_id";
const ENGINE: &str = "engine";
const INIT_CAPACITY: usize = 42;

View File

@@ -12,11 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(let_chains)]
pub mod logging;
mod macros;
pub mod metric;
mod panic_hook;
pub mod tracing_context;
mod tracing_sampler;
pub use logging::{init_default_ut_logging, init_global_logging};
pub use metric::dump_metrics;

View File

@@ -31,6 +31,7 @@ use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
use tracing_subscriber::{filter, EnvFilter, Registry};
use crate::tracing_sampler::{create_sampler, TracingSampleOptions};
pub use crate::{debug, error, info, trace, warn};
const DEFAULT_OTLP_ENDPOINT: &str = "http://localhost:4317";
@@ -42,7 +43,7 @@ pub struct LoggingOptions {
pub level: Option<String>,
pub enable_otlp_tracing: bool,
pub otlp_endpoint: Option<String>,
pub tracing_sample_ratio: Option<f64>,
pub tracing_sample_ratio: Option<TracingSampleOptions>,
pub append_stdout: bool,
}
@@ -176,8 +177,10 @@ pub fn init_global_logging(
.expect("error parsing log level string");
let sampler = opts
.tracing_sample_ratio
.map(Sampler::TraceIdRatioBased)
.unwrap_or(Sampler::AlwaysOn);
.as_ref()
.map(create_sampler)
.map(Sampler::ParentBased)
.unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn)));
// Must enable 'tokio_unstable' cfg to use this feature.
// For example: `RUSTFLAGS="--cfg tokio_unstable" cargo run -F common-telemetry/console -- standalone start`
#[cfg(feature = "tokio-console")]

View File

@@ -0,0 +1,176 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use opentelemetry::trace::{
Link, SamplingDecision, SamplingResult, SpanKind, TraceContextExt, TraceId, TraceState,
};
use opentelemetry::KeyValue;
use opentelemetry_sdk::trace::{Sampler, ShouldSample};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct TracingSampleOptions {
pub default_ratio: f64,
pub rules: Vec<TracingSampleRule>,
}
impl Default for TracingSampleOptions {
fn default() -> Self {
Self {
default_ratio: 1.0,
rules: vec![],
}
}
}
/// Determine the sampling rate of a span according to the `rules` provided in `RuleSampler`.
/// For spans that do not hit any `rules`, the `default_ratio` is used.
#[derive(Clone, Default, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct TracingSampleRule {
pub protocol: String,
pub request_types: HashSet<String>,
pub ratio: f64,
}
impl TracingSampleRule {
pub fn match_rule(&self, protocol: &str, request_type: Option<&str>) -> Option<f64> {
if protocol == self.protocol {
if self.request_types.is_empty() {
Some(self.ratio)
} else if let Some(t) = request_type
&& self.request_types.contains(t)
{
Some(self.ratio)
} else {
None
}
} else {
None
}
}
}
impl PartialEq for TracingSampleOptions {
fn eq(&self, other: &Self) -> bool {
self.default_ratio == other.default_ratio && self.rules == other.rules
}
}
impl PartialEq for TracingSampleRule {
fn eq(&self, other: &Self) -> bool {
self.protocol == other.protocol
&& self.request_types == other.request_types
&& self.ratio == other.ratio
}
}
impl Eq for TracingSampleOptions {}
impl Eq for TracingSampleRule {}
pub fn create_sampler(opt: &TracingSampleOptions) -> Box<dyn ShouldSample> {
if opt.rules.is_empty() {
Box::new(Sampler::TraceIdRatioBased(opt.default_ratio))
} else {
Box::new(opt.clone())
}
}
impl ShouldSample for TracingSampleOptions {
fn should_sample(
&self,
parent_context: Option<&opentelemetry::Context>,
trace_id: TraceId,
_name: &str,
_span_kind: &SpanKind,
attributes: &[KeyValue],
_links: &[Link],
) -> SamplingResult {
let (mut protocol, mut request_type) = (None, None);
for kv in attributes {
match kv.key.as_str() {
"protocol" => protocol = Some(kv.value.as_str()),
"request_type" => request_type = Some(kv.value.as_str()),
_ => (),
}
}
let ratio = protocol
.and_then(|p| {
self.rules
.iter()
.find_map(|rule| rule.match_rule(p.as_ref(), request_type.as_deref()))
})
.unwrap_or(self.default_ratio);
SamplingResult {
decision: sample_based_on_probability(ratio, trace_id),
// No extra attributes ever set by the SDK samplers.
attributes: Vec::new(),
// all sampler in SDK will not modify trace state.
trace_state: match parent_context {
Some(ctx) => ctx.span().span_context().trace_state().clone(),
None => TraceState::default(),
},
}
}
}
/// The code here mainly refers to the relevant implementation of
/// [opentelemetry](https://github.com/open-telemetry/opentelemetry-rust/blob/ef4701055cc39d3448d5e5392812ded00cdd4476/opentelemetry-sdk/src/trace/sampler.rs#L229),
/// and determines whether the span needs to be collected based on the `TraceId` and sampling rate (i.e. `prob`).
fn sample_based_on_probability(prob: f64, trace_id: TraceId) -> SamplingDecision {
if prob >= 1.0 {
SamplingDecision::RecordAndSample
} else {
let prob_upper_bound = (prob.max(0.0) * (1u64 << 63) as f64) as u64;
let bytes = trace_id.to_bytes();
let (_, low) = bytes.split_at(8);
let trace_id_low = u64::from_be_bytes(low.try_into().unwrap());
let rnd_from_trace_id = trace_id_low >> 1;
if rnd_from_trace_id < prob_upper_bound {
SamplingDecision::RecordAndSample
} else {
SamplingDecision::Drop
}
}
}
#[cfg(test)]
mod test {
use std::collections::HashSet;
use crate::tracing_sampler::TracingSampleRule;
#[test]
fn test_rule() {
let rule = TracingSampleRule {
protocol: "http".to_string(),
request_types: HashSet::new(),
ratio: 1.0,
};
assert_eq!(rule.match_rule("not_http", None), None);
assert_eq!(rule.match_rule("http", None), Some(1.0));
assert_eq!(rule.match_rule("http", Some("abc")), Some(1.0));
let rule1 = TracingSampleRule {
protocol: "http".to_string(),
request_types: HashSet::from(["mysql".to_string()]),
ratio: 1.0,
};
assert_eq!(rule1.match_rule("http", None), None);
assert_eq!(rule1.match_rule("http", Some("abc")), None);
assert_eq!(rule1.match_rule("http", Some("mysql")), Some(1.0));
}
}

View File

@@ -39,14 +39,16 @@ use common_procedure::local::{LocalManager, ManagerConfig};
use common_procedure::options::ProcedureConfig;
use common_procedure::ProcedureManagerRef;
use common_query::Output;
use common_telemetry::error;
use common_telemetry::logging::info;
use common_telemetry::{error, tracing};
use log_store::raft_engine::RaftEngineBackend;
use meta_client::client::{MetaClient, MetaClientBuilder};
use meta_client::MetaClientOptions;
use operator::delete::DeleterRef;
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prometheus::HistogramTimer;
use query::metrics::OnDone;
use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
use query::plan::LogicalPlan;
use query::query_engine::options::{validate_catalog_and_schema, QueryOptions};
@@ -85,7 +87,6 @@ use crate::error::{
};
use crate::frontend::{FrontendOptions, TomlSerializable};
use crate::heartbeat::HeartbeatTask;
use crate::metrics;
use crate::script::ScriptExecutor;
#[async_trait]
@@ -275,8 +276,8 @@ impl Instance {
impl SqlQueryHandler for Instance {
type Error = Error;
#[tracing::instrument(skip_all)]
async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec<Result<Output>> {
let _timer = metrics::METRIC_HANDLE_SQL_ELAPSED.start_timer();
let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
let query_interceptor = query_interceptor_opt.as_ref();
let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) {
@@ -336,7 +337,6 @@ impl SqlQueryHandler for Instance {
}
async fn do_exec_plan(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result<Output> {
let _timer = metrics::METRIC_EXEC_PLAN_ELAPSED.start_timer();
// plan should be prepared before exec
// we'll do check there
self.query_engine
@@ -345,6 +345,7 @@ impl SqlQueryHandler for Instance {
.context(ExecLogicalPlanSnafu)
}
#[tracing::instrument(skip_all)]
async fn do_promql_query(
&self,
query: &PromQuery,
@@ -398,14 +399,27 @@ impl SqlQueryHandler for Instance {
}
}
/// Attaches a timer to the output and observes it once the output is exhausted.
pub fn attach_timer(output: Output, timer: HistogramTimer) -> Output {
match output {
Output::AffectedRows(_) | Output::RecordBatches(_) => output,
Output::Stream(stream, plan) => {
let stream = OnDone::new(stream, move || {
timer.observe_duration();
});
Output::Stream(Box::pin(stream), plan)
}
}
}
#[async_trait]
impl PrometheusHandler for Instance {
#[tracing::instrument(skip_all)]
async fn do_query(
&self,
query: &PromQuery,
query_ctx: QueryContextRef,
) -> server_error::Result<Output> {
let _timer = metrics::METRIC_HANDLE_PROMQL_ELAPSED.start_timer();
let interceptor = self
.plugins
.get::<PromQueryInterceptorRef<server_error::Error>>();

View File

@@ -20,6 +20,7 @@ use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_meta::table_name::TableName;
use common_query::Output;
use common_telemetry::tracing;
use query::parser::PromQuery;
use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
use servers::query_handler::grpc::GrpcQueryHandler;
@@ -31,7 +32,8 @@ use crate::error::{
Error, IncompleteGrpcRequestSnafu, NotSupportedSnafu, PermissionSnafu, Result,
TableOperationSnafu,
};
use crate::instance::Instance;
use crate::instance::{attach_timer, Instance};
use crate::metrics::{GRPC_HANDLE_PROMQL_ELAPSED, GRPC_HANDLE_SQL_ELAPSED};
#[async_trait]
impl GrpcQueryHandler for Instance {
@@ -59,6 +61,7 @@ impl GrpcQueryHandler for Instance {
})?;
match query {
Query::Sql(sql) => {
let timer = GRPC_HANDLE_SQL_ELAPSED.start_timer();
let mut result = SqlQueryHandler::do_query(self, &sql, ctx.clone()).await;
ensure!(
result.len() == 1,
@@ -66,7 +69,8 @@ impl GrpcQueryHandler for Instance {
feat: "execute multiple statements in SQL query string through GRPC interface"
}
);
result.remove(0)?
let output = result.remove(0)?;
attach_timer(output, timer)
}
Query::LogicalPlan(_) => {
return NotSupportedSnafu {
@@ -75,6 +79,7 @@ impl GrpcQueryHandler for Instance {
.fail();
}
Query::PromRangeQuery(promql) => {
let timer = GRPC_HANDLE_PROMQL_ELAPSED.start_timer();
let prom_query = PromQuery {
query: promql.query,
start: promql.start,
@@ -89,7 +94,8 @@ impl GrpcQueryHandler for Instance {
feat: "execute multiple statements in PromQL query string through GRPC interface"
}
);
result.remove(0)?
let output = result.remove(0)?;
attach_timer(output, timer)
}
}
}
@@ -173,6 +179,7 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte
}
impl Instance {
#[tracing::instrument(skip_all)]
pub async fn handle_inserts(
&self,
requests: InsertRequests,
@@ -184,6 +191,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_row_inserts(
&self,
requests: RowInsertRequests,
@@ -195,6 +203,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_metric_row_inserts(
&self,
requests: RowInsertRequests,
@@ -207,6 +216,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_deletes(
&self,
requests: DeleteRequests,
@@ -218,6 +228,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_row_deletes(
&self,
requests: RowDeleteRequests,

View File

@@ -15,8 +15,9 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use servers::error::AuthSnafu;
use servers::error::{AuthSnafu, Error};
use servers::influxdb::InfluxdbRequest;
use servers::interceptor::{LineProtocolInterceptor, LineProtocolInterceptorRef};
use servers::query_handler::InfluxdbLineProtocolHandler;
use session::context::QueryContextRef;
use snafu::ResultExt;
@@ -36,6 +37,9 @@ impl InfluxdbLineProtocolHandler for Instance {
.check_permission(ctx.current_user(), PermissionReq::LineProtocol)
.context(AuthSnafu)?;
let interceptor_ref = self.plugins.get::<LineProtocolInterceptorRef<Error>>();
interceptor_ref.pre_execute(&request.lines, ctx.clone())?;
let requests = request.try_into()?;
let _ = self
.handle_row_inserts(requests, ctx)

View File

@@ -15,6 +15,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use servers::error as server_error;
use servers::error::AuthSnafu;
use servers::opentsdb::codec::DataPoint;
@@ -27,6 +28,7 @@ use crate::instance::Instance;
#[async_trait]
impl OpentsdbProtocolHandler for Instance {
#[tracing::instrument(skip_all, fields(protocol = "opentsdb"))]
async fn exec(
&self,
data_points: Vec<DataPoint>,

View File

@@ -15,6 +15,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
@@ -22,6 +23,7 @@ use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
use servers::otlp;
use servers::otlp::plugin::TraceParserRef;
use servers::query_handler::OpenTelemetryProtocolHandler;
@@ -33,6 +35,7 @@ use crate::metrics::{OTLP_METRICS_ROWS, OTLP_TRACES_ROWS};
#[async_trait]
impl OpenTelemetryProtocolHandler for Instance {
#[tracing::instrument(skip_all)]
async fn metrics(
&self,
request: ExportMetricsServiceRequest,
@@ -43,6 +46,12 @@ impl OpenTelemetryProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request)?;
let _ = self
.handle_row_inserts(requests, ctx)
@@ -59,6 +68,7 @@ impl OpenTelemetryProtocolHandler for Instance {
Ok(resp)
}
#[tracing::instrument(skip_all)]
async fn traces(
&self,
request: ExportTraceServiceRequest,
@@ -70,6 +80,11 @@ impl OpenTelemetryProtocolHandler for Instance {
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (table_name, spans) = match self.plugins.get::<TraceParserRef>() {
Some(parser) => (parser.table_name(), parser.parse(request)),
None => (

View File

@@ -16,6 +16,7 @@ use std::sync::Arc;
use api::prom_store::remote::read_request::ResponseType;
use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, WriteRequest};
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_catalog::format_full_table_name;
@@ -23,12 +24,13 @@ use common_error::ext::BoxedError;
use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
use common_query::Output;
use common_recordbatch::RecordBatches;
use common_telemetry::logging;
use common_telemetry::{logging, tracing};
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prost::Message;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
use servers::interceptor::{PromStoreProtocolInterceptor, PromStoreProtocolInterceptorRef};
use servers::prom_store::{self, Metrics};
use servers::query_handler::{
PromStoreProtocolHandler, PromStoreProtocolHandlerRef, PromStoreResponse,
@@ -87,6 +89,7 @@ async fn to_query_result(table_name: &str, output: Output) -> ServerResult<Query
}
impl Instance {
#[tracing::instrument(skip_all)]
async fn handle_remote_query(
&self,
ctx: &QueryContextRef,
@@ -126,6 +129,7 @@ impl Instance {
.context(ExecLogicalPlanSnafu)
}
#[tracing::instrument(skip_all)]
async fn handle_remote_queries(
&self,
ctx: QueryContextRef,
@@ -166,8 +170,12 @@ impl PromStoreProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_write(&request, ctx.clone())?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(request)?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(&request)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
@@ -190,6 +198,38 @@ impl PromStoreProtocolHandler for Instance {
Ok(())
}
async fn write_fast(
&self,
request: RowInsertRequests,
ctx: QueryContextRef,
with_metric_engine: bool,
) -> ServerResult<()> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
.unwrap_or(GREPTIME_PHYSICAL_TABLE)
.to_string();
let _ = self
.handle_metric_row_inserts(request, ctx.clone(), physical_table.to_string())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
} else {
let _ = self
.handle_row_inserts(request, ctx.clone())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
}
Ok(())
}
async fn read(
&self,
request: ReadRequest,
@@ -200,6 +240,10 @@ impl PromStoreProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreRead)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_read(&request, ctx.clone())?;
let response_type = negotiate_response_type(&request.accepted_response_types)?;
@@ -265,7 +309,7 @@ impl PromStoreProtocolHandler for ExportMetricHandler {
ctx: QueryContextRef,
_: bool,
) -> ServerResult<()> {
let (requests, _) = prom_store::to_grpc_row_insert_requests(request)?;
let (requests, _) = prom_store::to_grpc_row_insert_requests(&request)?;
self.inserter
.handle_metric_row_inserts(
requests,
@@ -279,6 +323,15 @@ impl PromStoreProtocolHandler for ExportMetricHandler {
Ok(())
}
async fn write_fast(
&self,
_request: RowInsertRequests,
_ctx: QueryContextRef,
_with_metric_engine: bool,
) -> ServerResult<()> {
unimplemented!()
}
async fn read(
&self,
_request: ReadRequest,

View File

@@ -16,6 +16,8 @@ use std::collections::HashMap;
use async_trait::async_trait;
use common_query::Output;
use servers::error::Error;
use servers::interceptor::{ScriptInterceptor, ScriptInterceptorRef};
use servers::query_handler::ScriptHandler;
use session::context::QueryContextRef;
@@ -30,7 +32,10 @@ impl ScriptHandler for Instance {
name: &str,
script: &str,
) -> servers::error::Result<()> {
let _timer = metrics::METRIC_HANDLE_SCRIPTS_ELAPSED.start_timer();
let interceptor_ref = self.plugins.get::<ScriptInterceptorRef<Error>>();
interceptor_ref.pre_execute(name, query_ctx.clone())?;
let _timer = metrics::INSERT_SCRIPTS_ELAPSED.start_timer();
self.script_executor
.insert_script(query_ctx, name, script)
.await
@@ -42,7 +47,10 @@ impl ScriptHandler for Instance {
name: &str,
params: HashMap<String, String>,
) -> servers::error::Result<Output> {
let _timer = metrics::METRIC_RUN_SCRIPT_ELAPSED.start_timer();
let interceptor_ref = self.plugins.get::<ScriptInterceptorRef<Error>>();
interceptor_ref.pre_execute(name, query_ctx.clone())?;
let _timer = metrics::EXECUTE_SCRIPT_ELAPSED.start_timer();
self.script_executor
.execute_script(query_ctx, name, params)
.await

View File

@@ -16,22 +16,32 @@ use lazy_static::lazy_static;
use prometheus::*;
lazy_static! {
pub static ref METRIC_HANDLE_SQL_ELAPSED: Histogram =
register_histogram!("greptime_frontend_handle_sql_elapsed", "frontend handle sql elapsed").unwrap();
pub static ref METRIC_HANDLE_PROMQL_ELAPSED: Histogram = register_histogram!(
"greptime_frontend_handle_promql_elapsed",
"frontend handle promql elapsed"
/// Timer of handling query in RPC handler.
pub static ref GRPC_HANDLE_QUERY_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_frontend_grpc_handle_query_elapsed",
"Elapsed time of handling queries in RPC handler",
&["type"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram =
register_histogram!("greptime_frontend_exec_plan_elapsed", "frontend exec plan elapsed").unwrap();
pub static ref METRIC_HANDLE_SCRIPTS_ELAPSED: Histogram = register_histogram!(
"greptime_frontend_handle_scripts_elapsed",
"frontend handle scripts elapsed"
pub static ref GRPC_HANDLE_SQL_ELAPSED: Histogram = GRPC_HANDLE_QUERY_ELAPSED
.with_label_values(&["sql"]);
pub static ref GRPC_HANDLE_PROMQL_ELAPSED: Histogram = GRPC_HANDLE_QUERY_ELAPSED
.with_label_values(&["promql"]);
/// Timer of handling scripts in the script handler.
pub static ref HANDLE_SCRIPT_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_frontend_handle_script_elapsed",
"Elapsed time of handling scripts in the script handler",
&["type"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_RUN_SCRIPT_ELAPSED: Histogram =
register_histogram!("greptime_frontend_run_script_elapsed", "frontend run script elapsed").unwrap();
pub static ref INSERT_SCRIPTS_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["insert"]);
pub static ref EXECUTE_SCRIPT_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["execute"]);
/// The samples count of Prometheus remote write.
pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounter = register_int_counter!(
"greptime_frontend_prometheus_remote_write_samples",

View File

@@ -76,6 +76,7 @@ common-test-util.workspace = true
criterion = "0.4"
log-store.workspace = true
rand.workspace = true
toml.workspace = true
[[bench]]
name = "bench_merge_tree"

View File

@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, NoneAsEmptyString};
use crate::error::Result;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::MemtableConfig;
use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
/// Default max running background job.
@@ -104,8 +104,8 @@ pub struct MitoConfig {
/// Inverted index configs.
pub inverted_index: InvertedIndexConfig,
/// Experimental memtable.
pub experimental_memtable: Option<MergeTreeConfig>,
/// Memtable config
pub memtable: MemtableConfig,
}
impl Default for MitoConfig {
@@ -131,7 +131,7 @@ impl Default for MitoConfig {
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
allow_stale_entries: false,
inverted_index: InvertedIndexConfig::default(),
experimental_memtable: None,
memtable: MemtableConfig::default(),
};
// Adjust buffer and cache size according to system memory if we can.
@@ -319,3 +319,25 @@ fn divide_num_cpus(divisor: usize) -> usize {
(cores + divisor - 1) / divisor
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_deserialize_config() {
let s = r#"
[memtable]
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MitoConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(config) = &config.memtable else {
unreachable!()
};
assert_eq!(1024, config.data_freeze_threshold);
}
}

View File

@@ -47,6 +47,7 @@ mod truncate_test;
use std::any::Any;
use std::sync::Arc;
use std::time::Instant;
use async_trait::async_trait;
use common_error::ext::BoxedError;
@@ -219,6 +220,7 @@ impl EngineInner {
/// Handles the scan `request` and returns a [Scanner] for the `request`.
fn handle_query(&self, region_id: RegionId, request: ScanRequest) -> Result<Scanner> {
let query_start = Instant::now();
// Reading a region doesn't need to go through the region worker thread.
let region = self
.workers
@@ -239,7 +241,8 @@ impl EngineInner {
Some(cache_manager),
)
.with_parallelism(scan_parallelism)
.ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled());
.with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled())
.with_start_time(query_start);
scan_region.scanner()
}

View File

@@ -14,16 +14,12 @@
//! Memtables are write buffers for regions.
pub mod key_values;
pub mod merge_tree;
pub mod time_series;
pub(crate) mod version;
use std::fmt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use common_time::Timestamp;
use serde::{Deserialize, Serialize};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use table::predicate::Predicate;
@@ -31,14 +27,34 @@ use table::predicate::Predicate;
use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
pub use crate::memtable::key_values::KeyValues;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
pub mod key_values;
pub mod merge_tree;
pub mod time_series;
pub(crate) mod version;
/// Id for memtables.
///
/// Should be unique under the same region.
pub type MemtableId = u32;
/// Config for memtables.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MemtableConfig {
Experimental(MergeTreeConfig),
TimeSeries,
}
impl Default for MemtableConfig {
fn default() -> Self {
Self::Experimental(MergeTreeConfig::default())
}
}
#[derive(Debug, Default)]
pub struct MemtableStats {
/// The estimated bytes allocated by this memtable from heap.
@@ -187,9 +203,30 @@ impl Drop for AllocTracker {
#[cfg(test)]
mod tests {
use common_base::readable_size::ReadableSize;
use super::*;
use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};
#[test]
fn test_deserialize_memtable_config() {
let s = r#"
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MemtableConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(merge_tree) = config else {
unreachable!()
};
assert!(merge_tree.dedup);
assert_eq!(8192, merge_tree.index_max_keys_per_shard);
assert_eq!(1024, merge_tree.data_freeze_threshold);
assert_eq!(ReadableSize::mb(512), merge_tree.fork_dictionary_bytes);
}
#[test]
fn test_alloc_tracker_without_manager() {
let tracker = AllocTracker::new(None);

View File

@@ -44,7 +44,7 @@ use crate::memtable::{
};
/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
const DICTIONARY_SIZE_FACTOR: u64 = 16;
const DICTIONARY_SIZE_FACTOR: u64 = 8;
/// Id of a shard, only unique inside a partition.
type ShardId = u32;
@@ -74,7 +74,7 @@ pub struct MergeTreeConfig {
impl Default for MergeTreeConfig {
fn default() -> Self {
let mut fork_dictionary_bytes = ReadableSize::mb(512);
let mut fork_dictionary_bytes = ReadableSize::gb(1);
if let Some(sys_memory) = common_config::utils::get_sys_total_memory() {
let adjust_dictionary_bytes =
std::cmp::min(sys_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
@@ -85,7 +85,7 @@ impl Default for MergeTreeConfig {
Self {
index_max_keys_per_shard: 8192,
data_freeze_threshold: 102400,
data_freeze_threshold: 32768,
dedup: true,
fork_dictionary_bytes,
}

View File

@@ -39,7 +39,7 @@ use crate::memtable::merge_tree::partition::{
};
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::{BoxedBatchIterator, KeyValues};
use crate::metrics::{MERGE_TREE_READ_STAGE_ELAPSED, READ_STAGE_ELAPSED};
use crate::metrics::{MERGE_TREE_READ_STAGE_ELAPSED, READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
use crate::read::Batch;
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
@@ -397,6 +397,9 @@ struct TreeIter {
impl Drop for TreeIter {
fn drop(&mut self) {
READ_ROWS_TOTAL
.with_label_values(&["merge_tree_memtable"])
.inc_by(self.metrics.rows_fetched as u64);
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["fetch_next_partition"])
.observe(self.metrics.fetch_partition_elapsed.as_secs_f64());

View File

@@ -123,7 +123,7 @@ lazy_static! {
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
/// Counter of rows read.
/// Counter of rows read from different source.
pub static ref READ_ROWS_TOTAL: IntCounterVec =
register_int_counter_vec!("greptime_mito_read_rows_total", "mito read rows total", &[TYPE_LABEL]).unwrap();
/// Counter of filtered rows during merge.
@@ -137,6 +137,24 @@ lazy_static! {
register_int_counter_vec!("greptime_mito_precise_filter_rows_total", "mito precise filter rows total", &[TYPE_LABEL]).unwrap();
pub static ref READ_ROWS_IN_ROW_GROUP_TOTAL: IntCounterVec =
register_int_counter_vec!("greptime_mito_read_rows_in_row_group_total", "mito read rows in row group total", &[TYPE_LABEL]).unwrap();
/// Histogram for the number of SSTs to scan per query.
pub static ref READ_SST_COUNT: Histogram = register_histogram!(
"greptime_mito_read_sst_count",
"Number of SSTs to scan in a scan task",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 256.0, 1024.0],
).unwrap();
/// Histogram for the number of rows returned per query.
pub static ref READ_ROWS_RETURN: Histogram = register_histogram!(
"greptime_mito_read_rows_return",
"Number of rows returned in a scan task",
exponential_buckets(100.0, 10.0, 8).unwrap(),
).unwrap();
/// Histogram for the number of batches returned per query.
pub static ref READ_BATCHES_RETURN: Histogram = register_histogram!(
"greptime_mito_read_batches_return",
"Number of rows returned in a scan task",
exponential_buckets(100.0, 10.0, 7).unwrap(),
).unwrap();
// ------- End of query metrics.
// Cache related metrics.

View File

@@ -15,6 +15,7 @@
//! Scans a region according to the scan request.
use std::sync::Arc;
use std::time::Instant;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, warn};
@@ -124,6 +125,8 @@ pub(crate) struct ScanRegion {
parallelism: ScanParallism,
/// Whether to ignore inverted index.
ignore_inverted_index: bool,
/// Start time of the scan task.
start_time: Option<Instant>,
}
impl ScanRegion {
@@ -141,6 +144,7 @@ impl ScanRegion {
cache_manager,
parallelism: ScanParallism::default(),
ignore_inverted_index: false,
start_time: None,
}
}
@@ -152,11 +156,17 @@ impl ScanRegion {
}
#[must_use]
pub(crate) fn ignore_inverted_index(mut self, ignore: bool) -> Self {
pub(crate) fn with_ignore_inverted_index(mut self, ignore: bool) -> Self {
self.ignore_inverted_index = ignore;
self
}
#[must_use]
pub(crate) fn with_start_time(mut self, now: Instant) -> Self {
self.start_time = Some(now);
self
}
/// Returns a [Scanner] to scan the region.
pub(crate) fn scanner(self) -> Result<Scanner> {
self.seq_scan().map(Scanner::Seq)
@@ -223,7 +233,8 @@ impl ScanRegion {
.with_files(files)
.with_cache(self.cache_manager)
.with_index_applier(index_applier)
.with_parallelism(self.parallelism);
.with_parallelism(self.parallelism)
.with_start_time(self.start_time);
Ok(seq_scan)
}

View File

@@ -32,7 +32,7 @@ use crate::access_layer::AccessLayerRef;
use crate::cache::{CacheManager, CacheManagerRef};
use crate::error::Result;
use crate::memtable::MemtableRef;
use crate::metrics::READ_STAGE_ELAPSED;
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_SST_COUNT, READ_STAGE_ELAPSED};
use crate::read::compat::{self, CompatReader};
use crate::read::merge::MergeReaderBuilder;
use crate::read::projection::ProjectionMapper;
@@ -65,6 +65,8 @@ pub struct SeqScan {
parallelism: ScanParallism,
/// Index applier.
index_applier: Option<SstIndexApplierRef>,
/// Start time of the query.
query_start: Option<Instant>,
}
impl SeqScan {
@@ -82,6 +84,7 @@ impl SeqScan {
ignore_file_not_found: false,
parallelism: ScanParallism::default(),
index_applier: None,
query_start: None,
}
}
@@ -141,10 +144,19 @@ impl SeqScan {
self
}
/// Sets start time of the query.
#[must_use]
pub(crate) fn with_start_time(mut self, now: Option<Instant>) -> Self {
self.query_start = now;
self
}
/// Builds a stream for the query.
pub async fn build_stream(&self) -> Result<SendableRecordBatchStream> {
let start = Instant::now();
let mut metrics = Metrics::default();
let build_start = Instant::now();
let query_start = self.query_start.unwrap_or(build_start);
metrics.prepare_scan_cost = query_start.elapsed();
let use_parallel = self.use_parallel_reader();
// Scans all memtables and SSTs. Builds a merge reader to merge results.
let mut reader = if use_parallel {
@@ -152,9 +164,13 @@ impl SeqScan {
} else {
self.build_reader().await?
};
let elapsed = start.elapsed();
metrics.build_reader_cost = elapsed;
metrics.scan_cost = elapsed;
metrics.build_reader_cost = build_start.elapsed();
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(metrics.prepare_scan_cost.as_secs_f64());
READ_STAGE_ELAPSED
.with_label_values(&["build_reader"])
.observe(metrics.build_reader_cost.as_secs_f64());
// Creates a stream to poll the batch reader and convert batch into record batch.
let mapper = self.mapper.clone();
@@ -165,15 +181,22 @@ impl SeqScan {
while let Some(batch) =
Self::fetch_record_batch(&mut reader, &mapper, cache, &mut metrics).await?
{
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
}
// Update metrics.
metrics.total_cost = query_start.elapsed();
READ_STAGE_ELAPSED.with_label_values(&["convert_rb"]).observe(metrics.convert_cost.as_secs_f64());
READ_STAGE_ELAPSED.with_label_values(&["scan"]).observe(metrics.scan_cost.as_secs_f64());
READ_STAGE_ELAPSED.with_label_values(&["total"]).observe(metrics.total_cost.as_secs_f64());
READ_ROWS_RETURN.observe(metrics.num_rows as f64);
READ_BATCHES_RETURN.observe(metrics.num_batches as f64);
debug!(
"Seq scan finished, region_id: {:?}, metrics: {:?}, use_parallel: {}, parallelism: {}",
mapper.metadata().region_id, metrics, use_parallel, parallelism,
);
// Update metrics.
READ_STAGE_ELAPSED.with_label_values(&["total"]).observe(metrics.scan_cost.as_secs_f64());
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.mapper.output_schema(),
@@ -249,6 +272,8 @@ impl SeqScan {
}
}
READ_SST_COUNT.observe(self.files.len() as f64);
Ok(sources)
}
@@ -318,12 +343,20 @@ impl SeqScan {
/// Metrics for [SeqScan].
#[derive(Debug, Default)]
struct Metrics {
/// Duration to prepare the scan task.
prepare_scan_cost: Duration,
/// Duration to build the reader.
build_reader_cost: Duration,
/// Duration to scan data.
scan_cost: Duration,
/// Duration to convert batches.
convert_cost: Duration,
/// Total duration of the query, observed when the stream finishes.
total_cost: Duration,
/// Number of batches returned.
num_batches: usize,
/// Number of rows returned.
num_rows: usize,
}
#[cfg(test)]

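To summarize the timing change in this file: every stage cost now hangs off a single shared query start, so time spent before the reader is even built is reported as prepare_scan instead of being lost. A simplified sketch of how the recorded durations relate (field names follow the diff; the real code measures them with elapsed() at the corresponding points):

use std::time::{Duration, Instant};

// Simplified relation between the stage costs recorded in Metrics.
fn stage_costs(
    query_start: Instant, // passed in via with_start_time, falls back to the build start
    build_start: Instant, // when build_stream starts constructing the reader
    build_end: Instant,   // when the merge reader is ready
    stream_end: Instant,  // when the record batch stream is exhausted
) -> (Duration, Duration, Duration) {
    let prepare_scan_cost = build_start - query_start; // observed as "prepare_scan"
    let build_reader_cost = build_end - build_start;   // observed as "build_reader"
    let total_cost = stream_end - query_start;         // observed as "total"
    (prepare_scan_cost, build_reader_cost, total_cost)
}

fn main() {
    let t = Instant::now();
    let (prepare, build, total) = stage_costs(t, t, t, t);
    println!("prepare={prepare:?} build={build:?} total={total:?}");
}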

@@ -49,7 +49,7 @@ use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef
use crate::manifest::action::RegionEdit;
use crate::memtable::merge_tree::MergeTreeMemtableBuilder;
use crate::memtable::time_series::TimeSeriesMemtableBuilder;
use crate::memtable::MemtableBuilderRef;
use crate::memtable::{MemtableBuilderRef, MemtableConfig};
use crate::region::{MitoRegionRef, RegionMap, RegionMapRef};
use crate::request::{
BackgroundNotify, DdlRequest, SenderDdlRequest, SenderWriteRequest, WorkerRequest,
@@ -320,15 +320,15 @@ impl<S: LogStore> WorkerStarter<S> {
let (sender, receiver) = mpsc::channel(self.config.worker_channel_size);
let running = Arc::new(AtomicBool::new(true));
let memtable_builder = if let Some(config) = &self.config.experimental_memtable {
Arc::new(MergeTreeMemtableBuilder::new(
config.clone(),
let memtable_builder = match &self.config.memtable {
MemtableConfig::Experimental(merge_tree) => Arc::new(MergeTreeMemtableBuilder::new(
merge_tree.clone(),
Some(self.write_buffer_manager.clone()),
)) as _
} else {
Arc::new(TimeSeriesMemtableBuilder::new(Some(
)) as _,
MemtableConfig::TimeSeries => Arc::new(TimeSeriesMemtableBuilder::new(Some(
self.write_buffer_manager.clone(),
))) as _
))) as _,
};
let mut worker_thread = RegionWorkerLoop {
id: self.id,


@@ -454,6 +454,12 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to parse sql value"))]
ParseSqlValue {
source: sql::error::Error,
location: Location,
},
#[snafu(display("Failed to build default value, column: {}", column))]
ColumnDefaultValue {
column: String,
@@ -522,6 +528,9 @@ pub enum Error {
#[snafu(display("Failed to create logical tables: {}", reason))]
CreateLogicalTables { reason: String, location: Location },
#[snafu(display("Invalid partition rule: {}", reason))]
InvalidPartitionRule { reason: String, location: Location },
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -643,7 +652,9 @@ impl ErrorExt for Error {
Error::CreateTableWithMultiCatalogs { .. }
| Error::CreateTableWithMultiSchemas { .. }
| Error::EmptyCreateTableExpr { .. } => StatusCode::InvalidArguments,
| Error::EmptyCreateTableExpr { .. }
| Error::InvalidPartitionRule { .. }
| Error::ParseSqlValue { .. } => StatusCode::InvalidArguments,
Error::CreateLogicalTables { .. } => StatusCode::Unexpected,
}


@@ -31,9 +31,12 @@ use common_meta::rpc::router::{Partition, Partition as MetaPartition};
use common_meta::table_name::TableName;
use common_query::Output;
use common_telemetry::{info, tracing};
use common_time::Timezone;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::RawSchema;
use datatypes::value::Value;
use lazy_static::lazy_static;
use partition::expr::{Operand, PartitionExpr, RestrictedOp};
use partition::partition::{PartitionBound, PartitionDef};
use query::sql::create_table_stmt;
use regex::Regex;
@@ -42,6 +45,8 @@ use session::table_name::table_idents_to_full_name;
use snafu::{ensure, IntoError, OptionExt, ResultExt};
use sql::statements::alter::AlterTable;
use sql::statements::create::{CreateExternalTable, CreateTable, CreateTableLike, Partitions};
use sql::statements::sql_value_to_value;
use sqlparser::ast::{Expr, Ident, Value as ParserValue};
use table::dist_table::DistTable;
use table::metadata::{self, RawTableInfo, RawTableMeta, TableId, TableInfo, TableType};
use table::requests::{AlterKind, AlterTableRequest, TableOptions};
@@ -52,9 +57,9 @@ use crate::error::{
self, AlterExprToRequestSnafu, CatalogSnafu, ColumnDataTypeSnafu, ColumnNotFoundSnafu,
CreateLogicalTablesSnafu, CreateTableInfoSnafu, CreateTableWithMultiCatalogsSnafu,
CreateTableWithMultiSchemasSnafu, DeserializePartitionSnafu, EmptyCreateTableExprSnafu,
InvalidPartitionColumnsSnafu, InvalidTableNameSnafu, Result, SchemaNotFoundSnafu,
TableAlreadyExistsSnafu, TableMetadataManagerSnafu, TableNotFoundSnafu,
UnrecognizedTableOptionSnafu,
InvalidPartitionColumnsSnafu, InvalidPartitionRuleSnafu, InvalidTableNameSnafu,
ParseSqlValueSnafu, Result, SchemaNotFoundSnafu, TableAlreadyExistsSnafu,
TableMetadataManagerSnafu, TableNotFoundSnafu, UnrecognizedTableOptionSnafu,
};
use crate::expr_factory;
use crate::statement::show::create_partitions_stmt;
@@ -125,6 +130,7 @@ impl StatementExecutor {
self.create_table_inner(create_expr, None, &ctx).await
}
#[tracing::instrument(skip_all)]
pub async fn create_table_inner(
&self,
create_table: &mut CreateTableExpr,
@@ -408,6 +414,7 @@ impl StatementExecutor {
self.alter_table_inner(expr).await
}
#[tracing::instrument(skip_all)]
pub async fn alter_table_inner(&self, expr: AlterExpr) -> Result<Output> {
let catalog_name = if expr.catalog_name.is_empty() {
DEFAULT_CATALOG_NAME
@@ -728,13 +735,17 @@ fn find_partition_columns(partitions: &Option<Partitions>) -> Result<Vec<String>
Ok(columns)
}
/// Parse [Partitions] into a group of partition entries.
///
/// Returns a list of [PartitionBound], each of which defines a partition.
fn find_partition_entries(
create_table: &CreateTableExpr,
partitions: &Option<Partitions>,
partition_columns: &[String],
_query_ctx: &QueryContextRef,
query_ctx: &QueryContextRef,
) -> Result<Vec<Vec<PartitionBound>>> {
let entries = if let Some(_partitions) = partitions {
let entries = if let Some(partitions) = partitions {
// extract concrete data type of partition columns
let column_defs = partition_columns
.iter()
.map(|pc| {
@@ -746,24 +757,103 @@ fn find_partition_entries(
.unwrap()
})
.collect::<Vec<_>>();
let mut column_name_and_type = Vec::with_capacity(column_defs.len());
let mut column_name_and_type = HashMap::with_capacity(column_defs.len());
for column in column_defs {
let column_name = &column.name;
let data_type = ConcreteDataType::from(
ColumnDataTypeWrapper::try_new(column.data_type, column.datatype_extension.clone())
.context(ColumnDataTypeSnafu)?,
);
column_name_and_type.push((column_name, data_type));
column_name_and_type.insert(column_name, data_type);
}
// TODO(ruihang): implement the partition value parser.
vec![vec![PartitionBound::MaxValue]]
// Transform parser expr to partition expr
let mut partition_exprs = Vec::with_capacity(partitions.exprs.len());
for partition in &partitions.exprs {
let partition_expr =
convert_one_expr(partition, &column_name_and_type, &query_ctx.timezone())?;
partition_exprs.push(vec![PartitionBound::Expr(partition_expr)]);
}
// Fall back to a single MAXVALUE bound when no partition expression is given
if partition_exprs.is_empty() {
partition_exprs.push(vec![PartitionBound::MaxValue]);
}
partition_exprs
} else {
vec![vec![PartitionBound::MaxValue]]
};
Ok(entries)
}
fn convert_one_expr(
expr: &Expr,
column_name_and_type: &HashMap<&String, ConcreteDataType>,
timezone: &Timezone,
) -> Result<PartitionExpr> {
let Expr::BinaryOp { left, op, right } = expr else {
return InvalidPartitionRuleSnafu {
reason: "partition rule must be a binary expression",
}
.fail();
};
let op =
RestrictedOp::try_from_parser(&op.clone()).with_context(|| InvalidPartitionRuleSnafu {
reason: format!("unsupported operator in partition expr {op}"),
})?;
// convert leaf node.
let (lhs, op, rhs) = match (left.as_ref(), right.as_ref()) {
(Expr::Identifier(ident), Expr::Value(value)) => {
let (column_name, data_type) = convert_identifier(ident, column_name_and_type)?;
let value = convert_value(value, data_type, timezone)?;
(Operand::Column(column_name), op, Operand::Value(value))
}
(Expr::Value(value), Expr::Identifier(ident)) => {
let (column_name, data_type) = convert_identifier(ident, column_name_and_type)?;
let value = convert_value(value, data_type, timezone)?;
(Operand::Value(value), op, Operand::Column(column_name))
}
(Expr::BinaryOp { .. }, Expr::BinaryOp { .. }) => {
// a sub-expression must be compared against another sub-expression
let lhs = convert_one_expr(left, column_name_and_type, timezone)?;
let rhs = convert_one_expr(right, column_name_and_type, timezone)?;
(Operand::Expr(lhs), op, Operand::Expr(rhs))
}
_ => {
return InvalidPartitionRuleSnafu {
reason: format!("invalid partition expr {expr}"),
}
.fail();
}
};
Ok(PartitionExpr::new(lhs, op, rhs))
}
fn convert_identifier(
ident: &Ident,
column_name_and_type: &HashMap<&String, ConcreteDataType>,
) -> Result<(String, ConcreteDataType)> {
let column_name = ident.value.clone();
let data_type = column_name_and_type
.get(&column_name)
.cloned()
.with_context(|| ColumnNotFoundSnafu { msg: &column_name })?;
Ok((column_name, data_type))
}
fn convert_value(
value: &ParserValue,
data_type: ConcreteDataType,
timezone: &Timezone,
) -> Result<Value> {
sql_value_to_value("<NONAME>", &data_type, value, Some(timezone)).context(ParseSqlValueSnafu)
}
/// Merge table level table options with schema level table options.
fn merge_options(mut table_opts: TableOptions, schema_opts: SchemaNameValue) -> TableOptions {
table_opts.ttl = table_opts.ttl.or(schema_opts.ttl);
table_opts
@@ -819,10 +909,10 @@ mod test {
(
r"
CREATE TABLE rcx ( a INT, b STRING, c TIMESTAMP, TIME INDEX (c) )
PARTITION BY RANGE COLUMNS (b) (
PARTITION r0 VALUES LESS THAN ('hz'),
PARTITION r1 VALUES LESS THAN ('sh'),
PARTITION r2 VALUES LESS THAN (MAXVALUE),
PARTITION ON COLUMNS (b) (
b < 'hz',
b >= 'hz' AND b < 'sh',
b >= 'sh'
)
ENGINE=mito",
r#"[{"column_list":["b"],"value_list":["{\"Value\":{\"String\":\"hz\"}}"]},{"column_list":["b"],"value_list":["{\"Value\":{\"String\":\"sh\"}}"]},{"column_list":["b"],"value_list":["\"MaxValue\""]}]"#,
@@ -832,8 +922,9 @@ ENGINE=mito",
CREATE TABLE rcx ( a INT, b STRING, c TIMESTAMP, TIME INDEX (c) )
PARTITION BY RANGE COLUMNS (b, a) (
PARTITION r0 VALUES LESS THAN ('hz', 10),
PARTITION r1 VALUES LESS THAN ('sh', 20),
PARTITION r2 VALUES LESS THAN (MAXVALUE, MAXVALUE),
b < 'hz' AND a < 10,
b >= 'hz' AND b < 'sh' AND a >= 10 AND a < 20,
b >= 'sh' AND a >= 20
)
ENGINE=mito",
r#"[{"column_list":["b","a"],"value_list":["{\"Value\":{\"String\":\"hz\"}}","{\"Value\":{\"Int32\":10}}"]},{"column_list":["b","a"],"value_list":["{\"Value\":{\"String\":\"sh\"}}","{\"Value\":{\"Int32\":20}}"]},{"column_list":["b","a"],"value_list":["\"MaxValue\"","\"MaxValue\""]}]"#,


@@ -16,6 +16,7 @@ use common_meta::table_name::TableName;
use common_query::Output;
use common_telemetry::tracing;
use partition::manager::PartitionInfo;
use partition::partition::PartitionBound;
use session::context::QueryContextRef;
use snafu::ResultExt;
use sql::ast::Ident;
@@ -33,7 +34,7 @@ impl StatementExecutor {
stmt: ShowDatabases,
query_ctx: QueryContextRef,
) -> Result<Output> {
query::sql::show_databases(stmt, self.catalog_manager.clone(), query_ctx)
query::sql::show_databases(stmt, &self.query_engine, &self.catalog_manager, query_ctx)
.await
.context(ExecuteStatementSnafu)
}
@@ -44,7 +45,7 @@ impl StatementExecutor {
stmt: ShowTables,
query_ctx: QueryContextRef,
) -> Result<Output> {
query::sql::show_tables(stmt, self.catalog_manager.clone(), query_ctx)
query::sql::show_tables(stmt, &self.query_engine, &self.catalog_manager, query_ctx)
.await
.context(ExecuteStatementSnafu)
}
@@ -88,10 +89,22 @@ pub(crate) fn create_partitions_stmt(partitions: Vec<PartitionInfo>) -> Result<O
.map(|name| name[..].into())
.collect();
// TODO(ruihang): convert partition info back to partition expr
let exprs = partitions
.iter()
.filter_map(|partition| {
partition
.partition
.partition_bounds()
.first()
.and_then(|bound| {
if let PartitionBound::Expr(expr) = bound {
Some(expr.to_parser_expr())
} else {
None
}
})
})
.collect();
Ok(Some(Partitions {
column_list,
exprs: vec![],
}))
Ok(Some(Partitions { column_list, exprs }))
}


@@ -29,6 +29,7 @@ use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, SchemaBuilder};
use meta_client::client::MetaClient;
use partition::columns::RangeColumnsPartitionRule;
use partition::expr::{Operand, PartitionExpr, RestrictedOp};
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
use partition::partition::{PartitionBound, PartitionDef};
use partition::range::RangePartitionRule;
@@ -116,7 +117,11 @@ pub(crate) async fn create_partition_rule_manager(
partition: Some(
PartitionDef::new(
vec!["a".to_string()],
vec![PartitionBound::Value(10_i32.into())],
vec![PartitionBound::Expr(PartitionExpr::new(
Operand::Column("a".to_string()),
RestrictedOp::Lt,
Operand::Value(datatypes::value::Value::Int32(10)),
))],
)
.try_into()
.unwrap(),
@@ -135,7 +140,19 @@ pub(crate) async fn create_partition_rule_manager(
partition: Some(
PartitionDef::new(
vec!["a".to_string()],
vec![PartitionBound::Value(50_i32.into())],
vec![PartitionBound::Expr(PartitionExpr::new(
Operand::Expr(PartitionExpr::new(
Operand::Column("a".to_string()),
RestrictedOp::GtEq,
Operand::Value(datatypes::value::Value::Int32(10)),
)),
RestrictedOp::And,
Operand::Expr(PartitionExpr::new(
Operand::Column("a".to_string()),
RestrictedOp::Lt,
Operand::Value(datatypes::value::Value::Int32(50)),
)),
))],
)
.try_into()
.unwrap(),
@@ -154,7 +171,11 @@ pub(crate) async fn create_partition_rule_manager(
partition: Some(
PartitionDef::new(
vec!["a".to_string()],
vec![PartitionBound::MaxValue],
vec![PartitionBound::Expr(PartitionExpr::new(
Operand::Column("a".to_string()),
RestrictedOp::GtEq,
Operand::Value(datatypes::value::Value::Int32(50)),
))],
)
.try_into()
.unwrap(),
@@ -172,83 +193,11 @@ pub(crate) async fn create_partition_rule_manager(
.await
.unwrap();
table_metadata_manager
.create_table_metadata(
new_test_table_info(2, "table_2", regions.clone().into_iter()).into(),
TableRouteValue::physical(vec![
RegionRoute {
region: Region {
id: 1.into(),
name: "r1".to_string(),
partition: Some(
PartitionDef::new(
vec!["a".to_string(), "b".to_string()],
vec![
PartitionBound::Value(10_i32.into()),
PartitionBound::Value("hz".into()),
],
)
.try_into()
.unwrap(),
),
attrs: BTreeMap::new(),
},
leader_peer: None,
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region {
id: 2.into(),
name: "r2".to_string(),
partition: Some(
PartitionDef::new(
vec!["a".to_string(), "b".to_string()],
vec![
PartitionBound::Value(50_i32.into()),
PartitionBound::Value("sh".into()),
],
)
.try_into()
.unwrap(),
),
attrs: BTreeMap::new(),
},
leader_peer: None,
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
RegionRoute {
region: Region {
id: 3.into(),
name: "r3".to_string(),
partition: Some(
PartitionDef::new(
vec!["a".to_string(), "b".to_string()],
vec![PartitionBound::MaxValue, PartitionBound::MaxValue],
)
.try_into()
.unwrap(),
),
attrs: BTreeMap::new(),
},
leader_peer: None,
follower_peers: vec![],
leader_status: None,
leader_down_since: None,
},
]),
region_wal_options,
)
.await
.unwrap();
partition_manager
}
#[tokio::test(flavor = "multi_thread")]
#[ignore = "TODO(ruihang, weny): WIP new partition rule"]
async fn test_find_partition_rule() {
let partition_manager =
create_partition_rule_manager(Arc::new(MemoryKvBackend::default())).await;


@@ -28,5 +28,7 @@ prometheus.workspace = true
serde.workspace = true
serde_json.workspace = true
snafu.workspace = true
sql.workspace = true
sqlparser.workspace = true
store-api.workspace = true
table.workspace = true

src/partition/src/expr.rs (new file, 117 lines)

@@ -0,0 +1,117 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use sql::statements::value_to_sql_value;
use sqlparser::ast::{BinaryOperator as ParserBinaryOperator, Expr as ParserExpr, Ident};
/// Struct for a partition expression. It can be converted back to sqlparser's [Expr]
/// by [`Self::to_parser_expr`].
///
/// [Expr]: sqlparser::ast::Expr
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct PartitionExpr {
pub(crate) lhs: Box<Operand>,
pub(crate) op: RestrictedOp,
pub(crate) rhs: Box<Operand>,
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Operand {
Column(String),
Value(Value),
Expr(PartitionExpr),
}
/// A restricted set of [Operator](datafusion_expr::Operator) that can be used in
/// partition expressions.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum RestrictedOp {
// Comparisons (evaluate to a boolean)
Eq,
NotEq,
Lt,
LtEq,
Gt,
GtEq,
// Conjunction
And,
Or,
}
impl RestrictedOp {
pub fn try_from_parser(op: &ParserBinaryOperator) -> Option<Self> {
match op {
ParserBinaryOperator::Eq => Some(Self::Eq),
ParserBinaryOperator::NotEq => Some(Self::NotEq),
ParserBinaryOperator::Lt => Some(Self::Lt),
ParserBinaryOperator::LtEq => Some(Self::LtEq),
ParserBinaryOperator::Gt => Some(Self::Gt),
ParserBinaryOperator::GtEq => Some(Self::GtEq),
ParserBinaryOperator::And => Some(Self::And),
ParserBinaryOperator::Or => Some(Self::Or),
_ => None,
}
}
pub fn to_parser_op(&self) -> ParserBinaryOperator {
match self {
Self::Eq => ParserBinaryOperator::Eq,
Self::NotEq => ParserBinaryOperator::NotEq,
Self::Lt => ParserBinaryOperator::Lt,
Self::LtEq => ParserBinaryOperator::LtEq,
Self::Gt => ParserBinaryOperator::Gt,
Self::GtEq => ParserBinaryOperator::GtEq,
Self::And => ParserBinaryOperator::And,
Self::Or => ParserBinaryOperator::Or,
}
}
}
impl PartitionExpr {
pub fn new(lhs: Operand, op: RestrictedOp, rhs: Operand) -> Self {
Self {
lhs: Box::new(lhs),
op,
rhs: Box::new(rhs),
}
}
/// Convert [Self] back to sqlparser's [Expr]
///
/// [Expr]: ParserExpr
pub fn to_parser_expr(&self) -> ParserExpr {
// Safety: a partition rule won't contain unsupported value types;
// such values would have been rejected by the parser earlier.
let lhs = match &*self.lhs {
Operand::Column(c) => ParserExpr::Identifier(Ident::new(c.clone())),
Operand::Value(v) => ParserExpr::Value(value_to_sql_value(v).unwrap()),
Operand::Expr(e) => e.to_parser_expr(),
};
let rhs = match &*self.rhs {
Operand::Column(c) => ParserExpr::Identifier(Ident::new(c.clone())),
Operand::Value(v) => ParserExpr::Value(value_to_sql_value(v).unwrap()),
Operand::Expr(e) => e.to_parser_expr(),
};
ParserExpr::BinaryOp {
left: Box::new(lhs),
op: self.op.to_parser_op(),
right: Box::new(rhs),
}
}
}

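To see the new expression type in isolation, the sketch below builds the bound b < 'hz' from the partition examples above and round-trips it through to_parser_expr, which is the conversion the SHOW CREATE TABLE path relies on. Crate paths follow this diff; treat it as an illustrative sketch rather than a tested snippet.

use datatypes::value::Value;
use partition::expr::{Operand, PartitionExpr, RestrictedOp};

fn main() {
    // PARTITION ON COLUMNS (b) (b < 'hz', ...)
    let expr = PartitionExpr::new(
        Operand::Column("b".to_string()),
        RestrictedOp::Lt,
        Operand::Value(Value::String("hz".into())),
    );

    // Convert back to sqlparser's Expr, e.g. when rendering SHOW CREATE TABLE.
    let parser_expr = expr.to_parser_expr();
    println!("{parser_expr}"); // prints something like: b < 'hz'
}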

@@ -16,8 +16,10 @@
pub mod columns;
pub mod error;
pub mod expr;
pub mod manager;
pub mod metrics;
pub mod multi_dim;
pub mod partition;
pub mod range;
pub mod splitter;


@@ -30,6 +30,7 @@ use table::metadata::TableId;
use crate::columns::RangeColumnsPartitionRule;
use crate::error::{FindLeaderSnafu, Result};
use crate::multi_dim::MultiDimPartitionRule;
use crate::partition::{PartitionBound, PartitionDef, PartitionExpr};
use crate::range::RangePartitionRule;
use crate::splitter::RowSplitter;
@@ -122,12 +123,41 @@ impl PartitionRuleManager {
Ok(results)
}
/// Get partition rule of given table.
pub async fn find_table_partition_rule(&self, table_id: TableId) -> Result<PartitionRuleRef> {
let partitions = self.find_table_partitions(table_id).await?;
let partition_columns = partitions[0].partition.partition_columns();
let regions = partitions
.iter()
.map(|x| x.id.region_number())
.collect::<Vec<RegionNumber>>();
let exprs = partitions
.iter()
.filter_map(|x| match &x.partition.partition_bounds()[0] {
PartitionBound::Expr(e) => Some(e.clone()),
_ => None,
})
.collect::<Vec<_>>();
Ok(Arc::new(MultiDimPartitionRule::new(
partition_columns.clone(),
regions,
exprs,
)) as _)
}
/// Get partition rule of given table.
pub async fn find_table_partition_rule_deprecated(
&self,
table_id: TableId,
) -> Result<PartitionRuleRef> {
let partitions = self.find_table_partitions(table_id).await?;
debug_assert!(!partitions.is_empty());
let partition_columns = partitions[0].partition.partition_columns();
let regions = partitions
.iter()
.map(|x| x.id.region_number())
@@ -142,6 +172,7 @@ impl PartitionRuleManager {
.filter_map(|info| match &info.partition.partition_bounds()[0] {
PartitionBound::Value(v) => Some(v.clone()),
PartitionBound::MaxValue => None,
PartitionBound::Expr(_) => None,
})
.collect::<Vec<Value>>();
Arc::new(RangePartitionRule::new(
@@ -266,10 +297,15 @@ fn create_partitions_from_region_routes(
fn find_regions0(partition_rule: PartitionRuleRef, filter: &Expr) -> Result<HashSet<RegionNumber>> {
let expr = filter.df_expr();
match expr {
DfExpr::BinaryExpr(BinaryExpr { left, op, right }) if is_compare_op(op) => {
DfExpr::BinaryExpr(BinaryExpr { left, op, right }) if op.is_comparison_operator() => {
let column_op_value = match (left.as_ref(), right.as_ref()) {
(DfExpr::Column(c), DfExpr::Literal(v)) => Some((&c.name, *op, v)),
(DfExpr::Literal(v), DfExpr::Column(c)) => Some((&c.name, reverse_operator(op), v)),
(DfExpr::Literal(v), DfExpr::Column(c)) => Some((
&c.name,
// Safety: previous branch ensures this is a comparison operator
op.swap().unwrap(),
v,
)),
_ => None,
};
if let Some((column, op, scalar)) = column_op_value {
@@ -311,27 +347,3 @@ fn find_regions0(partition_rule: PartitionRuleRef, filter: &Expr) -> Result<Hash
.into_iter()
.collect::<HashSet<RegionNumber>>())
}
#[inline]
fn is_compare_op(op: &Operator) -> bool {
matches!(
*op,
Operator::Eq
| Operator::NotEq
| Operator::Lt
| Operator::LtEq
| Operator::Gt
| Operator::GtEq
)
}
#[inline]
fn reverse_operator(op: &Operator) -> Operator {
match *op {
Operator::Lt => Operator::Gt,
Operator::Gt => Operator::Lt,
Operator::LtEq => Operator::GtEq,
Operator::GtEq => Operator::LtEq,
_ => *op,
}
}

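The two helpers deleted at the end of this file are covered by DataFusion's own Operator utilities, which the diff now calls directly. A small sketch of the behavior that op.swap().unwrap() relies on (standard datafusion_expr API; exact coverage may vary by version):

use datafusion_expr::Operator;

fn main() {
    // Swapping mirrors a comparison when its operands are reordered:
    // `literal < column` is then handled as `column > literal`.
    assert!(Operator::Lt.is_comparison_operator());
    assert_eq!(Operator::Lt.swap(), Some(Operator::Gt));
    assert_eq!(Operator::GtEq.swap(), Some(Operator::LtEq));
    // Operators without a comparison mirror return None; the Safety comment in
    // find_regions0 assumes the is_comparison_operator check rules those out.
}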

@@ -0,0 +1,201 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::cmp::Ordering;
use std::collections::HashMap;
use datatypes::prelude::Value;
use serde::{Deserialize, Serialize};
use snafu::ensure;
use store_api::storage::RegionNumber;
use crate::error::{self, Result};
use crate::expr::{Operand, PartitionExpr, RestrictedOp};
use crate::PartitionRule;
#[derive(Debug, Serialize, Deserialize)]
pub struct MultiDimPartitionRule {
partition_columns: Vec<String>,
// Maps a column name to its index in `partition_columns`
name_to_index: HashMap<String, usize>,
regions: Vec<RegionNumber>,
exprs: Vec<PartitionExpr>,
}
impl MultiDimPartitionRule {
pub fn new(
partition_columns: Vec<String>,
regions: Vec<RegionNumber>,
exprs: Vec<PartitionExpr>,
) -> Self {
let name_to_index = partition_columns
.iter()
.enumerate()
.map(|(i, name)| (name.clone(), i))
.collect::<HashMap<_, _>>();
Self {
partition_columns,
name_to_index,
regions,
exprs,
}
}
fn find_region(&self, values: &[Value]) -> Result<RegionNumber> {
ensure!(
values.len() == self.partition_columns.len(),
error::RegionKeysSizeSnafu {
expect: self.partition_columns.len(),
actual: values.len(),
}
);
for (region_index, expr) in self.exprs.iter().enumerate() {
if self.evaluate_expr(expr, values)? {
return Ok(self.regions[region_index]);
}
}
// return the default region number
Ok(0)
}
fn evaluate_expr(&self, expr: &PartitionExpr, values: &[Value]) -> Result<bool> {
match (expr.lhs.as_ref(), expr.rhs.as_ref()) {
(Operand::Column(name), Operand::Value(r)) => {
let index = self.name_to_index.get(name).unwrap();
let l = &values[*index];
Self::perform_op(l, &expr.op, r)
}
(Operand::Value(l), Operand::Column(name)) => {
let index = self.name_to_index.get(name).unwrap();
let r = &values[*index];
Self::perform_op(l, &expr.op, r)
}
(Operand::Expr(lhs), Operand::Expr(rhs)) => {
let lhs = self.evaluate_expr(lhs, values)?;
let rhs = self.evaluate_expr(rhs, values)?;
match expr.op {
RestrictedOp::And => Ok(lhs && rhs),
RestrictedOp::Or => Ok(lhs || rhs),
_ => unreachable!(),
}
}
_ => unreachable!(),
}
}
fn perform_op(lhs: &Value, op: &RestrictedOp, rhs: &Value) -> Result<bool> {
let result = match op {
RestrictedOp::Eq => lhs.eq(rhs),
RestrictedOp::NotEq => lhs.ne(rhs),
RestrictedOp::Lt => lhs.partial_cmp(rhs) == Some(Ordering::Less),
RestrictedOp::LtEq => {
let result = lhs.partial_cmp(rhs);
result == Some(Ordering::Less) || result == Some(Ordering::Equal)
}
RestrictedOp::Gt => lhs.partial_cmp(rhs) == Some(Ordering::Greater),
RestrictedOp::GtEq => {
let result = lhs.partial_cmp(rhs);
result == Some(Ordering::Greater) || result == Some(Ordering::Equal)
}
RestrictedOp::And | RestrictedOp::Or => unreachable!(),
};
Ok(result)
}
}
impl PartitionRule for MultiDimPartitionRule {
fn as_any(&self) -> &dyn Any {
self
}
fn partition_columns(&self) -> Vec<String> {
self.partition_columns.clone()
}
fn find_region(&self, values: &[Value]) -> Result<RegionNumber> {
self.find_region(values)
}
fn find_regions_by_exprs(
&self,
_exprs: &[crate::partition::PartitionExpr],
) -> Result<Vec<RegionNumber>> {
Ok(self.regions.clone())
}
}
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use super::*;
use crate::error;
#[test]
fn test_find_region() {
// PARTITION ON COLUMNS (b) (
// b < 'hz',
// b >= 'hz' AND b < 'sh',
// b >= 'sh'
// )
let rule = MultiDimPartitionRule::new(
vec!["b".to_string()],
vec![1, 2, 3],
vec![
PartitionExpr::new(
Operand::Column("b".to_string()),
RestrictedOp::Lt,
Operand::Value(datatypes::value::Value::String("hz".into())),
),
PartitionExpr::new(
Operand::Expr(PartitionExpr::new(
Operand::Column("b".to_string()),
RestrictedOp::GtEq,
Operand::Value(datatypes::value::Value::String("hz".into())),
)),
RestrictedOp::And,
Operand::Expr(PartitionExpr::new(
Operand::Column("b".to_string()),
RestrictedOp::Lt,
Operand::Value(datatypes::value::Value::String("sh".into())),
)),
),
PartitionExpr::new(
Operand::Column("b".to_string()),
RestrictedOp::GtEq,
Operand::Value(datatypes::value::Value::String("sh".into())),
),
],
);
assert_matches!(
rule.find_region(&["foo".into(), 1000_i32.into()]),
Err(error::Error::RegionKeysSize {
expect: 1,
actual: 2,
..
})
);
assert_matches!(rule.find_region(&["foo".into()]), Ok(1));
assert_matches!(rule.find_region(&["bar".into()]), Ok(1));
assert_matches!(rule.find_region(&["hz".into()]), Ok(2));
assert_matches!(rule.find_region(&["hzz".into()]), Ok(2));
assert_matches!(rule.find_region(&["sh".into()]), Ok(3));
assert_matches!(rule.find_region(&["zzzz".into()]), Ok(3));
}
}


@@ -49,6 +49,7 @@ pub trait PartitionRule: Sync + Send {
pub enum PartitionBound {
Value(Value),
MaxValue,
Expr(crate::expr::PartitionExpr),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -62,6 +63,7 @@ impl Display for PartitionBound {
match self {
Self::Value(v) => write!(f, "{}", v),
Self::MaxValue => write!(f, "MAXVALUE"),
Self::Expr(e) => write!(f, "{:?}", e),
}
}
}


@@ -17,6 +17,9 @@ use prometheus::*;
lazy_static! {
/// Counter for the number of series processed per query.
pub static ref PROMQL_SERIES_COUNT: Histogram =
register_histogram!("greptime_promql_series_count", "promql series count").unwrap();
pub static ref PROMQL_SERIES_COUNT: Histogram = register_histogram!(
"greptime_promql_series_count",
"promql series count",
exponential_buckets(10.0, 10.0, 8).unwrap(),
).unwrap();
}


@@ -30,35 +30,30 @@ use common_query::physical_plan::{DfPhysicalPlanAdapter, PhysicalPlan, PhysicalP
use common_query::prelude::ScalarUdf;
use common_query::Output;
use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::{
EmptyRecordBatchStream, RecordBatch, RecordBatches, SendableRecordBatchStream,
};
use common_recordbatch::{EmptyRecordBatchStream, SendableRecordBatchStream};
use common_telemetry::tracing;
use datafusion::common::Column;
use datafusion::physical_plan::analyze::AnalyzeExec;
use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::SessionContext;
use datafusion_common::{ResolvedTableReference, ScalarValue};
use datafusion_expr::{DmlStatement, Expr as DfExpr, LogicalPlan as DfLogicalPlan, WriteOp};
use datafusion_common::ResolvedTableReference;
use datafusion_expr::{DmlStatement, LogicalPlan as DfLogicalPlan, WriteOp};
use datatypes::prelude::VectorRef;
use datatypes::schema::Schema;
use futures_util::StreamExt;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use sql::ast::{BinaryOperator, Expr, Value};
use table::requests::{DeleteRequest, InsertRequest};
use table::TableRef;
use crate::dataframe::DataFrame;
pub use crate::datafusion::planner::DfContextProviderAdapter;
use crate::error::{
CatalogSnafu, CreateRecordBatchSnafu, CreateSchemaSnafu, DataFusionSnafu,
MissingTableMutationHandlerSnafu, MissingTimestampColumnSnafu, QueryExecutionSnafu, Result,
TableMutationSnafu, TableNotFoundSnafu, UnimplementedSnafu, UnsupportedExprSnafu,
CatalogSnafu, CreateRecordBatchSnafu, DataFusionSnafu, MissingTableMutationHandlerSnafu,
MissingTimestampColumnSnafu, QueryExecutionSnafu, Result, TableMutationSnafu,
TableNotFoundSnafu, UnsupportedExprSnafu,
};
use crate::executor::QueryExecutor;
use crate::logical_optimizer::LogicalOptimizer;
use crate::metrics::{OnDone, QUERY_STAGE_ELAPSED};
use crate::physical_optimizer::PhysicalOptimizer;
use crate::physical_planner::PhysicalPlanner;
use crate::physical_wrapper::PhysicalPlanWrapperRef;
@@ -114,6 +109,10 @@ impl DatafusionQueryEngine {
}
);
let _timer = QUERY_STAGE_ELAPSED
.with_label_values(&[dml.op.name()])
.start_timer();
let default_catalog = &query_ctx.current_catalog().to_owned();
let default_schema = &query_ctx.current_schema().to_owned();
let table_name = dml.table_name.resolve(default_catalog, default_schema);
@@ -308,7 +307,7 @@ impl QueryEngine for DatafusionQueryEngine {
impl LogicalOptimizer for DatafusionQueryEngine {
#[tracing::instrument(skip_all)]
fn optimize(&self, context: &QueryEngineContext, plan: &LogicalPlan) -> Result<LogicalPlan> {
let _timer = metrics::METRIC_OPTIMIZE_LOGICAL_ELAPSED.start_timer();
let _timer = metrics::OPTIMIZE_LOGICAL_ELAPSED.start_timer();
match plan {
LogicalPlan::DfPlan(df_plan) => {
// Optimized by extension rules
@@ -342,7 +341,7 @@ impl PhysicalPlanner for DatafusionQueryEngine {
ctx: &mut QueryEngineContext,
logical_plan: &LogicalPlan,
) -> Result<Arc<dyn PhysicalPlan>> {
let _timer = metrics::METRIC_CREATE_PHYSICAL_ELAPSED.start_timer();
let _timer = metrics::CREATE_PHYSICAL_ELAPSED.start_timer();
match logical_plan {
LogicalPlan::DfPlan(df_plan) => {
let state = ctx.state();
@@ -376,7 +375,7 @@ impl PhysicalOptimizer for DatafusionQueryEngine {
ctx: &mut QueryEngineContext,
plan: Arc<dyn PhysicalPlan>,
) -> Result<Arc<dyn PhysicalPlan>> {
let _timer = metrics::METRIC_OPTIMIZE_PHYSICAL_ELAPSED.start_timer();
let _timer = metrics::OPTIMIZE_PHYSICAL_ELAPSED.start_timer();
let state = ctx.state();
let config = state.config_options();
@@ -424,16 +423,22 @@ impl QueryExecutor for DatafusionQueryEngine {
ctx: &QueryEngineContext,
plan: &Arc<dyn PhysicalPlan>,
) -> Result<SendableRecordBatchStream> {
let _timer = metrics::METRIC_EXEC_PLAN_ELAPSED.start_timer();
let exec_timer = metrics::EXEC_PLAN_ELAPSED.start_timer();
let task_ctx = ctx.build_task_ctx();
match plan.output_partitioning().partition_count() {
0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))),
1 => Ok(plan
.execute(0, task_ctx)
.context(error::ExecutePhysicalPlanSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu))?,
1 => {
let stream = plan
.execute(0, task_ctx)
.context(error::ExecutePhysicalPlanSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let stream = OnDone::new(stream, move || {
exec_timer.observe_duration();
});
Ok(Box::pin(stream))
}
_ => {
let df_plan = Arc::new(DfPhysicalPlanAdapter(plan.clone()));
// merge into a single partition
@@ -450,84 +455,15 @@ impl QueryExecutor for DatafusionQueryEngine {
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
stream.set_metrics2(df_plan);
let stream = OnDone::new(Box::pin(stream), move || {
exec_timer.observe_duration();
});
Ok(Box::pin(stream))
}
}
}
}
fn convert_filter_to_df_filter(filter: Expr) -> Result<DfExpr> {
match filter {
Expr::BinaryOp { left, op, right } => {
let left = convert_filter_to_df_filter(*left)?;
let right = convert_filter_to_df_filter(*right)?;
match op {
BinaryOperator::Eq => Ok(left.eq(right)),
_ => UnimplementedSnafu {
operation: format!("convert BinaryOperator into datafusion Expr, op: {op}"),
}
.fail(),
}
}
Expr::Value(value) => match value {
Value::SingleQuotedString(v) => Ok(DfExpr::Literal(ScalarValue::Utf8(Some(v)))),
_ => UnimplementedSnafu {
operation: format!("convert Expr::Value into datafusion Expr, value: {value}"),
}
.fail(),
},
Expr::Identifier(ident) => Ok(DfExpr::Column(Column::from_name(ident.value))),
_ => UnimplementedSnafu {
operation: format!("convert Expr into datafusion Expr, Expr: {filter}"),
}
.fail(),
}
}
/// Creates a table in memory and executes a show statement on the table.
pub async fn execute_show_with_filter(
record_batch: RecordBatch,
filter: Option<Expr>,
) -> Result<Output> {
let table_name = "table_name";
let column_schemas = record_batch.schema.column_schemas().to_vec();
let context = SessionContext::new();
context
.register_batch(table_name, record_batch.into_df_record_batch())
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let mut dataframe = context
.sql(&format!("SELECT * FROM {table_name}"))
.await
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
if let Some(filter) = filter {
let filter = convert_filter_to_df_filter(filter)?;
dataframe = dataframe
.filter(filter)
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?
}
let df_batches = dataframe
.collect()
.await
.context(error::DatafusionSnafu)
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
let mut batches = Vec::with_capacity(df_batches.len());
let schema = Arc::new(Schema::try_new(column_schemas).context(CreateSchemaSnafu)?);
for df_batch in df_batches.into_iter() {
let batch = RecordBatch::try_from_df_record_batch(schema.clone(), df_batch)
.context(CreateRecordBatchSnafu)?;
batches.push(batch);
}
let record_batches = RecordBatches::try_new(schema, batches).context(CreateRecordBatchSnafu)?;
Ok(Output::RecordBatches(record_batches))
}
#[cfg(test)]
mod tests {
use std::borrow::Cow::Borrowed;
@@ -536,17 +472,12 @@ mod tests {
use catalog::RegisterTableRequest;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID};
use common_query::Output;
use common_recordbatch::{util, RecordBatch};
use common_recordbatch::util;
use datafusion::prelude::{col, lit};
use datatypes::prelude::{ConcreteDataType, MutableVector, ScalarVectorBuilder};
use datatypes::schema::{ColumnSchema, Schema};
use datatypes::types::StringType;
use datatypes::vectors::{Helper, StringVectorBuilder, UInt32Vector, UInt64Vector, VectorRef};
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use datatypes::vectors::{Helper, UInt32Vector, UInt64Vector, VectorRef};
use session::context::QueryContext;
use sql::dialect::GreptimeDbDialect;
use sql::parser::{ParseOptions, ParserContext};
use sql::statements::show::{ShowKind, ShowTables};
use sql::statements::statement::Statement;
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
use super::*;
@@ -691,71 +622,4 @@ mod tests {
);
assert_eq!("Limit: skip=0, fetch=20\n Aggregate: groupBy=[[]], aggr=[[SUM(CAST(numbers.number AS UInt64))]]\n TableScan: numbers projection=[number]", format!("{}", logical_plan.display_indent()));
}
#[tokio::test]
async fn test_show_tables() {
// No filter
let column_schemas = vec![ColumnSchema::new(
"Tables",
ConcreteDataType::String(StringType),
false,
)];
let schema = Arc::new(Schema::new(column_schemas));
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("monitor"));
builder.push(Some("system_metrics"));
let columns = vec![builder.to_vector()];
let record_batch = RecordBatch::new(schema, columns).unwrap();
let output = execute_show_with_filter(record_batch, None).await.unwrap();
let Output::RecordBatches(record_batches) = output else {
unreachable!()
};
let expected = "\
+----------------+
| Tables |
+----------------+
| monitor |
| system_metrics |
+----------------+";
assert_eq!(record_batches.pretty_print().unwrap(), expected);
// Filter
let column_schemas = vec![ColumnSchema::new(
"Tables",
ConcreteDataType::String(StringType),
false,
)];
let schema = Arc::new(Schema::new(column_schemas));
let mut builder = StringVectorBuilder::with_capacity(3);
builder.push(Some("monitor"));
builder.push(Some("system_metrics"));
let columns = vec![builder.to_vector()];
let record_batch = RecordBatch::new(schema, columns).unwrap();
let statement = ParserContext::create_with_dialect(
"SHOW TABLES WHERE \"Tables\"='monitor'",
&GreptimeDbDialect {},
ParseOptions::default(),
)
.unwrap()[0]
.clone();
let Statement::ShowTables(ShowTables { kind, .. }) = statement else {
unreachable!()
};
let ShowKind::Where(filter) = kind else {
unreachable!()
};
let output = execute_show_with_filter(record_batch, Some(filter))
.await
.unwrap();
let Output::RecordBatches(record_batches) = output else {
unreachable!()
};
let expected = "\
+---------+
| Tables |
+---------+
| monitor |
+---------+";
assert_eq!(record_batches.pretty_print().unwrap(), expected);
}
}


@@ -49,12 +49,16 @@ impl DfContextProviderAdapter {
pub(crate) async fn try_new(
engine_state: Arc<QueryEngineState>,
session_state: SessionState,
df_stmt: &DfStatement,
df_stmt: Option<&DfStatement>,
query_ctx: QueryContextRef,
) -> Result<Self> {
let table_names = session_state
.resolve_table_references(df_stmt)
.context(DataFusionSnafu)?;
let table_names = if let Some(df_stmt) = df_stmt {
session_state
.resolve_table_references(df_stmt)
.context(DataFusionSnafu)?
} else {
vec![]
};
let mut table_provider = DfTableSourceProvider::new(
engine_state.catalog_manager().clone(),


@@ -47,9 +47,7 @@ use store_api::storage::RegionId;
use tokio::time::Instant;
use crate::error::ConvertSchemaSnafu;
use crate::metrics::{
METRIC_MERGE_SCAN_ERRORS_TOTAL, METRIC_MERGE_SCAN_POLL_ELAPSED, METRIC_MERGE_SCAN_REGIONS,
};
use crate::metrics::{MERGE_SCAN_ERRORS_TOTAL, MERGE_SCAN_POLL_ELAPSED, MERGE_SCAN_REGIONS};
use crate::region_query::RegionQueryHandlerRef;
#[derive(Debug, Hash, PartialEq, Eq, Clone)]
@@ -170,7 +168,7 @@ impl MergeScanExec {
let tracing_context = TracingContext::from_json(context.session_id().as_str());
let stream = Box::pin(stream!({
METRIC_MERGE_SCAN_REGIONS.observe(regions.len() as f64);
MERGE_SCAN_REGIONS.observe(regions.len() as f64);
let _finish_timer = metric.finish_time().timer();
let mut ready_timer = metric.ready_time().timer();
let mut first_consume_timer = Some(metric.first_consume_time().timer());
@@ -188,7 +186,7 @@ impl MergeScanExec {
.do_get(request)
.await
.map_err(|e| {
METRIC_MERGE_SCAN_ERRORS_TOTAL.inc();
MERGE_SCAN_ERRORS_TOTAL.inc();
BoxedError::new(e)
})
.context(ExternalSnafu)?;
@@ -227,7 +225,7 @@ impl MergeScanExec {
metric.record_greptime_exec_cost(value as usize);
}
METRIC_MERGE_SCAN_POLL_ELAPSED.observe(poll_duration.as_secs_f64());
MERGE_SCAN_POLL_ELAPSED.observe(poll_duration.as_secs_f64());
}
}));


@@ -54,24 +54,12 @@ pub enum Error {
#[snafu(display("Table not found: {}", table))]
TableNotFound { table: String, location: Location },
#[snafu(display("Failed to do vector computation"))]
VectorComputation {
source: datatypes::error::Error,
location: Location,
},
#[snafu(display("Failed to create RecordBatch"))]
CreateRecordBatch {
source: common_recordbatch::error::Error,
location: Location,
},
#[snafu(display("Failed to create Schema"))]
CreateSchema {
source: datatypes::error::Error,
location: Location,
},
#[snafu(display("Failure during query execution"))]
QueryExecution {
source: BoxedError,
@@ -291,9 +279,7 @@ impl ErrorExt for Error {
QueryAccessDenied { .. } => StatusCode::AccessDenied,
Catalog { source, .. } => source.status_code(),
VectorComputation { source, .. } | ConvertDatafusionSchema { source, .. } => {
source.status_code()
}
ConvertDatafusionSchema { source, .. } => source.status_code(),
CreateRecordBatch { source, .. } => source.status_code(),
QueryExecution { source, .. } | QueryPlan { source, .. } => source.status_code(),
DataFusion { error, .. } => match error {
@@ -306,7 +292,6 @@ impl ErrorExt for Error {
Sql { source, .. } => source.status_code(),
PlanSql { .. } => StatusCode::PlanQuery,
ConvertSqlType { source, .. } | ConvertSqlValue { source, .. } => source.status_code(),
CreateSchema { source, .. } => source.status_code(),
RegionQuery { source, .. } => source.status_code(),
TableMutation { source, .. } => source.status_code(),


@@ -21,7 +21,7 @@ pub mod dist_plan;
pub mod error;
pub mod executor;
pub mod logical_optimizer;
mod metrics;
pub mod metrics;
mod optimizer;
pub mod parser;
pub mod physical_optimizer;


@@ -12,53 +12,90 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::pin::Pin;
use std::task::{Context, Poll};
use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use datatypes::schema::SchemaRef;
use futures::Stream;
use futures_util::ready;
use lazy_static::lazy_static;
use prometheus::*;
lazy_static! {
pub static ref METRIC_PARSE_SQL_ELAPSED: Histogram = register_histogram!(
"greptime_query_parse_sql_elapsed",
"query parse sql elapsed"
/// Timer for the different stages of a query.
pub static ref QUERY_STAGE_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_query_stage_elapsed",
"query engine time elapsed during each stage",
&["stage"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_PARSE_PROMQL_ELAPSED: Histogram = register_histogram!(
"greptime_query_parse_promql_elapsed",
"query parse promql elapsed"
)
.unwrap();
pub static ref METRIC_OPTIMIZE_LOGICAL_ELAPSED: Histogram = register_histogram!(
"greptime_query_optimize_logicalplan_elapsed",
"query optimize logicalplan elapsed"
)
.unwrap();
pub static ref METRIC_OPTIMIZE_PHYSICAL_ELAPSED: Histogram = register_histogram!(
"greptime_query_optimize_physicalplan_elapsed",
"query optimize physicalplan elapsed"
)
.unwrap();
pub static ref METRIC_CREATE_PHYSICAL_ELAPSED: Histogram = register_histogram!(
"greptime_query_create_physicalplan_elapsed",
"query create physicalplan elapsed"
)
.unwrap();
pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram = register_histogram!(
"greptime_query_execute_plan_elapsed",
"query execute plan elapsed"
)
.unwrap();
pub static ref METRIC_MERGE_SCAN_POLL_ELAPSED: Histogram = register_histogram!(
"greptime_query_merge_scan_poll_elapsed",
"query merge scan poll elapsed"
)
.unwrap();
pub static ref METRIC_MERGE_SCAN_REGIONS: Histogram = register_histogram!(
pub static ref PARSE_SQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["parse_sql"]);
pub static ref PARSE_PROMQL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["parse_promql"]);
pub static ref OPTIMIZE_LOGICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["optimize_logicalplan"]);
pub static ref OPTIMIZE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["optimize_physicalplan"]);
pub static ref CREATE_PHYSICAL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["create_physicalplan"]);
pub static ref EXEC_PLAN_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["execute_plan"]);
pub static ref MERGE_SCAN_POLL_ELAPSED: Histogram = QUERY_STAGE_ELAPSED
.with_label_values(&["merge_scan_poll"]);
pub static ref MERGE_SCAN_REGIONS: Histogram = register_histogram!(
"greptime_query_merge_scan_regions",
"query merge scan regions"
)
.unwrap();
pub static ref METRIC_MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
pub static ref MERGE_SCAN_ERRORS_TOTAL: IntCounter = register_int_counter!(
"greptime_query_merge_scan_errors_total",
"query merge scan errors total"
)
.unwrap();
}
/// A stream wrapper that invokes a callback once the inner RecordBatch stream terminates.
pub struct OnDone<F> {
stream: SendableRecordBatchStream,
callback: Option<F>,
}
impl<F> OnDone<F> {
/// Attaches a `callback` to invoke once the `stream` is terminated.
pub fn new(stream: SendableRecordBatchStream, callback: F) -> Self {
Self {
stream,
callback: Some(callback),
}
}
}
impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
fn schema(&self) -> SchemaRef {
self.stream.schema()
}
}
impl<F: FnOnce() + Unpin> Stream for OnDone<F> {
type Item = common_recordbatch::error::Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match ready!(Pin::new(&mut self.stream).poll_next(cx)) {
Some(rb) => Poll::Ready(Some(rb)),
None => {
if let Some(callback) = self.callback.take() {
callback();
}
Poll::Ready(None)
}
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.stream.size_hint()
}
}

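A minimal sketch of how OnDone is meant to be wired up, mirroring the executor change earlier in this set. The wrapper function name is made up for illustration, the paths assume query::metrics is public as in the lib.rs change above, and the empty schema exists only to keep the sketch self-contained.

use std::sync::Arc;

use common_recordbatch::{EmptyRecordBatchStream, SendableRecordBatchStream};
use datatypes::schema::Schema;
use query::metrics::{OnDone, EXEC_PLAN_ELAPSED};

// Attach the "execute_plan" timer to the stream so the observed duration covers
// draining the results, not just the call that creates the stream.
fn wrap_with_exec_timer(stream: SendableRecordBatchStream) -> SendableRecordBatchStream {
    let exec_timer = EXEC_PLAN_ELAPSED.start_timer();
    Box::pin(OnDone::new(stream, move || {
        exec_timer.observe_duration();
    }))
}

fn main() {
    let schema = Arc::new(Schema::new(vec![]));
    let stream: SendableRecordBatchStream = Box::pin(EmptyRecordBatchStream::new(schema));
    let _wrapped = wrap_with_exec_timer(stream);
    // Polling `_wrapped` to completion fires the callback and observes the timer.
}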

@@ -20,6 +20,7 @@ use std::time::{Duration, SystemTime};
use chrono::DateTime;
use common_error::ext::{BoxedError, PlainError};
use common_error::status_code::StatusCode;
use common_telemetry::tracing;
use promql_parser::parser::ast::{Extension as NodeExtension, ExtensionExpr};
use promql_parser::parser::Expr::Extension;
use promql_parser::parser::{EvalStmt, Expr, ValueType};
@@ -33,7 +34,7 @@ use crate::error::{
AddSystemTimeOverflowSnafu, MultipleStatementsSnafu, ParseFloatSnafu, ParseTimestampSnafu,
QueryParseSnafu, Result, UnimplementedSnafu,
};
use crate::metrics::{METRIC_PARSE_PROMQL_ELAPSED, METRIC_PARSE_SQL_ELAPSED};
use crate::metrics::{PARSE_PROMQL_ELAPSED, PARSE_SQL_ELAPSED};
const DEFAULT_LOOKBACK: u64 = 5 * 60; // 5m
pub const DEFAULT_LOOKBACK_STRING: &str = "5m";
@@ -116,7 +117,7 @@ pub struct QueryLanguageParser {}
impl QueryLanguageParser {
/// Try to parse SQL with GreptimeDB dialect, return the statement when success.
pub fn parse_sql(sql: &str, _query_ctx: &QueryContextRef) -> Result<QueryStatement> {
let _timer = METRIC_PARSE_SQL_ELAPSED.start_timer();
let _timer = PARSE_SQL_ELAPSED.start_timer();
let mut statement =
ParserContext::create_with_dialect(sql, &GreptimeDbDialect {}, ParseOptions::default())
.map_err(BoxedError::new)
@@ -132,8 +133,9 @@ impl QueryLanguageParser {
}
/// Try to parse PromQL, return the statement when success.
#[tracing::instrument(skip_all)]
pub fn parse_promql(query: &PromQuery, _query_ctx: &QueryContextRef) -> Result<QueryStatement> {
let _timer = METRIC_PARSE_PROMQL_ELAPSED.start_timer();
let _timer = PARSE_PROMQL_ELAPSED.start_timer();
let expr = promql_parser::parser::parse(&query.query)
.map_err(|msg| BoxedError::new(PlainError::new(msg, StatusCode::InvalidArguments)))


@@ -12,18 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::sync::Arc;
use async_trait::async_trait;
use catalog::table_source::DfTableSourceProvider;
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use datafusion::common::DFSchema;
use datafusion::execution::context::SessionState;
use datafusion::sql::planner::PlannerContext;
use datafusion_expr::Expr as DfExpr;
use datafusion_sql::planner::{ParserOptions, SqlToRel};
use promql::planner::PromPlanner;
use promql_parser::parser::EvalStmt;
use session::context::QueryContextRef;
use snafu::ResultExt;
use sql::ast::Expr as SqlExpr;
use sql::statements::statement::Statement;
use crate::error::{DataFusionSnafu, PlanSqlSnafu, QueryPlanSnafu, Result, SqlSnafu};
@@ -36,6 +41,8 @@ use crate::{DfContextProviderAdapter, QueryEngineContext};
#[async_trait]
pub trait LogicalPlanner: Send + Sync {
async fn plan(&self, stmt: QueryStatement, query_ctx: QueryContextRef) -> Result<LogicalPlan>;
fn as_any(&self) -> &dyn Any;
}
pub struct DfLogicalPlanner {
@@ -65,7 +72,7 @@ impl DfLogicalPlanner {
let context_provider = DfContextProviderAdapter::try_new(
self.engine_state.clone(),
self.session_state.clone(),
&df_stmt,
Some(&df_stmt),
query_ctx.clone(),
)
.await?;
@@ -95,6 +102,36 @@ impl DfLogicalPlanner {
Ok(LogicalPlan::DfPlan(plan))
}
/// Generate a relational expression from a SQL expression
#[tracing::instrument(skip_all)]
pub(crate) async fn sql_to_expr(
&self,
sql: SqlExpr,
schema: &DFSchema,
normalize_ident: bool,
query_ctx: QueryContextRef,
) -> Result<DfExpr> {
let context_provider = DfContextProviderAdapter::try_new(
self.engine_state.clone(),
self.session_state.clone(),
None,
query_ctx,
)
.await?;
let config_options = self.session_state.config().options();
let parser_options = ParserOptions {
enable_ident_normalization: normalize_ident,
parse_float_as_decimal: config_options.sql_parser.parse_float_as_decimal,
};
let sql_to_rel = SqlToRel::new_with_options(&context_provider, parser_options);
sql_to_rel
.sql_to_expr(sql.into(), schema, &mut PlannerContext::new())
.context(DataFusionSnafu)
}
#[tracing::instrument(skip_all)]
async fn plan_pql(&self, stmt: EvalStmt, query_ctx: QueryContextRef) -> Result<LogicalPlan> {
let table_provider = DfTableSourceProvider::new(
@@ -119,4 +156,8 @@ impl LogicalPlanner for DfLogicalPlanner {
QueryStatement::Promql(stmt) => self.plan_pql(stmt, query_ctx).await,
}
}
fn as_any(&self) -> &dyn Any {
self
}
}


@@ -17,22 +17,28 @@ mod show_create_table;
use std::collections::HashMap;
use std::sync::Arc;
use catalog::information_schema::{schemata, tables, SCHEMATA, TABLES};
use catalog::CatalogManagerRef;
use common_catalog::consts::{
SEMANTIC_TYPE_FIELD, SEMANTIC_TYPE_PRIMARY_KEY, SEMANTIC_TYPE_TIME_INDEX,
INFORMATION_SCHEMA_NAME, SEMANTIC_TYPE_FIELD, SEMANTIC_TYPE_PRIMARY_KEY,
SEMANTIC_TYPE_TIME_INDEX,
};
use common_catalog::format_full_table_name;
use common_datasource::file_format::{infer_schemas, FileFormat, Format};
use common_datasource::lister::{Lister, Source};
use common_datasource::object_store::build_backend;
use common_datasource::util::find_dir_and_filename;
use common_query::prelude::GREPTIME_TIMESTAMP;
use common_query::Output;
use common_recordbatch::{RecordBatch, RecordBatches};
use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::RecordBatches;
use common_time::timezone::get_timezone;
use common_time::Timestamp;
use datafusion::prelude::SessionContext;
use datafusion_expr::{col, lit, Expr};
use datatypes::prelude::*;
use datatypes::schema::{ColumnDefaultConstraint, ColumnSchema, RawSchema, Schema};
use datatypes::vectors::{Helper, StringVector};
use datatypes::vectors::StringVector;
use object_store::ObjectStore;
use once_cell::sync::Lazy;
use regex::Regex;
@@ -44,11 +50,14 @@ use sql::statements::show::{ShowDatabases, ShowKind, ShowTables, ShowVariables};
use table::requests::{FILE_TABLE_LOCATION_KEY, FILE_TABLE_PATTERN_KEY};
use table::TableRef;
use crate::datafusion::execute_show_with_filter;
use crate::dataframe::DataFrame;
use crate::error::{self, Result, UnsupportedVariableSnafu};
use crate::planner::DfLogicalPlanner;
use crate::QueryEngineRef;
const SCHEMAS_COLUMN: &str = "Schemas";
const SCHEMAS_COLUMN: &str = "Database";
const TABLES_COLUMN: &str = "Tables";
const TABLE_TYPE_COLUMN: &str = "Table_type";
const COLUMN_NAME_COLUMN: &str = "Column";
const COLUMN_TYPE_COLUMN: &str = "Type";
const COLUMN_KEY_COLUMN: &str = "Key";
@@ -100,49 +109,144 @@ static SHOW_CREATE_TABLE_OUTPUT_SCHEMA: Lazy<Arc<Schema>> = Lazy::new(|| {
pub async fn show_databases(
stmt: ShowDatabases,
catalog_manager: CatalogManagerRef,
query_engine: &QueryEngineRef,
catalog_manager: &CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<Output> {
let mut databases = catalog_manager
.schema_names(query_ctx.current_catalog())
let projects = vec![(schemata::SCHEMA_NAME, SCHEMAS_COLUMN)];
let filters = vec![col(schemata::CATALOG_NAME).eq(lit(query_ctx.current_catalog()))];
let like_field = Some(schemata::SCHEMA_NAME);
let sort = vec![col(schemata::SCHEMA_NAME).sort(true, true)];
query_from_information_schema_table(
query_engine,
catalog_manager,
query_ctx,
SCHEMATA,
projects,
filters,
like_field,
sort,
stmt.kind,
)
.await
}
/// Execute a `SHOW` statement as a query against the corresponding table in `information_schema`.
/// - `table_name`: the table name in `information_schema`,
/// - `projects`: query projection, a list of `(column, renamed_column)`,
/// - `filters`: filter expressions for query,
/// - `like_field`: the field to filter by the predicate `ShowKind::Like`,
/// - `sort`: sort the results by the specified sorting expressions,
/// - `kind`: the show kind
#[allow(clippy::too_many_arguments)]
async fn query_from_information_schema_table(
query_engine: &QueryEngineRef,
catalog_manager: &CatalogManagerRef,
query_ctx: QueryContextRef,
table_name: &str,
projects: Vec<(&str, &str)>,
filters: Vec<Expr>,
like_field: Option<&str>,
sort: Vec<Expr>,
kind: ShowKind,
) -> Result<Output> {
let table = catalog_manager
.table(
query_ctx.current_catalog(),
INFORMATION_SCHEMA_NAME,
table_name,
)
.await
.context(error::CatalogSnafu)?;
.context(error::CatalogSnafu)?
.with_context(|| error::TableNotFoundSnafu {
table: format_full_table_name(
query_ctx.current_catalog(),
INFORMATION_SCHEMA_NAME,
table_name,
),
})?;
// TODO(dennis): Specify the order of the results in catalog manager API
databases.sort();
let DataFrame::DataFusion(dataframe) = query_engine.read_table(table)?;
let schema = Arc::new(Schema::new(vec![ColumnSchema::new(
SCHEMAS_COLUMN,
ConcreteDataType::string_datatype(),
false,
)]));
match stmt.kind {
ShowKind::All => {
let databases = Arc::new(StringVector::from(databases)) as _;
let records = RecordBatches::try_from_columns(schema, vec![databases])
.context(error::CreateRecordBatchSnafu)?;
Ok(Output::RecordBatches(records))
// Apply filters
let dataframe = filters.into_iter().try_fold(dataframe, |df, expr| {
df.filter(expr).context(error::PlanSqlSnafu)
})?;
// Apply the `like` predicate if present
let dataframe = if let (ShowKind::Like(ident), Some(field)) = (&kind, like_field) {
dataframe
.filter(col(field).like(lit(ident.value.clone())))
.context(error::PlanSqlSnafu)?
} else {
dataframe
};
// Apply sorting
let dataframe = dataframe
.sort(sort)
.context(error::PlanSqlSnafu)?
.select_columns(&projects.iter().map(|(c, _)| *c).collect::<Vec<_>>())
.context(error::PlanSqlSnafu)?;
// Apply projection
let dataframe = projects
.into_iter()
.try_fold(dataframe, |df, (column, renamed_column)| {
df.with_column_renamed(column, renamed_column)
.context(error::PlanSqlSnafu)
})?;
let dataframe = match kind {
ShowKind::All | ShowKind::Like(_) => {
// Like kind is processed above
dataframe
}
ShowKind::Where(filter) => {
let columns = vec![Arc::new(StringVector::from(databases)) as _];
let record_batch =
RecordBatch::new(schema, columns).context(error::CreateRecordBatchSnafu)?;
let result = execute_show_with_filter(record_batch, Some(filter)).await?;
Ok(result)
// Turn the result into a view for the `where` clause,
// which is evaluated against the column names displayed by the SHOW statement.
let view = dataframe.into_view();
let dataframe = SessionContext::new_with_state(
query_engine
.engine_context(query_ctx.clone())
.state()
.clone(),
)
.read_table(view)
.context(error::DataFusionSnafu)?;
let planner = query_engine.planner();
let planner = planner
.as_any()
.downcast_ref::<DfLogicalPlanner>()
.expect("Must be the datafusion planner");
let filter = planner
.sql_to_expr(filter, dataframe.schema(), false, query_ctx)
.await?;
// Apply the `where` clause filters
dataframe.filter(filter).context(error::PlanSqlSnafu)?
}
ShowKind::Like(ident) => {
let databases = Helper::like_utf8(databases, &ident.value)
.context(error::VectorComputationSnafu)?;
let records = RecordBatches::try_from_columns(schema, vec![databases])
.context(error::CreateRecordBatchSnafu)?;
Ok(Output::RecordBatches(records))
}
}
};
let stream = dataframe
.execute_stream()
.await
.context(error::DataFusionSnafu)?;
Ok(Output::Stream(
Box::pin(RecordBatchStreamAdapter::try_new(stream).context(error::CreateRecordBatchSnafu)?),
None,
))
}
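For reference, the sketch below reproduces the same filter → LIKE → sort → project → rename → stream pipeline with the public DataFusion DataFrame API against a tiny in-memory stand-in for `information_schema.schemata`. It is an illustration only, assuming the DataFusion version in use at the time of this change plus the tokio and futures crates; it is not the GreptimeDB code itself.

use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, StringArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::error::Result;
use datafusion::prelude::{col, lit, SessionContext};
use futures::StreamExt;

#[tokio::main]
async fn main() -> Result<()> {
    // Stand-in for `information_schema.schemata`, reduced to a single column.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "schema_name",
        DataType::Utf8,
        false,
    )]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(StringArray::from(vec!["public", "test", "tmp"])) as ArrayRef],
    )?;

    let ctx = SessionContext::new();
    ctx.register_batch("schemata", batch)?;

    // Same shape as the SHOW rewrite: filter -> LIKE -> sort -> project -> rename.
    let df = ctx
        .table("schemata")
        .await?
        .filter(col("schema_name").like(lit("t%")))?
        .sort(vec![col("schema_name").sort(true, true)])?
        .select_columns(&["schema_name"])?
        .with_column_renamed("schema_name", "Database")?;

    // Stream the result batches instead of collecting them, as `show_databases` now does.
    let mut stream = df.execute_stream().await?;
    while let Some(batch) = stream.next().await {
        println!("{:?}", batch?);
    }
    Ok(())
}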
pub async fn show_tables(
stmt: ShowTables,
catalog_manager: CatalogManagerRef,
query_engine: &QueryEngineRef,
catalog_manager: &CatalogManagerRef,
query_ctx: QueryContextRef,
) -> Result<Output> {
let schema_name = if let Some(database) = stmt.database {
@@ -150,85 +254,36 @@ pub async fn show_tables(
} else {
query_ctx.current_schema().to_owned()
};
// TODO(sunng87): move this function into query_ctx
let mut tables = catalog_manager
.table_names(query_ctx.current_catalog(), &schema_name)
.await
.context(error::CatalogSnafu)?;
// TODO(dennis): Specify the order of the results in schema provider API
tables.sort();
let table_types: Option<Arc<dyn Vector>> = {
if stmt.full {
Some(
get_table_types(
&tables,
catalog_manager.clone(),
query_ctx.clone(),
&schema_name,
)
.await?,
)
} else {
None
}
// (dennis): MySQL renames `table_name` to `Tables_in_{schema}`, but we use `Tables` instead.
// We keep it that way for now because our dashboard may depend on it.
let projects = if stmt.full {
vec![
(tables::TABLE_NAME, TABLES_COLUMN),
(tables::TABLE_TYPE, TABLE_TYPE_COLUMN),
]
} else {
vec![(tables::TABLE_NAME, TABLES_COLUMN)]
};
let filters = vec![
col(tables::TABLE_SCHEMA).eq(lit(schema_name.clone())),
col(tables::TABLE_CATALOG).eq(lit(query_ctx.current_catalog())),
];
let like_field = Some(tables::TABLE_NAME);
let sort = vec![col(tables::TABLE_NAME).sort(true, true)];
let mut column_schema = vec![ColumnSchema::new(
TABLES_COLUMN,
ConcreteDataType::string_datatype(),
false,
)];
if table_types.is_some() {
column_schema.push(ColumnSchema::new(
"Table_type",
ConcreteDataType::string_datatype(),
false,
));
}
let schema = Arc::new(Schema::new(column_schema));
match stmt.kind {
ShowKind::All => {
let tables = Arc::new(StringVector::from(tables)) as _;
let mut columns = vec![tables];
if let Some(table_types) = table_types {
columns.push(table_types)
}
let records = RecordBatches::try_from_columns(schema, columns)
.context(error::CreateRecordBatchSnafu)?;
Ok(Output::RecordBatches(records))
}
ShowKind::Where(filter) => {
let mut columns = vec![Arc::new(StringVector::from(tables)) as _];
if let Some(table_types) = table_types {
columns.push(table_types)
}
let record_batch =
RecordBatch::new(schema, columns).context(error::CreateRecordBatchSnafu)?;
let result = execute_show_with_filter(record_batch, Some(filter)).await?;
Ok(result)
}
ShowKind::Like(ident) => {
let (tables, filter) = Helper::like_utf8_filter(tables, &ident.value)
.context(error::VectorComputationSnafu)?;
let mut columns = vec![tables];
if let Some(table_types) = table_types {
let table_types = table_types
.filter(&filter)
.context(error::VectorComputationSnafu)?;
columns.push(table_types)
}
let records = RecordBatches::try_from_columns(schema, columns)
.context(error::CreateRecordBatchSnafu)?;
Ok(Output::RecordBatches(records))
}
}
query_from_information_schema_table(
query_engine,
catalog_manager,
query_ctx,
TABLES,
projects,
filters,
like_field,
sort,
stmt.kind,
)
.await
}
pub fn show_variable(stmt: ShowVariables, query_ctx: QueryContextRef) -> Result<Output> {
@@ -513,25 +568,6 @@ fn parse_file_table_format(options: &HashMap<String, String>) -> Result<Box<dyn
)
}
async fn get_table_types(
tables: &[String],
catalog_manager: CatalogManagerRef,
query_ctx: QueryContextRef,
schema_name: &str,
) -> Result<Arc<dyn Vector>> {
let mut table_types = Vec::with_capacity(tables.len());
for table_name in tables {
if let Some(table) = catalog_manager
.table(query_ctx.current_catalog(), schema_name, table_name)
.await
.context(error::CatalogSnafu)?
{
table_types.push(table.table_type().to_string());
}
}
Ok(Arc::new(StringVector::from(table_types)) as _)
}
#[cfg(test)]
mod test {
use std::sync::Arc;

View File

@@ -60,6 +60,7 @@ itertools.workspace = true
lazy_static.workspace = true
mime_guess = "2.0"
notify = "6.1"
object-pool = "0.5"
once_cell.workspace = true
openmetrics-parser = "0.4"
opensrv-mysql = "0.7.0"
@@ -114,6 +115,7 @@ catalog = { workspace = true, features = ["testing"] }
client.workspace = true
common-base.workspace = true
common-test-util.workspace = true
criterion = "0.4"
mysql_async = { version = "0.33", default-features = false, features = [
"default-rustls",
] }
@@ -129,3 +131,7 @@ tokio-test = "0.4"
[build-dependencies]
common-version.workspace = true
[[bench]]
name = "bench_prom"
harness = false
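Assuming the package is named `servers`, this target can presumably be invoked with `cargo bench -p servers --bench bench_prom`; `harness = false` lets Criterion supply its own main function.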

View File

@@ -0,0 +1,21 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::criterion_main;
mod prom_decode;
criterion_main! {
prom_decode::benches
}

View File

@@ -0,0 +1,53 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Duration;
use api::prom_store::remote::WriteRequest;
use bytes::Bytes;
use criterion::{criterion_group, criterion_main, Criterion};
use prost::Message;
use servers::prom_store::to_grpc_row_insert_requests;
use servers::proto::PromWriteRequest;
fn bench_decode_prom_request(c: &mut Criterion) {
let mut d = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
d.push("benches");
d.push("write_request.pb.data");
let data = Bytes::from(std::fs::read(d).unwrap());
let mut request = WriteRequest::default();
let mut prom_request = PromWriteRequest::default();
c.benchmark_group("decode")
.measurement_time(Duration::from_secs(3))
.bench_function("write_request", |b| {
b.iter(|| {
request.clear();
let data = data.clone();
request.merge(data).unwrap();
to_grpc_row_insert_requests(&request).unwrap();
});
})
.bench_function("prom_write_request", |b| {
b.iter(|| {
let data = data.clone();
prom_request.merge(data).unwrap();
prom_request.as_row_insert_requests();
});
});
}
criterion_group!(benches, bench_decode_prom_request);
criterion_main!(benches);
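Taken together, the two benchmark functions compare the stock prost path (decode `WriteRequest`, then convert with `to_grpc_row_insert_requests`) against the specialized `PromWriteRequest` path that decodes straight into row inserts; reusing `request` and `prom_request` across iterations also exercises the buffer reuse the new decoder is built around.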

Binary file not shown.

View File

@@ -43,6 +43,7 @@ use tonic_reflection::server::{ServerReflection, ServerReflectionServer};
use crate::error::{
AlreadyStartedSnafu, InternalSnafu, Result, StartGrpcSnafu, TcpBindSnafu, TcpIncomingSnafu,
};
use crate::metrics::MetricsMiddlewareLayer;
use crate::server::Server;
type TonicResult<T> = std::result::Result<T, Status>;
@@ -168,7 +169,12 @@ impl Server for GrpcServer {
(incoming, addr)
};
let metrics_layer = tower::ServiceBuilder::new()
.layer(MetricsMiddlewareLayer)
.into_inner();
let builder = tonic::transport::Server::builder()
.layer(metrics_layer)
.add_routes(routes)
.add_service(self.create_healthcheck_service())
.add_service(self.create_reflection_service());
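The `MetricsMiddlewareLayer` attached here follows the standard tower `Layer`/`Service` pattern. Below is a minimal, self-contained sketch of that pattern with illustrative names (`TimingLayer`/`Timing`), assuming tower with its `util` feature plus tokio; the real layer records Prometheus counters and histograms rather than printing.

use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Instant;

use tower::{service_fn, Layer, Service, ServiceBuilder, ServiceExt};

/// Illustrative stand-in for `MetricsMiddlewareLayer`.
#[derive(Clone)]
struct TimingLayer;

impl<S> Layer<S> for TimingLayer {
    type Service = Timing<S>;
    fn layer(&self, inner: S) -> Self::Service {
        Timing { inner }
    }
}

#[derive(Clone)]
struct Timing<S> {
    inner: S,
}

impl<S, Request> Service<Request> for Timing<S>
where
    S: Service<Request>,
    S::Future: Send + 'static,
{
    type Response = S::Response;
    type Error = S::Error;
    type Future = Pin<Box<dyn Future<Output = Result<S::Response, S::Error>> + Send>>;

    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
        self.inner.poll_ready(cx)
    }

    fn call(&mut self, req: Request) -> Self::Future {
        let start = Instant::now();
        let fut = self.inner.call(req);
        Box::pin(async move {
            let res = fut.await;
            // The real middleware observes a Prometheus histogram here.
            println!("request handled in {:?}", start.elapsed());
            res
        })
    }
}

#[tokio::main]
async fn main() {
    let svc = ServiceBuilder::new().layer(TimingLayer).service(service_fn(
        |name: &'static str| async move { Ok::<_, std::convert::Infallible>(name.len()) },
    ));
    let len = svc.oneshot("ping").await.unwrap();
    println!("response: {len}");
}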

View File

@@ -26,7 +26,8 @@ use arrow_flight::{
use async_trait::async_trait;
use common_grpc::flight::{FlightEncoder, FlightMessage};
use common_query::Output;
use common_telemetry::tracing_context::TracingContext;
use common_telemetry::tracing::info_span;
use common_telemetry::tracing_context::{FutureExt, TracingContext};
use futures::Stream;
use prost::Message;
use snafu::ResultExt;
@@ -34,7 +35,7 @@ use tonic::{Request, Response, Status, Streaming};
use crate::error;
pub use crate::grpc::flight::stream::FlightRecordBatchStream;
use crate::grpc::greptime_handler::GreptimeRequestHandler;
use crate::grpc::greptime_handler::{get_request_type, GreptimeRequestHandler};
use crate::grpc::TonicResult;
pub type TonicStream<T> = Pin<Box<dyn Stream<Item = TonicResult<T>> + Send + Sync + 'static>>;
@@ -152,11 +153,20 @@ impl FlightCraft for GreptimeRequestHandler {
let request =
GreptimeRequest::decode(ticket.as_ref()).context(error::InvalidFlightTicketSnafu)?;
let output = self.handle_request(request).await?;
let stream: Pin<Box<dyn Stream<Item = Result<FlightData, Status>> + Send + Sync>> =
to_flight_data_stream(output, TracingContext::new());
Ok(Response::new(stream))
// The gRPC protocol passes queries via Flight. The request handling needs to be wrapped in a span so that the stream processing is recorded under it.
let span = info_span!(
"GreptimeRequestHandler::do_get",
protocol = "grpc",
request_type = get_request_type(&request)
);
async {
let output = self.handle_request(request).await?;
let stream: Pin<Box<dyn Stream<Item = Result<FlightData, Status>> + Send + Sync>> =
to_flight_data_stream(output, TracingContext::from_current_span());
Ok(Response::new(stream))
}
.trace(span)
.await
}
}
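The `.trace(span)` combinator comes from common_telemetry and is assumed here to play the same role as the `Instrument` adapter from the plain tracing crate: everything awaited inside the wrapped block is recorded under the span. A minimal sketch of that underlying pattern, assuming only the tracing, tracing-subscriber, and tokio crates:

use tracing::{info_span, Instrument};

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();

    let span = info_span!("do_get", protocol = "grpc", request_type = "query");
    async {
        // Work awaited inside this block is recorded under `span`,
        // so spans created by downstream handlers become its children.
        tracing::info!("handling flight ticket");
    }
    .instrument(span)
    .await;
}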

View File

@@ -27,7 +27,8 @@ use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_query::Output;
use common_runtime::Runtime;
use common_telemetry::logging;
use common_telemetry::tracing_context::{FutureExt, TracingContext};
use common_telemetry::{logging, tracing};
use common_time::timezone::parse_timezone;
use session::context::{QueryContextBuilder, QueryContextRef};
use snafu::{OptionExt, ResultExt};
@@ -57,6 +58,7 @@ impl GreptimeRequestHandler {
}
}
#[tracing::instrument(skip_all, fields(protocol = "grpc", request_type = get_request_type(&request)))]
pub(crate) async fn handle_request(&self, request: GreptimeRequest) -> Result<Output> {
let query = request.request.context(InvalidQuerySnafu {
reason: "Expecting non-empty GreptimeRequest.",
@@ -79,16 +81,23 @@ impl GreptimeRequestHandler {
// - Obtaining a `JoinHandle` to get the panic message (if there's any).
// From its docs, `JoinHandle` is cancel safe. The task keeps running even it's handle been dropped.
// 2. avoid the handler blocks the gRPC runtime incidentally.
let tracing_context = TracingContext::from_current_span();
let handle = self.runtime.spawn(async move {
handler.do_query(query, query_ctx).await.map_err(|e| {
if e.status_code().should_log_error() {
logging::error!(e; "Failed to handle request");
} else {
// Currently, we still print a debug log.
logging::debug!("Failed to handle request, err: {:?}", e);
}
e
})
handler
.do_query(query, query_ctx)
.trace(tracing_context.attach(tracing::info_span!(
"GreptimeRequestHandler::handle_request_runtime"
)))
.await
.map_err(|e| {
if e.status_code().should_log_error() {
logging::error!(e; "Failed to handle request");
} else {
// Currently, we still print a debug log.
logging::debug!("Failed to handle request, err: {:?}", e);
}
e
})
});
handle.await.context(JoinTaskSnafu).map_err(|e| {
@@ -98,6 +107,14 @@ impl GreptimeRequestHandler {
}
}
pub fn get_request_type(request: &GreptimeRequest) -> &'static str {
request
.request
.as_ref()
.map(request_type)
.unwrap_or_default()
}
pub(crate) async fn auth(
user_provider: Option<UserProviderRef>,
header: Option<&RequestHeader>,

View File

@@ -16,7 +16,7 @@ use std::collections::HashMap;
use std::fmt::Display;
use std::net::SocketAddr;
use std::sync::Mutex as StdMutex;
use std::time::{Duration, Instant};
use std::time::Duration;
use aide::axum::{routing as apirouting, ApiRouter, IntoApiResponse};
use aide::openapi::{Info, OpenApi, Server as OpenAPIServer};
@@ -24,11 +24,9 @@ use aide::OperationOutput;
use async_trait::async_trait;
use auth::UserProviderRef;
use axum::error_handling::HandleErrorLayer;
use axum::extract::{DefaultBodyLimit, MatchedPath};
use axum::http::Request;
use axum::middleware::{self, Next};
use axum::extract::DefaultBodyLimit;
use axum::response::{Html, IntoResponse, Json, Response};
use axum::{routing, BoxError, Extension, Router};
use axum::{middleware, routing, BoxError, Extension, Router};
use common_base::readable_size::ReadableSize;
use common_base::Plugins;
use common_error::status_code::StatusCode;
@@ -61,9 +59,7 @@ use crate::http::influxdb_result_v1::InfluxdbV1Response;
use crate::http::prometheus::{
format_query, instant_query, label_values_query, labels_query, range_query, series_query,
};
use crate::metrics::{
HTTP_TRACK_METRICS, METRIC_HTTP_REQUESTS_ELAPSED, METRIC_HTTP_REQUESTS_TOTAL,
};
use crate::metrics::http_metrics_layer;
use crate::metrics_handler::MetricsHandler;
use crate::prometheus_handler::PrometheusHandlerRef;
use crate::query_handler::sql::ServerSqlQueryHandlerRef;
@@ -599,7 +595,7 @@ impl HttpServer {
}
// Add a layer to collect HTTP metrics for axum.
router = router.route_layer(middleware::from_fn(track_metrics));
router = router.route_layer(middleware::from_fn(http_metrics_layer));
router
}
@@ -727,35 +723,6 @@ impl HttpServer {
}
}
/// A middleware to record metrics for HTTP.
// Based on https://github.com/tokio-rs/axum/blob/axum-v0.6.16/examples/prometheus-metrics/src/main.rs
pub(crate) async fn track_metrics<B>(req: Request<B>, next: Next<B>) -> impl IntoResponse {
let _timer = HTTP_TRACK_METRICS
.with_label_values(&["value"])
.start_timer();
let start = Instant::now();
let path = if let Some(matched_path) = req.extensions().get::<MatchedPath>() {
matched_path.as_str().to_owned()
} else {
req.uri().path().to_owned()
};
let method = req.method().clone();
let response = next.run(req).await;
let latency = start.elapsed().as_secs_f64();
let status = response.status().as_u16().to_string();
let method_str = method.to_string();
let labels = [method_str.as_str(), path.as_str(), status.as_str()];
METRIC_HTTP_REQUESTS_TOTAL.with_label_values(&labels).inc();
METRIC_HTTP_REQUESTS_ELAPSED
.with_label_values(&labels)
.observe(latency);
response
}
pub const HTTP_SERVER: &str = "HTTP_SERVER";
#[async_trait]

View File

@@ -27,6 +27,7 @@ use common_plugins::GREPTIME_EXEC_PREFIX;
use common_query::physical_plan::PhysicalPlan;
use common_query::Output;
use common_recordbatch::util;
use common_telemetry::tracing;
use datafusion::physical_plan::metrics::MetricValue;
use query::parser::PromQuery;
use schemars::JsonSchema;
@@ -66,17 +67,22 @@ pub struct SqlQuery {
/// Handler to execute sql
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "http", request_type = "sql"))]
pub async fn sql(
State(state): State<ApiState>,
Query(query_params): Query<SqlQuery>,
Extension(query_ctx): Extension<QueryContextRef>,
Form(form_params): Form<SqlQuery>,
) -> HttpResponse {
let sql_handler = &state.sql_handler;
let start = Instant::now();
let sql = query_params.sql.or(form_params.sql);
let sql_handler = &state.sql_handler;
let db = query_ctx.get_db_string();
let _timer = crate::metrics::METRIC_HTTP_SQL_ELAPSED
.with_label_values(&[db.as_str()])
.start_timer();
let sql = query_params.sql.or(form_params.sql);
let format = query_params
.format
.or(form_params.format)
@@ -89,10 +95,6 @@ pub async fn sql(
.map(|s| s.to_lowercase())
.map(|s| Epoch::parse(s.as_str()).unwrap_or(Epoch::Millisecond));
let _timer = crate::metrics::METRIC_HTTP_SQL_ELAPSED
.with_label_values(&[db.as_str()])
.start_timer();
let result = if let Some(sql) = &sql {
if let Some((status, msg)) = validate_schema(sql_handler.clone(), query_ctx.clone()).await {
Err((status, msg))
@@ -250,6 +252,7 @@ impl From<PromqlQuery> for PromQuery {
/// Handler to execute promql
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "http", request_type = "promql"))]
pub async fn promql(
State(state): State<ApiState>,
Query(params): Query<PromqlQuery>,
@@ -258,6 +261,7 @@ pub async fn promql(
let sql_handler = &state.sql_handler;
let exec_start = Instant::now();
let db = query_ctx.get_db_string();
let _timer = crate::metrics::METRIC_HTTP_PROMQL_ELAPSED
.with_label_values(&[db.as_str()])
.start_timer();

View File

@@ -20,6 +20,7 @@ use axum::response::IntoResponse;
use axum::Extension;
use common_catalog::consts::DEFAULT_SCHEMA_NAME;
use common_grpc::writer::Precision;
use common_telemetry::tracing;
use session::context::QueryContextRef;
use crate::error::{Result, TimePrecisionSnafu};
@@ -39,6 +40,7 @@ pub async fn influxdb_health() -> Result<impl IntoResponse> {
}
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "influxdb", request_type = "write_v1"))]
pub async fn influxdb_write_v1(
State(handler): State<InfluxdbLineProtocolHandlerRef>,
Query(mut params): Query<HashMap<String, String>>,
@@ -58,6 +60,7 @@ pub async fn influxdb_write_v1(
}
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "influxdb", request_type = "write_v2"))]
pub async fn influxdb_write_v2(
State(handler): State<InfluxdbLineProtocolHandlerRef>,
Query(mut params): Query<HashMap<String, String>>,

View File

@@ -16,6 +16,7 @@ use axum::extract::{RawBody, State};
use axum::http::header;
use axum::response::IntoResponse;
use axum::Extension;
use common_telemetry::tracing;
use hyper::Body;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
@@ -31,6 +32,7 @@ use crate::error::{self, Result};
use crate::query_handler::OpenTelemetryProtocolHandlerRef;
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "metrics"))]
pub async fn metrics(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(query_ctx): Extension<QueryContextRef>,
@@ -69,6 +71,7 @@ impl IntoResponse for OtlpMetricsResponse {
}
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "traces"))]
pub async fn traces(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(query_ctx): Extension<QueryContextRef>,

View File

@@ -15,13 +15,18 @@
use std::sync::Arc;
use api::prom_store::remote::{ReadRequest, WriteRequest};
use api::v1::RowInsertRequests;
use axum::extract::{Query, RawBody, State};
use axum::http::{header, StatusCode};
use axum::response::IntoResponse;
use axum::Extension;
use bytes::Bytes;
use common_catalog::consts::DEFAULT_SCHEMA_NAME;
use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
use common_telemetry::tracing;
use hyper::Body;
use lazy_static::lazy_static;
use object_pool::Pool;
use prost::Message;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
@@ -30,9 +35,14 @@ use snafu::prelude::*;
use crate::error::{self, Result, UnexpectedPhysicalTableSnafu};
use crate::prom_store::snappy_decompress;
use crate::proto::PromWriteRequest;
use crate::query_handler::{PromStoreProtocolHandlerRef, PromStoreResponse};
pub const PHYSICAL_TABLE_PARAM: &str = "physical_table";
lazy_static! {
static ref PROM_WRITE_REQUEST_POOL: Pool<PromWriteRequest> =
Pool::new(256, PromWriteRequest::default);
}
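`remote_write` below pulls a reusable `PromWriteRequest` from this pool instead of allocating a fresh one per request. A minimal standalone sketch of the same reuse pattern, using the `Pool::new` / `pull(fallback)` calls seen in this diff but with an illustrative `Vec<u8>` buffer as the pooled type:

use object_pool::Pool;

fn main() {
    // 16 pre-created buffers; `pull` falls back to the constructor when the pool is empty.
    let pool: Pool<Vec<u8>> = Pool::new(16, Vec::new);

    for request in 0..3 {
        let mut buf = pool.pull(Vec::new);
        buf.clear();                       // reset state left by the previous user
        buf.extend_from_slice(b"payload"); // decode into the reused allocation
        println!("request {request} used a buffer of capacity {}", buf.capacity());
    } // dropping the guard returns the buffer to the pool
}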
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct DatabaseQuery {
@@ -75,6 +85,10 @@ pub async fn route_write_without_metric_engine(
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "remote_write")
)]
pub async fn remote_write(
State(handler): State<PromStoreProtocolHandlerRef>,
Query(params): Query<DatabaseQuery>,
@@ -86,14 +100,15 @@ pub async fn remote_write(
.with_label_values(&[db.as_str()])
.start_timer();
let request = decode_remote_write_request(body).await?;
let request = decode_remote_write_request_to_row_inserts(body).await?;
if let Some(physical_table) = params.physical_table {
let mut new_query_ctx = query_ctx.as_ref().clone();
new_query_ctx.set_extension(PHYSICAL_TABLE_PARAM, physical_table);
query_ctx = Arc::new(new_query_ctx);
}
handler.write(request, query_ctx, true).await?;
handler.write_fast(request, query_ctx, true).await?;
Ok((StatusCode::NO_CONTENT, ()))
}
@@ -111,6 +126,10 @@ impl IntoResponse for PromStoreResponse {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "remote_read")
)]
pub async fn remote_read(
State(handler): State<PromStoreProtocolHandlerRef>,
Query(params): Query<DatabaseQuery>,
@@ -127,6 +146,23 @@ pub async fn remote_read(
handler.read(request, query_ctx).await
}
async fn decode_remote_write_request_to_row_inserts(body: Body) -> Result<RowInsertRequests> {
let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_DECODE_ELAPSED.start_timer();
let body = hyper::body::to_bytes(body)
.await
.context(error::HyperSnafu)?;
let buf = Bytes::from(snappy_decompress(&body[..])?);
let mut request = PROM_WRITE_REQUEST_POOL.pull(PromWriteRequest::default);
request
.merge(buf)
.context(error::DecodePromRemoteRequestSnafu)?;
let (requests, samples) = request.as_row_insert_requests();
crate::metrics::METRIC_HTTP_PROM_STORE_DECODE_NUM_SERIES.observe(samples as f64);
Ok(requests)
}
async fn decode_remote_write_request(body: Body) -> Result<WriteRequest> {
let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_DECODE_ELAPSED.start_timer();
let body = hyper::body::to_bytes(body)

View File

@@ -25,6 +25,7 @@ use common_error::status_code::StatusCode;
use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE};
use common_query::Output;
use common_recordbatch::RecordBatches;
use common_telemetry::tracing;
use common_time::util::{current_time_rfc3339, yesterday_rfc3339};
use datatypes::prelude::ConcreteDataType;
use datatypes::scalars::ScalarVector;
@@ -87,14 +88,16 @@ pub struct FormatQuery {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "format_query")
)]
pub async fn format_query(
State(_handler): State<PrometheusHandlerRef>,
Query(params): Query<InstantQuery>,
Extension(_query_ctx): Extension<QueryContextRef>,
Form(form_params): Form<InstantQuery>,
) -> PrometheusJsonResponse {
let _timer = crate::metrics::METRIC_HTTP_PROMQL_FORMAT_QUERY_ELAPSED.start_timer();
let query = params.query.or(form_params.query).unwrap_or_default();
match promql_parser::parser::parse(&query) {
Ok(expr) => {
@@ -117,13 +120,16 @@ pub struct InstantQuery {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "instant_query")
)]
pub async fn instant_query(
State(handler): State<PrometheusHandlerRef>,
Query(params): Query<InstantQuery>,
Extension(query_ctx): Extension<QueryContextRef>,
Form(form_params): Form<InstantQuery>,
) -> PrometheusJsonResponse {
let _timer = crate::metrics::METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED.start_timer();
// Extract time from query string, or use current server time if not specified.
let time = params
.time
@@ -157,13 +163,16 @@ pub struct RangeQuery {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "range_query")
)]
pub async fn range_query(
State(handler): State<PrometheusHandlerRef>,
Query(params): Query<RangeQuery>,
Extension(query_ctx): Extension<QueryContextRef>,
Form(form_params): Form<RangeQuery>,
) -> PrometheusJsonResponse {
let _timer = crate::metrics::METRIC_HTTP_PROMQL_RANGE_QUERY_ELAPSED.start_timer();
let prom_query = PromQuery {
query: params.query.or(form_params.query).unwrap_or_default(),
start: params.start.or(form_params.start).unwrap_or_default(),
@@ -226,14 +235,16 @@ impl<'de> Deserialize<'de> for Matches {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "labels_query")
)]
pub async fn labels_query(
State(handler): State<PrometheusHandlerRef>,
Query(params): Query<LabelsQuery>,
Extension(query_ctx): Extension<QueryContextRef>,
Form(form_params): Form<LabelsQuery>,
) -> PrometheusJsonResponse {
let _timer = crate::metrics::METRIC_HTTP_PROMQL_LABEL_QUERY_ELAPSED.start_timer();
let db = &params.db.unwrap_or(DEFAULT_SCHEMA_NAME.to_string());
let (catalog, schema) = parse_catalog_and_schema_from_db_string(db);
@@ -492,14 +503,16 @@ pub struct LabelValueQuery {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "label_values_query")
)]
pub async fn label_values_query(
State(handler): State<PrometheusHandlerRef>,
Path(label_name): Path<String>,
Extension(query_ctx): Extension<QueryContextRef>,
Query(params): Query<LabelValueQuery>,
) -> PrometheusJsonResponse {
let _timer = crate::metrics::METRIC_HTTP_PROMQL_LABEL_VALUE_QUERY_ELAPSED.start_timer();
let db = &params.db.unwrap_or(DEFAULT_SCHEMA_NAME.to_string());
let (catalog, schema) = parse_catalog_and_schema_from_db_string(db);
@@ -618,13 +631,16 @@ pub struct SeriesQuery {
}
#[axum_macros::debug_handler]
#[tracing::instrument(
skip_all,
fields(protocol = "prometheus", request_type = "series_query")
)]
pub async fn series_query(
State(handler): State<PrometheusHandlerRef>,
Query(params): Query<SeriesQuery>,
Extension(query_ctx): Extension<QueryContextRef>,
Form(form_params): Form<SeriesQuery>,
) -> PrometheusJsonResponse {
let _timer = crate::metrics::METRIC_HTTP_PROMQL_SERIES_QUERY_ELAPSED.start_timer();
let mut queries: Vec<String> = params.matches.0;
if queries.is_empty() {
queries = form_params.matches.0;

View File

@@ -15,6 +15,7 @@
use std::borrow::Cow;
use std::sync::Arc;
use api::prom_store::remote::{ReadRequest, WriteRequest};
use api::v1::greptime_request::Request;
use common_error::ext::ErrorExt;
use common_query::Output;
@@ -246,3 +247,121 @@ where
}
}
}
/// ScriptInterceptor can track the life cycle of a script request and customize or
/// abort its execution at a given point.
pub trait ScriptInterceptor {
type Error: ErrorExt;
/// Called before script request is actually executed.
fn pre_execute(&self, _name: &str, _query_ctx: QueryContextRef) -> Result<(), Self::Error> {
Ok(())
}
}
pub type ScriptInterceptorRef<E> = Arc<dyn ScriptInterceptor<Error = E> + Send + Sync + 'static>;
impl<E: ErrorExt> ScriptInterceptor for Option<ScriptInterceptorRef<E>> {
type Error = E;
fn pre_execute(&self, name: &str, query_ctx: QueryContextRef) -> Result<(), Self::Error> {
if let Some(this) = self {
this.pre_execute(name, query_ctx)
} else {
Ok(())
}
}
}
/// LineProtocolInterceptor can track the life cycle of a line protocol request
/// and customize or abort its execution at a given point.
pub trait LineProtocolInterceptor {
type Error: ErrorExt;
fn pre_execute(&self, _line: &str, _query_ctx: QueryContextRef) -> Result<(), Self::Error> {
Ok(())
}
}
pub type LineProtocolInterceptorRef<E> =
Arc<dyn LineProtocolInterceptor<Error = E> + Send + Sync + 'static>;
impl<E: ErrorExt> LineProtocolInterceptor for Option<LineProtocolInterceptorRef<E>> {
type Error = E;
fn pre_execute(&self, line: &str, query_ctx: QueryContextRef) -> Result<(), Self::Error> {
if let Some(this) = self {
this.pre_execute(line, query_ctx)
} else {
Ok(())
}
}
}
/// OpenTelemetryProtocolInterceptor can track the life cycle of an OpenTelemetry protocol request
/// and customize or abort its execution at a given point.
pub trait OpenTelemetryProtocolInterceptor {
type Error: ErrorExt;
fn pre_execute(&self, _query_ctx: QueryContextRef) -> Result<(), Self::Error> {
Ok(())
}
}
pub type OpenTelemetryProtocolInterceptorRef<E> =
Arc<dyn OpenTelemetryProtocolInterceptor<Error = E> + Send + Sync + 'static>;
impl<E: ErrorExt> OpenTelemetryProtocolInterceptor
for Option<OpenTelemetryProtocolInterceptorRef<E>>
{
type Error = E;
fn pre_execute(&self, query_ctx: QueryContextRef) -> Result<(), Self::Error> {
if let Some(this) = self {
this.pre_execute(query_ctx)
} else {
Ok(())
}
}
}
/// PromStoreProtocolInterceptor can track the life cycle of a prom store request
/// and customize or abort its execution at a given point.
pub trait PromStoreProtocolInterceptor {
type Error: ErrorExt;
fn pre_write(
&self,
_write_req: &WriteRequest,
_ctx: QueryContextRef,
) -> Result<(), Self::Error> {
Ok(())
}
fn pre_read(&self, _read_req: &ReadRequest, _ctx: QueryContextRef) -> Result<(), Self::Error> {
Ok(())
}
}
pub type PromStoreProtocolInterceptorRef<E> =
Arc<dyn PromStoreProtocolInterceptor<Error = E> + Send + Sync + 'static>;
impl<E: ErrorExt> PromStoreProtocolInterceptor for Option<PromStoreProtocolInterceptorRef<E>> {
type Error = E;
fn pre_write(&self, write_req: &WriteRequest, ctx: QueryContextRef) -> Result<(), Self::Error> {
if let Some(this) = self {
this.pre_write(write_req, ctx)
} else {
Ok(())
}
}
fn pre_read(&self, read_req: &ReadRequest, ctx: QueryContextRef) -> Result<(), Self::Error> {
if let Some(this) = self {
this.pre_read(read_req, ctx)
} else {
Ok(())
}
}
}
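Note the blanket `impl ... for Option<...InterceptorRef<E>>` on each trait: frontend entry points can call `pre_execute`/`pre_write`/`pre_read` directly on the optional plugin handle, and the call falls through to `Ok(())` when no interceptor is registered, avoiding a `Some`/`None` match at every call site.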

View File

@@ -36,9 +36,13 @@ pub mod mysql;
pub mod opentsdb;
pub mod otlp;
pub mod postgres;
mod prom_row_builder;
pub mod prom_store;
pub mod prometheus_handler;
pub mod proto;
pub mod query_handler;
#[allow(clippy::all)]
mod repeated_field;
mod row_writer;
pub mod server;
mod shutdown;

View File

@@ -18,6 +18,10 @@ pub(crate) mod jemalloc;
use std::task::{Context, Poll};
use std::time::Instant;
use axum::extract::MatchedPath;
use axum::http::Request;
use axum::middleware::Next;
use axum::response::IntoResponse;
use hyper::Body;
use lazy_static::lazy_static;
use prometheus::{
@@ -48,16 +52,20 @@ lazy_static! {
&[METRIC_PROTOCOL_LABEL]
)
.unwrap();
/// Http SQL query duration per database.
pub static ref METRIC_HTTP_SQL_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_sql_elapsed",
"servers http sql elapsed",
&[METRIC_DB_LABEL]
&[METRIC_DB_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
/// Http pql query duration per database.
pub static ref METRIC_HTTP_PROMQL_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_promql_elapsed",
"servers http promql elapsed",
&[METRIC_DB_LABEL]
&[METRIC_DB_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_AUTH_FAILURE: IntCounterVec = register_int_counter_vec!(
@@ -66,33 +74,41 @@ lazy_static! {
&[METRIC_CODE_LABEL]
)
.unwrap();
/// Http influxdb write duration per database.
pub static ref METRIC_HTTP_INFLUXDB_WRITE_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_influxdb_write_elapsed",
"servers http influxdb write elapsed",
&[METRIC_DB_LABEL]
&[METRIC_DB_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
/// Http prometheus write duration per database.
pub static ref METRIC_HTTP_PROM_STORE_WRITE_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_prometheus_write_elapsed",
"servers http prometheus write elapsed",
&[METRIC_DB_LABEL]
&[METRIC_DB_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_HTTP_PROM_STORE_DECODE_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_prometheus_decode_elapsed",
"servers http prometheus decode elapsed",
/// Prometheus remote write codec duration.
pub static ref METRIC_HTTP_PROM_STORE_CODEC_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_prometheus_codec_elapsed",
"servers http prometheus request codec duration",
&["type"],
)
.unwrap();
/// Decode duration of prometheus write request.
pub static ref METRIC_HTTP_PROM_STORE_DECODE_ELAPSED: Histogram = METRIC_HTTP_PROM_STORE_CODEC_ELAPSED
.with_label_values(&["decode"]);
/// Duration to convert prometheus write request to gRPC request.
pub static ref METRIC_HTTP_PROM_STORE_CONVERT_ELAPSED: Histogram = METRIC_HTTP_PROM_STORE_CODEC_ELAPSED
.with_label_values(&["convert"]);
pub static ref METRIC_HTTP_PROM_STORE_DECODE_NUM_SERIES: Histogram = register_histogram!(
"greptime_servers_http_prometheus_decode_num_series",
"servers http prometheus decode num series",
)
.unwrap();
pub static ref METRIC_HTTP_PROM_STORE_CONVERT_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_prometheus_convert_elapsed",
"servers http prometheus convert to gRPC request elapsed",
)
.unwrap();
/// Http prometheus read duration per database.
pub static ref METRIC_HTTP_PROM_STORE_READ_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_prometheus_read_elapsed",
"servers http prometheus read elapsed",
@@ -118,36 +134,6 @@ lazy_static! {
"servers opentsdb line write elapsed"
)
.unwrap();
pub static ref METRIC_HTTP_PROMQL_FORMAT_QUERY_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_promql_format_query_elapsed",
"servers http promql format query elapsed"
)
.unwrap();
pub static ref METRIC_HTTP_PROMQL_INSTANT_QUERY_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_promql_instant_query_elapsed",
"servers http promql instant query elapsed"
)
.unwrap();
pub static ref METRIC_HTTP_PROMQL_RANGE_QUERY_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_promql_range_query_elapsed",
"servers http promql range query elapsed"
)
.unwrap();
pub static ref METRIC_HTTP_PROMQL_LABEL_QUERY_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_promql_label_query_elapsed",
"servers http promql label query elapsed"
)
.unwrap();
pub static ref METRIC_HTTP_PROMQL_SERIES_QUERY_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_promql_series_query_elapsed",
"servers http promql series query elapsed"
)
.unwrap();
pub static ref METRIC_HTTP_PROMQL_LABEL_VALUE_QUERY_ELAPSED: Histogram = register_histogram!(
"greptime_servers_http_promql_label_value_query_elapsed",
"servers http promql label value query elapsed"
)
.unwrap();
pub static ref METRIC_MYSQL_CONNECTIONS: IntGauge = register_int_gauge!(
"greptime_servers_mysql_connection_count",
"servers mysql connection count"
@@ -202,7 +188,8 @@ lazy_static! {
pub static ref METRIC_HTTP_REQUESTS_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_http_requests_elapsed",
"servers http requests elapsed",
&[METRIC_METHOD_LABEL, METRIC_PATH_LABEL, METRIC_CODE_LABEL]
&[METRIC_METHOD_LABEL, METRIC_PATH_LABEL, METRIC_CODE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_GRPC_REQUESTS_TOTAL: IntCounterVec = register_int_counter_vec!(
@@ -214,13 +201,8 @@ lazy_static! {
pub static ref METRIC_GRPC_REQUESTS_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_servers_grpc_requests_elapsed",
"servers grpc requests elapsed",
&[METRIC_PATH_LABEL, METRIC_CODE_LABEL]
)
.unwrap();
pub static ref HTTP_TRACK_METRICS: HistogramVec = register_histogram_vec!(
"greptime_http_track_metrics",
"http track metrics",
&["tag"]
&[METRIC_PATH_LABEL, METRIC_CODE_LABEL],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
}
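The explicit bucket vectors added above set the histogram resolution; without them the prometheus crate's defaults (which top out around 10 s) would lump all long-running queries into the last bucket. A small standalone sketch of the same macro and timer pattern, with an illustrative metric name:

use lazy_static::lazy_static;
use prometheus::{register_histogram_vec, Encoder, HistogramVec, TextEncoder};

lazy_static! {
    static ref QUERY_ELAPSED: HistogramVec = register_histogram_vec!(
        "example_query_elapsed",
        "query duration in seconds",
        &["db"],
        vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
    )
    .unwrap();
}

fn main() {
    {
        // The timer observes the elapsed seconds into the histogram when dropped.
        let _timer = QUERY_ELAPSED.with_label_values(&["public"]).start_timer();
        std::thread::sleep(std::time::Duration::from_millis(20));
    }

    // Dump the accumulated samples in the Prometheus text format.
    let mut out = Vec::new();
    TextEncoder::new()
        .encode(&prometheus::gather(), &mut out)
        .unwrap();
    println!("{}", String::from_utf8(out).unwrap());
}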
@@ -284,3 +266,29 @@ where
})
}
}
/// A middleware to record metrics for HTTP.
// Based on https://github.com/tokio-rs/axum/blob/axum-v0.6.16/examples/prometheus-metrics/src/main.rs
pub(crate) async fn http_metrics_layer<B>(req: Request<B>, next: Next<B>) -> impl IntoResponse {
let start = Instant::now();
let path = if let Some(matched_path) = req.extensions().get::<MatchedPath>() {
matched_path.as_str().to_owned()
} else {
req.uri().path().to_owned()
};
let method = req.method().clone();
let response = next.run(req).await;
let latency = start.elapsed().as_secs_f64();
let status = response.status().as_u16().to_string();
let method_str = method.to_string();
let labels = [method_str.as_str(), path.as_str(), status.as_str()];
METRIC_HTTP_REQUESTS_TOTAL.with_label_values(&labels).inc();
METRIC_HTTP_REQUESTS_ELAPSED
.with_label_values(&labels)
.observe(latency);
response
}

View File

@@ -335,7 +335,7 @@ impl<W: AsyncWrite + Send + Sync + Unpin> AsyncMysqlShim<W> for MysqlInstanceShi
let _ = guard.remove(&stmt_id);
}
#[tracing::instrument(skip_all)]
#[tracing::instrument(skip_all, fields(protocol = "mysql"))]
async fn on_query<'a>(
&'a mut self,
query: &'a str,

View File

@@ -19,6 +19,7 @@ use common_error::ext::ErrorExt;
use common_query::Output;
use common_recordbatch::error::Result as RecordBatchResult;
use common_recordbatch::RecordBatch;
use common_telemetry::tracing;
use datatypes::schema::SchemaRef;
use futures::{future, stream, Stream, StreamExt};
use pgwire::api::portal::{Format, Portal};
@@ -40,6 +41,7 @@ use crate::SqlPlan;
#[async_trait]
impl SimpleQueryHandler for PostgresServerHandler {
#[tracing::instrument(skip_all, fields(protocol = "postgres"))]
async fn do_query<'a, C>(
&self,
_client: &mut C,

View File

@@ -0,0 +1,272 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::string::ToString;
use api::prom_store::remote::Sample;
use api::v1::value::ValueData;
use api::v1::{
ColumnDataType, ColumnSchema, Row, RowInsertRequest, RowInsertRequests, Rows, SemanticType,
Value,
};
use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE};
use crate::proto::PromLabel;
use crate::repeated_field::Clear;
/// [TablesBuilder] serves as an intermediate container to build [RowInsertRequests].
#[derive(Default)]
pub(crate) struct TablesBuilder {
tables: HashMap<String, TableBuilder>,
}
impl Clear for TablesBuilder {
fn clear(&mut self) {
self.tables.clear();
}
}
impl TablesBuilder {
/// Gets the table builder for the given table name, creating an empty [TableBuilder] if it does not exist.
pub(crate) fn get_or_create_table_builder(
&mut self,
table_name: String,
label_num: usize,
row_num: usize,
) -> &mut TableBuilder {
self.tables
.entry(table_name)
.or_insert_with(|| TableBuilder::with_capacity(label_num + 2, row_num))
}
/// Converts [TablesBuilder] into [RowInsertRequests] plus the total row count, clearing the inner state.
pub(crate) fn as_insert_requests(&mut self) -> (RowInsertRequests, usize) {
let mut total_rows = 0;
let inserts = self
.tables
.drain()
.map(|(name, mut table)| {
total_rows += table.num_rows();
table.as_row_insert_request(name)
})
.collect();
(RowInsertRequests { inserts }, total_rows)
}
}
/// Builder for one table.
pub(crate) struct TableBuilder {
/// Column schemas.
schema: Vec<ColumnSchema>,
/// Rows written.
rows: Vec<Row>,
/// Indices of columns inside `schema`.
col_indexes: HashMap<String, usize>,
}
impl Default for TableBuilder {
fn default() -> Self {
Self::with_capacity(2, 0)
}
}
impl TableBuilder {
pub(crate) fn with_capacity(cols: usize, rows: usize) -> Self {
let mut col_indexes = HashMap::with_capacity(cols);
col_indexes.insert(GREPTIME_TIMESTAMP.to_string(), 0);
col_indexes.insert(GREPTIME_VALUE.to_string(), 1);
let mut schema = Vec::with_capacity(cols);
schema.push(ColumnSchema {
column_name: GREPTIME_TIMESTAMP.to_string(),
datatype: ColumnDataType::TimestampMillisecond as i32,
semantic_type: SemanticType::Timestamp as i32,
datatype_extension: None,
});
schema.push(ColumnSchema {
column_name: GREPTIME_VALUE.to_string(),
datatype: ColumnDataType::Float64 as i32,
semantic_type: SemanticType::Field as i32,
datatype_extension: None,
});
Self {
schema,
rows: Vec::with_capacity(rows),
col_indexes,
}
}
/// Total number of rows inside table builder.
fn num_rows(&self) -> usize {
self.rows.len()
}
/// Adds a set of labels and samples to table builder.
pub(crate) fn add_labels_and_samples(&mut self, labels: &[PromLabel], samples: &[Sample]) {
let mut row = vec![Value { value_data: None }; self.col_indexes.len()];
for PromLabel { name, value } in labels {
// safety: we expect all labels to be UTF-8 encoded strings.
let tag_name = unsafe { String::from_utf8_unchecked(name.to_vec()) };
let tag_value = unsafe { String::from_utf8_unchecked(value.to_vec()) };
let tag_value = Some(ValueData::StringValue(tag_value));
let tag_num = self.col_indexes.len();
match self.col_indexes.entry(tag_name) {
Entry::Occupied(e) => {
row[*e.get()].value_data = tag_value;
}
Entry::Vacant(e) => {
let column_name = e.key().clone();
e.insert(tag_num);
self.schema.push(ColumnSchema {
column_name,
datatype: ColumnDataType::String as i32,
semantic_type: SemanticType::Tag as i32,
datatype_extension: None,
});
row.push(Value {
value_data: tag_value,
});
}
}
}
if samples.len() == 1 {
let sample = &samples[0];
row[0].value_data = Some(ValueData::TimestampMillisecondValue(sample.timestamp));
row[1].value_data = Some(ValueData::F64Value(sample.value));
self.rows.push(Row { values: row });
return;
}
for sample in samples {
row[0].value_data = Some(ValueData::TimestampMillisecondValue(sample.timestamp));
row[1].value_data = Some(ValueData::F64Value(sample.value));
self.rows.push(Row {
values: row.clone(),
});
}
}
/// Converts [TableBuilder] to [RowInsertRequest] and clears buffered data.
pub(crate) fn as_row_insert_request(&mut self, table_name: String) -> RowInsertRequest {
let mut rows = std::mem::take(&mut self.rows);
let schema = std::mem::take(&mut self.schema);
let col_num = schema.len();
for row in &mut rows {
if row.values.len() < col_num {
row.values.resize(col_num, Value { value_data: None });
}
}
RowInsertRequest {
table_name,
rows: Some(Rows { schema, rows }),
}
}
}
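Because new tag columns can still be discovered while later rows are appended, `as_row_insert_request` pads every buffered row up to the final schema width with null `Value`s before emitting the request; the unit test below shows the resulting trailing nulls for tags a given series did not carry.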
#[cfg(test)]
mod tests {
use api::prom_store::remote::Sample;
use api::v1::value::ValueData;
use api::v1::Value;
use bytes::Bytes;
use crate::prom_row_builder::TableBuilder;
use crate::proto::PromLabel;
#[test]
fn test_table_builder() {
let mut builder = TableBuilder::default();
builder.add_labels_and_samples(
&[
PromLabel {
name: Bytes::from("tag0"),
value: Bytes::from("v0"),
},
PromLabel {
name: Bytes::from("tag1"),
value: Bytes::from("v1"),
},
],
&[Sample {
value: 0.0,
timestamp: 0,
}],
);
builder.add_labels_and_samples(
&[
PromLabel {
name: Bytes::from("tag0"),
value: Bytes::from("v0"),
},
PromLabel {
name: Bytes::from("tag2"),
value: Bytes::from("v2"),
},
],
&[Sample {
value: 0.1,
timestamp: 1,
}],
);
let request = builder.as_row_insert_request("test".to_string());
let rows = request.rows.unwrap().rows;
assert_eq!(2, rows.len());
assert_eq!(
vec![
Value {
value_data: Some(ValueData::TimestampMillisecondValue(0))
},
Value {
value_data: Some(ValueData::F64Value(0.0))
},
Value {
value_data: Some(ValueData::StringValue("v0".to_string()))
},
Value {
value_data: Some(ValueData::StringValue("v1".to_string()))
},
Value { value_data: None },
],
rows[0].values
);
assert_eq!(
vec![
Value {
value_data: Some(ValueData::TimestampMillisecondValue(1))
},
Value {
value_data: Some(ValueData::F64Value(0.1))
},
Value {
value_data: Some(ValueData::StringValue("v0".to_string()))
},
Value { value_data: None },
Value {
value_data: Some(ValueData::StringValue("v2".to_string()))
},
],
rows[1].values
);
}
}

View File

@@ -23,6 +23,7 @@ use api::prom_store::remote::{Label, Query, Sample, TimeSeries, WriteRequest};
use api::v1::RowInsertRequests;
use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE};
use common_recordbatch::{RecordBatch, RecordBatches};
use common_telemetry::tracing;
use common_time::timestamp::TimeUnit;
use datafusion::prelude::{col, lit, regexp_match, Expr};
use datafusion_common::ScalarValue;
@@ -38,6 +39,8 @@ use crate::row_writer::{self, MultiTableData};
pub const METRIC_NAME_LABEL: &str = "__name__";
pub const METRIC_NAME_LABEL_BYTES: &[u8] = b"__name__";
/// Metrics for push gateway protocol
pub struct Metrics {
pub exposition: MetricsExposition<PrometheusType, PrometheusValue>,
@@ -62,6 +65,7 @@ pub fn table_name(q: &Query) -> Result<String> {
}
/// Create a DataFrame from a remote Query
#[tracing::instrument(skip_all)]
pub fn query_to_plan(dataframe: DataFrame, q: &Query) -> Result<LogicalPlan> {
let DataFrame::DataFusion(dataframe) = dataframe;
@@ -298,12 +302,12 @@ fn recordbatch_to_timeseries(table: &str, recordbatch: RecordBatch) -> Result<Ve
Ok(timeseries_map.into_values().collect())
}
pub fn to_grpc_row_insert_requests(request: WriteRequest) -> Result<(RowInsertRequests, usize)> {
pub fn to_grpc_row_insert_requests(request: &WriteRequest) -> Result<(RowInsertRequests, usize)> {
let _timer = crate::metrics::METRIC_HTTP_PROM_STORE_CONVERT_ELAPSED.start_timer();
let mut multi_table_data = MultiTableData::new();
for series in request.timeseries {
for series in &request.timeseries {
let table_name = &series
.labels
.iter()
@@ -327,11 +331,11 @@ pub fn to_grpc_row_insert_requests(request: WriteRequest) -> Result<(RowInsertRe
);
// labels
let kvs = series.labels.into_iter().filter_map(|label| {
let kvs = series.labels.iter().filter_map(|label| {
if label.name == METRIC_NAME_LABEL {
None
} else {
Some((label.name, label.value))
Some((label.name.clone(), label.value.clone()))
}
});
@@ -647,7 +651,7 @@ mod tests {
..Default::default()
};
let mut exprs = to_grpc_row_insert_requests(write_request)
let mut exprs = to_grpc_row_insert_requests(&write_request)
.unwrap()
.0
.inserts;

src/servers/src/proto.rs Normal file
View File

@@ -0,0 +1,304 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::ops::Deref;
use api::prom_store::remote::Sample;
use api::v1::RowInsertRequests;
use bytes::{Buf, Bytes};
use prost::encoding::message::merge;
use prost::encoding::{decode_key, decode_varint, DecodeContext, WireType};
use prost::DecodeError;
use crate::prom_row_builder::TablesBuilder;
use crate::prom_store::METRIC_NAME_LABEL_BYTES;
use crate::repeated_field::{Clear, RepeatedField};
impl Clear for Sample {
fn clear(&mut self) {}
}
#[derive(Default, Clone)]
pub struct PromLabel {
pub name: Bytes,
pub value: Bytes,
}
impl Clear for PromLabel {
fn clear(&mut self) {
self.name.clear();
self.value.clear();
}
}
impl PromLabel {
pub fn merge_field<B>(
&mut self,
tag: u32,
wire_type: WireType,
buf: &mut B,
ctx: DecodeContext,
) -> Result<(), DecodeError>
where
B: Buf,
{
const STRUCT_NAME: &str = "PromLabel";
match tag {
1u32 => {
// decode label name
let value = &mut self.name;
prost::encoding::bytes::merge(wire_type, value, buf, ctx).map_err(|mut error| {
error.push(STRUCT_NAME, "name");
error
})
}
2u32 => {
// decode label value
let value = &mut self.value;
prost::encoding::bytes::merge(wire_type, value, buf, ctx).map_err(|mut error| {
error.push(STRUCT_NAME, "value");
error
})
}
_ => prost::encoding::skip_field(wire_type, tag, buf, ctx),
}
}
}
#[derive(Default)]
pub struct PromTimeSeries {
pub table_name: String,
pub labels: RepeatedField<PromLabel>,
pub samples: RepeatedField<Sample>,
}
impl Clear for PromTimeSeries {
fn clear(&mut self) {
self.table_name.clear();
self.labels.clear();
self.samples.clear();
}
}
impl PromTimeSeries {
pub fn merge_field<B>(
&mut self,
tag: u32,
wire_type: WireType,
buf: &mut B,
ctx: DecodeContext,
) -> Result<(), DecodeError>
where
B: Buf,
{
const STRUCT_NAME: &str = "PromTimeSeries";
match tag {
1u32 => {
// decode labels
let label = self.labels.push_default();
let len = decode_varint(buf).map_err(|mut error| {
error.push(STRUCT_NAME, "labels");
error
})?;
let remaining = buf.remaining();
if len > remaining as u64 {
return Err(DecodeError::new("buffer underflow"));
}
let limit = remaining - len as usize;
while buf.remaining() > limit {
let (tag, wire_type) = decode_key(buf)?;
label.merge_field(tag, wire_type, buf, ctx.clone())?;
}
if buf.remaining() != limit {
return Err(DecodeError::new("delimited length exceeded"));
}
if label.name.deref() == METRIC_NAME_LABEL_BYTES {
// safety: we expect all labels to be UTF-8 encoded strings.
let table_name = unsafe { String::from_utf8_unchecked(label.value.to_vec()) };
self.table_name = table_name;
self.labels.truncate(self.labels.len() - 1); // drop the just-pushed __name__ label; it became the table name
}
Ok(())
}
2u32 => {
let sample = self.samples.push_default();
merge(WireType::LengthDelimited, sample, buf, ctx).map_err(|mut error| {
error.push(STRUCT_NAME, "samples");
error
})?;
Ok(())
}
// skip exemplars
3u32 => prost::encoding::skip_field(wire_type, tag, buf, ctx),
_ => prost::encoding::skip_field(wire_type, tag, buf, ctx),
}
}
fn add_to_table_data(&mut self, table_builders: &mut TablesBuilder) {
let label_num = self.labels.len();
let row_num = self.samples.len();
let table_data = table_builders.get_or_create_table_builder(
std::mem::take(&mut self.table_name),
label_num,
row_num,
);
table_data.add_labels_and_samples(self.labels.as_slice(), self.samples.as_slice());
self.labels.clear();
self.samples.clear();
}
}
#[derive(Default)]
pub struct PromWriteRequest {
table_data: TablesBuilder,
series: PromTimeSeries,
}
impl Clear for PromWriteRequest {
fn clear(&mut self) {
self.table_data.clear();
}
}
impl PromWriteRequest {
pub fn as_row_insert_requests(&mut self) -> (RowInsertRequests, usize) {
self.table_data.as_insert_requests()
}
pub fn merge<B>(&mut self, mut buf: B) -> Result<(), DecodeError>
where
B: Buf,
Self: Sized,
{
const STRUCT_NAME: &str = "PromWriteRequest";
let ctx = DecodeContext::default();
while buf.has_remaining() {
let (tag, wire_type) = decode_key(&mut buf)?;
assert_eq!(WireType::LengthDelimited, wire_type);
match tag {
1u32 => {
// decode TimeSeries
let len = decode_varint(&mut buf).map_err(|mut e| {
e.push(STRUCT_NAME, "timeseries");
e
})?;
let remaining = buf.remaining();
if len > remaining as u64 {
return Err(DecodeError::new("buffer underflow"));
}
let limit = remaining - len as usize;
while buf.remaining() > limit {
let (tag, wire_type) = decode_key(&mut buf)?;
self.series
.merge_field(tag, wire_type, &mut buf, ctx.clone())?;
}
if buf.remaining() != limit {
return Err(DecodeError::new("delimited length exceeded"));
}
self.series.add_to_table_data(&mut self.table_data);
}
3u32 => {
// we can ignore metadata for now.
prost::encoding::skip_field(wire_type, tag, &mut buf, ctx.clone())?;
}
_ => prost::encoding::skip_field(wire_type, tag, &mut buf, ctx.clone())?,
}
}
Ok(())
}
}
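`PromWriteRequest::merge` above walks the protobuf wire format by hand with prost's low-level helpers instead of materializing a prost-generated struct. The sketch below applies the same helpers (decode_key, decode_varint, bytes::merge, skip_field) to a single nested field so the length-delimited bookkeeping is easier to follow; the message layout here is hypothetical, not the Prometheus one.

use bytes::{Buf, Bytes};
use prost::encoding::{decode_key, decode_varint, DecodeContext, WireType};
use prost::DecodeError;

/// Decodes `message Outer { Inner inner = 1; } message Inner { bytes name = 1; }`
/// by hand, the same way `PromWriteRequest::merge` walks its time series.
fn decode_outer(mut buf: Bytes) -> Result<Bytes, DecodeError> {
    let ctx = DecodeContext::default();
    let mut name = Bytes::new();
    while buf.has_remaining() {
        let (tag, wire_type) = decode_key(&mut buf)?;
        match (tag, wire_type) {
            (1u32, WireType::LengthDelimited) => {
                // Nested message: read its byte length, then decode fields
                // until exactly that many bytes have been consumed.
                let len = decode_varint(&mut buf)?;
                if len > buf.remaining() as u64 {
                    return Err(DecodeError::new("buffer underflow"));
                }
                let limit = buf.remaining() - len as usize;
                while buf.remaining() > limit {
                    let (tag, wire_type) = decode_key(&mut buf)?;
                    match tag {
                        1u32 => prost::encoding::bytes::merge(
                            wire_type, &mut name, &mut buf, ctx.clone(),
                        )?,
                        _ => prost::encoding::skip_field(wire_type, tag, &mut buf, ctx.clone())?,
                    }
                }
                if buf.remaining() != limit {
                    return Err(DecodeError::new("delimited length exceeded"));
                }
            }
            _ => prost::encoding::skip_field(wire_type, tag, &mut buf, ctx.clone())?,
        }
    }
    Ok(name)
}

fn main() -> Result<(), DecodeError> {
    // Outer field 1 (length 4) wrapping Inner field 1 (length 2) with payload "hi".
    let encoded = Bytes::from_static(&[0x0a, 0x04, 0x0a, 0x02, b'h', b'i']);
    assert_eq!(b"hi", decode_outer(encoded)?.as_ref());
    Ok(())
}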
#[cfg(test)]
mod tests {
use std::collections::{HashMap, HashSet};
use api::prom_store::remote::WriteRequest;
use api::v1::RowInsertRequests;
use bytes::Bytes;
use prost::Message;
use crate::prom_store::to_grpc_row_insert_requests;
use crate::proto::PromWriteRequest;
use crate::repeated_field::Clear;
fn check_deserialized(
prom_write_request: &mut PromWriteRequest,
data: &Bytes,
expected_samples: usize,
expected_rows: &RowInsertRequests,
) {
prom_write_request.clear();
prom_write_request.merge(data.clone()).unwrap();
let (prom_rows, samples) = prom_write_request.as_row_insert_requests();
assert_eq!(expected_samples, samples);
assert_eq!(expected_rows.inserts.len(), prom_rows.inserts.len());
let schemas = expected_rows
.inserts
.iter()
.map(|r| {
(
r.table_name.clone(),
r.rows
.as_ref()
.unwrap()
.schema
.iter()
.map(|c| (c.column_name.clone(), c.datatype, c.semantic_type))
.collect::<HashSet<_>>(),
)
})
.collect::<HashMap<_, _>>();
for r in &prom_rows.inserts {
let expected = schemas.get(&r.table_name).unwrap();
assert_eq!(
expected,
&r.rows
.as_ref()
.unwrap()
.schema
.iter()
.map(|c| { (c.column_name.clone(), c.datatype, c.semantic_type) })
.collect()
);
}
}
// Ensures `WriteRequest` and `PromWriteRequest` produce the same gRPC request.
#[test]
fn test_decode_write_request() {
let mut d = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
d.push("benches");
d.push("write_request.pb.data");
let data = Bytes::from(std::fs::read(d).unwrap());
let (expected_rows, expected_samples) =
to_grpc_row_insert_requests(&WriteRequest::decode(data.clone()).unwrap()).unwrap();
let mut prom_write_request = PromWriteRequest::default();
for _ in 0..3 {
check_deserialized(
&mut prom_write_request,
&data,
expected_samples,
&expected_rows,
);
}
}
}

View File

@@ -29,6 +29,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use api::prom_store::remote::{ReadRequest, WriteRequest};
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use common_query::Output;
use opentelemetry_proto::tonic::collector::metrics::v1::{
@@ -95,6 +96,15 @@ pub trait PromStoreProtocolHandler {
ctx: QueryContextRef,
with_metric_engine: bool,
) -> Result<()>;
/// Handling prometheus remote write requests
async fn write_fast(
&self,
request: RowInsertRequests,
ctx: QueryContextRef,
with_metric_engine: bool,
) -> Result<()>;
/// Handling prometheus remote read requests
async fn read(&self, request: ReadRequest, ctx: QueryContextRef) -> Result<PromStoreResponse>;
/// Handling push gateway requests

View File

@@ -0,0 +1,540 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2019 Stepan Koltsov
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
// OR OTHER DEALINGS IN THE SOFTWARE.
//! The [Clear] trait and [RepeatedField] are taken from [rust-protobuf](https://github.com/stepancheg/rust-protobuf/tree/master/protobuf-examples/vs-prost)
//! to leverage the pooling mechanism to avoid frequent heap allocation/deallocation when decoding deeply nested structs.
use std::borrow::Borrow;
use std::cmp::Ordering;
use std::default::Default;
use std::hash::{Hash, Hasher};
use std::iter::{FromIterator, IntoIterator};
use std::ops::{Deref, DerefMut, Index, IndexMut};
use std::{fmt, slice, vec};
use bytes::Bytes;
/// Anything that can be cleared.
pub trait Clear {
/// Clear this value, making it equivalent to a newly created object.
fn clear(&mut self);
}
impl<T> Clear for Option<T> {
fn clear(&mut self) {
self.take();
}
}
impl Clear for String {
fn clear(&mut self) {
String::clear(self);
}
}
impl<T> Clear for Vec<T> {
fn clear(&mut self) {
Vec::clear(self);
}
}
impl Clear for Bytes {
fn clear(&mut self) {
Bytes::clear(self);
}
}
/// Wrapper around vector to avoid deallocations on clear.
pub struct RepeatedField<T> {
vec: Vec<T>,
len: usize,
}
impl<T> RepeatedField<T> {
/// Return number of elements in this container.
#[inline]
pub fn len(&self) -> usize {
self.len
}
/// Clear.
#[inline]
pub fn clear(&mut self) {
self.len = 0;
}
}
impl<T> Default for RepeatedField<T> {
#[inline]
fn default() -> RepeatedField<T> {
RepeatedField {
vec: Vec::new(),
len: 0,
}
}
}
impl<T> RepeatedField<T> {
/// Create new empty container.
#[inline]
pub fn new() -> RepeatedField<T> {
Default::default()
}
/// Create a container with data from the given vec.
#[inline]
pub fn from_vec(vec: Vec<T>) -> RepeatedField<T> {
let len = vec.len();
RepeatedField { vec, len }
}
/// Convert data into vec.
#[inline]
pub fn into_vec(self) -> Vec<T> {
let mut vec = self.vec;
vec.truncate(self.len);
vec
}
/// Return current capacity.
#[inline]
pub fn capacity(&self) -> usize {
self.vec.capacity()
}
/// View data as slice.
#[inline]
pub fn as_slice<'a>(&'a self) -> &'a [T] {
&self.vec[..self.len]
}
/// View data as mutable slice.
#[inline]
pub fn as_mut_slice<'a>(&'a mut self) -> &'a mut [T] {
&mut self.vec[..self.len]
}
/// Get subslice of this container.
#[inline]
pub fn slice(&self, start: usize, end: usize) -> &[T] {
&self.as_ref()[start..end]
}
/// Get mutable subslice of this container.
#[inline]
pub fn slice_mut(&mut self, start: usize, end: usize) -> &mut [T] {
&mut self.as_mut_slice()[start..end]
}
/// Get slice from given index.
#[inline]
pub fn slice_from(&self, start: usize) -> &[T] {
&self.as_ref()[start..]
}
/// Get mutable slice from given index.
#[inline]
pub fn slice_from_mut(&mut self, start: usize) -> &mut [T] {
&mut self.as_mut_slice()[start..]
}
/// Get slice to given index.
#[inline]
pub fn slice_to(&self, end: usize) -> &[T] {
&self.as_ref()[..end]
}
/// Get mutable slice to given index.
#[inline]
pub fn slice_to_mut(&mut self, end: usize) -> &mut [T] {
&mut self.as_mut_slice()[..end]
}
/// View this container as two slices split at given index.
#[inline]
pub fn split_at<'a>(&'a self, mid: usize) -> (&'a [T], &'a [T]) {
self.as_ref().split_at(mid)
}
/// View this container as two mutable slices split at given index.
#[inline]
pub fn split_at_mut<'a>(&'a mut self, mid: usize) -> (&'a mut [T], &'a mut [T]) {
self.as_mut_slice().split_at_mut(mid)
}
/// View all but first elements of this container.
#[inline]
pub fn tail(&self) -> &[T] {
&self.as_ref()[1..]
}
/// Last element of this container.
#[inline]
pub fn last(&self) -> Option<&T> {
self.as_ref().last()
}
/// Mutable last element of this container.
#[inline]
pub fn last_mut<'a>(&'a mut self) -> Option<&'a mut T> {
self.as_mut_slice().last_mut()
}
/// View all but last elements of this container.
#[inline]
pub fn init<'a>(&'a self) -> &'a [T] {
let s = self.as_ref();
&s[0..s.len() - 1]
}
/// Push an element to the end.
#[inline]
pub fn push(&mut self, value: T) {
if self.len == self.vec.len() {
self.vec.push(value);
} else {
self.vec[self.len] = value;
}
self.len += 1;
}
/// Pop last element.
#[inline]
pub fn pop(&mut self) -> Option<T> {
if self.len == 0 {
None
} else {
self.vec.truncate(self.len);
self.len -= 1;
self.vec.pop()
}
}
/// Insert an element at specified position.
#[inline]
pub fn insert(&mut self, index: usize, value: T) {
assert!(index <= self.len);
self.vec.insert(index, value);
self.len += 1;
}
/// Remove an element from specified position.
#[inline]
pub fn remove(&mut self, index: usize) -> T {
assert!(index < self.len);
self.len -= 1;
self.vec.remove(index)
}
/// Retains only the elements specified by the predicate.
///
/// In other words, remove all elements `e` such that `f(&e)` returns `false`.
/// This method operates in place, visiting each element exactly once in the
/// original order, and preserves the order of the retained elements.
///
/// # Examples
///
/// ```
/// # use protobuf::RepeatedField;
///
/// let mut vec = RepeatedField::from(vec![1, 2, 3, 4]);
/// vec.retain(|&x| x % 2 == 0);
/// assert_eq!(vec, RepeatedField::from(vec![2, 4]));
/// ```
pub fn retain<F>(&mut self, f: F)
where
F: FnMut(&T) -> bool,
{
// suboptimal
self.vec.truncate(self.len);
self.vec.retain(f);
self.len = self.vec.len();
}
/// Truncate at specified length.
#[inline]
pub fn truncate(&mut self, len: usize) {
if self.len > len {
self.len = len;
}
}
/// Reverse in place.
#[inline]
pub fn reverse(&mut self) {
self.as_mut_slice().reverse()
}
/// Into owned iterator.
#[inline]
pub fn into_iter(mut self) -> vec::IntoIter<T> {
self.vec.truncate(self.len);
self.vec.into_iter()
}
/// Immutable data iterator.
#[inline]
pub fn iter<'a>(&'a self) -> slice::Iter<'a, T> {
self.as_ref().iter()
}
/// Mutable data iterator.
#[inline]
pub fn iter_mut<'a>(&'a mut self) -> slice::IterMut<'a, T> {
self.as_mut_slice().iter_mut()
}
/// Sort elements with given comparator.
#[inline]
pub fn sort_by<F>(&mut self, compare: F)
where
F: Fn(&T, &T) -> Ordering,
{
self.as_mut_slice().sort_by(compare)
}
/// Get data as raw pointer.
#[inline]
pub fn as_ptr(&self) -> *const T {
self.vec.as_ptr()
}
/// Get data as a mutable raw pointer.
#[inline]
pub fn as_mut_ptr(&mut self) -> *mut T {
self.vec.as_mut_ptr()
}
}
impl<T: Default + Clear> RepeatedField<T> {
/// Push default value.
/// This operation could be faster than `rf.push(Default::default())`,
/// because it may reuse a previously allocated and cleared element.
pub fn push_default<'a>(&'a mut self) -> &'a mut T {
if self.len == self.vec.len() {
self.vec.push(Default::default());
} else {
self.vec[self.len].clear();
}
self.len += 1;
self.last_mut().unwrap()
}
}
impl<T> From<Vec<T>> for RepeatedField<T> {
#[inline]
fn from(values: Vec<T>) -> RepeatedField<T> {
RepeatedField::from_vec(values)
}
}
impl<'a, T: Clone> From<&'a [T]> for RepeatedField<T> {
#[inline]
fn from(values: &'a [T]) -> RepeatedField<T> {
RepeatedField::from_slice(values)
}
}
impl<T> Into<Vec<T>> for RepeatedField<T> {
#[inline]
fn into(self) -> Vec<T> {
self.into_vec()
}
}
impl<T: Clone> RepeatedField<T> {
/// Copy slice data to `RepeatedField`
#[inline]
pub fn from_slice(values: &[T]) -> RepeatedField<T> {
RepeatedField::from_vec(values.to_vec())
}
/// Copy data from any `AsRef<[T]>` value into a `RepeatedField`
#[inline]
pub fn from_ref<X: AsRef<[T]>>(values: X) -> RepeatedField<T> {
RepeatedField::from_slice(values.as_ref())
}
/// Copy this data into new vec.
#[inline]
pub fn to_vec(&self) -> Vec<T> {
self.as_ref().to_vec()
}
}
impl<T: Clone> Clone for RepeatedField<T> {
#[inline]
fn clone(&self) -> RepeatedField<T> {
RepeatedField {
vec: self.to_vec(),
len: self.len(),
}
}
}
impl<T> FromIterator<T> for RepeatedField<T> {
#[inline]
fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> RepeatedField<T> {
RepeatedField::from_vec(FromIterator::from_iter(iter))
}
}
impl<'a, T> IntoIterator for &'a RepeatedField<T> {
type Item = &'a T;
type IntoIter = slice::Iter<'a, T>;
fn into_iter(self) -> slice::Iter<'a, T> {
self.iter()
}
}
impl<'a, T> IntoIterator for &'a mut RepeatedField<T> {
type Item = &'a mut T;
type IntoIter = slice::IterMut<'a, T>;
fn into_iter(self) -> slice::IterMut<'a, T> {
self.iter_mut()
}
}
impl<T> IntoIterator for RepeatedField<T> {
type Item = T;
type IntoIter = vec::IntoIter<T>;
fn into_iter(self) -> vec::IntoIter<T> {
self.into_iter()
}
}
impl<T: PartialEq> PartialEq for RepeatedField<T> {
#[inline]
fn eq(&self, other: &RepeatedField<T>) -> bool {
self.as_ref() == other.as_ref()
}
}
impl<T: Eq> Eq for RepeatedField<T> {}
impl<T: PartialEq> PartialEq<[T]> for RepeatedField<T> {
fn eq(&self, other: &[T]) -> bool {
self.as_slice() == other
}
}
impl<T: PartialEq> PartialEq<RepeatedField<T>> for [T] {
fn eq(&self, other: &RepeatedField<T>) -> bool {
self == other.as_slice()
}
}
impl<T: PartialEq> RepeatedField<T> {
/// True iff this container contains the given element.
#[inline]
pub fn contains(&self, value: &T) -> bool {
self.as_ref().contains(value)
}
}
impl<T: Hash> Hash for RepeatedField<T> {
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_ref().hash(state);
}
}
impl<T> AsRef<[T]> for RepeatedField<T> {
#[inline]
fn as_ref<'a>(&'a self) -> &'a [T] {
&self.vec[..self.len]
}
}
impl<T> Borrow<[T]> for RepeatedField<T> {
#[inline]
fn borrow(&self) -> &[T] {
&self.vec[..self.len]
}
}
impl<T> Deref for RepeatedField<T> {
type Target = [T];
#[inline]
fn deref(&self) -> &[T] {
&self.vec[..self.len]
}
}
impl<T> DerefMut for RepeatedField<T> {
#[inline]
fn deref_mut(&mut self) -> &mut [T] {
&mut self.vec[..self.len]
}
}
impl<T> Index<usize> for RepeatedField<T> {
type Output = T;
#[inline]
fn index<'a>(&'a self, index: usize) -> &'a T {
&self.as_ref()[index]
}
}
impl<T> IndexMut<usize> for RepeatedField<T> {
#[inline]
fn index_mut<'a>(&'a mut self, index: usize) -> &'a mut T {
&mut self.as_mut_slice()[index]
}
}
impl<T> Extend<T> for RepeatedField<T> {
fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
self.vec.truncate(self.len);
self.vec.extend(iter);
self.len = self.vec.len();
}
}
impl<'a, T: Copy + 'a> Extend<&'a T> for RepeatedField<T> {
fn extend<I: IntoIterator<Item = &'a T>>(&mut self, iter: I) {
self.vec.truncate(self.len);
self.vec.extend(iter);
self.len = self.vec.len();
}
}
impl<T: fmt::Debug> fmt::Debug for RepeatedField<T> {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.as_ref().fmt(f)
}
}
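To make the pooling intent of this module concrete, here is a hedged usage sketch (not taken from the codebase) of how `clear` and `push_default` let a `RepeatedField` recycle its backing allocation across decode passes.

fn reuse_buffer() {
    let mut labels: RepeatedField<String> = RepeatedField::new();
    for name in ["__name__", "job", "instance"] {
        labels.push(name.to_string());
    }
    let cap_before = labels.capacity();

    // Logical clear: the length drops to zero but the `String`s stay allocated.
    labels.clear();
    assert_eq!(labels.len(), 0);

    // `push_default` clears and hands back one of the retained elements,
    // so refilling avoids fresh heap allocations while capacity suffices.
    let slot = labels.push_default();
    slot.push_str("__name__");

    assert_eq!(labels.len(), 1);
    assert_eq!(labels.capacity(), cap_before);
    assert_eq!(&labels[0], "__name__");
}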

View File

@@ -18,6 +18,7 @@ use api::prom_store::remote::{
LabelMatcher, Query, QueryResult, ReadRequest, ReadResponse, WriteRequest,
};
use api::v1::greptime_request::Request;
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use axum::Router;
use axum_test_helper::TestClient;
@@ -64,6 +65,16 @@ impl PromStoreProtocolHandler for DummyInstance {
Ok(())
}
async fn write_fast(
&self,
_request: RowInsertRequests,
_ctx: QueryContextRef,
_with_metric_engine: bool,
) -> Result<()> {
Ok(())
}
async fn read(&self, request: ReadRequest, ctx: QueryContextRef) -> Result<PromStoreResponse> {
let _ = self
.tx
@@ -141,6 +152,7 @@ fn make_test_app(tx: mpsc::Sender<(String, Vec<u8>)>) -> Router {
#[tokio::test]
async fn test_prometheus_remote_write_read() {
common_telemetry::init_default_ut_logging();
let (tx, mut rx) = mpsc::channel(100);
let app = make_test_app(tx);
@@ -219,28 +231,17 @@ async fn test_prometheus_remote_write_read() {
requests.push(s);
}
assert_eq!(4, requests.len());
assert_eq!(2, requests.len());
assert_eq!("public", requests[0].0);
assert_eq!("prometheus", requests[1].0);
assert_eq!("prometheus", requests[2].0);
assert_eq!("public", requests[3].0);
assert_eq!(
write_request,
WriteRequest::decode(&(requests[0].1)[..]).unwrap()
);
assert_eq!(
write_request,
WriteRequest::decode(&(requests[1].1)[..]).unwrap()
);
assert_eq!("prometheus", requests[0].0);
assert_eq!("public", requests[1].0);
assert_eq!(
read_request,
ReadRequest::decode(&(requests[2].1)[..]).unwrap()
ReadRequest::decode(&(requests[0].1)[..]).unwrap()
);
assert_eq!(
read_request,
ReadRequest::decode(&(requests[3].1)[..]).unwrap()
ReadRequest::decode(&(requests[1].1)[..]).unwrap()
);
}

View File

@@ -98,6 +98,7 @@ impl CreateTableExprTranslator {
_ => format!("{v}"),
},
PartitionBound::MaxValue => "MAXVALUE".to_string(),
PartitionBound::Expr(expr) => expr.to_parser_expr().to_string(),
}
}

View File

@@ -186,7 +186,6 @@ mod test {
}
#[tokio::test(flavor = "multi_thread")]
#[ignore = "TODO(ruihang): WIP new partition rule"]
async fn test_distributed_insert_delete_and_query() {
common_telemetry::init_default_ut_logging();
@@ -204,11 +203,11 @@ CREATE TABLE {table_name} (
ts TIMESTAMP,
TIME INDEX (ts),
PRIMARY KEY (a, b)
) PARTITION BY RANGE COLUMNS(a) (
PARTITION r0 VALUES LESS THAN (10),
PARTITION r1 VALUES LESS THAN (20),
PARTITION r2 VALUES LESS THAN (50),
PARTITION r3 VALUES LESS THAN (MAXVALUE),
) PARTITION ON COLUMNS(a) (
a < 10,
a >= 10 AND a < 20,
a >= 20 AND a < 50,
a >= 50
)"
);
create_table(frontend, sql).await;

View File

@@ -67,7 +67,6 @@ mod tests {
}
#[tokio::test(flavor = "multi_thread")]
#[ignore = "TODO(ruihang): WIP new partition rule"]
async fn test_distributed_exec_sql() {
common_telemetry::init_default_ut_logging();
@@ -85,11 +84,11 @@ mod tests {
TIME INDEX (ts),
PRIMARY KEY(host)
)
PARTITION BY RANGE COLUMNS (host) (
PARTITION r0 VALUES LESS THAN ('550-A'),
PARTITION r1 VALUES LESS THAN ('550-W'),
PARTITION r2 VALUES LESS THAN ('MOSS'),
PARTITION r3 VALUES LESS THAN (MAXVALUE),
PARTITION ON COLUMNS (host) (
host < '550-A',
host >= '550-A' AND host < '550-W',
host >= '550-W' AND host < 'MOSS',
host >= 'MOSS'
)
engine=mito"#;
create_table(instance, sql).await;

View File

@@ -83,7 +83,6 @@ async fn test_create_database_and_insert_query(instance: Arc<dyn MockInstance>)
}
#[apply(both_instances_cases)]
#[ignore = "TODO(ruihang): WIP new partition rule"]
async fn test_show_create_table(instance: Arc<dyn MockInstance>) {
let frontend = instance.frontend();
let sql = if instance.is_distributed_mode() {
@@ -92,11 +91,11 @@ async fn test_show_create_table(instance: Arc<dyn MockInstance>) {
ts timestamp,
TIME INDEX(ts)
)
PARTITION BY RANGE COLUMNS (n) (
PARTITION r0 VALUES LESS THAN (1),
PARTITION r1 VALUES LESS THAN (10),
PARTITION r2 VALUES LESS THAN (100),
PARTITION r3 VALUES LESS THAN (MAXVALUE),
PARTITION ON COLUMNS (n) (
n < 1,
n >= 1 AND n < 10,
n >= 10 AND n < 100,
n >= 100
)"#
} else {
r#"create table demo(
@@ -113,26 +112,26 @@ PARTITION BY RANGE COLUMNS (n) (
let output = execute_sql(&frontend, "show create table demo").await;
let expected = if instance.is_distributed_mode() {
r#"+-------+--------------------------------------------+
| Table | Create Table |
+-------+--------------------------------------------+
| demo | CREATE TABLE IF NOT EXISTS "demo" ( |
| | "n" INT NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts"), |
| | PRIMARY KEY ("n") |
| | ) |
| | PARTITION BY RANGE COLUMNS ("n") ( |
| | PARTITION r0 VALUES LESS THAN (1), |
| | PARTITION r1 VALUES LESS THAN (10), |
| | PARTITION r2 VALUES LESS THAN (100), |
| | PARTITION r3 VALUES LESS THAN (MAXVALUE) |
| | ) |
| | ENGINE=mito |
| | WITH( |
| | regions = 4 |
| | ) |
+-------+--------------------------------------------+"#
r#"+-------+-------------------------------------+
| Table | Create Table |
+-------+-------------------------------------+
| demo | CREATE TABLE IF NOT EXISTS "demo" ( |
| | "n" INT NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts"), |
| | PRIMARY KEY ("n") |
| | ) |
| | PARTITION ON COLUMNS ("n") ( |
| | n < 1, |
| | n >= 100, |
| | n >= 1 AND n < 10, |
| | n >= 10 AND n < 100 |
| | ) |
| | ENGINE=mito |
| | WITH( |
| | regions = 4 |
| | ) |
+-------+-------------------------------------+"#
} else {
r#"+-------+-------------------------------------+
| Table | Create Table |
@@ -440,39 +439,27 @@ async fn test_execute_query(instance: Arc<dyn MockInstance>) {
async fn test_execute_show_databases_tables(instance: Arc<dyn MockInstance>) {
let instance = instance.frontend();
let expected = "\
+--------------------+
| Database |
+--------------------+
| greptime_private |
| information_schema |
| public |
+--------------------+\
";
let output = execute_sql(&instance, "show databases").await;
match output {
Output::RecordBatches(databases) => {
let databases = databases.take();
assert_eq!(1, databases[0].num_columns());
assert_eq!(databases[0].column(0).len(), 3);
assert_eq!(
*databases[0].column(0),
Arc::new(StringVector::from(vec![
Some("greptime_private"),
Some("information_schema"),
Some("public")
])) as VectorRef
);
}
_ => unreachable!(),
}
check_unordered_output_stream(output, expected).await;
let output = execute_sql(&instance, "show databases like '%bl%'").await;
match output {
Output::RecordBatches(databases) => {
let databases = databases.take();
assert_eq!(1, databases[0].num_columns());
assert_eq!(databases[0].column(0).len(), 1);
assert_eq!(
*databases[0].column(0),
Arc::new(StringVector::from(vec![Some("public")])) as VectorRef
);
}
_ => unreachable!(),
}
let expected = "\
+----------+
| Database |
+----------+
| public |
+----------+\
";
check_unordered_output_stream(output, expected).await;
let expected = "\
+---------+
@@ -500,21 +487,41 @@ async fn test_execute_show_databases_tables(instance: Arc<dyn MockInstance>) {
";
check_unordered_output_stream(output, expected).await;
let output = execute_sql(&instance, "SHOW FULL TABLES WHERE Table_type != 'VIEW'").await;
let expected = "\
+---------+-----------------+
| Tables | Table_type |
+---------+-----------------+
| demo | BASE TABLE |
| numbers | LOCAL TEMPORARY |
+---------+-----------------+\
";
check_unordered_output_stream(output, expected).await;
let output = execute_sql(
&instance,
"SHOW FULL TABLES WHERE Table_type = 'BASE TABLE'",
)
.await;
let expected = "\
+--------+------------+
| Tables | Table_type |
+--------+------------+
| demo | BASE TABLE |
+--------+------------+\
";
check_unordered_output_stream(output, expected).await;
// show tables like [string]
let output = execute_sql(&instance, "show tables like 'de%'").await;
match output {
Output::RecordBatches(databases) => {
let databases = databases.take();
assert_eq!(1, databases[0].num_columns());
assert_eq!(databases[0].column(0).len(), 1);
assert_eq!(
*databases[0].column(0),
Arc::new(StringVector::from(vec![Some("demo")])) as VectorRef
);
}
_ => unreachable!(),
}
let expected = "\
+--------+
| Tables |
+--------+
| demo |
+--------+\
";
check_unordered_output_stream(output, expected).await;
}
#[apply(both_instances_cases)]

View File

@@ -336,7 +336,12 @@ pub(crate) async fn check_unordered_output_stream(output: Output, expected: &str
};
let pretty_print = sort_table(&recordbatches.pretty_print().unwrap());
let expected = sort_table(expected);
assert_eq!(pretty_print, expected);
assert_eq!(
pretty_print,
expected,
"\n{}",
recordbatches.pretty_print().unwrap()
);
}
pub fn prepare_path(p: &str) -> String {

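The assertion above relies on `sort_table` to make the comparison order-insensitive. A hedged sketch of what such a helper might look like (the actual implementation may differ): sort the lines of the pretty-printed table so that two tables containing the same rows in different orders compare equal.

// Hypothetical sketch: header and border lines sort along with the data
// rows, which is harmless because both sides go through the same function.
fn sort_table(table: &str) -> String {
    let mut lines: Vec<&str> = table.lines().collect();
    lines.sort_unstable();
    lines.join("\n")
}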
View File

@@ -557,7 +557,8 @@ pub async fn test_metrics_api(store_type: StorageType) {
let res = client.get("/metrics").send().await;
assert_eq!(res.status(), StatusCode::OK);
let body = res.text().await;
assert!(body.contains("frontend_handle_sql_elapsed"));
// `# HELP` is a comment line that is always present in the Prometheus text exposition.
assert!(body.contains("# HELP"));
guard.remove_all().await;
}
@@ -784,6 +785,13 @@ write_buffer_size = "8MiB"
mem_threshold_on_create = "64.0MiB"
intermediate_path = ""
[datanode.region_engine.mito.memtable]
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 32768
dedup = true
fork_dictionary_bytes = "1GiB"
[[datanode.region_engine]]
[datanode.region_engine.file]

View File

@@ -29,7 +29,6 @@ http_tests!(File, S3, S3WithCache, Oss, Azblob, Gcs);
// region_failover_tests!(File, S3, S3WithCache, Oss, Azblob);
sql_tests!(File);
// TODO(ruihang): re-enable this when the new partition rule is ready
// region_migration_tests!(File);
region_migration_tests!(File);
// TODO(niebayes): add integration tests for remote wal.

View File

@@ -1,36 +0,0 @@
CREATE TABLE my_table (
a INT PRIMARY KEY,
b STRING,
ts TIMESTAMP TIME INDEX,
)
PARTITION ON COLUMNS (a) (
a < 1000,
a >= 1000 AND a < 2000,
a >= 2000
);
Affected Rows: 0
-- SQLNESS REPLACE (\d{13}) ID
SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name;
+---------------+--------------+------------+----------------+---------------------------------+-----------------------+
| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id |
+---------------+--------------+------------+----------------+---------------------------------+-----------------------+
| greptime | public | my_table | p0 | (a) VALUES LESS THAN (MAXVALUE) | ID |
+---------------+--------------+------------+----------------+---------------------------------+-----------------------+
-- SQLNESS REPLACE (\d{13}) REGION_ID
-- SQLNESS REPLACE (\d{1}) PEER_ID
SELECT region_id, peer_id, is_leader, status FROM information_schema.greptime_region_peers ORDER BY peer_id;
+---------------+---------+-----------+--------+
| region_id | peer_id | is_leader | status |
+---------------+---------+-----------+--------+
| REGION_ID | PEER_ID | Yes | ALIVE |
+---------------+---------+-----------+--------+
DROP TABLE my_table;
Affected Rows: 0

View File

@@ -1,19 +0,0 @@
CREATE TABLE my_table (
a INT PRIMARY KEY,
b STRING,
ts TIMESTAMP TIME INDEX,
)
PARTITION ON COLUMNS (a) (
a < 1000,
a >= 1000 AND a < 2000,
a >= 2000
);
-- SQLNESS REPLACE (\d{13}) ID
SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name;
-- SQLNESS REPLACE (\d{13}) REGION_ID
-- SQLNESS REPLACE (\d{1}) PEER_ID
SELECT region_id, peer_id, is_leader, status FROM information_schema.greptime_region_peers ORDER BY peer_id;
DROP TABLE my_table;

View File

@@ -13,16 +13,16 @@ Affected Rows: 1
SHOW DATABASES LIKE '%public%';
+--------------------+
| Schemas |
| Database |
+--------------------+
| public |
| test_public_schema |
+--------------------+
SHOW DATABASES WHERE Schemas='test_public_schema';
SHOW DATABASES WHERE Database = 'test_public_schema';
+--------------------+
| Schemas |
| Database |
+--------------------+
| test_public_schema |
+--------------------+
@@ -81,6 +81,14 @@ SHOW TABLES;
| hello |
+--------+
SHOW FULL TABLES WHERE Table_type != 'VIEW';
+--------+------------+
| Tables | Table_type |
+--------+------------+
| hello | BASE TABLE |
+--------+------------+
DROP TABLE hello;
Affected Rows: 0
@@ -91,10 +99,8 @@ Error: 4001(TableNotFound), Table not found: greptime.test_public_schema.hello
SHOW TABLES FROM test_public_schema;
+--------+
| Tables |
+--------+
+--------+
++
++
SHOW TABLES FROM public;
@@ -104,7 +110,7 @@ SHOW TABLES FROM public;
| numbers |
+---------+
SHOW TABLES FROM public WHERE Tables='numbers';
SHOW TABLES FROM public WHERE Tables = 'numbers';
+---------+
| Tables |

View File

@@ -6,7 +6,7 @@ CREATE SCHEMA IF NOT EXISTS test_public_schema;
SHOW DATABASES LIKE '%public%';
SHOW DATABASES WHERE Schemas='test_public_schema';
SHOW DATABASES WHERE Database = 'test_public_schema';
USE test_public_schema;
@@ -26,6 +26,8 @@ SELECT * FROM hello;
SHOW TABLES;
SHOW FULL TABLES WHERE Table_type != 'VIEW';
DROP TABLE hello;
DROP TABLE hello;
@@ -34,7 +36,7 @@ SHOW TABLES FROM test_public_schema;
SHOW TABLES FROM public;
SHOW TABLES FROM public WHERE Tables='numbers';
SHOW TABLES FROM public WHERE Tables = 'numbers';
DROP SCHEMA test_public_schema;

View File

@@ -13,7 +13,7 @@ Error: 1002(Unexpected), Unexpected, violated: Invalid database name: ㊙data
show databases;
+--------------------+
| Schemas |
| Database |
+--------------------+
| greptime_private |
| illegal-database |

View File

@@ -0,0 +1,170 @@
CREATE TABLE my_table (
a INT PRIMARY KEY,
b STRING,
ts TIMESTAMP TIME INDEX,
)
PARTITION ON COLUMNS (a) (
a < 1000,
a >= 1000 AND a < 2000,
a >= 2000
);
Affected Rows: 0
-- SQLNESS REPLACE (\d{13}) ID
SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name;
+---------------+--------------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id |
+---------------+--------------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
| greptime | public | my_table | p0 | (a) VALUES LESS THAN (PartitionExpr { lhs: Column("a"), op: Lt, rhs: Value(Int32(1000)) }) | ID |
| greptime | public | my_table | p1 | (a) VALUES LESS THAN (PartitionExpr { lhs: Column("a"), op: GtEq, rhs: Value(Int32(2000)) }) | ID |
| greptime | public | my_table | p2 | (a) VALUES LESS THAN (PartitionExpr { lhs: Expr(PartitionExpr { lhs: Column("a"), op: GtEq, rhs: Value(Int32(1000)) }), op: And, rhs: Expr(PartitionExpr { lhs: Column("a"), op: Lt, rhs: Value(Int32(2000)) }) }) | ID |
+---------------+--------------+------------+----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
-- SQLNESS REPLACE (\d{13}) REGION_ID
-- SQLNESS REPLACE (\d{1}) PEER_ID
SELECT region_id, peer_id, is_leader, status FROM information_schema.greptime_region_peers ORDER BY peer_id;
+---------------+---------+-----------+--------+
| region_id | peer_id | is_leader | status |
+---------------+---------+-----------+--------+
| REGION_ID | PEER_ID | Yes | ALIVE |
| REGION_ID | PEER_ID | Yes | ALIVE |
| REGION_ID | PEER_ID | Yes | ALIVE |
+---------------+---------+-----------+--------+
INSERT INTO my_table VALUES
(100, 'a', 1),
(200, 'b', 2),
(1100, 'c', 3),
(1200, 'd', 4),
(2000, 'e', 5),
(2100, 'f', 6),
(2200, 'g', 7),
(2400, 'h', 8);
Affected Rows: 8
SELECT * FROM my_table;
+------+---+-------------------------+
| a | b | ts |
+------+---+-------------------------+
| 100 | a | 1970-01-01T00:00:00.001 |
| 200 | b | 1970-01-01T00:00:00.002 |
| 1100 | c | 1970-01-01T00:00:00.003 |
| 1200 | d | 1970-01-01T00:00:00.004 |
| 2000 | e | 1970-01-01T00:00:00.005 |
| 2100 | f | 1970-01-01T00:00:00.006 |
| 2200 | g | 1970-01-01T00:00:00.007 |
| 2400 | h | 1970-01-01T00:00:00.008 |
+------+---+-------------------------+
DELETE FROM my_table WHERE a < 150;
Affected Rows: 1
SELECT * FROM my_table;
+------+---+-------------------------+
| a | b | ts |
+------+---+-------------------------+
| 200 | b | 1970-01-01T00:00:00.002 |
| 1100 | c | 1970-01-01T00:00:00.003 |
| 1200 | d | 1970-01-01T00:00:00.004 |
| 2000 | e | 1970-01-01T00:00:00.005 |
| 2100 | f | 1970-01-01T00:00:00.006 |
| 2200 | g | 1970-01-01T00:00:00.007 |
| 2400 | h | 1970-01-01T00:00:00.008 |
+------+---+-------------------------+
DELETE FROM my_table WHERE a < 2200 AND a > 1500;
Affected Rows: 2
SELECT * FROM my_table;
+------+---+-------------------------+
| a | b | ts |
+------+---+-------------------------+
| 200 | b | 1970-01-01T00:00:00.002 |
| 1100 | c | 1970-01-01T00:00:00.003 |
| 1200 | d | 1970-01-01T00:00:00.004 |
| 2200 | g | 1970-01-01T00:00:00.007 |
| 2400 | h | 1970-01-01T00:00:00.008 |
+------+---+-------------------------+
DELETE FROM my_table WHERE a < 2500;
Affected Rows: 5
SELECT * FROM my_table;
++
++
DROP TABLE my_table;
Affected Rows: 0
CREATE TABLE my_table (
a INT PRIMARY KEY,
b STRING,
ts TIMESTAMP TIME INDEX,
)
PARTITION ON COLUMNS (a) ();
Affected Rows: 0
-- SQLNESS REPLACE (\d{13}) ID
SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name;
+---------------+--------------+------------+----------------+---------------------------------+-----------------------+
| table_catalog | table_schema | table_name | partition_name | partition_expression | greptime_partition_id |
+---------------+--------------+------------+----------------+---------------------------------+-----------------------+
| greptime | public | my_table | p0 | (a) VALUES LESS THAN (MAXVALUE) | ID |
+---------------+--------------+------------+----------------+---------------------------------+-----------------------+
-- SQLNESS REPLACE (\d{13}) REGION_ID
-- SQLNESS REPLACE (\d{1}) PEER_ID
SELECT region_id, peer_id, is_leader, status FROM information_schema.greptime_region_peers ORDER BY peer_id;
+---------------+---------+-----------+--------+
| region_id | peer_id | is_leader | status |
+---------------+---------+-----------+--------+
| REGION_ID | PEER_ID | Yes | ALIVE |
+---------------+---------+-----------+--------+
INSERT INTO my_table VALUES
(100, 'a', 1),
(200, 'b', 2),
(1100, 'c', 3),
(1200, 'd', 4),
(2000, 'e', 5),
(2100, 'f', 6),
(2200, 'g', 7),
(2400, 'h', 8);
Affected Rows: 8
SELECT * FROM my_table;
+------+---+-------------------------+
| a | b | ts |
+------+---+-------------------------+
| 100 | a | 1970-01-01T00:00:00.001 |
| 200 | b | 1970-01-01T00:00:00.002 |
| 1100 | c | 1970-01-01T00:00:00.003 |
| 1200 | d | 1970-01-01T00:00:00.004 |
| 2000 | e | 1970-01-01T00:00:00.005 |
| 2100 | f | 1970-01-01T00:00:00.006 |
| 2200 | g | 1970-01-01T00:00:00.007 |
| 2400 | h | 1970-01-01T00:00:00.008 |
+------+---+-------------------------+
DROP TABLE my_table;
Affected Rows: 0

View File

@@ -0,0 +1,71 @@
CREATE TABLE my_table (
a INT PRIMARY KEY,
b STRING,
ts TIMESTAMP TIME INDEX,
)
PARTITION ON COLUMNS (a) (
a < 1000,
a >= 1000 AND a < 2000,
a >= 2000
);
-- SQLNESS REPLACE (\d{13}) ID
SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name;
-- SQLNESS REPLACE (\d{13}) REGION_ID
-- SQLNESS REPLACE (\d{1}) PEER_ID
SELECT region_id, peer_id, is_leader, status FROM information_schema.greptime_region_peers ORDER BY peer_id;
INSERT INTO my_table VALUES
(100, 'a', 1),
(200, 'b', 2),
(1100, 'c', 3),
(1200, 'd', 4),
(2000, 'e', 5),
(2100, 'f', 6),
(2200, 'g', 7),
(2400, 'h', 8);
SELECT * FROM my_table;
DELETE FROM my_table WHERE a < 150;
SELECT * FROM my_table;
DELETE FROM my_table WHERE a < 2200 AND a > 1500;
SELECT * FROM my_table;
DELETE FROM my_table WHERE a < 2500;
SELECT * FROM my_table;
DROP TABLE my_table;
CREATE TABLE my_table (
a INT PRIMARY KEY,
b STRING,
ts TIMESTAMP TIME INDEX,
)
PARTITION ON COLUMNS (a) ();
-- SQLNESS REPLACE (\d{13}) ID
SELECT table_catalog, table_schema, table_name, partition_name, partition_expression, greptime_partition_id from information_schema.partitions WHERE table_name = 'my_table' ORDER BY table_catalog, table_schema, table_name, partition_name;
-- SQLNESS REPLACE (\d{13}) REGION_ID
-- SQLNESS REPLACE (\d{1}) PEER_ID
SELECT region_id, peer_id, is_leader, status FROM information_schema.greptime_region_peers ORDER BY peer_id;
INSERT INTO my_table VALUES
(100, 'a', 1),
(200, 'b', 2),
(1100, 'c', 3),
(1200, 'd', 4),
(2000, 'e', 5),
(2100, 'f', 6),
(2200, 'g', 7),
(2400, 'h', 8);
SELECT * FROM my_table;
DROP TABLE my_table;

View File

@@ -35,11 +35,13 @@ SHOW CREATE TABLE system_metrics;
| | PRIMARY KEY ("id", "host") |
| | ) |
| | PARTITION ON COLUMNS ("id") ( |
| | |
| | id < 5, |
| | id >= 9, |
| | id >= 5 AND id < 9 |
| | ) |
| | ENGINE=mito |
| | WITH( |
| | regions = 1, |
| | regions = 3, |
| | ttl = '7days', |
| | write_buffer_size = '1.0KiB' |
| | ) |

View File

@@ -1,7 +1,7 @@
show databases;
+-----------------------+
| Schemas |
| Database |
+-----------------------+
| greptime_private |
| illegal-database |