Compare commits

...

12 Commits

Author SHA1 Message Date
Weny Xu
c02b5dae93 chore: bump version to 0.9.5 (#4853) 2024-10-18 08:07:13 +00:00
Weny Xu
081c6d9e74 fix: flush metric metadata region (#4852)
* fix: flush metric metadata region

* chore: apply suggestions from CR
2024-10-18 07:21:35 +00:00
Weny Xu
ca6e02980e fix: overwrite entry_id if entry id is less than start_offset (#4842)
* fix: overwrite entry_id if entry id is less than start_offset

* feat: add `overwrite_entry_start_id` to options

* chore: update config.md
2024-10-18 06:31:02 +00:00
Weny Xu
74bdba4613 fix: fix metadata forward compatibility issue (#4846) 2024-10-18 06:26:41 +00:00
Weny Xu
2e0e82ddc8 chore: update greptime-proto to b4d3011 (#4850) 2024-10-18 04:10:22 +00:00
Yingwen
e0c4157ad8 feat: Seq scanner scans data by time range (#4809)
* feat: seq scan by partition

* feat: part metrics

* chore: remove unused codes

* chore: fmt stream

* feat: build ranges returns smallvec

* feat: move scan mem/file ranges to util and reuse

* feat: log metrics

* chore: correct some metrics

* feat: get explain info from ranges

* test: group test and remove unused codes

* chore: fix clippy

* feat: change PartitionRange end to exclusive

* test: add tests
2024-10-17 11:05:12 +00:00
discord9
613e07afb4 feat: window sort physical plan (#4814)
* WIP

* feat: range split& tests

* WIP: split range

* add sort exprs

* chore: typo

* WIP

* feat: find successive runs

* WIP

* READY FOR REVIEW PART ONE: more tests

* refactor: break into smaller functions

* feat: precompute working range(need testing)

* tests: on working range

* tests: on working range

* feat: support rev working range

* feat(to be tested): core logic of merge sort

* fix: poll results

* fix: find_slice_from_range&test

* chore: remove some unused util func&fields

* chore: typos

* chore: impl exec plan for WindowedSortExec

* test(WIP): window sort stream

* test: window sort stream

* chore: remove unused

* fix: fetch

* fix: WIP intersection remaining

* test: fix and test!

* chore: remove outdated comments

* chore: rename test

* chore: remove dbg line

* chore: sorted runs

* feat: handling unexpected data

* chore: unused

* chore: remove a print in test

* chore: per review

* docs: wrong comment

* chore: more test cases
2024-10-16 11:50:25 +00:00
Weny Xu
0ce93f0b88 chore: add more metrics for region migration (#4838) 2024-10-16 09:36:57 +00:00
Ning Sun
c231eee7c1 fix: respect feature flags for geo function (#4836) 2024-10-16 07:46:31 +00:00
Yiran
176f2df5b3 fix: dead links (#4837) 2024-10-16 07:43:14 +00:00
localhost
4622412dfe feat: add API to write OpenTelemetry logs to GreptimeDB (#4755)
* chore: otlp logs api

* feat: add API to write OpenTelemetry logs to GreptimeDB

* chore: fix test data schema error

* chore: modify the underlying data structure of the pipeline value map type from hashmap to btreemap to keep key order

* chore: fix by pr comment

* chore: resolve conflicts and add some test

* chore: remove useless error

* chore: change otlp header name

* chore: fmt code

* chore: fix integration test for otlp log write api

* chore: fix by pr comment

* chore: set otlp body with fulltext default
2024-10-16 04:36:08 +00:00
jeremyhi
59ec90299b refactor: metasrv cannot be cloned (#4834)
* refactor: metasrv cannot be cloned

* chore: remove MetasrvInstance's clone
2024-10-15 11:36:48 +00:00
62 changed files with 5240 additions and 1283 deletions

Cargo.lock generated
View File

@@ -214,7 +214,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"
[[package]]
name = "api"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"common-base",
"common-decimal",
@@ -775,7 +775,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -1385,7 +1385,7 @@ dependencies = [
[[package]]
name = "cache"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"catalog",
"common-error",
@@ -1393,7 +1393,7 @@ dependencies = [
"common-meta",
"moka",
"snafu 0.8.5",
"substrait 0.9.4",
"substrait 0.9.5",
]
[[package]]
@@ -1420,7 +1420,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arrow",
@@ -1759,7 +1759,7 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "client"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arc-swap",
@@ -1789,7 +1789,7 @@ dependencies = [
"serde_json",
"snafu 0.8.5",
"substrait 0.37.3",
"substrait 0.9.4",
"substrait 0.9.5",
"tokio",
"tokio-stream",
"tonic 0.11.0",
@@ -1819,7 +1819,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"auth",
@@ -1876,7 +1876,7 @@ dependencies = [
"similar-asserts",
"snafu 0.8.5",
"store-api",
"substrait 0.9.4",
"substrait 0.9.5",
"table",
"temp-env",
"tempfile",
@@ -1922,7 +1922,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"anymap2",
"async-trait",
@@ -1940,7 +1940,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"chrono",
"common-error",
@@ -1951,7 +1951,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"common-base",
"common-error",
@@ -1974,7 +1974,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"arrow",
"arrow-schema",
@@ -2011,7 +2011,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"bigdecimal 0.4.5",
"common-error",
@@ -2024,7 +2024,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"snafu 0.8.5",
"strum 0.25.0",
@@ -2033,7 +2033,7 @@ dependencies = [
[[package]]
name = "common-frontend"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -2048,7 +2048,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arc-swap",
@@ -2088,7 +2088,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"common-runtime",
@@ -2105,7 +2105,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arrow-flight",
@@ -2131,7 +2131,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"common-base",
@@ -2149,7 +2149,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"arc-swap",
"common-query",
@@ -2163,7 +2163,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"common-error",
"common-macro",
@@ -2176,7 +2176,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"anymap2",
"api",
@@ -2233,11 +2233,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.9.4"
version = "0.9.5"
[[package]]
name = "common-pprof"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"common-error",
"common-macro",
@@ -2249,7 +2249,7 @@ dependencies = [
[[package]]
name = "common-procedure"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-stream",
"async-trait",
@@ -2276,7 +2276,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"common-procedure",
@@ -2284,7 +2284,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -2310,7 +2310,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"arc-swap",
"common-error",
@@ -2329,7 +2329,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"common-error",
@@ -2351,7 +2351,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"atty",
"backtrace",
@@ -2379,7 +2379,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"client",
"common-query",
@@ -2391,7 +2391,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"arrow",
"chrono",
@@ -2407,7 +2407,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"build-data",
"const_format",
@@ -2418,7 +2418,7 @@ dependencies = [
[[package]]
name = "common-wal"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"common-base",
"common-error",
@@ -3227,7 +3227,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arrow-flight",
@@ -3277,7 +3277,7 @@ dependencies = [
"session",
"snafu 0.8.5",
"store-api",
"substrait 0.9.4",
"substrait 0.9.5",
"table",
"tokio",
"toml 0.8.19",
@@ -3286,7 +3286,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"arrow",
"arrow-array",
@@ -3892,7 +3892,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -4003,7 +4003,7 @@ dependencies = [
[[package]]
name = "flow"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arrow",
@@ -4060,7 +4060,7 @@ dependencies = [
"snafu 0.8.5",
"store-api",
"strum 0.25.0",
"substrait 0.9.4",
"substrait 0.9.5",
"table",
"tokio",
"tonic 0.11.0",
@@ -4122,7 +4122,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arc-swap",
@@ -4518,7 +4518,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=0b4f7c8ab06399f6b90e1626e8d5b9697cb33bb9#0b4f7c8ab06399f6b90e1626e8d5b9697cb33bb9"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=b4d301184eb0d01fd4d1042fcc7c5dfb54f3c1e3#b4d301184eb0d01fd4d1042fcc7c5dfb54f3c1e3"
dependencies = [
"prost 0.12.6",
"serde",
@@ -5170,7 +5170,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -6001,7 +6001,7 @@ checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "log-store"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-stream",
"async-trait",
@@ -6321,7 +6321,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -6347,7 +6347,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -6425,7 +6425,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"aquamarine",
@@ -6528,7 +6528,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"aquamarine",
@@ -7264,7 +7264,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"anyhow",
"bytes",
@@ -7549,12 +7549,13 @@ dependencies = [
"ordered-float 4.3.0",
"percent-encoding",
"rand",
"serde_json",
"thiserror",
]
[[package]]
name = "operator"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -7599,7 +7600,7 @@ dependencies = [
"sql",
"sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)",
"store-api",
"substrait 0.9.4",
"substrait 0.9.5",
"table",
"tokio",
"tokio-util",
@@ -7849,7 +7850,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -8150,7 +8151,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pipeline"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"ahash 0.8.11",
"api",
@@ -8312,7 +8313,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"auth",
"common-base",
@@ -8586,7 +8587,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"ahash 0.8.11",
"async-trait",
@@ -8822,7 +8823,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-compression 0.4.13",
"async-trait",
@@ -8944,7 +8945,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"ahash 0.8.11",
"api",
@@ -8979,6 +8980,7 @@ dependencies = [
"datafusion-physical-expr",
"datafusion-sql",
"datatypes",
"fastrand",
"format_num",
"futures",
"futures-util",
@@ -8993,12 +8995,15 @@ dependencies = [
"object-store",
"once_cell",
"paste",
"pretty_assertions",
"prometheus",
"promql",
"promql-parser",
"prost 0.12.6",
"rand",
"regex",
"serde",
"serde_json",
"session",
"snafu 0.8.5",
"sql",
@@ -9007,7 +9012,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.9.4",
"substrait 0.9.5",
"table",
"tokio",
"tokio-stream",
@@ -10441,7 +10446,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arc-swap",
@@ -10735,8 +10740,9 @@ dependencies = [
[[package]]
name = "servers"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"ahash 0.8.11",
"aide",
"api",
"arrow",
@@ -10845,7 +10851,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arc-swap",
@@ -11166,7 +11172,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"chrono",
@@ -11227,7 +11233,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"clap 4.5.19",
@@ -11447,7 +11453,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"aquamarine",
@@ -11616,7 +11622,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"async-trait",
"bytes",
@@ -11815,7 +11821,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"async-trait",
@@ -12081,7 +12087,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"arbitrary",
"async-trait",
@@ -12123,7 +12129,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.9.4"
version = "0.9.5"
dependencies = [
"api",
"arrow-flight",
@@ -12185,7 +12191,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.9.4",
"substrait 0.9.5",
"table",
"tempfile",
"time",

View File

@@ -65,7 +65,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.9.4"
version = "0.9.5"
edition = "2021"
license = "Apache-2.0"
@@ -121,7 +121,7 @@ etcd-client = { version = "0.13" }
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "0b4f7c8ab06399f6b90e1626e8d5b9697cb33bb9" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "b4d301184eb0d01fd4d1042fcc7c5dfb54f3c1e3" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
@@ -138,6 +138,7 @@ opentelemetry-proto = { version = "0.5", features = [
"metrics",
"trace",
"with-serde",
"logs",
] }
parquet = { version = "51.0.0", default-features = false, features = ["arrow", "async", "object_store"] }
paste = "1.0"

View File

@@ -83,6 +83,7 @@
| `wal.backoff_max` | String | `10s` | The maximum backoff delay.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_base` | Integer | `2` | The exponential backoff rate, i.e. next backoff = base * current backoff.<br/>**It's only used when the provider is `kafka`**. |
| `wal.backoff_deadline` | String | `5mins` | The deadline of retries.<br/>**It's only used when the provider is `kafka`**. |
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries when reading the WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system can still successfully replay memtable data without throwing an out-of-range error.<br/>However, enabling this option might lead to unexpected data loss, as the system will skip over missing entries instead of treating them as critical errors. |
| `metadata_store` | -- | -- | Metadata storage options. |
| `metadata_store.file_size` | String | `256MB` | Kv file size in bytes. |
| `metadata_store.purge_threshold` | String | `4GB` | Kv purge threshold. |
@@ -409,6 +410,7 @@
| `wal.backoff_deadline` | String | `5mins` | The deadline of retries.<br/>**It's only used when the provider is `kafka`**. |
| `wal.create_index` | Bool | `true` | Whether to enable WAL index creation.<br/>**It's only used when the provider is `kafka`**. |
| `wal.dump_index_interval` | String | `60s` | The interval for dumping WAL indexes.<br/>**It's only used when the provider is `kafka`**. |
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries when reading the WAL.<br/>**It's only used when the provider is `kafka`**.<br/><br/>This option ensures that when Kafka messages are deleted, the system can still successfully replay memtable data without throwing an out-of-range error.<br/>However, enabling this option might lead to unexpected data loss, as the system will skip over missing entries instead of treating them as critical errors. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `/tmp/greptimedb/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data.<br/>- `File`: the data is stored in the local file system.<br/>- `S3`: the data is stored in the S3 object storage.<br/>- `Gcs`: the data is stored in the Google Cloud Storage.<br/>- `Azblob`: the data is stored in the Azure Blob Storage.<br/>- `Oss`: the data is stored in the Aliyun OSS. |
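
The clamping rule described by `wal.overwrite_entry_start_id` can be summarized with a small standalone sketch; the function and variable names below are illustrative only and do not appear in the codebase:

```rust
// Illustrative sketch of the documented behavior, not code from this PR:
// with the option enabled, a replay position older than Kafka's earliest
// retained offset is clamped forward to that offset instead of producing an
// out-of-range error; the skipped entries are lost.
fn effective_start(entry_id: u64, earliest_offset: i64, overwrite_entry_start_id: bool) -> u64 {
    if overwrite_entry_start_id && (entry_id as i64) <= earliest_offset {
        earliest_offset as u64
    } else {
        entry_id
    }
}

fn main() {
    assert_eq!(effective_start(5, 10, true), 10); // clamped forward; entries 5..10 are skipped
    assert_eq!(effective_start(5, 10, false), 5); // without the option the stale id is kept
    assert_eq!(effective_start(20, 10, true), 20); // ids past the earliest offset are untouched
}
```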

View File

@@ -213,6 +213,17 @@ create_index = true
## **It's only used when the provider is `kafka`**.
dump_index_interval = "60s"
## Ignore missing entries when reading the WAL.
## **It's only used when the provider is `kafka`**.
##
## This option ensures that when Kafka messages are deleted, the system
## can still successfully replay memtable data without throwing an
## out-of-range error.
## However, enabling this option might lead to unexpected data loss,
## as the system will skip over missing entries instead of treating
## them as critical errors.
overwrite_entry_start_id = false
# The Kafka SASL configuration.
# **It's only used when the provider is `kafka`**.
# Available SASL mechanisms:

View File

@@ -237,6 +237,17 @@ backoff_base = 2
## **It's only used when the provider is `kafka`**.
backoff_deadline = "5mins"
## Ignore missing entries when reading the WAL.
## **It's only used when the provider is `kafka`**.
##
## This option ensures that when Kafka messages are deleted, the system
## can still successfully replay memtable data without throwing an
## out-of-range error.
## However, enabling this option might lead to unexpected data loss,
## as the system will skip over missing entries instead of treating
## them as critical errors.
overwrite_entry_start_id = false
# The Kafka SASL configuration.
# **It's only used when the provider is `kafka`**.
# Available SASL mechanisms:

View File

@@ -48,4 +48,4 @@ Please refer to [SQL query](./query.sql) for GreptimeDB and Clickhouse, and [que
## Addition
- You can tune GreptimeDB's configuration to get better performance.
- You can setup GreptimeDB to use S3 as storage, see [here](https://docs.greptime.com/user-guide/operations/configuration/#storage-options).
- You can setup GreptimeDB to use S3 as storage, see [here](https://docs.greptime.com/user-guide/deployments/configuration#storage-options).

View File

@@ -31,7 +31,6 @@ pub use polyval::PolyvalAccumulatorCreator;
pub use scipy_stats_norm_cdf::ScipyStatsNormCdfAccumulatorCreator;
pub use scipy_stats_norm_pdf::ScipyStatsNormPdfAccumulatorCreator;
use super::geo::encoding::JsonPathEncodeFunctionCreator;
use crate::function_registry::FunctionRegistry;
/// A function creates `AggregateFunctionCreator`.
@@ -93,6 +92,11 @@ impl AggregateFunctions {
register_aggr_func!("scipystatsnormcdf", 2, ScipyStatsNormCdfAccumulatorCreator);
register_aggr_func!("scipystatsnormpdf", 2, ScipyStatsNormPdfAccumulatorCreator);
register_aggr_func!("json_encode_path", 3, JsonPathEncodeFunctionCreator);
#[cfg(feature = "geo")]
register_aggr_func!(
"json_encode_path",
3,
super::geo::encoding::JsonPathEncodeFunctionCreator
);
}
}

View File

@@ -289,6 +289,7 @@ pub enum LeaderState {
///
/// - The [`Region`] may be unavailable (e.g., Crashed, Network disconnected).
/// - The [`Region`] was planned to migrate to another [`Peer`].
#[serde(alias = "Downgraded")]
Downgrading,
}
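
The `#[serde(alias = "Downgraded")]` attribute added above keeps previously persisted values readable. A minimal, self-contained illustration of the pattern (the enum here is a stand-in, not the crate's `LeaderState`):

```rust
use serde::Deserialize;

// Stand-in enum mirroring the alias pattern above.
#[derive(Debug, PartialEq, Deserialize)]
enum State {
    Leader,
    #[serde(alias = "Downgraded")]
    Downgrading,
}

fn main() {
    // Both the legacy and the new spelling deserialize to the same variant.
    let old: State = serde_json::from_str("\"Downgraded\"").unwrap();
    let new: State = serde_json::from_str("\"Downgrading\"").unwrap();
    assert_eq!(old, new);
    assert_eq!(old, State::Downgrading);
}
```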
@@ -516,6 +517,73 @@ mod tests {
assert_eq!(decoded, region_route);
}
#[test]
fn test_region_route_compatibility() {
let region_route = RegionRoute {
region: Region {
id: 2.into(),
name: "r2".to_string(),
partition: None,
attrs: BTreeMap::new(),
},
leader_peer: Some(Peer::new(1, "a1")),
follower_peers: vec![Peer::new(2, "a2"), Peer::new(3, "a3")],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: None,
};
let input = r#"{"region":{"id":2,"name":"r2","partition":null,"attrs":{}},"leader_peer":{"id":1,"addr":"a1"},"follower_peers":[{"id":2,"addr":"a2"},{"id":3,"addr":"a3"}],"leader_state":"Downgraded","leader_down_since":null}"#;
let decoded: RegionRoute = serde_json::from_str(input).unwrap();
assert_eq!(decoded, region_route);
let region_route = RegionRoute {
region: Region {
id: 2.into(),
name: "r2".to_string(),
partition: None,
attrs: BTreeMap::new(),
},
leader_peer: Some(Peer::new(1, "a1")),
follower_peers: vec![Peer::new(2, "a2"), Peer::new(3, "a3")],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: None,
};
let input = r#"{"region":{"id":2,"name":"r2","partition":null,"attrs":{}},"leader_peer":{"id":1,"addr":"a1"},"follower_peers":[{"id":2,"addr":"a2"},{"id":3,"addr":"a3"}],"leader_status":"Downgraded","leader_down_since":null}"#;
let decoded: RegionRoute = serde_json::from_str(input).unwrap();
assert_eq!(decoded, region_route);
let region_route = RegionRoute {
region: Region {
id: 2.into(),
name: "r2".to_string(),
partition: None,
attrs: BTreeMap::new(),
},
leader_peer: Some(Peer::new(1, "a1")),
follower_peers: vec![Peer::new(2, "a2"), Peer::new(3, "a3")],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: None,
};
let input = r#"{"region":{"id":2,"name":"r2","partition":null,"attrs":{}},"leader_peer":{"id":1,"addr":"a1"},"follower_peers":[{"id":2,"addr":"a2"},{"id":3,"addr":"a3"}],"leader_state":"Downgrading","leader_down_since":null}"#;
let decoded: RegionRoute = serde_json::from_str(input).unwrap();
assert_eq!(decoded, region_route);
let region_route = RegionRoute {
region: Region {
id: 2.into(),
name: "r2".to_string(),
partition: None,
attrs: BTreeMap::new(),
},
leader_peer: Some(Peer::new(1, "a1")),
follower_peers: vec![Peer::new(2, "a2"), Peer::new(3, "a3")],
leader_state: Some(LeaderState::Downgrading),
leader_down_since: None,
};
let input = r#"{"region":{"id":2,"name":"r2","partition":null,"attrs":{}},"leader_peer":{"id":1,"addr":"a1"},"follower_peers":[{"id":2,"addr":"a2"},{"id":3,"addr":"a3"}],"leader_status":"Downgrading","leader_down_since":null}"#;
let decoded: RegionRoute = serde_json::from_str(input).unwrap();
assert_eq!(decoded, region_route);
}
#[test]
fn test_de_serialize_partition() {
let p = Partition {

View File

@@ -46,6 +46,8 @@ pub struct DatanodeKafkaConfig {
pub create_index: bool,
#[serde(with = "humantime_serde")]
pub dump_index_interval: Duration,
/// Ignore missing entries when reading the WAL.
pub overwrite_entry_start_id: bool,
}
impl Default for DatanodeKafkaConfig {
@@ -60,6 +62,7 @@ impl Default for DatanodeKafkaConfig {
auto_create_topics: true,
create_index: true,
dump_index_interval: Duration::from_secs(60),
overwrite_entry_start_id: false,
}
}
}
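
Beyond the TOML files, the new field can also be set programmatically through the config struct shown above; the import path below is assumed from the diff context and may differ from the actual crate layout:

```rust
// Hedged sketch: the path to DatanodeKafkaConfig is assumed, not verified.
use common_wal::config::kafka::DatanodeKafkaConfig;

fn main() {
    // Relies on the Default impl from the diff above; only the new field is overridden.
    let config = DatanodeKafkaConfig {
        overwrite_entry_start_id: true,
        ..Default::default()
    };
    assert!(config.overwrite_entry_start_id);
}
```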

View File

@@ -38,6 +38,7 @@ use snafu::{ensure, ResultExt};
use crate::error::{self, ConvertArrowArrayToScalarsSnafu, Error, Result, TryFromValueSnafu};
use crate::prelude::*;
use crate::schema::ColumnSchema;
use crate::type_id::LogicalTypeId;
use crate::types::{IntervalType, ListType};
use crate::vectors::ListVector;
@@ -1286,39 +1287,52 @@ impl<'a> From<Option<ListValueRef<'a>>> for ValueRef<'a> {
}
}
impl<'a> TryFrom<ValueRef<'a>> for serde_json::Value {
type Error = serde_json::Error;
/// transform a [ValueRef] to a [serde_json::Value].
/// The json type will be handled specially
pub fn transform_value_ref_to_json_value<'a>(
value: ValueRef<'a>,
schema: &'a ColumnSchema,
) -> serde_json::Result<serde_json::Value> {
let json_value = match value {
ValueRef::Null => serde_json::Value::Null,
ValueRef::Boolean(v) => serde_json::Value::Bool(v),
ValueRef::UInt8(v) => serde_json::Value::from(v),
ValueRef::UInt16(v) => serde_json::Value::from(v),
ValueRef::UInt32(v) => serde_json::Value::from(v),
ValueRef::UInt64(v) => serde_json::Value::from(v),
ValueRef::Int8(v) => serde_json::Value::from(v),
ValueRef::Int16(v) => serde_json::Value::from(v),
ValueRef::Int32(v) => serde_json::Value::from(v),
ValueRef::Int64(v) => serde_json::Value::from(v),
ValueRef::Float32(v) => serde_json::Value::from(v.0),
ValueRef::Float64(v) => serde_json::Value::from(v.0),
ValueRef::String(bytes) => serde_json::Value::String(bytes.to_string()),
ValueRef::Binary(bytes) => {
if let ConcreteDataType::Json(_) = schema.data_type {
match jsonb::from_slice(bytes) {
Ok(json) => json.into(),
Err(e) => {
error!(e; "Failed to parse jsonb");
serde_json::Value::Null
}
}
} else {
serde_json::to_value(bytes)?
}
}
ValueRef::Date(v) => serde_json::Value::Number(v.val().into()),
ValueRef::DateTime(v) => serde_json::Value::Number(v.val().into()),
ValueRef::List(v) => serde_json::to_value(v)?,
ValueRef::Timestamp(v) => serde_json::to_value(v.value())?,
ValueRef::Time(v) => serde_json::to_value(v.value())?,
ValueRef::IntervalYearMonth(v) => serde_json::Value::from(v),
ValueRef::IntervalDayTime(v) => serde_json::Value::from(v),
ValueRef::IntervalMonthDayNano(v) => serde_json::Value::from(v),
ValueRef::Duration(v) => serde_json::to_value(v.value())?,
ValueRef::Decimal128(v) => serde_json::to_value(v.to_string())?,
};
fn try_from(value: ValueRef<'a>) -> serde_json::Result<serde_json::Value> {
let json_value = match value {
ValueRef::Null => serde_json::Value::Null,
ValueRef::Boolean(v) => serde_json::Value::Bool(v),
ValueRef::UInt8(v) => serde_json::Value::from(v),
ValueRef::UInt16(v) => serde_json::Value::from(v),
ValueRef::UInt32(v) => serde_json::Value::from(v),
ValueRef::UInt64(v) => serde_json::Value::from(v),
ValueRef::Int8(v) => serde_json::Value::from(v),
ValueRef::Int16(v) => serde_json::Value::from(v),
ValueRef::Int32(v) => serde_json::Value::from(v),
ValueRef::Int64(v) => serde_json::Value::from(v),
ValueRef::Float32(v) => serde_json::Value::from(v.0),
ValueRef::Float64(v) => serde_json::Value::from(v.0),
ValueRef::String(bytes) => serde_json::Value::String(bytes.to_string()),
ValueRef::Binary(bytes) => serde_json::to_value(bytes)?,
ValueRef::Date(v) => serde_json::Value::Number(v.val().into()),
ValueRef::DateTime(v) => serde_json::Value::Number(v.val().into()),
ValueRef::List(v) => serde_json::to_value(v)?,
ValueRef::Timestamp(v) => serde_json::to_value(v.value())?,
ValueRef::Time(v) => serde_json::to_value(v.value())?,
ValueRef::IntervalYearMonth(v) => serde_json::Value::from(v),
ValueRef::IntervalDayTime(v) => serde_json::Value::from(v),
ValueRef::IntervalMonthDayNano(v) => serde_json::Value::from(v),
ValueRef::Duration(v) => serde_json::to_value(v.value())?,
ValueRef::Decimal128(v) => serde_json::to_value(v.to_string())?,
};
Ok(json_value)
}
Ok(json_value)
}
/// Reference to a [ListValue].
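
With the refactor above, callers convert through the standard `TryFrom`/`TryInto` machinery instead of the old free function. A hedged usage sketch (the `datatypes::value::ValueRef` path is assumed from the diff context, so this only compiles inside the workspace):

```rust
use datatypes::value::ValueRef; // assumed path

fn main() -> serde_json::Result<()> {
    // The new impl returns serde_json::Error on failure, matching `type Error` above.
    let json = serde_json::Value::try_from(ValueRef::Int64(42))?;
    assert_eq!(json, serde_json::Value::from(42));
    Ok(())
}
```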

View File

@@ -17,8 +17,10 @@ use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::Output;
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use pipeline::PipelineWay;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
use servers::otlp;
@@ -28,7 +30,7 @@ use session::context::QueryContextRef;
use snafu::ResultExt;
use crate::instance::Instance;
use crate::metrics::{OTLP_METRICS_ROWS, OTLP_TRACES_ROWS};
use crate::metrics::{OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_ROWS};
#[async_trait]
impl OpenTelemetryProtocolHandler for Instance {
@@ -92,4 +94,31 @@ impl OpenTelemetryProtocolHandler for Instance {
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)
}
#[tracing::instrument(skip_all)]
async fn logs(
&self,
request: ExportLogsServiceRequest,
pipeline: PipelineWay,
table_name: String,
ctx: QueryContextRef,
) -> ServerResult<Output> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (requests, rows) = otlp::logs::to_grpc_insert_requests(request, pipeline, table_name)?;
self.handle_log_inserts(requests, ctx)
.await
.inspect(|_| OTLP_LOGS_ROWS.inc_by(rows as u64))
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)
}
}

View File

@@ -41,16 +41,28 @@ lazy_static! {
.with_label_values(&["insert"]);
pub static ref EXECUTE_SCRIPT_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["execute"]);
/// The number of OpenTelemetry metrics sent by the frontend node.
pub static ref OTLP_METRICS_ROWS: IntCounter = register_int_counter!(
"greptime_frontend_otlp_metrics_rows",
"frontend otlp metrics rows"
)
.unwrap();
/// The number of OpenTelemetry traces sent by the frontend node.
pub static ref OTLP_TRACES_ROWS: IntCounter = register_int_counter!(
"greptime_frontend_otlp_traces_rows",
"frontend otlp traces rows"
)
.unwrap();
/// The number of OpenTelemetry logs sent by the frontend node.
pub static ref OTLP_LOGS_ROWS: IntCounter = register_int_counter!(
"greptime_frontend_otlp_logs_rows",
"frontend otlp logs rows"
)
.unwrap();
/// The number of heartbeats sent by the frontend node.
pub static ref HEARTBEAT_SENT_COUNT: IntCounter = register_int_counter!(
"greptime_frontend_heartbeat_send_count",

View File

@@ -49,6 +49,8 @@ pub struct KafkaLogStore {
max_batch_bytes: usize,
/// The consumer wait timeout.
consumer_wait_timeout: Duration,
/// Ignore missing entries when reading the WAL.
overwrite_entry_start_id: bool,
}
impl KafkaLogStore {
@@ -64,6 +66,7 @@ impl KafkaLogStore {
client_manager,
max_batch_bytes: config.max_batch_bytes.as_bytes() as usize,
consumer_wait_timeout: config.consumer_wait_timeout,
overwrite_entry_start_id: config.overwrite_entry_start_id,
})
}
}
@@ -205,7 +208,7 @@ impl LogStore for KafkaLogStore {
async fn read(
&self,
provider: &Provider,
entry_id: EntryId,
mut entry_id: EntryId,
index: Option<WalIndex>,
) -> Result<SendableEntryStream<'static, Entry, Self::Error>> {
let provider = provider
@@ -225,6 +228,25 @@ impl LogStore for KafkaLogStore {
.client()
.clone();
if self.overwrite_entry_start_id {
let start_offset =
client
.get_offset(OffsetAt::Earliest)
.await
.context(GetOffsetSnafu {
topic: &provider.topic,
})?;
if entry_id as i64 <= start_offset {
warn!(
"The entry_id: {} is less than start_offset: {}, topic: {}. Overwriting entry_id with start_offset",
entry_id, start_offset, &provider.topic
);
entry_id = start_offset as u64;
}
}
// Gets the offset of the latest record in the topic. Actually, it's the latest record of the single partition in the topic.
// The read operation terminates when this record is consumed.
// Warning: the `get_offset` returns the end offset of the latest record. For our usage, it should be decremented.

View File

@@ -56,9 +56,8 @@ use crate::selector::SelectorType;
use crate::service::admin;
use crate::{error, Result};
#[derive(Clone)]
pub struct MetasrvInstance {
metasrv: Metasrv,
metasrv: Arc<Metasrv>,
httpsrv: Arc<HttpServer>,
@@ -83,8 +82,9 @@ impl MetasrvInstance {
.with_greptime_config_options(opts.to_toml().context(TomlFormatSnafu)?)
.build(),
);
let metasrv = Arc::new(metasrv);
// put metasrv into plugins for later use
plugins.insert::<Arc<Metasrv>>(Arc::new(metasrv.clone()));
plugins.insert::<Arc<Metasrv>>(metasrv.clone());
let export_metrics_task = ExportMetricsTask::try_new(&opts.export_metrics, Some(&plugins))
.context(InitExportMetricsTaskSnafu)?;
Ok(MetasrvInstance {
@@ -178,13 +178,13 @@ pub async fn bootstrap_metasrv_with_router(
Ok(())
}
pub fn router(metasrv: Metasrv) -> Router {
pub fn router(metasrv: Arc<Metasrv>) -> Router {
tonic::transport::Server::builder()
.accept_http1(true) // for admin services
.add_service(HeartbeatServer::new(metasrv.clone()))
.add_service(StoreServer::new(metasrv.clone()))
.add_service(ClusterServer::new(metasrv.clone()))
.add_service(ProcedureServiceServer::new(metasrv.clone()))
.add_service(HeartbeatServer::from_arc(metasrv.clone()))
.add_service(StoreServer::from_arc(metasrv.clone()))
.add_service(ClusterServer::from_arc(metasrv.clone()))
.add_service(ProcedureServiceServer::from_arc(metasrv.clone()))
.add_service(admin::make_admin_service(metasrv))
}

View File

@@ -443,7 +443,6 @@ impl Mailbox for HeartbeatMailbox {
}
/// The builder to build the group of heartbeat handlers.
#[derive(Clone)]
pub struct HeartbeatHandlerGroupBuilder {
/// The handler to handle region failure.
region_failure_handler: Option<RegionFailureHandler>,

View File

@@ -21,7 +21,6 @@ use crate::handler::{HandleControl, HeartbeatAccumulator, HeartbeatHandler};
use crate::metasrv::Context;
use crate::region::supervisor::{DatanodeHeartbeat, HeartbeatAcceptor, RegionSupervisor};
#[derive(Clone)]
pub struct RegionFailureHandler {
heartbeat_acceptor: HeartbeatAcceptor,
}

View File

@@ -26,7 +26,6 @@ use crate::metasrv::Context;
use crate::region::lease_keeper::{RegionLeaseKeeperRef, RenewRegionLeasesResponse};
use crate::region::RegionLeaseKeeper;
#[derive(Clone)]
pub struct RegionLeaseHandler {
region_lease_seconds: u64,
region_lease_keeper: RegionLeaseKeeperRef,

View File

@@ -16,7 +16,7 @@ pub mod builder;
use std::fmt::Display;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::sync::{Arc, Mutex, RwLock};
use std::time::Duration;
use clap::ValueEnum;
@@ -337,7 +337,6 @@ impl MetaStateHandler {
}
}
#[derive(Clone)]
pub struct Metasrv {
state: StateRef,
started: Arc<AtomicBool>,
@@ -353,8 +352,8 @@ pub struct Metasrv {
selector: SelectorRef,
// The flow selector is used to select a target flownode.
flow_selector: SelectorRef,
handler_group: Option<HeartbeatHandlerGroupRef>,
handler_group_builder: Option<HeartbeatHandlerGroupBuilder>,
handler_group: RwLock<Option<HeartbeatHandlerGroupRef>>,
handler_group_builder: Mutex<Option<HeartbeatHandlerGroupBuilder>>,
election: Option<ElectionRef>,
procedure_manager: ProcedureManagerRef,
mailbox: MailboxRef,
@@ -371,15 +370,7 @@ pub struct Metasrv {
}
impl Metasrv {
pub async fn try_start(&mut self) -> Result<()> {
let builder = self
.handler_group_builder
.take()
.context(error::UnexpectedSnafu {
violated: "expected heartbeat handler group builder",
})?;
self.handler_group = Some(Arc::new(builder.build()?));
pub async fn try_start(&self) -> Result<()> {
if self
.started
.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
@@ -389,6 +380,16 @@ impl Metasrv {
return Ok(());
}
let handler_group_builder =
self.handler_group_builder
.lock()
.unwrap()
.take()
.context(error::UnexpectedSnafu {
violated: "expected heartbeat handler group builder",
})?;
*self.handler_group.write().unwrap() = Some(Arc::new(handler_group_builder.build()?));
// Creates default schema if not exists
self.table_metadata_manager
.init()
@@ -567,12 +568,8 @@ impl Metasrv {
&self.flow_selector
}
pub fn handler_group(&self) -> &Option<HeartbeatHandlerGroupRef> {
&self.handler_group
}
pub fn handler_group_builder(&mut self) -> &mut Option<HeartbeatHandlerGroupBuilder> {
&mut self.handler_group_builder
pub fn handler_group(&self) -> Option<HeartbeatHandlerGroupRef> {
self.handler_group.read().unwrap().clone()
}
pub fn election(&self) -> Option<&ElectionRef> {

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, RwLock};
use std::sync::{Arc, Mutex, RwLock};
use std::time::Duration;
use client::client_manager::NodeClients;
@@ -371,8 +371,8 @@ impl MetasrvBuilder {
selector,
// TODO(jeremy): We do not allow configuring the flow selector.
flow_selector: Arc::new(RoundRobinSelector::new(SelectTarget::Flownode)),
handler_group: None,
handler_group_builder: Some(handler_group_builder),
handler_group: RwLock::new(None),
handler_group_builder: Mutex::new(Some(handler_group_builder)),
election,
procedure_manager,
mailbox,

View File

@@ -45,7 +45,20 @@ lazy_static! {
/// Meta kv cache miss counter.
pub static ref METRIC_META_KV_CACHE_MISS: IntCounterVec =
register_int_counter_vec!("greptime_meta_kv_cache_miss", "meta kv cache miss", &["op"]).unwrap();
// Heartbeat received by metasrv.
/// Heartbeat received by metasrv.
pub static ref METRIC_META_HEARTBEAT_RECV: IntCounterVec =
register_int_counter_vec!("greptime_meta_heartbeat_recv", "heartbeats received by metasrv", &["pusher_key"]).unwrap();
/// The migration execute histogram.
pub static ref METRIC_META_REGION_MIGRATION_EXECUTE: HistogramVec =
register_histogram_vec!("greptime_meta_region_migration_execute", "meta region migration execute", &["state"]).unwrap();
/// The migration error counter.
pub static ref METRIC_META_REGION_MIGRATION_ERROR: IntCounterVec =
register_int_counter_vec!("greptime_meta_region_migration_error", "meta region migration abort", &["state", "error_type"]).unwrap();
/// The migration datanode counter.
pub static ref METRIC_META_REGION_MIGRATION_DATANODES: IntCounterVec =
register_int_counter_vec!("greptime_meta_region_migration_stat", "meta region migration stat", &["datanode_type", "datanode_id"]).unwrap();
/// The migration fail counter.
pub static ref METRIC_META_REGION_MIGRATION_FAIL: IntCounter =
register_int_counter!("greptime_meta_region_migration_fail", "meta region migration fail").unwrap();
}

View File

@@ -33,7 +33,7 @@ use crate::metasrv::{Metasrv, MetasrvOptions, SelectorRef};
pub struct MockInfo {
pub server_addr: String,
pub channel_manager: ChannelManager,
pub metasrv: Metasrv,
pub metasrv: Arc<Metasrv>,
}
pub async fn mock_with_memstore() -> MockInfo {
@@ -74,16 +74,17 @@ pub async fn mock(
None => builder,
};
let mut metasrv = builder.build().await.unwrap();
let metasrv = builder.build().await.unwrap();
metasrv.try_start().await.unwrap();
let (client, server) = tokio::io::duplex(1024);
let metasrv = Arc::new(metasrv);
let service = metasrv.clone();
let _handle = tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(HeartbeatServer::new(service.clone()))
.add_service(StoreServer::new(service.clone()))
.add_service(ProcedureServiceServer::new(service.clone()))
.add_service(HeartbeatServer::from_arc(service.clone()))
.add_service(StoreServer::from_arc(service.clone()))
.add_service(ProcedureServiceServer::from_arc(service.clone()))
.serve_with_incoming(futures::stream::iter(vec![Ok::<_, std::io::Error>(server)]))
.await
});

View File

@@ -54,6 +54,7 @@ use tokio::time::Instant;
use self::migration_start::RegionMigrationStart;
use crate::error::{self, Result};
use crate::metrics::{METRIC_META_REGION_MIGRATION_ERROR, METRIC_META_REGION_MIGRATION_EXECUTE};
use crate::service::mailbox::MailboxRef;
/// It's shared in each step and available even after recovering.
@@ -390,6 +391,12 @@ impl Context {
#[async_trait::async_trait]
#[typetag::serde(tag = "region_migration_state")]
pub(crate) trait State: Sync + Send + Debug {
fn name(&self) -> &'static str {
let type_name = std::any::type_name::<Self>();
// short name
type_name.split("::").last().unwrap_or(type_name)
}
/// Yields the next [State] and [Status].
async fn next(&mut self, ctx: &mut Context) -> Result<(Box<dyn State>, Status)>;
@@ -478,10 +485,20 @@ impl Procedure for RegionMigrationProcedure {
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
let state = &mut self.state;
let name = state.name();
let _timer = METRIC_META_REGION_MIGRATION_EXECUTE
.with_label_values(&[name])
.start_timer();
let (next, status) = state.next(&mut self.context).await.map_err(|e| {
if e.is_retryable() {
METRIC_META_REGION_MIGRATION_ERROR
.with_label_values(&[name, "retryable"])
.inc();
ProcedureError::retry_later(e)
} else {
METRIC_META_REGION_MIGRATION_ERROR
.with_label_values(&[name, "external"])
.inc();
ProcedureError::external(e)
}
})?;

View File

@@ -30,6 +30,7 @@ use store_api::storage::RegionId;
use table::table_name::TableName;
use crate::error::{self, Result};
use crate::metrics::{METRIC_META_REGION_MIGRATION_DATANODES, METRIC_META_REGION_MIGRATION_FAIL};
use crate::procedure::region_migration::{
DefaultContextFactory, PersistentContext, RegionMigrationProcedure,
};
@@ -323,6 +324,12 @@ impl RegionMigrationManager {
schema_name,
..
} = table_info.table_name();
METRIC_META_REGION_MIGRATION_DATANODES
.with_label_values(&["src", &task.from_peer.id.to_string()])
.inc();
METRIC_META_REGION_MIGRATION_DATANODES
.with_label_values(&["desc", &task.to_peer.id.to_string()])
.inc();
let RegionMigrationProcedureTask {
cluster_id,
region_id,
@@ -358,6 +365,7 @@ impl RegionMigrationManager {
if let Err(e) = watcher::wait(watcher).await {
error!(e; "Failed to wait region migration procedure {procedure_id} for {task}");
METRIC_META_REGION_MIGRATION_FAIL.inc();
return;
}

View File

@@ -30,7 +30,7 @@ use tonic::server::NamedService;
use crate::metasrv::Metasrv;
pub fn make_admin_service(metasrv: Metasrv) -> Admin {
pub fn make_admin_service(metasrv: Arc<Metasrv>) -> Admin {
let router = Router::new().route("/health", health::HealthHandler);
let router = router.route(

View File

@@ -46,12 +46,9 @@ impl heartbeat_server::Heartbeat for Metasrv {
) -> GrpcResult<Self::HeartbeatStream> {
let mut in_stream = req.into_inner();
let (tx, rx) = mpsc::channel(128);
let handler_group = self
.handler_group()
.clone()
.context(error::UnexpectedSnafu {
violated: "expected heartbeat handlers",
})?;
let handler_group = self.handler_group().context(error::UnexpectedSnafu {
violated: "expected heartbeat handlers",
})?;
let ctx = self.new_ctx();
let _handle = common_runtime::spawn_global(async move {

View File

@@ -17,6 +17,7 @@ mod catchup;
mod close;
mod create;
mod drop;
mod flush;
mod open;
mod options;
mod put;
@@ -145,7 +146,7 @@ impl RegionEngine for MetricEngine {
.alter_region(region_id, alter, &mut extension_return_value)
.await
}
RegionRequest::Flush(_) | RegionRequest::Compact(_) => {
RegionRequest::Compact(_) => {
if self.inner.is_physical_region(region_id) {
self.inner
.mito
@@ -157,10 +158,11 @@ impl RegionEngine for MetricEngine {
UnsupportedRegionRequestSnafu { request }.fail()
}
}
RegionRequest::Flush(req) => self.inner.flush_region(region_id, req).await,
RegionRequest::Delete(_) | RegionRequest::Truncate(_) => {
UnsupportedRegionRequestSnafu { request }.fail()
}
RegionRequest::Catchup(ref req) => self.inner.catchup_region(region_id, *req).await,
RegionRequest::Catchup(req) => self.inner.catchup_region(region_id, req).await,
};
result.map_err(BoxedError::new).map(|rows| RegionResponse {

View File

@@ -47,9 +47,10 @@ impl MetricEngineInner {
.await
.context(MitoCatchupOperationSnafu)?;
let data_region_id = utils::to_data_region_id(region_id);
self.mito
.handle_request(
region_id,
data_region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable,
entry_id: req.entry_id,

View File

@@ -0,0 +1,52 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use snafu::ResultExt;
use store_api::region_engine::RegionEngine;
use store_api::region_request::{AffectedRows, RegionFlushRequest, RegionRequest};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{MitoFlushOperationSnafu, Result, UnsupportedRegionRequestSnafu};
use crate::utils;
impl MetricEngineInner {
pub async fn flush_region(
&self,
region_id: RegionId,
req: RegionFlushRequest,
) -> Result<AffectedRows> {
if !self.is_physical_region(region_id) {
return UnsupportedRegionRequestSnafu {
request: RegionRequest::Flush(req),
}
.fail();
}
let metadata_region_id = utils::to_metadata_region_id(region_id);
// Flushes the metadata region as well
self.mito
.handle_request(metadata_region_id, RegionRequest::Flush(req.clone()))
.await
.context(MitoFlushOperationSnafu)
.map(|response| response.affected_rows)?;
let data_region_id = utils::to_data_region_id(region_id);
self.mito
.handle_request(data_region_id, RegionRequest::Flush(req.clone()))
.await
.context(MitoFlushOperationSnafu)
.map(|response| response.affected_rows)
}
}

View File

@@ -22,6 +22,7 @@ pub mod projection;
pub(crate) mod prune;
pub(crate) mod range;
pub(crate) mod scan_region;
pub(crate) mod scan_util;
pub(crate) mod seq_scan;
pub(crate) mod unordered_scan;
@@ -57,7 +58,6 @@ use crate::error::{
use crate::memtable::BoxedBatchIterator;
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_STAGE_ELAPSED};
use crate::read::prune::PruneReader;
use crate::sst::parquet::reader::{ReaderFilterMetrics, ReaderMetrics};
/// Storage internal representation of a batch of rows for a primary key (time series).
///
@@ -738,7 +738,7 @@ impl<T: BatchReader + ?Sized> BatchReader for Box<T> {
pub(crate) struct ScannerMetrics {
/// Duration to prepare the scan task.
prepare_scan_cost: Duration,
/// Duration to build parts.
/// Duration to build file ranges.
build_parts_cost: Duration,
/// Duration to build the (merge) reader.
build_reader_cost: Duration,
@@ -758,31 +758,17 @@ pub(crate) struct ScannerMetrics {
num_mem_ranges: usize,
/// Number of file ranges scanned.
num_file_ranges: usize,
/// Filter related metrics for readers.
filter_metrics: ReaderFilterMetrics,
}
impl ScannerMetrics {
/// Sets and observes metrics on initializing parts.
fn observe_init_part(&mut self, build_parts_cost: Duration, reader_metrics: &ReaderMetrics) {
self.build_parts_cost = build_parts_cost;
// Observes metrics.
/// Observes metrics.
fn observe_metrics(&self) {
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(self.prepare_scan_cost.as_secs_f64());
READ_STAGE_ELAPSED
.with_label_values(&["build_parts"])
.observe(self.build_parts_cost.as_secs_f64());
// We only call this once so we overwrite it directly.
self.filter_metrics = reader_metrics.filter_metrics;
// Observes filter metrics.
self.filter_metrics.observe();
}
/// Observes metrics on scanner finish.
fn observe_metrics_on_finish(&self) {
READ_STAGE_ELAPSED
.with_label_values(&["build_reader"])
.observe(self.build_reader_cost.as_secs_f64());
@@ -801,6 +787,21 @@ impl ScannerMetrics {
READ_ROWS_RETURN.observe(self.num_rows as f64);
READ_BATCHES_RETURN.observe(self.num_batches as f64);
}
/// Merges metrics from another [ScannerMetrics].
fn merge_from(&mut self, other: &ScannerMetrics) {
self.prepare_scan_cost += other.prepare_scan_cost;
self.build_parts_cost += other.build_parts_cost;
self.build_reader_cost += other.build_reader_cost;
self.scan_cost += other.scan_cost;
self.convert_cost += other.convert_cost;
self.yield_cost += other.yield_cost;
self.total_cost += other.total_cost;
self.num_batches += other.num_batches;
self.num_rows += other.num_rows;
self.num_mem_ranges += other.num_mem_ranges;
self.num_file_ranges += other.num_file_ranges;
}
}
#[cfg(test)]

View File

@@ -14,7 +14,9 @@
//! Structs for partition ranges.
use common_time::Timestamp;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;
use crate::memtable::MemtableRef;
use crate::read::scan_region::ScanInput;
@@ -48,6 +50,26 @@ pub(crate) struct RangeMeta {
}
impl RangeMeta {
/// Creates a [PartitionRange] with specific identifier.
/// It converts the inclusive max timestamp to exclusive end timestamp.
pub(crate) fn new_partition_range(&self, identifier: usize) -> PartitionRange {
PartitionRange {
start: self.time_range.0,
end: Timestamp::new(
// A max timestamp of i64::MAX would overflow on +1, so rows at that value may be
// invisible; we don't guarantee support for i64::MAX for now.
self.time_range
.1
.value()
.checked_add(1)
.unwrap_or(self.time_range.1.value()),
self.time_range.1.unit(),
),
num_rows: self.num_rows,
identifier,
}
}
/// Creates a list of ranges from the `input` for seq scan.
pub(crate) fn seq_scan_ranges(input: &ScanInput) -> Vec<RangeMeta> {
let mut ranges = Vec::with_capacity(input.memtables.len() + input.files.len());
@@ -177,7 +199,7 @@ impl RangeMeta {
}
fn push_seq_mem_ranges(memtables: &[MemtableRef], ranges: &mut Vec<RangeMeta>) {
// For non append-only mode, each range only contains one memtable.
// For non append-only mode, each range only contains one memtable by default.
for (i, memtable) in memtables.iter().enumerate() {
let stats = memtable.stats();
let Some(time_range) = stats.time_range() else {
@@ -195,6 +217,7 @@ impl RangeMeta {
}
}
// TODO(yingwen): Support multiple row groups in a range so we can split them later.
fn push_seq_file_ranges(
num_memtables: usize,
files: &[FileHandle],
@@ -264,3 +287,83 @@ fn maybe_split_ranges_for_seq_scan(ranges: Vec<RangeMeta>) -> Vec<RangeMeta> {
new_ranges
}
#[cfg(test)]
mod tests {
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use super::*;
type Output = (Vec<usize>, i64, i64);
fn run_group_ranges_test(input: &[(usize, i64, i64)], expect: &[Output]) {
let ranges = input
.iter()
.map(|(idx, start, end)| {
let time_range = (
Timestamp::new(*start, TimeUnit::Second),
Timestamp::new(*end, TimeUnit::Second),
);
RangeMeta {
time_range,
indices: smallvec![*idx],
row_group_indices: smallvec![RowGroupIndex {
index: *idx,
row_group_index: 0
}],
num_rows: 1,
}
})
.collect();
let output = group_ranges_for_seq_scan(ranges);
let actual: Vec<_> = output
.iter()
.map(|range| {
let indices = range.indices.to_vec();
let group_indices: Vec<_> = range
.row_group_indices
.iter()
.map(|idx| idx.index)
.collect();
assert_eq!(indices, group_indices);
let range = range.time_range;
(indices, range.0.value(), range.1.value())
})
.collect();
assert_eq!(expect, actual);
}
#[test]
fn test_group_ranges() {
// Group 1 part.
run_group_ranges_test(&[(1, 0, 2000)], &[(vec![1], 0, 2000)]);
// 1, 2, 3, 4 => [3, 1, 4], [2]
run_group_ranges_test(
&[
(1, 1000, 2000),
(2, 6000, 7000),
(3, 0, 1500),
(4, 1500, 3000),
],
&[(vec![3, 1, 4], 0, 3000), (vec![2], 6000, 7000)],
);
// 1, 2, 3 => [3], [1], [2],
run_group_ranges_test(
&[(1, 3000, 4000), (2, 4001, 6000), (3, 0, 1000)],
&[
(vec![3], 0, 1000),
(vec![1], 3000, 4000),
(vec![2], 4001, 6000),
],
);
// 1, 2, 3 => [3], [1, 2]
run_group_ranges_test(
&[(1, 3000, 4000), (2, 4000, 6000), (3, 0, 1000)],
&[(vec![3], 0, 1000), (vec![1, 2], 3000, 6000)],
);
}
}
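
`new_partition_range` above turns the inclusive max timestamp of a range into an exclusive end. A minimal standalone illustration of that conversion:

```rust
// Same arithmetic as the diff above: add one time unit, saturating on overflow.
fn exclusive_end(inclusive_max: i64) -> i64 {
    inclusive_max.checked_add(1).unwrap_or(inclusive_max)
}

fn main() {
    assert_eq!(exclusive_end(2000), 2001);
    assert_eq!(exclusive_end(i64::MAX), i64::MAX); // i64::MAX stays as-is, so it is not fully supported
}
```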

View File

@@ -17,14 +17,12 @@
use std::collections::{BTreeMap, HashSet};
use std::fmt;
use std::sync::{Arc, Mutex as StdMutex};
use std::time::{Duration, Instant};
use std::time::Instant;
use common_error::ext::BoxedError;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, error, tracing, warn};
use common_time::range::TimestampRange;
use common_time::Timestamp;
use datafusion::physical_plan::DisplayFormatType;
use datafusion_expr::utils::expr_to_columns;
use parquet::arrow::arrow_reader::RowSelection;
use smallvec::SmallVec;
@@ -48,7 +46,7 @@ use crate::read::unordered_scan::UnorderedScan;
use crate::read::{Batch, Source};
use crate::region::options::MergeMode;
use crate::region::version::VersionRef;
use crate::sst::file::{overlaps, FileHandle, FileMeta};
use crate::sst::file::FileHandle;
use crate::sst::index::fulltext_index::applier::builder::FulltextIndexApplierBuilder;
use crate::sst::index::fulltext_index::applier::FulltextIndexApplierRef;
use crate::sst::index::inverted_index::applier::builder::InvertedIndexApplierBuilder;
@@ -700,73 +698,6 @@ impl ScanInput {
})
}
/// Prunes file ranges to scan and adds them to the `collector`.
pub(crate) async fn prune_file_ranges(
&self,
collector: &mut impl FileRangeCollector,
) -> Result<ReaderMetrics> {
let mut file_prune_cost = Duration::ZERO;
let mut reader_metrics = ReaderMetrics::default();
for file in &self.files {
let prune_start = Instant::now();
let res = self
.access_layer
.read_sst(file.clone())
.predicate(self.predicate.clone())
.time_range(self.time_range)
.projection(Some(self.mapper.column_ids().to_vec()))
.cache(self.cache_manager.clone())
.inverted_index_applier(self.inverted_index_applier.clone())
.fulltext_index_applier(self.fulltext_index_applier.clone())
.expected_metadata(Some(self.mapper.metadata().clone()))
.build_reader_input(&mut reader_metrics)
.await;
file_prune_cost += prune_start.elapsed();
let (mut file_range_ctx, row_groups) = match res {
Ok(x) => x,
Err(e) => {
if e.is_object_not_found() && self.ignore_file_not_found {
error!(e; "File to scan does not exist, region_id: {}, file: {}", file.region_id(), file.file_id());
continue;
} else {
return Err(e);
}
}
};
if !compat::has_same_columns(
self.mapper.metadata(),
file_range_ctx.read_format().metadata(),
) {
// They have different schema. We need to adapt the batch first so the
// mapper can convert it.
let compat = CompatBatch::new(
&self.mapper,
file_range_ctx.read_format().metadata().clone(),
)?;
file_range_ctx.set_compat_batch(Some(compat));
}
// Build ranges from row groups.
let file_range_ctx = Arc::new(file_range_ctx);
let file_ranges = row_groups
.into_iter()
.map(|(row_group_idx, row_selection)| {
FileRange::new(file_range_ctx.clone(), row_group_idx, row_selection)
});
collector.append_file_ranges(file.meta_ref(), file_ranges);
}
READ_SST_COUNT.observe(self.files.len() as f64);
common_telemetry::debug!(
"Region {} prune {} files, cost is {:?}",
self.mapper.metadata().region_id,
self.files.len(),
file_prune_cost
);
Ok(reader_metrics)
}
/// Scans the input source in another task and sends batches to the sender.
pub(crate) fn spawn_scan_task(
&self,
@@ -806,10 +737,7 @@ impl ScanInput {
pub(crate) fn predicate(&self) -> Option<Predicate> {
self.predicate.clone()
}
}
#[cfg(test)]
impl ScanInput {
/// Returns number of memtables to scan.
pub(crate) fn num_memtables(&self) -> usize {
self.memtables.len()
@@ -819,166 +747,21 @@ impl ScanInput {
pub(crate) fn num_files(&self) -> usize {
self.files.len()
}
}
#[cfg(test)]
impl ScanInput {
/// Returns SST file ids to scan.
pub(crate) fn file_ids(&self) -> Vec<crate::sst::file::FileId> {
self.files.iter().map(|file| file.file_id()).collect()
}
}
/// Groups of file ranges. Each group in the list contains multiple file
/// ranges to scan. File ranges in the same group may come from different files.
pub(crate) type FileRangesGroup = SmallVec<[Vec<FileRange>; 4]>;
/// A partition of a scanner to read.
/// It contains memtables and file ranges to scan.
#[derive(Clone, Default)]
pub(crate) struct ScanPart {
/// Memtable ranges to scan.
pub(crate) memtable_ranges: Vec<MemtableRange>,
/// File ranges to scan.
pub(crate) file_ranges: FileRangesGroup,
/// Optional time range of the part (inclusive).
pub(crate) time_range: Option<(Timestamp, Timestamp)>,
}
impl fmt::Debug for ScanPart {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"ScanPart({} memtable ranges, {} file ranges",
self.memtable_ranges.len(),
self.file_ranges
.iter()
.map(|ranges| ranges.len())
.sum::<usize>(),
)?;
if let Some(time_range) = &self.time_range {
write!(f, ", time range: {:?})", time_range)
} else {
write!(f, ")")
}
}
}
impl ScanPart {
/// Returns true if the time range of the given `part` overlaps with this part.
pub(crate) fn overlaps(&self, part: &ScanPart) -> bool {
let (Some(current_range), Some(part_range)) = (self.time_range, part.time_range) else {
return true;
};
overlaps(&current_range, &part_range)
}
/// Merges given `part` to this part.
pub(crate) fn merge(&mut self, mut part: ScanPart) {
self.memtable_ranges.append(&mut part.memtable_ranges);
self.file_ranges.append(&mut part.file_ranges);
let Some(part_range) = part.time_range else {
return;
};
let Some(current_range) = self.time_range else {
self.time_range = part.time_range;
return;
};
let start = current_range.0.min(part_range.0);
let end = current_range.1.max(part_range.1);
self.time_range = Some((start, end));
}
/// Returns true if we can split the part into multiple parts
/// while preserving order.
pub(crate) fn can_split_preserve_order(&self) -> bool {
self.memtable_ranges.is_empty()
&& self.file_ranges.len() == 1
&& self.file_ranges[0].len() > 1
}
}
/// A trait to collect file ranges to scan.
pub(crate) trait FileRangeCollector {
/// Appends file ranges from the **same file** to the collector.
fn append_file_ranges(
&mut self,
file_meta: &FileMeta,
file_ranges: impl Iterator<Item = FileRange>,
);
}
/// Optional list of [ScanPart]s.
#[derive(Default)]
pub(crate) struct ScanPartList(pub(crate) Option<Vec<ScanPart>>);
impl fmt::Debug for ScanPartList {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match &self.0 {
Some(parts) => write!(f, "{:?}", parts),
None => write!(f, "[]"),
}
}
}
impl ScanPartList {
/// Returns true if the list is None.
pub(crate) fn is_none(&self) -> bool {
self.0.is_none()
}
/// Sets parts to the list.
pub(crate) fn set_parts(&mut self, parts: Vec<ScanPart>) {
self.0 = Some(parts);
}
/// Gets the part by index, returns None if the index is out of bounds.
/// # Panics
/// Panics if parts are not initialized.
pub(crate) fn get_part(&mut self, index: usize) -> Option<&ScanPart> {
let parts = self.0.as_ref().unwrap();
parts.get(index)
}
/// Returns the number of parts.
pub(crate) fn len(&self) -> usize {
self.0.as_ref().map_or(0, |parts| parts.len())
}
/// Returns the number of memtable ranges.
pub(crate) fn num_mem_ranges(&self) -> usize {
self.0.as_ref().map_or(0, |parts| {
parts.iter().map(|part| part.memtable_ranges.len()).sum()
})
}
/// Returns the number of files.
pub(crate) fn num_files(&self) -> usize {
self.0.as_ref().map_or(0, |parts| {
parts.iter().map(|part| part.file_ranges.len()).sum()
})
}
/// Returns the number of file ranges.
pub(crate) fn num_file_ranges(&self) -> usize {
self.0.as_ref().map_or(0, |parts| {
parts
.iter()
.flat_map(|part| part.file_ranges.iter())
.map(|ranges| ranges.len())
.sum()
})
}
}
/// Context shared by different streams from a scanner.
/// It contains the input and distributes input to multiple parts
/// to scan.
/// It contains the input and ranges to scan.
pub(crate) struct StreamContext {
/// Input memtables and files.
pub(crate) input: ScanInput,
/// Parts to scan and the cost to build parts.
/// The scanner builds parts to scan from the input lazily.
/// The mutex is used to ensure the parts are only built once.
pub(crate) parts: Mutex<(ScanPartList, Duration)>,
/// Metadata for partition ranges.
pub(crate) ranges: Vec<RangeMeta>,
/// Lists of range builders.
@@ -994,12 +777,11 @@ impl StreamContext {
pub(crate) fn seq_scan_ctx(input: ScanInput) -> Self {
let query_start = input.query_start.unwrap_or_else(Instant::now);
let ranges = RangeMeta::seq_scan_ranges(&input);
READ_SST_COUNT.observe(input.files.len() as f64);
let range_builders = RangeBuilderList::new(input.memtables.len(), input.files.len());
READ_SST_COUNT.observe(input.num_files() as f64);
let range_builders = RangeBuilderList::new(input.num_memtables(), input.num_files());
Self {
input,
parts: Mutex::new((ScanPartList::default(), Duration::default())),
ranges,
range_builders,
query_start,
@@ -1010,12 +792,11 @@ impl StreamContext {
pub(crate) fn unordered_scan_ctx(input: ScanInput) -> Self {
let query_start = input.query_start.unwrap_or_else(Instant::now);
let ranges = RangeMeta::unordered_scan_ranges(&input);
READ_SST_COUNT.observe(input.files.len() as f64);
let range_builders = RangeBuilderList::new(input.memtables.len(), input.files.len());
READ_SST_COUNT.observe(input.num_files() as f64);
let range_builders = RangeBuilderList::new(input.num_memtables(), input.num_files());
Self {
input,
parts: Mutex::new((ScanPartList::default(), Duration::default())),
ranges,
range_builders,
query_start,
@@ -1024,27 +805,28 @@ impl StreamContext {
/// Returns true if the index refers to a memtable.
pub(crate) fn is_mem_range_index(&self, index: RowGroupIndex) -> bool {
self.input.memtables.len() > index.index
self.input.num_memtables() > index.index
}
/// Creates file ranges to scan.
pub(crate) async fn build_file_ranges(
&self,
index: RowGroupIndex,
ranges: &mut Vec<FileRange>,
reader_metrics: &mut ReaderMetrics,
) -> Result<()> {
ranges.clear();
) -> Result<SmallVec<[FileRange; 2]>> {
let mut ranges = SmallVec::new();
self.range_builders
.build_file_ranges(&self.input, index, ranges, reader_metrics)
.await
.build_file_ranges(&self.input, index, &mut ranges, reader_metrics)
.await?;
Ok(ranges)
}
/// Creates memtable ranges to scan.
pub(crate) fn build_mem_ranges(&self, index: RowGroupIndex, ranges: &mut Vec<MemtableRange>) {
ranges.clear();
pub(crate) fn build_mem_ranges(&self, index: RowGroupIndex) -> SmallVec<[MemtableRange; 2]> {
let mut ranges = SmallVec::new();
self.range_builders
.build_mem_ranges(&self.input, index, ranges)
.build_mem_ranges(&self.input, index, &mut ranges);
ranges
}
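
// A minimal standalone sketch (assuming the `smallvec` crate that the
// signatures above use) of why the builders now return `SmallVec<[T; 2]>`
// instead of filling a `&mut Vec<T>`: one or two ranges, the common case,
// stay inline on the stack and only larger results spill to the heap.
use smallvec::SmallVec;

fn main() {
    let mut ranges: SmallVec<[u32; 2]> = SmallVec::new();
    ranges.push(1);
    ranges.push(2);
    assert!(!ranges.spilled()); // still stored inline, no heap allocation
    ranges.push(3);
    assert!(ranges.spilled()); // a third element forces a heap allocation
}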
/// Retrieves the partition ranges.
@@ -1052,35 +834,30 @@ impl StreamContext {
self.ranges
.iter()
.enumerate()
.map(|(idx, range_meta)| PartitionRange {
start: range_meta.time_range.0,
end: range_meta.time_range.1,
num_rows: range_meta.num_rows,
identifier: idx,
})
.map(|(idx, range_meta)| range_meta.new_partition_range(idx))
.collect()
}
/// Format the context for explain.
pub(crate) fn format_for_explain(
&self,
t: DisplayFormatType,
f: &mut fmt::Formatter,
) -> fmt::Result {
match self.parts.try_lock() {
Ok(inner) => match t {
DisplayFormatType::Default => write!(
f,
"partition_count={} ({} memtable ranges, {} file {} ranges)",
inner.0.len(),
inner.0.num_mem_ranges(),
inner.0.num_files(),
inner.0.num_file_ranges()
)?,
DisplayFormatType::Verbose => write!(f, "{:?}", inner.0)?,
},
Err(_) => write!(f, "<locked>")?,
pub(crate) fn format_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result {
let (mut num_mem_ranges, mut num_file_ranges) = (0, 0);
for range_meta in &self.ranges {
for idx in &range_meta.row_group_indices {
if self.is_mem_range_index(*idx) {
num_mem_ranges += 1;
} else {
num_file_ranges += 1;
}
}
}
write!(
f,
"partition_count={} ({} memtable ranges, {} file {} ranges)",
self.ranges.len(),
num_mem_ranges,
self.input.num_files(),
num_file_ranges,
)?;
if let Some(selector) = &self.input.series_row_selector {
write!(f, ", selector={}", selector)?;
}
@@ -1110,7 +887,7 @@ impl RangeBuilderList {
&self,
input: &ScanInput,
index: RowGroupIndex,
ranges: &mut Vec<FileRange>,
ranges: &mut SmallVec<[FileRange; 2]>,
reader_metrics: &mut ReaderMetrics,
) -> Result<()> {
let file_index = index.index - self.mem_builders.len();
@@ -1131,7 +908,7 @@ impl RangeBuilderList {
&self,
input: &ScanInput,
index: RowGroupIndex,
ranges: &mut Vec<MemtableRange>,
ranges: &mut SmallVec<[MemtableRange; 2]>,
) {
let mut builder_opt = self.mem_builders[index.index].lock().unwrap();
match &mut *builder_opt {
@@ -1159,7 +936,7 @@ struct FileRangeBuilder {
impl FileRangeBuilder {
/// Builds file ranges to read.
/// Negative `row_group_index` indicates all row groups.
fn build_ranges(&self, row_group_index: i64, ranges: &mut Vec<FileRange>) {
fn build_ranges(&self, row_group_index: i64, ranges: &mut SmallVec<[FileRange; 2]>) {
let Some(context) = self.context.clone() else {
return;
};
@@ -1196,7 +973,7 @@ struct MemRangeBuilder {
impl MemRangeBuilder {
/// Builds mem ranges to read in the memtable.
/// Negative `row_group_index` indicates all row groups.
fn build_ranges(&self, row_group_index: i64, ranges: &mut Vec<MemtableRange>) {
fn build_ranges(&self, row_group_index: i64, ranges: &mut SmallVec<[MemtableRange; 2]>) {
if row_group_index >= 0 {
let row_group_index = row_group_index as usize;
// Scans one row group.


@@ -0,0 +1,182 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Utilities for scanners.
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use async_stream::try_stream;
use common_telemetry::debug;
use futures::Stream;
use store_api::storage::RegionId;
use crate::error::Result;
use crate::read::range::RowGroupIndex;
use crate::read::scan_region::StreamContext;
use crate::read::{Batch, ScannerMetrics, Source};
use crate::sst::parquet::reader::ReaderMetrics;
struct PartitionMetricsInner {
region_id: RegionId,
/// Index of the partition to scan.
partition: usize,
/// Label to distinguish different scan operation.
scanner_type: &'static str,
/// Query start time.
query_start: Instant,
/// Elapsed time before the first poll operation.
first_poll: Duration,
metrics: ScannerMetrics,
reader_metrics: ReaderMetrics,
}
impl PartitionMetricsInner {
fn on_finish(&mut self) {
if self.metrics.total_cost.is_zero() {
self.metrics.total_cost = self.query_start.elapsed();
}
self.metrics.build_parts_cost = self.reader_metrics.build_cost;
}
}
impl Drop for PartitionMetricsInner {
fn drop(&mut self) {
self.on_finish();
self.metrics.observe_metrics();
debug!(
"{} finished, region_id: {}, partition: {}, first_poll: {:?}, metrics: {:?}, reader_metrics: {:?}",
self.scanner_type, self.region_id, self.partition, self.first_poll, self.metrics, self.reader_metrics
);
}
}
/// Metrics while reading a partition.
#[derive(Clone)]
pub(crate) struct PartitionMetrics(Arc<Mutex<PartitionMetricsInner>>);
impl PartitionMetrics {
pub(crate) fn new(
region_id: RegionId,
partition: usize,
scanner_type: &'static str,
query_start: Instant,
metrics: ScannerMetrics,
) -> Self {
let inner = PartitionMetricsInner {
region_id,
partition,
scanner_type,
query_start,
first_poll: Duration::default(),
metrics,
reader_metrics: ReaderMetrics::default(),
};
Self(Arc::new(Mutex::new(inner)))
}
pub(crate) fn on_first_poll(&self) {
let mut inner = self.0.lock().unwrap();
inner.first_poll = inner.query_start.elapsed();
}
pub(crate) fn inc_num_mem_ranges(&self, num: usize) {
let mut inner = self.0.lock().unwrap();
inner.metrics.num_mem_ranges += num;
}
pub(crate) fn inc_num_file_ranges(&self, num: usize) {
let mut inner = self.0.lock().unwrap();
inner.metrics.num_file_ranges += num;
}
pub(crate) fn inc_build_reader_cost(&self, cost: Duration) {
let mut inner = self.0.lock().unwrap();
inner.metrics.build_reader_cost += cost;
}
pub(crate) fn merge_metrics(&self, metrics: &ScannerMetrics) {
let mut inner = self.0.lock().unwrap();
inner.metrics.merge_from(metrics);
}
pub(crate) fn merge_reader_metrics(&self, metrics: &ReaderMetrics) {
let mut inner = self.0.lock().unwrap();
inner.reader_metrics.merge_from(metrics);
}
pub(crate) fn on_finish(&self) {
let mut inner = self.0.lock().unwrap();
inner.on_finish();
}
}
/// Scans memtable ranges at `index`.
pub(crate) fn scan_mem_ranges(
stream_ctx: Arc<StreamContext>,
part_metrics: PartitionMetrics,
index: RowGroupIndex,
) -> impl Stream<Item = Result<Batch>> {
try_stream! {
let ranges = stream_ctx.build_mem_ranges(index);
part_metrics.inc_num_mem_ranges(ranges.len());
for range in ranges {
let build_reader_start = Instant::now();
let iter = range.build_iter()?;
part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
let mut source = Source::Iter(iter);
while let Some(batch) = source.next_batch().await? {
yield batch;
}
}
}
}
/// Scans file ranges at `index`.
pub(crate) fn scan_file_ranges(
stream_ctx: Arc<StreamContext>,
part_metrics: PartitionMetrics,
index: RowGroupIndex,
read_type: &'static str,
) -> impl Stream<Item = Result<Batch>> {
try_stream! {
let mut reader_metrics = ReaderMetrics::default();
let ranges = stream_ctx
.build_file_ranges(index, &mut reader_metrics)
.await?;
part_metrics.inc_num_file_ranges(ranges.len());
for range in ranges {
let build_reader_start = Instant::now();
let reader = range.reader(None).await?;
part_metrics.inc_build_reader_cost(build_reader_start.elapsed());
let compat_batch = range.compat_batch();
let mut source = Source::PruneReader(reader);
while let Some(mut batch) = source.next_batch().await? {
if let Some(compact_batch) = compat_batch {
batch = compact_batch.compat_batch(batch)?;
}
yield batch;
}
if let Source::PruneReader(mut reader) = source {
reader_metrics.merge_from(reader.metrics());
}
}
// Reports metrics.
reader_metrics.observe_rows(read_type);
part_metrics.merge_reader_metrics(&reader_metrics);
}
}
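
// A small, self-contained sketch of the pattern behind `PartitionMetrics`
// above: clones share one inner value behind `Arc<Mutex<_>>`, and the report
// runs exactly once when the last clone is dropped. The counter and the
// `println!` stand in for the real `ScannerMetrics`/`debug!` reporting.
use std::sync::{Arc, Mutex};

struct Inner {
    rows: usize,
}

impl Drop for Inner {
    fn drop(&mut self) {
        // Runs once, when the last handle goes away.
        println!("partition finished, rows: {}", self.rows);
    }
}

#[derive(Clone)]
struct Metrics(Arc<Mutex<Inner>>);

impl Metrics {
    fn new() -> Self {
        Metrics(Arc::new(Mutex::new(Inner { rows: 0 })))
    }

    fn inc_rows(&self, n: usize) {
        self.0.lock().unwrap().rows += n;
    }
}

fn main() {
    let metrics = Metrics::new();
    let for_stream = metrics.clone();
    for_stream.inc_rows(42);
    drop(for_stream);
    drop(metrics); // the inner value is dropped here, report printed once
}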


@@ -16,36 +16,29 @@
use std::fmt;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::Instant;
use async_stream::try_stream;
use common_error::ext::BoxedError;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::util::ChainedRecordBatchStream;
use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
use common_telemetry::{debug, tracing};
use common_telemetry::tracing;
use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
use datatypes::schema::SchemaRef;
use smallvec::smallvec;
use snafu::ResultExt;
use store_api::region_engine::{PartitionRange, RegionScanner, ScannerProperties};
use store_api::storage::{ColumnId, TimeSeriesRowSelector};
use table::predicate::Predicate;
use store_api::storage::TimeSeriesRowSelector;
use tokio::sync::Semaphore;
use crate::error::{PartitionOutOfRangeSnafu, Result};
use crate::memtable::MemtableRef;
use crate::read::dedup::{DedupReader, LastNonNull, LastRow};
use crate::read::last_row::LastRowReader;
use crate::read::merge::MergeReaderBuilder;
use crate::read::scan_region::{
FileRangeCollector, ScanInput, ScanPart, ScanPartList, StreamContext,
};
use crate::read::scan_region::{ScanInput, StreamContext};
use crate::read::scan_util::{scan_file_ranges, scan_mem_ranges, PartitionMetrics};
use crate::read::{BatchReader, BoxedBatchReader, ScannerMetrics, Source};
use crate::region::options::MergeMode;
use crate::sst::file::FileMeta;
use crate::sst::parquet::file_range::FileRange;
use crate::sst::parquet::reader::ReaderMetrics;
/// Scans a region and returns rows in a sorted sequence.
///
@@ -66,6 +59,8 @@ pub struct SeqScan {
impl SeqScan {
/// Creates a new [SeqScan].
pub(crate) fn new(input: ScanInput) -> Self {
// TODO(yingwen): Set permits according to partition num. But we need to support file
// level parallelism.
let parallelism = input.parallelism.parallelism.max(1);
let mut properties = ScannerProperties::default()
.with_append_mode(input.append_mode)
@@ -102,150 +97,49 @@ impl SeqScan {
/// Builds a [BoxedBatchReader] from sequential scan for compaction.
pub async fn build_reader(&self) -> Result<BoxedBatchReader> {
let mut metrics = ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
};
let maybe_reader = Self::build_all_merge_reader(
let part_metrics = PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
0,
get_scanner_type(self.compaction),
self.stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
);
debug_assert_eq!(1, self.properties.partitions.len());
let partition_ranges = &self.properties.partitions[0];
let reader = Self::build_all_merge_reader(
&self.stream_ctx,
partition_ranges,
self.semaphore.clone(),
&mut metrics,
self.compaction,
self.properties.num_partitions(),
&part_metrics,
)
.await?;
// Safety: `build_merge_reader()` always returns a reader if partition is None.
let reader = maybe_reader.unwrap();
Ok(Box::new(reader))
}
/// Builds sources from a [ScanPart].
fn build_part_sources(
part: &ScanPart,
sources: &mut Vec<Source>,
row_selector: Option<TimeSeriesRowSelector>,
compaction: bool,
) -> Result<()> {
sources.reserve(part.memtable_ranges.len() + part.file_ranges.len());
// Read memtables.
for mem in &part.memtable_ranges {
let iter = mem.build_iter()?;
sources.push(Source::Iter(iter));
}
let read_type = if compaction {
"compaction"
} else {
"seq_scan_files"
};
// Read files.
for file in &part.file_ranges {
if file.is_empty() {
continue;
}
// Creates a stream to read the file.
let ranges = file.clone();
let stream = try_stream! {
let mut reader_metrics = ReaderMetrics::default();
// Safety: We checked whether it is empty before.
let file_id = ranges[0].file_handle().file_id();
let region_id = ranges[0].file_handle().region_id();
let range_num = ranges.len();
for range in ranges {
let mut reader = range.reader(row_selector).await?;
let compat_batch = range.compat_batch();
while let Some(mut batch) = reader.next_batch().await? {
if let Some(compat) = compat_batch {
batch = compat
.compat_batch(batch)?;
}
yield batch;
}
reader_metrics.merge_from(reader.metrics());
}
debug!(
"Seq scan region {}, file {}, {} ranges finished, metrics: {:?}, compaction: {}",
region_id, file_id, range_num, reader_metrics, compaction
);
// Reports metrics.
reader_metrics.observe_rows(read_type);
};
let stream = Box::pin(stream);
sources.push(Source::Stream(stream));
}
Ok(())
}
/// Builds a merge reader that reads all data.
async fn build_all_merge_reader(
stream_ctx: &StreamContext,
stream_ctx: &Arc<StreamContext>,
partition_ranges: &[PartitionRange],
semaphore: Arc<Semaphore>,
metrics: &mut ScannerMetrics,
compaction: bool,
parallelism: usize,
) -> Result<Option<BoxedBatchReader>> {
// initialize parts list
let mut parts = stream_ctx.parts.lock().await;
Self::maybe_init_parts(&stream_ctx.input, &mut parts, metrics, parallelism).await?;
let parts_len = parts.0.len();
let mut sources = Vec::with_capacity(parts_len);
for id in 0..parts_len {
let Some(part) = parts.0.get_part(id) else {
return Ok(None);
};
Self::build_part_sources(part, &mut sources, None, compaction)?;
}
Self::build_reader_from_sources(stream_ctx, sources, semaphore).await
}
/// Builds a merge reader that reads data from one [`PartitionRange`].
///
/// If the `range_id` is out of bound, returns None.
async fn build_merge_reader(
stream_ctx: &StreamContext,
range_id: usize,
semaphore: Arc<Semaphore>,
metrics: &mut ScannerMetrics,
compaction: bool,
parallelism: usize,
) -> Result<Option<BoxedBatchReader>> {
part_metrics: &PartitionMetrics,
) -> Result<BoxedBatchReader> {
let mut sources = Vec::new();
let build_start = {
let mut parts = stream_ctx.parts.lock().await;
Self::maybe_init_parts(&stream_ctx.input, &mut parts, metrics, parallelism).await?;
let Some(part) = parts.0.get_part(range_id) else {
return Ok(None);
};
let build_start = Instant::now();
Self::build_part_sources(
part,
&mut sources,
stream_ctx.input.series_row_selector,
for part_range in partition_ranges {
build_sources(
stream_ctx,
part_range,
compaction,
)?;
build_start
};
let maybe_reader = Self::build_reader_from_sources(stream_ctx, sources, semaphore).await;
let build_reader_cost = build_start.elapsed();
metrics.build_reader_cost += build_reader_cost;
debug!(
"Build reader region: {}, range_id: {}, from sources, build_reader_cost: {:?}, compaction: {}",
stream_ctx.input.mapper.metadata().region_id,
range_id,
build_reader_cost,
compaction,
);
maybe_reader
part_metrics,
&mut sources,
);
}
Self::build_reader_from_sources(stream_ctx, sources, semaphore).await
}
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all)]
@@ -253,7 +147,7 @@ impl SeqScan {
stream_ctx: &StreamContext,
mut sources: Vec<Source>,
semaphore: Arc<Semaphore>,
) -> Result<Option<BoxedBatchReader>> {
) -> Result<BoxedBatchReader> {
if stream_ctx.input.parallelism.parallelism > 1 {
// Read sources in parallel. We always spawn a task so we can control the parallelism
// by the semaphore.
@@ -286,13 +180,11 @@ impl SeqScan {
None => reader,
};
Ok(Some(reader))
Ok(reader)
}
/// Scans the given partition when the part list is set properly.
/// Otherwise the returned stream might not contain any data.
// TODO: refactor out `uncached_scan_part_impl`.
#[allow(dead_code)]
fn scan_partition_impl(
&self,
partition: usize,
@@ -307,28 +199,36 @@ impl SeqScan {
));
}
let mut metrics = ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
};
let stream_ctx = self.stream_ctx.clone();
let semaphore = self.semaphore.clone();
let partition_ranges = self.properties.partitions[partition].clone();
let compaction = self.compaction;
let parallelism = self.properties.num_partitions();
let stream = try_stream! {
let first_poll = stream_ctx.query_start.elapsed();
let part_metrics = PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
partition,
get_scanner_type(self.compaction),
stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
);
for partition_range in partition_ranges {
let maybe_reader =
Self::build_merge_reader(&stream_ctx, partition_range.identifier, semaphore.clone(), &mut metrics, compaction, parallelism)
let stream = try_stream! {
part_metrics.on_first_poll();
// Scans each part.
for part_range in partition_ranges {
let mut sources = Vec::new();
build_sources(&stream_ctx, &part_range, compaction, &part_metrics, &mut sources);
let mut reader =
Self::build_reader_from_sources(&stream_ctx, sources, semaphore.clone())
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let Some(mut reader) = maybe_reader else {
return;
};
let cache = stream_ctx.input.cache_manager.as_deref();
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
while let Some(batch) = reader
.next_batch()
@@ -350,18 +250,10 @@ impl SeqScan {
fetch_start = Instant::now();
}
metrics.scan_cost += fetch_start.elapsed();
metrics.total_cost = stream_ctx.query_start.elapsed();
metrics.observe_metrics_on_finish();
debug!(
"Seq scan finished, region_id: {:?}, partition: {}, metrics: {:?}, first_poll: {:?}, compaction: {}",
stream_ctx.input.mapper.metadata().region_id,
partition,
metrics,
first_poll,
compaction,
);
part_metrics.merge_metrics(&metrics);
}
part_metrics.on_finish();
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
@@ -371,133 +263,6 @@ impl SeqScan {
Ok(stream)
}
/// Scans the given partition when the part list is not set.
/// This method will do a lazy initialize of part list and
/// ignores the partition settings in `properties`.
fn uncached_scan_part_impl(
&self,
partition: usize,
) -> Result<SendableRecordBatchStream, BoxedError> {
let num_partitions = self.properties.partitions.len();
if partition >= num_partitions {
return Err(BoxedError::new(
PartitionOutOfRangeSnafu {
given: partition,
all: self.properties.partitions.len(),
}
.build(),
));
}
let mut metrics = ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
};
let stream_ctx = self.stream_ctx.clone();
let semaphore = self.semaphore.clone();
let compaction = self.compaction;
let parallelism = self.properties.num_partitions();
// build stream
let stream = try_stream! {
let first_poll = stream_ctx.query_start.elapsed();
// init parts
let parts_len = {
let mut parts = stream_ctx.parts.lock().await;
Self::maybe_init_parts(&stream_ctx.input, &mut parts, &mut metrics, parallelism).await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
parts.0.len()
};
for id in (0..parts_len).skip(partition).step_by(num_partitions) {
let maybe_reader = Self::build_merge_reader(
&stream_ctx,
id,
semaphore.clone(),
&mut metrics,
compaction,
parallelism
)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let Some(mut reader) = maybe_reader else {
return;
};
let cache = stream_ctx.input.cache_manager.as_deref();
let mut fetch_start = Instant::now();
while let Some(batch) = reader
.next_batch()
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
{
metrics.scan_cost += fetch_start.elapsed();
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
let convert_start = Instant::now();
let record_batch = stream_ctx.input.mapper.convert(&batch, cache)?;
metrics.convert_cost += convert_start.elapsed();
let yield_start = Instant::now();
yield record_batch;
metrics.yield_cost += yield_start.elapsed();
fetch_start = Instant::now();
}
metrics.scan_cost += fetch_start.elapsed();
metrics.total_cost = stream_ctx.query_start.elapsed();
metrics.observe_metrics_on_finish();
debug!(
"Seq scan finished, region_id: {}, partition: {}, id: {}, metrics: {:?}, first_poll: {:?}, compaction: {}",
stream_ctx.input.mapper.metadata().region_id,
partition,
id,
metrics,
first_poll,
compaction,
);
}
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.stream_ctx.input.mapper.output_schema(),
Box::pin(stream),
));
Ok(stream)
}
/// Initializes parts if they are not built yet.
async fn maybe_init_parts(
input: &ScanInput,
part_list: &mut (ScanPartList, Duration),
metrics: &mut ScannerMetrics,
parallelism: usize,
) -> Result<()> {
if part_list.0.is_none() {
let now = Instant::now();
let mut distributor = SeqDistributor::default();
let reader_metrics = input.prune_file_ranges(&mut distributor).await?;
distributor.append_mem_ranges(
&input.memtables,
Some(input.mapper.column_ids()),
input.predicate.clone(),
);
part_list.0.set_parts(distributor.build_parts(parallelism));
let build_part_cost = now.elapsed();
part_list.1 = build_part_cost;
metrics.observe_init_part(build_part_cost, &reader_metrics);
} else {
// Updates the cost of building parts.
metrics.build_parts_cost = part_list.1;
}
Ok(())
}
}
impl RegionScanner for SeqScan {
@@ -510,7 +275,7 @@ impl RegionScanner for SeqScan {
}
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError> {
self.uncached_scan_part_impl(partition)
self.scan_partition_impl(partition)
}
fn prepare(&mut self, ranges: Vec<Vec<PartitionRange>>) -> Result<(), BoxedError> {
@@ -525,24 +290,53 @@ impl RegionScanner for SeqScan {
}
impl DisplayAs for SeqScan {
fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"SeqScan: region={}, ",
self.stream_ctx.input.mapper.metadata().region_id
)?;
self.stream_ctx.format_for_explain(t, f)
self.stream_ctx.format_for_explain(f)
}
}
impl fmt::Debug for SeqScan {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("SeqScan")
.field("parts", &self.stream_ctx.parts)
.field("num_ranges", &self.stream_ctx.ranges.len())
.finish()
}
}
/// Builds sources for the partition range.
fn build_sources(
stream_ctx: &Arc<StreamContext>,
part_range: &PartitionRange,
compaction: bool,
part_metrics: &PartitionMetrics,
sources: &mut Vec<Source>,
) {
// Gets range meta.
let range_meta = &stream_ctx.ranges[part_range.identifier];
sources.reserve(range_meta.row_group_indices.len());
for index in &range_meta.row_group_indices {
let stream = if stream_ctx.is_mem_range_index(*index) {
let stream = scan_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index);
Box::pin(stream) as _
} else {
let read_type = if compaction {
"compaction"
} else {
"seq_scan_files"
};
let stream =
scan_file_ranges(stream_ctx.clone(), part_metrics.clone(), *index, read_type);
Box::pin(stream) as _
};
sources.push(Source::Stream(stream));
}
}
#[cfg(test)]
impl SeqScan {
/// Returns the input.
@@ -551,266 +345,11 @@ impl SeqScan {
}
}
/// Builds [ScanPart]s that preserves order.
#[derive(Default)]
pub(crate) struct SeqDistributor {
parts: Vec<ScanPart>,
}
impl FileRangeCollector for SeqDistributor {
fn append_file_ranges(
&mut self,
file_meta: &FileMeta,
file_ranges: impl Iterator<Item = FileRange>,
) {
// Creates a [ScanPart] for each file.
let ranges: Vec<_> = file_ranges.collect();
if ranges.is_empty() {
// No ranges to read.
return;
}
let part = ScanPart {
memtable_ranges: Vec::new(),
file_ranges: smallvec![ranges],
time_range: Some(file_meta.time_range),
};
self.parts.push(part);
}
}
impl SeqDistributor {
/// Appends memtable ranges to the distributor.
fn append_mem_ranges(
&mut self,
memtables: &[MemtableRef],
projection: Option<&[ColumnId]>,
predicate: Option<Predicate>,
) {
for mem in memtables {
let stats = mem.stats();
let mem_ranges = mem.ranges(projection, predicate.clone());
if mem_ranges.is_empty() {
continue;
}
let part = ScanPart {
memtable_ranges: mem_ranges.into_values().collect(),
file_ranges: smallvec![],
time_range: stats.time_range(),
};
self.parts.push(part);
}
}
/// Groups file ranges and memtable ranges by time ranges.
/// The output number of parts may be `<= parallelism`. If `parallelism` is 0, it will be set to 1.
///
/// Output parts have non-overlapping time ranges.
fn build_parts(self, parallelism: usize) -> Vec<ScanPart> {
let parallelism = parallelism.max(1);
let parts = group_parts_by_range(self.parts);
let parts = maybe_split_parts(parts, parallelism);
// Ensures it doesn't return more parts than `parallelism`.
maybe_merge_parts(parts, parallelism)
}
}
/// Groups parts by time range. It may generate more parts than `parallelism`.
/// All time ranges are not None.
fn group_parts_by_range(mut parts: Vec<ScanPart>) -> Vec<ScanPart> {
if parts.is_empty() {
return Vec::new();
}
// Sorts parts by time range.
parts.sort_unstable_by(|a, b| {
// Safety: time ranges of parts from [SeqPartBuilder] are not None.
let a = a.time_range.unwrap();
let b = b.time_range.unwrap();
a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1))
});
let mut part_in_range = None;
// Parts with exclusive time ranges.
let mut part_groups = Vec::new();
for part in parts {
let Some(mut prev_part) = part_in_range.take() else {
part_in_range = Some(part);
continue;
};
if prev_part.overlaps(&part) {
prev_part.merge(part);
part_in_range = Some(prev_part);
} else {
// A new group.
part_groups.push(prev_part);
part_in_range = Some(part);
}
}
if let Some(part) = part_in_range {
part_groups.push(part);
}
part_groups
}
/// Merges parts by parallelism.
/// It merges parts if the number of parts is greater than `parallelism`.
fn maybe_merge_parts(mut parts: Vec<ScanPart>, parallelism: usize) -> Vec<ScanPart> {
assert!(parallelism > 0);
if parts.len() <= parallelism {
// No need to merge parts.
return parts;
}
// Sort parts by number of memtables and ranges in reverse order.
parts.sort_unstable_by(|a, b| {
a.memtable_ranges
.len()
.cmp(&b.memtable_ranges.len())
.then_with(|| {
let a_ranges_len = a
.file_ranges
.iter()
.map(|ranges| ranges.len())
.sum::<usize>();
let b_ranges_len = b
.file_ranges
.iter()
.map(|ranges| ranges.len())
.sum::<usize>();
a_ranges_len.cmp(&b_ranges_len)
})
.reverse()
});
let parts_to_reduce = parts.len() - parallelism;
for _ in 0..parts_to_reduce {
// Safety: We ensure `parts.len() > parallelism`.
let part = parts.pop().unwrap();
parts.last_mut().unwrap().merge(part);
}
parts
}
/// Splits parts by parallelism.
/// It splits a part if it only scans one file and doesn't scan any memtable.
fn maybe_split_parts(mut parts: Vec<ScanPart>, parallelism: usize) -> Vec<ScanPart> {
assert!(parallelism > 0);
if parts.len() >= parallelism {
// No need to split parts.
return parts;
}
let has_part_to_split = parts.iter().any(|part| part.can_split_preserve_order());
if !has_part_to_split {
// No proper parts to split.
return parts;
}
// Sorts parts by the number of ranges in the first file.
parts.sort_unstable_by(|a, b| {
let a_len = a.file_ranges.first().map(|file| file.len()).unwrap_or(0);
let b_len = b.file_ranges.first().map(|file| file.len()).unwrap_or(0);
a_len.cmp(&b_len).reverse()
});
let num_parts_to_split = parallelism - parts.len();
let mut output_parts = Vec::with_capacity(parallelism);
// Split parts up to num_parts_to_split.
for part in parts.iter_mut() {
if !part.can_split_preserve_order() {
continue;
}
// Safety: `can_split_preserve_order()` ensures file_ranges.len() == 1.
// Splits part into `num_parts_to_split + 1` new parts if possible.
let target_part_num = num_parts_to_split + 1;
let ranges_per_part = (part.file_ranges[0].len() + target_part_num - 1) / target_part_num;
// `can_split_preserve_order()` ensures part.file_ranges[0].len() > 1.
assert!(ranges_per_part > 0);
for ranges in part.file_ranges[0].chunks(ranges_per_part) {
let new_part = ScanPart {
memtable_ranges: Vec::new(),
file_ranges: smallvec![ranges.to_vec()],
time_range: part.time_range,
};
output_parts.push(new_part);
}
// Replace the current part with the last output part as we will put the current part
// into the output parts later.
*part = output_parts.pop().unwrap();
if output_parts.len() >= num_parts_to_split {
// We already split enough parts.
break;
}
}
// Put the remaining parts into the output parts.
output_parts.append(&mut parts);
output_parts
}
#[cfg(test)]
mod tests {
use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use super::*;
use crate::memtable::MemtableId;
use crate::test_util::memtable_util::mem_range_for_test;
type Output = (Vec<MemtableId>, i64, i64);
fn run_group_parts_test(input: &[(MemtableId, i64, i64)], expect: &[Output]) {
let parts = input
.iter()
.map(|(id, start, end)| {
let range = (
Timestamp::new(*start, TimeUnit::Second),
Timestamp::new(*end, TimeUnit::Second),
);
ScanPart {
memtable_ranges: vec![mem_range_for_test(*id)],
file_ranges: smallvec![],
time_range: Some(range),
}
})
.collect();
let output = group_parts_by_range(parts);
let actual: Vec<_> = output
.iter()
.map(|part| {
let ids: Vec<_> = part.memtable_ranges.iter().map(|mem| mem.id()).collect();
let range = part.time_range.unwrap();
(ids, range.0.value(), range.1.value())
})
.collect();
assert_eq!(expect, actual);
}
#[test]
fn test_group_parts() {
// Group 1 part.
run_group_parts_test(&[(1, 0, 2000)], &[(vec![1], 0, 2000)]);
// 1, 2, 3, 4 => [3, 1, 4], [2]
run_group_parts_test(
&[
(1, 1000, 2000),
(2, 6000, 7000),
(3, 0, 1500),
(4, 1500, 3000),
],
&[(vec![3, 1, 4], 0, 3000), (vec![2], 6000, 7000)],
);
// 1, 2, 3 => [3], [1], [2],
run_group_parts_test(
&[(1, 3000, 4000), (2, 4001, 6000), (3, 0, 1000)],
&[
(vec![3], 0, 1000),
(vec![1], 3000, 4000),
(vec![2], 4001, 6000),
],
);
/// Returns the scanner type.
fn get_scanner_type(compaction: bool) -> &'static str {
if compaction {
"SeqScan(compaction)"
} else {
"SeqScan"
}
}
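
// A rough standalone sketch of the idea behind scanning one `PartitionRange`
// at a time in `SeqScan`: the sources inside a range may overlap in time, so
// they are merged (a k-way merge below), while the ranges themselves do not
// overlap, so their outputs can simply be concatenated. The `Vec<i64>`
// sources here stand in for batch readers and are only illustrative.
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn merge_sorted(sources: Vec<Vec<i64>>) -> Vec<i64> {
    let mut heap = BinaryHeap::new();
    for (src_idx, src) in sources.iter().enumerate() {
        if let Some(&v) = src.first() {
            heap.push(Reverse((v, src_idx, 0usize)));
        }
    }
    let mut out = Vec::new();
    while let Some(Reverse((v, src_idx, pos))) = heap.pop() {
        out.push(v);
        if let Some(&next) = sources[src_idx].get(pos + 1) {
            heap.push(Reverse((next, src_idx, pos + 1)));
        }
    }
    out
}

fn main() {
    // Two overlapping sources within one partition range.
    let range_a = merge_sorted(vec![vec![1, 4, 7], vec![2, 4, 9]]);
    // A later, non-overlapping partition range.
    let range_b = merge_sorted(vec![vec![10, 12]]);
    // Concatenating the ranges keeps the whole output sorted.
    let output: Vec<i64> = range_a.into_iter().chain(range_b).collect();
    assert_eq!(output, vec![1, 2, 4, 4, 7, 9, 10, 12]);
}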


@@ -21,24 +21,17 @@ use std::time::Instant;
use async_stream::{stream, try_stream};
use common_error::ext::BoxedError;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream};
use common_telemetry::debug;
use common_recordbatch::{RecordBatchStreamWrapper, SendableRecordBatchStream};
use datafusion::physical_plan::{DisplayAs, DisplayFormatType};
use datatypes::schema::SchemaRef;
use futures::{Stream, StreamExt};
use snafu::ResultExt;
use store_api::region_engine::{PartitionRange, RegionScanner, ScannerProperties};
use crate::cache::CacheManager;
use crate::error::Result;
use crate::memtable::MemtableRange;
use crate::read::compat::CompatBatch;
use crate::read::projection::ProjectionMapper;
use crate::read::range::RowGroupIndex;
use crate::error::{PartitionOutOfRangeSnafu, Result};
use crate::read::scan_region::{ScanInput, StreamContext};
use crate::read::{ScannerMetrics, Source};
use crate::sst::parquet::file_range::FileRange;
use crate::sst::parquet::reader::ReaderMetrics;
use crate::read::scan_util::{scan_file_ranges, scan_mem_ranges, PartitionMetrics};
use crate::read::{Batch, ScannerMetrics};
/// Scans a region without providing any output ordering guarantee.
///
@@ -85,62 +78,23 @@ impl UnorderedScan {
Ok(stream)
}
/// Fetch a batch from the source and convert it into a record batch.
async fn fetch_from_source(
source: &mut Source,
mapper: &ProjectionMapper,
cache: Option<&CacheManager>,
compat_batch: Option<&CompatBatch>,
metrics: &mut ScannerMetrics,
) -> common_recordbatch::error::Result<Option<RecordBatch>> {
let start = Instant::now();
let Some(mut batch) = source
.next_batch()
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?
else {
metrics.scan_cost += start.elapsed();
return Ok(None);
};
if let Some(compat) = compat_batch {
batch = compat
.compat_batch(batch)
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
}
metrics.scan_cost += start.elapsed();
let convert_start = Instant::now();
let record_batch = mapper.convert(&batch, cache)?;
metrics.convert_cost += convert_start.elapsed();
Ok(Some(record_batch))
}
/// Scans a [PartitionRange] and returns a stream.
fn scan_partition_range<'a>(
stream_ctx: &'a StreamContext,
part_range: &'a PartitionRange,
mem_ranges: &'a mut Vec<MemtableRange>,
file_ranges: &'a mut Vec<FileRange>,
reader_metrics: &'a mut ReaderMetrics,
metrics: &'a mut ScannerMetrics,
) -> impl Stream<Item = common_recordbatch::error::Result<RecordBatch>> + 'a {
/// Scans a [PartitionRange] by its `identifier` and returns a stream.
fn scan_partition_range(
stream_ctx: Arc<StreamContext>,
part_range_id: usize,
part_metrics: PartitionMetrics,
) -> impl Stream<Item = Result<Batch>> {
stream! {
// Gets range meta.
let range_meta = &stream_ctx.ranges[part_range.identifier];
let range_meta = &stream_ctx.ranges[part_range_id];
for index in &range_meta.row_group_indices {
if stream_ctx.is_mem_range_index(*index) {
let stream = Self::scan_mem_ranges(stream_ctx, *index, mem_ranges, metrics);
let stream = scan_mem_ranges(stream_ctx.clone(), part_metrics.clone(), *index);
for await batch in stream {
yield batch;
}
} else {
let stream = Self::scan_file_ranges(stream_ctx, *index, file_ranges, reader_metrics, metrics);
let stream = scan_file_ranges(stream_ctx.clone(), part_metrics.clone(), *index, "unordered_scan_files");
for await batch in stream {
yield batch;
}
@@ -149,124 +103,68 @@ impl UnorderedScan {
}
}
/// Scans memtable ranges at `index`.
fn scan_mem_ranges<'a>(
stream_ctx: &'a StreamContext,
index: RowGroupIndex,
ranges: &'a mut Vec<MemtableRange>,
metrics: &'a mut ScannerMetrics,
) -> impl Stream<Item = common_recordbatch::error::Result<RecordBatch>> + 'a {
try_stream! {
let mapper = &stream_ctx.input.mapper;
let cache = stream_ctx.input.cache_manager.as_deref();
stream_ctx.build_mem_ranges(index, ranges);
metrics.num_mem_ranges += ranges.len();
for range in ranges {
let build_reader_start = Instant::now();
let iter = range.build_iter().map_err(BoxedError::new).context(ExternalSnafu)?;
metrics.build_reader_cost = build_reader_start.elapsed();
let mut source = Source::Iter(iter);
while let Some(batch) =
Self::fetch_from_source(&mut source, mapper, cache, None, metrics).await?
{
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
let yield_start = Instant::now();
yield batch;
metrics.yield_cost += yield_start.elapsed();
}
}
}
}
/// Scans file ranges at `index`.
fn scan_file_ranges<'a>(
stream_ctx: &'a StreamContext,
index: RowGroupIndex,
ranges: &'a mut Vec<FileRange>,
reader_metrics: &'a mut ReaderMetrics,
metrics: &'a mut ScannerMetrics,
) -> impl Stream<Item = common_recordbatch::error::Result<RecordBatch>> + 'a {
try_stream! {
let mapper = &stream_ctx.input.mapper;
let cache = stream_ctx.input.cache_manager.as_deref();
stream_ctx
.build_file_ranges(index, ranges, reader_metrics)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
metrics.num_file_ranges += ranges.len();
for range in ranges {
let build_reader_start = Instant::now();
let reader = range
.reader(None)
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
metrics.build_reader_cost += build_reader_start.elapsed();
let compat_batch = range.compat_batch();
let mut source = Source::PruneReader(reader);
while let Some(batch) =
Self::fetch_from_source(&mut source, mapper, cache, compat_batch, metrics)
.await?
{
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
let yield_start = Instant::now();
yield batch;
metrics.yield_cost += yield_start.elapsed();
}
if let Source::PruneReader(mut reader) = source {
reader_metrics.merge_from(reader.metrics());
}
}
}
}
fn scan_partition_impl(
&self,
partition: usize,
) -> Result<SendableRecordBatchStream, BoxedError> {
let mut metrics = ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
};
if partition >= self.properties.partitions.len() {
return Err(BoxedError::new(
PartitionOutOfRangeSnafu {
given: partition,
all: self.properties.partitions.len(),
}
.build(),
));
}
let part_metrics = PartitionMetrics::new(
self.stream_ctx.input.mapper.metadata().region_id,
partition,
"UnorderedScan",
self.stream_ctx.query_start,
ScannerMetrics {
prepare_scan_cost: self.stream_ctx.query_start.elapsed(),
..Default::default()
},
);
let stream_ctx = self.stream_ctx.clone();
let ranges_opt = self.properties.partitions.get(partition).cloned();
let part_ranges = self.properties.partitions[partition].clone();
let stream = stream! {
let first_poll = stream_ctx.query_start.elapsed();
let Some(part_ranges) = ranges_opt else {
return;
};
let stream = try_stream! {
part_metrics.on_first_poll();
let mut mem_ranges = Vec::new();
let mut file_ranges = Vec::new();
let mut reader_metrics = ReaderMetrics::default();
let cache = stream_ctx.input.cache_manager.as_deref();
// Scans each part.
for part_range in part_ranges {
let mut metrics = ScannerMetrics::default();
let mut fetch_start = Instant::now();
let stream = Self::scan_partition_range(
&stream_ctx,
&part_range,
&mut mem_ranges,
&mut file_ranges,
&mut reader_metrics,
&mut metrics,
stream_ctx.clone(),
part_range.identifier,
part_metrics.clone(),
);
for await batch in stream {
yield batch;
let batch = batch.map_err(BoxedError::new).context(ExternalSnafu)?;
metrics.scan_cost += fetch_start.elapsed();
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
let convert_start = Instant::now();
let record_batch = stream_ctx.input.mapper.convert(&batch, cache)?;
metrics.convert_cost += convert_start.elapsed();
let yield_start = Instant::now();
yield record_batch;
metrics.yield_cost += yield_start.elapsed();
fetch_start = Instant::now();
}
metrics.scan_cost += fetch_start.elapsed();
part_metrics.merge_metrics(&metrics);
}
reader_metrics.observe_rows("unordered_scan_files");
metrics.total_cost = stream_ctx.query_start.elapsed();
metrics.observe_metrics_on_finish();
let mapper = &stream_ctx.input.mapper;
debug!(
"Unordered scan partition {} finished, region_id: {}, metrics: {:?}, reader_metrics: {:?}, first_poll: {:?}",
partition, mapper.metadata().region_id, metrics, reader_metrics, first_poll,
);
part_metrics.on_finish();
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.stream_ctx.input.mapper.output_schema(),
@@ -302,20 +200,20 @@ impl RegionScanner for UnorderedScan {
}
impl DisplayAs for UnorderedScan {
fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"UnorderedScan: region={}, ",
self.stream_ctx.input.mapper.metadata().region_id
)?;
self.stream_ctx.format_for_explain(t, f)
self.stream_ctx.format_for_explain(f)
}
}
impl fmt::Debug for UnorderedScan {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("UnorderedScan")
.field("parts", &self.stream_ctx.parts)
.field("num_ranges", &self.stream_ctx.ranges.len())
.finish()
}
}
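
// A minimal sketch of the `try_stream!` pattern the scanners above are built
// on (assuming the `async-stream` and `futures` crates): the macro turns a
// block with `yield` into a `Stream` of `Result` items, and `?` emits an
// error item and ends the stream. The parsing example is purely illustrative.
use async_stream::try_stream;
use futures::{pin_mut, Stream, StreamExt};

fn parsed_numbers(inputs: Vec<&'static str>) -> impl Stream<Item = Result<u32, String>> {
    try_stream! {
        for raw in inputs {
            // `?` turns a parse failure into the stream's error item.
            let n: u32 = raw.parse().map_err(|e| format!("{raw}: {e}"))?;
            yield n;
        }
    }
}

fn main() {
    futures::executor::block_on(async {
        let s = parsed_numbers(vec!["1", "2", "oops", "4"]);
        pin_mut!(s);
        while let Some(item) = s.next().await {
            // Prints Ok(1), Ok(2), then an Err for "oops" and the stream ends.
            println!("{item:?}");
        }
    });
}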


@@ -90,7 +90,8 @@ impl FromStr for FileId {
}
}
/// Time range of a SST file.
/// Time range (min and max timestamps) of a SST file.
/// Both min and max are inclusive.
pub type FileTimeRange = (Timestamp, Timestamp);
/// Checks if two inclusive timestamp ranges overlap with each other.
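// A small sketch of the inclusive-overlap check described by the comment
// above: two inclusive ranges overlap iff each one starts no later than the
// other ends. Plain i64 pairs stand in for `FileTimeRange`/`Timestamp`.
fn overlaps(a: &(i64, i64), b: &(i64, i64)) -> bool {
    a.0 <= b.1 && b.0 <= a.1
}

fn main() {
    assert!(overlaps(&(0, 10), &(10, 20))); // touching endpoints overlap, both ends inclusive
    assert!(!overlaps(&(0, 9), &(10, 20)));
}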


@@ -238,9 +238,6 @@ impl ParquetReaderBuilder {
cache_manager: self.cache_manager.clone(),
};
// TODO(yingwen): count the cost of the method.
metrics.build_cost = start.elapsed();
let mut filters = if let Some(predicate) = &self.predicate {
predicate
.exprs()
@@ -270,6 +267,9 @@ impl ParquetReaderBuilder {
);
let context = FileRangeContext::new(reader_builder, filters, read_format, codec);
metrics.build_cost += start.elapsed();
Ok((context, row_groups))
}


@@ -35,7 +35,7 @@ use crate::memtable::key_values::KeyValue;
use crate::memtable::partition_tree::data::{timestamp_array_to_i64_slice, DataBatch, DataBuffer};
use crate::memtable::{
BoxedBatchIterator, BulkPart, IterBuilder, KeyValues, Memtable, MemtableBuilder, MemtableId,
MemtableRange, MemtableRangeContext, MemtableRef, MemtableStats,
MemtableRange, MemtableRef, MemtableStats,
};
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
@@ -361,11 +361,3 @@ pub(crate) fn collect_iter_timestamps(iter: BoxedBatchIterator) -> Vec<i64> {
.map(|v| v.unwrap().0.value())
.collect()
}
/// Builds a memtable range for test.
pub(crate) fn mem_range_for_test(id: MemtableId) -> MemtableRange {
let builder = Box::new(EmptyIterBuilder::default());
let context = Arc::new(MemtableRangeContext::new(id, builder));
MemtableRange::new(context)
}


@@ -210,6 +210,37 @@ where
self.transformer.transform_mut(val)
}
pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> {
match val {
Value::Map(map) => {
let mut search_from = 0;
// because the keys in the json map are ordered
for (payload_key, payload_value) in map.values.into_iter() {
if search_from >= self.required_keys.len() {
break;
}
// because map keys are ordered, required_keys is ordered too
if let Some(pos) = self.required_keys[search_from..]
.iter()
.position(|k| k == &payload_key)
{
result[search_from + pos] = payload_value;
// next search from is always after the current key
search_from += pos;
}
}
}
Value::String(_) => {
result[0] = val;
}
_ => {
return PrepareValueMustBeObjectSnafu.fail();
}
}
Ok(())
}
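
// A simplified, standalone sketch of the single-pass matching that
// `prepare_pipeline_value` relies on: because `BTreeMap` keys and
// `required_keys` are both sorted, every payload key can be matched by
// scanning the required keys forward only, never restarting from index 0.
// The plain string types here stand in for the pipeline `Value`.
use std::collections::BTreeMap;

fn match_required(payload: &BTreeMap<String, String>, required: &[&str]) -> Vec<Option<String>> {
    let mut result = vec![None; required.len()];
    let mut search_from = 0;
    for (key, value) in payload {
        if search_from >= required.len() {
            break;
        }
        // Only look at required keys we have not passed yet.
        if let Some(pos) = required[search_from..].iter().position(|k| *k == key.as_str()) {
            result[search_from + pos] = Some(value.clone());
            search_from += pos;
        }
    }
    result
}

fn main() {
    let payload: BTreeMap<String, String> = [
        ("a".to_string(), "1".to_string()),
        ("c".to_string(), "3".to_string()),
        ("d".to_string(), "4".to_string()),
    ]
    .into_iter()
    .collect();
    let required = ["a", "c", "z"];
    assert_eq!(
        match_required(&payload, &required),
        vec![Some("1".to_string()), Some("3".to_string()), None]
    );
}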
pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> {
match val {
serde_json::Value::Object(map) => {
@@ -286,6 +317,11 @@ pub(crate) fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str
.context(IntermediateKeyIndexSnafu { kind, key })
}
pub enum PipelineWay {
Identity,
Custom(std::sync::Arc<Pipeline<crate::GreptimeTransformer>>),
}
#[cfg(test)]
mod tests {


@@ -438,18 +438,26 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("failed to coerce complex value, not supported"))]
CoerceComplexType {
#[snafu(display("Can not coerce json type to {ty}"))]
CoerceJsonTypeTo {
ty: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("failed to coerce value: {msg}"))]
#[snafu(display(
"Can not coerce {ty} to json type. we only consider object and array to be json types."
))]
CoerceTypeToJson {
ty: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to coerce value: {msg}"))]
CoerceIncompatibleTypes {
msg: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Invalid resolution: '{resolution}'. Available resolutions: {valid_resolution}"
))]


@@ -402,7 +402,8 @@ impl Processor for CmcdProcessor {
#[cfg(test)]
mod tests {
use ahash::HashMap;
use std::collections::BTreeMap;
use urlencoding::decode;
use super::{CmcdProcessorBuilder, CMCD_KEYS};
@@ -563,14 +564,14 @@ mod tests {
let values = vec
.into_iter()
.map(|(k, v)| (k.to_string(), v))
.collect::<HashMap<String, Value>>();
.collect::<BTreeMap<String, Value>>();
let expected = Map { values };
let actual = processor.parse(0, &decoded).unwrap();
let actual = actual
.into_iter()
.map(|(index, value)| (intermediate_keys[index].clone(), value))
.collect::<HashMap<String, Value>>();
.collect::<BTreeMap<String, Value>>();
let actual = Map { values: actual };
assert_eq!(actual, expected);
}


@@ -383,6 +383,8 @@ impl Processor for RegexProcessor {
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use ahash::{HashMap, HashMapExt};
use itertools::Itertools;
@@ -475,14 +477,14 @@ ignore_missing: false"#;
.map(|k| k.to_string())
.collect_vec();
let processor = builder.build(&intermediate_keys).unwrap();
let mut result = HashMap::new();
let mut result = BTreeMap::new();
for (index, pattern) in processor.patterns.iter().enumerate() {
let r = processor
.process(&breadcrumbs_str, pattern, (0, index))
.unwrap()
.into_iter()
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
.collect::<BTreeMap<_, _>>();
result.extend(r);
}
let map = Map { values: result };


@@ -12,16 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::column_data_type_extension::TypeExt;
use api::v1::column_def::options_from_fulltext;
use api::v1::ColumnOptions;
use api::v1::{ColumnDataTypeExtension, ColumnOptions, JsonTypeExtension};
use datatypes::schema::FulltextOptions;
use greptime_proto::v1::value::ValueData;
use greptime_proto::v1::{ColumnDataType, ColumnSchema, SemanticType};
use snafu::ResultExt;
use crate::etl::error::{
CoerceComplexTypeSnafu, CoerceIncompatibleTypesSnafu, CoerceStringToTypeSnafu,
CoerceUnsupportedEpochTypeSnafu, CoerceUnsupportedNullTypeSnafu,
CoerceIncompatibleTypesSnafu, CoerceJsonTypeToSnafu, CoerceStringToTypeSnafu,
CoerceTypeToJsonSnafu, CoerceUnsupportedEpochTypeSnafu, CoerceUnsupportedNullTypeSnafu,
CoerceUnsupportedNullTypeToSnafu, ColumnOptionsSnafu, Error, Result,
};
use crate::etl::transform::index::Index;
@@ -62,7 +63,10 @@ impl TryFrom<Value> for ValueData {
}
Value::Timestamp(Timestamp::Second(s)) => Ok(ValueData::TimestampSecondValue(s)),
Value::Array(_) | Value::Map(_) => CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => {
let data: jsonb::Value = value.into();
Ok(ValueData::BinaryValue(data.to_vec()))
}
}
}
}
@@ -74,15 +78,15 @@ pub(crate) fn coerce_columns(transform: &Transform) -> Result<Vec<ColumnSchema>>
for field in transform.real_fields.iter() {
let column_name = field.output_name().to_string();
let datatype = coerce_type(transform)? as i32;
let (datatype, datatype_extension) = coerce_type(transform)?;
let semantic_type = coerce_semantic_type(transform) as i32;
let column = ColumnSchema {
column_name,
datatype,
datatype: datatype as i32,
semantic_type,
datatype_extension: None,
datatype_extension,
options: coerce_options(transform)?,
};
columns.push(column);
@@ -111,30 +115,41 @@ fn coerce_options(transform: &Transform) -> Result<Option<ColumnOptions>> {
}
}
fn coerce_type(transform: &Transform) -> Result<ColumnDataType> {
fn coerce_type(transform: &Transform) -> Result<(ColumnDataType, Option<ColumnDataTypeExtension>)> {
match transform.type_ {
Value::Int8(_) => Ok(ColumnDataType::Int8),
Value::Int16(_) => Ok(ColumnDataType::Int16),
Value::Int32(_) => Ok(ColumnDataType::Int32),
Value::Int64(_) => Ok(ColumnDataType::Int64),
Value::Int8(_) => Ok((ColumnDataType::Int8, None)),
Value::Int16(_) => Ok((ColumnDataType::Int16, None)),
Value::Int32(_) => Ok((ColumnDataType::Int32, None)),
Value::Int64(_) => Ok((ColumnDataType::Int64, None)),
Value::Uint8(_) => Ok(ColumnDataType::Uint8),
Value::Uint16(_) => Ok(ColumnDataType::Uint16),
Value::Uint32(_) => Ok(ColumnDataType::Uint32),
Value::Uint64(_) => Ok(ColumnDataType::Uint64),
Value::Uint8(_) => Ok((ColumnDataType::Uint8, None)),
Value::Uint16(_) => Ok((ColumnDataType::Uint16, None)),
Value::Uint32(_) => Ok((ColumnDataType::Uint32, None)),
Value::Uint64(_) => Ok((ColumnDataType::Uint64, None)),
Value::Float32(_) => Ok(ColumnDataType::Float32),
Value::Float64(_) => Ok(ColumnDataType::Float64),
Value::Float32(_) => Ok((ColumnDataType::Float32, None)),
Value::Float64(_) => Ok((ColumnDataType::Float64, None)),
Value::Boolean(_) => Ok(ColumnDataType::Boolean),
Value::String(_) => Ok(ColumnDataType::String),
Value::Boolean(_) => Ok((ColumnDataType::Boolean, None)),
Value::String(_) => Ok((ColumnDataType::String, None)),
Value::Timestamp(Timestamp::Nanosecond(_)) => Ok(ColumnDataType::TimestampNanosecond),
Value::Timestamp(Timestamp::Microsecond(_)) => Ok(ColumnDataType::TimestampMicrosecond),
Value::Timestamp(Timestamp::Millisecond(_)) => Ok(ColumnDataType::TimestampMillisecond),
Value::Timestamp(Timestamp::Second(_)) => Ok(ColumnDataType::TimestampSecond),
Value::Timestamp(Timestamp::Nanosecond(_)) => {
Ok((ColumnDataType::TimestampNanosecond, None))
}
Value::Timestamp(Timestamp::Microsecond(_)) => {
Ok((ColumnDataType::TimestampMicrosecond, None))
}
Value::Timestamp(Timestamp::Millisecond(_)) => {
Ok((ColumnDataType::TimestampMillisecond, None))
}
Value::Timestamp(Timestamp::Second(_)) => Ok((ColumnDataType::TimestampSecond, None)),
Value::Array(_) | Value::Map(_) => CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => Ok((
ColumnDataType::Binary,
Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
)),
Value::Null => CoerceUnsupportedNullTypeToSnafu {
ty: transform.type_.to_str_type(),
@@ -191,12 +206,12 @@ pub(crate) fn coerce_value(val: &Value, transform: &Transform) -> Result<Option<
))),
},
_ => CoerceIncompatibleTypesSnafu {
msg: "Timestamp can only be coerced to another timestamp",
msg: "Timestamp can only be coerced to another type",
}
.fail(),
},
Value::Array(_) | Value::Map(_) => CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => coerce_json_value(val, transform),
}
}
@@ -228,7 +243,12 @@ fn coerce_bool_value(b: bool, transform: &Transform) -> Result<Option<ValueData>
}
},
Value::Array(_) | Value::Map(_) => return CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => {
return CoerceJsonTypeToSnafu {
ty: transform.type_.to_str_type(),
}
.fail()
}
Value::Null => return Ok(None),
};
@@ -264,7 +284,12 @@ fn coerce_i64_value(n: i64, transform: &Transform) -> Result<Option<ValueData>>
}
},
Value::Array(_) | Value::Map(_) => return CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => {
return CoerceJsonTypeToSnafu {
ty: transform.type_.to_str_type(),
}
.fail()
}
Value::Null => return Ok(None),
};
@@ -300,7 +325,12 @@ fn coerce_u64_value(n: u64, transform: &Transform) -> Result<Option<ValueData>>
}
},
Value::Array(_) | Value::Map(_) => return CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => {
return CoerceJsonTypeToSnafu {
ty: transform.type_.to_str_type(),
}
.fail()
}
Value::Null => return Ok(None),
};
@@ -336,7 +366,12 @@ fn coerce_f64_value(n: f64, transform: &Transform) -> Result<Option<ValueData>>
}
},
Value::Array(_) | Value::Map(_) => return CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => {
return CoerceJsonTypeToSnafu {
ty: transform.type_.to_str_type(),
}
.fail()
}
Value::Null => return Ok(None),
};
@@ -411,12 +446,43 @@ fn coerce_string_value(s: &String, transform: &Transform) -> Result<Option<Value
None => CoerceUnsupportedEpochTypeSnafu { ty: "String" }.fail(),
},
Value::Array(_) | Value::Map(_) => CoerceComplexTypeSnafu.fail(),
Value::Array(_) | Value::Map(_) => {
return CoerceJsonTypeToSnafu {
ty: transform.type_.to_str_type(),
}
.fail()
}
Value::Null => Ok(None),
}
}
fn coerce_json_value(v: &Value, transform: &Transform) -> Result<Option<ValueData>> {
match &transform.type_ {
Value::Array(_) | Value::Map(_) => (),
t => {
return CoerceTypeToJsonSnafu {
ty: t.to_str_type(),
}
.fail();
}
}
match v {
Value::Map(_) => {
let data: jsonb::Value = v.into();
Ok(Some(ValueData::BinaryValue(data.to_vec())))
}
Value::Array(_) => {
let data: jsonb::Value = v.into();
Ok(Some(ValueData::BinaryValue(data.to_vec())))
}
_ => CoerceTypeToJsonSnafu {
ty: v.to_str_type(),
}
.fail(),
}
}
#[cfg(test)]
mod tests {
use super::*;


@@ -16,8 +16,10 @@ pub mod array;
pub mod map;
pub mod time;
use ahash::{HashMap, HashMapExt};
use std::collections::BTreeMap;
pub use array::Array;
use jsonb::{Number as JsonbNumber, Object as JsonbObject, Value as JsonbValue};
pub use map::Map;
use snafu::{OptionExt, ResultExt};
pub use time::Timestamp;
@@ -57,6 +59,7 @@ pub enum Value {
Timestamp(Timestamp),
/// We only consider object and array to be json types.
Array(Array),
Map(Map),
}
@@ -110,8 +113,9 @@ impl Value {
_ => Ok(Value::Timestamp(Timestamp::Nanosecond(0))),
},
"array" => Ok(Value::Array(Array::default())),
"map" => Ok(Value::Map(Map::default())),
// We only consider object and array to be json types, and use Map to represent json.
// TODO(qtang): Needs to be defined with better semantics
"json" => Ok(Value::Map(Map::default())),
_ => ValueParseTypeSnafu { t }.fail(),
}
@@ -221,8 +225,7 @@ impl Value {
Value::Timestamp(_) => "epoch",
Value::Array(_) => "array",
Value::Map(_) => "map",
Value::Array(_) | Value::Map(_) => "json",
Value::Null => "null",
}
@@ -287,7 +290,7 @@ impl TryFrom<serde_json::Value> for Value {
Ok(Value::Array(Array { values }))
}
serde_json::Value::Object(v) => {
let mut values = HashMap::with_capacity(v.len());
let mut values = BTreeMap::new();
for (k, v) in v {
values.insert(k, Value::try_from(v)?);
}
@@ -318,7 +321,7 @@ impl TryFrom<&yaml_rust::Yaml> for Value {
Ok(Value::Array(Array { values }))
}
yaml_rust::Yaml::Hash(v) => {
let mut values = HashMap::new();
let mut values = BTreeMap::new();
for (k, v) in v {
let key = k
.as_str()
@@ -331,3 +334,79 @@ impl TryFrom<&yaml_rust::Yaml> for Value {
}
}
}
impl<'a> From<&Value> for JsonbValue<'a> {
fn from(value: &Value) -> Self {
match value {
Value::Null => JsonbValue::Null,
Value::Boolean(v) => JsonbValue::Bool(*v),
Value::Int8(v) => JsonbValue::Number(JsonbNumber::Int64(*v as i64)),
Value::Int16(v) => JsonbValue::Number(JsonbNumber::Int64(*v as i64)),
Value::Int32(v) => JsonbValue::Number(JsonbNumber::Int64(*v as i64)),
Value::Int64(v) => JsonbValue::Number(JsonbNumber::Int64(*v)),
Value::Uint8(v) => JsonbValue::Number(JsonbNumber::UInt64(*v as u64)),
Value::Uint16(v) => JsonbValue::Number(JsonbNumber::UInt64(*v as u64)),
Value::Uint32(v) => JsonbValue::Number(JsonbNumber::UInt64(*v as u64)),
Value::Uint64(v) => JsonbValue::Number(JsonbNumber::UInt64(*v)),
Value::Float32(v) => JsonbValue::Number(JsonbNumber::Float64(*v as f64)),
Value::Float64(v) => JsonbValue::Number(JsonbNumber::Float64(*v)),
Value::String(v) => JsonbValue::String(v.clone().into()),
Value::Timestamp(v) => JsonbValue::String(v.to_string().into()),
Value::Array(arr) => {
let mut vals: Vec<JsonbValue> = Vec::with_capacity(arr.len());
for val in arr.iter() {
vals.push(val.into());
}
JsonbValue::Array(vals)
}
Value::Map(obj) => {
let mut map = JsonbObject::new();
for (k, v) in obj.iter() {
let val: JsonbValue = v.into();
map.insert(k.to_string(), val);
}
JsonbValue::Object(map)
}
}
}
}
impl<'a> From<Value> for JsonbValue<'a> {
fn from(value: Value) -> Self {
match value {
Value::Null => JsonbValue::Null,
Value::Boolean(v) => JsonbValue::Bool(v),
Value::Int8(v) => JsonbValue::Number(JsonbNumber::Int64(v as i64)),
Value::Int16(v) => JsonbValue::Number(JsonbNumber::Int64(v as i64)),
Value::Int32(v) => JsonbValue::Number(JsonbNumber::Int64(v as i64)),
Value::Int64(v) => JsonbValue::Number(JsonbNumber::Int64(v)),
Value::Uint8(v) => JsonbValue::Number(JsonbNumber::UInt64(v as u64)),
Value::Uint16(v) => JsonbValue::Number(JsonbNumber::UInt64(v as u64)),
Value::Uint32(v) => JsonbValue::Number(JsonbNumber::UInt64(v as u64)),
Value::Uint64(v) => JsonbValue::Number(JsonbNumber::UInt64(v)),
Value::Float32(v) => JsonbValue::Number(JsonbNumber::Float64(v as f64)),
Value::Float64(v) => JsonbValue::Number(JsonbNumber::Float64(v)),
Value::String(v) => JsonbValue::String(v.into()),
Value::Timestamp(v) => JsonbValue::String(v.to_string().into()),
Value::Array(arr) => {
let mut vals: Vec<JsonbValue> = Vec::with_capacity(arr.len());
for val in arr.into_iter() {
vals.push(val.into());
}
JsonbValue::Array(vals)
}
Value::Map(obj) => {
let mut map = JsonbObject::new();
for (k, v) in obj.into_iter() {
let val: JsonbValue = v.into();
map.insert(k, val);
}
JsonbValue::Object(map)
}
}
}
}
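As a quick illustration of the switch to `BTreeMap` (a sketch assuming the re-exported `pipeline` API plus `serde_json` and `jsonb`): after the conversion chain `serde_json::Value` -> `Value` -> `jsonb::Value`, object keys come out in a deterministic, sorted order regardless of the input order.

use pipeline::Value;

fn main() {
    let json: serde_json::Value =
        serde_json::from_str(r#"{"b": 1, "a": "x", "c": [true, null]}"#).unwrap();
    let value = Value::try_from(json).unwrap();

    let data: jsonb::Value = (&value).into();
    let bytes = data.to_vec();

    // Keys are sorted ("a", "b", "c") because Map is now backed by a BTreeMap.
    assert_eq!(
        r#"{"a":"x","b":1,"c":[true,null]}"#,
        jsonb::from_slice(&bytes).unwrap().to_string()
    );
}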


@@ -12,21 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use ahash::{HashMap, HashMapExt};
use std::collections::BTreeMap;
use ahash::HashMap;
use crate::etl::value::Value;
#[derive(Debug, Clone, PartialEq)]
#[derive(Debug, Clone, PartialEq, Default)]
pub struct Map {
pub values: HashMap<String, Value>,
}
impl Default for Map {
fn default() -> Self {
Self {
values: HashMap::with_capacity(30),
}
}
pub values: BTreeMap<String, Value>,
}
impl Map {
@@ -47,12 +41,16 @@ impl Map {
impl From<HashMap<String, Value>> for Map {
fn from(values: HashMap<String, Value>) -> Self {
Map { values }
let mut map = Map::default();
for (k, v) in values.into_iter() {
map.insert(k, v);
}
map
}
}
impl std::ops::Deref for Map {
type Target = HashMap<String, Value>;
type Target = BTreeMap<String, Value>;
fn deref(&self) -> &Self::Target {
&self.values
@@ -65,6 +63,16 @@ impl std::ops::DerefMut for Map {
}
}
impl std::iter::IntoIterator for Map {
type Item = (String, Value);
type IntoIter = std::collections::btree_map::IntoIter<String, Value>;
fn into_iter(self) -> Self::IntoIter {
self.values.into_iter()
}
}
impl std::fmt::Display for Map {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let values = self


@@ -21,7 +21,7 @@ pub use etl::processor::Processor;
pub use etl::transform::transformer::identity_pipeline;
pub use etl::transform::{GreptimeTransformer, Transformer};
pub use etl::value::{Array, Map, Value};
pub use etl::{parse, Content, Pipeline};
pub use etl::{error as etl_error, parse, Content, Pipeline, PipelineWay};
pub use manager::{
error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef,
PipelineVersion,


@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::value::ValueData;
use api::v1::Rows;
use common_telemetry::tracing::info;
use greptime_proto::v1::value::ValueData::{
@@ -466,6 +467,57 @@ transform:
}
}
#[test]
fn test_json_type() {
let input_value_str = r#"
{
"product_object": {"hello":"world"},
"product_array": ["hello", "world"]
}
"#;
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str).unwrap();
let pipeline_yaml = r#"
processors:
transform:
- fields:
- product_object
- product_array
type: json
"#;
let yaml_content = Content::Yaml(pipeline_yaml.into());
let pipeline: Pipeline<GreptimeTransformer> = parse(&yaml_content).unwrap();
let mut status = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut status).unwrap();
let row = pipeline.exec_mut(&mut status).unwrap();
let r = row
.values
.into_iter()
.map(|v| v.value_data.unwrap())
.collect::<Vec<_>>();
let product_object = r[0].clone();
let product_array = r[1].clone();
match product_object {
ValueData::BinaryValue(data) => {
let jsonb = jsonb::from_slice(&data).unwrap().to_string();
assert_eq!(r#"{"hello":"world"}"#, jsonb);
}
_ => panic!("unexpected value"),
}
match product_array {
ValueData::BinaryValue(data) => {
let jsonb = jsonb::from_slice(&data).unwrap().to_string();
assert_eq!(r#"["hello","world"]"#, jsonb);
}
_ => panic!("unexpected value"),
}
}
#[test]
fn test_simple_data() {
let input_value_str = r#"


@@ -71,11 +71,15 @@ arrow.workspace = true
catalog = { workspace = true, features = ["testing"] }
common-macro.workspace = true
common-query = { workspace = true, features = ["testing"] }
fastrand = "2.0"
format_num = "0.1"
num = "0.4"
num-traits = "0.2"
paste = "1.0"
pretty_assertions = "1.4.0"
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
session = { workspace = true, features = ["testing"] }
statrs = "0.16"
stats-cli = "3.0"


@@ -36,6 +36,7 @@ mod range_select;
pub mod region_query;
pub mod sql;
pub mod stats;
mod window_sort;
#[cfg(test)]
mod tests;

src/query/src/window_sort.rs (new file, +3221 lines): diff suppressed because it is too large.


@@ -14,6 +14,7 @@ testing = []
workspace = true
[dependencies]
ahash = "0.8"
aide = { version = "0.9", features = ["axum"] }
api.workspace = true
arrow.workspace = true


@@ -531,6 +531,13 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("OpenTelemetry log error"))]
OpenTelemetryLog {
source: pipeline::etl_error::Error,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -595,7 +602,8 @@ impl ErrorExt for Error {
| MysqlValueConversion { .. }
| ParseJson { .. }
| UnsupportedContentType { .. }
| TimestampOverflow { .. } => StatusCode::InvalidArguments,
| TimestampOverflow { .. }
| OpenTelemetryLog { .. } => StatusCode::InvalidArguments,
Catalog { source, .. } => source.status_code(),
RowWriter { source, .. } => source.status_code(),


@@ -36,6 +36,7 @@ use common_time::timestamp::TimeUnit;
use common_time::Timestamp;
use datatypes::data_type::DataType;
use datatypes::schema::SchemaRef;
use datatypes::value::transform_value_ref_to_json_value;
use event::{LogState, LogValidatorRef};
use futures::FutureExt;
use schemars::JsonSchema;
@@ -241,14 +242,18 @@ impl HttpRecordsOutput {
} else {
let num_rows = recordbatches.iter().map(|r| r.num_rows()).sum::<usize>();
let mut rows = Vec::with_capacity(num_rows);
let schemas = schema.column_schemas();
let num_cols = schema.column_schemas().len();
rows.resize_with(num_rows, || Vec::with_capacity(num_cols));
let mut finished_row_cursor = 0;
for recordbatch in recordbatches {
for col in recordbatch.columns() {
for (col_idx, col) in recordbatch.columns().iter().enumerate() {
// Indexing is safe here: the schema length equals the number of columns in the recordbatch
let schema = &schemas[col_idx];
for row_idx in 0..recordbatch.num_rows() {
let value = Value::try_from(col.get_ref(row_idx)).context(ToJsonSnafu)?;
let value = transform_value_ref_to_json_value(col.get_ref(row_idx), schema)
.context(ToJsonSnafu)?;
rows[row_idx + finished_row_cursor].push(value);
}
}
@@ -882,6 +887,7 @@ impl HttpServer {
Router::new()
.route("/v1/metrics", routing::post(otlp::metrics))
.route("/v1/traces", routing::post(otlp::traces))
.route("/v1/logs", routing::post(otlp::logs))
.layer(
ServiceBuilder::new()
.layer(HandleErrorLayer::new(handle_error))


@@ -44,6 +44,9 @@ pub mod constants {
pub const GREPTIME_DB_HEADER_NAME: &str = "x-greptime-db-name";
pub const GREPTIME_TIMEZONE_HEADER_NAME: &str = "x-greptime-timezone";
pub const GREPTIME_DB_HEADER_ERROR_CODE: &str = common_error::GREPTIME_DB_HEADER_ERROR_CODE;
pub const GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME: &str = "x-greptime-log-pipeline-name";
pub const GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME: &str = "x-greptime-log-pipeline-version";
pub const GREPTIME_LOG_TABLE_NAME_HEADER_NAME: &str = "x-greptime-log-table-name";
}
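For context, a minimal sketch (assuming the `http` crate and the constants above) of the headers a client attaches when posting OTLP logs to `/v1/otlp/v1/logs`: the table-name header selects the target table, and the pipeline headers are only needed when a custom pipeline should process the logs. The pipeline name used here is a hypothetical placeholder.

use http::header::{HeaderMap, HeaderName, HeaderValue};

fn otlp_log_headers() -> HeaderMap {
    let mut headers = HeaderMap::new();
    headers.insert(
        HeaderName::from_static(constants::GREPTIME_LOG_TABLE_NAME_HEADER_NAME),
        HeaderValue::from_static("logs"),
    );
    // Optional: route the request through a named custom pipeline.
    headers.insert(
        HeaderName::from_static(constants::GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME),
        HeaderValue::from_static("my_pipeline"), // hypothetical pipeline name
    );
    headers
}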
pub static GREPTIME_DB_HEADER_FORMAT: HeaderName =


@@ -12,26 +12,39 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use core::str;
use std::result::Result as StdResult;
use std::sync::Arc;
use axum::extract::State;
use axum::http::header;
use axum::extract::{FromRequestParts, State};
use axum::http::header::HeaderValue;
use axum::http::request::Parts;
use axum::http::{header, StatusCode};
use axum::response::IntoResponse;
use axum::Extension;
use axum::{async_trait, Extension};
use bytes::Bytes;
use common_telemetry::tracing;
use opentelemetry_proto::tonic::collector::logs::v1::{
ExportLogsServiceRequest, ExportLogsServiceResponse,
};
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use pipeline::util::to_pipeline_version;
use pipeline::PipelineWay;
use prost::Message;
use session::context::{Channel, QueryContext};
use snafu::prelude::*;
use super::header::{write_cost_header_map, CONTENT_TYPE_PROTOBUF};
use crate::error::{self, Result};
use crate::http::header::constants::{
GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME,
GREPTIME_LOG_TABLE_NAME_HEADER_NAME,
};
use crate::query_handler::OpenTelemetryProtocolHandlerRef;
#[axum_macros::debug_handler]
@@ -39,8 +52,9 @@ use crate::query_handler::OpenTelemetryProtocolHandlerRef;
pub async fn metrics(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(mut query_ctx): Extension<QueryContext>,
bytes: Bytes,
) -> Result<OtlpMetricsResponse> {
) -> Result<OtlpResponse<ExportMetricsServiceResponse>> {
let db = query_ctx.get_db_string();
query_ctx.set_channel(Channel::Otlp);
let query_ctx = Arc::new(query_ctx);
@@ -53,7 +67,7 @@ pub async fn metrics(
handler
.metrics(request, query_ctx)
.await
.map(|o| OtlpMetricsResponse {
.map(|o| OtlpResponse {
resp_body: ExportMetricsServiceResponse {
partial_success: None,
},
@@ -61,27 +75,13 @@ pub async fn metrics(
})
}
pub struct OtlpMetricsResponse {
resp_body: ExportMetricsServiceResponse,
write_cost: usize,
}
impl IntoResponse for OtlpMetricsResponse {
fn into_response(self) -> axum::response::Response {
let mut header_map = write_cost_header_map(self.write_cost);
header_map.insert(header::CONTENT_TYPE, CONTENT_TYPE_PROTOBUF.clone());
(header_map, self.resp_body.encode_to_vec()).into_response()
}
}
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "traces"))]
pub async fn traces(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(mut query_ctx): Extension<QueryContext>,
bytes: Bytes,
) -> Result<OtlpTracesResponse> {
) -> Result<OtlpResponse<ExportTraceServiceResponse>> {
let db = query_ctx.get_db_string();
query_ctx.set_channel(Channel::Otlp);
let query_ctx = Arc::new(query_ctx);
@@ -93,7 +93,7 @@ pub async fn traces(
handler
.traces(request, query_ctx)
.await
.map(|o| OtlpTracesResponse {
.map(|o| OtlpResponse {
resp_body: ExportTraceServiceResponse {
partial_success: None,
},
@@ -101,12 +101,143 @@ pub async fn traces(
})
}
pub struct OtlpTracesResponse {
resp_body: ExportTraceServiceResponse,
pub struct PipelineInfo {
pub pipeline_name: Option<String>,
pub pipeline_version: Option<String>,
}
fn pipeline_header_error(
header: &HeaderValue,
key: &str,
) -> StdResult<String, (http::StatusCode, String)> {
let header_utf8 = str::from_utf8(header.as_bytes());
match header_utf8 {
Ok(s) => Ok(s.to_string()),
Err(_) => Err((
StatusCode::BAD_REQUEST,
format!("`{}` header is not a valid UTF-8 string.", key),
)),
}
}
#[async_trait]
impl<S> FromRequestParts<S> for PipelineInfo
where
S: Send + Sync,
{
type Rejection = (StatusCode, String);
async fn from_request_parts(parts: &mut Parts, _state: &S) -> StdResult<Self, Self::Rejection> {
let pipeline_name = parts.headers.get(GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME);
let pipeline_version = parts.headers.get(GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME);
match (pipeline_name, pipeline_version) {
(Some(name), Some(version)) => Ok(PipelineInfo {
pipeline_name: Some(pipeline_header_error(
name,
GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME,
)?),
pipeline_version: Some(pipeline_header_error(
version,
GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME,
)?),
}),
(None, _) => Ok(PipelineInfo {
pipeline_name: None,
pipeline_version: None,
}),
(Some(name), None) => Ok(PipelineInfo {
pipeline_name: Some(pipeline_header_error(
name,
GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME,
)?),
pipeline_version: None,
}),
}
}
}
pub struct TableInfo {
table_name: String,
}
#[async_trait]
impl<S> FromRequestParts<S> for TableInfo
where
S: Send + Sync,
{
type Rejection = (StatusCode, String);
async fn from_request_parts(parts: &mut Parts, _state: &S) -> StdResult<Self, Self::Rejection> {
let table_name = parts.headers.get(GREPTIME_LOG_TABLE_NAME_HEADER_NAME);
match table_name {
Some(name) => Ok(TableInfo {
table_name: pipeline_header_error(name, GREPTIME_LOG_TABLE_NAME_HEADER_NAME)?,
}),
None => Ok(TableInfo {
table_name: "opentelemetry_logs".to_string(),
}),
}
}
}
#[axum_macros::debug_handler]
#[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "logs"))]
pub async fn logs(
State(handler): State<OpenTelemetryProtocolHandlerRef>,
Extension(mut query_ctx): Extension<QueryContext>,
pipeline_info: PipelineInfo,
table_info: TableInfo,
bytes: Bytes,
) -> Result<OtlpResponse<ExportLogsServiceResponse>> {
let db = query_ctx.get_db_string();
query_ctx.set_channel(Channel::Otlp);
let query_ctx = Arc::new(query_ctx);
let _timer = crate::metrics::METRIC_HTTP_OPENTELEMETRY_LOGS_ELAPSED
.with_label_values(&[db.as_str()])
.start_timer();
let request = ExportLogsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?;
let pipeline_way = if let Some(pipeline_name) = &pipeline_info.pipeline_name {
let pipeline_version =
to_pipeline_version(pipeline_info.pipeline_version).map_err(|_| {
error::InvalidParameterSnafu {
reason: GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME,
}
.build()
})?;
let pipeline = handler
.get_pipeline(pipeline_name, pipeline_version, query_ctx.clone())
.await?;
PipelineWay::Custom(pipeline)
} else {
PipelineWay::Identity
};
handler
.logs(request, pipeline_way, table_info.table_name, query_ctx)
.await
.map(|o| OtlpResponse {
resp_body: ExportLogsServiceResponse {
partial_success: None,
},
write_cost: o.meta.cost,
})
}
pub struct OtlpResponse<T: Message> {
resp_body: T,
write_cost: usize,
}
impl IntoResponse for OtlpTracesResponse {
impl<T: Message> IntoResponse for OtlpResponse<T> {
fn into_response(self) -> axum::response::Response {
let mut header_map = write_cost_header_map(self.write_cost);
header_map.insert(header::CONTENT_TYPE, CONTENT_TYPE_PROTOBUF.clone());


@@ -141,6 +141,13 @@ lazy_static! {
&[METRIC_DB_LABEL]
)
.unwrap();
pub static ref METRIC_HTTP_OPENTELEMETRY_LOGS_ELAPSED: HistogramVec =
register_histogram_vec!(
"greptime_servers_http_otlp_logs_elapsed",
"servers http otlp logs elapsed",
&[METRIC_DB_LABEL]
)
.unwrap();
pub static ref METRIC_HTTP_LOGS_INGESTION_COUNTER: IntCounterVec = register_int_counter_vec!(
"greptime_servers_http_logs_ingestion_counter",
"servers http logs ingestion counter",


@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod logs;
pub mod metrics;
pub mod plugin;
pub mod trace;


@@ -0,0 +1,506 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{BTreeMap, HashMap};
use api::v1::column_data_type_extension::TypeExt;
use api::v1::value::ValueData;
use api::v1::{
ColumnDataType, ColumnDataTypeExtension, ColumnOptions, ColumnSchema, JsonTypeExtension, Row,
RowInsertRequest, RowInsertRequests, Rows, SemanticType, Value as GreptimeValue,
};
use jsonb::{Number as JsonbNumber, Value as JsonbValue};
use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue};
use opentelemetry_proto::tonic::logs::v1::LogRecord;
use pipeline::{Array, Map, PipelineWay, Value as PipelineValue};
use snafu::ResultExt;
use super::trace::attributes::OtlpAnyValue;
use crate::error::{OpenTelemetryLogSnafu, Result};
use crate::otlp::trace::span::bytes_to_hex_string;
/// Convert OpenTelemetry logs to GreptimeDB insert requests
///
/// See
/// <https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/logs/v1/logs.proto>
/// for data structure of OTLP logs.
///
/// Returns `InsertRequests` and total number of rows to ingest
pub fn to_grpc_insert_requests(
request: ExportLogsServiceRequest,
pipeline: PipelineWay,
table_name: String,
) -> Result<(RowInsertRequests, usize)> {
match pipeline {
PipelineWay::Identity => {
let rows = parse_export_logs_service_request_to_rows(request);
let len = rows.rows.len();
let insert_request = RowInsertRequest {
rows: Some(rows),
table_name,
};
Ok((
RowInsertRequests {
inserts: vec![insert_request],
},
len,
))
}
PipelineWay::Custom(p) => {
let request = parse_export_logs_service_request(request);
let mut result = Vec::new();
let mut intermediate_state = p.init_intermediate_state();
for v in request {
p.prepare_pipeline_value(v, &mut intermediate_state)
.context(OpenTelemetryLogSnafu)?;
let r = p
.exec_mut(&mut intermediate_state)
.context(OpenTelemetryLogSnafu)?;
result.push(r);
}
let len = result.len();
let rows = Rows {
schema: p.schemas().clone(),
rows: result,
};
let insert_request = RowInsertRequest {
rows: Some(rows),
table_name,
};
let insert_requests = RowInsertRequests {
inserts: vec![insert_request],
};
Ok((insert_requests, len))
}
}
}
fn scope_to_pipeline_value(
scope: Option<InstrumentationScope>,
) -> (PipelineValue, PipelineValue, PipelineValue) {
scope
.map(|x| {
(
PipelineValue::Map(Map {
values: key_value_to_map(x.attributes),
}),
PipelineValue::String(x.version),
PipelineValue::String(x.name),
)
})
.unwrap_or((
PipelineValue::Null,
PipelineValue::Null,
PipelineValue::Null,
))
}
fn scope_to_jsonb(
scope: Option<InstrumentationScope>,
) -> (JsonbValue<'static>, Option<String>, Option<String>) {
scope
.map(|x| {
(
key_value_to_jsonb(x.attributes),
Some(x.version),
Some(x.name),
)
})
.unwrap_or((JsonbValue::Null, None, None))
}
fn log_to_pipeline_value(
log: LogRecord,
resource_schema_url: PipelineValue,
resource_attr: PipelineValue,
scope_schema_url: PipelineValue,
scope_name: PipelineValue,
scope_version: PipelineValue,
scope_attrs: PipelineValue,
) -> PipelineValue {
let log_attrs = PipelineValue::Map(Map {
values: key_value_to_map(log.attributes),
});
let mut map = BTreeMap::new();
map.insert(
"Timestamp".to_string(),
PipelineValue::Uint64(log.time_unix_nano),
);
map.insert(
"ObservedTimestamp".to_string(),
PipelineValue::Uint64(log.observed_time_unix_nano),
);
// needs to be converted to a string
map.insert(
"TraceId".to_string(),
PipelineValue::String(bytes_to_hex_string(&log.trace_id)),
);
map.insert(
"SpanId".to_string(),
PipelineValue::String(bytes_to_hex_string(&log.span_id)),
);
map.insert("TraceFlags".to_string(), PipelineValue::Uint32(log.flags));
map.insert(
"SeverityText".to_string(),
PipelineValue::String(log.severity_text),
);
map.insert(
"SeverityNumber".to_string(),
PipelineValue::Int32(log.severity_number),
);
// needs to be converted to a string
map.insert(
"Body".to_string(),
log.body
.as_ref()
.map(|x| PipelineValue::String(log_body_to_string(x)))
.unwrap_or(PipelineValue::Null),
);
map.insert("ResourceSchemaUrl".to_string(), resource_schema_url);
map.insert("ResourceAttributes".to_string(), resource_attr);
map.insert("ScopeSchemaUrl".to_string(), scope_schema_url);
map.insert("ScopeName".to_string(), scope_name);
map.insert("ScopeVersion".to_string(), scope_version);
map.insert("ScopeAttributes".to_string(), scope_attrs);
map.insert("LogAttributes".to_string(), log_attrs);
PipelineValue::Map(Map { values: map })
}
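The keys produced above (`Timestamp`, `Body`, `SeverityText`, and so on) are what a custom pipeline (`PipelineWay::Custom`) operates on. Below is a minimal sketch of building such a pipeline with the `parse`/`Content` API shown in the pipeline tests earlier in this change set; the YAML and the selected fields are illustrative only.

use pipeline::{parse, Content, GreptimeTransformer, Pipeline};

#[test]
fn build_custom_otlp_log_pipeline() {
    // Keep only two of the keys produced by `log_to_pipeline_value`.
    let pipeline_yaml = r#"
processors:
transform:
  - fields:
      - Body
      - SeverityText
    type: string
"#;
    let pipeline: Pipeline<GreptimeTransformer> =
        parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
    assert!(!pipeline.schemas().is_empty());
}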
fn build_otlp_logs_identity_schema() -> Vec<ColumnSchema> {
[
(
"scope_name",
ColumnDataType::String,
SemanticType::Tag,
None,
None,
),
(
"scope_version",
ColumnDataType::String,
SemanticType::Field,
None,
None,
),
(
"scope_attributes",
ColumnDataType::Binary,
SemanticType::Field,
Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
None,
),
(
"resource_attributes",
ColumnDataType::Binary,
SemanticType::Field,
Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
None,
),
(
"log_attributes",
ColumnDataType::Binary,
SemanticType::Field,
Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
None,
),
(
"timestamp",
ColumnDataType::TimestampNanosecond,
SemanticType::Timestamp,
None,
None,
),
(
"observed_timestamp",
ColumnDataType::TimestampNanosecond,
SemanticType::Field,
None,
None,
),
(
"trace_id",
ColumnDataType::String,
SemanticType::Tag,
None,
None,
),
(
"span_id",
ColumnDataType::String,
SemanticType::Tag,
None,
None,
),
(
"trace_flags",
ColumnDataType::Uint32,
SemanticType::Field,
None,
None,
),
(
"severity_text",
ColumnDataType::String,
SemanticType::Field,
None,
None,
),
(
"severity_number",
ColumnDataType::Int32,
SemanticType::Field,
None,
None,
),
(
"body",
ColumnDataType::String,
SemanticType::Field,
None,
Some(ColumnOptions {
options: HashMap::from([(
"fulltext".to_string(),
r#"{"enable":true}"#.to_string(),
)]),
}),
),
]
.into_iter()
.map(
|(field_name, column_type, semantic_type, datatype_extension, options)| ColumnSchema {
column_name: field_name.to_string(),
datatype: column_type as i32,
semantic_type: semantic_type as i32,
datatype_extension,
options,
},
)
.collect::<Vec<ColumnSchema>>()
}
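To make the identity schema above concrete, here is a sketch written as if it sat in this module's tests (the helper is private, so it is illustrative only): thirteen columns, with the `body` column carrying the default fulltext option.

#[test]
fn identity_schema_marks_body_as_fulltext() {
    let schema = build_otlp_logs_identity_schema();
    assert_eq!(13, schema.len());

    let body = schema
        .iter()
        .find(|c| c.column_name == "body")
        .expect("body column");
    let options = body.options.as_ref().expect("body column options");
    assert_eq!(
        Some(&r#"{"enable":true}"#.to_string()),
        options.options.get("fulltext")
    );
}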
fn build_identity_row(
log: LogRecord,
resource_attr: JsonbValue<'_>,
scope_name: Option<String>,
scope_version: Option<String>,
scope_attrs: JsonbValue<'_>,
) -> Row {
let row = vec![
GreptimeValue {
value_data: scope_name.map(ValueData::StringValue),
},
GreptimeValue {
value_data: scope_version.map(ValueData::StringValue),
},
GreptimeValue {
value_data: Some(ValueData::BinaryValue(scope_attrs.to_vec())),
},
GreptimeValue {
value_data: Some(ValueData::BinaryValue(resource_attr.to_vec())),
},
GreptimeValue {
value_data: Some(ValueData::BinaryValue(
key_value_to_jsonb(log.attributes).to_vec(),
)),
},
GreptimeValue {
value_data: Some(ValueData::TimestampNanosecondValue(
log.time_unix_nano as i64,
)),
},
GreptimeValue {
value_data: Some(ValueData::TimestampNanosecondValue(
log.observed_time_unix_nano as i64,
)),
},
GreptimeValue {
value_data: Some(ValueData::StringValue(bytes_to_hex_string(&log.trace_id))),
},
GreptimeValue {
value_data: Some(ValueData::StringValue(bytes_to_hex_string(&log.span_id))),
},
GreptimeValue {
value_data: Some(ValueData::U32Value(log.flags)),
},
GreptimeValue {
value_data: Some(ValueData::StringValue(log.severity_text)),
},
GreptimeValue {
value_data: Some(ValueData::I32Value(log.severity_number)),
},
GreptimeValue {
value_data: log
.body
.as_ref()
.map(|x| ValueData::StringValue(log_body_to_string(x))),
},
];
Row { values: row }
}
fn parse_export_logs_service_request_to_rows(request: ExportLogsServiceRequest) -> Rows {
let mut result = Vec::new();
for r in request.resource_logs {
let resource_attr = r
.resource
.map(|x| key_value_to_jsonb(x.attributes))
.unwrap_or(JsonbValue::Null);
for scope_logs in r.scope_logs {
let (scope_attrs, scope_version, scope_name) = scope_to_jsonb(scope_logs.scope);
for log in scope_logs.log_records {
let value = build_identity_row(
log,
resource_attr.clone(),
scope_name.clone(),
scope_version.clone(),
scope_attrs.clone(),
);
result.push(value);
}
}
}
Rows {
schema: build_otlp_logs_identity_schema(),
rows: result,
}
}
/// Transform an OTLP logs request into pipeline values.
/// See <https://opentelemetry.io/docs/concepts/signals/logs/> for the log data model.
fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec<PipelineValue> {
let mut result = Vec::new();
for r in request.resource_logs {
let resource_attr = r
.resource
.map(|x| {
PipelineValue::Map(Map {
values: key_value_to_map(x.attributes),
})
})
.unwrap_or(PipelineValue::Null);
let resource_schema_url = PipelineValue::String(r.schema_url);
for scope_logs in r.scope_logs {
let (scope_attrs, scope_version, scope_name) =
scope_to_pipeline_value(scope_logs.scope);
let scope_schema_url = PipelineValue::String(scope_logs.schema_url);
for log in scope_logs.log_records {
let value = log_to_pipeline_value(
log,
resource_schema_url.clone(),
resource_attr.clone(),
scope_schema_url.clone(),
scope_name.clone(),
scope_version.clone(),
scope_attrs.clone(),
);
result.push(value);
}
}
}
result
}
// convert AnyValue to pipeline value
fn any_value_to_pipeline_value(value: any_value::Value) -> PipelineValue {
match value {
any_value::Value::StringValue(s) => PipelineValue::String(s),
any_value::Value::IntValue(i) => PipelineValue::Int64(i),
any_value::Value::DoubleValue(d) => PipelineValue::Float64(d),
any_value::Value::BoolValue(b) => PipelineValue::Boolean(b),
any_value::Value::ArrayValue(a) => {
let values = a
.values
.into_iter()
.map(|v| match v.value {
Some(value) => any_value_to_pipeline_value(value),
None => PipelineValue::Null,
})
.collect();
PipelineValue::Array(Array { values })
}
any_value::Value::KvlistValue(kv) => {
let value = key_value_to_map(kv.values);
PipelineValue::Map(Map { values: value })
}
any_value::Value::BytesValue(b) => PipelineValue::String(bytes_to_hex_string(&b)),
}
}
// Convert an OTLP `KeyValue` list into a map of pipeline values
fn key_value_to_map(key_values: Vec<KeyValue>) -> BTreeMap<String, PipelineValue> {
let mut map = BTreeMap::new();
for kv in key_values {
let value = match kv.value {
Some(value) => match value.value {
Some(value) => any_value_to_pipeline_value(value),
None => PipelineValue::Null,
},
None => PipelineValue::Null,
};
map.insert(kv.key.clone(), value);
}
map
}
fn any_value_to_jsonb(value: any_value::Value) -> JsonbValue<'static> {
match value {
any_value::Value::StringValue(s) => JsonbValue::String(s.into()),
any_value::Value::IntValue(i) => JsonbValue::Number(JsonbNumber::Int64(i)),
any_value::Value::DoubleValue(d) => JsonbValue::Number(JsonbNumber::Float64(d)),
any_value::Value::BoolValue(b) => JsonbValue::Bool(b),
any_value::Value::ArrayValue(a) => {
let values = a
.values
.into_iter()
.map(|v| match v.value {
Some(value) => any_value_to_jsonb(value),
None => JsonbValue::Null,
})
.collect();
JsonbValue::Array(values)
}
any_value::Value::KvlistValue(kv) => key_value_to_jsonb(kv.values),
any_value::Value::BytesValue(b) => JsonbValue::String(bytes_to_hex_string(&b).into()),
}
}
fn key_value_to_jsonb(key_values: Vec<KeyValue>) -> JsonbValue<'static> {
let mut map = BTreeMap::new();
for kv in key_values {
let value = match kv.value {
Some(value) => match value.value {
Some(value) => any_value_to_jsonb(value),
None => JsonbValue::Null,
},
None => JsonbValue::Null,
};
map.insert(kv.key.clone(), value);
}
JsonbValue::Object(map)
}
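A sketch of the attribute conversion above, again written as if it lived in this module's tests (`key_value_to_jsonb` is private); the attribute values mirror those used in the integration test at the end of this change set.

use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, KeyValue};

#[test]
fn key_values_to_jsonb_object() {
    let attrs = vec![
        KeyValue {
            key: "app".to_string(),
            value: Some(AnyValue {
                value: Some(any_value::Value::StringValue("server".to_string())),
            }),
        },
        KeyValue {
            key: "instance_num".to_string(),
            value: Some(AnyValue {
                value: Some(any_value::Value::IntValue(1)),
            }),
        },
    ];

    let bytes = key_value_to_jsonb(attrs).to_vec();
    assert_eq!(
        r#"{"app":"server","instance_num":1}"#,
        jsonb::from_slice(&bytes).unwrap().to_string()
    );
}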
fn log_body_to_string(body: &AnyValue) -> String {
let otlp_value = OtlpAnyValue::from(body);
otlp_value.to_string()
}


@@ -33,9 +33,10 @@ use api::v1::RowInsertRequests;
use async_trait::async_trait;
use common_query::Output;
use headers::HeaderValue;
use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion};
use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, PipelineWay};
use serde_json::Value;
use session::context::QueryContextRef;
@@ -105,7 +106,7 @@ pub trait PromStoreProtocolHandler {
}
#[async_trait]
pub trait OpenTelemetryProtocolHandler {
pub trait OpenTelemetryProtocolHandler: LogHandler {
/// Handling opentelemetry metrics request
async fn metrics(
&self,
@@ -119,6 +120,14 @@ pub trait OpenTelemetryProtocolHandler {
request: ExportTraceServiceRequest,
ctx: QueryContextRef,
) -> Result<Output>;
async fn logs(
&self,
request: ExportLogsServiceRequest,
pipeline: PipelineWay,
table_name: String,
ctx: QueryContextRef,
) -> Result<Output>;
}
/// LogHandler is responsible for handling log related requests.


@@ -180,7 +180,7 @@ impl ScannerPartitioning {
pub struct PartitionRange {
/// Start time of time index column. Inclusive.
pub start: Timestamp,
/// End time of time index column. Inclusive.
/// End time of time index column. Exclusive.
pub end: Timestamp,
/// Number of rows in this range. Is used to balance ranges between partitions.
pub num_rows: usize,


@@ -639,7 +639,7 @@ impl From<v1::ChangeColumnType> for ChangeColumnType {
}
}
#[derive(Debug, Default)]
#[derive(Debug, Clone, Default)]
pub struct RegionFlushRequest {
pub row_group_size: Option<usize>,
}


@@ -72,7 +72,7 @@ pub struct GreptimeDbCluster {
pub datanode_instances: HashMap<DatanodeId, Datanode>,
pub kv_backend: KvBackendRef,
pub metasrv: Metasrv,
pub metasrv: Arc<Metasrv>,
pub frontend: Arc<FeInstance>,
}


@@ -17,10 +17,11 @@ use std::io::Write;
use api::prom_store::remote::WriteRequest;
use auth::user_provider_from_option;
use axum::http::{HeaderName, StatusCode};
use axum::http::{HeaderName, HeaderValue, StatusCode};
use common_error::status_code::StatusCode as ErrorCode;
use flate2::write::GzEncoder;
use flate2::Compression;
use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use opentelemetry_proto::tonic::metrics::v1::ResourceMetrics;
@@ -90,6 +91,7 @@ macro_rules! http_tests {
test_otlp_metrics,
test_otlp_traces,
test_otlp_logs,
);
)*
};
@@ -1520,7 +1522,7 @@ pub async fn test_otlp_metrics(store_type: StorageType) {
let client = TestClient::new(app);
// write metrics data
let res = send_req(&client, "/v1/otlp/v1/metrics", body.clone(), false).await;
let res = send_req(&client, vec![], "/v1/otlp/v1/metrics", body.clone(), false).await;
assert_eq!(StatusCode::OK, res.status());
// select metrics data
@@ -1532,7 +1534,7 @@ pub async fn test_otlp_metrics(store_type: StorageType) {
assert_eq!(res.status(), StatusCode::OK);
// write metrics data with gzip
let res = send_req(&client, "/v1/otlp/v1/metrics", body.clone(), true).await;
let res = send_req(&client, vec![], "/v1/otlp/v1/metrics", body.clone(), true).await;
assert_eq!(StatusCode::OK, res.status());
// select metrics data again
@@ -1557,7 +1559,7 @@ pub async fn test_otlp_traces(store_type: StorageType) {
let client = TestClient::new(app);
// write traces data
let res = send_req(&client, "/v1/otlp/v1/traces", body.clone(), false).await;
let res = send_req(&client, vec![], "/v1/otlp/v1/traces", body.clone(), false).await;
assert_eq!(StatusCode::OK, res.status());
// select traces data
@@ -1572,7 +1574,7 @@ pub async fn test_otlp_traces(store_type: StorageType) {
assert_eq!(res.status(), StatusCode::OK);
// write metrics data with gzip
let res = send_req(&client, "/v1/otlp/v1/traces", body.clone(), true).await;
let res = send_req(&client, vec![], "/v1/otlp/v1/traces", body.clone(), true).await;
assert_eq!(StatusCode::OK, res.status());
// select metrics data again
@@ -1581,6 +1583,40 @@ pub async fn test_otlp_traces(store_type: StorageType) {
guard.remove_all().await;
}
pub async fn test_otlp_logs(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_otlp_logs").await;
let content = r#"
{"resourceLogs":[{"resource":{"attributes":[{"key":"resource-attr","value":{"stringValue":"resource-attr-val-1"}}]},"schemaUrl":"https://opentelemetry.io/schemas/1.0.0/resourceLogs","scopeLogs":[{"scope":{},"schemaUrl":"https://opentelemetry.io/schemas/1.0.0/scopeLogs","logRecords":[{"flags":1,"timeUnixNano":1581452773000009875,"observedTimeUnixNano":1581452773000009875,"severityNumber":9,"severityText":"Info","body":{"value":{"stringValue":"This is a log message"}},"attributes":[{"key":"app","value":{"stringValue":"server"}},{"key":"instance_num","value":{"intValue":1}}],"droppedAttributesCount":1,"traceId":[48,56,48,52,48,50,48,49,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48],"spanId":[48,49,48,50,48,52,48,56,48,48,48,48,48,48,48,48]},{"flags":1,"timeUnixNano":1581452773000000789,"observedTimeUnixNano":1581452773000000789,"severityNumber":9,"severityText":"Info","body":{"value":{"stringValue":"something happened"}},"attributes":[{"key":"customer","value":{"stringValue":"acme"}},{"key":"env","value":{"stringValue":"dev"}}],"droppedAttributesCount":1,"traceId":[48],"spanId":[48]}]}]}]}
"#;
let req: ExportLogsServiceRequest = serde_json::from_str(content).unwrap();
let body = req.encode_to_vec();
// handshake
let client = TestClient::new(app);
// write logs data
let res = send_req(
&client,
vec![(
HeaderName::from_static("x-greptime-log-table-name"),
HeaderValue::from_static("logs"),
)],
"/v1/otlp/v1/logs?db=public",
body.clone(),
false,
)
.await;
assert_eq!(StatusCode::OK, res.status());
let expected = r#"[["","",{},{"resource-attr":"resource-attr-val-1"},{"customer":"acme","env":"dev"},1581452773000000789,1581452773000000789,"30","30",1,"Info",9,"something happened"],["","",{},{"resource-attr":"resource-attr-val-1"},{"app":"server","instance_num":1},1581452773000009875,1581452773000009875,"3038303430323031303030303030303030303030303030303030303030303030","30313032303430383030303030303030",1,"Info",9,"This is a log message"]]"#;
validate_data(&client, "select * from logs;", expected).await;
guard.remove_all().await;
}
async fn validate_data(client: &TestClient, sql: &str, expected: &str) {
let res = client
.get(format!("/v1/sql?sql={sql}").as_str())
@@ -1593,11 +1629,21 @@ async fn validate_data(client: &TestClient, sql: &str, expected: &str) {
assert_eq!(v, expected);
}
async fn send_req(client: &TestClient, path: &str, body: Vec<u8>, with_gzip: bool) -> TestResponse {
async fn send_req(
client: &TestClient,
headers: Vec<(HeaderName, HeaderValue)>,
path: &str,
body: Vec<u8>,
with_gzip: bool,
) -> TestResponse {
let mut req = client
.post(path)
.header("content-type", "application/x-protobuf");
for (k, v) in headers {
req = req.header(k, v);
}
let mut len = body.len();
if with_gzip {