Compare commits

...

37 Commits

Author SHA1 Message Date
niebayes
122b47210e chore: bump version to 0.5.1 (#3116) 2024-01-08 11:32:56 +00:00
tison
316d843482 feat: support CSV format in sql HTTP API (#3062)
* chore: fix typo

Signed-off-by: tison <wander4096@gmail.com>

* add csv format

Signed-off-by: tison <wander4096@gmail.com>

* flatten response

Signed-off-by: tison <wander4096@gmail.com>

* more flatten response

Signed-off-by: tison <wander4096@gmail.com>

* add CSV format

Signed-off-by: tison <wander4096@gmail.com>

* format InfluxdbV1Response

Signed-off-by: tison <wander4096@gmail.com>

* format ErrorResponse

Signed-off-by: tison <wander4096@gmail.com>

* propagate ErrorResponse to InfluxdbV1Response

Signed-off-by: tison <wander4096@gmail.com>

* format GreptimedbV1Response

Signed-off-by: tison <wander4096@gmail.com>

* format CsvResponse

Signed-off-by: tison <wander4096@gmail.com>

* impl IntoResponse for QueryResponse

Signed-off-by: tison <wander4096@gmail.com>

* promql

Signed-off-by: tison <wander4096@gmail.com>

* sql

Signed-off-by: tison <wander4096@gmail.com>

* compile

Signed-off-by: tison <wander4096@gmail.com>

* fixup aide

Signed-off-by: tison <wander4096@gmail.com>

* clear debt

Signed-off-by: tison <wander4096@gmail.com>

* fixup UT test_recordbatches_conversion

Signed-off-by: tison <wander4096@gmail.com>

* fixup IT cases

Signed-off-by: tison <wander4096@gmail.com>

* fixup more IT cases

Signed-off-by: tison <wander4096@gmail.com>

* fixup test-integration cases

Signed-off-by: tison <wander4096@gmail.com>

* update comment

Signed-off-by: tison <wander4096@gmail.com>

* fixup deserialize and most query < 1ms

Signed-off-by: tison <wander4096@gmail.com>

* fixup auth tests

Signed-off-by: tison <wander4096@gmail.com>

* fixup tests

Signed-off-by: tison <wander4096@gmail.com>

* fixup and align X-GreptimeDB headers

Signed-off-by: tison <wander4096@gmail.com>

* fixup compile

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-01-08 10:54:27 +00:00
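For context, a minimal client-side sketch of how the CSV output added by #3062 could be requested over the sql HTTP API. The endpoint path, port, query parameters, and the `monitor` table below are assumptions for illustration, not taken from this diff; the sketch uses the `reqwest` crate with its "blocking" feature.

    // Hypothetical usage of a `format=csv` option on the /v1/sql endpoint
    // (parameter names assumed, not confirmed by this diff).
    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let client = reqwest::blocking::Client::new();
        let csv = client
            .post("http://127.0.0.1:4000/v1/sql?db=public&format=csv")
            .form(&[("sql", "SELECT * FROM monitor LIMIT 3")])
            .send()?
            .text()?;
        // With CSV output, the body is plain rows instead of the default
        // GreptimeDB v1 JSON envelope.
        println!("{csv}");
        Ok(())
    }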
niebayes
8c58d3f85b test(remote_wal): add unit tests for kafka remote wal (#2993)
* test: add unit tests

* feat: introduce kafka runtime backed by testcontainers

* test: add test for kafka runtime

* fix: format

* chore: make kafka image ready to be used

* feat: add entry builder

* tmp

* test: add unit tests for client manager

* test: add some unit tests for kafka log store

* chore: resolve some todos

* chore: resolve some todos

* test: add unit tests for kafka log store

* chore: add deprecate develop branch warning

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* tmp: ready to move unit tests to an indie dir

* test: update unit tests for client manager

* test: add unit tests for meta srv remote wal

* fix: license

* fix: test

* refactor: kafka image

* doc: add doc example for kafka image

* chore: migrate kafka image to an indie PR

* fix: CR

* fix: CR

* fix: test

* fix: CR

* fix: update Cargo.toml

* fix: CR

* feat: skip test if no endpoints env

* fix: format

* test: rewrite parallel test with barrier

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2024-01-08 10:48:11 +00:00
LFC
fcacb100a2 chore: expose some codes to let other projects use them (#3115) 2024-01-08 06:32:01 +00:00
Weny Xu
58ada1dfef fix: check env before running kafka test (#3110)
* fix: check env before running kafka test

* Apply suggestions from code review

Co-authored-by: niebayes <niebayes@gmail.com>

---------

Co-authored-by: niebayes <niebayes@gmail.com>
2024-01-08 06:30:43 +00:00
Weny Xu
f78c467a86 chore: bump opendal to 0.44.1 (#3111) 2024-01-08 03:55:58 +00:00
niebayes
78303639db feat(remote_wal): split an entry if it's too large (#3092)
* feat: split an entry if it's too large

* chore: rewrite check records

* test: add some unit tests for record

* chore: rewrite entry splitting

* chore: add unit tests for build records

* chore: add more unit tests for record

* chore: rewrite encdec of record

* revert: ignored test

* fix: set limit for max_batch_size

* fix: clippy

* chore: remove heavy logging

* fix: CR

* fix: properly terminate

* fix: CR

* fix: compiling

* fix: sqlness

* fix: CR

* fix: license

* fix: license
2024-01-05 12:41:43 +00:00
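As a rough, hypothetical illustration of the idea behind #3092 (not the actual log-store code): an entry larger than the producer's record limit is cut into several chunks, each tagged with a sequence number and the total chunk count so the entry can be reassembled on read.

    /// Splits an oversized entry into (sequence, total, payload) records.
    fn split_entry(data: &[u8], max_record_size: usize) -> Vec<(usize, usize, &[u8])> {
        let total = data.chunks(max_record_size).count();
        data.chunks(max_record_size)
            .enumerate()
            .map(|(seq, chunk)| (seq, total, chunk))
            .collect()
    }

    fn main() {
        let entry = vec![0u8; 2_500_000]; // a 2.5 MB entry
        // Assume a 1 MB record limit, mirroring Kafka's default message size cap.
        let records = split_entry(&entry, 1_000_000);
        assert_eq!(records.len(), 3);
    }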
JeremyHi
bd1a5dc265 feat: metric engine support alter (#3098)
* feat: metric engine support alter

* chore: by comment

* feat: get physical table route for frontend
2024-01-05 09:46:39 +00:00
Weny Xu
e0a43f37d7 chore: bump opendal to 0.44 (#3058)
* chore: bump opendal to 0.44

* fix: fix test_object_store_cache_policy

* Revert "fix: fix test_object_store_cache_policy"

This reverts commit 46c37c343f66114e0f6ee7a0a3b9ee2b79c810af.

* fix: fix test_object_store_cache_policy

* fix: fix test_file_backend_with_lru_cache

* chore: apply suggestions from CR

* fix(mito): fix mito2 cache

* chore: apply suggestions from CR

* chore: apply suggestions from CR
2024-01-05 09:05:41 +00:00
zyy17
a89840f5f9 refactor(metrics): add 'greptime_' prefix for every metrics (#3093)
* refactor(metrics): add 'greptimedb_' prefix for every metrics

* chore: use 'greptime_' as prefix

* chore: add some prefix for new metrics

* chore: fix format error
2024-01-05 08:12:23 +00:00
dennis zhuang
c2db970687 feat: pushdown filters for some information_schema tables (#3091)
* feat: pushdown scan request to information_schema tables stream

* feat: supports filter pushdown for columns

* feat: supports filter pushdown for some information_schema tables

* fix: typo

* fix: predicate evaluate

* fix: typo

* test: predicates

* fix: comment

* fix: pub mod

* docs: improve comments

* fix: cr comments and supports like predicate

* chore: typo

* fix: cargo toml format

* chore: apply suggestion
2024-01-05 07:18:22 +00:00
LFC
e0525dbfeb chore: expose some codes to let other projects use them (#3102) 2024-01-05 06:54:01 +00:00
Weny Xu
cdc9021160 feat(metric): implement role and region_disk_usage (#3095)
* feat(metric): implement `role` and `region_disk_usage`

* Update src/datanode/src/region_server.rs

* Update src/datanode/src/heartbeat.rs

---------

Co-authored-by: LFC <990479+MichaelScofield@users.noreply.github.com>
2024-01-05 06:53:52 +00:00
dennis zhuang
702ea32538 docs: update the description of greptimedb project (#3099)
* docs: update the info of greptimedb project

* chore: move up SQL/PromQL
2024-01-05 03:06:02 +00:00
Weny Xu
342faa4e07 test: add tests for lease keeper with logical table (#3096) 2024-01-05 02:29:48 +00:00
tison
44ba131987 fix: improve redact sql regexp (#3080)
Signed-off-by: tison <wander4096@gmail.com>
2024-01-04 14:53:20 +00:00
Yingwen
96b6235f25 feat(mito): Add WriteCache struct and write SSTs to write cache (#2999)
* docs: remove todo

* feat: add upload cache

* feat: add cache to sst write path

* feat: add storage to part

* feat: add dir to part

* feat: revert storage name

* feat: flush use upload part writer

* feat: use upload part writer in compaction task

* refactor: upload part writer builds parquet writer

* chore: suppress warnings

* refactor: rename UploadCache to WriteCache

* refactor: move source to write_all()

* chore: typos

* chore: remove output mod

* feat: changes upload to async method

* docs: update cache

* chore: fix compiler errors

* docs: remove comment

* chore: simplify upload part

* refactor: remove option from cache manager param to access layer

* feat: remove cache home from file cache

* feat: write cache holds file cache

* feat: add recover and pub some methods

* feat: remove usages of UploadPartWriter

* refactor: move sst_file_path to sst mod

* refactor: use write cache in access layer

* refactor: remove upload

* style: fix clippy

* refactor: pub write cache method/structs
2024-01-04 10:53:43 +00:00
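A minimal, hypothetical sketch of the write-cache idea in #2999: SST bytes are persisted to a local file cache first and then uploaded to the remote object store, so later reads can be served from the local copy. The names below (`WriteCache`, `upload_to_object_store`) are placeholders, not the mito2 API.

    use std::io;
    use std::path::PathBuf;

    // Placeholder standing in for the real object-store client.
    fn upload_to_object_store(_remote_path: &str, _bytes: &[u8]) {}

    struct WriteCache {
        cache_dir: PathBuf,
    }

    impl WriteCache {
        // Write the SST to the local file cache, then upload it to object storage.
        fn write_and_upload(&self, file_name: &str, sst: &[u8]) -> io::Result<()> {
            let local = self.cache_dir.join(file_name);
            std::fs::write(&local, sst)?;
            upload_to_object_store(file_name, sst);
            Ok(())
        }
    }

    fn main() -> io::Result<()> {
        let cache = WriteCache { cache_dir: std::env::temp_dir() };
        cache.write_and_upload("example.parquet", b"fake sst bytes")
    }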
Weny Xu
f1a4750576 feat(tests-integration): add more region migration integration tests (#3094) 2024-01-04 08:18:46 +00:00
Zhenchi
d973cf81f0 feat(inverted_index): implement apply for SstIndexApplier (#3088)
* feat(inverted_index): implement apply for SstIndexApplier

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: rename metrics

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-01-04 07:33:03 +00:00
Weny Xu
284a496f54 feat: add logs for upgrading candidate region and updating metadata (#3077)
* feat: add logs for upgrading candidate region

* feat: add logs for update metadata

* chore: apply suggestions from CR
2024-01-04 06:57:07 +00:00
WU Jingdi
4d250ed054 fix: Optimize export metric behavior (#3047)
* fix: optimize export metric behavior

* chore: fix ci

* chore: update config format

* chore: fix format
2024-01-04 06:40:50 +00:00
LFC
ec43b9183d feat: table route for metric engine (#3053)
* feat: table route for metric engine

* feat: register logical regions

* fix: open logical region (#96)

---------

Co-authored-by: JeremyHi <jiachun_feng@proton.me>
2024-01-04 06:30:17 +00:00
ZonaHe
b025bed45c feat: update dashboard to v0.4.6 (#3089)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2024-01-04 02:56:41 +00:00
Weny Xu
21694c2a1d feat: abort region migration if leader region peer is unexpected (#3086) 2024-01-03 11:46:51 +00:00
ClSlaid
5c66ce6e88 chore: remove unnecessary result wrappings (#3084)
patch: remove unnecessary result wrappings

Signed-off-by: 蔡略 <cailue@bupt.edu.cn>
2024-01-03 10:20:33 +00:00
Weny Xu
b2b752337b fix: fix non-physical error msg (#3087) 2024-01-03 09:40:03 +00:00
Weny Xu
aa22f9c94a refactor: allow procedure to acquire share lock (#3061)
* feat: implement `KeyRwLock`

* refactor: use KeyRwLock instead of LockMap

* refactor: use StringKey instead of String

* chore: remove redundant code

* refactor: cleanup KeyRwLock staled locks before granting new lock

* feat: clean staled locks manually

* feat: sort lock key in lexicographically order

* feat: ensure the ref count before dropping the rwlock

* feat: add more tests for rwlock

* feat: drop the key guards first

* feat: drops the key guards in the reverse order

* chore: apply suggestions from CR

* chore: apply suggestions from CR

* chore: apply suggestions from CR
2024-01-03 08:05:45 +00:00
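A minimal sketch of the "key RwLock" idea described in #3061, assuming a plain standard-library implementation rather than the actual GreptimeDB one: each key maps to its own RwLock, so procedures touching different keys never contend, while readers of the same key can share it.

    use std::collections::HashMap;
    use std::sync::{Arc, Mutex, RwLock};

    #[derive(Default)]
    struct KeyRwLock {
        locks: Mutex<HashMap<String, Arc<RwLock<()>>>>,
    }

    impl KeyRwLock {
        /// Returns the lock dedicated to `key`, creating it on first use.
        fn lock_for(&self, key: &str) -> Arc<RwLock<()>> {
            let mut locks = self.locks.lock().unwrap();
            locks
                .entry(key.to_string())
                .or_insert_with(|| Arc::new(RwLock::new(())))
                .clone()
        }
    }

    fn main() {
        let keys = KeyRwLock::default();
        // Different keys get independent locks and never block each other.
        let a = keys.lock_for("catalog.schema.table_a");
        let b = keys.lock_for("catalog.schema.table_b");
        let _write_a = a.write().unwrap();
        let _read_b = b.read().unwrap();
        // The PR also sorts lock keys lexicographically before acquisition so
        // procedures always lock in the same order, avoiding deadlocks.
    }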
Weny Xu
611a8aa2fe feat(tests-integration): add a naive region migration integration test (#3078)
* fix: fix heartbeat handler ignore upgrade candidate instruction

* fix: fix handler did not inject wal options

* feat: expose `RegionMigrationProcedureTask`

* feat(tests-integration): add a naive region migration test

* chore: apply suggestions from CR

* feat: add test if the target region has migrated

* chore: apply suggestions from CR
2024-01-03 07:12:59 +00:00
Zhenchi
e4c71843e6 feat(inverted_index): get memory usage of appliers (#3081)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-01-03 06:56:56 +00:00
Zhenchi
e1ad7af10c feat(puffin): finish return written bytes (#3082)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-01-03 06:55:09 +00:00
Zhenchi
b9302e4f0d feat(inverted_index): Add applier builder to convert Expr to Predicates (Part 2) (#3068)
* feat(inverted_index.integration): Add applier builder to convert Expr to Predicates (Part 1)

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat(inverted_index.integration): Add applier builder to convert Expr to Predicates (Part 2)

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: add comparison unit tests

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: add eq_list unit tests

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: add in_list unit tests

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: add and unit tests

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: strip tests

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: address comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-01-03 05:14:40 +00:00
Yingwen
2e686fe053 feat(mito): Implement file cache (#3022)
* feat: recover cache

* feat: moka features

* test: tests for file cache

* chore: suppress warning

* fix: parse_inde_key consider suffix

* feat: update cache

* feat: expose cache file path

* feat: use cache_path in test
2024-01-03 02:05:06 +00:00
Weny Xu
128d3717fa test(tests-integration): add a naive test with kafka wal (#3071)
* chore(tests-integration): add setup tests with kafka wal to README.md

* feat(tests-integration): add meta wal config

* fix(tests-integration): fix sign of both_instances_cases_with_kafka_wal

* chore(tests-integration): set num_topic to 3 for tests

* test(tests-integration): add a naive test with kafka wal

* chore: apply suggestions from CR
2024-01-02 09:05:20 +00:00
Weny Xu
2b181e91e0 refactor: unify the injection of WAL option (#3066)
* feat: add prepare_wal_option

* refactor: use integer hashmap

* feat: unify the injection of WAL option

* fix: fix procedure_flow_upgrade_candidate_with_retry

* chore: apply suggestions from CR
2024-01-02 07:40:02 +00:00
Weny Xu
d87ab06b28 feat: add kafka wal integration test utils (#3069)
* feat(tests-integration): add wal_config

* feat: add kafka wal integration test utils
2024-01-02 07:38:43 +00:00
Weny Xu
5653389063 feat!: correct the kafka config option (#3065)
* feat: correct the kafka config option

* refactor: rewrite the verbose comments
2024-01-02 07:31:37 +00:00
dimbtp
c4d7b0d91d feat: add some tables for information_schema (#3060)
* feat: add information_schema.optimizer_trace

* feat: add information_schema.parameters

* feat: add information_schema.profiling

* feat: add information_schema.referential_constraints

* feat: add information_schema.routines

* feat: add information_schema.schema_privileges

* feat: add information_schema.table_privileges

* feat: add information_schema.triggers

* fix: update sql test result

* feat: add information_schema.global_status

* feat: add information_schema.session_status

* fix: update sql test result

* fix: add TODO for some tables

* Update src/catalog/src/information_schema/memory_table/tables.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

---------

Co-authored-by: dennis zhuang <killme2008@gmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2024-01-02 04:10:59 +00:00
204 changed files with 8985 additions and 2380 deletions

View File

@@ -19,3 +19,5 @@ GT_GCS_BUCKET = GCS bucket
GT_GCS_SCOPE = GCS scope
GT_GCS_CREDENTIAL_PATH = GCS credential path
GT_GCS_ENDPOINT = GCS end point
# Settings for kafka wal test
GT_KAFKA_ENDPOINTS = localhost:9092

Cargo.lock generated
View File

@@ -196,7 +196,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"common-base",
"common-decimal",
@@ -674,7 +674,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -847,7 +847,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arrow",
"chrono",
@@ -1179,10 +1179,11 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arc-swap",
"arrow",
"arrow-schema",
"async-stream",
"async-trait",
@@ -1450,7 +1451,7 @@ checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "client"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arrow-flight",
@@ -1483,7 +1484,7 @@ dependencies = [
"session",
"snafu",
"substrait 0.17.1",
"substrait 0.5.0",
"substrait 0.5.1",
"tokio",
"tokio-stream",
"tonic 0.10.2",
@@ -1513,7 +1514,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"anymap",
"async-trait",
@@ -1564,7 +1565,7 @@ dependencies = [
"session",
"snafu",
"store-api",
"substrait 0.5.0",
"substrait 0.5.1",
"table",
"temp-env",
"tikv-jemallocator",
@@ -1597,7 +1598,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"anymap",
"bitvec",
@@ -1612,7 +1613,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"chrono",
"common-error",
@@ -1623,7 +1624,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"common-base",
"humantime-serde",
@@ -1636,7 +1637,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arrow",
"arrow-schema",
@@ -1667,7 +1668,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arrow",
"bigdecimal",
@@ -1681,7 +1682,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"snafu",
"strum 0.25.0",
@@ -1689,7 +1690,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arc-swap",
"build-data",
@@ -1713,7 +1714,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-trait",
"common-error",
@@ -1732,7 +1733,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arrow-flight",
@@ -1762,7 +1763,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -1781,7 +1782,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arc-swap",
"common-query",
@@ -1796,7 +1797,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"common-error",
"common-macro",
@@ -1809,7 +1810,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-recursion",
@@ -1832,6 +1833,7 @@ dependencies = [
"derive_builder 0.12.0",
"etcd-client",
"futures",
"futures-util",
"humantime-serde",
"hyper",
"lazy_static",
@@ -1850,11 +1852,12 @@ dependencies = [
"tokio",
"toml 0.8.8",
"tonic 0.10.2",
"uuid",
]
[[package]]
name = "common-procedure"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-stream",
"async-trait",
@@ -1878,7 +1881,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-trait",
"common-procedure",
@@ -1886,7 +1889,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -1909,7 +1912,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"common-error",
"common-macro",
@@ -1926,7 +1929,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-trait",
"common-error",
@@ -1946,7 +1949,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"backtrace",
"common-error",
@@ -1972,7 +1975,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"once_cell",
"rand",
@@ -1981,7 +1984,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arrow",
"chrono",
@@ -1997,7 +2000,7 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"build-data",
]
@@ -2627,7 +2630,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arrow-flight",
@@ -2687,7 +2690,7 @@ dependencies = [
"snafu",
"sql",
"store-api",
"substrait 0.5.0",
"substrait 0.5.1",
"table",
"tokio",
"tokio-stream",
@@ -2701,7 +2704,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"arrow",
"arrow-array",
@@ -3162,7 +3165,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -3293,7 +3296,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arc-swap",
@@ -3357,7 +3360,7 @@ dependencies = [
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"strfmt",
"substrait 0.5.0",
"substrait 0.5.1",
"table",
"tokio",
"toml 0.8.8",
@@ -4011,7 +4014,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -4491,12 +4494,13 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "log-store"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-stream",
"async-trait",
"byteorder",
"bytes",
"chrono",
"common-base",
"common-config",
"common-error",
@@ -4505,13 +4509,14 @@ dependencies = [
"common-runtime",
"common-telemetry",
"common-test-util",
"dashmap",
"futures",
"futures-util",
"itertools 0.10.5",
"protobuf",
"protobuf-build",
"raft-engine",
"rand",
"rand_distr",
"rskafka",
"serde",
"serde_json",
@@ -4519,6 +4524,7 @@ dependencies = [
"store-api",
"tokio",
"tokio-util",
"uuid",
]
[[package]]
@@ -4765,7 +4771,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -4795,7 +4801,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"anymap",
"api",
@@ -4873,7 +4879,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"aquamarine",
@@ -4944,7 +4950,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"anymap",
"api",
@@ -4986,8 +4992,10 @@ dependencies = [
"object-store",
"parquet",
"paste",
"pin-project",
"prometheus",
"prost 0.12.3",
"puffin",
"regex",
"serde",
"serde_json",
@@ -5443,7 +5451,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"anyhow",
"async-trait",
@@ -5499,9 +5507,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "opendal"
version = "0.40.0"
version = "0.44.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddba7299bab261d3ae2f37617fb7f45b19ed872752bb4e22cf93a69d979366c5"
checksum = "bc0ad72f7b44ca4ae59d27ea151fdc6f37305cf6efe099bdaedbb30ec34579c0"
dependencies = [
"anyhow",
"async-compat",
@@ -5512,15 +5520,15 @@ dependencies = [
"chrono",
"flagset",
"futures",
"getrandom",
"http",
"hyper",
"log",
"md-5",
"once_cell",
"parking_lot 0.12.1",
"percent-encoding",
"pin-project",
"quick-xml 0.29.0",
"quick-xml 0.30.0",
"reqsign",
"reqwest",
"serde",
@@ -5688,7 +5696,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -5732,7 +5740,7 @@ dependencies = [
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"substrait 0.5.0",
"substrait 0.5.1",
"table",
"tokio",
"tonic 0.10.2",
@@ -5963,7 +5971,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -6282,7 +6290,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"auth",
"common-base",
@@ -6540,7 +6548,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"ahash 0.8.6",
"async-recursion",
@@ -6750,7 +6758,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-trait",
"bitflags 2.4.1",
@@ -6861,7 +6869,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"ahash 0.8.6",
"api",
@@ -6919,7 +6927,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.5.0",
"substrait 0.5.1",
"table",
"tokio",
"tokio-stream",
@@ -6936,9 +6944,9 @@ dependencies = [
[[package]]
name = "quick-xml"
version = "0.29.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81b9228215d82c7b61490fec1de287136b5de6f5700f6e58ea9ad61a7964ca51"
checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
dependencies = [
"memchr",
"serde",
@@ -8189,7 +8197,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arc-swap",
@@ -8449,7 +8457,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"aide",
"api",
@@ -8545,7 +8553,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"arc-swap",
@@ -8806,7 +8814,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"common-base",
@@ -8858,7 +8866,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-trait",
"clap 4.4.11",
@@ -9065,7 +9073,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"aquamarine",
@@ -9205,7 +9213,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"async-recursion",
"async-trait",
@@ -9353,7 +9361,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"anymap",
"async-trait",
@@ -9465,7 +9473,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-integration"
version = "0.5.0"
version = "0.5.1"
dependencies = [
"api",
"async-trait",
@@ -9521,7 +9529,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.5.0",
"substrait 0.5.1",
"table",
"tempfile",
"time",

View File

@@ -58,7 +58,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.5.0"
version = "0.5.1"
edition = "2021"
license = "Apache-2.0"
@@ -180,6 +180,7 @@ operator = { path = "src/operator" }
partition = { path = "src/partition" }
plugins = { path = "src/plugins" }
promql = { path = "src/promql" }
puffin = { path = "src/puffin" }
query = { path = "src/query" }
script = { path = "src/script" }
servers = { path = "src/servers" }

View File

@@ -29,21 +29,17 @@
## What is GreptimeDB
GreptimeDB is an open-source time-series database with a special focus on
scalability, analytical capabilities and efficiency. It's designed to work on
infrastructure of the cloud era, and users benefit from its elasticity and commodity
storage.
GreptimeDB is an open-source time-series database focusing on efficiency, scalability, and analytical capabilities.
It's designed to work on infrastructure of the cloud era, and users benefit from its elasticity and commodity storage.
Our core developers have been building time-series data platform
for years. Based on their best-practices, GreptimeDB is born to give you:
Our core developers have been building time-series data platforms for years. Based on their best-practices, GreptimeDB is born to give you:
- A standalone binary that scales to highly-available distributed cluster, providing a transparent experience for cluster users
- Optimized columnar layout for handling time-series data; compacted, compressed, and stored on various storage backends
- Flexible indexes, tackling high cardinality issues down
- Distributed, parallel query execution, leveraging elastic computing resource
- Native SQL, and Python scripting for advanced analytical scenarios
- Widely adopted database protocols and APIs, native PromQL supports
- Extensible table engine architecture for extensive workloads
- Optimized columnar layout for handling time-series data; compacted, compressed, and stored on various storage backends, particularly cloud object storage with 50x cost efficiency.
- Fully open-source distributed cluster architecture that harnesses the power of cloud-native elastic computing resources.
- Seamless scalability from a standalone binary at edge to a robust, highly available distributed cluster in cloud, with a transparent experience for both developers and administrators.
- Native SQL and PromQL for queries, and Python scripting to facilitate complex analytical tasks.
- Flexible indexing capabilities and distributed, parallel-processing query engine, tackling high cardinality issues down.
- Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc.
## Quick Start

View File

@@ -51,9 +51,10 @@ sync_write = false
# Kafka wal options, see `standalone.example.toml`.
# broker_endpoints = ["127.0.0.1:9092"]
# max_batch_size = "4MB"
# Warning: Kafka has a default limit of 1MB per message in a topic.
# max_batch_size = "1MB"
# linger = "200ms"
# produce_record_timeout = "100ms"
# consumer_wait_timeout = "100ms"
# backoff_init = "500ms"
# backoff_max = "10s"
# backoff_base = 2
@@ -129,11 +130,10 @@ parallel_scan_channel_size = 32
# [export_metrics]
# Whether to enable export metrics, default is false
# enable = false
# The URL of the metrics export endpoint, default is the `frontend`'s default HTTP endpoint.
# endpoint = "127.0.0.1:4000"
# The database name that exported metrics are stored in; the user needs to specify a valid database
# db = ""
# The interval of exporting metrics
# write_interval = "30s"
# [export_metrics.remote_write]
# The URL the metrics are sent to. The URL is empty by default, for example: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`
# url = ""
# HTTP headers carried by Prometheus remote-write
# headers = {}

View File

@@ -87,11 +87,8 @@ tcp_nodelay = true
# [export_metrics]
# Whether to enable export metrics, default is false
# enable = false
# The URL of the metrics export endpoint, default is the `frontend`'s default HTTP endpoint.
# endpoint = "127.0.0.1:4000"
# The database name that exported metrics are stored in; the user needs to specify a valid database
# db = ""
# The interval of exporting metrics
# write_interval = "30s"
# HTTP headers carried by Prometheus remote-write
# headers = {}
# For `frontend`, `self_import` is recommended to collect metrics generated by itself
# [export_metrics.self_import]
# db = "information_schema"

View File

@@ -86,11 +86,10 @@ provider = "raft_engine"
# [export_metrics]
# Whether to enable export metrics, default is false
# enable = false
# The URL of the metrics export endpoint, default is the `frontend`'s default HTTP endpoint.
# endpoint = "127.0.0.1:4000"
# The database name that exported metrics are stored in; the user needs to specify a valid database
# db = ""
# The interval of exporting metrics
# write_interval = "30s"
# [export_metrics.remote_write]
# The URL the metrics are sent to. The URL is empty by default, for example: `http://127.0.0.1:4000/v1/prometheus/write?db=information_schema`
# url = ""
# HTTP headers carried by Prometheus remote-write
# headers = {}

View File

@@ -100,29 +100,30 @@ provider = "raft_engine"
# Available selector types:
# - "round_robin" (default)
# selector_type = "round_robin"
# A Kafka topic is constructed by concatenating `topic_name_prefix` and `topic_id`.
# The prefix of topic name.
# topic_name_prefix = "greptimedb_wal_topic"
# Number of partitions per topic.
# num_partitions = 1
# Expected number of replicas of each partition.
# The number of replicas of each partition.
# replication_factor = 1
# The maximum log size a kafka batch producer could buffer.
# max_batch_size = "4MB"
# The linger duration of a kafka batch producer.
# The max size of a single producer batch.
# Warning: Kafka has a default limit of 1MB per message in a topic.
# max_batch_size = "1MB"
# The linger duration.
# linger = "200ms"
# The maximum amount of time (in milliseconds) to wait for Kafka records to be returned.
# produce_record_timeout = "100ms"
# Above which a topic creation operation will be cancelled.
# The consumer wait timeout.
# consumer_wait_timeout = "100ms"
# Create topic timeout.
# create_topic_timeout = "30s"
# The initial backoff for kafka clients.
# The initial backoff delay.
# backoff_init = "500ms"
# The maximum backoff for kafka clients.
# The maximum backoff delay.
# backoff_max = "10s"
# Exponential backoff rate, i.e. next backoff = base * current backoff.
# backoff_base = 2
# Stop reconnecting if the total wait time reaches the deadline. If this config is missing, the reconnecting won't terminate.
# The deadline of retries.
# backoff_deadline = "5mins"
# WAL data directory
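To make the backoff options above concrete, here is a toy schedule computed from `backoff_init`, `backoff_base`, and `backoff_max`; this is an illustration only, not the Kafka client's actual retry code.

    use std::time::Duration;

    /// Returns the first `retries` backoff delays implied by the options above.
    fn backoff_schedule(init: Duration, base: u32, max: Duration, retries: usize) -> Vec<Duration> {
        let mut delays = Vec::with_capacity(retries);
        let mut current = init;
        for _ in 0..retries {
            delays.push(current);
            // Exponential growth, capped at the configured maximum.
            current = (current * base).min(max);
        }
        delays
    }

    fn main() {
        // backoff_init = "500ms", backoff_base = 2, backoff_max = "10s"
        let delays = backoff_schedule(Duration::from_millis(500), 2, Duration::from_secs(10), 6);
        println!("{delays:?}"); // 500ms, 1s, 2s, 4s, 8s, 10s
    }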
@@ -230,11 +231,8 @@ parallel_scan_channel_size = 32
# [export_metrics]
# Whether to enable export metrics, default is false
# enable = false
# The URL of the metrics export endpoint, default is the `frontend`'s default HTTP endpoint.
# endpoint = "127.0.0.1:4000"
# The database name that exported metrics are stored in; the user needs to specify a valid database
# db = ""
# The interval of exporting metrics
# write_interval = "30s"
# HTTP headers carried by Prometheus remote-write
# headers = {}
# For `standalone`, `self_import` is recommended to collect metrics generated by itself
# [export_metrics.self_import]
# db = "information_schema"

View File

@@ -11,6 +11,7 @@ testing = []
api.workspace = true
arc-swap = "1.0"
arrow-schema.workspace = true
arrow.workspace = true
async-stream.workspace = true
async-trait = "0.1"
build-data = "0.1"

View File

@@ -15,6 +15,7 @@
mod columns;
mod key_column_usage;
mod memory_table;
mod predicate;
mod schemata;
mod table_names;
mod tables;
@@ -29,6 +30,7 @@ use datatypes::schema::SchemaRef;
use futures_util::StreamExt;
use lazy_static::lazy_static;
use paste::paste;
pub(crate) use predicate::Predicates;
use snafu::ResultExt;
use store_api::data_source::DataSource;
use store_api::storage::{ScanRequest, TableId};
@@ -61,6 +63,16 @@ lazy_static! {
CHECK_CONSTRAINTS,
EVENTS,
FILES,
OPTIMIZER_TRACE,
PARAMETERS,
PROFILING,
REFERENTIAL_CONSTRAINTS,
ROUTINES,
SCHEMA_PRIVILEGES,
TABLE_PRIVILEGES,
TRIGGERS,
GLOBAL_STATUS,
SESSION_STATUS,
];
}
@@ -149,7 +161,7 @@ impl InformationSchemaProvider {
fn build_table(&self, name: &str) -> Option<TableRef> {
self.information_table(name).map(|table| {
let table_info = Self::table_info(self.catalog_name.clone(), &table);
let filter_pushdown = FilterPushDownType::Unsupported;
let filter_pushdown = FilterPushDownType::Inexact;
let thin_table = ThinTable::new(table_info, filter_pushdown);
let data_source = Arc::new(InformationTableDataSource::new(table));
@@ -179,6 +191,16 @@ impl InformationSchemaProvider {
CHECK_CONSTRAINTS => setup_memory_table!(CHECK_CONSTRAINTS),
EVENTS => setup_memory_table!(EVENTS),
FILES => setup_memory_table!(FILES),
OPTIMIZER_TRACE => setup_memory_table!(OPTIMIZER_TRACE),
PARAMETERS => setup_memory_table!(PARAMETERS),
PROFILING => setup_memory_table!(PROFILING),
REFERENTIAL_CONSTRAINTS => setup_memory_table!(REFERENTIAL_CONSTRAINTS),
ROUTINES => setup_memory_table!(ROUTINES),
SCHEMA_PRIVILEGES => setup_memory_table!(SCHEMA_PRIVILEGES),
TABLE_PRIVILEGES => setup_memory_table!(TABLE_PRIVILEGES),
TRIGGERS => setup_memory_table!(TRIGGERS),
GLOBAL_STATUS => setup_memory_table!(GLOBAL_STATUS),
SESSION_STATUS => setup_memory_table!(SESSION_STATUS),
KEY_COLUMN_USAGE => Some(Arc::new(InformationSchemaKeyColumnUsage::new(
self.catalog_name.clone(),
self.catalog_manager.clone(),
@@ -218,7 +240,7 @@ trait InformationTable {
fn schema(&self) -> SchemaRef;
fn to_stream(&self) -> Result<SendableRecordBatchStream>;
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream>;
fn table_type(&self) -> TableType {
TableType::Temporary
@@ -252,7 +274,7 @@ impl DataSource for InformationTableDataSource {
&self,
request: ScanRequest,
) -> std::result::Result<SendableRecordBatchStream, BoxedError> {
let projection = request.projection;
let projection = request.projection.clone();
let projected_schema = match &projection {
Some(projection) => self.try_project(projection)?,
None => self.table.schema(),
@@ -260,7 +282,7 @@ impl DataSource for InformationTableDataSource {
let stream = self
.table
.to_stream()
.to_stream(request)
.map_err(BoxedError::new)
.context(TablesRecordBatchSnafu)
.map_err(BoxedError::new)?

View File

@@ -29,14 +29,16 @@ use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatc
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::scalars::ScalarVectorBuilder;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::{StringVectorBuilder, VectorRef};
use snafu::{OptionExt, ResultExt};
use store_api::storage::TableId;
use store_api::storage::{ScanRequest, TableId};
use super::{InformationTable, COLUMNS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::Predicates;
use crate::CatalogManager;
pub(super) struct InformationSchemaColumns {
@@ -102,14 +104,14 @@ impl InformationTable for InformationSchemaColumns {
self.schema.clone()
}
fn to_stream(&self) -> Result<SendableRecordBatchStream> {
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
schema,
futures::stream::once(async move {
builder
.make_columns()
.make_columns(Some(request))
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)
@@ -165,12 +167,13 @@ impl InformationSchemaColumnsBuilder {
}
/// Construct the `information_schema.columns` virtual table
async fn make_columns(&mut self) -> Result<RecordBatch> {
async fn make_columns(&mut self, request: Option<ScanRequest>) -> Result<RecordBatch> {
let catalog_name = self.catalog_name.clone();
let catalog_manager = self
.catalog_manager
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
if !catalog_manager
@@ -201,6 +204,7 @@ impl InformationSchemaColumnsBuilder {
};
self.add_column(
&predicates,
&catalog_name,
&schema_name,
&table_name,
@@ -219,6 +223,7 @@ impl InformationSchemaColumnsBuilder {
fn add_column(
&mut self,
predicates: &Predicates,
catalog_name: &str,
schema_name: &str,
table_name: &str,
@@ -227,6 +232,19 @@ impl InformationSchemaColumnsBuilder {
) {
let data_type = &column_schema.data_type.name();
let row = [
(TABLE_CATALOG, &Value::from(catalog_name)),
(TABLE_SCHEMA, &Value::from(schema_name)),
(TABLE_NAME, &Value::from(table_name)),
(COLUMN_NAME, &Value::from(column_schema.name.as_str())),
(DATA_TYPE, &Value::from(data_type.as_str())),
(SEMANTIC_TYPE, &Value::from(semantic_type)),
];
if !predicates.eval(&row) {
return;
}
self.catalog_names.push(Some(catalog_name));
self.schema_names.push(Some(schema_name));
self.table_names.push(Some(table_name));
@@ -279,7 +297,7 @@ impl DfPartitionStream for InformationSchemaColumns {
schema,
futures::stream::once(async move {
builder
.make_columns()
.make_columns(None)
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)

View File

@@ -25,17 +25,26 @@ use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, ScalarVectorBuilder, VectorRef};
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder};
use snafu::{OptionExt, ResultExt};
use store_api::storage::TableId;
use store_api::storage::{ScanRequest, TableId};
use super::KEY_COLUMN_USAGE;
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::InformationTable;
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const CONSTRAINT_SCHEMA: &str = "constraint_schema";
const CONSTRAINT_NAME: &str = "constraint_name";
const TABLE_CATALOG: &str = "table_catalog";
const TABLE_SCHEMA: &str = "table_schema";
const TABLE_NAME: &str = "table_name";
const COLUMN_NAME: &str = "column_name";
const ORDINAL_POSITION: &str = "ordinal_position";
/// The virtual table implementation for `information_schema.KEY_COLUMN_USAGE`.
pub(super) struct InformationSchemaKeyColumnUsage {
schema: SchemaRef,
@@ -60,24 +69,16 @@ impl InformationSchemaKeyColumnUsage {
false,
),
ColumnSchema::new(
"constraint_schema",
CONSTRAINT_SCHEMA,
ConcreteDataType::string_datatype(),
false,
),
ColumnSchema::new(
"constraint_name",
ConcreteDataType::string_datatype(),
false,
),
ColumnSchema::new("table_catalog", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("table_schema", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("table_name", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("column_name", ConcreteDataType::string_datatype(), false),
ColumnSchema::new(
"ordinal_position",
ConcreteDataType::uint32_datatype(),
false,
),
ColumnSchema::new(CONSTRAINT_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_CATALOG, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_SCHEMA, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(COLUMN_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(ORDINAL_POSITION, ConcreteDataType::uint32_datatype(), false),
ColumnSchema::new(
"position_in_unique_constraint",
ConcreteDataType::uint32_datatype(),
@@ -123,14 +124,14 @@ impl InformationTable for InformationSchemaKeyColumnUsage {
self.schema.clone()
}
fn to_stream(&self) -> Result<SendableRecordBatchStream> {
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
schema,
futures::stream::once(async move {
builder
.make_key_column_usage()
.make_key_column_usage(Some(request))
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)
@@ -192,14 +193,14 @@ impl InformationSchemaKeyColumnUsageBuilder {
}
/// Construct the `information_schema.KEY_COLUMN_USAGE` virtual table
async fn make_key_column_usage(&mut self) -> Result<RecordBatch> {
async fn make_key_column_usage(&mut self, request: Option<ScanRequest>) -> Result<RecordBatch> {
let catalog_name = self.catalog_name.clone();
let catalog_manager = self
.catalog_manager
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
let mut time_index_constraints = vec![];
let mut primary_constraints = vec![];
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
@@ -223,11 +224,15 @@ impl InformationSchemaKeyColumnUsageBuilder {
for (idx, column) in schema.column_schemas().iter().enumerate() {
if column.is_time_index() {
time_index_constraints.push((
schema_name.clone(),
table_name.clone(),
column.name.clone(),
));
self.add_key_column_usage(
&predicates,
&schema_name,
"TIME INDEX",
&schema_name,
&table_name,
&column.name,
1, //always 1 for time index
);
}
if keys.contains(&idx) {
primary_constraints.push((
@@ -244,22 +249,11 @@ impl InformationSchemaKeyColumnUsageBuilder {
}
}
for (i, (schema_name, table_name, column_name)) in
time_index_constraints.into_iter().enumerate()
{
self.add_key_column_usage(
&schema_name,
"TIME INDEX",
&schema_name,
&table_name,
&column_name,
i as u32 + 1,
);
}
for (i, (schema_name, table_name, column_name)) in
primary_constraints.into_iter().enumerate()
{
self.add_key_column_usage(
&predicates,
&schema_name,
"PRIMARY",
&schema_name,
@@ -274,8 +268,10 @@ impl InformationSchemaKeyColumnUsageBuilder {
// TODO(dimbtp): Foreign key constraints would have non-`None` values for the last 4
// fields, but they are not supported yet.
#[allow(clippy::too_many_arguments)]
fn add_key_column_usage(
&mut self,
predicates: &Predicates,
constraint_schema: &str,
constraint_name: &str,
table_schema: &str,
@@ -283,6 +279,19 @@ impl InformationSchemaKeyColumnUsageBuilder {
column_name: &str,
ordinal_position: u32,
) {
let row = [
(CONSTRAINT_SCHEMA, &Value::from(constraint_schema)),
(CONSTRAINT_NAME, &Value::from(constraint_name)),
(TABLE_SCHEMA, &Value::from(table_schema)),
(TABLE_NAME, &Value::from(table_name)),
(COLUMN_NAME, &Value::from(column_name)),
(ORDINAL_POSITION, &Value::from(ordinal_position)),
];
if !predicates.eval(&row) {
return;
}
self.constraint_catalog.push(Some("def"));
self.constraint_schema.push(Some(constraint_schema));
self.constraint_name.push(Some(constraint_name));
@@ -328,7 +337,7 @@ impl DfPartitionStream for InformationSchemaKeyColumnUsage {
schema,
futures::stream::once(async move {
builder
.make_key_column_usage()
.make_key_column_usage(None)
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)

View File

@@ -26,7 +26,7 @@ use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatc
use datatypes::schema::SchemaRef;
use datatypes::vectors::VectorRef;
use snafu::ResultExt;
use store_api::storage::TableId;
use store_api::storage::{ScanRequest, TableId};
pub use tables::get_schema_columns;
use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
@@ -74,7 +74,7 @@ impl InformationTable for MemoryTable {
self.schema.clone()
}
fn to_stream(&self) -> Result<SendableRecordBatchStream> {
fn to_stream(&self, _request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
@@ -169,7 +169,7 @@ mod tests {
assert_eq!("test", table.table_name());
assert_eq!(schema, InformationTable::schema(&table));
let stream = table.to_stream().unwrap();
let stream = table.to_stream(ScanRequest::default()).unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();
@@ -198,7 +198,7 @@ mod tests {
assert_eq!("test", table.table_name());
assert_eq!(schema, InformationTable::schema(&table));
let stream = table.to_stream().unwrap();
let stream = table.to_stream(ScanRequest::default()).unwrap();
let batches = RecordBatches::try_collect(stream).await.unwrap();

View File

@@ -227,6 +227,190 @@ pub fn get_schema_columns(table_name: &str) -> (SchemaRef, Vec<VectorRef>) {
vec![],
),
OPTIMIZER_TRACE => (
vec![
string_column("QUERY"),
string_column("TRACE"),
bigint_column("MISSING_BYTES_BEYOND_MAX_MEM_SIZE"),
bigint_column("INSUFFICIENT_PRIVILEGES"),
],
vec![],
),
// MySQL(https://dev.mysql.com/doc/refman/8.2/en/information-schema-parameters-table.html)
// has the spec that is different from
// PostgreSQL(https://www.postgresql.org/docs/current/infoschema-parameters.html).
// Follow `MySQL` spec here.
PARAMETERS => (
vec![
string_column("SPECIFIC_CATALOG"),
string_column("SPECIFIC_SCHEMA"),
string_column("SPECIFIC_NAME"),
bigint_column("ORDINAL_POSITION"),
string_column("PARAMETER_MODE"),
string_column("PARAMETER_NAME"),
string_column("DATA_TYPE"),
bigint_column("CHARACTER_MAXIMUM_LENGTH"),
bigint_column("CHARACTER_OCTET_LENGTH"),
bigint_column("NUMERIC_PRECISION"),
bigint_column("NUMERIC_SCALE"),
bigint_column("DATETIME_PRECISION"),
string_column("CHARACTER_SET_NAME"),
string_column("COLLATION_NAME"),
string_column("DTD_IDENTIFIER"),
string_column("ROUTINE_TYPE"),
],
vec![],
),
PROFILING => (
vec![
bigint_column("QUERY_ID"),
bigint_column("SEQ"),
string_column("STATE"),
bigint_column("DURATION"),
bigint_column("CPU_USER"),
bigint_column("CPU_SYSTEM"),
bigint_column("CONTEXT_VOLUNTARY"),
bigint_column("CONTEXT_INVOLUNTARY"),
bigint_column("BLOCK_OPS_IN"),
bigint_column("BLOCK_OPS_OUT"),
bigint_column("MESSAGES_SENT"),
bigint_column("MESSAGES_RECEIVED"),
bigint_column("PAGE_FAULTS_MAJOR"),
bigint_column("PAGE_FAULTS_MINOR"),
bigint_column("SWAPS"),
string_column("SOURCE_FUNCTION"),
string_column("SOURCE_FILE"),
bigint_column("SOURCE_LINE"),
],
vec![],
),
// TODO: _Must_ reimplement this table when foreign key constraint is supported.
REFERENTIAL_CONSTRAINTS => (
vec![
string_column("CONSTRAINT_CATALOG"),
string_column("CONSTRAINT_SCHEMA"),
string_column("CONSTRAINT_NAME"),
string_column("UNIQUE_CONSTRAINT_CATALOG"),
string_column("UNIQUE_CONSTRAINT_SCHEMA"),
string_column("UNIQUE_CONSTRAINT_NAME"),
string_column("MATCH_OPTION"),
string_column("UPDATE_RULE"),
string_column("DELETE_RULE"),
string_column("TABLE_NAME"),
string_column("REFERENCED_TABLE_NAME"),
],
vec![],
),
ROUTINES => (
vec![
string_column("SPECIFIC_NAME"),
string_column("ROUTINE_CATALOG"),
string_column("ROUTINE_SCHEMA"),
string_column("ROUTINE_NAME"),
string_column("ROUTINE_TYPE"),
string_column("DATA_TYPE"),
bigint_column("CHARACTER_MAXIMUM_LENGTH"),
bigint_column("CHARACTER_OCTET_LENGTH"),
bigint_column("NUMERIC_PRECISION"),
bigint_column("NUMERIC_SCALE"),
bigint_column("DATETIME_PRECISION"),
string_column("CHARACTER_SET_NAME"),
string_column("COLLATION_NAME"),
string_column("DTD_IDENTIFIER"),
string_column("ROUTINE_BODY"),
string_column("ROUTINE_DEFINITION"),
string_column("EXTERNAL_NAME"),
string_column("EXTERNAL_LANGUAGE"),
string_column("PARAMETER_STYLE"),
string_column("IS_DETERMINISTIC"),
string_column("SQL_DATA_ACCESS"),
string_column("SQL_PATH"),
string_column("SECURITY_TYPE"),
datetime_column("CREATED"),
datetime_column("LAST_ALTERED"),
string_column("SQL_MODE"),
string_column("ROUTINE_COMMENT"),
string_column("DEFINER"),
string_column("CHARACTER_SET_CLIENT"),
string_column("COLLATION_CONNECTION"),
string_column("DATABASE_COLLATION"),
],
vec![],
),
SCHEMA_PRIVILEGES => (
vec![
string_column("GRANTEE"),
string_column("TABLE_CATALOG"),
string_column("TABLE_SCHEMA"),
string_column("PRIVILEGE_TYPE"),
string_column("IS_GRANTABLE"),
],
vec![],
),
TABLE_PRIVILEGES => (
vec![
string_column("GRANTEE"),
string_column("TABLE_CATALOG"),
string_column("TABLE_SCHEMA"),
string_column("TABLE_NAME"),
string_column("PRIVILEGE_TYPE"),
string_column("IS_GRANTABLE"),
],
vec![],
),
TRIGGERS => (
vec![
string_column("TRIGGER_CATALOG"),
string_column("TRIGGER_SCHEMA"),
string_column("TRIGGER_NAME"),
string_column("EVENT_MANIPULATION"),
string_column("EVENT_OBJECT_CATALOG"),
string_column("EVENT_OBJECT_SCHEMA"),
string_column("EVENT_OBJECT_TABLE"),
bigint_column("ACTION_ORDER"),
string_column("ACTION_CONDITION"),
string_column("ACTION_STATEMENT"),
string_column("ACTION_ORIENTATION"),
string_column("ACTION_TIMING"),
string_column("ACTION_REFERENCE_OLD_TABLE"),
string_column("ACTION_REFERENCE_NEW_TABLE"),
string_column("ACTION_REFERENCE_OLD_ROW"),
string_column("ACTION_REFERENCE_NEW_ROW"),
datetime_column("CREATED"),
string_column("SQL_MODE"),
string_column("DEFINER"),
string_column("CHARACTER_SET_CLIENT"),
string_column("COLLATION_CONNECTION"),
string_column("DATABASE_COLLATION"),
],
vec![],
),
// TODO: Consider storing internal metrics in the `global_status` and
// `session_status` tables.
GLOBAL_STATUS => (
vec![
string_column("VARIABLE_NAME"),
string_column("VARIABLE_VALUE"),
],
vec![],
),
SESSION_STATUS => (
vec![
string_column("VARIABLE_NAME"),
string_column("VARIABLE_VALUE"),
],
vec![],
),
_ => unreachable!("Unknown table in information_schema: {}", table_name),
};

View File

@@ -0,0 +1,609 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow::array::StringArray;
use arrow::compute::kernels::comparison;
use common_query::logical_plan::DfExpr;
use datafusion::common::ScalarValue;
use datafusion::logical_expr::expr::Like;
use datafusion::logical_expr::Operator;
use datatypes::value::Value;
use store_api::storage::ScanRequest;
type ColumnName = String;
/// A predicate to filter the `information_schema` table streams;
/// only these simple predicates are supported currently.
/// TODO(dennis): support more predicate types.
#[derive(Clone, PartialEq, Eq, Debug)]
enum Predicate {
Eq(ColumnName, Value),
Like(ColumnName, String, bool),
NotEq(ColumnName, Value),
InList(ColumnName, Vec<Value>),
And(Box<Predicate>, Box<Predicate>),
Or(Box<Predicate>, Box<Predicate>),
Not(Box<Predicate>),
}
impl Predicate {
/// Evaluates the predicate against the row, returning:
/// - `None` when the predicate can't be evaluated against the row,
/// - `Some(true)` when the predicate is satisfied,
/// - `Some(false)` when the predicate is not satisfied.
fn eval(&self, row: &[(&str, &Value)]) -> Option<bool> {
match self {
Predicate::Eq(c, v) => {
for (column, value) in row {
if c != column {
continue;
}
return Some(v == *value);
}
}
Predicate::Like(c, pattern, case_insensitive) => {
for (column, value) in row {
if c != column {
continue;
}
let Value::String(bs) = value else {
continue;
};
return like_utf8(bs.as_utf8(), pattern, case_insensitive);
}
}
Predicate::NotEq(c, v) => {
for (column, value) in row {
if c != column {
continue;
}
return Some(v != *value);
}
}
Predicate::InList(c, values) => {
for (column, value) in row {
if c != column {
continue;
}
return Some(values.iter().any(|v| v == *value));
}
}
Predicate::And(left, right) => {
let left = left.eval(row);
// short-circuit
if matches!(left, Some(false)) {
return Some(false);
}
return match (left, right.eval(row)) {
(Some(left), Some(right)) => Some(left && right),
(None, Some(false)) => Some(false),
_ => None,
};
}
Predicate::Or(left, right) => {
let left = left.eval(row);
// short-circuit
if matches!(left, Some(true)) {
return Some(true);
}
return match (left, right.eval(row)) {
(Some(left), Some(right)) => Some(left || right),
(None, Some(true)) => Some(true),
_ => None,
};
}
Predicate::Not(p) => {
let Some(b) = p.eval(row) else {
return None;
};
return Some(!b);
}
}
// Can't evaluate predicate with the row
None
}
/// Tries to create a predicate from a DataFusion [`Expr`], returning `None` if it fails.
fn from_expr(expr: DfExpr) -> Option<Predicate> {
match expr {
// NOT expr
DfExpr::Not(expr) => {
let Some(p) = Self::from_expr(*expr) else {
return None;
};
Some(Predicate::Not(Box::new(p)))
}
// expr LIKE pattern
DfExpr::Like(Like {
negated,
expr,
pattern,
case_insensitive,
..
}) if is_column(&expr) && is_string_literal(&pattern) => {
// Safety: ensured by the guard
let DfExpr::Column(c) = *expr else {
unreachable!();
};
let DfExpr::Literal(ScalarValue::Utf8(Some(pattern))) = *pattern else {
unreachable!();
};
let p = Predicate::Like(c.name, pattern, case_insensitive);
if negated {
Some(Predicate::Not(Box::new(p)))
} else {
Some(p)
}
}
// left OP right
DfExpr::BinaryExpr(bin) => match (*bin.left, bin.op, *bin.right) {
// left == right
(DfExpr::Literal(scalar), Operator::Eq, DfExpr::Column(c))
| (DfExpr::Column(c), Operator::Eq, DfExpr::Literal(scalar)) => {
let Ok(v) = Value::try_from(scalar) else {
return None;
};
Some(Predicate::Eq(c.name, v))
}
// left != right
(DfExpr::Literal(scalar), Operator::NotEq, DfExpr::Column(c))
| (DfExpr::Column(c), Operator::NotEq, DfExpr::Literal(scalar)) => {
let Ok(v) = Value::try_from(scalar) else {
return None;
};
Some(Predicate::NotEq(c.name, v))
}
// left AND right
(left, Operator::And, right) => {
let Some(left) = Self::from_expr(left) else {
return None;
};
let Some(right) = Self::from_expr(right) else {
return None;
};
Some(Predicate::And(Box::new(left), Box::new(right)))
}
// left OR right
(left, Operator::Or, right) => {
let Some(left) = Self::from_expr(left) else {
return None;
};
let Some(right) = Self::from_expr(right) else {
return None;
};
Some(Predicate::Or(Box::new(left), Box::new(right)))
}
_ => None,
},
// [NOT] IN (LIST)
DfExpr::InList(list) => {
match (*list.expr, list.list, list.negated) {
// column [NOT] IN (v1, v2, v3, ...)
(DfExpr::Column(c), list, negated) if is_all_scalars(&list) => {
let mut values = Vec::with_capacity(list.len());
for scalar in list {
// Safety: checked by `is_all_scalars`
let DfExpr::Literal(scalar) = scalar else {
unreachable!();
};
let Ok(value) = Value::try_from(scalar) else {
return None;
};
values.push(value);
}
let predicate = Predicate::InList(c.name, values);
if negated {
Some(Predicate::Not(Box::new(predicate)))
} else {
Some(predicate)
}
}
_ => None,
}
}
_ => None,
}
}
}
/// Performs SQL `left LIKE right`; returns `None` if the evaluation fails.
/// - `s`: the target string
/// - `pattern`: the LIKE pattern, e.g. `'%abc'`
/// - `case_insensitive`: whether to perform a case-insensitive match
fn like_utf8(s: &str, pattern: &str, case_insensitive: &bool) -> Option<bool> {
let array = StringArray::from(vec![s]);
let patterns = StringArray::new_scalar(pattern);
let Ok(booleans) = (if *case_insensitive {
comparison::ilike(&array, &patterns)
} else {
comparison::like(&array, &patterns)
}) else {
return None;
};
// Safety: at least one value in result
Some(booleans.value(0))
}
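// A minimal usage sketch of `like_utf8`, assuming the semantics documented above (the pattern
// `'%abc'` matches any string that ends with "abc"; the flag switches between LIKE and ILIKE):
//
//   assert_eq!(like_utf8("hello AbC", "%abc", &true), Some(true));   // ILIKE: suffix matches
//   assert_eq!(like_utf8("hello AbC", "%abc", &false), Some(false)); // LIKE is case-sensitive
//   assert_eq!(like_utf8("bca", "%abc", &true), Some(false));        // no suffix match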
fn is_string_literal(expr: &DfExpr) -> bool {
matches!(expr, DfExpr::Literal(ScalarValue::Utf8(Some(_))))
}
fn is_column(expr: &DfExpr) -> bool {
matches!(expr, DfExpr::Column(_))
}
/// A list of predicates
pub struct Predicates {
predicates: Vec<Predicate>,
}
impl Predicates {
/// Tries its best to create predicates from the [`ScanRequest`].
pub fn from_scan_request(request: &Option<ScanRequest>) -> Predicates {
if let Some(request) = request {
let mut predicates = Vec::with_capacity(request.filters.len());
for filter in &request.filters {
if let Some(predicate) = Predicate::from_expr(filter.df_expr().clone()) {
predicates.push(predicate);
}
}
Self { predicates }
} else {
Self {
predicates: Vec::new(),
}
}
}
/// Evaluates the predicates against the row.
/// Returns `true` when all the predicates are satisfied or can't be evaluated.
pub fn eval(&self, row: &[(&str, &Value)]) -> bool {
// fast path
if self.predicates.is_empty() {
return true;
}
self.predicates
.iter()
.filter_map(|p| p.eval(row))
.all(|b| b)
}
}
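// A hypothetical caller sketch (the column name and value below are illustrative): a virtual
// table builder would derive `Predicates` from the incoming `ScanRequest` once, then evaluate
// each candidate row before pushing it into its column builders. Rows are kept whenever the
// predicates are satisfied or can't be evaluated (e.g. the filtered column is absent):
//
//   let predicates = Predicates::from_scan_request(&Some(request));
//   let row = [("table_schema", &Value::from("public"))];
//   if predicates.eval(&row) {
//       // keep the row
//   }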
/// Returns true when the values are all [`DfExpr::Literal`].
fn is_all_scalars(list: &[DfExpr]) -> bool {
list.iter().all(|v| matches!(v, DfExpr::Literal(_)))
}
#[cfg(test)]
mod tests {
use datafusion::common::{Column, ScalarValue};
use datafusion::logical_expr::expr::InList;
use datafusion::logical_expr::BinaryExpr;
use super::*;
#[test]
fn test_predicate_eval() {
let a_col = "a".to_string();
let b_col = "b".to_string();
let a_value = Value::from("a_value");
let b_value = Value::from("b_value");
let wrong_value = Value::from("wrong_value");
let a_row = [(a_col.as_str(), &a_value)];
let b_row = [("b", &wrong_value)];
let wrong_row = [(a_col.as_str(), &wrong_value)];
// Predicate::Eq
let p = Predicate::Eq(a_col.clone(), a_value.clone());
assert!(p.eval(&a_row).unwrap());
assert!(p.eval(&b_row).is_none());
assert!(!p.eval(&wrong_row).unwrap());
// Predicate::NotEq
let p = Predicate::NotEq(a_col.clone(), a_value.clone());
assert!(!p.eval(&a_row).unwrap());
assert!(p.eval(&b_row).is_none());
assert!(p.eval(&wrong_row).unwrap());
// Predicate::InList
let p = Predicate::InList(a_col.clone(), vec![a_value.clone(), b_value.clone()]);
assert!(p.eval(&a_row).unwrap());
assert!(p.eval(&b_row).is_none());
assert!(!p.eval(&wrong_row).unwrap());
assert!(p.eval(&[(&a_col, &b_value)]).unwrap());
let p1 = Predicate::Eq(a_col.clone(), a_value.clone());
let p2 = Predicate::Eq(b_col.clone(), b_value.clone());
let row = [(a_col.as_str(), &a_value), (b_col.as_str(), &b_value)];
let wrong_row = [(a_col.as_str(), &a_value), (b_col.as_str(), &wrong_value)];
// Predicate::And
let p = Predicate::And(Box::new(p1.clone()), Box::new(p2.clone()));
assert!(p.eval(&row).unwrap());
assert!(!p.eval(&wrong_row).unwrap());
assert!(p.eval(&[]).is_none());
assert!(p.eval(&[("c", &a_value)]).is_none());
assert!(!p
.eval(&[(a_col.as_str(), &b_value), (b_col.as_str(), &a_value)])
.unwrap());
assert!(!p
.eval(&[(a_col.as_str(), &b_value), (b_col.as_str(), &b_value)])
.unwrap());
assert!(p
.eval(&[(a_col.as_ref(), &a_value), ("c", &a_value)])
.is_none());
assert!(!p
.eval(&[(a_col.as_ref(), &b_value), ("c", &a_value)])
.unwrap());
// Predicate::Or
let p = Predicate::Or(Box::new(p1), Box::new(p2));
assert!(p.eval(&row).unwrap());
assert!(p.eval(&wrong_row).unwrap());
assert!(p.eval(&[]).is_none());
assert!(p.eval(&[("c", &a_value)]).is_none());
assert!(!p
.eval(&[(a_col.as_str(), &b_value), (b_col.as_str(), &a_value)])
.unwrap());
assert!(p
.eval(&[(a_col.as_str(), &b_value), (b_col.as_str(), &b_value)])
.unwrap());
assert!(p
.eval(&[(a_col.as_ref(), &a_value), ("c", &a_value)])
.unwrap());
assert!(p
.eval(&[(a_col.as_ref(), &b_value), ("c", &a_value)])
.is_none());
}
#[test]
fn test_predicate_like() {
// case insensitive
let expr = DfExpr::Like(Like {
negated: false,
expr: Box::new(column("a")),
pattern: Box::new(string_literal("%abc")),
case_insensitive: true,
escape_char: None,
});
let p = Predicate::from_expr(expr).unwrap();
assert!(
matches!(&p, Predicate::Like(c, pattern, case_insensitive) if
c == "a"
&& pattern == "%abc"
&& *case_insensitive)
);
let match_row = [
("a", &Value::from("hello AbC")),
("b", &Value::from("b value")),
];
let unmatch_row = [("a", &Value::from("bca")), ("b", &Value::from("b value"))];
assert!(p.eval(&match_row).unwrap());
assert!(!p.eval(&unmatch_row).unwrap());
assert!(p.eval(&[]).is_none());
// case sensitive
let expr = DfExpr::Like(Like {
negated: false,
expr: Box::new(column("a")),
pattern: Box::new(string_literal("%abc")),
case_insensitive: false,
escape_char: None,
});
let p = Predicate::from_expr(expr).unwrap();
assert!(
matches!(&p, Predicate::Like(c, pattern, case_insensitive) if
c == "a"
&& pattern == "%abc"
&& !*case_insensitive)
);
assert!(!p.eval(&match_row).unwrap());
assert!(!p.eval(&unmatch_row).unwrap());
assert!(p.eval(&[]).is_none());
// not like
let expr = DfExpr::Like(Like {
negated: true,
expr: Box::new(column("a")),
pattern: Box::new(string_literal("%abc")),
case_insensitive: true,
escape_char: None,
});
let p = Predicate::from_expr(expr).unwrap();
assert!(!p.eval(&match_row).unwrap());
assert!(p.eval(&unmatch_row).unwrap());
assert!(p.eval(&[]).is_none());
}
fn column(name: &str) -> DfExpr {
DfExpr::Column(Column {
relation: None,
name: name.to_string(),
})
}
fn string_literal(v: &str) -> DfExpr {
DfExpr::Literal(ScalarValue::Utf8(Some(v.to_string())))
}
fn match_string_value(v: &Value, expected: &str) -> bool {
matches!(v, Value::String(bs) if bs.as_utf8() == expected)
}
fn match_string_values(vs: &[Value], expected: &[&str]) -> bool {
assert_eq!(vs.len(), expected.len());
let mut result = true;
for (i, v) in vs.iter().enumerate() {
result = result && match_string_value(v, expected[i]);
}
result
}
fn mock_exprs() -> (DfExpr, DfExpr) {
let expr1 = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(column("a")),
op: Operator::Eq,
right: Box::new(string_literal("a_value")),
});
let expr2 = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(column("b")),
op: Operator::NotEq,
right: Box::new(string_literal("b_value")),
});
(expr1, expr2)
}
#[test]
fn test_predicate_from_expr() {
let (expr1, expr2) = mock_exprs();
let p1 = Predicate::from_expr(expr1.clone()).unwrap();
assert!(matches!(&p1, Predicate::Eq(column, v) if column == "a"
&& match_string_value(v, "a_value")));
let p2 = Predicate::from_expr(expr2.clone()).unwrap();
assert!(matches!(&p2, Predicate::NotEq(column, v) if column == "b"
&& match_string_value(v, "b_value")));
let and_expr = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(expr1.clone()),
op: Operator::And,
right: Box::new(expr2.clone()),
});
let or_expr = DfExpr::BinaryExpr(BinaryExpr {
left: Box::new(expr1.clone()),
op: Operator::Or,
right: Box::new(expr2.clone()),
});
let not_expr = DfExpr::Not(Box::new(expr1.clone()));
let and_p = Predicate::from_expr(and_expr).unwrap();
assert!(matches!(and_p, Predicate::And(left, right) if *left == p1 && *right == p2));
let or_p = Predicate::from_expr(or_expr).unwrap();
assert!(matches!(or_p, Predicate::Or(left, right) if *left == p1 && *right == p2));
let not_p = Predicate::from_expr(not_expr).unwrap();
assert!(matches!(not_p, Predicate::Not(p) if *p == p1));
let inlist_expr = DfExpr::InList(InList {
expr: Box::new(column("a")),
list: vec![string_literal("a1"), string_literal("a2")],
negated: false,
});
let inlist_p = Predicate::from_expr(inlist_expr).unwrap();
assert!(matches!(&inlist_p, Predicate::InList(c, values) if c == "a"
&& match_string_values(values, &["a1", "a2"])));
let inlist_expr = DfExpr::InList(InList {
expr: Box::new(column("a")),
list: vec![string_literal("a1"), string_literal("a2")],
negated: true,
});
let inlist_p = Predicate::from_expr(inlist_expr).unwrap();
assert!(matches!(inlist_p, Predicate::Not(p) if
matches!(&*p,
Predicate::InList(c, values) if c == "a"
&& match_string_values(values, &["a1", "a2"]))));
}
#[test]
fn test_predicates_from_scan_request() {
let predicates = Predicates::from_scan_request(&None);
assert!(predicates.predicates.is_empty());
let (expr1, expr2) = mock_exprs();
let request = ScanRequest {
filters: vec![expr1.into(), expr2.into()],
..Default::default()
};
let predicates = Predicates::from_scan_request(&Some(request));
assert_eq!(2, predicates.predicates.len());
assert!(
matches!(&predicates.predicates[0], Predicate::Eq(column, v) if column == "a"
&& match_string_value(v, "a_value"))
);
assert!(
matches!(&predicates.predicates[1], Predicate::NotEq(column, v) if column == "b"
&& match_string_value(v, "b_value"))
);
}
#[test]
fn test_predicates_eval_row() {
let wrong_row = [
("a", &Value::from("a_value")),
("b", &Value::from("b_value")),
("c", &Value::from("c_value")),
];
let row = [
("a", &Value::from("a_value")),
("b", &Value::from("not_b_value")),
("c", &Value::from("c_value")),
];
let c_row = [("c", &Value::from("c_value"))];
// test empty predicates, always returns true
let predicates = Predicates::from_scan_request(&None);
assert!(predicates.eval(&row));
assert!(predicates.eval(&wrong_row));
assert!(predicates.eval(&c_row));
let (expr1, expr2) = mock_exprs();
let request = ScanRequest {
filters: vec![expr1.into(), expr2.into()],
..Default::default()
};
let predicates = Predicates::from_scan_request(&Some(request));
assert!(predicates.eval(&row));
assert!(!predicates.eval(&wrong_row));
assert!(predicates.eval(&c_row));
}
}

View File

@@ -25,17 +25,23 @@ use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, ScalarVectorBuilder, VectorRef};
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::StringVectorBuilder;
use snafu::{OptionExt, ResultExt};
use store_api::storage::TableId;
use store_api::storage::{ScanRequest, TableId};
use super::SCHEMATA;
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::InformationTable;
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const CATALOG_NAME: &str = "catalog_name";
const SCHEMA_NAME: &str = "schema_name";
const DEFAULT_CHARACTER_SET_NAME: &str = "default_character_set_name";
const DEFAULT_COLLATION_NAME: &str = "default_collation_name";
/// The `information_schema.schemata` table implementation.
pub(super) struct InformationSchemaSchemata {
schema: SchemaRef,
@@ -54,15 +60,15 @@ impl InformationSchemaSchemata {
pub(crate) fn schema() -> SchemaRef {
Arc::new(Schema::new(vec![
ColumnSchema::new("catalog_name", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("schema_name", ConcreteDataType::string_datatype(), false),
ColumnSchema::new(CATALOG_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(SCHEMA_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(
"default_character_set_name",
DEFAULT_CHARACTER_SET_NAME,
ConcreteDataType::string_datatype(),
false,
),
ColumnSchema::new(
"default_collation_name",
DEFAULT_COLLATION_NAME,
ConcreteDataType::string_datatype(),
false,
),
@@ -92,14 +98,14 @@ impl InformationTable for InformationSchemaSchemata {
self.schema.clone()
}
fn to_stream(&self) -> Result<SendableRecordBatchStream> {
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
schema,
futures::stream::once(async move {
builder
.make_schemata()
.make_schemata(Some(request))
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)
@@ -147,12 +153,13 @@ impl InformationSchemaSchemataBuilder {
}
/// Construct the `information_schema.schemata` virtual table
async fn make_schemata(&mut self) -> Result<RecordBatch> {
async fn make_schemata(&mut self, request: Option<ScanRequest>) -> Result<RecordBatch> {
let catalog_name = self.catalog_name.clone();
let catalog_manager = self
.catalog_manager
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
if !catalog_manager
@@ -162,13 +169,24 @@ impl InformationSchemaSchemataBuilder {
continue;
}
self.add_schema(&catalog_name, &schema_name);
self.add_schema(&predicates, &catalog_name, &schema_name);
}
self.finish()
}
fn add_schema(&mut self, catalog_name: &str, schema_name: &str) {
fn add_schema(&mut self, predicates: &Predicates, catalog_name: &str, schema_name: &str) {
let row = [
(CATALOG_NAME, &Value::from(catalog_name)),
(SCHEMA_NAME, &Value::from(schema_name)),
(DEFAULT_CHARACTER_SET_NAME, &Value::from("utf8")),
(DEFAULT_COLLATION_NAME, &Value::from("utf8_bin")),
];
if !predicates.eval(&row) {
return;
}
self.catalog_names.push(Some(catalog_name));
self.schema_names.push(Some(schema_name));
self.charset_names.push(Some("utf8"));
@@ -200,7 +218,7 @@ impl DfPartitionStream for InformationSchemaSchemata {
schema,
futures::stream::once(async move {
builder
.make_schemata()
.make_schemata(None)
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)

View File

@@ -25,6 +25,16 @@ pub const COLLATIONS: &str = "collations";
pub const COLLATION_CHARACTER_SET_APPLICABILITY: &str = "collation_character_set_applicability";
pub const CHECK_CONSTRAINTS: &str = "check_constraints";
pub const EVENTS: &str = "events";
pub const KEY_COLUMN_USAGE: &str = "key_column_usage";
pub const FILES: &str = "files";
pub const SCHEMATA: &str = "schemata";
pub const KEY_COLUMN_USAGE: &str = "key_column_usage";
pub const OPTIMIZER_TRACE: &str = "optimizer_trace";
pub const PARAMETERS: &str = "parameters";
pub const PROFILING: &str = "profiling";
pub const REFERENTIAL_CONSTRAINTS: &str = "referential_constraints";
pub const ROUTINES: &str = "routines";
pub const SCHEMA_PRIVILEGES: &str = "schema_privileges";
pub const TABLE_PRIVILEGES: &str = "table_privileges";
pub const TRIGGERS: &str = "triggers";
pub const GLOBAL_STATUS: &str = "global_status";
pub const SESSION_STATUS: &str = "session_status";

View File

@@ -25,18 +25,26 @@ use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, ScalarVectorBuilder, VectorRef};
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder};
use snafu::{OptionExt, ResultExt};
use store_api::storage::TableId;
use store_api::storage::{ScanRequest, TableId};
use table::metadata::TableType;
use super::TABLES;
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::InformationTable;
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const TABLE_CATALOG: &str = "table_catalog";
const TABLE_SCHEMA: &str = "table_schema";
const TABLE_NAME: &str = "table_name";
const TABLE_TYPE: &str = "table_type";
const TABLE_ID: &str = "table_id";
const ENGINE: &str = "engine";
pub(super) struct InformationSchemaTables {
schema: SchemaRef,
catalog_name: String,
@@ -54,12 +62,12 @@ impl InformationSchemaTables {
pub(crate) fn schema() -> SchemaRef {
Arc::new(Schema::new(vec![
ColumnSchema::new("table_catalog", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("table_schema", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("table_name", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("table_type", ConcreteDataType::string_datatype(), false),
ColumnSchema::new("table_id", ConcreteDataType::uint32_datatype(), true),
ColumnSchema::new("engine", ConcreteDataType::string_datatype(), true),
ColumnSchema::new(TABLE_CATALOG, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_SCHEMA, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_NAME, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_TYPE, ConcreteDataType::string_datatype(), false),
ColumnSchema::new(TABLE_ID, ConcreteDataType::uint32_datatype(), true),
ColumnSchema::new(ENGINE, ConcreteDataType::string_datatype(), true),
]))
}
@@ -85,14 +93,14 @@ impl InformationTable for InformationSchemaTables {
self.schema.clone()
}
fn to_stream(&self) -> Result<SendableRecordBatchStream> {
fn to_stream(&self, request: ScanRequest) -> Result<SendableRecordBatchStream> {
let schema = self.schema.arrow_schema().clone();
let mut builder = self.builder();
let stream = Box::pin(DfRecordBatchStreamAdapter::new(
schema,
futures::stream::once(async move {
builder
.make_tables()
.make_tables(Some(request))
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)
@@ -142,12 +150,13 @@ impl InformationSchemaTablesBuilder {
}
/// Construct the `information_schema.tables` virtual table
async fn make_tables(&mut self) -> Result<RecordBatch> {
async fn make_tables(&mut self, request: Option<ScanRequest>) -> Result<RecordBatch> {
let catalog_name = self.catalog_name.clone();
let catalog_manager = self
.catalog_manager
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
if !catalog_manager
@@ -167,6 +176,7 @@ impl InformationSchemaTablesBuilder {
{
let table_info = table.table_info();
self.add_table(
&predicates,
&catalog_name,
&schema_name,
&table_name,
@@ -183,8 +193,10 @@ impl InformationSchemaTablesBuilder {
self.finish()
}
#[allow(clippy::too_many_arguments)]
fn add_table(
&mut self,
predicates: &Predicates,
catalog_name: &str,
schema_name: &str,
table_name: &str,
@@ -192,14 +204,27 @@ impl InformationSchemaTablesBuilder {
table_id: Option<u32>,
engine: Option<&str>,
) {
self.catalog_names.push(Some(catalog_name));
self.schema_names.push(Some(schema_name));
self.table_names.push(Some(table_name));
self.table_types.push(Some(match table_type {
let table_type = match table_type {
TableType::Base => "BASE TABLE",
TableType::View => "VIEW",
TableType::Temporary => "LOCAL TEMPORARY",
}));
};
let row = [
(TABLE_CATALOG, &Value::from(catalog_name)),
(TABLE_SCHEMA, &Value::from(schema_name)),
(TABLE_NAME, &Value::from(table_name)),
(TABLE_TYPE, &Value::from(table_type)),
];
if !predicates.eval(&row) {
return;
}
self.catalog_names.push(Some(catalog_name));
self.schema_names.push(Some(schema_name));
self.table_names.push(Some(table_name));
self.table_types.push(Some(table_type));
self.table_ids.push(table_id);
self.engines.push(engine);
}
@@ -229,7 +254,7 @@ impl DfPartitionStream for InformationSchemaTables {
schema,
futures::stream::once(async move {
builder
.make_tables()
.make_tables(None)
.await
.map(|x| x.into_df_record_batch())
.map_err(Into::into)

View File

@@ -19,17 +19,17 @@ use prometheus::*;
lazy_static! {
pub static ref METRIC_CATALOG_MANAGER_CATALOG_COUNT: IntGauge =
register_int_gauge!("catalog_catalog_count", "catalog catalog count").unwrap();
register_int_gauge!("greptime_catalog_catalog_count", "catalog catalog count").unwrap();
pub static ref METRIC_CATALOG_MANAGER_SCHEMA_COUNT: IntGauge =
register_int_gauge!("catalog_schema_count", "catalog schema count").unwrap();
register_int_gauge!("greptime_catalog_schema_count", "catalog schema count").unwrap();
pub static ref METRIC_CATALOG_MANAGER_TABLE_COUNT: IntGaugeVec = register_int_gauge_vec!(
"catalog_table_count",
"greptime_catalog_table_count",
"catalog table count",
&[METRIC_DB_LABEL]
)
.unwrap();
pub static ref METRIC_CATALOG_KV_REMOTE_GET: Histogram =
register_histogram!("catalog_kv_get_remote", "catalog kv get remote").unwrap();
register_histogram!("greptime_catalog_kv_get_remote", "catalog kv get remote").unwrap();
pub static ref METRIC_CATALOG_KV_GET: Histogram =
register_histogram!("catalog_kv_get", "catalog kv get").unwrap();
register_histogram!("greptime_catalog_kv_get", "catalog kv get").unwrap();
}

View File

@@ -16,7 +16,7 @@ use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_error::{GREPTIME_ERROR_CODE, GREPTIME_ERROR_MSG};
use common_error::{GREPTIME_DB_HEADER_ERROR_CODE, GREPTIME_DB_HEADER_ERROR_MSG};
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
use tonic::{Code, Status};
@@ -115,7 +115,7 @@ impl From<Status> for Error {
.and_then(|v| String::from_utf8(v.as_bytes().to_vec()).ok())
}
let code = get_metadata_value(&e, GREPTIME_ERROR_CODE)
let code = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_CODE)
.and_then(|s| {
if let Ok(code) = s.parse::<u32>() {
StatusCode::from_u32(code)
@@ -125,8 +125,8 @@ impl From<Status> for Error {
})
.unwrap_or(StatusCode::Unknown);
let msg =
get_metadata_value(&e, GREPTIME_ERROR_MSG).unwrap_or_else(|| e.message().to_string());
let msg = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_MSG)
.unwrap_or_else(|| e.message().to_string());
Self::Server { code, msg }
}

View File

@@ -17,27 +17,30 @@ use prometheus::*;
lazy_static! {
pub static ref METRIC_GRPC_CREATE_TABLE: Histogram =
register_histogram!("grpc_create_table", "grpc create table").unwrap();
pub static ref METRIC_GRPC_PROMQL_RANGE_QUERY: Histogram =
register_histogram!("grpc_promql_range_query", "grpc promql range query").unwrap();
register_histogram!("greptime_grpc_create_table", "grpc create table").unwrap();
pub static ref METRIC_GRPC_PROMQL_RANGE_QUERY: Histogram = register_histogram!(
"greptime_grpc_promql_range_query",
"grpc promql range query"
)
.unwrap();
pub static ref METRIC_GRPC_INSERT: Histogram =
register_histogram!("grpc_insert", "grpc insert").unwrap();
register_histogram!("greptime_grpc_insert", "grpc insert").unwrap();
pub static ref METRIC_GRPC_DELETE: Histogram =
register_histogram!("grpc_delete", "grpc delete").unwrap();
register_histogram!("greptime_grpc_delete", "grpc delete").unwrap();
pub static ref METRIC_GRPC_SQL: Histogram =
register_histogram!("grpc_sql", "grpc sql").unwrap();
register_histogram!("greptime_grpc_sql", "grpc sql").unwrap();
pub static ref METRIC_GRPC_LOGICAL_PLAN: Histogram =
register_histogram!("grpc_logical_plan", "grpc logical plan").unwrap();
register_histogram!("greptime_grpc_logical_plan", "grpc logical plan").unwrap();
pub static ref METRIC_GRPC_ALTER: Histogram =
register_histogram!("grpc_alter", "grpc alter").unwrap();
register_histogram!("greptime_grpc_alter", "grpc alter").unwrap();
pub static ref METRIC_GRPC_DROP_TABLE: Histogram =
register_histogram!("grpc_drop_table", "grpc drop table").unwrap();
register_histogram!("greptime_grpc_drop_table", "grpc drop table").unwrap();
pub static ref METRIC_GRPC_TRUNCATE_TABLE: Histogram =
register_histogram!("grpc_truncate_table", "grpc truncate table").unwrap();
register_histogram!("greptime_grpc_truncate_table", "grpc truncate table").unwrap();
pub static ref METRIC_GRPC_DO_GET: Histogram =
register_histogram!("grpc_do_get", "grpc do get").unwrap();
register_histogram!("greptime_grpc_do_get", "grpc do get").unwrap();
pub static ref METRIC_REGION_REQUEST_GRPC: HistogramVec = register_histogram_vec!(
"grpc_region_request",
"greptime_grpc_region_request",
"grpc region request",
&["request_type"]
)

View File

@@ -252,10 +252,6 @@ impl StartCommand {
.await
.context(StartFrontendSnafu)?;
instance
.build_export_metrics_task(&opts.export_metrics)
.context(StartFrontendSnafu)?;
instance
.build_servers(opts)
.await

View File

@@ -28,7 +28,7 @@ pub mod standalone;
lazy_static::lazy_static! {
static ref APP_VERSION: prometheus::IntGaugeVec =
prometheus::register_int_gauge_vec!("app_version", "app version", &["short_version", "version"]).unwrap();
prometheus::register_int_gauge_vec!("greptime_app_version", "app version", &["short_version", "version"]).unwrap();
}
#[async_trait]

View File

@@ -406,11 +406,18 @@ impl StartCommand {
opts.wal_meta.clone(),
kv_backend.clone(),
));
let table_meta_allocator =
TableMetadataAllocator::new(table_id_sequence, wal_options_allocator.clone());
let table_metadata_manager =
Self::create_table_metadata_manager(kv_backend.clone()).await?;
let table_meta_allocator = TableMetadataAllocator::new(
table_id_sequence,
wal_options_allocator.clone(),
table_metadata_manager.clone(),
);
let ddl_task_executor = Self::create_ddl_task_executor(
kv_backend.clone(),
table_metadata_manager,
procedure_manager.clone(),
datanode_manager.clone(),
table_meta_allocator,
@@ -423,10 +430,6 @@ impl StartCommand {
.await
.context(StartFrontendSnafu)?;
frontend
.build_export_metrics_task(&opts.frontend.export_metrics)
.context(StartFrontendSnafu)?;
frontend
.build_servers(opts)
.await
@@ -441,14 +444,11 @@ impl StartCommand {
}
pub async fn create_ddl_task_executor(
kv_backend: KvBackendRef,
table_metadata_manager: TableMetadataManagerRef,
procedure_manager: ProcedureManagerRef,
datanode_manager: DatanodeManagerRef,
table_meta_allocator: TableMetadataAllocator,
) -> Result<DdlTaskExecutorRef> {
let table_metadata_manager =
Self::create_table_metadata_manager(kv_backend.clone()).await?;
let ddl_task_executor: DdlTaskExecutorRef = Arc::new(
DdlManager::try_new(
procedure_manager,
@@ -464,7 +464,7 @@ impl StartCommand {
Ok(ddl_task_executor)
}
async fn create_table_metadata_manager(
pub async fn create_table_metadata_manager(
kv_backend: KvBackendRef,
) -> Result<TableMetadataManagerRef> {
let table_metadata_manager = Arc::new(TableMetadataManager::new(kv_backend));

View File

@@ -60,6 +60,26 @@ pub const INFORMATION_SCHEMA_FILES_TABLE_ID: u32 = 14;
pub const INFORMATION_SCHEMA_SCHEMATA_TABLE_ID: u32 = 15;
/// id for information_schema.KEY_COLUMN_USAGE
pub const INFORMATION_SCHEMA_KEY_COLUMN_USAGE_TABLE_ID: u32 = 16;
/// id for information_schema.OPTIMIZER_TRACE
pub const INFORMATION_SCHEMA_OPTIMIZER_TRACE_TABLE_ID: u32 = 17;
/// id for information_schema.PARAMETERS
pub const INFORMATION_SCHEMA_PARAMETERS_TABLE_ID: u32 = 18;
/// id for information_schema.PROFILING
pub const INFORMATION_SCHEMA_PROFILING_TABLE_ID: u32 = 19;
/// id for information_schema.REFERENTIAL_CONSTRAINTS
pub const INFORMATION_SCHEMA_REFERENTIAL_CONSTRAINTS_TABLE_ID: u32 = 20;
/// id for information_schema.ROUTINES
pub const INFORMATION_SCHEMA_ROUTINES_TABLE_ID: u32 = 21;
/// id for information_schema.SCHEMA_PRIVILEGES
pub const INFORMATION_SCHEMA_SCHEMA_PRIVILEGES_TABLE_ID: u32 = 22;
/// id for information_schema.TABLE_PRIVILEGES
pub const INFORMATION_SCHEMA_TABLE_PRIVILEGES_TABLE_ID: u32 = 23;
/// id for information_schema.TRIGGERS
pub const INFORMATION_SCHEMA_TRIGGERS_TABLE_ID: u32 = 24;
/// id for information_schema.GLOBAL_STATUS
pub const INFORMATION_SCHEMA_GLOBAL_STATUS_TABLE_ID: u32 = 25;
/// id for information_schema.SESSION_STATUS
pub const INFORMATION_SCHEMA_SESSION_STATUS_TABLE_ID: u32 = 26;
/// ----- End of information_schema tables -----
pub const MITO_ENGINE: &str = "mito";

View File

@@ -90,11 +90,12 @@ mod tests {
#[test]
fn test_serde_kafka_config() {
// With all fields.
let toml_str = r#"
broker_endpoints = ["127.0.0.1:9092"]
max_batch_size = "4MB"
max_batch_size = "1MB"
linger = "200ms"
produce_record_timeout = "100ms"
consumer_wait_timeout = "100ms"
backoff_init = "500ms"
backoff_max = "10s"
backoff_base = 2
@@ -104,9 +105,9 @@ mod tests {
let expected = KafkaConfig {
broker_endpoints: vec!["127.0.0.1:9092".to_string()],
compression: RsKafkaCompression::default(),
max_batch_size: ReadableSize::mb(4),
max_batch_size: ReadableSize::mb(1),
linger: Duration::from_millis(200),
produce_record_timeout: Duration::from_millis(100),
consumer_wait_timeout: Duration::from_millis(100),
backoff: KafkaBackoffConfig {
init: Duration::from_millis(500),
max: Duration::from_secs(10),
@@ -115,6 +116,19 @@ mod tests {
},
};
assert_eq!(decoded, expected);
// With some fields missing.
let toml_str = r#"
broker_endpoints = ["127.0.0.1:9092"]
linger = "200ms"
"#;
let decoded: KafkaConfig = toml::from_str(toml_str).unwrap();
let expected = KafkaConfig {
broker_endpoints: vec!["127.0.0.1:9092".to_string()],
linger: Duration::from_millis(200),
..Default::default()
};
assert_eq!(decoded, expected);
}
#[test]

View File

@@ -40,16 +40,15 @@ pub struct KafkaConfig {
pub broker_endpoints: Vec<String>,
/// The compression algorithm used to compress log entries.
#[serde(skip)]
#[serde(default)]
pub compression: RsKafkaCompression,
/// The maximum log size a kafka batch producer could buffer.
/// The max size of a single producer batch.
pub max_batch_size: ReadableSize,
/// The linger duration of a kafka batch producer.
#[serde(with = "humantime_serde")]
pub linger: Duration,
/// The maximum amount of time (in milliseconds) to wait for Kafka records to be returned.
/// The consumer wait timeout.
#[serde(with = "humantime_serde")]
pub produce_record_timeout: Duration,
pub consumer_wait_timeout: Duration,
/// The backoff config.
#[serde(flatten, with = "kafka_backoff")]
pub backoff: KafkaBackoffConfig,
@@ -60,9 +59,10 @@ impl Default for KafkaConfig {
Self {
broker_endpoints: vec!["127.0.0.1:9092".to_string()],
compression: RsKafkaCompression::NoCompression,
max_batch_size: ReadableSize::mb(4),
// Warning: Kafka has a default limit of 1MB per message in a topic.
max_batch_size: ReadableSize::mb(1),
linger: Duration::from_millis(200),
produce_record_timeout: Duration::from_millis(100),
consumer_wait_timeout: Duration::from_millis(100),
backoff: KafkaBackoffConfig::default(),
}
}
@@ -73,17 +73,15 @@ with_prefix!(pub kafka_backoff "backoff_");
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct KafkaBackoffConfig {
/// The initial backoff for kafka clients.
/// The initial backoff delay.
#[serde(with = "humantime_serde")]
pub init: Duration,
/// The maximum backoff for kafka clients.
/// The maximum backoff delay.
#[serde(with = "humantime_serde")]
pub max: Duration,
/// Exponential backoff rate, i.e. next backoff = base * current backoff.
// Sets to u32 type since some structs containing the KafkaConfig need to derive the Eq trait.
pub base: u32,
/// Stop reconnecting if the total wait time reaches the deadline.
/// If it's None, the reconnecting won't terminate.
/// The deadline of retries. `None` stands for no deadline.
#[serde(with = "humantime_serde")]
pub deadline: Option<Duration>,
}
@@ -114,7 +112,7 @@ pub struct StandaloneKafkaConfig {
pub num_partitions: i32,
/// The replication factor of each topic.
pub replication_factor: i16,
/// Above which a topic creation operation will be cancelled.
/// The timeout of topic creation.
#[serde(with = "humantime_serde")]
pub create_topic_timeout: Duration,
}

View File

@@ -19,7 +19,7 @@ pub mod format;
pub mod mock;
pub mod status_code;
pub const GREPTIME_ERROR_CODE: &str = "x-greptime-err-code";
pub const GREPTIME_ERROR_MSG: &str = "x-greptime-err-msg";
pub const GREPTIME_DB_HEADER_ERROR_CODE: &str = "x-greptime-err-code";
pub const GREPTIME_DB_HEADER_ERROR_MSG: &str = "x-greptime-err-msg";
pub use snafu;

View File

@@ -14,6 +14,7 @@ async-stream.workspace = true
async-trait.workspace = true
base64.workspace = true
bytes.workspace = true
chrono.workspace = true
common-catalog.workspace = true
common-config.workspace = true
common-error.workspace = true
@@ -27,6 +28,7 @@ common-time.workspace = true
datatypes.workspace = true
derive_builder.workspace = true
etcd-client.workspace = true
futures-util.workspace = true
futures.workspace = true
humantime-serde.workspace = true
lazy_static.workspace = true
@@ -51,3 +53,4 @@ chrono.workspace = true
common-procedure = { workspace = true, features = ["testing"] }
datatypes.workspace = true
hyper = { version = "0.14", features = ["full"] }
uuid.workspace = true

View File

@@ -40,9 +40,7 @@ use table::requests::AlterKind;
use crate::cache_invalidator::Context;
use crate::ddl::utils::handle_operate_region_error;
use crate::ddl::DdlContext;
use crate::error::{
self, ConvertAlterTableRequestSnafu, InvalidProtoMsgSnafu, Result, TableRouteNotFoundSnafu,
};
use crate::error::{self, ConvertAlterTableRequestSnafu, InvalidProtoMsgSnafu, Result};
use crate::key::table_info::TableInfoValue;
use crate::key::table_name::TableNameKey;
use crate::key::DeserializedValueWithBytes;
@@ -65,6 +63,7 @@ impl AlterTableProcedure {
cluster_id: u64,
task: AlterTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
physical_table_name: Option<TableName>,
context: DdlContext,
) -> Result<Self> {
let alter_kind = task
@@ -84,7 +83,13 @@ impl AlterTableProcedure {
Ok(Self {
context,
data: AlterTableData::new(task, table_info_value, cluster_id, next_column_id),
data: AlterTableData::new(
task,
table_info_value,
physical_table_name,
cluster_id,
next_column_id,
),
kind,
})
}
@@ -182,23 +187,19 @@ impl AlterTableProcedure {
pub async fn submit_alter_region_requests(&mut self) -> Result<Status> {
let table_id = self.data.table_id();
let table_route = self
let (_, physical_table_route) = self
.context
.table_metadata_manager
.table_route_manager()
.get(table_id)
.await?
.context(TableRouteNotFoundSnafu { table_id })?
.into_inner();
let region_routes = table_route.region_routes()?;
.get_physical_table_route(table_id)
.await?;
let leaders = find_leaders(region_routes);
let leaders = find_leaders(&physical_table_route.region_routes);
let mut alter_region_tasks = Vec::with_capacity(leaders.len());
for datanode in leaders {
let requester = self.context.datanode_manager.datanode(&datanode).await;
let regions = find_leader_regions(region_routes, &datanode);
let regions = find_leader_regions(&physical_table_route.region_routes, &datanode);
for region in regions {
let region_id = RegionId::new(table_id, region);
@@ -335,13 +336,24 @@ impl AlterTableProcedure {
}
fn lock_key_inner(&self) -> Vec<String> {
let mut lock_key = vec![];
if let Some(physical_table_name) = self.data.physical_table_name() {
let physical_table_key = common_catalog::format_full_table_name(
&physical_table_name.catalog_name,
&physical_table_name.schema_name,
&physical_table_name.table_name,
);
lock_key.push(physical_table_key);
}
let table_ref = self.data.table_ref();
let table_key = common_catalog::format_full_table_name(
table_ref.catalog,
table_ref.schema,
table_ref.table,
);
let mut lock_key = vec![table_key];
lock_key.push(table_key);
if let Ok(Kind::RenameTable(RenameTable { new_table_name })) = self.alter_kind() {
lock_key.push(common_catalog::format_full_table_name(
@@ -394,7 +406,7 @@ impl Procedure for AlterTableProcedure {
fn lock_key(&self) -> LockKey {
let key = self.lock_key_inner();
LockKey::new(key)
LockKey::new_exclusive(key)
}
}
@@ -415,6 +427,8 @@ pub struct AlterTableData {
task: AlterTableTask,
/// Table info value before alteration.
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
/// Physical table name, if the table to alter is a logical table.
physical_table_name: Option<TableName>,
cluster_id: u64,
/// Next column id of the table if the task adds columns to the table.
next_column_id: Option<ColumnId>,
@@ -424,6 +438,7 @@ impl AlterTableData {
pub fn new(
task: AlterTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
physical_table_name: Option<TableName>,
cluster_id: u64,
next_column_id: Option<ColumnId>,
) -> Self {
@@ -431,6 +446,7 @@ impl AlterTableData {
state: AlterTableState::Prepare,
task,
table_info_value,
physical_table_name,
cluster_id,
next_column_id,
}
@@ -447,6 +463,10 @@ impl AlterTableData {
fn table_info(&self) -> &RawTableInfo {
&self.table_info_value.table_info
}
fn physical_table_name(&self) -> Option<&TableName> {
self.physical_table_name.as_ref()
}
}
/// Creates region proto alter kind from `table_info` and `alter_kind`.

View File

@@ -20,7 +20,6 @@ use api::v1::region::{
};
use api::v1::{ColumnDef, SemanticType};
use async_trait::async_trait;
use common_config::WAL_OPTIONS_KEY;
use common_error::ext::BoxedError;
use common_procedure::error::{
ExternalSnafu, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu,
@@ -48,6 +47,7 @@ use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::{
find_leader_regions, find_leaders, operating_leader_regions, RegionRoute,
};
use crate::wal::prepare_wal_option;
pub struct CreateTableProcedure {
pub context: DdlContext,
@@ -349,7 +349,7 @@ impl Procedure for CreateTableProcedure {
table_ref.table,
);
LockKey::single(key)
LockKey::single_exclusive(key)
}
}
@@ -455,13 +455,7 @@ impl CreateRequestBuilder {
request.region_id = region_id.as_u64();
request.path = storage_path;
// Stores the encoded wal options into the request options.
region_wal_options
.get(&region_id.region_number())
.and_then(|wal_options| {
request
.options
.insert(WAL_OPTIONS_KEY.to_string(), wal_options.clone())
});
prepare_wal_option(&mut request.options, region_id, region_wal_options);
if let Some(physical_table_id) = self.physical_table_id {
// Logical table has the same region numbers with physical table, and they have a one-to-one mapping.

View File

@@ -273,7 +273,7 @@ impl Procedure for DropTableProcedure {
table_ref.table,
);
LockKey::single(key)
LockKey::single_exclusive(key)
}
}

View File

@@ -18,21 +18,26 @@ use std::sync::Arc;
use async_trait::async_trait;
use common_catalog::consts::METRIC_ENGINE;
use common_telemetry::{debug, info};
use snafu::ensure;
use snafu::{ensure, OptionExt};
use store_api::metric_engine_consts::LOGICAL_TABLE_METADATA_KEY;
use store_api::storage::{RegionId, RegionNumber, TableId};
use crate::ddl::{TableMetadata, TableMetadataAllocatorContext};
use crate::error::{Result, UnsupportedSnafu};
use crate::error::{Result, TableNotFoundSnafu, UnsupportedSnafu};
use crate::key::table_name::TableNameKey;
use crate::key::table_route::{LogicalTableRouteValue, PhysicalTableRouteValue, TableRouteValue};
use crate::key::TableMetadataManagerRef;
use crate::peer::Peer;
use crate::rpc::ddl::CreateTableTask;
use crate::rpc::router::{Region, RegionRoute};
use crate::sequence::SequenceRef;
use crate::wal::{allocate_region_wal_options, WalOptionsAllocatorRef};
#[derive(Clone)]
pub struct TableMetadataAllocator {
table_id_sequence: SequenceRef,
wal_options_allocator: WalOptionsAllocatorRef,
table_metadata_manager: TableMetadataManagerRef,
peer_allocator: PeerAllocatorRef,
}
@@ -40,10 +45,12 @@ impl TableMetadataAllocator {
pub fn new(
table_id_sequence: SequenceRef,
wal_options_allocator: WalOptionsAllocatorRef,
table_metadata_manager: TableMetadataManagerRef,
) -> Self {
Self::with_peer_allocator(
table_id_sequence,
wal_options_allocator,
table_metadata_manager,
Arc::new(NoopPeerAllocator),
)
}
@@ -51,11 +58,13 @@ impl TableMetadataAllocator {
pub fn with_peer_allocator(
table_id_sequence: SequenceRef,
wal_options_allocator: WalOptionsAllocatorRef,
table_metadata_manager: TableMetadataManagerRef,
peer_allocator: PeerAllocatorRef,
) -> Self {
Self {
table_id_sequence,
wal_options_allocator,
table_metadata_manager,
peer_allocator,
}
}
@@ -115,8 +124,31 @@ impl TableMetadataAllocator {
) -> Result<TableRouteValue> {
let regions = task.partitions.len();
let table_route = if task.create_table.engine == METRIC_ENGINE {
TableRouteValue::Logical(LogicalTableRouteValue {})
let table_route = if task.create_table.engine == METRIC_ENGINE
&& let Some(physical_table_name) = task
.create_table
.table_options
.get(LOGICAL_TABLE_METADATA_KEY)
{
let physical_table_id = self
.table_metadata_manager
.table_name_manager()
.get(TableNameKey::new(
&task.create_table.catalog_name,
&task.create_table.schema_name,
physical_table_name,
))
.await?
.context(TableNotFoundSnafu {
table_name: physical_table_name,
})?
.table_id();
let region_ids = (0..regions)
.map(|i| RegionId::new(table_id, i as RegionNumber))
.collect();
TableRouteValue::Logical(LogicalTableRouteValue::new(physical_table_id, region_ids))
} else {
let peers = self.peer_allocator.alloc(ctx, regions).await?;
@@ -144,6 +176,7 @@ impl TableMetadataAllocator {
};
Ok(table_route)
}
pub async fn create(
&self,
ctx: &TableMetadataAllocatorContext,

View File

@@ -81,7 +81,7 @@ impl Procedure for TruncateTableProcedure {
table_ref.table,
);
LockKey::single(key)
LockKey::single_exclusive(key)
}
}

View File

@@ -46,6 +46,8 @@ use crate::rpc::ddl::{
TruncateTableTask,
};
use crate::rpc::router::RegionRoute;
use crate::table_name::TableName;
pub type DdlManagerRef = Arc<DdlManager>;
/// The [DdlManager] provides the ability to execute Ddl.
@@ -160,11 +162,17 @@ impl DdlManager {
cluster_id: u64,
alter_table_task: AlterTableTask,
table_info_value: DeserializedValueWithBytes<TableInfoValue>,
physical_table_name: Option<TableName>,
) -> Result<ProcedureId> {
let context = self.create_context();
let procedure =
AlterTableProcedure::new(cluster_id, alter_table_task, table_info_value, context)?;
let procedure = AlterTableProcedure::new(
cluster_id,
alter_table_task,
table_info_value,
physical_table_name,
context,
)?;
let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure));
@@ -327,8 +335,38 @@ async fn handle_alter_table_task(
table_name: table_ref.to_string(),
})?;
let physical_table_id = ddl_manager
.table_metadata_manager()
.table_route_manager()
.get_physical_table_id(table_id)
.await?;
let physical_table_name = if physical_table_id == table_id {
None
} else {
let physical_table_info = &ddl_manager
.table_metadata_manager()
.table_info_manager()
.get(physical_table_id)
.await?
.with_context(|| error::TableInfoNotFoundSnafu {
table_name: table_ref.to_string(),
})?
.table_info;
Some(TableName {
catalog_name: physical_table_info.catalog_name.clone(),
schema_name: physical_table_info.schema_name.clone(),
table_name: physical_table_info.name.clone(),
})
};
let id = ddl_manager
.submit_alter_table_task(cluster_id, alter_table_task, table_info_value)
.submit_alter_table_task(
cluster_id,
alter_table_task,
table_info_value,
physical_table_name,
)
.await?;
info!("Table: {table_id} is altered via procedure_id {id:?}");
@@ -495,8 +533,9 @@ mod tests {
Arc::new(DummyCacheInvalidator),
table_metadata_manager,
TableMetadataAllocator::new(
Arc::new(SequenceBuilder::new("test", kv_backend).build()),
Arc::new(SequenceBuilder::new("test", kv_backend.clone()).build()),
Arc::new(WalOptionsAllocator::default()),
Arc::new(TableMetadataManager::new(kv_backend)),
),
Arc::new(MemoryRegionKeeper::default()),
);

View File

@@ -92,13 +92,15 @@ impl Display for OpenRegion {
}
}
#[serde_with::serde_as]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct OpenRegion {
pub region_ident: RegionIdent,
pub region_storage_path: String,
pub region_options: HashMap<String, String>,
#[serde(default)]
pub region_wal_options: HashMap<String, String>,
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
pub region_wal_options: HashMap<RegionNumber, String>,
#[serde(default)]
pub skip_wal_replay: bool,
}
@@ -108,7 +110,7 @@ impl OpenRegion {
region_ident: RegionIdent,
path: &str,
region_options: HashMap<String, String>,
region_wal_options: HashMap<String, String>,
region_wal_options: HashMap<RegionNumber, String>,
skip_wal_replay: bool,
) -> Self {
Self {

View File

@@ -427,7 +427,7 @@ impl TableMetadataManager {
&region_storage_path,
region_options,
region_wal_options,
region_distribution(&x.region_routes)?,
region_distribution(&x.region_routes),
)?;
txn = txn.merge(create_datanode_table_txn);
}
@@ -483,7 +483,7 @@ impl TableMetadataManager {
.build_delete_txn(table_id, table_info_value)?;
// Deletes datanode table key value pairs.
let distribution = region_distribution(table_route_value.region_routes()?)?;
let distribution = region_distribution(table_route_value.region_routes()?);
let delete_datanode_txn = self
.datanode_table_manager()
.build_delete_txn(table_id, distribution)?;
@@ -604,12 +604,12 @@ impl TableMetadataManager {
current_table_route_value: &DeserializedValueWithBytes<TableRouteValue>,
new_region_routes: Vec<RegionRoute>,
new_region_options: &HashMap<String, String>,
new_region_wal_options: &HashMap<String, String>,
new_region_wal_options: &HashMap<RegionNumber, String>,
) -> Result<()> {
// Updates the datanode table key value pairs.
let current_region_distribution =
region_distribution(current_table_route_value.region_routes()?)?;
let new_region_distribution = region_distribution(&new_region_routes)?;
region_distribution(current_table_route_value.region_routes()?);
let new_region_distribution = region_distribution(&new_region_routes);
let update_datanode_table_txn = self.datanode_table_manager().build_update_txn(
table_id,
@@ -1191,7 +1191,7 @@ mod tests {
table_id: u32,
region_routes: &[RegionRoute],
) {
let region_distribution = region_distribution(region_routes).unwrap();
let region_distribution = region_distribution(region_routes);
for (datanode, regions) in region_distribution {
let got = table_metadata_manager
.datanode_table_manager()

View File

@@ -34,6 +34,7 @@ use crate::rpc::store::RangeRequest;
use crate::rpc::KeyValue;
use crate::DatanodeId;
#[serde_with::serde_as]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
/// RegionInfo
/// For compatible reason, DON'T modify the field name.
@@ -48,14 +49,15 @@ pub struct RegionInfo {
#[serde(default)]
pub region_options: HashMap<String, String>,
/// The per-region wal options.
/// Key: region number (in string representation). Value: the encoded wal options of the region.
/// Key: region number. Value: the encoded wal options of the region.
#[serde(default)]
pub region_wal_options: HashMap<String, String>,
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
pub region_wal_options: HashMap<RegionNumber, String>,
}
pub struct DatanodeTableKey {
datanode_id: DatanodeId,
table_id: TableId,
pub datanode_id: DatanodeId,
pub table_id: TableId,
}
impl DatanodeTableKey {
@@ -181,7 +183,7 @@ impl DatanodeTableManager {
.filter_map(|region_number| {
region_wal_options
.get(region_number)
.map(|wal_options| (region_number.to_string(), wal_options.clone()))
.map(|wal_options| (*region_number, wal_options.clone()))
})
.collect();
@@ -214,7 +216,7 @@ impl DatanodeTableManager {
current_region_distribution: RegionDistribution,
new_region_distribution: RegionDistribution,
new_region_options: &HashMap<String, String>,
new_region_wal_options: &HashMap<String, String>,
new_region_wal_options: &HashMap<RegionNumber, String>,
) -> Result<Txn> {
let mut opts = Vec::new();
@@ -306,6 +308,61 @@ mod tests {
assert!(parsed.is_ok());
}
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct StringHashMap {
inner: HashMap<String, String>,
}
#[serde_with::serde_as]
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct IntegerHashMap {
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
inner: HashMap<u32, String>,
}
#[test]
fn test_serde_with_integer_hash_map() {
let map = StringHashMap {
inner: HashMap::from([
("1".to_string(), "aaa".to_string()),
("2".to_string(), "bbb".to_string()),
("3".to_string(), "ccc".to_string()),
]),
};
let encoded = serde_json::to_string(&map).unwrap();
let decoded: IntegerHashMap = serde_json::from_str(&encoded).unwrap();
assert_eq!(
IntegerHashMap {
inner: HashMap::from([
(1, "aaa".to_string()),
(2, "bbb".to_string()),
(3, "ccc".to_string()),
]),
},
decoded
);
let map = IntegerHashMap {
inner: HashMap::from([
(1, "aaa".to_string()),
(2, "bbb".to_string()),
(3, "ccc".to_string()),
]),
};
let encoded = serde_json::to_string(&map).unwrap();
let decoded: StringHashMap = serde_json::from_str(&encoded).unwrap();
assert_eq!(
StringHashMap {
inner: HashMap::from([
("1".to_string(), "aaa".to_string()),
("2".to_string(), "bbb".to_string()),
("3".to_string(), "ccc".to_string()),
]),
},
decoded
);
}
// This test intends to ensure both the `serde_json::to_string` + `serde_json::from_str`
// and `serde_json::to_vec` + `serde_json::from_slice` work for `DatanodeTableValue`.
// Warning: if the key of `region_wal_options` is of type non-String, this test would fail.
@@ -320,9 +377,9 @@ mod tests {
("c".to_string(), "cc".to_string()),
]),
region_wal_options: HashMap::from([
("1".to_string(), "aaa".to_string()),
("2".to_string(), "bbb".to_string()),
("3".to_string(), "ccc".to_string()),
(1, "aaa".to_string()),
(2, "bbb".to_string()),
(3, "ccc".to_string()),
]),
};
let table_value = DatanodeTableValue {

View File

@@ -16,12 +16,14 @@ use std::collections::HashMap;
use std::fmt::Display;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::{RegionId, RegionNumber};
use table::metadata::TableId;
use super::{DeserializedValueWithBytes, TableMetaValue};
use crate::error::{Result, SerdeJsonSnafu, UnexpectedLogicalRouteTableSnafu};
use crate::error::{
Result, SerdeJsonSnafu, TableRouteNotFoundSnafu, UnexpectedLogicalRouteTableSnafu,
};
use crate::key::{to_removed_key, RegionDistribution, TableMetaKey, TABLE_ROUTE_PREFIX};
use crate::kv_backend::txn::{Compare, CompareOp, Txn, TxnOp, TxnOpResponse};
use crate::kv_backend::KvBackendRef;
@@ -53,7 +55,8 @@ pub struct PhysicalTableRouteValue {
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
pub struct LogicalTableRouteValue {
// TODO(LFC): Add table route for MetricsEngine table.
physical_table_id: TableId,
region_ids: Vec<RegionId>,
}
impl TableRouteValue {
@@ -66,7 +69,7 @@ impl TableRouteValue {
ensure!(
self.is_physical(),
UnexpectedLogicalRouteTableSnafu {
err_msg: "{self:?} is a non-physical TableRouteValue.",
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
}
);
let version = self.physical_table_route().version;
@@ -84,18 +87,20 @@ impl TableRouteValue {
ensure!(
self.is_physical(),
UnexpectedLogicalRouteTableSnafu {
err_msg: "{self:?} is a non-physical TableRouteValue.",
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
}
);
Ok(self.physical_table_route().version)
}
/// Returns the corresponding [RegionRoute].
/// Returns the corresponding [RegionRoute]; returns `None` if the specific region is not found.
///
/// Note: It throws an error if it's a logical table
pub fn region_route(&self, region_id: RegionId) -> Result<Option<RegionRoute>> {
ensure!(
self.is_physical(),
UnexpectedLogicalRouteTableSnafu {
err_msg: "{self:?} is a non-physical TableRouteValue.",
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
}
);
Ok(self
@@ -116,7 +121,7 @@ impl TableRouteValue {
ensure!(
self.is_physical(),
UnexpectedLogicalRouteTableSnafu {
err_msg: "{self:?} is a non-physical TableRouteValue.",
err_msg: format!("{self:?} is a non-physical TableRouteValue."),
}
);
Ok(&self.physical_table_route().region_routes)
@@ -174,12 +179,19 @@ impl PhysicalTableRouteValue {
}
impl LogicalTableRouteValue {
pub fn physical_table_id(&self) -> TableId {
todo!()
pub fn new(physical_table_id: TableId, region_ids: Vec<RegionId>) -> Self {
Self {
physical_table_id,
region_ids,
}
}
pub fn region_ids(&self) -> Vec<RegionId> {
todo!()
pub fn physical_table_id(&self) -> TableId {
self.physical_table_id
}
pub fn region_ids(&self) -> &Vec<RegionId> {
&self.region_ids
}
}
@@ -324,6 +336,54 @@ impl TableRouteManager {
.transpose()
}
pub async fn get_physical_table_id(
&self,
logical_or_physical_table_id: TableId,
) -> Result<TableId> {
let table_route = self
.get(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
})?
.into_inner();
match table_route {
TableRouteValue::Physical(_) => Ok(logical_or_physical_table_id),
TableRouteValue::Logical(x) => Ok(x.physical_table_id()),
}
}
pub async fn get_physical_table_route(
&self,
logical_or_physical_table_id: TableId,
) -> Result<(TableId, PhysicalTableRouteValue)> {
let table_route = self
.get(logical_or_physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: logical_or_physical_table_id,
})?
.into_inner();
match table_route {
TableRouteValue::Physical(x) => Ok((logical_or_physical_table_id, x)),
TableRouteValue::Logical(x) => {
let physical_table_id = x.physical_table_id();
let physical_table_route =
self.get(physical_table_id)
.await?
.context(TableRouteNotFoundSnafu {
table_id: physical_table_id,
})?;
Ok((
physical_table_id,
physical_table_route.physical_table_route().clone(),
))
}
}
}
/// It may return a subset of the `table_ids`.
pub async fn batch_get(
&self,
@@ -376,7 +436,7 @@ impl TableRouteManager {
) -> Result<Option<RegionDistribution>> {
self.get(table_id)
.await?
.map(|table_route| region_distribution(table_route.region_routes()?))
.map(|table_route| Ok(region_distribution(table_route.region_routes()?)))
.transpose()
}
}

View File

@@ -15,6 +15,7 @@
#![feature(assert_matches)]
#![feature(btree_extract_if)]
#![feature(async_closure)]
#![feature(let_chains)]
pub mod cache_invalidator;
pub mod datanode_manager;
@@ -35,7 +36,6 @@ pub mod sequence;
pub mod state_store;
pub mod table_name;
pub mod util;
#[allow(unused)]
pub mod wal;
pub type ClusterId = u64;

View File

@@ -16,36 +16,43 @@ use lazy_static::lazy_static;
use prometheus::*;
lazy_static! {
pub static ref METRIC_META_TXN_REQUEST: HistogramVec =
register_histogram_vec!("meta_txn_request", "meta txn request", &["target", "op"]).unwrap();
pub static ref METRIC_META_TXN_REQUEST: HistogramVec = register_histogram_vec!(
"greptime_meta_txn_request",
"meta txn request",
&["target", "op"]
)
.unwrap();
pub static ref METRIC_META_CREATE_CATALOG: Histogram =
register_histogram!("meta_create_catalog", "meta create catalog").unwrap();
pub static ref METRIC_META_CREATE_CATALOG_COUNTER: IntCounter =
register_int_counter!("meta_create_catalog_counter", "meta create catalog").unwrap();
register_histogram!("greptime_meta_create_catalog", "meta create catalog").unwrap();
pub static ref METRIC_META_CREATE_CATALOG_COUNTER: IntCounter = register_int_counter!(
"greptime_meta_create_catalog_counter",
"meta create catalog"
)
.unwrap();
pub static ref METRIC_META_CREATE_SCHEMA: Histogram =
register_histogram!("meta_create_schema", "meta create schema").unwrap();
register_histogram!("greptime_meta_create_schema", "meta create schema").unwrap();
pub static ref METRIC_META_CREATE_SCHEMA_COUNTER: IntCounter =
register_int_counter!("meta_create_schema_counter", "meta create schema").unwrap();
register_int_counter!("greptime_meta_create_schema_counter", "meta create schema").unwrap();
pub static ref METRIC_META_PROCEDURE_CREATE_TABLE: HistogramVec = register_histogram_vec!(
"meta_procedure_create_table",
"greptime_meta_procedure_create_table",
"meta procedure create table",
&["step"]
)
.unwrap();
pub static ref METRIC_META_PROCEDURE_DROP_TABLE: HistogramVec = register_histogram_vec!(
"meta_procedure_drop_table",
"greptime_meta_procedure_drop_table",
"meta procedure drop table",
&["step"]
)
.unwrap();
pub static ref METRIC_META_PROCEDURE_ALTER_TABLE: HistogramVec = register_histogram_vec!(
"meta_procedure_alter_table",
"greptime_meta_procedure_alter_table",
"meta procedure alter table",
&["step"]
)
.unwrap();
pub static ref METRIC_META_PROCEDURE_TRUNCATE_TABLE: HistogramVec = register_histogram_vec!(
"meta_procedure_truncate_table",
"greptime_meta_procedure_truncate_table",
"meta procedure truncate table",
&["step"]
)

View File

@@ -30,7 +30,7 @@ use crate::peer::Peer;
use crate::table_name::TableName;
use crate::DatanodeId;
pub fn region_distribution(region_routes: &[RegionRoute]) -> Result<RegionDistribution> {
pub fn region_distribution(region_routes: &[RegionRoute]) -> RegionDistribution {
let mut regions_id_map = RegionDistribution::new();
for route in region_routes.iter() {
if let Some(peer) = route.leader_peer.as_ref() {
@@ -42,7 +42,7 @@ pub fn region_distribution(region_routes: &[RegionRoute]) -> Result<RegionDistri
// id asc
regions.sort()
}
Ok(regions_id_map)
regions_id_map
}
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)]
@@ -123,11 +123,12 @@ pub fn convert_to_region_leader_status_map(
pub fn find_region_leader(
region_routes: &[RegionRoute],
region_number: RegionNumber,
) -> Option<&Peer> {
) -> Option<Peer> {
region_routes
.iter()
.find(|x| x.region.id.region_number() == region_number)
.and_then(|r| r.leader_peer.as_ref())
.cloned()
}
pub fn find_leader_regions(region_routes: &[RegionRoute], datanode: &Peer) -> Vec<RegionNumber> {

View File

@@ -18,10 +18,10 @@ pub mod options_allocator;
use std::collections::HashMap;
use common_config::wal::StandaloneWalConfig;
use common_config::WAL_OPTIONS_KEY;
use serde::{Deserialize, Serialize};
use serde_with::with_prefix;
use store_api::storage::{RegionId, RegionNumber};
use crate::error::Result;
use crate::wal::kafka::KafkaConfig;
pub use crate::wal::kafka::Topic as KafkaWalTopic;
pub use crate::wal::options_allocator::{
@@ -40,7 +40,7 @@ pub enum WalConfig {
impl From<StandaloneWalConfig> for WalConfig {
fn from(value: StandaloneWalConfig) -> Self {
match value {
StandaloneWalConfig::RaftEngine(config) => WalConfig::RaftEngine,
StandaloneWalConfig::RaftEngine(_) => WalConfig::RaftEngine,
StandaloneWalConfig::Kafka(config) => WalConfig::Kafka(KafkaConfig {
broker_endpoints: config.base.broker_endpoints,
num_topics: config.num_topics,
@@ -55,6 +55,16 @@ impl From<StandaloneWalConfig> for WalConfig {
}
}
pub fn prepare_wal_option(
options: &mut HashMap<String, String>,
region_id: RegionId,
region_wal_options: &HashMap<RegionNumber, String>,
) {
if let Some(wal_options) = region_wal_options.get(&region_id.region_number()) {
options.insert(WAL_OPTIONS_KEY.to_string(), wal_options.clone());
}
}
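`prepare_wal_option` copies a region's serialized WAL options, if present, into its option map under `WAL_OPTIONS_KEY`; the datanode open paths below switch to it instead of doing the lookup inline. A small usage sketch, with the table id, region number, and callers assumed for illustration:

use std::collections::HashMap;

use common_meta::wal::prepare_wal_option;
use store_api::storage::{RegionId, RegionNumber};

fn fill_region_options(
    region_wal_options: &HashMap<RegionNumber, String>,
) -> HashMap<String, String> {
    let mut options = HashMap::new();
    // Hypothetical region: region number 2 of table 1024. If the map holds an entry
    // for region 2, it lands in `options` under WAL_OPTIONS_KEY; otherwise `options`
    // is left untouched.
    prepare_wal_option(&mut options, RegionId::new(1024, 2), region_wal_options);
    options
}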
#[cfg(test)]
mod tests {
use std::time::Duration;

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(any(test, feature = "testing"))]
pub mod test_util;
pub mod topic;
pub mod topic_manager;
pub mod topic_selector;
@@ -19,7 +21,6 @@ pub mod topic_selector;
use std::time::Duration;
use common_config::wal::kafka::{kafka_backoff, KafkaBackoffConfig, TopicSelectorType};
use common_config::wal::StandaloneWalConfig;
use serde::{Deserialize, Serialize};
pub use crate::wal::kafka::topic::Topic;
@@ -41,7 +42,7 @@ pub struct KafkaConfig {
pub num_partitions: i32,
/// The replication factor of each topic.
pub replication_factor: i16,
/// Above which a topic creation operation will be cancelled.
/// The timeout of topic creation.
#[serde(with = "humantime_serde")]
pub create_topic_timeout: Duration,
/// The backoff config.

View File

@@ -0,0 +1,33 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use common_telemetry::warn;
use futures_util::future::BoxFuture;
pub async fn run_test_with_kafka_wal<F>(test: F)
where
F: FnOnce(Vec<String>) -> BoxFuture<'static, ()>,
{
let Ok(endpoints) = std::env::var("GT_KAFKA_ENDPOINTS") else {
warn!("The endpoints is empty, skipping the test");
return;
};
let endpoints = endpoints
.split(',')
.map(|s| s.trim().to_string())
.collect::<Vec<_>>();
test(endpoints).await
}
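A test opts into the Kafka WAL environment by wrapping its body in this helper; when `GT_KAFKA_ENDPOINTS` is unset the body never runs. A hedged sketch of a caller (the test name and assertion are illustrative):

use crate::wal::kafka::test_util::run_test_with_kafka_wal;

#[tokio::test]
async fn test_kafka_wal_example() {
    run_test_with_kafka_wal(|broker_endpoints| {
        Box::pin(async move {
            // The helper only invokes the body when at least one endpoint is configured.
            assert!(!broker_endpoints.is_empty());
        })
    })
    .await;
}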

View File

@@ -15,4 +15,5 @@
/// Kafka wal topic.
/// Publishers publish log entries to the topic while subscribers pull log entries from the topic.
/// A topic is simply a string right now. But it may be more complex in the future.
// TODO(niebayes): remove the Topic alias.
pub type Topic = String;

View File

@@ -14,10 +14,9 @@
use std::collections::HashSet;
use std::sync::Arc;
use std::time::Duration;
use common_config::wal::kafka::TopicSelectorType;
use common_telemetry::{debug, error, info};
use common_telemetry::{error, info};
use rskafka::client::controller::ControllerClient;
use rskafka::client::error::Error as RsKafkaError;
use rskafka::client::error::ProtocolError::TopicAlreadyExists;
@@ -25,7 +24,7 @@ use rskafka::client::partition::{Compression, UnknownTopicHandling};
use rskafka::client::{Client, ClientBuilder};
use rskafka::record::Record;
use rskafka::BackoffConfig;
use snafu::{ensure, AsErrorSource, ResultExt};
use snafu::{ensure, ResultExt};
use crate::error::{
BuildKafkaClientSnafu, BuildKafkaCtrlClientSnafu, BuildKafkaPartitionClientSnafu,
@@ -47,9 +46,8 @@ const DEFAULT_PARTITION: i32 = 0;
/// Manages topic initialization and selection.
pub struct TopicManager {
config: KafkaConfig,
// TODO(niebayes): maybe add a guard to ensure all topics in the topic pool are created.
topic_pool: Vec<Topic>,
topic_selector: TopicSelectorRef,
pub(crate) topic_pool: Vec<Topic>,
pub(crate) topic_selector: TopicSelectorRef,
kv_backend: KvBackendRef,
}
@@ -168,7 +166,7 @@ impl TopicManager {
vec![Record {
key: None,
value: None,
timestamp: rskafka::chrono::Utc::now(),
timestamp: chrono::Utc::now(),
headers: Default::default(),
}],
Compression::NoCompression,
@@ -240,13 +238,9 @@ impl TopicManager {
#[cfg(test)]
mod tests {
use std::env;
use common_telemetry::info;
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::kv_backend::{self};
use crate::wal::kafka::test_util::run_test_with_kafka_wal;
// Tests that topics can be successfully persisted into the kv backend and can be successfully restored from the kv backend.
#[tokio::test]
@@ -273,26 +267,60 @@ mod tests {
assert_eq!(topics, restored_topics);
}
/// Tests that the topic manager could allocate topics correctly.
#[tokio::test]
async fn test_topic_manager() {
let endpoints = env::var("GT_KAFKA_ENDPOINTS").unwrap_or_default();
common_telemetry::init_default_ut_logging();
async fn test_alloc_topics() {
run_test_with_kafka_wal(|broker_endpoints| {
Box::pin(async {
// Constructs topics that should be created.
let topics = (0..256)
.map(|i| format!("test_alloc_topics_{}_{}", i, uuid::Uuid::new_v4()))
.collect::<Vec<_>>();
if endpoints.is_empty() {
info!("The endpoints is empty, skipping the test.");
return;
}
// TODO: supports topic prefix
let kv_backend = Arc::new(MemoryKvBackend::new());
let config = KafkaConfig {
replication_factor: 1,
broker_endpoints: endpoints
.split(',')
.map(|s| s.to_string())
.collect::<Vec<_>>(),
..Default::default()
};
let manager = TopicManager::new(config, kv_backend);
manager.start().await.unwrap();
// Creates a topic manager.
let config = KafkaConfig {
replication_factor: broker_endpoints.len() as i16,
broker_endpoints,
..Default::default()
};
let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
let mut manager = TopicManager::new(config.clone(), kv_backend);
// Replaces the default topic pool with the constructed topics.
manager.topic_pool = topics.clone();
// Replaces the default selector with a round-robin selector without shuffling.
manager.topic_selector = Arc::new(RoundRobinTopicSelector::default());
manager.start().await.unwrap();
// Selects exactly the number of `num_topics` topics one by one.
let got = (0..topics.len())
.map(|_| manager.select().unwrap())
.cloned()
.collect::<Vec<_>>();
assert_eq!(got, topics);
// Selects exactly the number of `num_topics` topics in a batching manner.
let got = manager
.select_batch(topics.len())
.unwrap()
.into_iter()
.map(ToString::to_string)
.collect::<Vec<_>>();
assert_eq!(got, topics);
// Selects more than the number of `num_topics` topics.
let got = manager
.select_batch(2 * topics.len())
.unwrap()
.into_iter()
.map(ToString::to_string)
.collect::<Vec<_>>();
let expected = vec![topics.clone(); 2]
.into_iter()
.flatten()
.collect::<Vec<_>>();
assert_eq!(got, expected);
})
})
.await;
}
}

View File

@@ -16,7 +16,6 @@ use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use rand::Rng;
use serde::{Deserialize, Serialize};
use snafu::ensure;
use crate::error::{EmptyTopicPoolSnafu, Result};
@@ -60,6 +59,14 @@ impl TopicSelector for RoundRobinTopicSelector {
mod tests {
use super::*;
/// Tests that a selector behaves as expected when the given topic pool is empty.
#[test]
fn test_empty_topic_pool() {
let topic_pool = vec![];
let selector = RoundRobinTopicSelector::default();
assert!(selector.select(&topic_pool).is_err());
}
#[test]
fn test_round_robin_topic_selector() {
let topic_pool: Vec<_> = [0, 1, 2].into_iter().map(|v| v.to_string()).collect();

View File

@@ -107,14 +107,16 @@ pub fn allocate_region_wal_options(
mod tests {
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
use crate::wal::kafka::test_util::run_test_with_kafka_wal;
use crate::wal::kafka::topic_selector::RoundRobinTopicSelector;
use crate::wal::kafka::KafkaConfig;
// Tests the wal options allocator could successfully allocate raft-engine wal options.
// Note: tests for allocator with kafka are integration tests.
#[tokio::test]
async fn test_allocator_with_raft_engine() {
let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
let wal_config = WalConfig::RaftEngine;
let mut allocator = WalOptionsAllocator::new(wal_config, kv_backend);
let allocator = WalOptionsAllocator::new(wal_config, kv_backend);
allocator.start().await.unwrap();
let num_regions = 32;
@@ -128,4 +130,49 @@ mod tests {
.collect();
assert_eq!(got, expected);
}
// Tests that the wal options allocator could successfully allocate Kafka wal options.
#[tokio::test]
async fn test_allocator_with_kafka() {
run_test_with_kafka_wal(|broker_endpoints| {
Box::pin(async {
let topics = (0..256)
.map(|i| format!("test_allocator_with_kafka_{}_{}", i, uuid::Uuid::new_v4()))
.collect::<Vec<_>>();
// Creates a topic manager.
let config = KafkaConfig {
replication_factor: broker_endpoints.len() as i16,
broker_endpoints,
..Default::default()
};
let kv_backend = Arc::new(MemoryKvBackend::new()) as KvBackendRef;
let mut topic_manager = KafkaTopicManager::new(config.clone(), kv_backend);
// Replaces the default topic pool with the constructed topics.
topic_manager.topic_pool = topics.clone();
// Replaces the default selector with a round-robin selector without shuffling.
topic_manager.topic_selector = Arc::new(RoundRobinTopicSelector::default());
// Creates an options allocator.
let allocator = WalOptionsAllocator::Kafka(topic_manager);
allocator.start().await.unwrap();
let num_regions = 32;
let regions = (0..num_regions).collect::<Vec<_>>();
let got = allocate_region_wal_options(regions.clone(), &allocator).unwrap();
// Checks that the allocated wal options contain the expected topics.
let expected = (0..num_regions)
.map(|i| {
let options = WalOptions::Kafka(KafkaWalOptions {
topic: topics[i as usize].clone(),
});
(i, serde_json::to_string(&options).unwrap())
})
.collect::<HashMap<_, _>>();
assert_eq!(got, expected);
})
})
.await;
}
}

View File

@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod lock;
mod runner;
mod rwlock;
use std::collections::{HashMap, VecDeque};
use std::sync::atomic::{AtomicBool, Ordering};
@@ -29,11 +29,11 @@ use snafu::{ensure, ResultExt};
use tokio::sync::watch::{self, Receiver, Sender};
use tokio::sync::{Mutex as TokioMutex, Notify};
use self::rwlock::KeyRwLock;
use crate::error::{
DuplicateProcedureSnafu, Error, LoaderConflictSnafu, ManagerNotStartSnafu, Result,
StartRemoveOutdatedMetaTaskSnafu, StopRemoveOutdatedMetaTaskSnafu,
};
use crate::local::lock::LockMap;
use crate::local::runner::Runner;
use crate::procedure::BoxedProcedureLoader;
use crate::store::{ProcedureMessage, ProcedureStore, StateStoreRef};
@@ -57,8 +57,6 @@ const META_TTL: Duration = Duration::from_secs(60 * 10);
pub(crate) struct ProcedureMeta {
/// Id of this procedure.
id: ProcedureId,
/// Notify to wait for a lock.
lock_notify: Notify,
/// Parent procedure id.
parent_id: Option<ProcedureId>,
/// Notify to wait for subprocedures.
@@ -78,7 +76,6 @@ impl ProcedureMeta {
let (state_sender, state_receiver) = watch::channel(ProcedureState::Running);
ProcedureMeta {
id,
lock_notify: Notify::new(),
parent_id,
child_notify: Notify::new(),
lock_key,
@@ -131,7 +128,7 @@ struct LoadedProcedure {
pub(crate) struct ManagerContext {
/// Procedure loaders. The key is the type name of the procedure which the loader returns.
loaders: Mutex<HashMap<String, BoxedProcedureLoader>>,
lock_map: LockMap,
key_lock: KeyRwLock<String>,
procedures: RwLock<HashMap<ProcedureId, ProcedureMetaRef>>,
/// Messages loaded from the procedure store.
messages: Mutex<HashMap<ProcedureId, ProcedureMessage>>,
@@ -152,8 +149,8 @@ impl ManagerContext {
/// Returns a new [ManagerContext].
fn new() -> ManagerContext {
ManagerContext {
key_lock: KeyRwLock::new(),
loaders: Mutex::new(HashMap::new()),
lock_map: LockMap::new(),
procedures: RwLock::new(HashMap::new()),
messages: Mutex::new(HashMap::new()),
finished_procedures: Mutex::new(VecDeque::new()),
@@ -850,7 +847,7 @@ mod tests {
assert!(manager.procedure_watcher(procedure_id).is_none());
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
procedure.lock_key = LockKey::single_exclusive("test.submit");
assert!(manager
.submit(ProcedureWithId {
id: procedure_id,
@@ -918,7 +915,7 @@ mod tests {
}
fn lock_key(&self) -> LockKey {
LockKey::single("test.submit")
LockKey::single_exclusive("test.submit")
}
}
@@ -955,7 +952,7 @@ mod tests {
let manager = LocalManager::new(config, state_store);
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
procedure.lock_key = LockKey::single_exclusive("test.submit");
let procedure_id = ProcedureId::random();
assert_matches!(
manager
@@ -986,7 +983,7 @@ mod tests {
manager.start().await.unwrap();
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
procedure.lock_key = LockKey::single_exclusive("test.submit");
let procedure_id = ProcedureId::random();
assert!(manager
.submit(ProcedureWithId {
@@ -1018,7 +1015,7 @@ mod tests {
manager.manager_ctx.set_running();
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
procedure.lock_key = LockKey::single_exclusive("test.submit");
let procedure_id = ProcedureId::random();
assert!(manager
.submit(ProcedureWithId {
@@ -1041,7 +1038,7 @@ mod tests {
// The remove_outdated_meta method has been stopped, so any procedure meta-data will not be automatically removed.
manager.stop().await.unwrap();
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
procedure.lock_key = LockKey::single_exclusive("test.submit");
let procedure_id = ProcedureId::random();
manager.manager_ctx.set_running();
@@ -1063,7 +1060,7 @@ mod tests {
// After restart
let mut procedure = ProcedureToLoad::new("submit");
procedure.lock_key = LockKey::single("test.submit");
procedure.lock_key = LockKey::single_exclusive("test.submit");
let procedure_id = ProcedureId::random();
assert!(manager
.submit(ProcedureWithId {

View File

@@ -1,214 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::{HashMap, VecDeque};
use std::sync::RwLock;
use crate::local::ProcedureMetaRef;
use crate::ProcedureId;
/// A lock entry.
#[derive(Debug)]
struct Lock {
/// Current lock owner.
owner: ProcedureMetaRef,
/// Waiter procedures.
waiters: VecDeque<ProcedureMetaRef>,
}
impl Lock {
/// Returns a [Lock] with specific `owner` procedure.
fn from_owner(owner: ProcedureMetaRef) -> Lock {
Lock {
owner,
waiters: VecDeque::new(),
}
}
/// Try to pop a waiter from the waiter list, set it as owner
/// and wake up the new owner.
///
/// Returns false if there is no waiter in the waiter list.
fn switch_owner(&mut self) -> bool {
if let Some(waiter) = self.waiters.pop_front() {
// Update owner.
self.owner = waiter.clone();
// We need to use notify_one() since the waiter may have not called `notified()` yet.
waiter.lock_notify.notify_one();
true
} else {
false
}
}
}
/// Manages lock entries for procedures.
pub(crate) struct LockMap {
locks: RwLock<HashMap<String, Lock>>,
}
impl LockMap {
/// Returns a new [LockMap].
pub(crate) fn new() -> LockMap {
LockMap {
locks: RwLock::new(HashMap::new()),
}
}
/// Acquire lock by `key` for procedure with specific `meta`.
///
/// Though `meta` is cloneable, callers must ensure that only one `meta`
/// is acquiring and holding the lock at the same time.
///
/// # Panics
/// Panics if the procedure acquires the lock recursively.
pub(crate) async fn acquire_lock(&self, key: &str, meta: ProcedureMetaRef) {
assert!(!self.hold_lock(key, meta.id));
{
let mut locks = self.locks.write().unwrap();
if let Some(lock) = locks.get_mut(key) {
// Lock already exists, but we don't expect that a procedure acquires
// the same lock again.
assert_ne!(lock.owner.id, meta.id);
// Add this procedure to the waiter list. Here we don't check
// whether the procedure is already in the waiter list as we
// expect that a procedure should not wait for two lock simultaneously.
lock.waiters.push_back(meta.clone());
} else {
let _ = locks.insert(key.to_string(), Lock::from_owner(meta));
return;
}
}
// Wait for notify.
meta.lock_notify.notified().await;
assert!(self.hold_lock(key, meta.id));
}
/// Release lock by `key`.
pub(crate) fn release_lock(&self, key: &str, procedure_id: ProcedureId) {
let mut locks = self.locks.write().unwrap();
if let Some(lock) = locks.get_mut(key) {
if lock.owner.id != procedure_id {
// This is not the lock owner.
return;
}
if !lock.switch_owner() {
// No body waits for this lock, we can remove the lock entry.
let _ = locks.remove(key);
}
}
}
/// Returns true if the procedure with specific `procedure_id` holds the
/// lock of `key`.
fn hold_lock(&self, key: &str, procedure_id: ProcedureId) -> bool {
let locks = self.locks.read().unwrap();
locks
.get(key)
.map(|lock| lock.owner.id == procedure_id)
.unwrap_or(false)
}
/// Returns true if the procedure is waiting for the lock `key`.
#[cfg(test)]
fn waiting_lock(&self, key: &str, procedure_id: ProcedureId) -> bool {
let locks = self.locks.read().unwrap();
locks
.get(key)
.map(|lock| lock.waiters.iter().any(|meta| meta.id == procedure_id))
.unwrap_or(false)
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::*;
use crate::local::test_util;
#[test]
fn test_lock_no_waiter() {
let meta = Arc::new(test_util::procedure_meta_for_test());
let mut lock = Lock::from_owner(meta);
assert!(!lock.switch_owner());
}
#[tokio::test]
async fn test_lock_with_waiter() {
let owner = Arc::new(test_util::procedure_meta_for_test());
let mut lock = Lock::from_owner(owner);
let waiter = Arc::new(test_util::procedure_meta_for_test());
lock.waiters.push_back(waiter.clone());
assert!(lock.switch_owner());
assert!(lock.waiters.is_empty());
waiter.lock_notify.notified().await;
assert_eq!(lock.owner.id, waiter.id);
}
#[tokio::test]
async fn test_lock_map() {
let key = "hello";
let owner = Arc::new(test_util::procedure_meta_for_test());
let lock_map = Arc::new(LockMap::new());
lock_map.acquire_lock(key, owner.clone()).await;
let waiter = Arc::new(test_util::procedure_meta_for_test());
let waiter_id = waiter.id;
// Waiter release the lock, this should not take effect.
lock_map.release_lock(key, waiter_id);
let lock_map2 = lock_map.clone();
let owner_id = owner.id;
let handle = tokio::spawn(async move {
assert!(lock_map2.hold_lock(key, owner_id));
assert!(!lock_map2.hold_lock(key, waiter_id));
// Waiter wait for lock.
lock_map2.acquire_lock(key, waiter.clone()).await;
assert!(lock_map2.hold_lock(key, waiter_id));
});
// Owner still holds the lock.
assert!(lock_map.hold_lock(key, owner_id));
// Wait until the waiter acquired the lock
while !lock_map.waiting_lock(key, waiter_id) {
tokio::time::sleep(std::time::Duration::from_millis(5)).await;
}
// Release lock
lock_map.release_lock(key, owner_id);
assert!(!lock_map.hold_lock(key, owner_id));
// Wait for task.
handle.await.unwrap();
// The waiter should hold the lock now.
assert!(lock_map.hold_lock(key, waiter_id));
lock_map.release_lock(key, waiter_id);
}
}

View File

@@ -19,8 +19,10 @@ use backon::{BackoffBuilder, ExponentialBuilder};
use common_telemetry::logging;
use tokio::time;
use super::rwlock::OwnedKeyRwLockGuard;
use crate::error::{self, ProcedurePanicSnafu, Result};
use crate::local::{ManagerContext, ProcedureMeta, ProcedureMetaRef};
use crate::procedure::StringKey;
use crate::store::ProcedureStore;
use crate::ProcedureState::Retrying;
use crate::{BoxedProcedure, Context, Error, ProcedureId, ProcedureState, ProcedureWithId, Status};
@@ -56,6 +58,7 @@ impl ExecResult {
struct ProcedureGuard {
meta: ProcedureMetaRef,
manager_ctx: Arc<ManagerContext>,
key_guards: Vec<OwnedKeyRwLockGuard>,
finish: bool,
}
@@ -65,6 +68,7 @@ impl ProcedureGuard {
ProcedureGuard {
meta,
manager_ctx,
key_guards: vec![],
finish: false,
}
}
@@ -95,10 +99,15 @@ impl Drop for ProcedureGuard {
self.manager_ctx.notify_by_subprocedure(parent_id);
}
// Release lock in reverse order.
for key in self.meta.lock_key.keys_to_unlock() {
self.manager_ctx.lock_map.release_lock(key, self.meta.id);
// Drops the key guards in the reverse order.
while !self.key_guards.is_empty() {
self.key_guards.pop();
}
// Clean up the stale locks.
self.manager_ctx
.key_lock
.clean_keys(self.meta.lock_key.keys_to_lock().map(|k| k.as_string()));
}
}
@@ -121,7 +130,7 @@ impl Runner {
/// Run the procedure.
pub(crate) async fn run(mut self) {
// Ensure we can update the procedure state.
let guard = ProcedureGuard::new(self.meta.clone(), self.manager_ctx.clone());
let mut guard = ProcedureGuard::new(self.meta.clone(), self.manager_ctx.clone());
logging::info!(
"Runner {}-{} starts",
@@ -133,10 +142,14 @@ impl Runner {
// recursive locking by adding a root procedure id to the meta.
for key in self.meta.lock_key.keys_to_lock() {
// Acquire lock for each key.
self.manager_ctx
.lock_map
.acquire_lock(key, self.meta.clone())
.await;
let key_guard = match key {
StringKey::Share(key) => self.manager_ctx.key_lock.read(key.clone()).await.into(),
StringKey::Exclusive(key) => {
self.manager_ctx.key_lock.write(key.clone()).await.into()
}
};
guard.key_guards.push(key_guard);
}
// Execute the procedure. We need to release the lock whenever the the execution
@@ -604,7 +617,7 @@ mod tests {
};
let normal = ProcedureAdapter {
data: "normal".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -665,7 +678,7 @@ mod tests {
};
let suspend = ProcedureAdapter {
data: "suspend".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -697,7 +710,7 @@ mod tests {
};
let child = ProcedureAdapter {
data: "child".to_string(),
lock_key: LockKey::new(keys.iter().map(|k| k.to_string())),
lock_key: LockKey::new_exclusive(keys.iter().map(|k| k.to_string())),
exec_fn,
};
@@ -765,7 +778,7 @@ mod tests {
};
let parent = ProcedureAdapter {
data: "parent".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -784,6 +797,7 @@ mod tests {
runner.manager_ctx = manager_ctx.clone();
runner.run().await;
assert!(manager_ctx.key_lock.is_empty());
// Check child procedures.
for child_id in children_ids {
@@ -810,7 +824,7 @@ mod tests {
let exec_fn = move |_| async move { Ok(Status::Executing { persist: true }) }.boxed();
let normal = ProcedureAdapter {
data: "normal".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -851,7 +865,7 @@ mod tests {
|_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
let normal = ProcedureAdapter {
data: "fail".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -875,7 +889,7 @@ mod tests {
|_| async { Err(Error::external(MockError::new(StatusCode::Unexpected))) }.boxed();
let fail = ProcedureAdapter {
data: "fail".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -917,7 +931,7 @@ mod tests {
let retry_later = ProcedureAdapter {
data: "retry_later".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -952,7 +966,7 @@ mod tests {
let exceed_max_retry_later = ProcedureAdapter {
data: "exceed_max_retry_later".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -993,7 +1007,7 @@ mod tests {
};
let fail = ProcedureAdapter {
data: "fail".to_string(),
lock_key: LockKey::single("catalog.schema.table.region-0"),
lock_key: LockKey::single_exclusive("catalog.schema.table.region-0"),
exec_fn,
};
@@ -1027,7 +1041,7 @@ mod tests {
};
let parent = ProcedureAdapter {
data: "parent".to_string(),
lock_key: LockKey::single("catalog.schema.table"),
lock_key: LockKey::single_exclusive("catalog.schema.table"),
exec_fn,
};
@@ -1042,10 +1056,11 @@ mod tests {
// Manually add this procedure to the manager ctx.
assert!(manager_ctx.try_insert_procedure(meta.clone()));
// Replace the manager ctx.
runner.manager_ctx = manager_ctx;
runner.manager_ctx = manager_ctx.clone();
// Run the runner and execute the procedure.
runner.run().await;
assert!(manager_ctx.key_lock.is_empty());
let err = meta.state().error().unwrap().output_msg();
assert!(err.contains("subprocedure failed"), "{err}");
}

View File

@@ -0,0 +1,247 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::hash::Hash;
use std::sync::{Arc, Mutex};
use tokio::sync::{OwnedRwLockReadGuard, OwnedRwLockWriteGuard, RwLock};
pub enum OwnedKeyRwLockGuard {
Read(OwnedRwLockReadGuard<()>),
Write(OwnedRwLockWriteGuard<()>),
}
impl From<OwnedRwLockReadGuard<()>> for OwnedKeyRwLockGuard {
fn from(guard: OwnedRwLockReadGuard<()>) -> Self {
OwnedKeyRwLockGuard::Read(guard)
}
}
impl From<OwnedRwLockWriteGuard<()>> for OwnedKeyRwLockGuard {
fn from(guard: OwnedRwLockWriteGuard<()>) -> Self {
OwnedKeyRwLockGuard::Write(guard)
}
}
/// Locks based on a key, allowing other keys to lock independently.
#[derive(Debug)]
pub struct KeyRwLock<K> {
/// The inner map of locks for specific keys.
inner: Mutex<HashMap<K, Arc<RwLock<()>>>>,
}
impl<K> KeyRwLock<K>
where
K: Eq + Hash + Clone,
{
pub fn new() -> Self {
KeyRwLock {
inner: Default::default(),
}
}
/// Locks the key with shared read access, returning a guard.
pub async fn read(&self, key: K) -> OwnedRwLockReadGuard<()> {
let lock = {
let mut locks = self.inner.lock().unwrap();
locks.entry(key).or_default().clone()
};
lock.read_owned().await
}
/// Locks the key with exclusive write access, returning a guard.
pub async fn write(&self, key: K) -> OwnedRwLockWriteGuard<()> {
let lock = {
let mut locks = self.inner.lock().unwrap();
locks.entry(key).or_default().clone()
};
lock.write_owned().await
}
/// Clean up stale locks.
///
/// Note: it only cleans a lock if:
/// - its strong ref count equals one, and
/// - the write lock can be acquired.
pub fn clean_keys<'a>(&'a self, iter: impl IntoIterator<Item = &'a K>) {
let mut locks = self.inner.lock().unwrap();
let mut keys = Vec::new();
for key in iter {
if let Some(lock) = locks.get(key) {
if lock.try_write().is_ok() {
debug_assert_eq!(Arc::weak_count(lock), 0);
// Ensures nobody keeps this ref.
if Arc::strong_count(lock) == 1 {
keys.push(key);
}
}
}
}
for key in keys {
locks.remove(key);
}
}
}
#[cfg(test)]
impl<K> KeyRwLock<K>
where
K: Eq + Hash + Clone,
{
/// Tries to lock the key with shared read access, returning immediately.
pub fn try_read(&self, key: K) -> Result<OwnedRwLockReadGuard<()>, tokio::sync::TryLockError> {
let lock = {
let mut locks = self.inner.lock().unwrap();
locks.entry(key).or_default().clone()
};
lock.try_read_owned()
}
/// Tries to lock this key with exclusive write access, returning immediately.
pub fn try_write(
&self,
key: K,
) -> Result<OwnedRwLockWriteGuard<()>, tokio::sync::TryLockError> {
let lock = {
let mut locks = self.inner.lock().unwrap();
locks.entry(key).or_default().clone()
};
lock.try_write_owned()
}
/// Returns the number of keys.
pub fn len(&self) -> usize {
self.inner.lock().unwrap().len()
}
/// Returns true if the inner map is empty.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_naive() {
let lock_key = KeyRwLock::new();
{
let _guard = lock_key.read("test1").await;
assert_eq!(lock_key.len(), 1);
assert!(lock_key.try_read("test1").is_ok());
assert!(lock_key.try_write("test1").is_err());
}
{
let _guard0 = lock_key.write("test2").await;
let _guard = lock_key.write("test1").await;
assert_eq!(lock_key.len(), 2);
assert!(lock_key.try_read("test1").is_err());
assert!(lock_key.try_write("test1").is_err());
}
assert_eq!(lock_key.len(), 2);
lock_key.clean_keys(&vec!["test1", "test2"]);
assert!(lock_key.is_empty());
let mut guards = Vec::new();
for key in ["test1", "test2"] {
guards.push(lock_key.read(key).await);
}
while !guards.is_empty() {
guards.pop();
}
lock_key.clean_keys(vec![&"test1", &"test2"]);
assert_eq!(lock_key.len(), 0);
}
#[tokio::test]
async fn test_clean_keys() {
let lock_key = KeyRwLock::<&str>::new();
{
let rwlock = {
lock_key
.inner
.lock()
.unwrap()
.entry("test")
.or_default()
.clone()
};
assert_eq!(Arc::strong_count(&rwlock), 2);
let _guard = rwlock.read_owned().await;
{
let inner = lock_key.inner.lock().unwrap();
let rwlock = inner.get("test").unwrap();
assert_eq!(Arc::strong_count(rwlock), 2);
}
}
{
let rwlock = {
lock_key
.inner
.lock()
.unwrap()
.entry("test")
.or_default()
.clone()
};
assert_eq!(Arc::strong_count(&rwlock), 2);
let _guard = rwlock.write_owned().await;
{
let inner = lock_key.inner.lock().unwrap();
let rwlock = inner.get("test").unwrap();
assert_eq!(Arc::strong_count(rwlock), 2);
}
}
{
let inner = lock_key.inner.lock().unwrap();
let rwlock = inner.get("test").unwrap();
assert_eq!(Arc::strong_count(rwlock), 1);
}
// Someone still holds a ref to the rwlock but is waiting to be granted the lock.
let rwlock = {
lock_key
.inner
.lock()
.unwrap()
.entry("test")
.or_default()
.clone()
};
assert_eq!(Arc::strong_count(&rwlock), 2);
// However, a thread trying to remove the "test" key should have no effect.
lock_key.clean_keys(vec![&"test"]);
// The rwlock should still be present.
{
let inner = lock_key.inner.lock().unwrap();
inner.get("test").unwrap();
}
}
}
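Taken together with the runner change further down, the locking model is now: each `StringKey::Share` key is taken as a read lock and each `StringKey::Exclusive` key as a write lock on this `KeyRwLock`, and stale entries are swept by `clean_keys` once the guards drop. A minimal sketch of the per-key behavior, assuming a tokio test context:

#[tokio::test]
async fn keyed_locking_example() {
    let locks = KeyRwLock::new();

    // Two shared (read) guards on the same key coexist.
    let _r1 = locks.read("table_a").await;
    let _r2 = locks.read("table_a").await;
    assert!(locks.try_read("table_a").is_ok());

    // An exclusive (write) guard on a different key is independent of "table_a".
    let _w = locks.write("table_b").await;
    assert!(locks.try_write("table_b").is_err());
}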

View File

@@ -116,22 +116,49 @@ impl<T: Procedure + ?Sized> Procedure for Box<T> {
}
}
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum StringKey {
Share(String),
Exclusive(String),
}
/// Keys to identify required locks.
///
/// [LockKey] always sorts keys lexicographically so that they can be acquired
/// in the same order.
// Most procedures should only acquire 1 ~ 2 locks so we use smallvec to hold keys.
/// Most procedures should only acquire 1 ~ 2 locks so we use smallvec to hold keys.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct LockKey(SmallVec<[String; 2]>);
pub struct LockKey(SmallVec<[StringKey; 2]>);
impl StringKey {
pub fn into_string(self) -> String {
match self {
StringKey::Share(s) => s,
StringKey::Exclusive(s) => s,
}
}
pub fn as_string(&self) -> &String {
match self {
StringKey::Share(s) => s,
StringKey::Exclusive(s) => s,
}
}
}
impl LockKey {
/// Returns a new [LockKey] with only one key.
pub fn single(key: impl Into<String>) -> LockKey {
pub fn single(key: impl Into<StringKey>) -> LockKey {
LockKey(smallvec![key.into()])
}
/// Returns a new [LockKey] with only one key.
pub fn single_exclusive(key: impl Into<String>) -> LockKey {
LockKey(smallvec![StringKey::Exclusive(key.into())])
}
/// Returns a new [LockKey] with keys from specific `iter`.
pub fn new(iter: impl IntoIterator<Item = String>) -> LockKey {
pub fn new(iter: impl IntoIterator<Item = StringKey>) -> LockKey {
let mut vec: SmallVec<_> = iter.into_iter().collect();
vec.sort();
// Dedup keys to avoid acquiring the same key multiple times.
@@ -139,14 +166,14 @@ impl LockKey {
LockKey(vec)
}
/// Returns the keys to lock.
pub fn keys_to_lock(&self) -> impl Iterator<Item = &String> {
self.0.iter()
/// Returns a new [LockKey] with keys from specific `iter`.
pub fn new_exclusive(iter: impl IntoIterator<Item = String>) -> LockKey {
Self::new(iter.into_iter().map(StringKey::Exclusive))
}
/// Returns the keys to unlock.
pub fn keys_to_unlock(&self) -> impl Iterator<Item = &String> {
self.0.iter().rev()
/// Returns the keys to lock.
pub fn keys_to_lock(&self) -> impl Iterator<Item = &StringKey> {
self.0.iter()
}
}
@@ -340,20 +367,25 @@ mod tests {
#[test]
fn test_lock_key() {
let entity = "catalog.schema.my_table";
let key = LockKey::single(entity);
assert_eq!(vec![entity], key.keys_to_lock().collect::<Vec<_>>());
assert_eq!(vec![entity], key.keys_to_unlock().collect::<Vec<_>>());
let key = LockKey::single_exclusive(entity);
assert_eq!(
vec![&StringKey::Exclusive(entity.to_string())],
key.keys_to_lock().collect::<Vec<_>>()
);
let key = LockKey::new([
let key = LockKey::new_exclusive([
"b".to_string(),
"c".to_string(),
"a".to_string(),
"c".to_string(),
]);
assert_eq!(vec!["a", "b", "c"], key.keys_to_lock().collect::<Vec<_>>());
assert_eq!(
vec!["c", "b", "a"],
key.keys_to_unlock().collect::<Vec<_>>()
vec![
&StringKey::Exclusive("a".to_string()),
&StringKey::Exclusive("b".to_string()),
&StringKey::Exclusive("c".to_string())
],
key.keys_to_lock().collect::<Vec<_>>()
);
}
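Since `LockKey::new` now takes `StringKey` items, a procedure can mix shared and exclusive keys in one lock set; the runner acquires them in sorted order, reading for `Share` and writing for `Exclusive`. A hedged sketch with illustrative key names:

// Hypothetical lock set: shared locks on catalog and schema, exclusive lock on the table.
fn describe_locks() {
    let lock_key = LockKey::new([
        StringKey::Share("my_catalog".to_string()),
        StringKey::Share("my_catalog.my_schema".to_string()),
        StringKey::Exclusive("my_catalog.my_schema.my_table".to_string()),
    ]);
    for key in lock_key.keys_to_lock() {
        match key {
            StringKey::Share(k) => println!("read lock:  {k}"),
            StringKey::Exclusive(k) => println!("write lock: {k}"),
        }
    }
}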

View File

@@ -87,7 +87,7 @@ impl StateStore for ObjectStateStore {
let mut lister = self
.store
.lister_with(path)
.delimiter("")
.recursive(true)
.await
.map_err(|e| {
BoxedError::new(PlainError::new(

View File

@@ -98,7 +98,7 @@ mod tests {
}
fn lock_key(&self) -> LockKey {
LockKey::single("test.submit")
LockKey::single_exclusive("test.submit")
}
}

View File

@@ -20,13 +20,13 @@ pub const THREAD_NAME_LABEL: &str = "thread_name";
lazy_static! {
pub static ref METRIC_RUNTIME_THREADS_ALIVE: IntGaugeVec = register_int_gauge_vec!(
"runtime_threads_alive",
"greptime_runtime_threads_alive",
"runtime threads alive",
&[THREAD_NAME_LABEL]
)
.unwrap();
pub static ref METRIC_RUNTIME_THREADS_IDLE: IntGaugeVec = register_int_gauge_vec!(
"runtime_threads_idle",
"greptime_runtime_threads_idle",
"runtime threads idle",
&[THREAD_NAME_LABEL]
)

View File

@@ -22,7 +22,7 @@ use prometheus::*;
lazy_static! {
pub static ref PANIC_COUNTER: IntCounter =
register_int_counter!("panic_counter", "panic_counter").unwrap();
register_int_counter!("greptime_panic_counter", "panic_counter").unwrap();
}
pub fn set_panic_hook() {

View File

@@ -4,6 +4,9 @@ version.workspace = true
edition.workspace = true
license.workspace = true
[features]
testing = []
[dependencies]
api.workspace = true
arrow-flight.workspace = true

View File

@@ -22,11 +22,12 @@ use std::sync::Arc;
use catalog::memory::MemoryCatalogManager;
use common_base::Plugins;
use common_config::wal::{KafkaConfig, RaftEngineConfig};
use common_config::{WalConfig, WAL_OPTIONS_KEY};
use common_config::WalConfig;
use common_error::ext::BoxedError;
use common_greptimedb_telemetry::GreptimeDBTelemetryTask;
use common_meta::key::datanode_table::{DatanodeTableManager, DatanodeTableValue};
use common_meta::kv_backend::KvBackendRef;
use common_meta::wal::prepare_wal_option;
pub use common_procedure::options::ProcedureConfig;
use common_runtime::Runtime;
use common_telemetry::{error, info, warn};
@@ -98,7 +99,7 @@ impl Datanode {
self.start_telemetry();
if let Some(t) = self.export_metrics_task.as_ref() {
t.start()
t.start(None).context(StartServerSnafu)?
}
self.start_services().await
@@ -538,13 +539,11 @@ async fn open_all_regions(
for region_number in table_value.regions {
// Augments region options with wal options if any are provided.
let mut region_options = table_value.region_info.region_options.clone();
table_value
.region_info
.region_wal_options
.get(&region_number.to_string())
.and_then(|wal_options| {
region_options.insert(WAL_OPTIONS_KEY.to_string(), wal_options.clone())
});
prepare_wal_option(
&mut region_options,
RegionId::new(table_value.table_id, region_number),
&table_value.region_info.region_wal_options,
);
regions.push((
RegionId::new(table_value.table_id, region_number),

View File

@@ -272,6 +272,16 @@ pub enum Error {
location: Location,
source: BoxedError,
},
#[snafu(display(
"Failed to find logical regions in physical region {}",
physical_region_id
))]
FindLogicalRegions {
physical_region_id: RegionId,
source: metric_engine::error::Error,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -340,6 +350,8 @@ impl ErrorExt for Error {
}
HandleRegionRequest { source, .. } => source.status_code(),
StopRegionEngine { source, .. } => source.status_code(),
FindLogicalRegions { source, .. } => source.status_code(),
}
}

View File

@@ -305,7 +305,7 @@ impl HeartbeatTask {
}
async fn load_region_stats(region_server: &RegionServer) -> Vec<RegionStat> {
let regions = region_server.opened_regions();
let regions = region_server.reportable_regions();
let mut region_stats = Vec::new();
for stat in regions {

View File

@@ -96,6 +96,7 @@ impl HeartbeatResponseHandler for RegionHeartbeatResponseHandler {
Some((_, Instruction::OpenRegion { .. }))
| Some((_, Instruction::CloseRegion { .. }))
| Some((_, Instruction::DowngradeRegion { .. }))
| Some((_, Instruction::UpgradeRegion { .. }))
)
}
@@ -134,7 +135,7 @@ mod tests {
use common_meta::heartbeat::mailbox::{
HeartbeatMailbox, IncomingMessage, MailboxRef, MessageMeta,
};
use common_meta::instruction::{DowngradeRegion, OpenRegion};
use common_meta::instruction::{DowngradeRegion, OpenRegion, UpgradeRegion};
use mito2::config::MitoConfig;
use mito2::engine::MITO_ENGINE_NAME;
use mito2::test_util::{CreateRequestBuilder, TestEnv};
@@ -175,6 +176,44 @@ mod tests {
}
}
#[test]
fn test_is_acceptable() {
common_telemetry::init_default_ut_logging();
let region_server = mock_region_server();
let heartbeat_handler = RegionHeartbeatResponseHandler::new(region_server.clone());
let heartbeat_env = HeartbeatResponseTestEnv::new();
let meta = MessageMeta::new_test(1, "test", "dn-1", "me-0");
// Open region
let region_id = RegionId::new(1024, 1);
let storage_path = "test";
let instruction = open_region_instruction(region_id, storage_path);
assert!(heartbeat_handler
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction))));
// Close region
let instruction = close_region_instruction(region_id);
assert!(heartbeat_handler
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction))));
// Downgrade region
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
region_id: RegionId::new(2048, 1),
});
assert!(heartbeat_handler
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction))));
// Upgrade region
let instruction = Instruction::UpgradeRegion(UpgradeRegion {
region_id,
last_entry_id: None,
wait_for_replay_timeout: None,
});
assert!(
heartbeat_handler.is_acceptable(&heartbeat_env.create_handler_ctx((meta, instruction)))
);
}
fn close_region_instruction(region_id: RegionId) -> Instruction {
Instruction::CloseRegion(RegionIdent {
table_id: region_id.table_id(),

View File

@@ -14,6 +14,7 @@
use common_error::ext::ErrorExt;
use common_meta::instruction::{InstructionReply, OpenRegion, SimpleReply};
use common_meta::wal::prepare_wal_option;
use futures_util::future::BoxFuture;
use store_api::path_utils::region_dir;
use store_api::region_request::{RegionOpenRequest, RegionRequest};
@@ -26,15 +27,14 @@ impl HandlerContext {
OpenRegion {
region_ident,
region_storage_path,
region_options,
mut region_options,
region_wal_options,
skip_wal_replay,
}: OpenRegion,
) -> BoxFuture<'static, InstructionReply> {
Box::pin(async move {
let region_id = Self::region_ident_to_region_id(&region_ident);
// TODO(niebayes): extends region options with region_wal_options.
let _ = region_wal_options;
prepare_wal_option(&mut region_options, region_id, &region_wal_options);
let request = RegionRequest::Open(RegionOpenRequest {
engine: region_ident.engine,
region_dir: region_dir(&region_storage_path, region_id),
@@ -42,10 +42,8 @@ impl HandlerContext {
skip_wal_replay,
});
let result = self.region_server.handle_request(region_id, request).await;
let success = result.is_ok();
let error = result.as_ref().map_err(|e| e.output_msg()).err();
InstructionReply::OpenRegion(SimpleReply {
result: success,
error,

View File

@@ -14,7 +14,7 @@
use common_error::ext::ErrorExt;
use common_meta::instruction::{InstructionReply, UpgradeRegion, UpgradeRegionReply};
use common_telemetry::warn;
use common_telemetry::{info, warn};
use futures_util::future::BoxFuture;
use store_api::region_request::{RegionCatchupRequest, RegionRequest};
@@ -56,6 +56,7 @@ impl HandlerContext {
.try_register(
region_id,
Box::pin(async move {
info!("Executing region: {region_id} catchup to: last entry id {last_entry_id:?}");
region_server_moved
.handle_request(
region_id,

View File

@@ -24,5 +24,5 @@ pub mod heartbeat;
pub mod metrics;
pub mod region_server;
mod store;
#[cfg(test)]
mod tests;
#[cfg(any(test, feature = "testing"))]
pub mod tests;

View File

@@ -24,26 +24,26 @@ pub const REGION_ID: &str = "region_id";
lazy_static! {
/// The elapsed time of handling a request in the region_server.
pub static ref HANDLE_REGION_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!(
"datanode_handle_region_request_elapsed",
"greptime_datanode_handle_region_request_elapsed",
"datanode handle region request elapsed",
&[REGION_REQUEST_TYPE]
)
.unwrap();
/// The elapsed time since the last received heartbeat.
pub static ref LAST_RECEIVED_HEARTBEAT_ELAPSED: IntGauge = register_int_gauge!(
"last_received_heartbeat_lease_elapsed",
"greptime_last_received_heartbeat_lease_elapsed",
"last received heartbeat lease elapsed",
)
.unwrap();
pub static ref LEASE_EXPIRED_REGION: IntGaugeVec = register_int_gauge_vec!(
"lease_expired_region",
"greptime_lease_expired_region",
"lease expired region",
&[REGION_ID]
)
.unwrap();
/// The received region leases via heartbeat.
pub static ref HEARTBEAT_REGION_LEASES: IntGaugeVec = register_int_gauge_vec!(
"heartbeat_region_leases",
"greptime_heartbeat_region_leases",
"received region leases via heartbeat",
&[REGION_ROLE]
)

View File

@@ -43,6 +43,7 @@ use datafusion_common::DataFusionError;
use datafusion_expr::{Expr as DfExpr, TableProviderFilterPushDown, TableType};
use datatypes::arrow::datatypes::SchemaRef;
use futures_util::future::try_join_all;
use metric_engine::engine::MetricEngine;
use prost::Message;
use query::QueryEngineRef;
use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult};
@@ -51,6 +52,7 @@ use servers::grpc::region_server::RegionServerHandler;
use session::context::{QueryContextBuilder, QueryContextRef};
use snafu::{OptionExt, ResultExt};
use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY};
use store_api::region_engine::{RegionEngineRef, RegionRole, SetReadonlyResponse};
use store_api::region_request::{AffectedRows, RegionCloseRequest, RegionRequest};
use store_api::storage::{RegionId, ScanRequest};
@@ -60,8 +62,9 @@ use tonic::{Request, Response, Result as TonicResult};
use crate::error::{
self, BuildRegionRequestsSnafu, DecodeLogicalPlanSnafu, ExecuteLogicalPlanSnafu,
GetRegionMetadataSnafu, HandleRegionRequestSnafu, RegionEngineNotFoundSnafu,
RegionNotFoundSnafu, Result, StopRegionEngineSnafu, UnsupportedOutputSnafu,
FindLogicalRegionsSnafu, GetRegionMetadataSnafu, HandleRegionRequestSnafu,
RegionEngineNotFoundSnafu, RegionNotFoundSnafu, Result, StopRegionEngineSnafu, UnexpectedSnafu,
UnsupportedOutputSnafu,
};
use crate::event_listener::RegionServerEventListenerRef;
@@ -123,7 +126,10 @@ impl RegionServer {
self.inner.handle_read(request).await
}
pub fn opened_regions(&self) -> Vec<RegionStat> {
/// Returns all opened and reportable regions.
///
/// Note: this excludes all metric regions.
pub fn reportable_regions(&self) -> Vec<RegionStat> {
self.inner
.region_map
.iter()
@@ -369,7 +375,7 @@ impl RegionServerInner {
let current_region_status = self.region_map.get(&region_id);
let engine = match region_change {
RegionChange::Register(ref engine_type) => match current_region_status {
RegionChange::Register(ref engine_type, _) => match current_region_status {
Some(status) => match status.clone() {
RegionEngineWithStatus::Registering(_) => {
return Ok(CurrentEngine::EarlyReturn(0))
@@ -427,8 +433,12 @@ impl RegionServerInner {
.start_timer();
let region_change = match &request {
RegionRequest::Create(create) => RegionChange::Register(create.engine.clone()),
RegionRequest::Open(open) => RegionChange::Register(open.engine.clone()),
RegionRequest::Create(create) => RegionChange::Register(create.engine.clone(), false),
RegionRequest::Open(open) => {
let is_opening_physical_region =
open.options.contains_key(PHYSICAL_TABLE_METADATA_KEY);
RegionChange::Register(open.engine.clone(), is_opening_physical_region)
}
RegionRequest::Close(_) | RegionRequest::Drop(_) => RegionChange::Deregisters,
RegionRequest::Put(_)
| RegionRequest::Delete(_)
@@ -460,7 +470,8 @@ impl RegionServerInner {
{
Ok(result) => {
// Sets corresponding region status to ready.
self.set_region_status_ready(region_id, engine, region_change);
self.set_region_status_ready(region_id, engine, region_change)
.await?;
Ok(result)
}
Err(err) => {
@@ -478,7 +489,7 @@ impl RegionServerInner {
region_change: &RegionChange,
) {
match region_change {
RegionChange::Register(_) => {
RegionChange::Register(_, _) => {
self.region_map.insert(
region_id,
RegionEngineWithStatus::Registering(engine.clone()),
@@ -497,7 +508,7 @@ impl RegionServerInner {
fn unset_region_status(&self, region_id: RegionId, region_change: RegionChange) {
match region_change {
RegionChange::None => {}
RegionChange::Register(_) | RegionChange::Deregisters => {
RegionChange::Register(_, _) | RegionChange::Deregisters => {
self.region_map
.remove(&region_id)
.map(|(id, engine)| engine.set_writable(id, false));
@@ -505,16 +516,20 @@ impl RegionServerInner {
}
}
fn set_region_status_ready(
async fn set_region_status_ready(
&self,
region_id: RegionId,
engine: RegionEngineRef,
region_change: RegionChange,
) {
) -> Result<()> {
let engine_type = engine.name();
match region_change {
RegionChange::None => {}
RegionChange::Register(_) => {
RegionChange::Register(_, is_opening_physical_region) => {
if is_opening_physical_region {
self.register_logical_regions(&engine, region_id).await?;
}
info!("Region {region_id} is registered to engine {engine_type}");
self.region_map
.insert(region_id, RegionEngineWithStatus::Ready(engine));
@@ -528,6 +543,37 @@ impl RegionServerInner {
self.event_listener.on_region_deregistered(region_id);
}
}
Ok(())
}
async fn register_logical_regions(
&self,
engine: &RegionEngineRef,
physical_region_id: RegionId,
) -> Result<()> {
let metric_engine =
engine
.as_any()
.downcast_ref::<MetricEngine>()
.context(UnexpectedSnafu {
violated: format!(
"expecting engine type '{}', actual '{}'",
METRIC_ENGINE_NAME,
engine.name(),
),
})?;
let logical_regions = metric_engine
.logical_regions(physical_region_id)
.await
.context(FindLogicalRegionsSnafu { physical_region_id })?;
for region in logical_regions {
self.region_map
.insert(region, RegionEngineWithStatus::Ready(engine.clone()));
info!("Logical region {} is registered!", region);
}
Ok(())
}
pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
@@ -622,7 +668,7 @@ impl RegionServerInner {
enum RegionChange {
None,
Register(String),
Register(String, bool),
Deregisters,
}
@@ -1051,7 +1097,7 @@ mod tests {
CurrentEngineTest {
region_id,
current_region_status: None,
region_change: RegionChange::Register(engine.name().to_string()),
region_change: RegionChange::Register(engine.name().to_string(), false),
assert: Box::new(|result| {
let current_engine = result.unwrap();
assert_matches!(current_engine, CurrentEngine::Engine(_));
@@ -1060,7 +1106,7 @@ mod tests {
CurrentEngineTest {
region_id,
current_region_status: Some(RegionEngineWithStatus::Registering(engine.clone())),
region_change: RegionChange::Register(engine.name().to_string()),
region_change: RegionChange::Register(engine.name().to_string(), false),
assert: Box::new(|result| {
let current_engine = result.unwrap();
assert_matches!(current_engine, CurrentEngine::EarlyReturn(_));
@@ -1069,7 +1115,7 @@ mod tests {
CurrentEngineTest {
region_id,
current_region_status: Some(RegionEngineWithStatus::Deregistering(engine.clone())),
region_change: RegionChange::Register(engine.name().to_string()),
region_change: RegionChange::Register(engine.name().to_string(), false),
assert: Box::new(|result| {
let err = result.unwrap_err();
assert_eq!(err.status_code(), StatusCode::RegionBusy);
@@ -1078,7 +1124,7 @@ mod tests {
CurrentEngineTest {
region_id,
current_region_status: Some(RegionEngineWithStatus::Ready(engine.clone())),
region_change: RegionChange::Register(engine.name().to_string()),
region_change: RegionChange::Register(engine.name().to_string(), false),
assert: Box::new(|result| {
let current_engine = result.unwrap();
assert_matches!(current_engine, CurrentEngine::Engine(_));

View File

@@ -207,4 +207,8 @@ impl RegionEngine for MockRegionEngine {
}
Some(RegionRole::Leader)
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
@@ -119,6 +120,10 @@ impl RegionEngine for FileRegionEngine {
fn role(&self, region_id: RegionId) -> Option<RegionRole> {
self.inner.state(region_id)
}
fn as_any(&self) -> &dyn Any {
self
}
}
struct EngineInner {

View File

@@ -55,7 +55,7 @@ use query::QueryEngineRef;
use raft_engine::{Config, ReadableSize, RecoveryMode};
use servers::error as server_error;
use servers::error::{AuthSnafu, ExecuteQuerySnafu, ParsePromQLSnafu};
use servers::export_metrics::{ExportMetricsOption, ExportMetricsTask};
use servers::export_metrics::ExportMetricsTask;
use servers::interceptor::{
PromQueryInterceptor, PromQueryInterceptorRef, SqlQueryInterceptor, SqlQueryInterceptorRef,
};
@@ -76,6 +76,7 @@ use sql::statements::statement::Statement;
use sqlparser::ast::ObjectName;
pub use standalone::StandaloneDatanodeManager;
use self::prom_store::ExportMetricHandler;
use crate::error::{
self, Error, ExecLogicalPlanSnafu, ExecutePromqlSnafu, ExternalSnafu, ParseSqlSnafu,
PermissionSnafu, PlanStatementSnafu, Result, SqlExecInterceptedSnafu, StartServerSnafu,
@@ -190,18 +191,16 @@ impl Instance {
&mut self,
opts: impl Into<FrontendOptions> + TomlSerializable,
) -> Result<()> {
let opts: FrontendOptions = opts.into();
self.export_metrics_task =
ExportMetricsTask::try_new(&opts.export_metrics, Some(&self.plugins))
.context(StartServerSnafu)?;
let servers = Services::build(opts, Arc::new(self.clone()), self.plugins.clone()).await?;
self.servers = Arc::new(servers);
Ok(())
}
pub fn build_export_metrics_task(&mut self, opts: &ExportMetricsOption) -> Result<()> {
self.export_metrics_task =
ExportMetricsTask::try_new(opts, Some(&self.plugins)).context(StartServerSnafu)?;
Ok(())
}
pub fn catalog_manager(&self) -> &CatalogManagerRef {
&self.catalog_manager
}
@@ -232,7 +231,15 @@ impl FrontendInstance for Instance {
self.script_executor.start(self)?;
if let Some(t) = self.export_metrics_task.as_ref() {
t.start()
if t.send_by_handler {
let handler = ExportMetricHandler::new_handler(
self.inserter.clone(),
self.statement_executor.clone(),
);
t.start(Some(handler)).context(StartServerSnafu)?
} else {
t.start(None).context(StartServerSnafu)?;
}
}
futures::future::try_join_all(self.servers.iter().map(|(name, handler)| async move {

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use api::prom_store::remote::read_request::ResponseType;
use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, WriteRequest};
use async_trait::async_trait;
@@ -21,10 +23,14 @@ use common_error::ext::BoxedError;
use common_query::Output;
use common_recordbatch::RecordBatches;
use common_telemetry::logging;
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prost::Message;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::prom_store::{self, Metrics};
use servers::query_handler::{PromStoreProtocolHandler, PromStoreResponse};
use servers::query_handler::{
PromStoreProtocolHandler, PromStoreProtocolHandlerRef, PromStoreResponse,
};
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
@@ -209,3 +215,49 @@ impl PromStoreProtocolHandler for Instance {
todo!();
}
}
/// This handler is mainly used by `frontend` or `standalone` to directly import
/// the metrics they collect themselves, which avoids sending metrics over the
/// network and thus saves compression and transmission overhead. Only the
/// `PromStoreProtocolHandler::write` method is therefore implemented.
pub struct ExportMetricHandler {
inserter: InserterRef,
statement_executor: Arc<StatementExecutor>,
}
impl ExportMetricHandler {
pub fn new_handler(
inserter: InserterRef,
statement_executor: Arc<StatementExecutor>,
) -> PromStoreProtocolHandlerRef {
Arc::new(Self {
inserter,
statement_executor,
})
}
}
#[async_trait]
impl PromStoreProtocolHandler for ExportMetricHandler {
async fn write(&self, request: WriteRequest, ctx: QueryContextRef) -> ServerResult<()> {
let (requests, _) = prom_store::to_grpc_row_insert_requests(request)?;
self.inserter
.handle_row_inserts(requests, ctx, self.statement_executor.as_ref())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
Ok(())
}
async fn read(
&self,
_request: ReadRequest,
_ctx: QueryContextRef,
) -> ServerResult<PromStoreResponse> {
unreachable!();
}
async fn ingest_metrics(&self, _metrics: Metrics) -> ServerResult<()> {
unreachable!();
}
}

View File

@@ -22,10 +22,10 @@ use common_recordbatch::SendableRecordBatchStream;
use partition::manager::PartitionRuleManagerRef;
use query::error::{RegionQuerySnafu, Result as QueryResult};
use query::region_query::RegionQueryHandler;
use snafu::{OptionExt, ResultExt};
use snafu::ResultExt;
use store_api::storage::RegionId;
use crate::error::{FindDatanodeSnafu, FindTableRouteSnafu, RequestQuerySnafu, Result};
use crate::error::{FindTableRouteSnafu, RequestQuerySnafu, Result};
pub(crate) struct FrontendRegionQueryHandler {
partition_manager: PartitionRuleManagerRef,
@@ -58,18 +58,13 @@ impl FrontendRegionQueryHandler {
async fn do_get_inner(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
let region_id = RegionId::from_u64(request.region_id);
let table_route = self
let peer = &self
.partition_manager
.find_table_route(region_id.table_id())
.find_region_leader(region_id)
.await
.context(FindTableRouteSnafu {
table_id: region_id.table_id(),
})?;
let peer = table_route
.find_region_leader(region_id.region_number())
.context(FindDatanodeSnafu {
region: region_id.region_number(),
})?;
let client = self.datanode_manager.datanode(peer).await;

View File

@@ -40,7 +40,7 @@ impl DatanodeManager for StandaloneDatanodeManager {
}
/// Relative to [client::region::RegionRequester]
struct RegionInvoker {
pub struct RegionInvoker {
region_server: RegionServer,
}

View File

@@ -17,34 +17,34 @@ use prometheus::*;
lazy_static! {
pub static ref METRIC_HANDLE_SQL_ELAPSED: Histogram =
register_histogram!("frontend_handle_sql_elapsed", "frontend handle sql elapsed").unwrap();
register_histogram!("greptime_frontend_handle_sql_elapsed", "frontend handle sql elapsed").unwrap();
pub static ref METRIC_HANDLE_PROMQL_ELAPSED: Histogram = register_histogram!(
"frontend_handle_promql_elapsed",
"greptime_frontend_handle_promql_elapsed",
"frontend handle promql elapsed"
)
.unwrap();
pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram =
register_histogram!("frontend_exec_plan_elapsed", "frontend exec plan elapsed").unwrap();
register_histogram!("greptime_frontend_exec_plan_elapsed", "frontend exec plan elapsed").unwrap();
pub static ref METRIC_HANDLE_SCRIPTS_ELAPSED: Histogram = register_histogram!(
"frontend_handle_scripts_elapsed",
"greptime_frontend_handle_scripts_elapsed",
"frontend handle scripts elapsed"
)
.unwrap();
pub static ref METRIC_RUN_SCRIPT_ELAPSED: Histogram =
register_histogram!("frontend_run_script_elapsed", "frontend run script elapsed").unwrap();
register_histogram!("greptime_frontend_run_script_elapsed", "frontend run script elapsed").unwrap();
/// The samples count of Prometheus remote write.
pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounter = register_int_counter!(
"frontend_prometheus_remote_write_samples",
"greptime_frontend_prometheus_remote_write_samples",
"frontend prometheus remote write samples"
)
.unwrap();
pub static ref OTLP_METRICS_ROWS: IntCounter = register_int_counter!(
"frontend_otlp_metrics_rows",
"greptime_frontend_otlp_metrics_rows",
"frontend otlp metrics rows"
)
.unwrap();
pub static ref OTLP_TRACES_ROWS: IntCounter = register_int_counter!(
"frontend_otlp_traces_rows",
"greptime_frontend_otlp_traces_rows",
"frontend otlp traces rows"
)
.unwrap();

View File

@@ -30,4 +30,7 @@ pub trait FstApplier: Send + Sync {
///
/// Returns a `Vec<u64>`, with each u64 being a value from the FstMap.
fn apply(&self, fst: &FstMap) -> Vec<u64>;
/// Returns the memory usage of the applier.
fn memory_usage(&self) -> usize;
}
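The applier implementations in the files below follow a common accounting convention: the container's own slot footprint plus each element's heap capacity. A minimal standalone sketch of that convention, assuming plain `Vec<u8>` keys rather than the crate's `Bytes` alias:
use std::mem::size_of;
// Hedged sketch of the accounting convention, not the actual applier code:
// count the per-element slots of the container plus every element's heap capacity.
fn keys_memory_usage(keys: &[Vec<u8>]) -> usize {
    keys.len() * size_of::<Vec<u8>>() + keys.iter().map(|k| k.capacity()).sum::<usize>()
}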

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::mem::size_of;
use fst::map::OpBuilder;
use fst::{IntoStreamer, Streamer};
use regex_automata::dfa::dense::DFA;
@@ -68,6 +70,26 @@ impl FstApplier for IntersectionFstApplier {
}
values
}
fn memory_usage(&self) -> usize {
let mut size = self.ranges.capacity() * size_of::<Range>();
for range in &self.ranges {
size += range
.lower
.as_ref()
.map_or(0, |bound| bound.value.capacity());
size += range
.upper
.as_ref()
.map_or(0, |bound| bound.value.capacity());
}
size += self.dfas.capacity() * size_of::<DFA<Vec<u32>>>();
for dfa in &self.dfas {
size += dfa.memory_usage();
}
size
}
}
impl IntersectionFstApplier {
@@ -340,4 +362,36 @@ mod tests {
Err(Error::IntersectionApplierWithInList { .. })
));
}
#[test]
fn test_intersection_fst_applier_memory_usage() {
let applier = IntersectionFstApplier {
ranges: vec![],
dfas: vec![],
};
assert_eq!(applier.memory_usage(), 0);
let dfa = DFA::new("^abc$").unwrap();
assert_eq!(dfa.memory_usage(), 320);
let applier = IntersectionFstApplier {
ranges: vec![Range {
lower: Some(Bound {
value: b"aa".to_vec(),
inclusive: true,
}),
upper: Some(Bound {
value: b"cc".to_vec(),
inclusive: true,
}),
}],
dfas: vec![dfa],
};
assert_eq!(
applier.memory_usage(),
size_of::<Range>() + 4 + size_of::<DFA<Vec<u32>>>() + 320
);
}
}

View File

@@ -13,6 +13,7 @@
// limitations under the License.
use std::collections::HashSet;
use std::mem::size_of;
use snafu::{ensure, ResultExt};
@@ -35,6 +36,11 @@ impl FstApplier for KeysFstApplier {
fn apply(&self, fst: &FstMap) -> Vec<u64> {
self.keys.iter().filter_map(|k| fst.get(k)).collect()
}
fn memory_usage(&self) -> usize {
self.keys.capacity() * size_of::<Bytes>()
+ self.keys.iter().map(|k| k.capacity()).sum::<usize>()
}
}
impl KeysFstApplier {
@@ -302,4 +308,15 @@ mod tests {
let result = KeysFstApplier::try_from(predicates);
assert!(matches!(result, Err(Error::ParseRegex { .. })));
}
#[test]
fn test_keys_fst_applier_memory_usage() {
let applier = KeysFstApplier { keys: vec![] };
assert_eq!(applier.memory_usage(), 0);
let applier = KeysFstApplier {
keys: vec![b("foo"), b("bar")],
};
assert_eq!(applier.memory_usage(), 2 * size_of::<Bytes>() + 6);
}
}

View File

@@ -14,6 +14,8 @@
mod predicates_apply;
use std::collections::BTreeSet;
use async_trait::async_trait;
pub use predicates_apply::PredicatesIndexApplier;
@@ -24,15 +26,19 @@ use crate::inverted_index::format::reader::InvertedIndexReader;
///
/// Applier instances are reusable and work with various `InvertedIndexReader` instances,
/// avoiding repeated compilation of fixed predicates such as regex patterns.
#[mockall::automock]
#[async_trait]
pub trait IndexApplier {
/// Applies the predefined predicates to the data read by the given index reader, returning
/// a list of relevant indices (e.g., post IDs, group IDs, row IDs).
async fn apply(
async fn apply<'a>(
&self,
context: SearchContext,
reader: &mut dyn InvertedIndexReader,
) -> Result<Vec<usize>>;
reader: &mut (dyn InvertedIndexReader + 'a),
) -> Result<BTreeSet<usize>>;
/// Returns the memory usage of the applier.
fn memory_usage(&self) -> usize;
}
/// A context for searching the inverted index.

View File

@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeSet;
use std::mem::size_of;
use async_trait::async_trait;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
@@ -41,11 +44,11 @@ pub struct PredicatesIndexApplier {
impl IndexApplier for PredicatesIndexApplier {
/// Applies all `FstApplier`s to the data in the inverted index reader, intersecting the individual
/// bitmaps obtained for each index to result in a final set of indices.
async fn apply(
async fn apply<'a>(
&self,
context: SearchContext,
reader: &mut dyn InvertedIndexReader,
) -> Result<Vec<usize>> {
reader: &mut (dyn InvertedIndexReader + 'a),
) -> Result<BTreeSet<usize>> {
let metadata = reader.metadata().await?;
let mut bitmap = Self::bitmap_full_range(&metadata);
@@ -58,7 +61,7 @@ impl IndexApplier for PredicatesIndexApplier {
let Some(meta) = metadata.metas.get(name) else {
match context.index_not_found_strategy {
IndexNotFoundStrategy::ReturnEmpty => {
return Ok(vec![]);
return Ok(BTreeSet::default());
}
IndexNotFoundStrategy::Ignore => {
continue;
@@ -80,6 +83,16 @@ impl IndexApplier for PredicatesIndexApplier {
Ok(bitmap.iter_ones().collect())
}
/// Returns the memory usage of the applier.
fn memory_usage(&self) -> usize {
let mut size = self.fst_appliers.capacity() * size_of::<(IndexName, Box<dyn FstApplier>)>();
for (name, fst_applier) in &self.fst_appliers {
size += name.capacity();
size += fst_applier.memory_usage();
}
size
}
}
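As a rough illustration of the intersect-then-collect step described above, here is a standalone sketch that uses plain `Vec<bool>` bitmaps in place of the real `BitVec`:
use std::collections::BTreeSet;
// Hedged sketch: keep only positions set in every per-predicate bitmap and
// collect them into an ordered set, mirroring the new BTreeSet<usize> result.
fn intersect_and_collect(bitmaps: &[Vec<bool>], len: usize) -> BTreeSet<usize> {
    (0..len)
        .filter(|&i| bitmaps.iter().all(|b| b.get(i).copied().unwrap_or(false)))
        .collect()
}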
impl PredicatesIndexApplier {
@@ -197,7 +210,7 @@ mod tests {
.apply(SearchContext::default(), &mut mock_reader)
.await
.unwrap();
assert_eq!(indices, vec![0, 2, 4, 6]);
assert_eq!(indices, BTreeSet::from_iter([0, 2, 4, 6]));
// An index reader with a single tag "tag-0" but without value "tag-0_value-0"
let mut mock_reader = MockInvertedIndexReader::new();
@@ -251,7 +264,7 @@ mod tests {
.apply(SearchContext::default(), &mut mock_reader)
.await
.unwrap();
assert_eq!(indices, vec![0, 4, 6]);
assert_eq!(indices, BTreeSet::from_iter([0, 4, 6]));
}
#[tokio::test]
@@ -269,7 +282,7 @@ mod tests {
.apply(SearchContext::default(), &mut mock_reader)
.await
.unwrap();
assert_eq!(indices, vec![0, 1, 2, 3, 4, 5, 6, 7]); // full range to scan
assert_eq!(indices, BTreeSet::from_iter([0, 1, 2, 3, 4, 5, 6, 7])); // full range to scan
}
#[tokio::test]
@@ -341,6 +354,21 @@ mod tests {
)
.await
.unwrap();
assert_eq!(indices, vec![0, 1, 2, 3, 4, 5, 6, 7]);
assert_eq!(indices, BTreeSet::from_iter([0, 1, 2, 3, 4, 5, 6, 7]));
}
#[test]
fn test_index_applier_memory_usage() {
let mut mock_fst_applier = MockFstApplier::new();
mock_fst_applier.expect_memory_usage().returning(|| 100);
let applier = PredicatesIndexApplier {
fst_appliers: vec![(s("tag-0"), Box::new(mock_fst_applier))],
};
assert_eq!(
applier.memory_usage(),
size_of::<(IndexName, Box<dyn FstApplier>)>() + 5 + 100
);
}
}

View File

@@ -14,6 +14,7 @@ async-stream.workspace = true
async-trait.workspace = true
byteorder = "1.4"
bytes.workspace = true
chrono.workspace = true
common-base.workspace = true
common-config.workspace = true
common-error.workspace = true
@@ -21,7 +22,6 @@ common-macro.workspace = true
common-meta.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
dashmap.workspace = true
futures-util.workspace = true
futures.workspace = true
protobuf = { version = "2", features = ["bytes"] }
@@ -37,4 +37,7 @@ tokio.workspace = true
[dev-dependencies]
common-meta = { workspace = true, features = ["testing"] }
common-test-util.workspace = true
itertools.workspace = true
rand.workspace = true
rand_distr = "0.4"
uuid.workspace = true

View File

@@ -18,6 +18,7 @@ use common_config::wal::KafkaWalTopic;
use common_error::ext::ErrorExt;
use common_macro::stack_trace_debug;
use common_runtime::error::Error as RuntimeError;
use serde_json::error::Error as JsonError;
use snafu::{Location, Snafu};
use crate::kafka::NamespaceImpl as KafkaNamespace;
@@ -123,20 +124,6 @@ pub enum Error {
error: String,
},
#[snafu(display("Failed to encode a record meta"))]
EncodeMeta {
location: Location,
#[snafu(source)]
error: serde_json::Error,
},
#[snafu(display("Failed to decode a record meta"))]
DecodeMeta {
location: Location,
#[snafu(source)]
error: serde_json::Error,
},
#[snafu(display("Missing required key in a record"))]
MissingKey { location: Location },
@@ -146,9 +133,16 @@ pub enum Error {
#[snafu(display("Cannot build a record from empty entries"))]
EmptyEntries { location: Location },
#[snafu(display("Failed to produce records to Kafka, topic: {}", topic))]
#[snafu(display(
"Failed to produce records to Kafka, topic: {}, size: {}, limit: {}",
topic,
size,
limit,
))]
ProduceRecord {
topic: KafkaWalTopic,
size: usize,
limit: usize,
location: Location,
#[snafu(source)]
error: rskafka::client::producer::Error,
@@ -172,6 +166,23 @@ pub enum Error {
#[snafu(display("Failed to do a cast"))]
Cast { location: Location },
#[snafu(display("Failed to encode object into json"))]
EncodeJson {
location: Location,
#[snafu(source)]
error: JsonError,
},
#[snafu(display("Failed to decode object from json"))]
DecodeJson {
location: Location,
#[snafu(source)]
error: JsonError,
},
#[snafu(display("The record sequence is not legal, error: {}", error))]
IllegalSequence { location: Location, error: String },
}
impl ErrorExt for Error {

View File

@@ -12,10 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod client_manager;
pub(crate) mod client_manager;
pub mod log_store;
mod offset;
mod record_utils;
pub(crate) mod util;
use std::fmt::Display;
@@ -29,8 +28,8 @@ use crate::error::Error;
/// Kafka Namespace implementation.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)]
pub struct NamespaceImpl {
region_id: u64,
topic: Topic,
pub region_id: u64,
pub topic: Topic,
}
impl Namespace for NamespaceImpl {
@@ -41,7 +40,7 @@ impl Namespace for NamespaceImpl {
impl Display for NamespaceImpl {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}/{}", self.topic, self.region_id)
write!(f, "[topic: {}, region: {}]", self.topic, self.region_id)
}
}
@@ -49,11 +48,11 @@ impl Display for NamespaceImpl {
#[derive(Debug, PartialEq, Clone)]
pub struct EntryImpl {
/// Entry payload.
data: Vec<u8>,
pub data: Vec<u8>,
/// The logical entry id.
id: EntryId,
pub id: EntryId,
/// The namespace used to identify and isolate log entries from different regions.
ns: NamespaceImpl,
pub ns: NamespaceImpl,
}
impl Entry for EntryImpl {
@@ -77,7 +76,7 @@ impl Display for EntryImpl {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Entry (ns: {}, id: {}, data_len: {})",
"Entry [ns: {}, id: {}, data_len: {}]",
self.ns,
self.id,
self.data.len()

View File

@@ -62,7 +62,7 @@ impl Client {
/// Manages client construction and accesses.
#[derive(Debug)]
pub(crate) struct ClientManager {
config: KafkaConfig,
pub(crate) config: KafkaConfig,
/// Top-level client in kafka. All clients are constructed by this client.
client_factory: RsKafkaClient,
/// A pool maintaining a collection of clients.
@@ -98,14 +98,13 @@ impl ClientManager {
/// Gets the client associated with the topic. If the client does not exist, a new one will
/// be created and returned.
pub(crate) async fn get_or_insert(&self, topic: &Topic) -> Result<Client> {
let client_pool = self.client_pool.read().await;
if let Some(client) = client_pool.get(topic) {
return Ok(client.clone());
{
let client_pool = self.client_pool.read().await;
if let Some(client) = client_pool.get(topic) {
return Ok(client.clone());
}
}
// Manually releases the read lock.
drop(client_pool);
// Acquires the write lock.
let mut client_pool = self.client_pool.write().await;
match client_pool.get(topic) {
Some(client) => Ok(client.clone()),
@@ -134,3 +133,95 @@ impl ClientManager {
Ok(Client::new(raw_client, &self.config))
}
}
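A minimal sketch of the read-then-write double-checked pattern that `get_or_insert` uses above, with a simplified `HashMap<String, u32>` standing in for the real client pool:
use std::collections::HashMap;
use tokio::sync::RwLock;
// Hedged sketch only; not the actual ClientManager fields or Client type.
async fn get_or_insert_sketch(pool: &RwLock<HashMap<String, u32>>, topic: &str) -> u32 {
    {
        // Fast path: the read guard is dropped at the end of this scope.
        if let Some(v) = pool.read().await.get(topic) {
            return *v;
        }
    }
    // Slow path: re-check under the write lock before inserting.
    let mut pool = pool.write().await;
    *pool.entry(topic.to_string()).or_insert(0)
}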
#[cfg(test)]
mod tests {
use common_meta::wal::kafka::test_util::run_test_with_kafka_wal;
use tokio::sync::Barrier;
use super::*;
use crate::test_util::kafka::create_topics;
/// Prepares for a test by creating a collection of topics and a client manager.
async fn prepare(
test_name: &str,
num_topics: usize,
broker_endpoints: Vec<String>,
) -> (ClientManager, Vec<Topic>) {
let topics = create_topics(
num_topics,
|i| format!("{test_name}_{}_{}", i, uuid::Uuid::new_v4()),
&broker_endpoints,
)
.await;
let config = KafkaConfig {
broker_endpoints,
..Default::default()
};
let manager = ClientManager::try_new(&config).await.unwrap();
(manager, topics)
}
/// Sends `get_or_insert` requests sequentially to the client manager and checks that it handles them correctly.
#[tokio::test]
async fn test_sequential() {
run_test_with_kafka_wal(|broker_endpoints| {
Box::pin(async {
let (manager, topics) = prepare("test_sequential", 128, broker_endpoints).await;
// Assigns multiple regions to a topic.
let region_topic = (0..512)
.map(|region_id| (region_id, &topics[region_id % topics.len()]))
.collect::<HashMap<_, _>>();
// Gets all clients sequentially.
for (_, topic) in region_topic {
manager.get_or_insert(topic).await.unwrap();
}
// Ensures all clients exist.
let client_pool = manager.client_pool.read().await;
let all_exist = topics.iter().all(|topic| client_pool.contains_key(topic));
assert!(all_exist);
})
})
.await;
}
/// Sends `get_or_insert` requests in parallel to the client manager and checks that it handles them correctly.
#[tokio::test(flavor = "multi_thread")]
async fn test_parallel() {
run_test_with_kafka_wal(|broker_endpoints| {
Box::pin(async {
let (manager, topics) = prepare("test_parallel", 128, broker_endpoints).await;
// Assigns multiple regions to a topic.
let region_topic = (0..512)
.map(|region_id| (region_id, topics[region_id % topics.len()].clone()))
.collect::<HashMap<_, _>>();
// Gets all clients in parallel.
let manager = Arc::new(manager);
let barrier = Arc::new(Barrier::new(region_topic.len()));
let tasks = region_topic
.into_values()
.map(|topic| {
let manager = manager.clone();
let barrier = barrier.clone();
tokio::spawn(async move {
barrier.wait().await;
assert!(manager.get_or_insert(&topic).await.is_ok());
})
})
.collect::<Vec<_>>();
futures::future::try_join_all(tasks).await.unwrap();
// Ensures all clients exist.
let client_pool = manager.client_pool.read().await;
let all_exist = topics.iter().all(|topic| client_pool.contains_key(topic));
assert!(all_exist);
})
})
.await;
}
}

View File

@@ -26,10 +26,10 @@ use store_api::logstore::entry_stream::SendableEntryStream;
use store_api::logstore::namespace::Id as NamespaceId;
use store_api::logstore::{AppendBatchResponse, AppendResponse, LogStore};
use crate::error::{ConsumeRecordSnafu, Error, GetOffsetSnafu, Result};
use crate::error::{ConsumeRecordSnafu, Error, GetOffsetSnafu, IllegalSequenceSnafu, Result};
use crate::kafka::client_manager::{ClientManager, ClientManagerRef};
use crate::kafka::offset::Offset;
use crate::kafka::record_utils::{decode_from_record, RecordProducer};
use crate::kafka::util::offset::Offset;
use crate::kafka::util::record::{maybe_emit_entry, Record, RecordProducer};
use crate::kafka::{EntryImpl, NamespaceImpl};
/// A log store backed by Kafka.
@@ -85,8 +85,6 @@ impl LogStore for KafkaLogStore {
/// Appends a batch of entries and returns a response containing a map where the key is a region id
/// and the value is the id of the last successfully written entry of the region.
async fn append_batch(&self, entries: Vec<Self::Entry>) -> Result<AppendBatchResponse> {
debug!("LogStore handles append_batch with entries {:?}", entries);
if entries.is_empty() {
return Ok(AppendBatchResponse::default());
}
@@ -96,7 +94,7 @@ impl LogStore for KafkaLogStore {
for entry in entries {
producers
.entry(entry.ns.region_id)
.or_insert(RecordProducer::new(entry.ns.clone()))
.or_insert_with(|| RecordProducer::new(entry.ns.clone()))
.push(entry);
}
@@ -115,8 +113,6 @@ impl LogStore for KafkaLogStore {
.into_iter()
.collect::<HashMap<_, _>>();
debug!("Append batch result: {:?}", last_entry_ids);
Ok(AppendBatchResponse { last_entry_ids })
}
@@ -127,13 +123,10 @@ impl LogStore for KafkaLogStore {
ns: &Self::Namespace,
entry_id: EntryId,
) -> Result<SendableEntryStream<Self::Entry, Self::Error>> {
let topic = ns.topic.clone();
let region_id = ns.region_id;
// Gets the client associated with the topic.
let client = self
.client_manager
.get_or_insert(&topic)
.get_or_insert(&ns.topic)
.await?
.raw_client
.clone();
@@ -147,14 +140,19 @@ impl LogStore for KafkaLogStore {
.await
.context(GetOffsetSnafu { ns: ns.clone() })?
- 1;
// Reads entries with offsets in the range [start_offset, end_offset).
// Reads entries with offsets in the range [start_offset, end_offset].
let start_offset = Offset::try_from(entry_id)?.0;
debug!(
"Start reading entries in range [{}, {}] for ns {}",
start_offset, end_offset, ns
);
// Aborts if there are no new entries.
// FIXME(niebayes): why does this case happen?
if start_offset > end_offset {
warn!(
"No new entries for ns {} in range [{}, {})",
"No new entries for ns {} in range [{}, {}]",
ns, start_offset, end_offset
);
return Ok(futures_util::stream::empty().boxed());
@@ -162,48 +160,56 @@ impl LogStore for KafkaLogStore {
let mut stream_consumer = StreamConsumerBuilder::new(client, StartOffset::At(start_offset))
.with_max_batch_size(self.config.max_batch_size.as_bytes() as i32)
.with_max_wait_ms(self.config.produce_record_timeout.as_millis() as i32)
.with_max_wait_ms(self.config.consumer_wait_timeout.as_millis() as i32)
.build();
debug!(
"Built a stream consumer for ns {} to consume entries in range [{}, {})",
"Built a stream consumer for ns {} to consume entries in range [{}, {}]",
ns, start_offset, end_offset
);
// Key: entry id, Value: the records associated with the entry.
let mut entry_records: HashMap<_, Vec<_>> = HashMap::new();
let ns_clone = ns.clone();
let stream = async_stream::stream!({
while let Some(consume_result) = stream_consumer.next().await {
// Each next will produce a `RecordAndOffset` and a high watermark offset.
// Each next on the stream consumer produces a `RecordAndOffset` and a high watermark offset.
// The `RecordAndOffset` contains the record data and its start offset.
// The high watermark offset is the end offset of the latest record in the partition.
let (record, high_watermark) = consume_result.context(ConsumeRecordSnafu {
ns: ns_clone.clone(),
})?;
let record_offset = record.offset;
// The high watermark offset is the offset of the last record plus one.
let (record_and_offset, high_watermark) =
consume_result.with_context(|_| ConsumeRecordSnafu {
ns: ns_clone.clone(),
})?;
let (kafka_record, offset) = (record_and_offset.record, record_and_offset.offset);
debug!(
"Read a record at offset {} for ns {}, high watermark: {}",
record_offset, ns_clone, high_watermark
offset, ns_clone, high_watermark
);
// Ignores noop records.
if record.record.value.is_none() {
// Ignores no-op records.
if kafka_record.value.is_none() {
if check_termination(offset, end_offset, &entry_records)? {
break;
}
continue;
}
let entries = decode_from_record(record.record)?;
// Filters entries by region id.
if let Some(entry) = entries.first()
&& entry.ns.region_id == region_id
{
yield Ok(entries);
// Filters records by namespace.
let record = Record::try_from(kafka_record)?;
if record.meta.ns != ns_clone {
if check_termination(offset, end_offset, &entry_records)? {
break;
}
continue;
}
// Terminates the stream if the entry with the end offset was read.
if record_offset >= end_offset {
debug!(
"Stream consumer for ns {} terminates at offset {}",
ns_clone, record_offset
);
// Tries to construct an entry from records consumed so far.
if let Some(entry) = maybe_emit_entry(record, &mut entry_records)? {
yield Ok(vec![entry]);
}
if check_termination(offset, end_offset, &entry_records)? {
break;
}
}
@@ -252,3 +258,226 @@ impl LogStore for KafkaLogStore {
Ok(())
}
}
fn check_termination(
offset: i64,
end_offset: i64,
entry_records: &HashMap<EntryId, Vec<Record>>,
) -> Result<bool> {
// Terminates the stream if the entry with the end offset was read.
if offset >= end_offset {
debug!("Stream consumer terminates at offset {}", offset);
// There must be no buffered records left when the stream terminates.
if !entry_records.is_empty() {
return IllegalSequenceSnafu {
error: "Found records leftover",
}
.fail();
}
Ok(true)
} else {
Ok(false)
}
}
#[cfg(test)]
mod tests {
use common_base::readable_size::ReadableSize;
use common_config::wal::KafkaWalTopic as Topic;
use rand::seq::IteratorRandom;
use super::*;
use crate::test_util::kafka::{
create_topics, entries_with_random_data, new_namespace, EntryBuilder,
};
// Stores test context for a region.
struct RegionContext {
ns: NamespaceImpl,
entry_builder: EntryBuilder,
expected: Vec<EntryImpl>,
flushed_entry_id: EntryId,
}
/// Prepares for a test by constructing a log store and creating a collection of topics.
async fn prepare(
test_name: &str,
num_topics: usize,
broker_endpoints: Vec<String>,
) -> (KafkaLogStore, Vec<Topic>) {
let topics = create_topics(
num_topics,
|i| format!("{test_name}_{}_{}", i, uuid::Uuid::new_v4()),
&broker_endpoints,
)
.await;
let config = KafkaConfig {
broker_endpoints,
max_batch_size: ReadableSize::kb(32),
..Default::default()
};
let logstore = KafkaLogStore::try_new(&config).await.unwrap();
// Appends a no-op record to each topic.
for topic in topics.iter() {
let last_entry_id = logstore
.append(EntryImpl {
data: vec![],
id: 0,
ns: new_namespace(topic, 0),
})
.await
.unwrap()
.last_entry_id;
assert_eq!(last_entry_id, 0);
}
(logstore, topics)
}
/// Creates a vector containing the indexes of all regions if `all` is true.
/// Otherwise, creates a subset of the indexes whose cardinality is roughly
/// a quarter of that of the full set.
fn all_or_subset(all: bool, num_regions: usize) -> Vec<u64> {
assert!(num_regions > 0);
let amount = if all {
num_regions
} else {
(num_regions / 4).max(1)
};
(0..num_regions as u64).choose_multiple(&mut rand::thread_rng(), amount)
}
/// Builds entries for regions specified by `which`. Builds large entries if `large` is true.
/// Returns the aggregated entries.
fn build_entries(
region_contexts: &mut HashMap<u64, RegionContext>,
which: &[u64],
large: bool,
) -> Vec<EntryImpl> {
let mut aggregated = Vec::with_capacity(which.len());
for region_id in which {
let ctx = region_contexts.get_mut(region_id).unwrap();
// Builds entries for the region.
ctx.expected = if !large {
entries_with_random_data(3, &ctx.entry_builder)
} else {
// Builds a large entry of size 256KB, far greater than the configured `max_batch_size` of 32KB.
let large_entry = ctx.entry_builder.with_data([b'1'; 256 * 1024]);
vec![large_entry]
};
// Aggregates entries of all regions.
aggregated.push(ctx.expected.clone());
}
aggregated.into_iter().flatten().collect()
}
/// Starts a test with:
/// * `test_name` - The name of the test.
/// * `num_topics` - Number of topics to be created in the preparation phase.
/// * `num_regions` - Number of regions involved in the test.
/// * `num_appends` - Number of append operations to be performed.
/// * `all` - All regions are involved in each append operation if `all` is true. Otherwise,
/// each append operation only randomly chooses a subset of regions.
/// * `large` - Builds large entries for each region if `large` is true.
async fn test_with(
test_name: &str,
num_topics: usize,
num_regions: usize,
num_appends: usize,
all: bool,
large: bool,
) {
let Ok(broker_endpoints) = std::env::var("GT_KAFKA_ENDPOINTS") else {
warn!("The endpoints is empty, skipping the test {test_name}");
return;
};
let broker_endpoints = broker_endpoints
.split(',')
.map(|s| s.trim().to_string())
.collect::<Vec<_>>();
let (logstore, topics) = prepare(test_name, num_topics, broker_endpoints).await;
let mut region_contexts = (0..num_regions)
.map(|i| {
let topic = &topics[i % topics.len()];
let ns = new_namespace(topic, i as u64);
let entry_builder = EntryBuilder::new(ns.clone());
(
i as u64,
RegionContext {
ns,
entry_builder,
expected: Vec::new(),
flushed_entry_id: 0,
},
)
})
.collect();
for _ in 0..num_appends {
// Appends entries for a subset of regions.
let which = all_or_subset(all, num_regions);
let entries = build_entries(&mut region_contexts, &which, large);
let last_entry_ids = logstore.append_batch(entries).await.unwrap().last_entry_ids;
// Reads entries for regions and checks for each region that the entries read back are identical to the expected ones.
for region_id in which {
let ctx = &region_contexts[&region_id];
let stream = logstore
.read(&ctx.ns, ctx.flushed_entry_id + 1)
.await
.unwrap();
let got = stream
.collect::<Vec<_>>()
.await
.into_iter()
.flat_map(|x| x.unwrap())
.collect::<Vec<_>>();
assert_eq!(ctx.expected, got);
}
// Simulates a flush for regions.
for (region_id, last_entry_id) in last_entry_ids {
let ctx = region_contexts.get_mut(&region_id).unwrap();
ctx.flushed_entry_id = last_entry_id;
}
}
}
/// Appends entries for one region and checks all entries can be read successfully.
#[tokio::test]
async fn test_one_region() {
test_with("test_one_region", 1, 1, 1, true, false).await;
}
/// Appends entries for multiple regions and checks entries for each region can be read successfully.
/// A topic is assigned only a single region.
#[tokio::test]
async fn test_multi_regions_disjoint() {
test_with("test_multi_regions_disjoint", 5, 5, 1, true, false).await;
}
/// Appends entries for multiple regions and checks entries for each region can be read successfully.
/// A topic is assigned multiple regions.
#[tokio::test]
async fn test_multi_regions_overlapped() {
test_with("test_multi_regions_overlapped", 5, 20, 1, true, false).await;
}
/// Appends entries for multiple regions and checks entries for each region can be read successfully.
/// A topic may be assigned multiple regions. The append operation repeats for several iterations.
/// Each append operation will only append entries for a subset of randomly chosen regions.
#[tokio::test]
async fn test_multi_appends() {
test_with("test_multi_appends", 5, 20, 3, false, false).await;
}
/// Appends large entries for multiple regions and checks entries for each region can be read successfully.
/// A topic may be assigned multiple regions.
#[tokio::test]
async fn test_append_large_entries() {
test_with("test_append_large_entries", 5, 20, 3, true, true).await;
}
}

View File

@@ -1,188 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use rskafka::record::Record;
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{
DecodeMetaSnafu, EmptyEntriesSnafu, EncodeMetaSnafu, GetClientSnafu, MissingKeySnafu,
MissingValueSnafu, ProduceRecordSnafu, Result,
};
use crate::kafka::client_manager::ClientManagerRef;
use crate::kafka::offset::Offset;
use crate::kafka::{EntryId, EntryImpl, NamespaceImpl};
/// Record metadata which will be serialized/deserialized to/from the `key` of a Record.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct RecordMeta {
/// Meta version. Used for backward compatibility.
version: u32,
/// The namespace of the entries wrapped in the record.
ns: NamespaceImpl,
/// Ids of the entries built into the record.
entry_ids: Vec<EntryId>,
/// entry_offsets[i] is the end offset (exclusive) of the data of the i-th entry in the record value.
entry_offsets: Vec<usize>,
}
impl RecordMeta {
fn new(ns: NamespaceImpl, entries: &[EntryImpl]) -> Self {
Self {
version: 0,
ns,
entry_ids: entries.iter().map(|entry| entry.id).collect(),
entry_offsets: entries
.iter()
.map(|entry| entry.data.len())
.scan(0, |presum, x| {
*presum += x;
Some(*presum)
})
.collect(),
}
}
}
/// Produces a record to a kafka topic.
pub(crate) struct RecordProducer {
/// The namespace of the entries.
ns: NamespaceImpl,
/// Entries are buffered before being built into a record.
entries: Vec<EntryImpl>,
}
impl RecordProducer {
/// Creates a new producer for producing entries with the given namespace.
pub(crate) fn new(ns: NamespaceImpl) -> Self {
Self {
ns,
entries: Vec::new(),
}
}
/// Populates the entry buffer with the given entries.
pub(crate) fn with_entries(self, entries: Vec<EntryImpl>) -> Self {
Self { entries, ..self }
}
/// Pushes an entry into the entry buffer.
pub(crate) fn push(&mut self, entry: EntryImpl) {
self.entries.push(entry);
}
/// Produces the buffered entries to the Kafka server as a Kafka record.
/// Returns the kafka offset of the produced record.
// TODO(niebayes): since the total size of a region's entries may be way too large,
// the producer may need to support splitting entries into multiple records.
pub(crate) async fn produce(self, client_manager: &ClientManagerRef) -> Result<Offset> {
ensure!(!self.entries.is_empty(), EmptyEntriesSnafu);
// Produces the record through a client. The client determines when to send the record to kafka server.
let client = client_manager
.get_or_insert(&self.ns.topic)
.await
.map_err(|e| {
GetClientSnafu {
topic: &self.ns.topic,
error: e.to_string(),
}
.build()
})?;
client
.producer
.produce(encode_to_record(self.ns.clone(), self.entries)?)
.await
.map(Offset)
.context(ProduceRecordSnafu {
topic: &self.ns.topic,
})
}
}
fn encode_to_record(ns: NamespaceImpl, entries: Vec<EntryImpl>) -> Result<Record> {
let meta = RecordMeta::new(ns, &entries);
let data = entries.into_iter().flat_map(|entry| entry.data).collect();
Ok(Record {
key: Some(serde_json::to_vec(&meta).context(EncodeMetaSnafu)?),
value: Some(data),
timestamp: rskafka::chrono::Utc::now(),
headers: Default::default(),
})
}
pub(crate) fn decode_from_record(record: Record) -> Result<Vec<EntryImpl>> {
let key = record.key.context(MissingKeySnafu)?;
let value = record.value.context(MissingValueSnafu)?;
let meta: RecordMeta = serde_json::from_slice(&key).context(DecodeMetaSnafu)?;
let mut entries = Vec::with_capacity(meta.entry_ids.len());
let mut start_offset = 0;
for (i, end_offset) in meta.entry_offsets.iter().enumerate() {
entries.push(EntryImpl {
// TODO(niebayes): try to avoid the clone.
data: value[start_offset..*end_offset].to_vec(),
id: meta.entry_ids[i],
ns: meta.ns.clone(),
});
start_offset = *end_offset;
}
Ok(entries)
}
#[cfg(test)]
mod tests {
use super::*;
fn new_test_entry<D: AsRef<[u8]>>(data: D, entry_id: EntryId, ns: NamespaceImpl) -> EntryImpl {
EntryImpl {
data: data.as_ref().to_vec(),
id: entry_id,
ns,
}
}
#[test]
fn test_serde_record_meta() {
let ns = NamespaceImpl {
region_id: 1,
topic: "test_topic".to_string(),
};
let entries = vec![
new_test_entry(b"111", 1, ns.clone()),
new_test_entry(b"2222", 2, ns.clone()),
new_test_entry(b"33333", 3, ns.clone()),
];
let meta = RecordMeta::new(ns, &entries);
let encoded = serde_json::to_vec(&meta).unwrap();
let decoded: RecordMeta = serde_json::from_slice(&encoded).unwrap();
assert_eq!(meta, decoded);
}
#[test]
fn test_encdec_record() {
let ns = NamespaceImpl {
region_id: 1,
topic: "test_topic".to_string(),
};
let entries = vec![
new_test_entry(b"111", 1, ns.clone()),
new_test_entry(b"2222", 2, ns.clone()),
new_test_entry(b"33333", 3, ns.clone()),
];
let record = encode_to_record(ns, entries.clone()).unwrap();
let decoded_entries = decode_from_record(record).unwrap();
assert_eq!(entries, decoded_entries);
}
}

View File

@@ -0,0 +1,18 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod offset;
pub mod record;
#[cfg(test)]
mod test_util;

View File

@@ -0,0 +1,569 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use rskafka::record::Record as KafkaRecord;
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use crate::error::{
DecodeJsonSnafu, EmptyEntriesSnafu, EncodeJsonSnafu, GetClientSnafu, IllegalSequenceSnafu,
MissingKeySnafu, MissingValueSnafu, ProduceRecordSnafu, Result,
};
use crate::kafka::client_manager::ClientManagerRef;
use crate::kafka::util::offset::Offset;
use crate::kafka::{EntryId, EntryImpl, NamespaceImpl};
/// The current version of Record.
pub(crate) const VERSION: u32 = 0;
/// The estimated size in bytes of a serialized RecordMeta.
/// A record is guaranteed to have sizeof(data) <= max_batch_size - ESTIMATED_META_SIZE,
/// so that sizeof(meta) + sizeof(data) stays within max_batch_size.
const ESTIMATED_META_SIZE: usize = 256;
/// The type of a record.
///
/// - If the entry is able to fit into a Kafka record, it's converted into a Full record.
///
/// - If the entry is too large to fit into a Kafka record, it's converted into a collection of records.
/// Those records must contain exactly one First record, one Last record, and zero or more
/// Middle records.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum RecordType {
/// The record is self-contained, i.e. an entry's data is fully stored into this record.
Full,
/// The record contains the first part of an entry's data.
First,
/// The record contains one of the middle parts of an entry's data.
/// The sequence of the record is identified by the inner field.
Middle(usize),
/// The record contains the last part of an entry's data.
Last,
}
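As a rough worked example of the splitting scheme above, assuming a hypothetical maximum record payload of 4 bytes, an 11-byte entry maps to a First record, one Middle record, and a Last record:
// Hedged sketch of how chunks map to record types; `max_payload` is an
// assumed limit, not the real max_batch_size handling below.
fn sketch_record_types(data_len: usize, max_payload: usize) -> Vec<String> {
    let num_chunks = (data_len + max_payload - 1) / max_payload;
    (0..num_chunks)
        .map(|i| match (i, num_chunks) {
            (0, 1) => "Full".to_string(),
            (0, _) => "First".to_string(),
            (i, n) if i == n - 1 => "Last".to_string(),
            (i, _) => format!("Middle({i})"),
        })
        .collect()
}
// sketch_record_types(11, 4) == ["First", "Middle(1)", "Last"]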
/// The metadata of a record.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct RecordMeta {
/// The version of the record. Used for backward compatibility.
version: u32,
/// The type of the record.
pub tp: RecordType,
/// The id of the entry the record associated with.
pub entry_id: EntryId,
/// The namespace of the entry the record associated with.
pub ns: NamespaceImpl,
}
/// The minimal storage unit in the Kafka log store.
///
/// An entry is first converted into one or more Records before being produced.
/// If an entry fits into a KafkaRecord, it is converted into a single Record.
/// Otherwise, it is split into a collection of Records.
///
/// A KafkaRecord is the minimal storage unit used by Kafka client and Kafka server.
/// The Kafka client produces KafkaRecords and consumes KafkaRecords, and Kafka server stores
/// a collection of KafkaRecords.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct Record {
/// The metadata of the record.
pub(crate) meta: RecordMeta,
/// The payload of the record.
data: Vec<u8>,
}
impl TryFrom<Record> for KafkaRecord {
type Error = crate::error::Error;
fn try_from(record: Record) -> Result<Self> {
let key = serde_json::to_vec(&record.meta).context(EncodeJsonSnafu)?;
Ok(KafkaRecord {
key: Some(key),
value: Some(record.data),
timestamp: chrono::Utc::now(),
headers: Default::default(),
})
}
}
impl TryFrom<KafkaRecord> for Record {
type Error = crate::error::Error;
fn try_from(kafka_record: KafkaRecord) -> Result<Self> {
let key = kafka_record.key.context(MissingKeySnafu)?;
let meta = serde_json::from_slice(&key).context(DecodeJsonSnafu)?;
let data = kafka_record.value.context(MissingValueSnafu)?;
Ok(Self { meta, data })
}
}
impl From<Vec<Record>> for EntryImpl {
fn from(records: Vec<Record>) -> Self {
let entry_id = records[0].meta.entry_id;
let ns = records[0].meta.ns.clone();
let data = records.into_iter().flat_map(|record| record.data).collect();
EntryImpl {
data,
id: entry_id,
ns,
}
}
}
/// Produces a record to a kafka topic.
pub(crate) struct RecordProducer {
/// The namespace of the entries.
ns: NamespaceImpl,
/// Entries are buffered before being built into a record.
entries: Vec<EntryImpl>,
}
impl RecordProducer {
/// Creates a new producer for producing entries with the given namespace.
pub(crate) fn new(ns: NamespaceImpl) -> Self {
Self {
ns,
entries: Vec::new(),
}
}
/// Populates the entry buffer with the given entries.
pub(crate) fn with_entries(self, entries: Vec<EntryImpl>) -> Self {
Self { entries, ..self }
}
/// Pushes an entry into the entry buffer.
pub(crate) fn push(&mut self, entry: EntryImpl) {
self.entries.push(entry);
}
/// Produces the buffered entries to the Kafka server. Those entries may span several Kafka records.
/// Returns the offset of the last successfully produced record.
pub(crate) async fn produce(self, client_manager: &ClientManagerRef) -> Result<Offset> {
ensure!(!self.entries.is_empty(), EmptyEntriesSnafu);
// Gets the producer in which a record buffer is maintained.
let producer = client_manager
.get_or_insert(&self.ns.topic)
.await
.map_err(|e| {
GetClientSnafu {
topic: &self.ns.topic,
error: e.to_string(),
}
.build()
})?
.producer;
// Stores the offset of the last successfully produced record.
let mut last_offset = None;
let max_record_size =
client_manager.config.max_batch_size.as_bytes() as usize - ESTIMATED_META_SIZE;
for entry in self.entries {
for record in build_records(entry, max_record_size) {
let kafka_record = KafkaRecord::try_from(record)?;
// Records of a certain region cannot be produced in parallel since their order must be preserved.
let offset = producer
.produce(kafka_record.clone())
.await
.map(Offset)
.with_context(|_| ProduceRecordSnafu {
topic: &self.ns.topic,
size: kafka_record.approximate_size(),
limit: max_record_size,
})?;
last_offset = Some(offset);
}
}
// Safety: at least one record must have been produced since the entries are guaranteed to be non-empty.
Ok(last_offset.unwrap())
}
}
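For a rough sense of the arithmetic, assuming the values used by the tests further below (a `max_batch_size` of 1 MiB and the 256-byte `ESTIMATED_META_SIZE`), a 2,000,000-byte entry splits into two records:
// Hedged arithmetic sketch; 1 MiB and 256 bytes are assumed values, not config.
fn sketch_num_records(entry_len: usize) -> usize {
    let max_record_size = 1024 * 1024 - 256; // max_batch_size - ESTIMATED_META_SIZE
    if entry_len <= max_record_size {
        1
    } else {
        (entry_len + max_record_size - 1) / max_record_size
    }
}
// sketch_num_records(2_000_000) == 2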
fn record_type(seq: usize, num_records: usize) -> RecordType {
if seq == 0 {
RecordType::First
} else if seq == num_records - 1 {
RecordType::Last
} else {
RecordType::Middle(seq)
}
}
fn build_records(entry: EntryImpl, max_record_size: usize) -> Vec<Record> {
if entry.data.len() <= max_record_size {
let record = Record {
meta: RecordMeta {
version: VERSION,
tp: RecordType::Full,
entry_id: entry.id,
ns: entry.ns,
},
data: entry.data,
};
return vec![record];
}
let chunks = entry.data.chunks(max_record_size);
let num_chunks = chunks.len();
chunks
.enumerate()
.map(|(i, chunk)| Record {
meta: RecordMeta {
version: VERSION,
tp: record_type(i, num_chunks),
entry_id: entry.id,
ns: entry.ns.clone(),
},
data: chunk.to_vec(),
})
.collect()
}
pub fn maybe_emit_entry(
record: Record,
entry_records: &mut HashMap<EntryId, Vec<Record>>,
) -> Result<Option<EntryImpl>> {
let mut entry = None;
match record.meta.tp {
RecordType::Full => {
entry = Some(EntryImpl::from(vec![record]));
}
RecordType::First => {
ensure!(
!entry_records.contains_key(&record.meta.entry_id),
IllegalSequenceSnafu {
error: "First record must be the first"
}
);
entry_records.insert(record.meta.entry_id, vec![record]);
}
RecordType::Middle(seq) => {
let prefix =
entry_records
.get_mut(&record.meta.entry_id)
.context(IllegalSequenceSnafu {
error: "Middle record must not be the first",
})?;
// Safety: the records are guaranteed not empty if the key exists.
let last_record = prefix.last().unwrap();
let legal = match last_record.meta.tp {
// Legal if this record follows a First record.
RecordType::First => seq == 1,
// Legal if this record follows a Middle record just prior to this record.
RecordType::Middle(last_seq) => last_seq + 1 == seq,
// Illegal sequence.
_ => false,
};
ensure!(
legal,
IllegalSequenceSnafu {
error: "Illegal prefix for a Middle record"
}
);
prefix.push(record);
}
RecordType::Last => {
// There must be a sequence prefix before a Last record is read.
let mut records =
entry_records
.remove(&record.meta.entry_id)
.context(IllegalSequenceSnafu {
error: "Missing prefix for a Last record",
})?;
records.push(record);
entry = Some(EntryImpl::from(records));
}
}
Ok(entry)
}
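A minimal sketch of how `maybe_emit_entry` is meant to be driven, with an in-memory record list standing in for the Kafka consumer stream (the real consumption loop lives in `log_store.rs`):
// Hedged sketch only; error handling and namespace filtering are omitted.
fn collect_entries(records: Vec<Record>) -> Result<Vec<EntryImpl>> {
    let mut buffered: HashMap<EntryId, Vec<Record>> = HashMap::new();
    let mut entries = Vec::new();
    for record in records {
        if let Some(entry) = maybe_emit_entry(record, &mut buffered)? {
            entries.push(entry);
        }
    }
    Ok(entries)
}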
#[cfg(test)]
mod tests {
use std::sync::Arc;
use common_base::readable_size::ReadableSize;
use common_config::wal::KafkaConfig;
use uuid::Uuid;
use super::*;
use crate::kafka::client_manager::ClientManager;
use crate::kafka::util::test_util::run_test_with_kafka_wal;
// Implements some utility methods for testing.
impl Default for Record {
fn default() -> Self {
Self {
meta: RecordMeta {
version: VERSION,
tp: RecordType::Full,
ns: NamespaceImpl {
region_id: 0,
topic: "greptimedb_wal_topic".to_string(),
},
entry_id: 0,
},
data: Vec::new(),
}
}
}
impl Record {
/// Overrides tp.
fn with_tp(&self, tp: RecordType) -> Self {
Self {
meta: RecordMeta {
tp,
..self.meta.clone()
},
..self.clone()
}
}
/// Overrides data with the given data.
fn with_data(&self, data: &[u8]) -> Self {
Self {
data: data.to_vec(),
..self.clone()
}
}
/// Overrides entry id.
fn with_entry_id(&self, entry_id: EntryId) -> Self {
Self {
meta: RecordMeta {
entry_id,
..self.meta.clone()
},
..self.clone()
}
}
/// Overrides namespace.
fn with_ns(&self, ns: NamespaceImpl) -> Self {
Self {
meta: RecordMeta { ns, ..self.meta },
..self.clone()
}
}
}
fn new_test_entry<D: AsRef<[u8]>>(data: D, entry_id: EntryId, ns: NamespaceImpl) -> EntryImpl {
EntryImpl {
data: data.as_ref().to_vec(),
id: entry_id,
ns,
}
}
/// Tests that `build_records` works as expected.
#[test]
fn test_build_records() {
let max_record_size = 128;
// On a small entry.
let ns = NamespaceImpl {
region_id: 1,
topic: "greptimedb_wal_topic".to_string(),
};
let entry = new_test_entry([b'1'; 100], 0, ns.clone());
let records = build_records(entry.clone(), max_record_size);
assert!(records.len() == 1);
assert_eq!(entry.data, records[0].data);
// On a large entry.
let entry = new_test_entry([b'1'; 150], 0, ns.clone());
let records = build_records(entry.clone(), max_record_size);
assert!(records.len() == 2);
assert_eq!(&records[0].data, &[b'1'; 128]);
assert_eq!(&records[1].data, &[b'1'; 22]);
// On a way-too large entry.
let entry = new_test_entry([b'1'; 5000], 0, ns.clone());
let records = build_records(entry.clone(), max_record_size);
let matched = entry
.data
.chunks(max_record_size)
.enumerate()
.all(|(i, chunk)| records[i].data == chunk);
assert!(matched);
}
/// Tests that Record and KafkaRecord are able to be converted back and forth.
#[test]
fn test_record_conversion() {
let record = Record {
meta: RecordMeta {
version: VERSION,
tp: RecordType::Full,
entry_id: 1,
ns: NamespaceImpl {
region_id: 1,
topic: "greptimedb_wal_topic".to_string(),
},
},
data: b"12345".to_vec(),
};
let kafka_record: KafkaRecord = record.clone().try_into().unwrap();
let got = Record::try_from(kafka_record).unwrap();
assert_eq!(record, got);
}
/// Tests that the reconstruction of an entry works as expected.
#[test]
fn test_reconstruct_entry() {
let template = Record::default();
let records = vec![
template.with_data(b"111").with_tp(RecordType::First),
template.with_data(b"222").with_tp(RecordType::Middle(1)),
template.with_data(b"333").with_tp(RecordType::Last),
];
let entry = EntryImpl::from(records.clone());
assert_eq!(records[0].meta.entry_id, entry.id);
assert_eq!(records[0].meta.ns, entry.ns);
assert_eq!(
entry.data,
records
.into_iter()
.flat_map(|record| record.data)
.collect::<Vec<_>>()
);
}
/// Tests that `maybe_emit_entry` works as expected.
/// This test does not check for illegal record sequences since they're already tested in the `test_check_records` test.
#[test]
fn test_maybe_emit_entry() {
let ns = NamespaceImpl {
region_id: 1,
topic: "greptimedb_wal_topic".to_string(),
};
let template = Record::default().with_ns(ns);
let mut entry_records = HashMap::from([
(
1,
vec![template.with_entry_id(1).with_tp(RecordType::First)],
),
(
2,
vec![template.with_entry_id(2).with_tp(RecordType::First)],
),
(
3,
vec![
template.with_entry_id(3).with_tp(RecordType::First),
template.with_entry_id(3).with_tp(RecordType::Middle(1)),
],
),
]);
// A Full record arrives.
let got = maybe_emit_entry(
template.with_entry_id(0).with_tp(RecordType::Full),
&mut entry_records,
)
.unwrap();
assert!(got.is_some());
// A First record arrives with no prefix.
let got = maybe_emit_entry(
template.with_entry_id(0).with_tp(RecordType::First),
&mut entry_records,
)
.unwrap();
assert!(got.is_none());
// A First record arrives with some prefix.
let got = maybe_emit_entry(
template.with_entry_id(1).with_tp(RecordType::First),
&mut entry_records,
);
assert!(got.is_err());
// A Middle record arrives with legal prefix (First).
let got = maybe_emit_entry(
template.with_entry_id(2).with_tp(RecordType::Middle(1)),
&mut entry_records,
)
.unwrap();
assert!(got.is_none());
// A Middle record arrives with legal prefix (Middle).
let got = maybe_emit_entry(
template.with_entry_id(2).with_tp(RecordType::Middle(2)),
&mut entry_records,
)
.unwrap();
assert!(got.is_none());
// A Middle record arrives with illegal prefix.
let got = maybe_emit_entry(
template.with_entry_id(2).with_tp(RecordType::Middle(1)),
&mut entry_records,
);
assert!(got.is_err());
// A Middle record arrives with no prefix.
let got = maybe_emit_entry(
template.with_entry_id(22).with_tp(RecordType::Middle(1)),
&mut entry_records,
);
assert!(got.is_err());
// A Last record arrives with no prefix.
let got = maybe_emit_entry(
template.with_entry_id(33).with_tp(RecordType::Last),
&mut entry_records,
);
assert!(got.is_err());
// A Last record arrives with legal prefix.
let got = maybe_emit_entry(
template.with_entry_id(3).with_tp(RecordType::Last),
&mut entry_records,
)
.unwrap();
assert!(got.is_some());
// Check state.
assert_eq!(entry_records.len(), 3);
assert_eq!(entry_records[&0].len(), 1);
assert_eq!(entry_records[&1].len(), 1);
assert_eq!(entry_records[&2].len(), 3);
}
#[tokio::test]
async fn test_produce_large_entry() {
run_test_with_kafka_wal(|broker_endpoints| {
Box::pin(async {
let topic = format!("greptimedb_wal_topic_{}", Uuid::new_v4());
let ns = NamespaceImpl {
region_id: 1,
topic,
};
let entry = new_test_entry([b'1'; 2000000], 0, ns.clone());
let producer = RecordProducer::new(ns.clone()).with_entries(vec![entry]);
let config = KafkaConfig {
broker_endpoints,
max_batch_size: ReadableSize::mb(1),
..Default::default()
};
let manager = Arc::new(ClientManager::try_new(&config).await.unwrap());
producer.produce(&manager).await.unwrap();
})
})
.await
}
}

View File

@@ -0,0 +1,35 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::env;
use common_telemetry::warn;
use futures_util::future::BoxFuture;
pub async fn run_test_with_kafka_wal<F>(test: F)
where
F: FnOnce(Vec<String>) -> BoxFuture<'static, ()>,
{
let Ok(endpoints) = env::var("GT_KAFKA_ENDPOINTS") else {
warn!("The endpoints is empty, skipping the test");
return;
};
let endpoints = endpoints
.split(',')
.map(|s| s.trim().to_string())
.collect::<Vec<_>>();
test(endpoints).await
}
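A usage sketch mirroring how the env-gated tests elsewhere in this change invoke the helper:
// Hedged sketch: the test body only runs when GT_KAFKA_ENDPOINTS is set.
#[tokio::test]
async fn sketch_gated_test() {
    run_test_with_kafka_wal(|broker_endpoints| {
        Box::pin(async move {
            assert!(!broker_endpoints.is_empty());
        })
    })
    .await
}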

View File

@@ -12,4 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(test)]
pub mod kafka;
pub mod log_store_util;

View File

@@ -0,0 +1,126 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::atomic::{AtomicU64 as AtomicEntryId, Ordering};
use std::sync::Mutex;
use common_meta::wal::KafkaWalTopic as Topic;
use rand::distributions::Alphanumeric;
use rand::rngs::ThreadRng;
use rand::{thread_rng, Rng};
use rskafka::client::ClientBuilder;
use store_api::logstore::EntryId;
use crate::kafka::{EntryImpl, NamespaceImpl};
/// Creates `num_topics` topics, each decorated by the given decorator.
pub async fn create_topics<F>(
num_topics: usize,
decorator: F,
broker_endpoints: &[String],
) -> Vec<Topic>
where
F: Fn(usize) -> String,
{
assert!(!broker_endpoints.is_empty());
let client = ClientBuilder::new(broker_endpoints.to_vec())
.build()
.await
.unwrap();
let ctrl_client = client.controller_client().unwrap();
let (topics, tasks): (Vec<_>, Vec<_>) = (0..num_topics)
.map(|i| {
let topic = decorator(i);
let task = ctrl_client.create_topic(topic.clone(), 1, 1, 500);
(topic, task)
})
.unzip();
futures::future::try_join_all(tasks).await.unwrap();
topics
}
/// Creates a new Kafka namespace with the given topic and region id.
pub fn new_namespace(topic: &str, region_id: u64) -> NamespaceImpl {
NamespaceImpl {
topic: topic.to_string(),
region_id,
}
}
/// A builder for building entries for a namespace.
pub struct EntryBuilder {
/// The namespace of the entries.
ns: NamespaceImpl,
/// The next entry id to allocate. It starts from 0 by default.
next_entry_id: AtomicEntryId,
/// A generator for supporting random data generation.
/// Wrapped with Mutex<Option<_>> to provide interior mutability.
rng: Mutex<Option<ThreadRng>>,
}
impl EntryBuilder {
/// Creates an EntryBuilder for the given namespace.
pub fn new(ns: NamespaceImpl) -> Self {
Self {
ns,
next_entry_id: AtomicEntryId::new(0),
rng: Mutex::new(Some(thread_rng())),
}
}
/// Sets the next entry id to the given entry id.
pub fn next_entry_id(self, entry_id: EntryId) -> Self {
Self {
next_entry_id: AtomicEntryId::new(entry_id),
..self
}
}
/// Skips the next `step` entry ids and returns the next entry id after the stepping.
pub fn skip(&mut self, step: EntryId) -> EntryId {
let old = self.next_entry_id.fetch_add(step, Ordering::Relaxed);
old + step
}
/// Builds an entry with the given data.
pub fn with_data<D: AsRef<[u8]>>(&self, data: D) -> EntryImpl {
EntryImpl {
data: data.as_ref().to_vec(),
id: self.alloc_entry_id(),
ns: self.ns.clone(),
}
}
/// Builds an entry with random data.
pub fn with_random_data(&self) -> EntryImpl {
self.with_data(self.make_random_data())
}
fn alloc_entry_id(&self) -> EntryId {
self.next_entry_id.fetch_add(1, Ordering::Relaxed)
}
fn make_random_data(&self) -> Vec<u8> {
let mut guard = self.rng.lock().unwrap();
let rng = guard.as_mut().unwrap();
(0..42).map(|_| rng.sample(Alphanumeric)).collect()
}
}
/// Builds a batch of entries each with random data.
pub fn entries_with_random_data(batch_size: usize, builder: &EntryBuilder) -> Vec<EntryImpl> {
(0..batch_size)
.map(|_| builder.with_random_data())
.collect()
}
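A brief, hypothetical usage sketch of the helpers above:
// Hedged sketch: build a namespace, an entry builder, and a batch of random entries.
fn sketch_test_util_usage() {
    let ns = new_namespace("greptimedb_wal_topic", 42);
    let builder = EntryBuilder::new(ns);
    let entries = entries_with_random_data(3, &builder);
    assert_eq!(entries.len(), 3);
}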

View File

@@ -14,7 +14,7 @@
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_error::{GREPTIME_ERROR_CODE, GREPTIME_ERROR_MSG};
use common_error::{GREPTIME_DB_HEADER_ERROR_CODE, GREPTIME_DB_HEADER_ERROR_MSG};
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};
use tonic::Status;
@@ -117,7 +117,7 @@ impl From<Status> for Error {
.and_then(|v| String::from_utf8(v.as_bytes().to_vec()).ok())
}
let code = get_metadata_value(&e, GREPTIME_ERROR_CODE)
let code = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_CODE)
.and_then(|s| {
if let Ok(code) = s.parse::<u32>() {
StatusCode::from_u32(code)
@@ -127,8 +127,8 @@ impl From<Status> for Error {
})
.unwrap_or(StatusCode::Internal);
let msg =
get_metadata_value(&e, GREPTIME_ERROR_MSG).unwrap_or_else(|| e.message().to_string());
let msg = get_metadata_value(&e, GREPTIME_DB_HEADER_ERROR_MSG)
.unwrap_or_else(|| e.message().to_string());
Self::MetaServer { code, msg }
}

View File

@@ -94,7 +94,7 @@ impl MetaSrvInstance {
self.meta_srv.try_start().await?;
if let Some(t) = self.export_metrics_task.as_ref() {
t.start()
t.start(None).context(InitExportMetricsTaskSnafu)?
}
let (tx, rx) = mpsc::channel::<()>(1);

View File

@@ -327,6 +327,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Datanode table not found: {}, datanode: {}", table_id, datanode_id))]
DatanodeTableNotFound {
table_id: TableId,
datanode_id: DatanodeId,
location: Location,
},
#[snafu(display("Table route corrupted, key: {}, reason: {}", key, reason))]
CorruptedTableRoute {
key: String,
@@ -683,6 +690,7 @@ impl ErrorExt for Error {
| Error::InvalidRegionKeyFromUtf8 { .. }
| Error::TableRouteNotFound { .. }
| Error::TableInfoNotFound { .. }
| Error::DatanodeTableNotFound { .. }
| Error::CorruptedTableRoute { .. }
| Error::MoveValue { .. }
| Error::InvalidUtf8Value { .. }

View File

@@ -225,6 +225,7 @@ impl MetaSrvBuilder {
TableMetadataAllocator::with_peer_allocator(
sequence,
wal_options_allocator.clone(),
table_metadata_manager.clone(),
peer_allocator,
)
});

View File

@@ -18,31 +18,31 @@ use prometheus::*;
lazy_static! {
/// Elapsed time to responding kv requests.
pub static ref METRIC_META_KV_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!(
"meta_kv_request_elapsed",
"greptime_meta_kv_request_elapsed",
"meta kv request",
&["target", "op", "cluster_id"]
)
.unwrap();
/// The heartbeat connection gauge.
pub static ref METRIC_META_HEARTBEAT_CONNECTION_NUM: IntGauge = register_int_gauge!(
"meta_heartbeat_connection_num",
"greptime_meta_heartbeat_connection_num",
"meta heartbeat connection num"
)
.unwrap();
/// Elapsed time to execution of heartbeat handlers.
pub static ref METRIC_META_HANDLER_EXECUTE: HistogramVec =
register_histogram_vec!("meta_handler_execute", "meta handler execute", &["name"]).unwrap();
register_histogram_vec!("greptime_meta_handler_execute", "meta handler execute", &["name"]).unwrap();
/// Inactive region gauge.
pub static ref METRIC_META_INACTIVE_REGIONS: IntGauge =
register_int_gauge!("meta_inactive_regions", "meta inactive regions").unwrap();
register_int_gauge!("greptime_meta_inactive_regions", "meta inactive regions").unwrap();
/// Elapsed time to leader cache kv.
pub static ref METRIC_META_LEADER_CACHED_KV_LOAD_ELAPSED: HistogramVec =
register_histogram_vec!("meta_leader_cache_kv_load", "meta load cache", &["prefix"])
register_histogram_vec!("greptime_meta_leader_cache_kv_load", "meta load cache", &["prefix"])
.unwrap();
/// Meta kv cache hit counter.
pub static ref METRIC_META_KV_CACHE_HIT: IntCounterVec =
register_int_counter_vec!("meta_kv_cache_hit", "meta kv cache hit", &["op"]).unwrap();
register_int_counter_vec!("greptime_meta_kv_cache_hit", "meta kv cache hit", &["op"]).unwrap();
/// Meta kv cache miss counter.
pub static ref METRIC_META_KV_CACHE_MISS: IntCounterVec =
register_int_counter_vec!("meta_kv_cache_miss", "meta kv cache miss", &["op"]).unwrap();
register_int_counter_vec!("greptime_meta_kv_cache_miss", "meta kv cache miss", &["op"]).unwrap();
}

View File

@@ -16,4 +16,4 @@ pub mod region_failover;
pub mod region_migration;
#[cfg(test)]
mod tests;
mod utils;
pub mod utils;

View File

@@ -373,7 +373,7 @@ impl Procedure for RegionFailoverProcedure {
fn lock_key(&self) -> LockKey {
let region_ident = &self.node.failed_region;
let region_key = region_lock_key(region_ident.table_id, region_ident.region_number);
LockKey::single(region_key)
LockKey::single_exclusive(region_key)
}
}

View File

@@ -17,13 +17,14 @@ use std::time::Duration;
use api::v1::meta::MailboxMessage;
use async_trait::async_trait;
use common_meta::ddl::utils::region_storage_path;
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
use common_meta::key::datanode_table::{DatanodeTableKey, RegionInfo};
use common_meta::peer::Peer;
use common_meta::RegionIdent;
use common_telemetry::{debug, info};
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionNumber;
use super::update_metadata::UpdateRegionMetadata;
use super::{RegionFailoverContext, State};
@@ -44,7 +45,7 @@ pub(super) struct ActivateRegion {
// An `None` option stands for uninitialized.
region_storage_path: Option<String>,
region_options: Option<HashMap<String, String>>,
region_wal_options: Option<HashMap<String, String>>,
region_wal_options: Option<HashMap<RegionNumber, String>>,
}
impl ActivateRegion {
@@ -65,27 +66,31 @@ impl ActivateRegion {
timeout: Duration,
) -> Result<MailboxReceiver> {
let table_id = failed_region.table_id;
let table_info = ctx
// Retrieves the wal options from failed datanode table value.
let datanode_table_value = ctx
.table_metadata_manager
.table_info_manager()
.get(table_id)
.datanode_table_manager()
.get(&DatanodeTableKey::new(failed_region.datanode_id, table_id))
.await
.context(error::TableMetadataManagerSnafu)?
.context(error::TableInfoNotFoundSnafu { table_id })?
.into_inner()
.table_info;
let region_storage_path =
region_storage_path(&table_info.catalog_name, &table_info.schema_name);
.context(error::DatanodeTableNotFoundSnafu {
table_id,
datanode_id: failed_region.datanode_id,
})?;
let candidate_ident = RegionIdent {
datanode_id: self.candidate.id,
..failed_region.clone()
};
info!("Activating region: {candidate_ident:?}");
let region_options: HashMap<String, String> = (&table_info.meta.options).into();
// TODO(niebayes): properly fetch or construct region wal options.
let region_wal_options = HashMap::new();
let RegionInfo {
region_storage_path,
region_options,
region_wal_options,
..
} = datanode_table_value.region_info;
let instruction = Instruction::OpenRegion(OpenRegion::new(
candidate_ident.clone(),
&region_storage_path,

Some files were not shown because too many files have changed in this diff.