Compare commits

...

49 Commits

Author SHA1 Message Date
Ruihang Xia
038bc4fe6e revert toml format
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 00:46:00 +08:00
Ruihang Xia
6d07c422d8 Merge branch 'main' into fix-proto-clear
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 00:36:28 +08:00
Ruihang Xia
6c14ece23f accomplish test assertion
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 00:32:49 +08:00
Ruihang Xia
89c51d9b87 reset Sample
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-13 23:32:22 +08:00
Weny Xu
e4333969b4 feat(fuzz): add alter table target (#3503)
* feat(fuzz): validate semantic type of column

* feat(fuzz): add fuzz_alter_table target

* feat(fuzz): validate columns

* chore(ci): add fuzz_alter_table ci cfg
2024-03-13 14:11:47 +00:00
Zhenchi
b55905cf66 feat(fuzz): add insert target (#3499)
* fix(common-time): allow building nanos timestamp from parts split from i64::MIN

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat(fuzz): add insert target

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: cleanup cargo.toml and polish comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-03-13 10:03:03 +00:00
WU Jingdi
fb4da05f25 fix: adjust fill behavior of range query (#3489) 2024-03-13 09:20:34 +00:00
Zhenchi
904484b525 fix(common-time): allow building nanos timestamp from parts split from i64::MIN (#3493)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-03-13 02:46:00 +00:00
tison
cafb4708ce refactor: validate constraints eagerly (#3472)
* chore: validate constraints eagerly

Signed-off-by: tison <wander4096@gmail.com>

* use timestamp column

Signed-off-by: tison <wander4096@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* lint

Signed-off-by: tison <wander4096@gmail.com>

* compile

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-12 13:09:34 +00:00
Yingwen
7c895e2605 perf: more benchmarks for memtables (#3491)
* chore: remove duplicate bench

* refactor: rename bench

* perf: add full scan bench for memtable

* feat: filter bench and add time series to bench group

* chore: comment

* refactor: rename

* style: fix clippy
2024-03-12 12:02:58 +00:00
Lei, HUANG
9afe327bca feat: improve prom write requests decode performance (#3478)
* feat: optimize decode performance

* fix: some cr comments
2024-03-12 12:00:38 +00:00
discord9
58bd065c6b feat(flow): plan def (#3490)
* feat: plan def

* chore: add license

* docs: remove TODO done

* chore: add derive Ord
2024-03-12 10:59:07 +00:00
Yingwen
9aa8f756ab fix: allow passing extra table options (#3484)
* fix: do not check options in parser

* test: fix tests

* test: fix sqlness

* test: add sqlness test

* chore: log options

* chore: must specify compaction type

* feat: validate option key

* feat: add option key validation back
2024-03-12 07:03:52 +00:00
discord9
7639c227ca feat(flow): accumlator for aggr func (#3396)
* feat: Accumlator trait

* feat: add `OrdValue` accum&use enum_dispatch

* test: more accum test

* feat: eval aggr funcs

* chore: refactor test&fmt clippy

* refactor: less verbose

* test: more tests

* refactor: better err handling&use OrdValue for Count

* refactor: ignore null&more tests for error handle

* refactor: OrdValue accum

* chore: extract null check

* refactor: def&use fn signature

* chore: use extra cond with match guard

* chore: per review
2024-03-12 02:09:27 +00:00
tison
1255c1fc9e feat: to_timezone function (#3470)
* feat: to_timezone function

Signed-off-by: tison <wander4096@gmail.com>

* impl Function for ToTimezoneFunction

Signed-off-by: tison <wander4096@gmail.com>

* add test

Signed-off-by: tison <wander4096@gmail.com>

* Add original authors

Co-authored-by: parkma99 <park-ma@hotmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* address comments

Signed-off-by: tison <wander4096@gmail.com>

* add issue link

Signed-off-by: tison <wander4096@gmail.com>

* code refactor

Signed-off-by: tison <wander4096@gmail.com>

* further tidy

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: parkma99 <park-ma@hotmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2024-03-12 01:46:19 +00:00
Yingwen
06dcd0f6ed fix: freeze data buffer in shard (#3468)
* feat: call freeze if the active data buffer in a shard is full

* chore: more metrics

* chore: print metrics

* chore: enlarge freeze threshold

* test: test freeze

* test: fix config test
2024-03-11 14:51:06 +00:00
Weny Xu
0a4444a43a feat(fuzz): validate columns (#3485) 2024-03-11 11:34:50 +00:00
Ruihang Xia
b7ac8d6aa8 ci: use another mirror for etcd image (#3486)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 10:40:19 +00:00
Weny Xu
e767f37241 fix: fix f64 has no sufficient precision during parsing (#3483) 2024-03-11 09:28:40 +00:00
JeremyHi
da098f5568 fix: make max-txn-ops limit valid (#3481) 2024-03-11 09:27:51 +00:00
shuiyisong
aa953dcc34 fix: impl RecordBatchStream method explicitly (#3482)
fix: impl RecordBatchStream method explicitly
2024-03-11 09:07:10 +00:00
crwen
aa125a50f9 refactor: make http api returns non-200 status code (#3473)
* refactor: make http api returns non-200 status code

* recover some code
2024-03-11 03:38:36 +00:00
Ruihang Xia
d8939eb891 feat: clamp function (#3465)
* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add unit tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* a little type exercise

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 03:26:10 +00:00
shuiyisong
0bb949787c refactor: introduce new Output with OutputMeta (#3466)
* refactor: introduce new output struct

* chore: add helper function

* chore: update comment

* chore: update commit

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>

* chore: rename according to cr

---------

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 02:24:09 +00:00
WU Jingdi
8c37c3fc0f feat: support first_value/last_value in range query (#3448)
* feat: support `first_value/last_value` in range query

* chore: add sqlness test on `count`

* chore: add test
2024-03-11 01:30:39 +00:00
gcmutator
21ff3620be chore: remove repetitive words (#3469)
remove repetitive words

Signed-off-by: gcmutator <329964069@qq.com>
2024-03-09 04:18:47 +00:00
Eugene Tolbakov
aeca0d8e8a feat(influxdb): add db query param support for v2 write api (#3445)
* feat(influxdb): add db query param support for v2 write api

* fix(influxdb): update authorize logic to get catalog and schema from query string

* fix(influxdb): address CR suggestions

* fix(influxdb): use the correct import
2024-03-08 08:17:57 +00:00
Weny Xu
a309cd018a fix: fix incorrect COM_STMT_PREPARE reply (#3463)
* fix: fix incorrect `COM_STMT_PREPARE` reply

* chore: use column name instead of index
2024-03-08 07:31:20 +00:00
Yingwen
3ee53360ee perf: Reduce decode overhead during pruning keys in the memtable (#3415)
* feat: reuse value buf

* feat: skip values to decode

* feat: prune shard

chore: fix compiler errors

refactor: shard prune metrics

* fix: panic on DedupReader::try_new

* fix: prune after next

* chore: num parts metrics

* feat: metrics and logs

* chore: data build cost

* chore: more logs

* feat: cache skip result

* chore: todo

* fix: index out of bound

* test: test codec

* fix: invalid offsets

* fix: skip binary

* fix: offset buffer reuse

* chore: comment

* test: test memtable filter

* style: fix clippy

* chore: fix compiler error
2024-03-08 02:54:00 +00:00
JeremyHi
352bd7b6fd feat: max-txn-ops option (#3458)
* feat: max-txn-ops limit

* chore: by comment
2024-03-08 02:34:40 +00:00
Weny Xu
3f3ef2e7af refactor: separate the quote char and value (#3455)
refactor: use ident instead of string
2024-03-07 08:24:09 +00:00
Weny Xu
a218f12bd9 test: add fuzz test for create table (#3441)
* feat: add create table fuzz test

* chore: add ci cfg for fuzz tests

* refactor: remove redundant nightly config

* chore: run fuzz test in debug mode

* chore: use ubuntu-latest

* fix: close connection

* chore: add cache in fuzz test ci

* chore: apply suggestion from CR

* chore: apply suggestion from CR

* chore: refactor the fuzz test action
2024-03-07 06:51:19 +00:00
ZonaHe
c884c56151 feat: update dashboard to v0.4.8 (#3450)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2024-03-07 04:06:07 +00:00
Weny Xu
9ec288cab9 chore: specify binary name (#3449) 2024-03-07 03:56:24 +00:00
LFC
1f1491e429 feat: impl some "set"s to adapt to some client apps (#3443) 2024-03-06 13:15:48 +00:00
Weny Xu
c52bc613e0 chore: add bin opt to build cmd (#3440) 2024-03-06 08:24:55 +00:00
shuiyisong
a9d42f7b87 fix: add support for influxdb basic auth (#3437) 2024-03-06 03:56:25 +00:00
tison
86ce2d8713 build(deps): upgrade opendal to 0.45.1 (#3432)
* build(deps): upgrade opendal to 0.45.1

Signed-off-by: tison <wander4096@gmail.com>

* Update src/object-store/Cargo.toml

Co-authored-by: Weny Xu <wenymedia@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
2024-03-06 03:08:59 +00:00
Yingwen
5d644c0b7f chore: bump version to v0.7.0 (#3433) 2024-03-05 12:07:37 +00:00
Ruihang Xia
020635063c feat: implement multi-dim partition rule (#3409)
* generate expr rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement show create for new partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* implement row spliter

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix: fix failed tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fix lint issues

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: ignore tests for deprecated partition rule

* chore: remove unused partition rule tests setup

* test(sqlness): add basic partition tests

* test(multi_dim): add basic find region test

* address CR comments

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Signed-off-by: WenyXu <wenymedia@gmail.com>
Co-authored-by: WenyXu <wenymedia@gmail.com>
2024-03-05 11:39:15 +00:00
dependabot[bot]
97cbfcfe23 build(deps): bump mio from 0.8.10 to 0.8.11 (#3434)
Bumps [mio](https://github.com/tokio-rs/mio) from 0.8.10 to 0.8.11.
- [Release notes](https://github.com/tokio-rs/mio/releases)
- [Changelog](https://github.com/tokio-rs/mio/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tokio-rs/mio/compare/v0.8.10...v0.8.11)

---
updated-dependencies:
- dependency-name: mio
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-03-05 11:04:14 +00:00
Lei, HUANG
7183fa198c refactor: make MergeTreeMemtable the default choice (#3430)
* refactor: make MergeTreeMemtable the default choice

* refactor: reformat

* chore: add doc to config
2024-03-05 10:00:08 +00:00
Lei, HUANG
02b18fbca1 feat: decode prom requests to grpc (#3425)
* hack: inline decode

* move to servers

* fix: samples lost

* add bench

* remove useless functions

* wip

* feat: remove object pools

* fix: minor issues

* fix: remove useless dep

* chore: rebase main

* format

* finish

* fix: format

* feat: introduce request pool

* try to fix license issue

* fix: clippy

* resolve comments

* fix:typo

* remove useless comments
2024-03-05 09:47:32 +00:00
shuiyisong
7b1c3503d0 fix: complete interceptors for all frontend entry (#3428) 2024-03-05 09:38:47 +00:00
liyang
6fd2ff49d5 ci: refine windows output env (#3431) 2024-03-05 08:38:28 +00:00
WU Jingdi
53f2a5846c feat: support tracing rule sampler (#3405)
* feat: support tracing rule sampler

* chore: simplify code
2024-03-05 15:40:02 +08:00
Yingwen
49157868f9 feat: Correct server metrics and add more metrics for scan (#3426)
* feat: drop timer on stream terminated

* refactor: combine metrics into a histogram vec

* refactor: frontend grpc metrics

* feat: add metrics middleware layer to grpc server

* refactor: move http metrics layer to metrics mod

* feat: bucket for grpc/http elapsed

* feat: remove duplicate metrics

* style: fix cilppy

* fix: incorrect bucket of promql series

* feat: more metrics for mito

* feat: convert cost

* test: fix metrics test
2024-03-04 10:15:10 +00:00
Ruihang Xia
ae2c18e1cf docs(rfcs): multi-dimension partition rule (#3350)
* docs(rfcs): multi-dimension partition rule

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change math block type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update tracking issue

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update discussion

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-04 08:10:54 +00:00
dennis zhuang
e6819412c5 refactor: show tables and show databases (#3423)
* refactor: show tables and show databases

* chore: clean code
2024-03-04 06:15:17 +00:00
262 changed files with 11747 additions and 2574 deletions

10
.editorconfig Normal file
View File

@@ -0,0 +1,10 @@
root = true
[*]
end_of_line = lf
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
[{Makefile,**.mk}]
indent_style = tab

View File

@@ -21,3 +21,6 @@ GT_GCS_CREDENTIAL_PATH = GCS credential path
GT_GCS_ENDPOINT = GCS end point
# Settings for kafka wal test
GT_KAFKA_ENDPOINTS = localhost:9092
# Setting for fuzz tests
GT_MYSQL_ADDR = localhost:4002

13
.github/actions/fuzz-test/action.yaml vendored Normal file
View File

@@ -0,0 +1,13 @@
name: Fuzz Test
description: 'Fuzz test given setup and service'
inputs:
target:
description: "The fuzz target to test"
runs:
using: composite
steps:
- name: Run Fuzz Test
shell: bash
run: cargo fuzz run ${{ inputs.target }} --fuzz-dir tests-fuzz -D -s none -- -max_total_time=120
env:
GT_MYSQL_ADDR: 127.0.0.1:4002

View File

@@ -102,7 +102,7 @@ jobs:
shared-key: "build-binaries"
- name: Build greptime binaries
shell: bash
run: cargo build
run: cargo build --bin greptime --bin sqlness-runner
- name: Pack greptime binaries
shell: bash
run: |
@@ -117,6 +117,46 @@ jobs:
artifacts-dir: bins
version: current
fuzztest:
name: Fuzz Test
needs: build
runs-on: ubuntu-latest
strategy:
matrix:
target: [ "fuzz_create_table", "fuzz_alter_table" ]
steps:
- uses: actions/checkout@v4
- uses: arduino/setup-protoc@v3
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
- name: Rust Cache
uses: Swatinem/rust-cache@v2
with:
# Shares across multiple jobs
shared-key: "fuzz-test-targets"
- name: Set Rust Fuzz
shell: bash
run: |
sudo apt update && sudo apt install -y libfuzzer-14-dev
cargo install cargo-fuzz
- name: Download pre-built binaries
uses: actions/download-artifact@v4
with:
name: bins
path: .
- name: Unzip binaries
run: tar -xvf ./bins.tar.gz
- name: Run GreptimeDB
run: |
./bins/greptime standalone start&
- name: Fuzz Test
uses: ./.github/actions/fuzz-test
env:
CUSTOM_LIBFUZZER_PATH: /usr/lib/llvm-14/lib/libFuzzer.a
with:
target: ${{ matrix.target }}
sqlness:
name: Sqlness Test
needs: build

View File

@@ -91,7 +91,7 @@ env:
# The scheduled version is '${{ env.NEXT_RELEASE_VERSION }}-nightly-YYYYMMDD', like v0.2.0-nigthly-20230313;
NIGHTLY_RELEASE_PREFIX: nightly
# Note: The NEXT_RELEASE_VERSION should be modified manually by every formal release.
NEXT_RELEASE_VERSION: v0.7.0
NEXT_RELEASE_VERSION: v0.8.0
jobs:
allocate-runners:
@@ -288,7 +288,7 @@ jobs:
- name: Set build windows result
id: set-build-windows-result
run: |
echo "build-windows-result=success" >> $GITHUB_OUTPUT
echo "build-windows-result=success" >> $Env:GITHUB_OUTPUT
release-images-to-dockerhub:
name: Build and push images to DockerHub

4
.gitignore vendored
View File

@@ -46,3 +46,7 @@ benchmarks/data
*.code-workspace
venv/
# Fuzz tests
tests-fuzz/artifacts/
tests-fuzz/corpus/

315
Cargo.lock generated
View File

@@ -29,6 +29,17 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if 1.0.0",
"cipher",
"cpufeatures",
]
[[package]]
name = "ahash"
version = "0.7.7"
@@ -196,7 +207,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"common-decimal",
@@ -241,6 +252,15 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "arbitrary"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "arc-swap"
version = "1.6.0"
@@ -675,7 +695,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -861,7 +881,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"chrono",
@@ -992,6 +1012,15 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "borsh"
version = "1.3.0"
@@ -1219,7 +1248,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1266,6 +1295,15 @@ dependencies = [
"tokio",
]
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.0.83"
@@ -1421,6 +1459,16 @@ dependencies = [
"half 1.8.2",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]]
name = "clang-sys"
version = "1.6.1"
@@ -1510,7 +1558,7 @@ checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "client"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1546,7 +1594,7 @@ dependencies = [
"session",
"snafu",
"substrait 0.17.1",
"substrait 0.6.0",
"substrait 0.7.0",
"tokio",
"tokio-stream",
"tonic 0.10.2",
@@ -1576,7 +1624,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"async-trait",
@@ -1629,7 +1677,7 @@ dependencies = [
"session",
"snafu",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"temp-env",
"tikv-jemallocator",
@@ -1672,7 +1720,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"bitvec",
@@ -1687,7 +1735,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"chrono",
"common-error",
@@ -1698,7 +1746,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"humantime-serde",
@@ -1709,7 +1757,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"arrow-schema",
@@ -1741,7 +1789,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"bigdecimal",
@@ -1755,7 +1803,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"snafu",
"strum 0.25.0",
@@ -1763,7 +1811,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -1798,7 +1846,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -1817,7 +1865,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -1847,7 +1895,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -1866,7 +1914,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arc-swap",
"common-query",
@@ -1881,7 +1929,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-error",
"common-macro",
@@ -1894,7 +1942,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-recursion",
@@ -1944,11 +1992,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.6.0"
version = "0.7.0"
[[package]]
name = "common-procedure"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-stream",
"async-trait",
@@ -1972,7 +2020,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-procedure",
@@ -1980,7 +2028,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -2003,7 +2051,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arc-swap",
"common-base",
@@ -2023,7 +2071,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"common-error",
@@ -2043,7 +2091,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"atty",
"backtrace",
@@ -2071,7 +2119,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"client",
"common-query",
@@ -2083,7 +2131,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"chrono",
@@ -2099,14 +2147,14 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"build-data",
]
[[package]]
name = "common-wal"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"common-base",
"common-error",
@@ -2754,7 +2802,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -2812,7 +2860,7 @@ dependencies = [
"snafu",
"sql",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tokio-stream",
@@ -2826,7 +2874,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arrow",
"arrow-array",
@@ -2912,6 +2960,17 @@ dependencies = [
"syn 2.0.43",
]
[[package]]
name = "derive_arbitrary"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "derive_builder"
version = "0.11.2"
@@ -3302,7 +3361,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -3403,7 +3462,7 @@ dependencies = [
[[package]]
name = "flow"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"bimap",
@@ -3415,10 +3474,12 @@ dependencies = [
"common-telemetry",
"common-time",
"datatypes",
"enum_dispatch",
"hydroflow",
"itertools 0.10.5",
"num-traits",
"serde",
"serde_json",
"servers",
"session",
"snafu",
@@ -3458,7 +3519,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -3522,7 +3583,7 @@ dependencies = [
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"strfmt",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"toml 0.8.8",
@@ -4291,7 +4352,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -4406,6 +4467,16 @@ dependencies = [
"libc",
]
[[package]]
name = "inout"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
dependencies = [
"block-padding",
"generic-array",
]
[[package]]
name = "instant"
version = "0.1.12"
@@ -4746,9 +4817,20 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.151"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "libfuzzer-sys"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
dependencies = [
"arbitrary",
"cc",
"once_cell",
]
[[package]]
name = "libgit2-sys"
@@ -4848,7 +4930,7 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "log-store"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-stream",
"async-trait",
@@ -5137,7 +5219,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -5167,7 +5249,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"api",
@@ -5247,7 +5329,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"aquamarine",
@@ -5307,9 +5389,9 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.10"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
dependencies = [
"libc",
"log",
@@ -5319,7 +5401,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"api",
@@ -5380,6 +5462,7 @@ dependencies = [
"tokio",
"tokio-stream",
"tokio-util",
"toml 0.8.8",
"uuid",
]
@@ -5921,9 +6004,18 @@ dependencies = [
"memchr",
]
[[package]]
name = "object-pool"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee9a3e7196d09ec86002b939f1576e8e446d58def8fd48fe578e2c72d5328d68"
dependencies = [
"parking_lot 0.11.2",
]
[[package]]
name = "object-store"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anyhow",
"async-trait",
@@ -5979,9 +6071,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "opendal"
version = "0.44.2"
version = "0.45.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4af824652d4d2ffabf606d337a071677ae621b05622adf35df9562f69d9b4498"
checksum = "52c17c077f23fa2d2c25d9d22af98baa43b8bbe2ef0de80cf66339aa70401467"
dependencies = [
"anyhow",
"async-trait",
@@ -5997,7 +6089,7 @@ dependencies = [
"md-5",
"once_cell",
"percent-encoding",
"quick-xml 0.30.0",
"quick-xml 0.31.0",
"reqsign",
"reqwest",
"serde",
@@ -6166,7 +6258,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -6213,7 +6305,7 @@ dependencies = [
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tonic 0.10.2",
@@ -6444,7 +6536,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"async-trait",
@@ -6466,6 +6558,8 @@ dependencies = [
"serde",
"serde_json",
"snafu",
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"table",
]
@@ -6488,6 +6582,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
[[package]]
name = "pbkdf2"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
dependencies = [
"digest",
"hmac",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -6528,6 +6632,12 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "permutation"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7"
[[package]]
name = "pest"
version = "2.7.5"
@@ -6712,6 +6822,21 @@ dependencies = [
"spki 0.7.3",
]
[[package]]
name = "pkcs5"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e847e2c91a18bfa887dd028ec33f2fe6f25db77db3619024764914affe8b69a6"
dependencies = [
"aes",
"cbc",
"der 0.7.8",
"pbkdf2",
"scrypt",
"sha2",
"spki 0.7.3",
]
[[package]]
name = "pkcs8"
version = "0.8.0"
@@ -6730,6 +6855,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"pkcs5",
"rand_core",
"spki 0.7.3",
]
@@ -6769,7 +6896,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"auth",
"common-base",
@@ -7036,7 +7163,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"ahash 0.8.6",
"async-recursion",
@@ -7247,7 +7374,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"bitflags 2.4.1",
@@ -7368,7 +7495,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"ahash 0.8.6",
"api",
@@ -7429,7 +7556,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tokio",
"tokio-stream",
@@ -7444,16 +7571,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -7736,9 +7853,9 @@ dependencies = [
[[package]]
name = "reqsign"
version = "0.14.6"
version = "0.14.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dce87f66ba6c6acef277a729f989a0eca946cb9ce6a15bcc036bda0f72d4b9fd"
checksum = "43e319d9de9ff4d941abf4ac718897118b0fe04577ea3f8e0f5788971784eef5"
dependencies = [
"anyhow",
"async-trait",
@@ -7763,7 +7880,6 @@ dependencies = [
"serde_json",
"sha1",
"sha2",
"tokio",
]
[[package]]
@@ -7956,6 +8072,7 @@ dependencies = [
"pkcs1 0.7.5",
"pkcs8 0.10.2",
"rand_core",
"sha2",
"signature",
"spki 0.7.3",
"subtle",
@@ -8690,6 +8807,15 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "salsa20"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213"
dependencies = [
"cipher",
]
[[package]]
name = "same-file"
version = "1.0.6"
@@ -8747,7 +8873,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -8803,6 +8929,17 @@ dependencies = [
"tokio-test",
]
[[package]]
name = "scrypt"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f"
dependencies = [
"pbkdf2",
"salsa20",
"sha2",
]
[[package]]
name = "sct"
version = "0.7.1"
@@ -9020,7 +9157,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"aide",
"api",
@@ -9054,6 +9191,7 @@ dependencies = [
"common-test-util",
"common-time",
"common-version",
"criterion",
"datafusion",
"datafusion-common",
"datafusion-expr",
@@ -9061,6 +9199,7 @@ dependencies = [
"derive_builder 0.12.0",
"digest",
"futures",
"hashbrown 0.14.3",
"headers",
"hex",
"hostname",
@@ -9073,11 +9212,13 @@ dependencies = [
"mime_guess",
"mysql_async",
"notify",
"object-pool",
"once_cell",
"openmetrics-parser",
"opensrv-mysql",
"opentelemetry-proto 0.3.0",
"parking_lot 0.12.1",
"permutation",
"pgwire",
"pin-project",
"postgres-types",
@@ -9122,7 +9263,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arc-swap",
@@ -9392,7 +9533,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"common-base",
@@ -9444,7 +9585,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-trait",
"clap 4.4.11",
@@ -9651,7 +9792,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"aquamarine",
@@ -9791,7 +9932,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"async-recursion",
"async-trait",
@@ -9964,7 +10105,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"anymap",
"async-trait",
@@ -10076,17 +10217,21 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"arbitrary",
"async-trait",
"common-error",
"common-macro",
"common-query",
"common-runtime",
"common-telemetry",
"common-time",
"datatypes",
"derive_builder 0.12.0",
"dotenv",
"lazy_static",
"libfuzzer-sys",
"partition",
"rand",
"rand_chacha",
@@ -10101,7 +10246,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.6.0"
version = "0.7.0"
dependencies = [
"api",
"arrow-flight",
@@ -10158,7 +10303,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.6.0",
"substrait 0.7.0",
"table",
"tempfile",
"time",

View File

@@ -62,7 +62,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.6.0"
version = "0.7.0"
edition = "2021"
license = "Apache-2.0"
@@ -134,7 +134,7 @@ reqwest = { version = "0.11", default-features = false, features = [
rskafka = "0.5"
rust_decimal = "1.33"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_json = { version = "1.0", features = ["float_roundtrip"] }
serde_with = "3"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.7"

View File

@@ -3,6 +3,7 @@ CARGO_PROFILE ?=
FEATURES ?=
TARGET_DIR ?=
TARGET ?=
BUILD_BIN ?= greptime
CARGO_BUILD_OPTS := --locked
IMAGE_REGISTRY ?= docker.io
IMAGE_NAMESPACE ?= greptime
@@ -45,6 +46,10 @@ ifneq ($(strip $(TARGET)),)
CARGO_BUILD_OPTS += --target ${TARGET}
endif
ifneq ($(strip $(BUILD_BIN)),)
CARGO_BUILD_OPTS += --bin ${BUILD_BIN}
endif
ifneq ($(strip $(RELEASE)),)
CARGO_BUILD_OPTS += --release
endif

View File

@@ -29,7 +29,7 @@ use client::api::v1::column::Values;
use client::api::v1::{
Column, ColumnDataType, ColumnDef, CreateTableExpr, InsertRequest, InsertRequests, SemanticType,
};
use client::{Client, Database, Output, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use futures_util::TryStreamExt;
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
@@ -502,9 +502,9 @@ async fn do_query(num_iter: usize, db: &Database, table_name: &str) {
for i in 0..num_iter {
let now = Instant::now();
let res = db.sql(&query).await.unwrap();
match res {
Output::AffectedRows(_) | Output::RecordBatches(_) => (),
Output::Stream(stream, _) => {
match res.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => (),
OutputData::Stream(stream) => {
stream.try_collect::<Vec<_>>().await.unwrap();
}
}

View File

@@ -138,6 +138,18 @@ mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
data_freeze_threshold = 32768
# Max dictionary bytes.
fork_dictionary_bytes = "1GiB"
# Log options, see `standalone.example.toml`
# [logging]
# dir = "/tmp/greptimedb/logs"

View File

@@ -244,6 +244,18 @@ mem_threshold_on_create = "64M"
# File system path to store intermediate files for external sorting (default `{data_home}/index_intermediate`).
intermediate_path = ""
[region_engine.mito.memtable]
# Memtable type.
# - "experimental": experimental memtable
# - "time_series": time-series memtable (deprecated)
type = "experimental"
# The max number of keys in one shard.
index_max_keys_per_shard = 8192
# The max rows of data inside the actively writing buffer in one shard.
data_freeze_threshold = 32768
# Max dictionary bytes.
fork_dictionary_bytes = "1GiB"
# Log options
# [logging]
# Specify logs directory.
@@ -254,10 +266,11 @@ intermediate_path = ""
# enable_otlp_tracing = false
# tracing exporter endpoint with format `ip:port`, we use grpc oltp as exporter, default endpoint is `localhost:4317`
# otlp_endpoint = "localhost:4317"
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# tracing_sample_ratio = 1.0
# Whether to append logs to stdout. Defaults to true.
# append_stdout = true
# The percentage of tracing will be sampled and exported. Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1. ratio > 1 are treated as 1. Fractions < 0 are treated as 0
# [logging.tracing_sample_ratio]
# default_ratio = 0.0
# Standalone export the metrics generated by itself
# encoded to Prometheus remote-write format

View File

@@ -79,7 +79,7 @@ This RFC proposes to add a new expression node `MergeScan` to merge result from
│ │ │ │
└─Frontend──────┘ └─Remote-Sources──────────────┘
```
This merge operation simply chains all the the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
This merge operation simply chains all the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
## Commutativity of MergeScan

Binary image file added, not shown (65 KiB).

View File

@@ -0,0 +1,101 @@
---
Feature Name: Multi-dimension Partition Rule
Tracking Issue: https://github.com/GreptimeTeam/greptimedb/issues/3351
Date: 2024-02-21
Author: "Ruihang Xia <waynestxia@gmail.com>"
---
# Summary
A new region partition scheme that runs on multiple dimensions of the key space. The partition rule is defined by a set of simple expressions on the partition key columns.
# Motivation
The current partition rule is from MySQL's [`RANGE Partition`](https://dev.mysql.com/doc/refman/8.0/en/partitioning-range.html), which is based on a single dimension. It is sort of a [Hilbert Curve](https://en.wikipedia.org/wiki/Hilbert_curve): it picks several points on the curve to divide the space. It is neither easy to understand how the data get partitioned nor flexible enough to handle complex partitioning requirements.
Considering future requirements like region repartitioning or autonomous rebalancing, where both the workload and the partitions may change frequently, this RFC proposes a new region partition scheme that uses a set of simple expressions on the partition key columns to divide the key space.
# Details
## Partition rule
First, we define a simple expression that can be used to define the partition rule. A simple expression is a binary expression on the partition key columns that can be evaluated to a boolean value. The binary operator is limited to comparison operators only, like `=`, `!=`, `>`, `>=`, `<`, `<=`, and the operands are limited to either a literal value or a partition column.
Examples of valid simple expressions are $`col_A = 10`$, $`col_A \gt 10 \& col_B \gt 20`$ or $`col_A \ne 10`$.
These expressions can be used as predicates to divide the key space into different regions. The following example has two partition columns, `Col A` and `Col B`, and four partitioned regions.
```math
\left\{\begin{aligned}
&col_A \le 10 &Region_1 \\
&10 \lt col_A \& col_A \le 20 &Region_2 \\
&20 \lt col_A \space \& \space col_B \lt 100 &Region_3 \\
&20 \lt col_A \space \& \space col_B \ge 100 &Region_4
\end{aligned}\right\}
```
An advantage of this scheme is that it is easy to understand how the data get partitioned. The above example can be visualized in a 2D space (two partition columns are involved in the example).
![example](2d-example.png)
Here each expression draws a line in the 2D space. Managing data partitioning becomes a matter of drawing lines in the key space.
To make it easy to use, there is a "default region" which catches all the data that doesn't match any of the previous expressions. The default region exists by default and does not need to be specified. It is also possible to remove this default region if the DB finds it is not necessary.
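To make the routing concrete, below is a minimal sketch (hypothetical, not GreptimeDB's actual partition code) that evaluates the four expressions above for a single row in order and falls back to the default region when none of them match:
```rust
/// Route one row to a region under the example rule above.
/// Returns `None` for the catch-all default region. For this particular
/// rule set the four expressions happen to cover the whole plane, so the
/// default region would only receive rows the expressions cannot classify.
fn route_region(col_a: i64, col_b: i64) -> Option<u32> {
    if col_a <= 10 {
        Some(1)
    } else if col_a > 10 && col_a <= 20 {
        Some(2)
    } else if col_a > 20 && col_b < 100 {
        Some(3)
    } else if col_a > 20 && col_b >= 100 {
        Some(4)
    } else {
        None // default region
    }
}

fn main() {
    assert_eq!(route_region(15, 50), Some(2)); // 10 < col_A <= 20
    assert_eq!(route_region(25, 150), Some(4)); // col_A > 20, col_B >= 100
    println!("routing works as expected");
}
```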
## SQL interface
The SQL interface covers two parts: specifying the partition columns and specifying the partition rule. Though we are targeting an autonomous system, it is still allowed to give some bootstrap rules or hints when creating a table.
Partition column is specified by `PARTITION ON COLUMNS` sub-clause in `CREATE TABLE`:
```sql
CREATE TABLE t (...)
PARTITION ON COLUMNS (...) ();
```
The two following brackets are for the partition columns and the partition rule, respectively.
The columns provided here are only used as an allow-list for how the partition rule can be defined. This means (a) the order of the columns doesn't matter, and (b) the columns provided here are not necessarily used in the partition rule.
The partition rule part is a list of comma-separated simple expressions. The expressions here do not correspond one-to-one to regions, as the system may change them to fit varying workloads.
A full example of `CREATE TABLE` with partition rule is:
```sql
CREATE TABLE IF NOT EXISTS demo (
a STRING,
b STRING,
c STRING,
d STRING,
ts TIMESTAMP,
memory DOUBLE,
TIME INDEX (ts),
PRIMARY KEY (a, b, c, d)
)
PARTITION ON COLUMNS (c, b, a) (
a < 10,
a >= 10 AND a < 20,
a >= 20 AND b < 100,
a >= 20 AND b >= 100
)
```
## Combine with storage
Examining columns separately suits our columnar storage very well in two aspects.
1. The simple expression can be pushed down to the storage and file format layers, and is likely to hit an existing index, which makes pruning very efficient.
2. Columns in columnar storage are not tightly coupled like in traditional row storage, which means we can easily add or remove columns from the partition rule without much impact (like a global reshuffle) on the data.
The data file itself can be "projected" onto the key space as a polyhedron, and it is guaranteed that each face is parallel to some coordinate plane (in a 2D scenario, this means every file can be projected onto a rectangle). Thus partitioning or repartitioning also only needs to consider the related columns.
![sst-project](sst-project.png)
An additional limitation is that, considering how the index works and how we organize the primary keys at present, the partition columns are limited to a subset of the primary key columns for better performance.
# Drawbacks
This is a breaking change.

Binary image file added, not shown (71 KiB).

View File

@@ -19,9 +19,9 @@ mod partitions;
mod predicate;
mod region_peers;
mod runtime_metrics;
mod schemata;
pub mod schemata;
mod table_names;
mod tables;
pub mod tables;
use std::collections::HashMap;
use std::sync::{Arc, Weak};

View File

@@ -37,8 +37,8 @@ use crate::error::{
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const CATALOG_NAME: &str = "catalog_name";
const SCHEMA_NAME: &str = "schema_name";
pub const CATALOG_NAME: &str = "catalog_name";
pub const SCHEMA_NAME: &str = "schema_name";
const DEFAULT_CHARACTER_SET_NAME: &str = "default_character_set_name";
const DEFAULT_COLLATION_NAME: &str = "default_collation_name";
const INIT_CAPACITY: usize = 42;

View File

@@ -39,10 +39,10 @@ use crate::error::{
use crate::information_schema::{InformationTable, Predicates};
use crate::CatalogManager;
const TABLE_CATALOG: &str = "table_catalog";
const TABLE_SCHEMA: &str = "table_schema";
const TABLE_NAME: &str = "table_name";
const TABLE_TYPE: &str = "table_type";
pub const TABLE_CATALOG: &str = "table_catalog";
pub const TABLE_SCHEMA: &str = "table_schema";
pub const TABLE_NAME: &str = "table_name";
pub const TABLE_TYPE: &str = "table_type";
const TABLE_ID: &str = "table_id";
const ENGINE: &str = "engine";
const INIT_CAPACITY: usize = 42;

View File

@@ -307,7 +307,7 @@ impl Database {
reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
}
);
Ok(Output::AffectedRows(rows))
Ok(Output::new_with_affected_rows(rows))
}
FlightMessage::Recordbatch(_) | FlightMessage::Metrics(_) => {
IllegalFlightMessagesSnafu {
@@ -340,7 +340,7 @@ impl Database {
output_ordering: None,
metrics: Default::default(),
};
Ok(Output::new_stream(Box::pin(record_batch_stream)))
Ok(Output::new_with_stream(Box::pin(record_batch_stream)))
}
}
}

View File

@@ -26,7 +26,7 @@ use api::v1::greptime_response::Response;
use api::v1::{AffectedRows, GreptimeResponse};
pub use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::status_code::StatusCode;
pub use common_query::Output;
pub use common_query::{Output, OutputData, OutputMeta};
pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;

View File

@@ -62,7 +62,9 @@ pub struct BenchTableMetadataCommand {
impl BenchTableMetadataCommand {
pub async fn build(&self) -> Result<Instance> {
let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr]).await.unwrap();
let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr], 128)
.await
.unwrap();
let table_metadata_manager = Arc::new(TableMetadataManager::new(etcd_store));

View File

@@ -19,8 +19,7 @@ use async_trait::async_trait;
use clap::{Parser, ValueEnum};
use client::api::v1::auth_header::AuthScheme;
use client::api::v1::Basic;
use client::{Client, Database, DEFAULT_SCHEMA_NAME};
use common_query::Output;
use client::{Client, Database, OutputData, DEFAULT_SCHEMA_NAME};
use common_recordbatch::util::collect;
use common_telemetry::{debug, error, info, warn};
use datatypes::scalars::ScalarVector;
@@ -142,7 +141,7 @@ impl Export {
.with_context(|_| RequestDatabaseSnafu {
sql: "show databases".to_string(),
})?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)
@@ -183,7 +182,7 @@ impl Export {
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let Some(record_batch) = collect(stream)
@@ -235,7 +234,7 @@ impl Export {
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)

View File

@@ -19,7 +19,7 @@ use std::time::Instant;
use catalog::kvbackend::{
CachedMetaKvBackend, CachedMetaKvBackendBuilder, KvBackendCatalogManager,
};
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_base::Plugins;
use common_error::ext::ErrorExt;
use common_query::Output;
@@ -184,15 +184,15 @@ impl Repl {
}
.context(RequestDatabaseSnafu { sql: &sql })?;
let either = match output {
Output::Stream(s, _) => {
let either = match output.data {
OutputData::Stream(s) => {
let x = RecordBatches::try_collect(s)
.await
.context(CollectRecordBatchesSnafu)?;
Either::Left(x)
}
Output::RecordBatches(x) => Either::Left(x),
Output::AffectedRows(rows) => Either::Right(rows),
OutputData::RecordBatches(x) => Either::Left(x),
OutputData::AffectedRows(rows) => Either::Right(rows),
};
let end = Instant::now();

View File

@@ -70,7 +70,7 @@ impl UpgradeCommand {
etcd_addr: &self.etcd_addr,
})?;
let tool = MigrateTableMetadata {
etcd_store: EtcdStore::with_etcd_client(client),
etcd_store: EtcdStore::with_etcd_client(client, 128),
dryrun: self.dryrun,
skip_catalog_keys: self.skip_catalog_keys,
skip_table_global_keys: self.skip_table_global_keys,

View File

@@ -117,10 +117,12 @@ struct StartCommand {
/// The working home directory of this metasrv instance.
#[clap(long)]
data_home: Option<String>,
/// If it's not empty, the metasrv will store all data with this key prefix.
#[clap(long, default_value = "")]
store_key_prefix: String,
/// The max operations per txn
#[clap(long)]
max_txn_ops: Option<usize>,
}
impl StartCommand {
@@ -181,6 +183,10 @@ impl StartCommand {
opts.store_key_prefix = self.store_key_prefix.clone()
}
if let Some(max_txn_ops) = self.max_txn_ops {
opts.max_txn_ops = max_txn_ops;
}
// Disable dashboard in metasrv.
opts.http.disable_dashboard = true;

View File

@@ -28,12 +28,15 @@ const REGION: &str = "region";
const ENABLE_VIRTUAL_HOST_STYLE: &str = "enable_virtual_host_style";
pub fn is_supported_in_s3(key: &str) -> bool {
key == ENDPOINT
|| key == ACCESS_KEY_ID
|| key == SECRET_ACCESS_KEY
|| key == SESSION_TOKEN
|| key == REGION
|| key == ENABLE_VIRTUAL_HOST_STYLE
[
ENDPOINT,
ACCESS_KEY_ID,
SECRET_ACCESS_KEY,
SESSION_TOKEN,
REGION,
ENABLE_VIRTUAL_HOST_STYLE,
]
.contains(&key)
}
pub fn build_s3_backend(

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod clamp;
mod modulo;
mod pow;
mod rate;
@@ -19,6 +20,7 @@ mod rate;
use std::fmt;
use std::sync::Arc;
pub use clamp::ClampFunction;
use common_query::error::{GeneralDataFusionSnafu, Result};
use common_query::prelude::Signature;
use datafusion::error::DataFusionError;
@@ -40,7 +42,8 @@ impl MathFunction {
registry.register(Arc::new(ModuloFunction));
registry.register(Arc::new(PowFunction));
registry.register(Arc::new(RateFunction));
registry.register(Arc::new(RangeFunction))
registry.register(Arc::new(RangeFunction));
registry.register(Arc::new(ClampFunction));
}
}

View File

@@ -0,0 +1,403 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use std::sync::Arc;
use common_query::error::{InvalidFuncArgsSnafu, Result};
use common_query::prelude::Signature;
use datafusion::arrow::array::{ArrayIter, PrimitiveArray};
use datafusion::logical_expr::Volatility;
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::VectorRef;
use datatypes::types::LogicalPrimitiveType;
use datatypes::value::TryAsPrimitive;
use datatypes::vectors::PrimitiveVector;
use datatypes::with_match_primitive_type_id;
use snafu::{ensure, OptionExt};
use crate::function::Function;
#[derive(Clone, Debug, Default)]
pub struct ClampFunction;
const CLAMP_NAME: &str = "clamp";
impl Function for ClampFunction {
fn name(&self) -> &str {
CLAMP_NAME
}
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
// Type check is done by `signature`
Ok(input_types[0].clone())
}
fn signature(&self) -> Signature {
// input, min, max
Signature::uniform(3, ConcreteDataType::numerics(), Volatility::Immutable)
}
fn eval(
&self,
_func_ctx: crate::function::FunctionContext,
columns: &[VectorRef],
) -> Result<VectorRef> {
ensure!(
columns.len() == 3,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly 3, have: {}",
columns.len()
),
}
);
ensure!(
columns[0].data_type().is_numeric(),
InvalidFuncArgsSnafu {
err_msg: format!(
"The first arg's type is not numeric, have: {}",
columns[0].data_type()
),
}
);
ensure!(
columns[0].data_type() == columns[1].data_type()
&& columns[1].data_type() == columns[2].data_type(),
InvalidFuncArgsSnafu {
err_msg: format!(
"Arguments don't have identical types: {}, {}, {}",
columns[0].data_type(),
columns[1].data_type(),
columns[2].data_type()
),
}
);
ensure!(
columns[1].len() == 1 && columns[2].len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The second and third args should be scalar, have: {:?}, {:?}",
columns[1], columns[2]
),
}
);
with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| {
let input_array = columns[0].to_arrow_array();
let input = input_array
.as_any()
.downcast_ref::<PrimitiveArray<<$S as LogicalPrimitiveType>::ArrowPrimitive>>()
.unwrap();
let min = TryAsPrimitive::<$S>::try_as_primitive(&columns[1].get(0))
.with_context(|| {
InvalidFuncArgsSnafu {
err_msg: "The second arg should not be none",
}
})?;
let max = TryAsPrimitive::<$S>::try_as_primitive(&columns[2].get(0))
.with_context(|| {
InvalidFuncArgsSnafu {
err_msg: "The third arg should not be none",
}
})?;
// ensure min <= max
ensure!(
min <= max,
InvalidFuncArgsSnafu {
err_msg: format!(
"The second arg should be less than or equal to the third arg, have: {:?}, {:?}",
columns[1], columns[2]
),
}
);
clamp_impl::<$S, true, true>(input, min, max)
},{
unreachable!()
})
}
}
impl Display for ClampFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", CLAMP_NAME.to_ascii_uppercase())
}
}
fn clamp_impl<T: LogicalPrimitiveType, const CLAMP_MIN: bool, const CLAMP_MAX: bool>(
input: &PrimitiveArray<T::ArrowPrimitive>,
min: T::Native,
max: T::Native,
) -> Result<VectorRef> {
common_telemetry::info!("[DEBUG] min {min:?}, max {max:?}");
let iter = ArrayIter::new(input);
let result = iter.map(|x| {
x.map(|x| {
if CLAMP_MIN && x < min {
min
} else if CLAMP_MAX && x > max {
max
} else {
x
}
})
});
let result = PrimitiveArray::<T::ArrowPrimitive>::from_iter(result);
Ok(Arc::new(PrimitiveVector::<T>::from(result)))
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::{
ConstantVector, Float64Vector, Int64Vector, StringVector, UInt64Vector,
};
use super::*;
use crate::function::FunctionContext;
#[test]
fn clamp_i64() {
let inputs = [
(
vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
-1,
10,
vec![Some(-1), Some(-1), Some(-1), Some(0), Some(1), Some(2)],
),
(
vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
0,
0,
vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
),
(
vec![Some(-3), None, Some(-1), None, None, Some(2)],
-2,
1,
vec![Some(-2), None, Some(-1), None, None, Some(1)],
),
(
vec![None, None, None, None, None],
0,
1,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(Int64Vector::from(in_data)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(Int64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Int64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_u64() {
let inputs = [
(
vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
1,
3,
vec![Some(1), Some(1), Some(2), Some(3), Some(3), Some(3)],
),
(
vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
0,
0,
vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
),
(
vec![Some(0), None, Some(2), None, None, Some(5)],
1,
3,
vec![Some(1), None, Some(2), None, None, Some(3)],
),
(
vec![None, None, None, None, None],
0,
1,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(UInt64Vector::from(in_data)) as _,
Arc::new(UInt64Vector::from_vec(vec![min])) as _,
Arc::new(UInt64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(UInt64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_f64() {
let inputs = [
(
vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
-1.0,
10.0,
vec![Some(-1.0), Some(-1.0), Some(-1.0), Some(0.0), Some(1.0)],
),
(
vec![Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
0.0,
0.0,
vec![Some(0.0), Some(0.0), Some(0.0), Some(0.0)],
),
(
vec![Some(-3.0), None, Some(-1.0), None, None, Some(2.0)],
-2.0,
1.0,
vec![Some(-2.0), None, Some(-1.0), None, None, Some(1.0)],
),
(
vec![None, None, None, None, None],
0.0,
1.0,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(Float64Vector::from(in_data)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Float64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_const_i64() {
let input = vec![Some(5)];
let min = 2;
let max = 4;
let func = ClampFunction;
let args = [
Arc::new(ConstantVector::new(Arc::new(Int64Vector::from(input)), 1)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(Int64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Int64Vector::from(vec![Some(4)]));
assert_eq!(expected, result);
}
#[test]
fn clamp_invalid_min_max() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = 10.0;
let max = -1.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_type_not_match() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -1;
let max = 10;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(UInt64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_min_is_not_scalar() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -10.0;
let max = 1.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min, min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_no_max() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -10.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_on_string() {
let input = vec![Some("foo"), Some("foo"), Some("foo"), Some("foo")];
let func = ClampFunction;
let args = [
Arc::new(StringVector::from(input)) as _,
Arc::new(StringVector::from_vec(vec!["bar"])) as _,
Arc::new(StringVector::from_vec(vec!["baz"])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
}

View File

@@ -14,9 +14,11 @@
use std::sync::Arc;
mod greatest;
mod to_timezone;
mod to_unixtime;
use greatest::GreatestFunction;
use to_timezone::ToTimezoneFunction;
use to_unixtime::ToUnixtimeFunction;
use crate::function_registry::FunctionRegistry;
@@ -25,6 +27,7 @@ pub(crate) struct TimestampFunction;
impl TimestampFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(ToTimezoneFunction));
registry.register(Arc::new(ToUnixtimeFunction));
registry.register(Arc::new(GreatestFunction));
}

View File

@@ -0,0 +1,260 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::sync::Arc;
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use common_time::{Timestamp, Timezone};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::types::TimestampType;
use datatypes::value::Value;
use datatypes::vectors::{
StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
TimestampNanosecondVector, TimestampSecondVector, Vector,
};
use snafu::{ensure, OptionExt};
use crate::function::{Function, FunctionContext};
use crate::helper;
#[derive(Clone, Debug, Default)]
pub struct ToTimezoneFunction;
const NAME: &str = "to_timezone";
fn convert_to_timezone(arg: &str) -> Option<Timezone> {
Timezone::from_tz_string(arg).ok()
}
fn convert_to_timestamp(arg: &Value) -> Option<Timestamp> {
match arg {
Value::Timestamp(ts) => Some(*ts),
_ => None,
}
}
impl fmt::Display for ToTimezoneFunction {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "TO_TIMEZONE")
}
}
impl Function for ToTimezoneFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
// Type is checked by `signature`; the input must be a timestamp type.
Ok(input_types[0].clone())
}
fn signature(&self) -> Signature {
helper::one_of_sigs2(
vec![
ConcreteDataType::timestamp_second_datatype(),
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::timestamp_microsecond_datatype(),
ConcreteDataType::timestamp_nanosecond_datatype(),
],
vec![ConcreteDataType::string_datatype()],
)
}
fn eval(&self, _ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly 2, have: {}",
columns.len()
),
}
);
// TODO: maybe support epoch timestamp? https://github.com/GreptimeTeam/greptimedb/issues/3477
let ts = columns[0].data_type().as_timestamp().with_context(|| {
UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
})?;
let array = columns[0].to_arrow_array();
let times = match ts {
TimestampType::Second(_) => {
let vector = TimestampSecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Millisecond(_) => {
let vector = TimestampMillisecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Microsecond(_) => {
let vector = TimestampMicrosecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Nanosecond(_) => {
let vector = TimestampNanosecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
};
let tzs = {
let array = columns[1].to_arrow_array();
let vector = StringVector::try_from_arrow_array(&array)
.ok()
.with_context(|| UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
})?;
(0..vector.len())
.map(|i| convert_to_timezone(&vector.get(i).to_string()))
.collect::<Vec<_>>()
};
let result = times
.iter()
.zip(tzs.iter())
.map(|(time, tz)| match (time, tz) {
(Some(time), _) => Some(time.to_timezone_aware_string(tz.as_ref())),
_ => None,
})
.collect::<Vec<Option<String>>>();
Ok(Arc::new(StringVector::from(result)))
}
}
#[cfg(test)]
mod tests {
use datatypes::scalars::ScalarVector;
use datatypes::timestamp::{
TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
};
use datatypes::vectors::StringVector;
use super::*;
#[test]
fn test_timestamp_to_timezone() {
let f = ToTimezoneFunction;
assert_eq!("to_timezone", f.name());
let results = vec![
Some("1969-12-31 19:00:01"),
None,
Some("1970-01-01 03:00:01"),
None,
];
let times: Vec<Option<TimestampSecond>> = vec![
Some(TimestampSecond::new(1)),
None,
Some(TimestampSecond::new(1)),
None,
];
let ts_vector: TimestampSecondVector =
TimestampSecondVector::from_owned_iterator(times.into_iter());
let tzs = vec![Some("America/New_York"), None, Some("Europe/Moscow"), None];
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.001"),
None,
Some("1970-01-01 03:00:00.001"),
None,
];
let times: Vec<Option<TimestampMillisecond>> = vec![
Some(TimestampMillisecond::new(1)),
None,
Some(TimestampMillisecond::new(1)),
None,
];
let ts_vector: TimestampMillisecondVector =
TimestampMillisecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.000001"),
None,
Some("1970-01-01 03:00:00.000001"),
None,
];
let times: Vec<Option<TimestampMicrosecond>> = vec![
Some(TimestampMicrosecond::new(1)),
None,
Some(TimestampMicrosecond::new(1)),
None,
];
let ts_vector: TimestampMicrosecondVector =
TimestampMicrosecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.000000001"),
None,
Some("1970-01-01 03:00:00.000000001"),
None,
];
let times: Vec<Option<TimestampNanosecond>> = vec![
Some(TimestampNanosecond::new(1)),
None,
Some(TimestampNanosecond::new(1)),
None,
];
let ts_vector: TimestampNanosecondVector =
TimestampNanosecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
}
}

View File

@@ -32,7 +32,7 @@ macro_rules! ok {
};
}
/// Internal util macro to to create an error.
/// Internal util macro to create an error.
macro_rules! error {
($span:expr, $msg: expr) => {
Err(syn::Error::new($span, $msg))

View File

@@ -67,6 +67,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to execute {} txn operations via Etcd", max_operations))]
EtcdTxnFailed {
max_operations: usize,
#[snafu(source)]
error: etcd_client::Error,
location: Location,
},
#[snafu(display("Failed to get sequence: {}", err_msg))]
NextSequence { err_msg: String, location: Location },
@@ -400,6 +408,7 @@ impl ErrorExt for Error {
IllegalServerState { .. }
| EtcdTxnOpResponse { .. }
| EtcdFailed { .. }
| EtcdTxnFailed { .. }
| ConnectEtcd { .. } => StatusCode::Internal,
SerdeJson { .. }

View File

@@ -464,7 +464,7 @@ impl TableMetadataManager {
pub fn max_logical_tables_per_batch(&self) -> usize {
// The batch size is max_txn_ops / 3 because creating the metadata
// for one logical table takes three txn operations.
self.kv_backend.max_txn_size() / 3
self.kv_backend.max_txn_ops() / 3
}
/// Creates metadata for multiple logical tables and return an error if different metadata exists.
@@ -860,6 +860,7 @@ mod tests {
use bytes::Bytes;
use common_time::util::current_time_millis;
use futures::TryStreamExt;
use store_api::storage::RegionId;
use table::metadata::{RawTableInfo, TableInfo};
use super::datanode_table::DatanodeTableKey;
@@ -1056,6 +1057,36 @@ mod tests {
);
}
#[tokio::test]
async fn test_create_many_logical_tables_metadata() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(kv_backend);
let mut tables_data = vec![];
for i in 0..128 {
let table_id = i + 1;
let region_number = table_id * 3;
let region_id = RegionId::new(table_id, region_number);
let region_route = new_region_route(region_id.as_u64(), 2);
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo = test_utils::new_test_table_info_with_name(
table_id,
&format!("my_table_{}", table_id),
region_routes.iter().map(|r| r.region.id.region_number()),
)
.into();
let table_route_value = TableRouteValue::physical(region_routes.clone());
tables_data.push((table_info, table_route_value));
}
// creates metadata.
table_metadata_manager
.create_logical_tables_metadata(tables_data)
.await
.unwrap();
}
#[tokio::test]
async fn test_delete_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());

View File

@@ -19,8 +19,9 @@ use datatypes::schema::{ColumnSchema, SchemaBuilder};
use store_api::storage::TableId;
use table::metadata::{TableInfo, TableInfoBuilder, TableMetaBuilder};
pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
pub fn new_test_table_info_with_name<I: IntoIterator<Item = u32>>(
table_id: TableId,
table_name: &str,
region_numbers: I,
) -> TableInfo {
let column_schemas = vec![
@@ -50,8 +51,14 @@ pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
TableInfoBuilder::default()
.table_id(table_id)
.table_version(5)
.name("mytable")
.name(table_name)
.meta(meta)
.build()
.unwrap()
}
pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
table_id: TableId,
region_numbers: I,
) -> TableInfo {
new_test_table_info_with_name(table_id, "mytable", region_numbers)
}

View File

@@ -45,6 +45,10 @@ impl TxnService for ChrootKvBackend {
let txn_res = self.inner.txn(txn).await?;
Ok(self.chroot_txn_response(txn_res))
}
fn max_txn_ops(&self) -> usize {
self.inner.max_txn_ops()
}
}
#[async_trait::async_trait]

View File

@@ -33,12 +33,6 @@ use crate::rpc::store::{
};
use crate::rpc::KeyValue;
// Maximum number of operations permitted in a transaction.
// The etcd default configuration's `--max-txn-ops` is 128.
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
const MAX_TXN_SIZE: usize = 128;
fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
let (key, value) = kv.into_key_value();
KeyValue { key, value }
@@ -46,10 +40,15 @@ fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
pub struct EtcdStore {
client: Client,
// Maximum number of operations permitted in a transaction.
// The etcd default configuration's `--max-txn-ops` is 128.
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
max_txn_ops: usize,
}
impl EtcdStore {
pub async fn with_endpoints<E, S>(endpoints: S) -> Result<KvBackendRef>
pub async fn with_endpoints<E, S>(endpoints: S, max_txn_ops: usize) -> Result<KvBackendRef>
where
E: AsRef<str>,
S: AsRef<[E]>,
@@ -58,16 +57,19 @@ impl EtcdStore {
.await
.context(error::ConnectEtcdSnafu)?;
Ok(Self::with_etcd_client(client))
Ok(Self::with_etcd_client(client, max_txn_ops))
}
pub fn with_etcd_client(client: Client) -> KvBackendRef {
Arc::new(Self { client })
pub fn with_etcd_client(client: Client, max_txn_ops: usize) -> KvBackendRef {
Arc::new(Self {
client,
max_txn_ops,
})
}
async fn do_multi_txn(&self, txn_ops: Vec<TxnOp>) -> Result<Vec<TxnResponse>> {
let max_txn_size = self.max_txn_size();
if txn_ops.len() < max_txn_size {
let max_txn_ops = self.max_txn_ops();
if txn_ops.len() < max_txn_ops {
// fast path
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&["etcd", "txn"])
@@ -83,7 +85,7 @@ impl EtcdStore {
}
let txns = txn_ops
.chunks(max_txn_size)
.chunks(max_txn_ops)
.map(|part| async move {
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&["etcd", "txn"])
@@ -311,18 +313,20 @@ impl TxnService for EtcdStore {
.with_label_values(&["etcd", "txn"])
.start_timer();
let max_operations = txn.max_operations();
let etcd_txn: Txn = txn.into();
let txn_res = self
.client
.kv_client()
.txn(etcd_txn)
.await
.context(error::EtcdFailedSnafu)?;
.context(error::EtcdTxnFailedSnafu { max_operations })?;
txn_res.try_into()
}
fn max_txn_size(&self) -> usize {
MAX_TXN_SIZE
fn max_txn_ops(&self) -> usize {
self.max_txn_ops
}
}
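
With the constructor change above, callers now pass the transaction-operation limit explicitly. A minimal sketch (illustrative; the endpoint is a placeholder, `128` simply mirrors etcd's default `--max-txn-ops`, and `EtcdStore` is assumed to be in scope):

async fn connect_etcd() {
    let _kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2379"], 128)
        .await
        .expect("failed to connect to etcd");
}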

View File

@@ -323,6 +323,10 @@ impl<T: ErrorExt + Send + Sync> TxnService for MemoryKvBackend<T> {
responses,
})
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
impl<T: ErrorExt + Send + Sync + 'static> ResettableKvBackend for MemoryKvBackend<T> {

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::max;
use common_error::ext::ErrorExt;
use crate::rpc::store::{DeleteRangeResponse, PutResponse, RangeResponse};
@@ -27,8 +29,8 @@ pub trait TxnService: Sync + Send {
}
/// Maximum number of operations permitted in a transaction.
fn max_txn_size(&self) -> usize {
usize::MAX
fn max_txn_ops(&self) -> usize {
unimplemented!("txn is not implemented")
}
}
@@ -192,6 +194,12 @@ impl Txn {
self.req.failure = operations.into();
self
}
#[inline]
pub fn max_operations(&self) -> usize {
let opc = max(self.req.compare.len(), self.req.success.len());
max(opc, self.req.failure.len())
}
}
impl From<Txn> for TxnRequest {

View File

@@ -152,7 +152,7 @@ impl Runner {
guard.key_guards.push(key_guard);
}
// Execute the procedure. We need to release the lock whenever the the execution
// Execute the procedure. We need to release the lock whenever the execution
// is successful or fail.
self.execute_procedure_in_loop().await;

View File

@@ -30,38 +30,87 @@ pub mod prelude;
mod signature;
use sqlparser_derive::{Visit, VisitMut};
// sql output
pub enum Output {
/// The new `Output` struct, carrying the output data (previously the `Output` enum itself) together with output meta
#[derive(Debug)]
pub struct Output {
pub data: OutputData,
pub meta: OutputMeta,
}
/// The data part of the original `Output` enum,
/// carrying result data to the response/client/user interface
pub enum OutputData {
AffectedRows(usize),
RecordBatches(RecordBatches),
Stream(SendableRecordBatchStream, Option<Arc<dyn PhysicalPlan>>),
Stream(SendableRecordBatchStream),
}
/// `OutputMeta` stores meta information produced during the execution
#[derive(Debug, Default)]
pub struct OutputMeta {
/// May exist for query output. One can retrieve execution metrics from this plan.
pub plan: Option<Arc<dyn PhysicalPlan>>,
pub cost: usize,
}
impl Output {
// helper function to build original `Output::Stream`
pub fn new_stream(stream: SendableRecordBatchStream) -> Self {
Output::Stream(stream, None)
pub fn new_with_affected_rows(affected_rows: usize) -> Self {
Self {
data: OutputData::AffectedRows(affected_rows),
meta: Default::default(),
}
}
pub fn new_with_record_batches(recordbatches: RecordBatches) -> Self {
Self {
data: OutputData::RecordBatches(recordbatches),
meta: Default::default(),
}
}
pub fn new_with_stream(stream: SendableRecordBatchStream) -> Self {
Self {
data: OutputData::Stream(stream),
meta: Default::default(),
}
}
pub fn new(data: OutputData, meta: OutputMeta) -> Self {
Self { data, meta }
}
}
impl Debug for Output {
impl Debug for OutputData {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Output::AffectedRows(rows) => write!(f, "Output::AffectedRows({rows})"),
Output::RecordBatches(recordbatches) => {
write!(f, "Output::RecordBatches({recordbatches:?})")
OutputData::AffectedRows(rows) => write!(f, "OutputData::AffectedRows({rows})"),
OutputData::RecordBatches(recordbatches) => {
write!(f, "OutputData::RecordBatches({recordbatches:?})")
}
Output::Stream(_, df) => {
if df.is_some() {
write!(f, "Output::Stream(<stream>, Some<physical_plan>)")
} else {
write!(f, "Output::Stream(<stream>)")
}
OutputData::Stream(_) => {
write!(f, "OutputData::Stream(<stream>)")
}
}
}
}
impl OutputMeta {
pub fn new(plan: Option<Arc<dyn PhysicalPlan>>, cost: usize) -> Self {
Self { plan, cost }
}
pub fn new_with_plan(plan: Arc<dyn PhysicalPlan>) -> Self {
Self {
plan: Some(plan),
cost: 0,
}
}
pub fn new_with_cost(cost: usize) -> Self {
Self { plan: None, cost }
}
}
pub use datafusion::physical_plan::ExecutionPlan as DfPhysicalPlan;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
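
The hunk above splits the old `Output` enum into an `OutputData` payload plus an `OutputMeta`. A sketch of how a caller might consume the new shape (illustrative only, not code from this change; it relies on the `common_query` re-exports used elsewhere in this diff):

use common_query::{Output, OutputData};

fn consume(output: Output) {
    // Execution metrics now travel in `meta` instead of inside the old Stream variant.
    if let Some(plan) = &output.meta.plan {
        let _ = plan; // query metrics can be read from this physical plan
    }
    match output.data {
        OutputData::AffectedRows(rows) => println!("affected {rows} rows"),
        OutputData::RecordBatches(batches) => println!("{batches:?}"),
        OutputData::Stream(_stream) => println!("got a stream"),
    }
}

Construction goes through the new helpers shown above, e.g. `Output::new_with_affected_rows(1)` or `Output::new(OutputData::Stream(stream), OutputMeta::new_with_plan(plan))`.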

View File

@@ -32,7 +32,7 @@ use snafu::ResultExt;
use crate::error::{self, Result};
use crate::{
DfRecordBatch, DfSendableRecordBatchStream, RecordBatch, RecordBatchStream,
DfRecordBatch, DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream,
SendableRecordBatchStream, Stream,
};
@@ -228,6 +228,10 @@ impl RecordBatchStream for RecordBatchStreamAdapter {
Metrics::Unavailable | Metrics::Unresolved(_) => None,
}
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
}
impl Stream for RecordBatchStreamAdapter {
@@ -316,6 +320,14 @@ impl RecordBatchStream for AsyncRecordBatchStreamAdapter {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for AsyncRecordBatchStreamAdapter {
@@ -375,6 +387,14 @@ mod test {
fn schema(&self) -> SchemaRef {
unimplemented!()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for MaybeErrorRecordBatchStream {

View File

@@ -39,13 +39,9 @@ use snafu::{ensure, ResultExt};
pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
fn schema(&self) -> SchemaRef;
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn output_ordering(&self) -> Option<&[OrderOption]>;
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics>;
}
pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;
@@ -74,6 +70,14 @@ impl RecordBatchStream for EmptyRecordBatchStream {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for EmptyRecordBatchStream {
@@ -192,6 +196,14 @@ impl RecordBatchStream for SimpleRecordBatchStream {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for SimpleRecordBatchStream {

View File

@@ -41,7 +41,8 @@ mod tests {
use futures::Stream;
use super::*;
use crate::RecordBatchStream;
use crate::adapter::RecordBatchMetrics;
use crate::{OrderOption, RecordBatchStream};
struct MockRecordBatchStream {
batch: Option<RecordBatch>,
@@ -52,6 +53,14 @@ mod tests {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for MockRecordBatchStream {

View File

@@ -12,11 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#![feature(let_chains)]
pub mod logging;
mod macros;
pub mod metric;
mod panic_hook;
pub mod tracing_context;
mod tracing_sampler;
pub use logging::{init_default_ut_logging, init_global_logging};
pub use metric::dump_metrics;

View File

@@ -31,6 +31,7 @@ use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
use tracing_subscriber::{filter, EnvFilter, Registry};
use crate::tracing_sampler::{create_sampler, TracingSampleOptions};
pub use crate::{debug, error, info, trace, warn};
const DEFAULT_OTLP_ENDPOINT: &str = "http://localhost:4317";
@@ -42,7 +43,7 @@ pub struct LoggingOptions {
pub level: Option<String>,
pub enable_otlp_tracing: bool,
pub otlp_endpoint: Option<String>,
pub tracing_sample_ratio: Option<f64>,
pub tracing_sample_ratio: Option<TracingSampleOptions>,
pub append_stdout: bool,
}
@@ -176,8 +177,10 @@ pub fn init_global_logging(
.expect("error parsing log level string");
let sampler = opts
.tracing_sample_ratio
.map(Sampler::TraceIdRatioBased)
.unwrap_or(Sampler::AlwaysOn);
.as_ref()
.map(create_sampler)
.map(Sampler::ParentBased)
.unwrap_or(Sampler::ParentBased(Box::new(Sampler::AlwaysOn)));
// Must enable 'tokio_unstable' cfg to use this feature.
// For example: `RUSTFLAGS="--cfg tokio_unstable" cargo run -F common-telemetry/console -- standalone start`
#[cfg(feature = "tokio-console")]

View File

@@ -0,0 +1,176 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use opentelemetry::trace::{
Link, SamplingDecision, SamplingResult, SpanKind, TraceContextExt, TraceId, TraceState,
};
use opentelemetry::KeyValue;
use opentelemetry_sdk::trace::{Sampler, ShouldSample};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct TracingSampleOptions {
pub default_ratio: f64,
pub rules: Vec<TracingSampleRule>,
}
impl Default for TracingSampleOptions {
fn default() -> Self {
Self {
default_ratio: 1.0,
rules: vec![],
}
}
}
/// Determines the sampling rate of a span according to the `rules` provided in `TracingSampleOptions`.
/// For spans that do not match any rule, the `default_ratio` is used.
#[derive(Clone, Default, Debug, Serialize, Deserialize)]
#[serde(default)]
pub struct TracingSampleRule {
pub protocol: String,
pub request_types: HashSet<String>,
pub ratio: f64,
}
impl TracingSampleRule {
pub fn match_rule(&self, protocol: &str, request_type: Option<&str>) -> Option<f64> {
if protocol == self.protocol {
if self.request_types.is_empty() {
Some(self.ratio)
} else if let Some(t) = request_type
&& self.request_types.contains(t)
{
Some(self.ratio)
} else {
None
}
} else {
None
}
}
}
impl PartialEq for TracingSampleOptions {
fn eq(&self, other: &Self) -> bool {
self.default_ratio == other.default_ratio && self.rules == other.rules
}
}
impl PartialEq for TracingSampleRule {
fn eq(&self, other: &Self) -> bool {
self.protocol == other.protocol
&& self.request_types == other.request_types
&& self.ratio == other.ratio
}
}
impl Eq for TracingSampleOptions {}
impl Eq for TracingSampleRule {}
pub fn create_sampler(opt: &TracingSampleOptions) -> Box<dyn ShouldSample> {
if opt.rules.is_empty() {
Box::new(Sampler::TraceIdRatioBased(opt.default_ratio))
} else {
Box::new(opt.clone())
}
}
impl ShouldSample for TracingSampleOptions {
fn should_sample(
&self,
parent_context: Option<&opentelemetry::Context>,
trace_id: TraceId,
_name: &str,
_span_kind: &SpanKind,
attributes: &[KeyValue],
_links: &[Link],
) -> SamplingResult {
let (mut protocol, mut request_type) = (None, None);
for kv in attributes {
match kv.key.as_str() {
"protocol" => protocol = Some(kv.value.as_str()),
"request_type" => request_type = Some(kv.value.as_str()),
_ => (),
}
}
let ratio = protocol
.and_then(|p| {
self.rules
.iter()
.find_map(|rule| rule.match_rule(p.as_ref(), request_type.as_deref()))
})
.unwrap_or(self.default_ratio);
SamplingResult {
decision: sample_based_on_probability(ratio, trace_id),
// No extra attributes ever set by the SDK samplers.
attributes: Vec::new(),
// Samplers in the SDK never modify the trace state.
trace_state: match parent_context {
Some(ctx) => ctx.span().span_context().trace_state().clone(),
None => TraceState::default(),
},
}
}
}
/// The code here mainly refers to the relevant implementation of
/// [opentelemetry](https://github.com/open-telemetry/opentelemetry-rust/blob/ef4701055cc39d3448d5e5392812ded00cdd4476/opentelemetry-sdk/src/trace/sampler.rs#L229),
/// and determines whether the span needs to be collected based on the `TraceId` and sampling rate (i.e. `prob`).
fn sample_based_on_probability(prob: f64, trace_id: TraceId) -> SamplingDecision {
if prob >= 1.0 {
SamplingDecision::RecordAndSample
} else {
let prob_upper_bound = (prob.max(0.0) * (1u64 << 63) as f64) as u64;
let bytes = trace_id.to_bytes();
let (_, low) = bytes.split_at(8);
let trace_id_low = u64::from_be_bytes(low.try_into().unwrap());
let rnd_from_trace_id = trace_id_low >> 1;
if rnd_from_trace_id < prob_upper_bound {
SamplingDecision::RecordAndSample
} else {
SamplingDecision::Drop
}
}
}
#[cfg(test)]
mod test {
use std::collections::HashSet;
use crate::tracing_sampler::TracingSampleRule;
#[test]
fn test_rule() {
let rule = TracingSampleRule {
protocol: "http".to_string(),
request_types: HashSet::new(),
ratio: 1.0,
};
assert_eq!(rule.match_rule("not_http", None), None);
assert_eq!(rule.match_rule("http", None), Some(1.0));
assert_eq!(rule.match_rule("http", Some("abc")), Some(1.0));
let rule1 = TracingSampleRule {
protocol: "http".to_string(),
request_types: HashSet::from(["mysql".to_string()]),
ratio: 1.0,
};
assert_eq!(rule1.match_rule("http", None), None);
assert_eq!(rule1.match_rule("http", Some("abc")), None);
assert_eq!(rule1.match_rule("http", Some("mysql")), Some(1.0));
}
}
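
A minimal configuration sketch for the sampler above (illustrative; it reuses the `http`/`mysql` rule from the test and, because the module is private, assumes the snippet lives alongside these definitions so the existing imports apply):

fn example_sampler() -> Box<dyn ShouldSample> {
    let opts = TracingSampleOptions {
        default_ratio: 0.1,
        rules: vec![TracingSampleRule {
            protocol: "http".to_string(),
            request_types: HashSet::from(["mysql".to_string()]),
            ratio: 1.0,
        }],
    };
    // "http" spans with request_type "mysql" are always sampled; every other span
    // falls back to the 10% TraceId-ratio default.
    create_sampler(&opts)
}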

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use client::Database;
use common_query::Output;
use common_query::OutputData;
use common_recordbatch::util;
pub enum ExpectedOutput<'a> {
@@ -23,22 +23,24 @@ pub enum ExpectedOutput<'a> {
pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {
let output = db.sql(sql).await.unwrap();
let output = output.data;
match (&output, expected) {
(Output::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
(OutputData::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
assert_eq!(*x, y, "actual: \n{}", x)
}
(Output::RecordBatches(_), ExpectedOutput::QueryResult(x))
| (Output::Stream(_, _), ExpectedOutput::QueryResult(x)) => {
(OutputData::RecordBatches(_), ExpectedOutput::QueryResult(x))
| (OutputData::Stream(_), ExpectedOutput::QueryResult(x)) => {
check_output_stream(output, x).await
}
_ => panic!(),
}
}
pub async fn check_output_stream(output: Output, expected: &str) {
pub async fn check_output_stream(output: OutputData, expected: &str) {
let recordbatches = match output {
Output::Stream(stream, _) => util::collect_batches(stream).await.unwrap(),
Output::RecordBatches(recordbatches) => recordbatches,
OutputData::Stream(stream) => util::collect_batches(stream).await.unwrap(),
OutputData::RecordBatches(recordbatches) => recordbatches,
_ => unreachable!(),
};
let pretty_print = recordbatches.pretty_print().unwrap();

View File

@@ -36,7 +36,7 @@ use crate::{error, Interval};
/// - for [TimeUnit::Second]: [-262144-01-01 00:00:00, +262143-12-31 23:59:59]
/// - for [TimeUnit::Millisecond]: [-262144-01-01 00:00:00.000, +262143-12-31 23:59:59.999]
/// - for [TimeUnit::Microsecond]: [-262144-01-01 00:00:00.000000, +262143-12-31 23:59:59.999999]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145225, 2262-04-11 23:47:16.854775807]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145224192, 2262-04-11 23:47:16.854775807]
///
/// # Note:
/// For values out of range, you can still store these timestamps, but while performing arithmetic
@@ -187,28 +187,28 @@ impl Timestamp {
Self { unit, value }
}
pub fn new_second(value: i64) -> Self {
pub const fn new_second(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Second,
}
}
pub fn new_millisecond(value: i64) -> Self {
pub const fn new_millisecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Millisecond,
}
}
pub fn new_microsecond(value: i64) -> Self {
pub const fn new_microsecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Microsecond,
}
}
pub fn new_nanosecond(value: i64) -> Self {
pub const fn new_nanosecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Nanosecond,
@@ -281,8 +281,26 @@ impl Timestamp {
.and_then(|v| v.checked_add(micros as i64))
.map(Timestamp::new_microsecond)
} else {
// Refer to <https://github.com/chronotope/chrono/issues/1289>
//
// subsec nanos are always non-negative, however the timestamp itself (both in seconds and in nanos) can be
// negative. Now i64::MIN is NOT divisible by 1_000_000_000, so
//
// (sec * 1_000_000_000) + nsec
//
// may underflow (even when in theory we COULD represent the datetime as i64) because we add the non-negative
// nanos AFTER the multiplication. This is fixed by converting the negative case to
//
// ((sec + 1) * 1_000_000_000) + (nsec - 1_000_000_000)
let mut sec = sec;
let mut nsec = nsec as i64;
if sec < 0 && nsec > 0 {
nsec -= 1_000_000_000;
sec += 1;
}
sec.checked_mul(1_000_000_000)
.and_then(|v| v.checked_add(nsec as i64))
.and_then(|v| v.checked_add(nsec))
.map(Timestamp::new_nanosecond)
}
}
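
A worked instance of the adjustment described in the comment above, using the `i64::MIN` split that `test_min_nanos_roundtrip` below exercises (an illustrative test sketch; the literals are simply the mathematical split of `i64::MIN` nanoseconds into whole seconds and non-negative sub-second nanos):

#[test]
fn min_nanosecond_recombination_sketch() {
    let (sec, nsec): (i64, i64) = (-9_223_372_037, 145_224_192);
    // Naive recombination underflows: sec * 1_000_000_000 is below i64::MIN.
    assert!(sec.checked_mul(1_000_000_000).is_none());
    // The adjusted form lands exactly on i64::MIN.
    let adjusted = (sec + 1)
        .checked_mul(1_000_000_000)
        .and_then(|v| v.checked_add(nsec - 1_000_000_000));
    assert_eq!(adjusted, Some(i64::MIN));
}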
@@ -425,6 +443,20 @@ impl Timestamp {
}
}
impl Timestamp {
pub const MIN_SECOND: Self = Self::new_second(-8_334_601_228_800);
pub const MAX_SECOND: Self = Self::new_second(8_210_266_876_799);
pub const MIN_MILLISECOND: Self = Self::new_millisecond(-8_334_601_228_800_000);
pub const MAX_MILLISECOND: Self = Self::new_millisecond(8_210_266_876_799_999);
pub const MIN_MICROSECOND: Self = Self::new_microsecond(-8_334_601_228_800_000_000);
pub const MAX_MICROSECOND: Self = Self::new_microsecond(8_210_266_876_799_999_999);
pub const MIN_NANOSECOND: Self = Self::new_nanosecond(i64::MIN);
pub const MAX_NANOSECOND: Self = Self::new_nanosecond(i64::MAX);
}
/// Converts the naive datetime (which has no specific timezone) to a
/// nanosecond epoch timestamp in UTC.
fn naive_datetime_to_timestamp(
@@ -586,6 +618,7 @@ impl Hash for Timestamp {
mod tests {
use std::collections::hash_map::DefaultHasher;
use chrono_tz::Tz;
use rand::Rng;
use serde_json::Value;
@@ -1297,7 +1330,7 @@ mod tests {
"+262142-12-31 23:59:59Z",
"+262142-12-31 23:59:59.999Z",
"+262142-12-31 23:59:59.999999Z",
"1677-09-21 00:12:43.145225Z",
"1677-09-21 00:12:43.145224192Z",
"2262-04-11 23:47:16.854775807Z",
"+100000-01-01 00:00:01.5Z",
];
@@ -1306,4 +1339,47 @@ mod tests {
Timestamp::from_str_utc(s).unwrap();
}
}
#[test]
fn test_min_nanos_roundtrip() {
let (sec, nsec) = Timestamp::MIN_NANOSECOND.split();
let ts = Timestamp::from_splits(sec, nsec).unwrap();
assert_eq!(Timestamp::MIN_NANOSECOND, ts);
}
#[test]
fn test_timestamp_bound_format() {
assert_eq!(
"1677-09-21 00:12:43.145224192",
Timestamp::MIN_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"2262-04-11 23:47:16.854775807",
Timestamp::MAX_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59.999999",
Timestamp::MAX_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59.999",
Timestamp::MAX_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59",
Timestamp::MAX_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
}
}

View File

@@ -27,7 +27,7 @@ use common_error::ext::BoxedError;
use common_error::status_code::StatusCode;
use common_query::logical_plan::Expr;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::{DfPhysicalPlan, Output};
use common_query::{DfPhysicalPlan, OutputData};
use common_recordbatch::SendableRecordBatchStream;
use common_runtime::Runtime;
use common_telemetry::tracing::{self, info_span};
@@ -651,11 +651,11 @@ impl RegionServerInner {
.await
.context(ExecuteLogicalPlanSnafu)?;
match result {
Output::AffectedRows(_) | Output::RecordBatches(_) => {
match result.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => {
UnsupportedOutputSnafu { expected: "stream" }.fail()
}
Output::Stream(stream, _) => Ok(stream),
OutputData::Stream(stream) => Ok(stream),
}
}

View File

@@ -370,6 +370,36 @@ impl Value {
}
}
pub trait TryAsPrimitive<T: LogicalPrimitiveType> {
fn try_as_primitive(&self) -> Option<T::Native>;
}
macro_rules! impl_try_as_primitive {
($Type: ident, $Variant: ident) => {
impl TryAsPrimitive<crate::types::$Type> for Value {
fn try_as_primitive(
&self,
) -> Option<<crate::types::$Type as crate::types::LogicalPrimitiveType>::Native> {
match self {
Value::$Variant(v) => Some((*v).into()),
_ => None,
}
}
}
};
}
impl_try_as_primitive!(Int8Type, Int8);
impl_try_as_primitive!(Int16Type, Int16);
impl_try_as_primitive!(Int32Type, Int32);
impl_try_as_primitive!(Int64Type, Int64);
impl_try_as_primitive!(UInt8Type, UInt8);
impl_try_as_primitive!(UInt16Type, UInt16);
impl_try_as_primitive!(UInt32Type, UInt32);
impl_try_as_primitive!(UInt64Type, UInt64);
impl_try_as_primitive!(Float32Type, Float32);
impl_try_as_primitive!(Float64Type, Float64);
pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValue> {
Ok(match output_type {
ConcreteDataType::Null(_) => ScalarValue::Null,
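
A small usage sketch for the new `TryAsPrimitive` trait, which is what the `clamp` function earlier in this diff uses to pull its scalar bounds out of `Value`s (illustrative; the import paths are assumptions):

use datatypes::types::Int64Type;
use datatypes::value::{TryAsPrimitive, Value};

fn example() {
    // A matching variant yields the native primitive ...
    assert_eq!(
        TryAsPrimitive::<Int64Type>::try_as_primitive(&Value::Int64(5)),
        Some(5i64)
    );
    // ... while nulls (and mismatched variants) yield None.
    assert_eq!(TryAsPrimitive::<Int64Type>::try_as_primitive(&Value::Null), None);
}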
@@ -2387,4 +2417,12 @@ mod tests {
);
check_value_ref_size_eq(&ValueRef::Decimal128(Decimal128::new(1234, 3, 1)), 32)
}
#[test]
fn test_incorrect_default_value_issue_3479() {
let value = OrderedF64::from(0.047318541668048164);
let serialized = serde_json::to_string(&value).unwrap();
let deserialized: OrderedF64 = serde_json::from_str(&serialized).unwrap();
assert_eq!(value, deserialized);
}
}

View File

@@ -22,8 +22,9 @@ use std::task::{Context, Poll};
use common_datasource::object_store::build_backend;
use common_error::ext::BoxedError;
use common_query::prelude::Expr;
use common_recordbatch::adapter::RecordBatchMetrics;
use common_recordbatch::error::{CastVectorSnafu, ExternalSnafu, Result as RecordBatchResult};
use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use datafusion::logical_expr::utils as df_logical_expr_utils;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
@@ -151,6 +152,14 @@ impl RecordBatchStream for FileToScanRegionStream {
fn schema(&self) -> SchemaRef {
self.scan_schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for FileToScanRegionStream {

View File

@@ -18,6 +18,7 @@ common-query.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datatypes.workspace = true
enum_dispatch = "0.3"
hydroflow = "0.5.0"
itertools.workspace = true
num-traits = "0.2"
@@ -27,3 +28,6 @@ session.workspace = true
snafu.workspace = true
tokio.workspace = true
tonic.workspace = true
[dev-dependencies]
serde_json = "1.0"

View File

@@ -24,5 +24,6 @@ mod scalar;
pub(crate) use error::{EvalError, InvalidArgumentSnafu, OptimizeSnafu};
pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
pub(crate) use id::{GlobalId, Id, LocalId};
pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc};
pub(crate) use scalar::ScalarExpr;

View File

@@ -61,4 +61,7 @@ pub enum EvalError {
#[snafu(display("Unsupported temporal filter: {reason}"))]
UnsupportedTemporalFilter { reason: String, location: Location },
#[snafu(display("Overflowed during evaluation"))]
Overflow { location: Location },
}

View File

@@ -45,7 +45,7 @@ use crate::repr::{self, value_to_internal_ts, Diff, Row};
/// expressions in `self.expressions`, even though this is not something
/// we can directly evaluate. The plan creation methods will defensively
/// ensure that the right thing happens.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct MapFilterProject {
/// A sequence of expressions that should be appended to the row.
///
@@ -415,7 +415,7 @@ impl MapFilterProject {
}
/// A wrapper type which indicates it is safe to simply evaluate all expressions.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct SafeMfpPlan {
pub(crate) mfp: MapFilterProject,
}
@@ -800,7 +800,7 @@ mod test {
.unwrap();
// only retain sum result
let mfp = mfp.project(vec![4]).unwrap();
// accept only if if the sum is greater than 10
// accept only if the sum is greater than 10
let mfp = mfp
.filter(vec![ScalarExpr::Column(0).call_binary(
ScalarExpr::Literal(Value::from(10i32), ConcreteDataType::int32_datatype()),

View File

@@ -21,7 +21,7 @@ mod accum;
mod func;
/// Describes an aggregation expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct AggregateExpr {
/// Names the aggregation function.
pub func: AggregateFunc,

View File

@@ -14,7 +14,10 @@
//! Accumulators for aggregate functions that are accumulable, i.e. sum/count
//!
//! Currently support sum, count, any, all
//! An accumulator is only restored from a row and updated each time the dataflow needs to process a new batch of rows,
//! so the overhead is acceptable.
//!
//! Currently supports sum, count, any, all and min/max (with the caveat that min/max cannot support deletes within the aggregate).
use std::fmt::Display;
@@ -22,13 +25,506 @@ use common_decimal::Decimal128;
use common_time::{Date, DateTime};
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, OrderedFloat, Value};
use enum_dispatch::enum_dispatch;
use hydroflow::futures::stream::Concat;
use serde::{Deserialize, Serialize};
use snafu::ensure;
use crate::expr::error::{InternalSnafu, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::error::{InternalSnafu, OverflowSnafu, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::relation::func::GenericFn;
use crate::expr::{AggregateFunc, EvalError};
use crate::repr::Diff;
/// Accumulates values for the various types of accumulable aggregations.
#[enum_dispatch]
pub trait Accumulator: Sized {
fn into_state(self) -> Vec<Value>;
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError>;
fn update_batch<I>(&mut self, aggr_fn: &AggregateFunc, value_diffs: I) -> Result<(), EvalError>
where
I: IntoIterator<Item = (Value, Diff)>,
{
for (v, d) in value_diffs {
self.update(aggr_fn, v, d)?;
}
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError>;
}
/// Bool accumulator, used for `Any` `All` `Max/MinBool`
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct Bool {
/// The number of `true` values observed.
trues: Diff,
/// The number of `false` values observed.
falses: Diff,
}
impl TryFrom<Vec<Value>> for Bool {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "Bool Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
trues: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
falses: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for Bool {
fn into_state(self) -> Vec<Value> {
vec![self.trues.into(), self.falses.into()]
}
/// Null values are ignored
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool
),
InternalSnafu {
reason: format!(
"Bool Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
match value {
Value::Boolean(true) => self.trues += diff,
Value::Boolean(false) => self.falses += diff,
Value::Null => (), // ignore nulls
x => {
return Err(TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: x.data_type(),
}
.build());
}
};
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::Any => Ok(Value::from(self.trues > 0)),
AggregateFunc::All => Ok(Value::from(self.falses == 0)),
AggregateFunc::MaxBool => Ok(Value::from(self.trues > 0)),
AggregateFunc::MinBool => Ok(Value::from(self.falses == 0)),
_ => Err(InternalSnafu {
reason: format!(
"Bool Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates simple numeric values for sums over integers.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct SimpleNumber {
/// The accumulation of all non-NULL values observed.
accum: i128,
/// The number of non-NULL values observed.
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for SimpleNumber {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "Number Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
accum: Decimal128::try_from(iter.next().unwrap())
.map_err(err_try_from_val)?
.val(),
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for SimpleNumber {
fn into_state(self) -> Vec<Value> {
vec![
Value::Decimal128(Decimal128::new(self.accum, 38, 0)),
self.non_nulls.into(),
]
}
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64
),
InternalSnafu {
reason: format!(
"SimpleNumber Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
let v = match (aggr_fn, value) {
(AggregateFunc::SumInt16, Value::Int16(x)) => i128::from(x),
(AggregateFunc::SumInt32, Value::Int32(x)) => i128::from(x),
(AggregateFunc::SumInt64, Value::Int64(x)) => i128::from(x),
(AggregateFunc::SumUInt16, Value::UInt16(x)) => i128::from(x),
(AggregateFunc::SumUInt32, Value::UInt32(x)) => i128::from(x),
(AggregateFunc::SumUInt64, Value::UInt64(x)) => i128::from(x),
(_f, Value::Null) => return Ok(()), // ignore null
(f, v) => {
let expected_datatype = f.signature().input;
return Err(TypeMismatchSnafu {
expected: expected_datatype,
actual: v.data_type(),
}
.build())?;
}
};
self.accum += v * i128::from(diff);
self.non_nulls += diff;
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::SumInt16 | AggregateFunc::SumInt32 | AggregateFunc::SumInt64 => {
i64::try_from(self.accum)
.map_err(|_e| OverflowSnafu {}.build())
.map(Value::from)
}
AggregateFunc::SumUInt16 | AggregateFunc::SumUInt32 | AggregateFunc::SumUInt64 => {
u64::try_from(self.accum)
.map_err(|_e| OverflowSnafu {}.build())
.map(Value::from)
}
_ => Err(InternalSnafu {
reason: format!(
"SimpleNumber Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates float values for sums over floating-point numbers.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct Float {
/// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
/// accum will be set to zero if `non_nulls` is zero.
accum: OrderedF64,
/// Counts +inf
pos_infs: Diff,
/// Counts -inf
neg_infs: Diff,
/// Counts NaNs
nans: Diff,
/// Counts non-NULL values
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for Float {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 5,
InternalSnafu {
reason: "Float Accumulator state should have 5 values",
}
);
let mut iter = state.into_iter();
let mut ret = Self {
accum: OrderedF64::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
pos_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
neg_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
nans: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
};
// This prevents the counter-intuitive behavior of summing over no values
if ret.non_nulls == 0 {
ret.accum = OrderedFloat::from(0.0);
}
Ok(ret)
}
}
impl Accumulator for Float {
fn into_state(self) -> Vec<Value> {
vec![
self.accum.into(),
self.pos_infs.into(),
self.neg_infs.into(),
self.nans.into(),
self.non_nulls.into(),
]
}
/// Sum ignores nulls
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64
),
InternalSnafu {
reason: format!(
"Float Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
let x = match (aggr_fn, value) {
(AggregateFunc::SumFloat32, Value::Float32(x)) => OrderedF64::from(*x as f64),
(AggregateFunc::SumFloat64, Value::Float64(x)) => OrderedF64::from(x),
(_f, Value::Null) => return Ok(()), // ignore null
(f, v) => {
let expected_datatype = f.signature().input;
return Err(TypeMismatchSnafu {
expected: expected_datatype,
actual: v.data_type(),
}
.build())?;
}
};
if x.is_nan() {
self.nans += diff;
} else if x.is_infinite() {
if x.is_sign_positive() {
self.pos_infs += diff;
} else {
self.neg_infs += diff;
}
} else {
self.accum += *(x * OrderedF64::from(diff as f64));
}
self.non_nulls += diff;
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::SumFloat32 => Ok(Value::Float32(OrderedF32::from(self.accum.0 as f32))),
AggregateFunc::SumFloat64 => Ok(Value::Float64(self.accum)),
_ => Err(InternalSnafu {
reason: format!(
"Float Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates a single `Ord`ed `Value`, useful for min/max aggregations.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct OrdValue {
val: Option<Value>,
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for OrdValue {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "OrdValue Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
val: {
let v = iter.next().unwrap();
if v == Value::Null {
None
} else {
Some(v)
}
},
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for OrdValue {
fn into_state(self) -> Vec<Value> {
vec![self.val.unwrap_or(Value::Null), self.non_nulls.into()]
}
/// min/max look for a result among all non-null values; if all values are null, the result is null.
/// count(col_name) gives the number of non-null values, while count(*) gives the number of rows including nulls.
/// TODO(discord9): add count(*) as an aggr function
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
aggr_fn.is_max() || aggr_fn.is_min() || matches!(aggr_fn, AggregateFunc::Count),
InternalSnafu {
reason: format!(
"OrdValue Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
if diff <= 0 && (aggr_fn.is_max() || aggr_fn.is_min()) {
return Err(InternalSnafu {
reason: "OrdValue Accumulator does not support non-monotonic input for min/max aggregation".to_string(),
}.build());
}
// If aggr_fn is count, the incoming value's type doesn't matter for type checking;
// otherwise the types need to match, or the value must be null.
let check_type_aggr_fn_and_arg_value =
ty_eq_without_precision(value.data_type(), aggr_fn.signature().input)
|| matches!(aggr_fn, AggregateFunc::Count)
|| value.is_null();
let check_type_aggr_fn_and_self_val = self
.val
.as_ref()
.map(|zelf| ty_eq_without_precision(zelf.data_type(), aggr_fn.signature().input))
.unwrap_or(true)
|| matches!(aggr_fn, AggregateFunc::Count);
if !check_type_aggr_fn_and_arg_value {
return Err(TypeMismatchSnafu {
expected: aggr_fn.signature().input,
actual: value.data_type(),
}
.build());
} else if !check_type_aggr_fn_and_self_val {
return Err(TypeMismatchSnafu {
expected: aggr_fn.signature().input,
actual: self
.val
.as_ref()
.map(|v| v.data_type())
.unwrap_or(ConcreteDataType::null_datatype()),
}
.build());
}
// Nulls are ignored here; count(*) is compiled to count(true) so that nulls still get counted there.
if value.is_null() {
return Ok(());
}
// The count of non-null values is updated below.
self.non_nulls += diff;
match aggr_fn.signature().generic_fn {
GenericFn::Max => {
self.val = self
.val
.clone()
.map(|v| v.max(value.clone()))
.or_else(|| Some(value))
}
GenericFn::Min => {
self.val = self
.val
.clone()
.map(|v| v.min(value.clone()))
.or_else(|| Some(value))
}
GenericFn::Count => (),
_ => unreachable!("already checked by ensure!"),
}
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
if aggr_fn.is_max() || aggr_fn.is_min() {
Ok(self.val.clone().unwrap_or(Value::Null))
} else if matches!(aggr_fn, AggregateFunc::Count) {
Ok(self.non_nulls.into())
} else {
Err(InternalSnafu {
reason: format!(
"OrdValue Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build())
}
}
}
/// Accumulates values for the various types of accumulable aggregations.
///
/// We assume that there are not more than 2^32 elements for the aggregation.
@@ -38,34 +534,407 @@ use crate::repr::Diff;
/// The float accumulator performs accumulation with tolerance for floating point error.
///
/// TODO(discord9): check for overflowing
#[enum_dispatch(Accumulator)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Accum {
/// Accumulates boolean values.
Bool {
/// The number of `true` values observed.
trues: Diff,
/// The number of `false` values observed.
falses: Diff,
},
Bool(Bool),
/// Accumulates simple numeric values.
SimpleNumber {
/// The accumulation of all non-NULL values observed.
accum: i128,
/// The number of non-NULL values observed.
non_nulls: Diff,
},
SimpleNumber(SimpleNumber),
/// Accumulates float values.
Float {
/// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
/// accum will be set to zero if `non_nulls` is zero.
accum: OrderedF64,
/// Counts +inf
pos_infs: Diff,
/// Counts -inf
neg_infs: Diff,
/// Counts NaNs
nans: Diff,
/// Counts non-NULL values
non_nulls: Diff,
},
Float(Float),
/// Accumulates values that impl `Ord`
OrdValue(OrdValue),
}
impl Accum {
pub fn new_accum(aggr_fn: &AggregateFunc) -> Result<Self, EvalError> {
Ok(match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Self::from(Bool {
trues: 0,
falses: 0,
}),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Self::from(SimpleNumber {
accum: 0,
non_nulls: 0,
}),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => Self::from(Float {
accum: OrderedF64::from(0.0),
pos_infs: 0,
neg_infs: 0,
nans: 0,
non_nulls: 0,
}),
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Self::from(OrdValue {
val: None,
non_nulls: 0,
})
}
f => {
return Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build());
}
})
}
pub fn try_into_accum(aggr_fn: &AggregateFunc, state: Vec<Value>) -> Result<Self, EvalError> {
match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Ok(Self::from(Bool::try_from(state)?)),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Ok(Self::from(SimpleNumber::try_from(state)?)),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => {
Ok(Self::from(Float::try_from(state)?))
}
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Ok(Self::from(OrdValue::try_from(state)?))
}
f => Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build()),
}
}
}
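The round trip these constructors enable mirrors what the tests further down exercise: build an accumulator, feed it `(value, diff)` pairs, evaluate, then persist and restore its state. A minimal sketch (illustrative, not part of the change; assumes a `Result<_, EvalError>` context for `?`):

```rust
// Values follow the MaxInt32 test case below.
let aggr = AggregateFunc::MaxInt32;
let mut acc = Accum::new_accum(&aggr)?;
acc.update_batch(&aggr, vec![(Value::Int32(1), 1), (Value::Int32(2), 1)])?;
assert_eq!(acc.eval(&aggr)?, Value::Int32(2));
let state: Vec<Value> = acc.into_state(); // plain `Value`s, cheap to store and ship
let restored = Accum::try_into_accum(&aggr, state)?;
assert_eq!(restored.eval(&aggr)?, Value::Int32(2));
```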
fn err_try_from_val<T: Display>(reason: T) -> EvalError {
TryFromValueSnafu {
msg: reason.to_string(),
}
.build()
}
/// Compares types while ignoring their precision, including `Timestamp`, `Time`,
/// `Duration`, and `Interval`
fn ty_eq_without_precision(left: ConcreteDataType, right: ConcreteDataType) -> bool {
left == right
|| matches!(left, ConcreteDataType::Timestamp(..))
&& matches!(right, ConcreteDataType::Timestamp(..))
|| matches!(left, ConcreteDataType::Time(..)) && matches!(right, ConcreteDataType::Time(..))
|| matches!(left, ConcreteDataType::Duration(..))
&& matches!(right, ConcreteDataType::Duration(..))
|| matches!(left, ConcreteDataType::Interval(..))
&& matches!(right, ConcreteDataType::Interval(..))
}
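A quick illustration of the helper above, using `ConcreteDataType` constructors that already appear elsewhere in this diff (illustrative snippet only):

```rust
// Precision is ignored within a type family, but different families stay distinct.
assert!(ty_eq_without_precision(
    ConcreteDataType::timestamp_second_datatype(),
    ConcreteDataType::timestamp_millisecond_datatype(),
));
assert!(!ty_eq_without_precision(
    ConcreteDataType::time_second_datatype(),
    ConcreteDataType::duration_second_datatype(),
));
```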
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_accum() {
let testcases = vec![
(
AggregateFunc::SumInt32,
vec![(Value::Int32(1), 1), (Value::Null, 1)],
(
Value::Int64(1),
vec![Value::Decimal128(Decimal128::new(1, 38, 0)), 1i64.into()],
),
),
(
AggregateFunc::SumFloat32,
vec![(Value::Float32(OrderedF32::from(1.0)), 1), (Value::Null, 1)],
(
Value::Float32(OrderedF32::from(1.0)),
vec![
Value::Float64(OrderedF64::from(1.0)),
0i64.into(),
0i64.into(),
0i64.into(),
1i64.into(),
],
),
),
(
AggregateFunc::MaxInt32,
vec![(Value::Int32(1), 1), (Value::Int32(2), 1), (Value::Null, 1)],
(Value::Int32(2), vec![Value::Int32(2), 2i64.into()]),
),
(
AggregateFunc::MinInt32,
vec![(Value::Int32(2), 1), (Value::Int32(1), 1), (Value::Null, 1)],
(Value::Int32(1), vec![Value::Int32(1), 2i64.into()]),
),
(
AggregateFunc::MaxFloat32,
vec![
(Value::Float32(OrderedF32::from(1.0)), 1),
(Value::Float32(OrderedF32::from(2.0)), 1),
(Value::Null, 1),
],
(
Value::Float32(OrderedF32::from(2.0)),
vec![Value::Float32(OrderedF32::from(2.0)), 2i64.into()],
),
),
(
AggregateFunc::MaxDateTime,
vec![
(Value::DateTime(DateTime::from(0)), 1),
(Value::DateTime(DateTime::from(1)), 1),
(Value::Null, 1),
],
(
Value::DateTime(DateTime::from(1)),
vec![Value::DateTime(DateTime::from(1)), 2i64.into()],
),
),
(
AggregateFunc::Count,
vec![
(Value::Int32(1), 1),
(Value::Int32(2), 1),
(Value::Null, 1),
(Value::Null, 1),
],
(2i64.into(), vec![Value::Null, 2i64.into()]),
),
(
AggregateFunc::Any,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(true),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::All,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(false),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::MaxBool,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(true),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::MinBool,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(false),
vec![Value::from(1i64), Value::from(2i64)],
),
),
];
for (aggr_fn, input, (eval_res, state)) in testcases {
let create_and_insert = || -> Result<Accum, EvalError> {
let mut acc = Accum::new_accum(&aggr_fn)?;
acc.update_batch(&aggr_fn, input.clone())?;
let row = acc.into_state();
let acc = Accum::try_into_accum(&aggr_fn, row)?;
Ok(acc)
};
let acc = match create_and_insert() {
Ok(acc) => acc,
Err(err) => panic!(
"Failed to create accum for {:?} with input {:?} with error: {:?}",
aggr_fn, input, err
),
};
if acc.eval(&aggr_fn).unwrap() != eval_res {
panic!(
"Failed to eval accum for {:?} with input {:?}, expect {:?}, got {:?}",
aggr_fn,
input,
eval_res,
acc.eval(&aggr_fn).unwrap()
);
}
let actual_state = acc.into_state();
if actual_state != state {
panic!(
"Failed to cast into state from accum for {:?} with input {:?}, expect state {:?}, got state {:?}",
aggr_fn,
input,
state,
actual_state
);
}
}
}
#[test]
fn test_fail_path_accum() {
{
let bool_accum = Bool::try_from(vec![Value::Null]);
assert!(matches!(bool_accum, Err(EvalError::Internal { .. })));
}
{
let mut bool_accum = Bool::try_from(vec![1i64.into(), 1i64.into()]).unwrap();
// serde
let bool_accum_serde = serde_json::to_string(&bool_accum).unwrap();
let bool_accum_de = serde_json::from_str::<Bool>(&bool_accum_serde).unwrap();
assert_eq!(bool_accum, bool_accum_de);
assert!(matches!(
bool_accum.update(&AggregateFunc::MaxDate, 1.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
bool_accum.update(&AggregateFunc::Any, 1.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
bool_accum.eval(&AggregateFunc::MaxDate),
Err(EvalError::Internal { .. })
));
}
{
let ret = SimpleNumber::try_from(vec![Value::Null]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum =
SimpleNumber::try_from(vec![Decimal128::new(0, 38, 0).into(), 0i64.into()])
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::SumInt64, 0i32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
accum.eval(&AggregateFunc::All),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::SumInt64, 1i64.into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumInt64, i64::MAX.into(), 1)
.unwrap();
assert!(matches!(
accum.eval(&AggregateFunc::SumInt64),
Err(EvalError::Overflow { .. })
));
}
{
let ret = Float::try_from(vec![2f64.into(), 0i64.into(), 0i64.into(), 0i64.into()]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum = Float::try_from(vec![
2f64.into(),
0i64.into(),
0i64.into(),
0i64.into(),
1i64.into(),
])
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, 2f64.into(), -1)
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::SumFloat64, 0.0f32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
// no record, no accum
assert_eq!(
accum.eval(&AggregateFunc::SumFloat64).unwrap(),
0.0f64.into()
);
assert!(matches!(
accum.eval(&AggregateFunc::All),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::SumFloat64, f64::INFINITY.into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, (-f64::INFINITY).into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, f64::NAN.into(), 1)
.unwrap();
}
{
let ret = OrdValue::try_from(vec![Value::Null]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::MaxInt16, 1i16.into(), 1)
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::MaxInt16, 0i32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::MaxInt16, 0i16.into(), -1),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::MaxInt16, Value::Null, 1)
.unwrap();
}
// insert uint64 into max_int64 should fail
{
let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
assert!(matches!(
accum.update(&AggregateFunc::MaxInt64, 0u64.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
}
}
}

View File

@@ -12,15 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::type_name;
use common_time::{Date, DateTime};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use serde::{Deserialize, Serialize};
use crate::expr::error::{EvalError, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::relation::accum::Accum;
use crate::expr::relation::accum::{Accum, Accumulator};
use crate::repr::Diff;
/// Aggregate functions that can be applied to a group of rows.
@@ -83,3 +81,280 @@ pub enum AggregateFunc {
Any,
All,
}
impl AggregateFunc {
pub fn is_max(&self) -> bool {
self.signature().generic_fn == GenericFn::Max
}
pub fn is_min(&self) -> bool {
self.signature().generic_fn == GenericFn::Min
}
pub fn is_sum(&self) -> bool {
self.signature().generic_fn == GenericFn::Sum
}
/// Evaluates `(value, diff)` pairs against an accumulator.
///
/// Expects `self` to be an accumulable aggregate function, i.e. sum/count.
///
/// TODO(discord9): deal with overflow & a better accumulator
pub fn eval_diff_accumulable<I>(
&self,
accum: Vec<Value>,
value_diffs: I,
) -> Result<(Value, Vec<Value>), EvalError>
where
I: IntoIterator<Item = (Value, Diff)>,
{
let mut accum = if accum.is_empty() {
Accum::new_accum(self)?
} else {
Accum::try_into_accum(self, accum)?
};
accum.update_batch(self, value_diffs)?;
let res = accum.eval(self)?;
Ok((res, accum.into_state()))
}
}
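A hedged usage sketch of `eval_diff_accumulable`, modeled on the `SumInt32` case in the accumulator tests (not part of the change; assumes a `Result<_, EvalError>` context for `?`). An empty `accum` vector means "start from a fresh accumulator", and the returned state can be fed back to continue the aggregation incrementally:

```rust
// First call starts from an empty state.
let (result, state) = AggregateFunc::SumInt32
    .eval_diff_accumulable(vec![], vec![(Value::Int32(1), 1), (Value::Null, 1)])?;
assert_eq!(result, Value::Int64(1)); // same expected value as in the accum tests
// Feeding the persisted state back accumulates further updates: 1 + 2.
let (result, _state) =
    AggregateFunc::SumInt32.eval_diff_accumulable(state, vec![(Value::Int32(2), 1)])?;
assert_eq!(result, Value::Int64(3));
```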
pub struct Signature {
pub input: ConcreteDataType,
pub output: ConcreteDataType,
pub generic_fn: GenericFn,
}
#[derive(Debug, PartialEq, Eq)]
pub enum GenericFn {
Max,
Min,
Sum,
Count,
Any,
All,
}
impl AggregateFunc {
/// All concrete datatypes with precision are returned as their largest possible variant.
/// As an exception, count has a signature of `null -> i64`, but it's actually `anytype -> i64`.
pub fn signature(&self) -> Signature {
match self {
AggregateFunc::MaxInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxBool => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxString => Signature {
input: ConcreteDataType::string_datatype(),
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDate => Signature {
input: ConcreteDataType::date_datatype(),
output: ConcreteDataType::date_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDateTime => Signature {
input: ConcreteDataType::datetime_datatype(),
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxTimestamp => Signature {
input: ConcreteDataType::timestamp_second_datatype(),
output: ConcreteDataType::timestamp_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxTime => Signature {
input: ConcreteDataType::time_second_datatype(),
output: ConcreteDataType::time_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDuration => Signature {
input: ConcreteDataType::duration_second_datatype(),
output: ConcreteDataType::duration_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInterval => Signature {
input: ConcreteDataType::interval_year_month_datatype(),
output: ConcreteDataType::interval_year_month_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MinInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinBool => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinString => Signature {
input: ConcreteDataType::string_datatype(),
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDate => Signature {
input: ConcreteDataType::date_datatype(),
output: ConcreteDataType::date_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDateTime => Signature {
input: ConcreteDataType::datetime_datatype(),
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinTimestamp => Signature {
input: ConcreteDataType::timestamp_second_datatype(),
output: ConcreteDataType::timestamp_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinTime => Signature {
input: ConcreteDataType::time_second_datatype(),
output: ConcreteDataType::time_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDuration => Signature {
input: ConcreteDataType::duration_second_datatype(),
output: ConcreteDataType::duration_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInterval => Signature {
input: ConcreteDataType::interval_year_month_datatype(),
output: ConcreteDataType::interval_year_month_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::SumInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::Count => Signature {
input: ConcreteDataType::null_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Count,
},
AggregateFunc::Any => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Any,
},
AggregateFunc::All => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::All,
},
}
}
}
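For reference, a tiny check of the `Count` arm above (illustrative snippet, not part of the change): the declared input type is `null`, even though the type checks special-case `Count` so that any value type is counted at runtime.

```rust
let sig = AggregateFunc::Count.signature();
assert_eq!(sig.input, ConcreteDataType::null_datatype());
assert_eq!(sig.output, ConcreteDataType::int64_datatype());
assert!(matches!(sig.generic_fn, GenericFn::Count));
```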

View File

@@ -17,4 +17,5 @@
// allow unused for now because it should be use later
mod adapter;
mod expr;
mod plan;
mod repr;

src/flow/src/plan.rs Normal file
View File

@@ -0,0 +1,98 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! This module contains basic definitions for dataflow plans
//! that can be translated to hydro dataflows
mod join;
mod reduce;
use serde::{Deserialize, Serialize};
pub(crate) use self::reduce::{AccumulablePlan, KeyValPlan, ReducePlan};
use crate::expr::{
AggregateExpr, EvalError, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
};
use crate::plan::join::JoinPlan;
use crate::repr::{DiffRow, RelationType};
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct TypedPlan {
/// output type of the relation
pub typ: RelationType,
pub plan: Plan,
}
/// TODO(discord9): support `TableFunc` by defining a FlatMap that maps 1 row to n rows
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum Plan {
/// A constant collection of rows.
Constant { rows: Vec<DiffRow> },
/// Gets CDC data from a source, be it an external reference to an existing source or an internal
/// reference to a `Let` identifier
Get { id: Id },
/// Creates a temporary collection from the given `value`, and makes this binding
/// available only in the scope of `body`
Let {
id: LocalId,
value: Box<Plan>,
body: Box<Plan>,
},
/// Map, Filter, and Project operators.
Mfp {
/// The input collection.
input: Box<Plan>,
/// Linear operator to apply to each record.
mfp: MapFilterProject,
},
/// Reduce operator, aggregating by keys assembled from the `KeyValPlan`
Reduce {
/// The input collection.
input: Box<Plan>,
/// A plan for changing input records into key, value pairs.
key_val_plan: KeyValPlan,
/// A plan for performing the reduce.
///
/// The implementation of reduction has several different strategies based
/// on the properties of the reduction, and the input itself.
reduce_plan: ReducePlan,
},
/// A multiway relational equijoin, with fused map, filter, and projection.
///
/// This stage performs a multiway join among `inputs`, using the equality
/// constraints expressed in `plan`. The plan also describes the implementation
/// strategy we will use, and any pushed down per-record work.
Join {
/// An ordered list of inputs that will be joined.
inputs: Vec<Plan>,
/// Detailed information about the implementation of the join.
///
/// This includes information about the implementation strategy, but also
/// any map, filter, project work that we might follow the join with, but
/// potentially pushed down into the implementation of the join.
plan: JoinPlan,
},
/// Adds the contents of the input collections.
///
/// Importantly, this is *multiset* union, so the multiplicities of records will
/// add. This is in contrast to *set* union, where the multiplicities would be
/// capped at one. A set union can be formed with `Union` followed by `Reduce`
/// implementing the "distinct" operator.
Union {
/// The input collections
inputs: Vec<Plan>,
/// Whether to consolidate the output, e.g., cancel negated records.
consolidate_output: bool,
},
}
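A minimal, illustrative construction (not from this change) showing how the multiset `Union` variant composes; the empty `rows` vectors sidestep the `DiffRow` details:

```rust
// Assumes `use crate::plan::Plan;`. With `consolidate_output: true`, records
// whose diffs sum to zero are expected to cancel out in the output.
let _plan = Plan::Union {
    inputs: vec![
        Plan::Constant { rows: vec![] },
        Plan::Constant { rows: vec![] },
    ],
    consolidate_output: true,
};
```

As the doc comment notes, following such a `Union` with a `Reduce` whose plan is `ReducePlan::Distinct` yields a set union.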

src/flow/src/plan/join.rs Normal file
View File

@@ -0,0 +1,78 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;
use crate::plan::SafeMfpPlan;
/// TODO(discord9): consider impl more join strategies
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub enum JoinPlan {
Linear(LinearJoinPlan),
}
/// Determines if a given row should stay in the output, and applies a map/filter/project before outputting the row
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct JoinFilter {
/// Each element in the outer vector checks whether every expr within it evaluates to the same value;
/// if not, the row is filtered out. Useful for equi-joins (joins based on equality of some columns)
pub ready_equivalences: Vec<Vec<ScalarExpr>>,
/// Applies a map/filter/project before outputting the row
pub before: SafeMfpPlan,
}
/// A plan for the execution of a linear join.
///
/// A linear join is a sequence of stages, each of which introduces
/// a new collection. Each stage is represented by a [LinearStagePlan].
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearJoinPlan {
/// The source relation from which we start the join.
pub source_relation: usize,
/// The arrangement to use for the source relation, if any
pub source_key: Option<Vec<ScalarExpr>>,
/// An initial closure to apply before any stages.
///
/// Values of `None` indicate the identity closure.
pub initial_closure: Option<JoinFilter>,
/// A *sequence* of stages to apply one after the other.
pub stage_plans: Vec<LinearStagePlan>,
/// A concluding filter to apply after the last stage.
///
/// Values of `None` indicate the identity closure.
pub final_closure: Option<JoinFilter>,
}
/// A plan for the execution of one stage of a linear join.
///
/// Each stage is a binary join between the current accumulated
/// join results, and a new collection. The former is referred to
/// as the "stream" and the latter the "lookup".
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearStagePlan {
/// The index of the relation into which we will look up.
pub lookup_relation: usize,
/// The key expressions to use for the stream relation.
pub stream_key: Vec<ScalarExpr>,
/// Columns to retain from the stream relation.
/// These columns are those that are not redundant with `stream_key`,
/// and cannot be read out of the key component of an arrangement.
pub stream_thinning: Vec<usize>,
/// The key expressions to use for the lookup relation.
pub lookup_key: Vec<ScalarExpr>,
/// The closure to apply to the concatenation of the key columns,
/// the stream value columns, and the lookup value columns.
pub closure: JoinFilter,
}

View File

@@ -0,0 +1,50 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::{AggregateExpr, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr};
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct KeyValPlan {
pub key_plan: SafeMfpPlan,
pub val_plan: SafeMfpPlan,
}
/// TODO(discord9): define & implement hierarchical aggregates (for min/max with support for deletion),
/// basic aggregates (for other aggregate functions), and mixed aggregates
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum ReducePlan {
/// Plan for not computing any aggregations, just determining the set of
/// distinct keys.
Distinct,
/// Plan for computing only accumulable aggregations.
/// Includes simple functions like `sum`, `count`, and `min`/`max` (without deletion)
Accumulable(AccumulablePlan),
}
/// Accumulable plan for the execution of a reduction.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct AccumulablePlan {
/// All of the aggregations we were asked to compute, stored
/// in order.
pub full_aggrs: Vec<AggregateExpr>,
/// All of the non-distinct accumulable aggregates.
/// Each element represents:
/// (index of aggr output, index of value among inputs, aggr expr)
/// These will all be rendered together in one dataflow fragment.
pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
/// Same as above but for all of the `DISTINCT` accumulable aggregations.
pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
}

View File

@@ -33,7 +33,10 @@ use snafu::ResultExt;
use crate::expr::error::{CastValueSnafu, EvalError};
/// System-wide Record count difference type.
/// System-wide Record count difference type. Useful for capturing data changes
///
/// i.e. +1 means insert one record, -1 means remove,
/// and +/-n means insert/remove multiple duplicate records.
pub type Diff = i64;
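To make the convention concrete, here is a toy consolidation routine (a hypothetical, self-contained sketch, not part of this crate) that sums diffs per row and drops rows whose total multiplicity reaches zero:

```rust
use std::collections::HashMap;

/// Sums diffs per row key; a row whose diffs cancel out disappears entirely.
fn consolidate(updates: &[(&str, i64)]) -> HashMap<String, i64> {
    let mut acc: HashMap<String, i64> = HashMap::new();
    for &(row, diff) in updates {
        *acc.entry(row.to_string()).or_insert(0) += diff;
    }
    acc.retain(|_, d| *d != 0);
    acc
}

fn main() {
    // ("a", +2) then ("a", -2) cancel; ("b", +1) survives with multiplicity 1.
    let out = consolidate(&[("a", 2), ("a", -2), ("b", 1)]);
    assert_eq!(out.len(), 1);
    assert_eq!(out["b"], 1);
}
```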
/// System-wide default timestamp type

View File

@@ -28,6 +28,7 @@ use api::v1::meta::Role;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use catalog::CatalogManagerRef;
use client::OutputData;
use common_base::Plugins;
use common_config::KvBackendConfig;
use common_error::ext::BoxedError;
@@ -39,14 +40,16 @@ use common_procedure::local::{LocalManager, ManagerConfig};
use common_procedure::options::ProcedureConfig;
use common_procedure::ProcedureManagerRef;
use common_query::Output;
use common_telemetry::error;
use common_telemetry::logging::info;
use common_telemetry::{error, tracing};
use log_store::raft_engine::RaftEngineBackend;
use meta_client::client::{MetaClient, MetaClientBuilder};
use meta_client::MetaClientOptions;
use operator::delete::DeleterRef;
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prometheus::HistogramTimer;
use query::metrics::OnDone;
use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
use query::plan::LogicalPlan;
use query::query_engine::options::{validate_catalog_and_schema, QueryOptions};
@@ -85,7 +88,6 @@ use crate::error::{
};
use crate::frontend::{FrontendOptions, TomlSerializable};
use crate::heartbeat::HeartbeatTask;
use crate::metrics;
use crate::script::ScriptExecutor;
#[async_trait]
@@ -275,8 +277,8 @@ impl Instance {
impl SqlQueryHandler for Instance {
type Error = Error;
#[tracing::instrument(skip_all)]
async fn do_query(&self, query: &str, query_ctx: QueryContextRef) -> Vec<Result<Output>> {
let _timer = metrics::METRIC_HANDLE_SQL_ELAPSED.start_timer();
let query_interceptor_opt = self.plugins.get::<SqlQueryInterceptorRef<Error>>();
let query_interceptor = query_interceptor_opt.as_ref();
let query = match query_interceptor.pre_parsing(query, query_ctx.clone()) {
@@ -336,7 +338,6 @@ impl SqlQueryHandler for Instance {
}
async fn do_exec_plan(&self, plan: LogicalPlan, query_ctx: QueryContextRef) -> Result<Output> {
let _timer = metrics::METRIC_EXEC_PLAN_ELAPSED.start_timer();
// plan should be prepared before exec
// we'll do check there
self.query_engine
@@ -345,6 +346,7 @@ impl SqlQueryHandler for Instance {
.context(ExecLogicalPlanSnafu)
}
#[tracing::instrument(skip_all)]
async fn do_promql_query(
&self,
query: &PromQuery,
@@ -398,14 +400,27 @@ impl SqlQueryHandler for Instance {
}
}
/// Attaches a timer to the output and observes it once the output is exhausted.
pub fn attach_timer(output: Output, timer: HistogramTimer) -> Output {
match output.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => output,
OutputData::Stream(stream) => {
let stream = OnDone::new(stream, move || {
timer.observe_duration();
});
Output::new(OutputData::Stream(Box::pin(stream)), output.meta)
}
}
}
#[async_trait]
impl PrometheusHandler for Instance {
#[tracing::instrument(skip_all)]
async fn do_query(
&self,
query: &PromQuery,
query_ctx: QueryContextRef,
) -> server_error::Result<Output> {
let _timer = metrics::METRIC_HANDLE_PROMQL_ELAPSED.start_timer();
let interceptor = self
.plugins
.get::<PromQueryInterceptorRef<server_error::Error>>();

View File

@@ -20,6 +20,7 @@ use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_meta::table_name::TableName;
use common_query::Output;
use common_telemetry::tracing;
use query::parser::PromQuery;
use servers::interceptor::{GrpcQueryInterceptor, GrpcQueryInterceptorRef};
use servers::query_handler::grpc::GrpcQueryHandler;
@@ -31,7 +32,8 @@ use crate::error::{
Error, IncompleteGrpcRequestSnafu, NotSupportedSnafu, PermissionSnafu, Result,
TableOperationSnafu,
};
use crate::instance::Instance;
use crate::instance::{attach_timer, Instance};
use crate::metrics::{GRPC_HANDLE_PROMQL_ELAPSED, GRPC_HANDLE_SQL_ELAPSED};
#[async_trait]
impl GrpcQueryHandler for Instance {
@@ -59,6 +61,7 @@ impl GrpcQueryHandler for Instance {
})?;
match query {
Query::Sql(sql) => {
let timer = GRPC_HANDLE_SQL_ELAPSED.start_timer();
let mut result = SqlQueryHandler::do_query(self, &sql, ctx.clone()).await;
ensure!(
result.len() == 1,
@@ -66,7 +69,8 @@ impl GrpcQueryHandler for Instance {
feat: "execute multiple statements in SQL query string through GRPC interface"
}
);
result.remove(0)?
let output = result.remove(0)?;
attach_timer(output, timer)
}
Query::LogicalPlan(_) => {
return NotSupportedSnafu {
@@ -75,6 +79,7 @@ impl GrpcQueryHandler for Instance {
.fail();
}
Query::PromRangeQuery(promql) => {
let timer = GRPC_HANDLE_PROMQL_ELAPSED.start_timer();
let prom_query = PromQuery {
query: promql.query,
start: promql.start,
@@ -89,7 +94,8 @@ impl GrpcQueryHandler for Instance {
feat: "execute multiple statements in PromQL query string through GRPC interface"
}
);
result.remove(0)?
let output = result.remove(0)?;
attach_timer(output, timer)
}
}
}
@@ -107,7 +113,7 @@ impl GrpcQueryHandler for Instance {
.statement_executor
.create_table_inner(&mut expr, None, &ctx)
.await?;
Output::AffectedRows(0)
Output::new_with_affected_rows(0)
}
DdlExpr::Alter(expr) => self.statement_executor.alter_table_inner(expr).await?,
DdlExpr::CreateDatabase(expr) => {
@@ -173,6 +179,7 @@ fn fill_catalog_and_schema_from_context(ddl_expr: &mut DdlExpr, ctx: &QueryConte
}
impl Instance {
#[tracing::instrument(skip_all)]
pub async fn handle_inserts(
&self,
requests: InsertRequests,
@@ -184,6 +191,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_row_inserts(
&self,
requests: RowInsertRequests,
@@ -195,6 +203,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_metric_row_inserts(
&self,
requests: RowInsertRequests,
@@ -207,6 +216,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_deletes(
&self,
requests: DeleteRequests,
@@ -218,6 +228,7 @@ impl Instance {
.context(TableOperationSnafu)
}
#[tracing::instrument(skip_all)]
pub async fn handle_row_deletes(
&self,
requests: RowDeleteRequests,

View File

@@ -15,8 +15,9 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use servers::error::AuthSnafu;
use servers::error::{AuthSnafu, Error};
use servers::influxdb::InfluxdbRequest;
use servers::interceptor::{LineProtocolInterceptor, LineProtocolInterceptorRef};
use servers::query_handler::InfluxdbLineProtocolHandler;
use session::context::QueryContextRef;
use snafu::ResultExt;
@@ -36,6 +37,9 @@ impl InfluxdbLineProtocolHandler for Instance {
.check_permission(ctx.current_user(), PermissionReq::LineProtocol)
.context(AuthSnafu)?;
let interceptor_ref = self.plugins.get::<LineProtocolInterceptorRef<Error>>();
interceptor_ref.pre_execute(&request.lines, ctx.clone())?;
let requests = request.try_into()?;
let _ = self
.handle_row_inserts(requests, ctx)

View File

@@ -15,6 +15,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use servers::error as server_error;
use servers::error::AuthSnafu;
use servers::opentsdb::codec::DataPoint;
@@ -27,6 +28,7 @@ use crate::instance::Instance;
#[async_trait]
impl OpentsdbProtocolHandler for Instance {
#[tracing::instrument(skip_all, fields(protocol = "opentsdb"))]
async fn exec(
&self,
data_points: Vec<DataPoint>,
@@ -45,8 +47,8 @@ impl OpentsdbProtocolHandler for Instance {
.map_err(BoxedError::new)
.context(servers::error::ExecuteGrpcQuerySnafu)?;
Ok(match output {
common_query::Output::AffectedRows(rows) => rows,
Ok(match output.data {
common_query::OutputData::AffectedRows(rows) => rows,
_ => unreachable!(),
})
}

View File

@@ -15,6 +15,7 @@
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_error::ext::BoxedError;
use common_telemetry::tracing;
use opentelemetry_proto::tonic::collector::metrics::v1::{
ExportMetricsServiceRequest, ExportMetricsServiceResponse,
};
@@ -22,6 +23,7 @@ use opentelemetry_proto::tonic::collector::trace::v1::{
ExportTraceServiceRequest, ExportTraceServiceResponse,
};
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
use servers::otlp;
use servers::otlp::plugin::TraceParserRef;
use servers::query_handler::OpenTelemetryProtocolHandler;
@@ -33,6 +35,7 @@ use crate::metrics::{OTLP_METRICS_ROWS, OTLP_TRACES_ROWS};
#[async_trait]
impl OpenTelemetryProtocolHandler for Instance {
#[tracing::instrument(skip_all)]
async fn metrics(
&self,
request: ExportMetricsServiceRequest,
@@ -43,6 +46,12 @@ impl OpenTelemetryProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request)?;
let _ = self
.handle_row_inserts(requests, ctx)
@@ -59,6 +68,7 @@ impl OpenTelemetryProtocolHandler for Instance {
Ok(resp)
}
#[tracing::instrument(skip_all)]
async fn traces(
&self,
request: ExportTraceServiceRequest,
@@ -70,6 +80,11 @@ impl OpenTelemetryProtocolHandler for Instance {
.check_permission(ctx.current_user(), PermissionReq::Otlp)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_execute(ctx.clone())?;
let (table_name, spans) = match self.plugins.get::<TraceParserRef>() {
Some(parser) => (parser.table_name(), parser.parse(request)),
None => (

View File

@@ -16,19 +16,22 @@ use std::sync::Arc;
use api::prom_store::remote::read_request::ResponseType;
use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, WriteRequest};
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::OutputData;
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
use common_query::Output;
use common_recordbatch::RecordBatches;
use common_telemetry::logging;
use common_telemetry::{logging, tracing};
use operator::insert::InserterRef;
use operator::statement::StatementExecutor;
use prost::Message;
use servers::error::{self, AuthSnafu, Result as ServerResult};
use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
use servers::interceptor::{PromStoreProtocolInterceptor, PromStoreProtocolInterceptorRef};
use servers::prom_store::{self, Metrics};
use servers::query_handler::{
PromStoreProtocolHandler, PromStoreProtocolHandlerRef, PromStoreResponse,
@@ -75,7 +78,7 @@ fn negotiate_response_type(accepted_response_types: &[i32]) -> ServerResult<Resp
}
async fn to_query_result(table_name: &str, output: Output) -> ServerResult<QueryResult> {
let Output::Stream(stream, _) = output else {
let OutputData::Stream(stream) = output.data else {
unreachable!()
};
let recordbatches = RecordBatches::try_collect(stream)
@@ -87,6 +90,7 @@ async fn to_query_result(table_name: &str, output: Output) -> ServerResult<Query
}
impl Instance {
#[tracing::instrument(skip_all)]
async fn handle_remote_query(
&self,
ctx: &QueryContextRef,
@@ -126,6 +130,7 @@ impl Instance {
.context(ExecLogicalPlanSnafu)
}
#[tracing::instrument(skip_all)]
async fn handle_remote_queries(
&self,
ctx: QueryContextRef,
@@ -166,8 +171,12 @@ impl PromStoreProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_write(&request, ctx.clone())?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(request)?;
let (requests, samples) = prom_store::to_grpc_row_insert_requests(&request)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
@@ -190,6 +199,38 @@ impl PromStoreProtocolHandler for Instance {
Ok(())
}
async fn write_fast(
&self,
request: RowInsertRequests,
ctx: QueryContextRef,
with_metric_engine: bool,
) -> ServerResult<()> {
self.plugins
.get::<PermissionCheckerRef>()
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreWrite)
.context(AuthSnafu)?;
if with_metric_engine {
let physical_table = ctx
.extension(PHYSICAL_TABLE_PARAM)
.unwrap_or(GREPTIME_PHYSICAL_TABLE)
.to_string();
let _ = self
.handle_metric_row_inserts(request, ctx.clone(), physical_table.to_string())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
} else {
let _ = self
.handle_row_inserts(request, ctx.clone())
.await
.map_err(BoxedError::new)
.context(error::ExecuteGrpcQuerySnafu)?;
}
Ok(())
}
async fn read(
&self,
request: ReadRequest,
@@ -200,6 +241,10 @@ impl PromStoreProtocolHandler for Instance {
.as_ref()
.check_permission(ctx.current_user(), PermissionReq::PromStoreRead)
.context(AuthSnafu)?;
let interceptor_ref = self
.plugins
.get::<PromStoreProtocolInterceptorRef<servers::error::Error>>();
interceptor_ref.pre_read(&request, ctx.clone())?;
let response_type = negotiate_response_type(&request.accepted_response_types)?;
@@ -265,7 +310,7 @@ impl PromStoreProtocolHandler for ExportMetricHandler {
ctx: QueryContextRef,
_: bool,
) -> ServerResult<()> {
let (requests, _) = prom_store::to_grpc_row_insert_requests(request)?;
let (requests, _) = prom_store::to_grpc_row_insert_requests(&request)?;
self.inserter
.handle_metric_row_inserts(
requests,
@@ -279,6 +324,15 @@ impl PromStoreProtocolHandler for ExportMetricHandler {
Ok(())
}
async fn write_fast(
&self,
_request: RowInsertRequests,
_ctx: QueryContextRef,
_with_metric_engine: bool,
) -> ServerResult<()> {
unimplemented!()
}
async fn read(
&self,
_request: ReadRequest,

View File

@@ -16,6 +16,8 @@ use std::collections::HashMap;
use async_trait::async_trait;
use common_query::Output;
use servers::error::Error;
use servers::interceptor::{ScriptInterceptor, ScriptInterceptorRef};
use servers::query_handler::ScriptHandler;
use session::context::QueryContextRef;
@@ -30,7 +32,10 @@ impl ScriptHandler for Instance {
name: &str,
script: &str,
) -> servers::error::Result<()> {
let _timer = metrics::METRIC_HANDLE_SCRIPTS_ELAPSED.start_timer();
let interceptor_ref = self.plugins.get::<ScriptInterceptorRef<Error>>();
interceptor_ref.pre_execute(name, query_ctx.clone())?;
let _timer = metrics::INSERT_SCRIPTS_ELAPSED.start_timer();
self.script_executor
.insert_script(query_ctx, name, script)
.await
@@ -42,7 +47,10 @@ impl ScriptHandler for Instance {
name: &str,
params: HashMap<String, String>,
) -> servers::error::Result<Output> {
let _timer = metrics::METRIC_RUN_SCRIPT_ELAPSED.start_timer();
let interceptor_ref = self.plugins.get::<ScriptInterceptorRef<Error>>();
interceptor_ref.pre_execute(name, query_ctx.clone())?;
let _timer = metrics::EXECUTE_SCRIPT_ELAPSED.start_timer();
self.script_executor
.execute_script(query_ctx, name, params)
.await

View File

@@ -16,22 +16,32 @@ use lazy_static::lazy_static;
use prometheus::*;
lazy_static! {
pub static ref METRIC_HANDLE_SQL_ELAPSED: Histogram =
register_histogram!("greptime_frontend_handle_sql_elapsed", "frontend handle sql elapsed").unwrap();
pub static ref METRIC_HANDLE_PROMQL_ELAPSED: Histogram = register_histogram!(
"greptime_frontend_handle_promql_elapsed",
"frontend handle promql elapsed"
/// Timer of handling query in RPC handler.
pub static ref GRPC_HANDLE_QUERY_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_frontend_grpc_handle_query_elapsed",
"Elapsed time of handling queries in RPC handler",
&["type"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_EXEC_PLAN_ELAPSED: Histogram =
register_histogram!("greptime_frontend_exec_plan_elapsed", "frontend exec plan elapsed").unwrap();
pub static ref METRIC_HANDLE_SCRIPTS_ELAPSED: Histogram = register_histogram!(
"greptime_frontend_handle_scripts_elapsed",
"frontend handle scripts elapsed"
pub static ref GRPC_HANDLE_SQL_ELAPSED: Histogram = GRPC_HANDLE_QUERY_ELAPSED
.with_label_values(&["sql"]);
pub static ref GRPC_HANDLE_PROMQL_ELAPSED: Histogram = GRPC_HANDLE_QUERY_ELAPSED
.with_label_values(&["promql"]);
/// Timer of handling scripts in the script handler.
pub static ref HANDLE_SCRIPT_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_frontend_handle_script_elapsed",
"Elapsed time of handling scripts in the script handler",
&["type"],
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
pub static ref METRIC_RUN_SCRIPT_ELAPSED: Histogram =
register_histogram!("greptime_frontend_run_script_elapsed", "frontend run script elapsed").unwrap();
pub static ref INSERT_SCRIPTS_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["insert"]);
pub static ref EXECUTE_SCRIPT_ELAPSED: Histogram = HANDLE_SCRIPT_ELAPSED
.with_label_values(&["execute"]);
/// The samples count of Prometheus remote write.
pub static ref PROM_STORE_REMOTE_WRITE_SAMPLES: IntCounter = register_int_counter!(
"greptime_frontend_prometheus_remote_write_samples",

View File

@@ -152,6 +152,10 @@ impl TxnService for RaftEngineBackend {
responses,
})
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
#[async_trait::async_trait]

View File

@@ -24,7 +24,9 @@ fn main() {
#[tokio::main]
async fn run() {
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"]).await.unwrap();
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"], 128)
.await
.unwrap();
// put
let put_req = PutRequest {

View File

@@ -193,7 +193,8 @@ pub async fn metasrv_builder(
(None, false) => {
let etcd_client = create_etcd_client(opts).await?;
let kv_backend = {
let etcd_backend = EtcdStore::with_etcd_client(etcd_client.clone());
let etcd_backend =
EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops);
if !opts.store_key_prefix.is_empty() {
Arc::new(ChrootKvBackend::new(
opts.store_key_prefix.clone().into_bytes(),

View File

@@ -79,6 +79,17 @@ pub struct MetaSrvOptions {
pub wal: MetaSrvWalConfig,
pub export_metrics: ExportMetricsOption,
pub store_key_prefix: String,
/// The maximum number of operations permitted in a single txn.
///
/// This value is usually limited by the store backing the `KvBackend`.
/// For example, when using etcd, it should be less than or equal to etcd's
/// `--max-txn-ops` option.
///
/// TODO(jeremy): Currently, this option only affects the etcd store, but it may
/// also affect other stores in the future. In other words, each store needs to
/// limit the number of operations in a txn because an infinitely large txn could
/// potentially block other operations.
pub max_txn_ops: usize,
}
impl MetaSrvOptions {
@@ -112,6 +123,7 @@ impl Default for MetaSrvOptions {
wal: MetaSrvWalConfig::default(),
export_metrics: ExportMetricsOption::default(),
store_key_prefix: String::new(),
max_txn_ops: 128,
}
}
}

View File

@@ -42,7 +42,7 @@ pub async fn mock_with_memstore() -> MockInfo {
}
pub async fn mock_with_etcdstore(addr: &str) -> MockInfo {
let kv_backend = EtcdStore::with_endpoints([addr]).await.unwrap();
let kv_backend = EtcdStore::with_endpoints([addr], 128).await.unwrap();
mock(Default::default(), kv_backend, None, None).await
}

View File

@@ -380,6 +380,10 @@ impl TxnService for LeaderCachedKvBackend {
Ok(res)
}
fn max_txn_ops(&self) -> usize {
self.store.max_txn_ops()
}
}
impl ResettableKvBackend for LeaderCachedKvBackend {

View File

@@ -76,7 +76,9 @@ common-test-util.workspace = true
criterion = "0.4"
log-store.workspace = true
rand.workspace = true
toml.workspace = true
[[bench]]
name = "bench_merge_tree"
name = "memtable_bench"
harness = false
required-features = ["test"]

View File

@@ -7,3 +7,9 @@ The Alfa Romeo [MiTo](https://en.wikipedia.org/wiki/Alfa_Romeo_MiTo) is a front-
> "You can't be a true petrolhead until you've owned an Alfa Romeo."
> <div align="right">-- by Jeremy Clarkson</div>
## Benchmarks
Run benchmarks in this crate:
```bash
cargo bench -p mito2 -F test
```

View File

@@ -0,0 +1,352 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion_common::Column;
use datafusion_expr::{lit, Expr};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{KeyValues, Memtable};
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::RegionId;
use table::predicate::Predicate;
/// Writes rows.
fn write_rows(c: &mut Criterion) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let timestamps = (0..100).collect::<Vec<_>>();
// Note that this benchmark only generates one time series.
let mut group = c.benchmark_group("write");
group.bench_function("merge_tree", |b| {
let memtable =
MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
}
/// Scans all rows.
fn full_scan(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("full_scan");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
b.iter(|| {
let iter = memtable.iter(None, None).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
b.iter(|| {
let iter = memtable.iter(None, None).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
}
/// Filters 1 host.
fn filter_1_host(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("filter_1_host");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
let predicate = generator.random_host_filter();
b.iter(|| {
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
let predicate = generator.random_host_filter();
b.iter(|| {
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
}
struct Host {
hostname: String,
region: String,
datacenter: String,
rack: String,
os: String,
arch: String,
team: String,
service: String,
service_version: String,
service_environment: String,
}
impl Host {
fn random_with_id(id: usize) -> Host {
let mut rng = rand::thread_rng();
let region = format!("ap-southeast-{}", rng.gen_range(0..10));
let datacenter = format!(
"{}{}",
region,
['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
);
Host {
hostname: format!("host_{id}"),
region,
datacenter,
rack: rng.gen_range(0..100).to_string(),
os: "Ubuntu16.04LTS".to_string(),
arch: "x86".to_string(),
team: "CHI".to_string(),
service: rng.gen_range(0..100).to_string(),
service_version: rng.gen_range(0..10).to_string(),
service_environment: "test".to_string(),
}
}
fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
let tags = [
api::v1::Value {
value_data: Some(ValueData::StringValue(self.hostname.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.region.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.datacenter.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.rack.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.os.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.arch.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.team.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service_version.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service_environment.clone())),
},
];
for tag in tags {
values.push(tag);
}
}
}
struct CpuDataGenerator {
metadata: RegionMetadataRef,
column_schemas: Vec<api::v1::ColumnSchema>,
hosts: Vec<Host>,
start_sec: i64,
end_sec: i64,
}
impl CpuDataGenerator {
fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
let column_schemas = region_metadata_to_row_schema(&metadata);
Self {
metadata,
column_schemas,
hosts: Self::generate_hosts(num_hosts),
start_sec,
end_sec,
}
}
fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
// one point per 10s.
(self.start_sec..self.end_sec)
.step_by(10)
.enumerate()
.map(|(seq, ts)| self.build_key_values(seq, ts))
}
fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
let rows = self
.hosts
.iter()
.map(|host| {
let mut rng = rand::thread_rng();
let mut values = Vec::with_capacity(21);
values.push(api::v1::Value {
value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
});
host.fill_values(&mut values);
for _ in 0..10 {
values.push(api::v1::Value {
value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
});
}
Row { values }
})
.collect();
let mutation = api::v1::Mutation {
op_type: api::v1::OpType::Put as i32,
sequence: seq as u64,
rows: Some(Rows {
schema: self.column_schemas.clone(),
rows,
}),
};
KeyValues::new(&self.metadata, mutation).unwrap()
}
fn random_host_filter(&self) -> Predicate {
let host = self.random_hostname();
let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
Predicate::new(vec![expr.into()])
}
fn random_hostname(&self) -> String {
let mut rng = rand::thread_rng();
self.hosts.choose(&mut rng).unwrap().hostname.clone()
}
fn random_f64(rng: &mut ThreadRng) -> f64 {
let base: u32 = rng.gen_range(30..95);
base as f64
}
fn generate_hosts(num_hosts: usize) -> Vec<Host> {
(0..num_hosts).map(Host::random_with_id).collect()
}
}
/// Creates metadata for a TSBS cpu-like table.
fn cpu_metadata() -> RegionMetadata {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
semantic_type: SemanticType::Timestamp,
column_id: 0,
});
let mut column_id = 1;
let tags = [
"hostname",
"region",
"datacenter",
"rack",
"os",
"arch",
"team",
"service",
"service_version",
"service_environment",
];
for tag in tags {
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
semantic_type: SemanticType::Tag,
column_id,
});
column_id += 1;
}
let fields = [
"usage_user",
"usage_system",
"usage_idle",
"usage_nice",
"usage_iowait",
"usage_irq",
"usage_softirq",
"usage_steal",
"usage_guest",
"usage_guest_nice",
];
for field in fields {
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
semantic_type: SemanticType::Field,
column_id,
});
column_id += 1;
}
builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
builder.build().unwrap()
}
criterion_group!(benches, write_rows, full_scan, filter_1_host);
criterion_main!(benches);

View File

@@ -1,36 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion};
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::Memtable;
use mito2::test_util::memtable_util;
fn bench_merge_tree_memtable(c: &mut Criterion) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let timestamps = (0..100).collect::<Vec<_>>();
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
let _ = c.bench_function("MergeTreeMemtable", |b| {
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
}
criterion_group!(benches, bench_merge_tree_memtable);
criterion_main!(benches);


@@ -158,7 +158,7 @@ impl CacheManager {
}
}
/// Gets the the write cache.
/// Gets the write cache.
pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
self.write_cache.as_ref()
}


@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, NoneAsEmptyString};
use crate::error::Result;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::MemtableConfig;
use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
/// Default max running background job.
@@ -104,8 +104,8 @@ pub struct MitoConfig {
/// Inverted index configs.
pub inverted_index: InvertedIndexConfig,
/// Experimental memtable.
pub experimental_memtable: Option<MergeTreeConfig>,
/// Memtable config
pub memtable: MemtableConfig,
}
impl Default for MitoConfig {
@@ -131,7 +131,7 @@ impl Default for MitoConfig {
parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE,
allow_stale_entries: false,
inverted_index: InvertedIndexConfig::default(),
experimental_memtable: None,
memtable: MemtableConfig::default(),
};
// Adjust buffer and cache size according to system memory if we can.
@@ -319,3 +319,25 @@ fn divide_num_cpus(divisor: usize) -> usize {
(cores + divisor - 1) / divisor
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_deserialize_config() {
let s = r#"
[memtable]
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MitoConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(config) = &config.memtable else {
unreachable!()
};
assert_eq!(1024, config.data_freeze_threshold);
}
}
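For completeness, a hedged sketch of the other variant: with the snake_case tag on MemtableConfig (see the memtable module diff below), the time-series memtable would be selected like this; the test name and placement are illustrative only:
#[test]
fn test_deserialize_time_series_memtable_config() {
    let s = r#"
[memtable]
type = "time_series"
"#;
    let config: MitoConfig = toml::from_str(s).unwrap();
    assert_eq!(MemtableConfig::TimeSeries, config.memtable);
}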


@@ -47,6 +47,7 @@ mod truncate_test;
use std::any::Any;
use std::sync::Arc;
use std::time::Instant;
use async_trait::async_trait;
use common_error::ext::BoxedError;
@@ -219,6 +220,7 @@ impl EngineInner {
/// Handles the scan `request` and returns a [Scanner] for the `request`.
fn handle_query(&self, region_id: RegionId, request: ScanRequest) -> Result<Scanner> {
let query_start = Instant::now();
// Reading a region doesn't need to go through the region worker thread.
let region = self
.workers
@@ -239,7 +241,8 @@ impl EngineInner {
Some(cache_manager),
)
.with_parallelism(scan_parallelism)
.ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled());
.with_ignore_inverted_index(self.config.inverted_index.apply_on_query.disabled())
.with_start_time(query_start);
scan_region.scanner()
}


@@ -14,16 +14,12 @@
//! Memtables are write buffers for regions.
pub mod key_values;
pub mod merge_tree;
pub mod time_series;
pub(crate) mod version;
use std::fmt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::Arc;
use common_time::Timestamp;
use serde::{Deserialize, Serialize};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::ColumnId;
use table::predicate::Predicate;
@@ -31,14 +27,34 @@ use table::predicate::Predicate;
use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
pub use crate::memtable::key_values::KeyValues;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
pub mod key_values;
pub mod merge_tree;
pub mod time_series;
pub(crate) mod version;
/// Id for memtables.
///
/// Should be unique under the same region.
pub type MemtableId = u32;
/// Config for memtables.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum MemtableConfig {
Experimental(MergeTreeConfig),
TimeSeries,
}
impl Default for MemtableConfig {
fn default() -> Self {
Self::Experimental(MergeTreeConfig::default())
}
}
#[derive(Debug, Default)]
pub struct MemtableStats {
/// The estimated bytes allocated by this memtable from heap.
@@ -187,9 +203,30 @@ impl Drop for AllocTracker {
#[cfg(test)]
mod tests {
use common_base::readable_size::ReadableSize;
use super::*;
use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};
#[test]
fn test_deserialize_memtable_config() {
let s = r#"
type = "experimental"
index_max_keys_per_shard = 8192
data_freeze_threshold = 1024
dedup = true
fork_dictionary_bytes = "512MiB"
"#;
let config: MemtableConfig = toml::from_str(s).unwrap();
let MemtableConfig::Experimental(merge_tree) = config else {
unreachable!()
};
assert!(merge_tree.dedup);
assert_eq!(8192, merge_tree.index_max_keys_per_shard);
assert_eq!(1024, merge_tree.data_freeze_threshold);
assert_eq!(ReadableSize::mb(512), merge_tree.fork_dictionary_bytes);
}
#[test]
fn test_alloc_tracker_without_manager() {
let tracker = AllocTracker::new(None);


@@ -44,7 +44,7 @@ use crate::memtable::{
};
/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
const DICTIONARY_SIZE_FACTOR: u64 = 16;
const DICTIONARY_SIZE_FACTOR: u64 = 8;
/// Id of a shard, only unique inside a partition.
type ShardId = u32;
@@ -74,7 +74,7 @@ pub struct MergeTreeConfig {
impl Default for MergeTreeConfig {
fn default() -> Self {
let mut fork_dictionary_bytes = ReadableSize::mb(512);
let mut fork_dictionary_bytes = ReadableSize::gb(1);
if let Some(sys_memory) = common_config::utils::get_sys_total_memory() {
let adjust_dictionary_bytes =
std::cmp::min(sys_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
@@ -85,7 +85,7 @@ impl Default for MergeTreeConfig {
Self {
index_max_keys_per_shard: 8192,
data_freeze_threshold: 102400,
data_freeze_threshold: 131072,
dedup: true,
fork_dictionary_bytes,
}
@@ -293,6 +293,8 @@ mod tests {
use std::collections::BTreeSet;
use common_time::Timestamp;
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Operator};
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
@@ -528,4 +530,55 @@ mod tests {
.collect::<Vec<_>>();
assert_eq!(expect, read);
}
#[test]
fn test_memtable_filter() {
let metadata = memtable_util::metadata_with_primary_key(vec![0, 1], false);
// Try to build a memtable via the builder.
let memtable = MergeTreeMemtableBuilder::new(
MergeTreeConfig {
index_max_keys_per_shard: 40,
..Default::default()
},
None,
)
.build(1, &metadata);
for i in 0..100 {
let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
memtable.write(&kvs).unwrap();
}
for i in 0..100 {
let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
let expr = Expr::BinaryExpr(BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "k1".to_string(),
})),
op: Operator::Eq,
right: Box::new(Expr::Literal(ScalarValue::UInt32(Some(i)))),
});
let iter = memtable
.iter(None, Some(Predicate::new(vec![expr.into()])))
.unwrap();
let read = iter
.flat_map(|batch| {
batch
.unwrap()
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>()
.into_iter()
})
.map(|v| v.unwrap().0.value())
.collect::<Vec<_>>();
assert_eq!(timestamps, read);
}
}
}
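A note on the test above: index_max_keys_per_shard is lowered to 40 while 100 distinct primary keys are written, presumably so the keys spill across several shards and the k1 = i filter exercises primary-key pruning in both the shard readers and the shard builder reader introduced in this change.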


@@ -957,6 +957,18 @@ impl DataParts {
self.active.write_row(pk_index, kv)
}
/// Returns the number of rows in the active buffer.
pub fn num_active_rows(&self) -> usize {
self.active.num_rows()
}
/// Freezes the active buffer and creates a new one.
pub fn freeze(&mut self) -> Result<()> {
let part = self.active.freeze(None, false)?;
self.frozen.push(part);
Ok(())
}
/// Reads data from all parts including active and frozen parts.
/// The returned iterator yields a record batch of one primary key at a time.
/// The order of yielding primary keys is determined by provided weights.
@@ -976,6 +988,11 @@ impl DataParts {
pub(crate) fn is_empty(&self) -> bool {
self.active.is_empty() && self.frozen.iter().all(|part| part.is_empty())
}
#[cfg(test)]
pub(crate) fn frozen_len(&self) -> usize {
self.frozen.len()
}
}
pub struct DataPartsReaderBuilder {
@@ -994,9 +1011,11 @@ impl DataPartsReaderBuilder {
for p in self.parts {
nodes.push(DataNode::new(DataSource::Part(p)));
}
let num_parts = nodes.len();
let merger = Merger::try_new(nodes)?;
Ok(DataPartsReader {
merger,
num_parts,
elapsed: Default::default(),
})
}
@@ -1005,6 +1024,7 @@ impl DataPartsReaderBuilder {
/// Reader for all parts inside a `DataParts`.
pub struct DataPartsReader {
merger: Merger<DataNode>,
num_parts: usize,
elapsed: Duration,
}
@@ -1032,6 +1052,10 @@ impl DataPartsReader {
pub(crate) fn is_valid(&self) -> bool {
self.merger.is_valid()
}
pub(crate) fn num_parts(&self) -> usize {
self.num_parts
}
}
#[cfg(test)]

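These DataParts additions (num_active_rows, freeze, frozen_len, and the part count surfaced through DataPartsReader::num_parts) back the per-shard data_freeze_threshold and the reader metrics introduced in shard.rs further down.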

@@ -45,7 +45,7 @@ impl<T: DataBatchSource> DataBatchSource for DedupReader<T> {
}
fn next(&mut self) -> Result<()> {
loop {
while self.inner.is_valid() {
match &mut self.prev_batch_last_row {
None => {
// First shot, fill prev_batch_last_row and current_batch_range with first batch.


@@ -78,7 +78,7 @@ impl Partition {
// Finds key in shards, now we ensure one key only exists in one shard.
if let Some(pk_id) = inner.find_key_in_shards(primary_key) {
inner.write_to_shard(pk_id, &key_value);
inner.write_to_shard(pk_id, &key_value)?;
inner.num_rows += 1;
return Ok(());
}
@@ -106,7 +106,7 @@ impl Partition {
}
/// Writes to the partition without a primary key.
pub fn write_no_key(&self, key_value: KeyValue) {
pub fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
let mut inner = self.inner.write().unwrap();
// If no primary key, always write to the first shard.
debug_assert!(!inner.shards.is_empty());
@@ -117,12 +117,24 @@ impl Partition {
shard_id: 0,
pk_index: 0,
};
inner.shards[0].write_with_pk_id(pk_id, &key_value);
inner.shards[0].write_with_pk_id(pk_id, &key_value)?;
inner.num_rows += 1;
Ok(())
}
/// Scans data in the partition.
pub fn read(&self, mut context: ReadPartitionContext) -> Result<PartitionReader> {
let start = Instant::now();
let key_filter = if context.need_prune_key {
Some(PrimaryKeyFilter::new(
context.metadata.clone(),
context.filters.clone(),
context.row_codec.clone(),
))
} else {
None
};
let (builder_source, shard_reader_builders) = {
let inner = self.inner.read().unwrap();
let mut shard_source = Vec::with_capacity(inner.shards.len() + 1);
@@ -141,14 +153,21 @@ impl Partition {
(builder_reader, shard_source)
};
context.metrics.num_shards += shard_reader_builders.len();
let mut nodes = shard_reader_builders
.into_iter()
.map(|builder| Ok(ShardNode::new(ShardSource::Shard(builder.build()?))))
.map(|builder| {
Ok(ShardNode::new(ShardSource::Shard(
builder.build(key_filter.clone())?,
)))
})
.collect::<Result<Vec<_>>>()?;
if let Some(builder) = builder_source {
context.metrics.num_builder += 1;
// Move the initialization of ShardBuilderReader out of read lock.
let shard_builder_reader = builder.build(Some(&context.pk_weights))?;
let shard_builder_reader =
builder.build(Some(&context.pk_weights), key_filter.clone())?;
nodes.push(ShardNode::new(ShardSource::Builder(shard_builder_reader)));
}
@@ -156,8 +175,10 @@ impl Partition {
let merger = ShardMerger::try_new(nodes)?;
if self.dedup {
let source = DedupReader::try_new(merger)?;
context.metrics.build_partition_reader += start.elapsed();
PartitionReader::new(context, Box::new(source))
} else {
context.metrics.build_partition_reader += start.elapsed();
PartitionReader::new(context, Box::new(merger))
}
}
@@ -266,11 +287,11 @@ pub(crate) struct PartitionStats {
#[derive(Default)]
struct PartitionReaderMetrics {
prune_pk: Duration,
build_partition_reader: Duration,
read_source: Duration,
data_batch_to_batch: Duration,
keys_before_pruning: usize,
keys_after_pruning: usize,
num_builder: usize,
num_shards: usize,
}
/// Reader to scan rows in a partition.
@@ -279,18 +300,11 @@ struct PartitionReaderMetrics {
pub struct PartitionReader {
context: ReadPartitionContext,
source: BoxedDataBatchSource,
last_yield_pk_id: Option<PkId>,
}
impl PartitionReader {
fn new(context: ReadPartitionContext, source: BoxedDataBatchSource) -> Result<Self> {
let mut reader = Self {
context,
source,
last_yield_pk_id: None,
};
// Find next valid batch.
reader.prune_batch_by_key()?;
let reader = Self { context, source };
Ok(reader)
}
@@ -305,8 +319,7 @@ impl PartitionReader {
/// # Panics
/// Panics if the reader is invalid.
pub fn next(&mut self) -> Result<()> {
self.advance_source()?;
self.prune_batch_by_key()
self.advance_source()
}
/// Converts current data batch into a [Batch].
@@ -336,106 +349,77 @@ impl PartitionReader {
self.context.metrics.read_source += read_source.elapsed();
Ok(())
}
fn prune_batch_by_key(&mut self) -> Result<()> {
if self.context.metadata.primary_key.is_empty() || !self.context.need_prune_key {
// Nothing to prune.
return Ok(());
}
while self.source.is_valid() {
let pk_id = self.source.current_pk_id();
if let Some(yield_pk_id) = self.last_yield_pk_id {
if pk_id == yield_pk_id {
// If this batch has the same key as last returned batch.
// We can return it without evaluating filters.
break;
}
}
let key = self.source.current_key().unwrap();
self.context.metrics.keys_before_pruning += 1;
// Prune batch by primary key.
if prune_primary_key(
&self.context.metadata,
&self.context.filters,
&self.context.row_codec,
key,
&mut self.context.metrics,
) {
// We need this key.
self.last_yield_pk_id = Some(pk_id);
self.context.metrics.keys_after_pruning += 1;
break;
}
self.advance_source()?;
}
Ok(())
}
}
fn prune_primary_key(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
codec: &McmpRowCodec,
pk: &[u8],
metrics: &mut PartitionReaderMetrics,
) -> bool {
let start = Instant::now();
let res = prune_primary_key_inner(metadata, filters, codec, pk);
metrics.prune_pk += start.elapsed();
res
#[derive(Clone)]
pub(crate) struct PrimaryKeyFilter {
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: Arc<McmpRowCodec>,
offsets_buf: Vec<usize>,
}
// TODO(yingwen): Improve performance of key pruning. Now we need to find index and
// then decode and convert each value.
/// Returns true if the `pk` is still needed.
fn prune_primary_key_inner(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
codec: &McmpRowCodec,
pk: &[u8],
) -> bool {
if filters.is_empty() {
return true;
impl PrimaryKeyFilter {
pub(crate) fn new(
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: Arc<McmpRowCodec>,
) -> Self {
Self {
metadata,
filters,
codec,
offsets_buf: Vec::new(),
}
}
// no primary key, we simply return true.
if metadata.primary_key.is_empty() {
return true;
}
let pk_values = match codec.decode(pk) {
Ok(values) => values,
Err(e) => {
common_telemetry::error!(e; "Failed to decode primary key");
pub(crate) fn prune_primary_key(&mut self, pk: &[u8]) -> bool {
if self.filters.is_empty() {
return true;
}
};
// evaluate filters against primary key values
let mut result = true;
for filter in filters {
if Partition::is_partition_column(filter.column_name()) {
continue;
// no primary key, we simply return true.
if self.metadata.primary_key.is_empty() {
return true;
}
let Some(column) = metadata.column_by_name(filter.column_name()) else {
continue;
};
// ignore filters that are not referencing primary key columns
if column.semantic_type != SemanticType::Tag {
continue;
// evaluate filters against primary key values
let mut result = true;
self.offsets_buf.clear();
for filter in &*self.filters {
if Partition::is_partition_column(filter.column_name()) {
continue;
}
let Some(column) = self.metadata.column_by_name(filter.column_name()) else {
continue;
};
// ignore filters that are not referencing primary key columns
if column.semantic_type != SemanticType::Tag {
continue;
}
// index of the column in primary keys.
// Safety: A tag column is always in primary key.
let index = self.metadata.primary_key_index(column.column_id).unwrap();
let value = match self.codec.decode_value_at(pk, index, &mut self.offsets_buf) {
Ok(v) => v,
Err(e) => {
common_telemetry::error!(e; "Failed to decode primary key");
return true;
}
};
// TODO(yingwen): `evaluate_scalar()` creates temporary arrays to compare scalars. We
// can compare the bytes directly without allocation and matching types as we use
// comparable encoding.
// Safety: arrow schema and datatypes are constructed from the same source.
let scalar_value = value
.try_to_scalar_value(&column.column_schema.data_type)
.unwrap();
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
}
// index of the column in primary keys.
// Safety: A tag column is always in primary key.
let index = metadata.primary_key_index(column.column_id).unwrap();
// Safety: arrow schema and datatypes are constructed from the same source.
let scalar_value = pk_values[index]
.try_to_scalar_value(&column.column_schema.data_type)
.unwrap();
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
result
}
result
}
/// Structs to reuse across readers to avoid allocating for each reader.
@@ -443,7 +427,7 @@ pub(crate) struct ReadPartitionContext {
metadata: RegionMetadataRef,
row_codec: Arc<McmpRowCodec>,
projection: HashSet<ColumnId>,
filters: Vec<SimpleFilterEvaluator>,
filters: Arc<Vec<SimpleFilterEvaluator>>,
/// Buffer to store pk weights.
pk_weights: Vec<u16>,
need_prune_key: bool,
@@ -452,10 +436,6 @@ pub(crate) struct ReadPartitionContext {
impl Drop for ReadPartitionContext {
fn drop(&mut self) {
let partition_prune_pk = self.metrics.prune_pk.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_prune_pk"])
.observe(partition_prune_pk);
let partition_read_source = self.metrics.read_source.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_read_source"])
@@ -465,16 +445,19 @@ impl Drop for ReadPartitionContext {
.with_label_values(&["partition_data_batch_to_batch"])
.observe(partition_data_batch_to_batch);
if self.metrics.keys_before_pruning != 0 {
common_telemetry::debug!(
"TreeIter pruning, before: {}, after: {}, partition_read_source: {}s, partition_prune_pk: {}s, partition_data_batch_to_batch: {}s",
self.metrics.keys_before_pruning,
self.metrics.keys_after_pruning,
partition_read_source,
partition_prune_pk,
partition_data_batch_to_batch,
);
}
common_telemetry::debug!(
"TreeIter partitions metrics, \
num_builder: {}, \
num_shards: {}, \
build_partition_reader: {}s, \
partition_read_source: {}s, \
partition_data_batch_to_batch: {}s",
self.metrics.num_builder,
self.metrics.num_shards,
self.metrics.build_partition_reader.as_secs_f64(),
partition_read_source,
partition_data_batch_to_batch,
);
}
}
@@ -490,7 +473,7 @@ impl ReadPartitionContext {
metadata,
row_codec,
projection,
filters,
filters: Arc::new(filters),
pk_weights: Vec::new(),
need_prune_key,
metrics: Default::default(),
@@ -578,7 +561,16 @@ impl Inner {
fn new(metadata: RegionMetadataRef, config: &MergeTreeConfig) -> Self {
let (shards, current_shard_id) = if metadata.primary_key.is_empty() {
let data_parts = DataParts::new(metadata.clone(), DATA_INIT_CAP, config.dedup);
(vec![Shard::new(0, None, data_parts, config.dedup)], 1)
(
vec![Shard::new(
0,
None,
data_parts,
config.dedup,
config.data_freeze_threshold,
)],
1,
)
} else {
(Vec::new(), 0)
};
@@ -598,18 +590,22 @@ impl Inner {
self.pk_to_pk_id.get(primary_key).copied()
}
fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) {
fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
if pk_id.shard_id == self.shard_builder.current_shard_id() {
self.shard_builder.write_with_pk_id(pk_id, key_value);
return;
}
for shard in &mut self.shards {
if shard.shard_id == pk_id.shard_id {
shard.write_with_pk_id(pk_id, key_value);
self.num_rows += 1;
return;
}
return Ok(());
}
// Safety: We find the shard by shard id.
let shard = self
.shards
.iter_mut()
.find(|shard| shard.shard_id == pk_id.shard_id)
.unwrap();
shard.write_with_pk_id(pk_id, key_value)?;
self.num_rows += 1;
Ok(())
}
fn freeze_active_shard(&mut self) -> Result<()> {

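Taken together, this file moves primary-key pruning out of PartitionReader and into the per-shard readers: Partition::read builds a cloneable PrimaryKeyFilter from the shared filters and row codec and hands it to each shard and shard-builder reader, and prune_primary_key decodes only the tag columns referenced by the filters via decode_value_at (reusing offsets_buf) instead of decoding the whole key up front.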

@@ -15,6 +15,7 @@
//! Shard in a partition.
use std::cmp::Ordering;
use std::time::{Duration, Instant};
use store_api::metadata::RegionMetadataRef;
@@ -25,8 +26,10 @@ use crate::memtable::merge_tree::data::{
};
use crate::memtable::merge_tree::dict::KeyDictRef;
use crate::memtable::merge_tree::merger::{Merger, Node};
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard_builder::ShardBuilderReader;
use crate::memtable::merge_tree::{PkId, ShardId};
use crate::memtable::merge_tree::{PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
/// Shard stores data related to the same key dictionary.
pub struct Shard {
@@ -36,6 +39,8 @@ pub struct Shard {
/// Data in the shard.
data_parts: DataParts,
dedup: bool,
/// Number of rows to freeze a data part.
data_freeze_threshold: usize,
}
impl Shard {
@@ -45,20 +50,29 @@ impl Shard {
key_dict: Option<KeyDictRef>,
data_parts: DataParts,
dedup: bool,
data_freeze_threshold: usize,
) -> Shard {
Shard {
shard_id,
key_dict,
data_parts,
dedup,
data_freeze_threshold,
}
}
/// Writes a key value into the shard.
pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) {
///
/// It freezes the active buffer if it is full.
pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
debug_assert_eq!(self.shard_id, pk_id.shard_id);
if self.data_parts.num_active_rows() >= self.data_freeze_threshold {
self.data_parts.freeze()?;
}
self.data_parts.write_row(pk_id.pk_index, key_value);
Ok(())
}
/// Scans the shard.
@@ -80,6 +94,7 @@ impl Shard {
key_dict: self.key_dict.clone(),
data_parts: DataParts::new(metadata, DATA_INIT_CAP, self.dedup),
dedup: self.dedup,
data_freeze_threshold: self.data_freeze_threshold,
}
}
@@ -131,18 +146,15 @@ pub struct ShardReaderBuilder {
}
impl ShardReaderBuilder {
pub(crate) fn build(self) -> Result<ShardReader> {
pub(crate) fn build(self, key_filter: Option<PrimaryKeyFilter>) -> Result<ShardReader> {
let ShardReaderBuilder {
shard_id,
key_dict,
inner,
} = self;
let now = Instant::now();
let parts_reader = inner.build()?;
Ok(ShardReader {
shard_id,
key_dict,
parts_reader,
})
ShardReader::new(shard_id, key_dict, parts_reader, key_filter, now.elapsed())
}
}
@@ -151,15 +163,46 @@ pub struct ShardReader {
shard_id: ShardId,
key_dict: Option<KeyDictRef>,
parts_reader: DataPartsReader,
key_filter: Option<PrimaryKeyFilter>,
last_yield_pk_index: Option<PkIndex>,
keys_before_pruning: usize,
keys_after_pruning: usize,
prune_pk_cost: Duration,
data_build_cost: Duration,
}
impl ShardReader {
fn new(
shard_id: ShardId,
key_dict: Option<KeyDictRef>,
parts_reader: DataPartsReader,
key_filter: Option<PrimaryKeyFilter>,
data_build_cost: Duration,
) -> Result<Self> {
let has_pk = key_dict.is_some();
let mut reader = Self {
shard_id,
key_dict,
parts_reader,
key_filter: if has_pk { key_filter } else { None },
last_yield_pk_index: None,
keys_before_pruning: 0,
keys_after_pruning: 0,
prune_pk_cost: Duration::default(),
data_build_cost,
};
reader.prune_batch_by_key()?;
Ok(reader)
}
fn is_valid(&self) -> bool {
self.parts_reader.is_valid()
}
fn next(&mut self) -> Result<()> {
self.parts_reader.next()
self.parts_reader.next()?;
self.prune_batch_by_key()
}
fn current_key(&self) -> Option<&[u8]> {
@@ -180,6 +223,54 @@ impl ShardReader {
fn current_data_batch(&self) -> DataBatch {
self.parts_reader.current_data_batch()
}
fn prune_batch_by_key(&mut self) -> Result<()> {
let Some(key_filter) = &mut self.key_filter else {
return Ok(());
};
while self.parts_reader.is_valid() {
let pk_index = self.parts_reader.current_data_batch().pk_index();
if let Some(yield_pk_index) = self.last_yield_pk_index {
if pk_index == yield_pk_index {
break;
}
}
self.keys_before_pruning += 1;
// Safety: `key_filter` is some so the shard has primary keys.
let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index);
let now = Instant::now();
if key_filter.prune_primary_key(key) {
self.prune_pk_cost += now.elapsed();
self.last_yield_pk_index = Some(pk_index);
self.keys_after_pruning += 1;
break;
}
self.prune_pk_cost += now.elapsed();
self.parts_reader.next()?;
}
Ok(())
}
}
impl Drop for ShardReader {
fn drop(&mut self) {
let shard_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_prune_pk"])
.observe(shard_prune_pk);
if self.keys_before_pruning > 0 {
common_telemetry::debug!(
"ShardReader metrics, data parts: {}, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
self.parts_reader.num_parts(),
self.keys_before_pruning,
self.keys_after_pruning,
shard_prune_pk,
self.data_build_cost.as_secs_f64(),
);
}
}
}
/// A merger that merges batches from multiple shards.
@@ -388,6 +479,7 @@ mod tests {
shard_id: ShardId,
metadata: RegionMetadataRef,
input: &[(KeyValues, PkIndex)],
data_freeze_threshold: usize,
) -> Shard {
let mut dict_builder = KeyDictBuilder::new(1024);
let mut metrics = WriteMetrics::default();
@@ -402,27 +494,17 @@ mod tests {
let dict = dict_builder.finish(&mut BTreeMap::new()).unwrap();
let data_parts = DataParts::new(metadata, DATA_INIT_CAP, true);
Shard::new(shard_id, Some(Arc::new(dict)), data_parts, true)
Shard::new(
shard_id,
Some(Arc::new(dict)),
data_parts,
true,
data_freeze_threshold,
)
}
#[test]
fn test_write_read_shard() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let mut shard = new_shard_with_dict(8, metadata, &input);
assert!(shard.is_empty());
for (key_values, pk_index) in &input {
for kv in key_values.iter() {
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *pk_index,
};
shard.write_with_pk_id(pk_id, &kv);
}
}
assert!(!shard.is_empty());
let mut reader = shard.read().unwrap().build().unwrap();
fn collect_timestamps(shard: &Shard) -> Vec<i64> {
let mut reader = shard.read().unwrap().build(None).unwrap();
let mut timestamps = Vec::new();
while reader.is_valid() {
let rb = reader.current_data_batch().slice_record_batch();
@@ -432,6 +514,64 @@ mod tests {
reader.next().unwrap();
}
timestamps
}
#[test]
fn test_write_read_shard() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let mut shard = new_shard_with_dict(8, metadata, &input, 100);
assert!(shard.is_empty());
for (key_values, pk_index) in &input {
for kv in key_values.iter() {
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *pk_index,
};
shard.write_with_pk_id(pk_id, &kv).unwrap();
}
}
assert!(!shard.is_empty());
let timestamps = collect_timestamps(&shard);
assert_eq!(vec![0, 1, 10, 11, 20, 21], timestamps);
}
#[test]
fn test_shard_freeze() {
let metadata = metadata_for_test();
let kvs = build_key_values_with_ts_seq_values(
&metadata,
"shard".to_string(),
0,
[0].into_iter(),
[Some(0.0)].into_iter(),
0,
);
let mut shard = new_shard_with_dict(8, metadata.clone(), &[(kvs, 0)], 50);
let expected: Vec<_> = (0..200).collect();
for i in &expected {
let kvs = build_key_values_with_ts_seq_values(
&metadata,
"shard".to_string(),
0,
[*i].into_iter(),
[Some(0.0)].into_iter(),
*i as u64,
);
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *i as PkIndex,
};
for kv in kvs.iter() {
shard.write_with_pk_id(pk_id, &kv).unwrap();
}
}
assert!(!shard.is_empty());
assert_eq!(3, shard.data_parts.frozen_len());
let timestamps = collect_timestamps(&shard);
assert_eq!(expected, timestamps);
}
}
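In test_shard_freeze, the freeze threshold is 50 and 200 single-row key values are written; write_with_pk_id freezes the active buffer whenever it already holds 50 rows, so freezes happen before the 51st, 101st, and 151st writes. That leaves three frozen parts and 50 rows in the active buffer, matching the frozen_len() assertion.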


@@ -16,6 +16,7 @@
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::{Duration, Instant};
use store_api::metadata::RegionMetadataRef;
@@ -26,8 +27,9 @@ use crate::memtable::merge_tree::data::{
};
use crate::memtable::merge_tree::dict::{DictBuilderReader, KeyDictBuilder};
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard::Shard;
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, ShardId};
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
/// Builder to write keys and data to a shard that the key dictionary
@@ -136,7 +138,13 @@ impl ShardBuilder {
let shard_id = self.current_shard_id;
self.current_shard_id += 1;
Ok(Some(Shard::new(shard_id, key_dict, data_parts, self.dedup)))
Ok(Some(Shard::new(
shard_id,
key_dict,
data_parts,
self.dedup,
self.data_freeze_threshold,
)))
}
/// Scans the shard builder.
@@ -176,13 +184,20 @@ pub(crate) struct ShardBuilderReaderBuilder {
}
impl ShardBuilderReaderBuilder {
pub(crate) fn build(self, pk_weights: Option<&[u16]>) -> Result<ShardBuilderReader> {
pub(crate) fn build(
self,
pk_weights: Option<&[u16]>,
key_filter: Option<PrimaryKeyFilter>,
) -> Result<ShardBuilderReader> {
let now = Instant::now();
let data_reader = self.data_reader.build(pk_weights)?;
Ok(ShardBuilderReader {
shard_id: self.shard_id,
dict_reader: self.dict_reader,
ShardBuilderReader::new(
self.shard_id,
self.dict_reader,
data_reader,
})
key_filter,
now.elapsed(),
)
}
}
@@ -191,15 +206,45 @@ pub struct ShardBuilderReader {
shard_id: ShardId,
dict_reader: DictBuilderReader,
data_reader: DataBufferReader,
key_filter: Option<PrimaryKeyFilter>,
last_yield_pk_index: Option<PkIndex>,
keys_before_pruning: usize,
keys_after_pruning: usize,
prune_pk_cost: Duration,
data_build_cost: Duration,
}
impl ShardBuilderReader {
fn new(
shard_id: ShardId,
dict_reader: DictBuilderReader,
data_reader: DataBufferReader,
key_filter: Option<PrimaryKeyFilter>,
data_build_cost: Duration,
) -> Result<Self> {
let mut reader = ShardBuilderReader {
shard_id,
dict_reader,
data_reader,
key_filter,
last_yield_pk_index: None,
keys_before_pruning: 0,
keys_after_pruning: 0,
prune_pk_cost: Duration::default(),
data_build_cost,
};
reader.prune_batch_by_key()?;
Ok(reader)
}
pub fn is_valid(&self) -> bool {
self.data_reader.is_valid()
}
pub fn next(&mut self) -> Result<()> {
self.data_reader.next()
self.data_reader.next()?;
self.prune_batch_by_key()
}
pub fn current_key(&self) -> Option<&[u8]> {
@@ -218,6 +263,52 @@ impl ShardBuilderReader {
pub fn current_data_batch(&self) -> DataBatch {
self.data_reader.current_data_batch()
}
fn prune_batch_by_key(&mut self) -> Result<()> {
let Some(key_filter) = &mut self.key_filter else {
return Ok(());
};
while self.data_reader.is_valid() {
let pk_index = self.data_reader.current_data_batch().pk_index();
if let Some(yield_pk_index) = self.last_yield_pk_index {
if pk_index == yield_pk_index {
break;
}
}
self.keys_before_pruning += 1;
let key = self.dict_reader.key_by_pk_index(pk_index);
let now = Instant::now();
if key_filter.prune_primary_key(key) {
self.prune_pk_cost += now.elapsed();
self.last_yield_pk_index = Some(pk_index);
self.keys_after_pruning += 1;
break;
}
self.prune_pk_cost += now.elapsed();
self.data_reader.next()?;
}
Ok(())
}
}
impl Drop for ShardBuilderReader {
fn drop(&mut self) {
let shard_builder_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_builder_prune_pk"])
.observe(shard_builder_prune_pk);
if self.keys_before_pruning > 0 {
common_telemetry::debug!(
"ShardBuilderReader metrics, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
self.keys_before_pruning,
self.keys_after_pruning,
shard_builder_prune_pk,
self.data_build_cost.as_secs_f64(),
);
}
}
}
#[cfg(test)]
@@ -306,7 +397,7 @@ mod tests {
let mut reader = shard_builder
.read(&mut pk_weights)
.unwrap()
.build(Some(&pk_weights))
.build(Some(&pk_weights), None)
.unwrap();
let mut timestamps = Vec::new();
while reader.is_valid() {


@@ -39,7 +39,7 @@ use crate::memtable::merge_tree::partition::{
};
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::memtable::{BoxedBatchIterator, KeyValues};
use crate::metrics::{MERGE_TREE_READ_STAGE_ELAPSED, READ_STAGE_ELAPSED};
use crate::metrics::{MERGE_TREE_READ_STAGE_ELAPSED, READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
use crate::read::Batch;
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
@@ -124,7 +124,7 @@ impl MergeTree {
if !has_pk {
// No primary key.
self.write_no_key(kv);
self.write_no_key(kv)?;
continue;
}
@@ -299,7 +299,7 @@ impl MergeTree {
)
}
fn write_no_key(&self, key_value: KeyValue) {
fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
let partition_key = Partition::get_partition_key(&key_value, self.is_partitioned);
let partition = self.get_or_create_partition(partition_key);
@@ -397,6 +397,9 @@ struct TreeIter {
impl Drop for TreeIter {
fn drop(&mut self) {
READ_ROWS_TOTAL
.with_label_values(&["merge_tree_memtable"])
.inc_by(self.metrics.rows_fetched as u64);
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["fetch_next_partition"])
.observe(self.metrics.fetch_partition_elapsed.as_secs_f64());


@@ -123,7 +123,7 @@ lazy_static! {
vec![0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 60.0, 300.0]
)
.unwrap();
/// Counter of rows read.
/// Counter of rows read from different sources.
pub static ref READ_ROWS_TOTAL: IntCounterVec =
register_int_counter_vec!("greptime_mito_read_rows_total", "mito read rows total", &[TYPE_LABEL]).unwrap();
/// Counter of filtered rows during merge.
@@ -137,6 +137,24 @@ lazy_static! {
register_int_counter_vec!("greptime_mito_precise_filter_rows_total", "mito precise filter rows total", &[TYPE_LABEL]).unwrap();
pub static ref READ_ROWS_IN_ROW_GROUP_TOTAL: IntCounterVec =
register_int_counter_vec!("greptime_mito_read_rows_in_row_group_total", "mito read rows in row group total", &[TYPE_LABEL]).unwrap();
/// Histogram for the number of SSTs to scan per query.
pub static ref READ_SST_COUNT: Histogram = register_histogram!(
"greptime_mito_read_sst_count",
"Number of SSTs to scan in a scan task",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 256.0, 1024.0],
).unwrap();
/// Histogram for the number of rows returned per query.
pub static ref READ_ROWS_RETURN: Histogram = register_histogram!(
"greptime_mito_read_rows_return",
"Number of rows returned in a scan task",
exponential_buckets(100.0, 10.0, 8).unwrap(),
).unwrap();
/// Histogram for the number of batches returned per query.
pub static ref READ_BATCHES_RETURN: Histogram = register_histogram!(
"greptime_mito_read_batches_return",
"Number of rows returned in a scan task",
exponential_buckets(100.0, 10.0, 7).unwrap(),
).unwrap();
// ------- End of query metrics.
// Cache related metrics.

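For reference, a small standalone sketch (not part of the diff) of the bucket layout the new histograms get from the prometheus crate's exponential_buckets(start, factor, count) helper, which yields count buckets, each factor times the previous:
use prometheus::exponential_buckets;

fn main() {
    // Buckets used by READ_ROWS_RETURN: 100, 1e3, ..., 1e9 (8 buckets);
    // READ_BATCHES_RETURN uses the same layout with 7 buckets (100 .. 1e8).
    let buckets = exponential_buckets(100.0, 10.0, 8).unwrap();
    assert_eq!(8, buckets.len());
    assert_eq!(100.0, buckets[0]);
    assert_eq!(1e9, buckets[7]);
}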

@@ -15,6 +15,7 @@
//! Scans a region according to the scan request.
use std::sync::Arc;
use std::time::Instant;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, warn};
@@ -124,6 +125,8 @@ pub(crate) struct ScanRegion {
parallelism: ScanParallism,
/// Whether to ignore inverted index.
ignore_inverted_index: bool,
/// Start time of the scan task.
start_time: Option<Instant>,
}
impl ScanRegion {
@@ -141,6 +144,7 @@ impl ScanRegion {
cache_manager,
parallelism: ScanParallism::default(),
ignore_inverted_index: false,
start_time: None,
}
}
@@ -152,11 +156,17 @@ impl ScanRegion {
}
#[must_use]
pub(crate) fn ignore_inverted_index(mut self, ignore: bool) -> Self {
pub(crate) fn with_ignore_inverted_index(mut self, ignore: bool) -> Self {
self.ignore_inverted_index = ignore;
self
}
#[must_use]
pub(crate) fn with_start_time(mut self, now: Instant) -> Self {
self.start_time = Some(now);
self
}
/// Returns a [Scanner] to scan the region.
pub(crate) fn scanner(self) -> Result<Scanner> {
self.seq_scan().map(Scanner::Seq)
@@ -223,7 +233,8 @@ impl ScanRegion {
.with_files(files)
.with_cache(self.cache_manager)
.with_index_applier(index_applier)
.with_parallelism(self.parallelism);
.with_parallelism(self.parallelism)
.with_start_time(self.start_time);
Ok(seq_scan)
}


@@ -32,7 +32,7 @@ use crate::access_layer::AccessLayerRef;
use crate::cache::{CacheManager, CacheManagerRef};
use crate::error::Result;
use crate::memtable::MemtableRef;
use crate::metrics::READ_STAGE_ELAPSED;
use crate::metrics::{READ_BATCHES_RETURN, READ_ROWS_RETURN, READ_SST_COUNT, READ_STAGE_ELAPSED};
use crate::read::compat::{self, CompatReader};
use crate::read::merge::MergeReaderBuilder;
use crate::read::projection::ProjectionMapper;
@@ -65,6 +65,8 @@ pub struct SeqScan {
parallelism: ScanParallism,
/// Index applier.
index_applier: Option<SstIndexApplierRef>,
/// Start time of the query.
query_start: Option<Instant>,
}
impl SeqScan {
@@ -82,6 +84,7 @@ impl SeqScan {
ignore_file_not_found: false,
parallelism: ScanParallism::default(),
index_applier: None,
query_start: None,
}
}
@@ -141,10 +144,19 @@ impl SeqScan {
self
}
/// Sets start time of the query.
#[must_use]
pub(crate) fn with_start_time(mut self, now: Option<Instant>) -> Self {
self.query_start = now;
self
}
/// Builds a stream for the query.
pub async fn build_stream(&self) -> Result<SendableRecordBatchStream> {
let start = Instant::now();
let mut metrics = Metrics::default();
let build_start = Instant::now();
let query_start = self.query_start.unwrap_or(build_start);
metrics.prepare_scan_cost = query_start.elapsed();
let use_parallel = self.use_parallel_reader();
// Scans all memtables and SSTs. Builds a merge reader to merge results.
let mut reader = if use_parallel {
@@ -152,9 +164,13 @@ impl SeqScan {
} else {
self.build_reader().await?
};
let elapsed = start.elapsed();
metrics.build_reader_cost = elapsed;
metrics.scan_cost = elapsed;
metrics.build_reader_cost = build_start.elapsed();
READ_STAGE_ELAPSED
.with_label_values(&["prepare_scan"])
.observe(metrics.prepare_scan_cost.as_secs_f64());
READ_STAGE_ELAPSED
.with_label_values(&["build_reader"])
.observe(metrics.build_reader_cost.as_secs_f64());
// Creates a stream to poll the batch reader and convert batch into record batch.
let mapper = self.mapper.clone();
@@ -165,15 +181,22 @@ impl SeqScan {
while let Some(batch) =
Self::fetch_record_batch(&mut reader, &mapper, cache, &mut metrics).await?
{
metrics.num_batches += 1;
metrics.num_rows += batch.num_rows();
yield batch;
}
// Update metrics.
metrics.total_cost = query_start.elapsed();
READ_STAGE_ELAPSED.with_label_values(&["convert_rb"]).observe(metrics.convert_cost.as_secs_f64());
READ_STAGE_ELAPSED.with_label_values(&["scan"]).observe(metrics.scan_cost.as_secs_f64());
READ_STAGE_ELAPSED.with_label_values(&["total"]).observe(metrics.total_cost.as_secs_f64());
READ_ROWS_RETURN.observe(metrics.num_rows as f64);
READ_BATCHES_RETURN.observe(metrics.num_batches as f64);
debug!(
"Seq scan finished, region_id: {:?}, metrics: {:?}, use_parallel: {}, parallelism: {}",
mapper.metadata().region_id, metrics, use_parallel, parallelism,
);
// Update metrics.
READ_STAGE_ELAPSED.with_label_values(&["total"]).observe(metrics.scan_cost.as_secs_f64());
};
let stream = Box::pin(RecordBatchStreamWrapper::new(
self.mapper.output_schema(),
@@ -249,6 +272,8 @@ impl SeqScan {
}
}
READ_SST_COUNT.observe(self.files.len() as f64);
Ok(sources)
}
@@ -318,12 +343,20 @@ impl SeqScan {
/// Metrics for [SeqScan].
#[derive(Debug, Default)]
struct Metrics {
/// Duration to prepare the scan task.
prepare_scan_cost: Duration,
/// Duration to build the reader.
build_reader_cost: Duration,
/// Duration to scan data.
scan_cost: Duration,
/// Duration to convert batches.
convert_cost: Duration,
/// Total duration from the query start until the stream finishes.
total_cost: Duration,
/// Number of batches returned.
num_batches: usize,
/// Number of rows returned.
num_rows: usize,
}
#[cfg(test)]

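With these changes, the per-query metrics break down as follows: prepare_scan_cost covers the time from the query start (passed down from handle_query) until build_stream begins building readers, build_reader_cost covers reader construction, scan_cost and convert_cost keep accumulating while the stream is polled, and total_cost is measured from the original query start. Each stage is observed under READ_STAGE_ELAPSED, and the returned row and batch counts feed READ_ROWS_RETURN and READ_BATCHES_RETURN.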

@@ -171,6 +171,8 @@ impl RegionOpener {
// Initial memtable id is 0.
let mutable = self.memtable_builder.build(0, &metadata);
debug!("Create region {} with options: {:?}", region_id, options);
let version = VersionBuilder::new(metadata, mutable)
.options(options)
.build();
@@ -249,6 +251,9 @@ impl RegionOpener {
let region_id = self.region_id;
let object_store = self.object_store(&region_options.storage)?.clone();
debug!("Open region {} with options: {:?}", region_id, self.options);
let access_layer = Arc::new(AccessLayer::new(
self.region_dir.clone(),
object_store,


@@ -13,6 +13,8 @@
// limitations under the License.
//! Options for a region.
//!
//! If we add options in this mod, we also need to modify [store_api::mito_engine_options].
use std::collections::HashMap;
use std::time::Duration;
@@ -358,6 +360,7 @@ mod tests {
("compaction.type", "twcs"),
("storage", "S3"),
("index.inverted_index.ignore_column_ids", "1,2,3"),
("index.inverted_index.segment_row_count", "512"),
(
WAL_OPTIONS_KEY,
&serde_json::to_string(&wal_options).unwrap(),
@@ -376,7 +379,7 @@ mod tests {
index_options: IndexOptions {
inverted_index: InvertedIndexOptions {
ignore_column_ids: vec![1, 2, 3],
segment_row_count: 1024,
segment_row_count: 512,
},
},
};


@@ -215,6 +215,61 @@ impl SortField {
Decimal128, Decimal128
)
}
/// Skips deserializing this field and returns its encoded length in bytes.
fn skip_deserialize(
&self,
bytes: &[u8],
deserializer: &mut Deserializer<&[u8]>,
) -> Result<usize> {
let pos = deserializer.position();
if bytes[pos] == 0 {
deserializer.advance(1);
return Ok(1);
}
let to_skip = match &self.data_type {
ConcreteDataType::Boolean(_) => 2,
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
ConcreteDataType::Int32(_) | ConcreteDataType::UInt32(_) => 5,
ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => 9,
ConcreteDataType::Float32(_) => 5,
ConcreteDataType::Float64(_) => 9,
ConcreteDataType::Binary(_) => {
// The encoder currently encodes binary as a list of bytes, so we can't use
// `skip_bytes` here.
let pos_before = deserializer.position();
let mut current = pos_before + 1;
while bytes[current] == 1 {
current += 2;
}
let to_skip = current - pos_before + 1;
deserializer.advance(to_skip);
return Ok(to_skip);
}
ConcreteDataType::String(_) => {
let pos_before = deserializer.position();
deserializer.advance(1);
deserializer
.skip_bytes()
.context(error::DeserializeFieldSnafu)?;
return Ok(deserializer.position() - pos_before);
}
ConcreteDataType::Date(_) => 5,
ConcreteDataType::DateTime(_) => 9,
ConcreteDataType::Timestamp(_) => 9, // We treat timestamp as Option<i64>
ConcreteDataType::Time(_) => 10, // i64 and 1 byte time unit
ConcreteDataType::Duration(_) => 10,
ConcreteDataType::Interval(_) => 18,
ConcreteDataType::Decimal128(_) => 19,
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_) => 0,
};
deserializer.advance(to_skip);
Ok(to_skip)
}
}
/// A memory-comparable row [Value] encoder/decoder.
@@ -236,6 +291,52 @@ impl McmpRowCodec {
pub fn estimated_size(&self) -> usize {
self.fields.iter().map(|f| f.estimated_size()).sum()
}
/// Decode value at `pos` in `bytes`.
///
/// The i-th element in the offsets buffer is the number of bytes to skip before reading the value at index i.
pub fn decode_value_at(
&self,
bytes: &[u8],
pos: usize,
offsets_buf: &mut Vec<usize>,
) -> Result<Value> {
let mut deserializer = Deserializer::new(bytes);
if pos < offsets_buf.len() {
// We computed the offset before.
let to_skip = offsets_buf[pos];
deserializer.advance(to_skip);
return self.fields[pos].deserialize(&mut deserializer);
}
if offsets_buf.is_empty() {
let mut offset = 0;
// Skip values before `pos`.
for i in 0..pos {
// Offset to skip before reading value i.
offsets_buf.push(offset);
let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
offset += skip;
}
// Offset to skip before reading this value.
offsets_buf.push(offset);
} else {
// Offsets are not enough.
let value_start = offsets_buf.len() - 1;
// Advances to decode value at `value_start`.
let mut offset = offsets_buf[value_start];
deserializer.advance(offset);
for i in value_start..pos {
// Skip value i.
let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
// Offset for the value at i + 1.
offset += skip;
offsets_buf.push(offset);
}
}
self.fields[pos].deserialize(&mut deserializer)
}
}
impl RowCodec for McmpRowCodec {
@@ -274,7 +375,7 @@ impl RowCodec for McmpRowCodec {
#[cfg(test)]
mod tests {
use common_base::bytes::StringBytes;
use common_time::Timestamp;
use common_time::{DateTime, Timestamp};
use datatypes::value::Value;
use super::*;
@@ -292,6 +393,18 @@ mod tests {
let result = encoder.encode(value_ref.iter().cloned()).unwrap();
let decoded = encoder.decode(&result).unwrap();
assert_eq!(decoded, row);
let mut decoded = Vec::new();
let mut offsets = Vec::new();
// Iter two times to test offsets buffer.
for _ in 0..2 {
decoded.clear();
for i in 0..data_types.len() {
let value = encoder.decode_value_at(&result, i, &mut offsets).unwrap();
decoded.push(value);
}
assert_eq!(data_types.len(), offsets.len(), "offsets: {:?}", offsets);
assert_eq!(decoded, row);
}
}
#[test]
@@ -416,5 +529,53 @@ mod tests {
],
vec![Value::Null, Value::Int64(43), Value::Boolean(true)],
);
// All types.
check_encode_and_decode(
&[
ConcreteDataType::boolean_datatype(),
ConcreteDataType::int8_datatype(),
ConcreteDataType::uint8_datatype(),
ConcreteDataType::int16_datatype(),
ConcreteDataType::uint16_datatype(),
ConcreteDataType::int32_datatype(),
ConcreteDataType::uint32_datatype(),
ConcreteDataType::int64_datatype(),
ConcreteDataType::uint64_datatype(),
ConcreteDataType::float32_datatype(),
ConcreteDataType::float64_datatype(),
ConcreteDataType::binary_datatype(),
ConcreteDataType::string_datatype(),
ConcreteDataType::date_datatype(),
ConcreteDataType::datetime_datatype(),
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::time_millisecond_datatype(),
ConcreteDataType::duration_millisecond_datatype(),
ConcreteDataType::interval_month_day_nano_datatype(),
ConcreteDataType::decimal128_default_datatype(),
],
vec![
Value::Boolean(true),
Value::Int8(8),
Value::UInt8(8),
Value::Int16(16),
Value::UInt16(16),
Value::Int32(32),
Value::UInt32(32),
Value::Int64(64),
Value::UInt64(64),
Value::Float32(1.0.into()),
Value::Float64(1.0.into()),
Value::Binary(b"hello"[..].into()),
Value::String("world".into()),
Value::Date(Date::new(10)),
Value::DateTime(DateTime::new(11)),
Value::Timestamp(Timestamp::new_millisecond(12)),
Value::Time(Time::new_millisecond(13)),
Value::Duration(Duration::new_millisecond(14)),
Value::Interval(Interval::from_month_day_nano(1, 1, 15)),
Value::Decimal128(Decimal128::from(16)),
],
);
}
}
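As a usage sketch (not part of the diff), decode_value_at lets a caller pull out a single key part and reuse the offsets buffer across lookups. The constructors below follow the existing tests in this module; the exact SortField/McmpRowCodec signatures and the ValueRef import are assumptions:
#[test]
fn test_decode_value_at_single_field() {
    // Assumes `use datatypes::value::ValueRef;` in this test module.
    let codec = McmpRowCodec::new(vec![
        SortField::new(ConcreteDataType::string_datatype()),
        SortField::new(ConcreteDataType::uint32_datatype()),
    ]);
    let pk = codec
        .encode([ValueRef::String("host-0"), ValueRef::UInt32(7)].into_iter())
        .unwrap();
    let mut offsets = Vec::new();
    // The string field is skipped via skip_deserialize(); only the u32 is decoded.
    assert_eq!(
        Value::UInt32(7),
        codec.decode_value_at(&pk, 1, &mut offsets).unwrap()
    );
    // A second lookup reuses the cached offset instead of skipping the string again.
    assert_eq!(
        Value::UInt32(7),
        codec.decode_value_at(&pk, 1, &mut offsets).unwrap()
    );
}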


@@ -219,25 +219,14 @@ pub(crate) fn extract_data_batch(batch: &DataBatch) -> (u16, Vec<(i64, u64)>) {
/// Builds key values with timestamps (ms) and sequences for test.
pub(crate) fn build_key_values_with_ts_seq_values(
schema: &RegionMetadataRef,
metadata: &RegionMetadataRef,
k0: String,
k1: u32,
timestamps: impl Iterator<Item = i64>,
values: impl Iterator<Item = Option<f64>>,
sequence: SequenceNumber,
) -> KeyValues {
let column_schema = schema
.column_metadatas
.iter()
.map(|c| api::v1::ColumnSchema {
column_name: c.column_schema.name.clone(),
datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
.unwrap()
.datatype() as i32,
semantic_type: c.semantic_type as i32,
..Default::default()
})
.collect();
let column_schema = region_metadata_to_row_schema(metadata);
let rows = timestamps
.zip(values)
@@ -269,7 +258,23 @@ pub(crate) fn build_key_values_with_ts_seq_values(
rows,
}),
};
KeyValues::new(schema.as_ref(), mutation).unwrap()
KeyValues::new(metadata.as_ref(), mutation).unwrap()
}
/// Converts the region metadata to column schemas for a row.
pub fn region_metadata_to_row_schema(metadata: &RegionMetadataRef) -> Vec<api::v1::ColumnSchema> {
metadata
.column_metadatas
.iter()
.map(|c| api::v1::ColumnSchema {
column_name: c.column_schema.name.clone(),
datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
.unwrap()
.datatype() as i32,
semantic_type: c.semantic_type as i32,
..Default::default()
})
.collect()
}
/// Encode keys.

Some files were not shown because too many files have changed in this diff.