Compare commits

...

43 Commits

Author SHA1 Message Date
Yingwen
8ca9e01455 feat: Partition memtables by time if compaction window is provided (#3501)
* feat: define time partitions

* feat: adapt time partitions to version

* feat: implement non write methods

* feat: add write one to memtable

* feat: implement write

* chore: fix warning

* fix: inner not set

* refactor: add collect_iter_timestamps

* test: test partitions

* chore: debug log

* chore: fix typos

* chore: log memtable id

* fix: empty check

* chore: log total parts

* chore: update comments
2024-03-14 11:13:01 +00:00
Weny Xu
3a326775ee ci: add bin options to reduce build burden (#3518)
chore: add bin options
2024-03-14 11:05:35 +00:00
Yingwen
5ad3b7984e docs: add v0.7 TSBS benchmark result (#3512)
* docs: add v0.7 TSBS benchmark result

* docs: add OS

* docs: fix format
2024-03-14 08:29:52 +00:00
Yingwen
4fc27bdc75 chore: bump version to v0.7.1 (#3510)
chore: bump version
2024-03-14 07:43:47 +00:00
LFC
e3c82568e5 fix: correctly generate sequences when the value is pre-existed (#3502) 2024-03-14 06:55:12 +00:00
tison
61f0703af8 feat: support decode gzip if influxdb write specify it (#3494)
* feat: support dedoce gzip if influxdb write specify it

Signed-off-by: tison <wander4096@gmail.com>

* address comments

Signed-off-by: tison <wander4096@gmail.com>

* simplify with tower_http DecompressionLayer

Signed-off-by: tison <wander4096@gmail.com>

* tidy some code

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-14 04:26:26 +00:00
Ruihang Xia
b85d7bb575 fix: decoding prometheus remote write proto doesn't reset the value (#3505)
* reset Sample

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* accomplish test assertion

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* revert toml format

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-14 03:08:14 +00:00
Ning Sun
d334d74986 fix!: remove error message from http header to avoid panic (#3506)
fix: remove error message from http header
2024-03-14 01:43:38 +00:00
Ning Sun
5ca8521e87 ci: attempt to setup docker cache for etcd (#3488)
* ci: attempt to setup docker cache for etcd

* ci: do not use file hash for cache key
2024-03-14 00:48:02 +00:00
Weny Xu
e4333969b4 feat(fuzz): add alter table target (#3503)
* feat(fuzz): validate semantic type of column

* feat(fuzz): add fuzz_alter_table target

* feat(fuzz): validate columns

* chore(ci): add fuzz_alter_table ci cfg
2024-03-13 14:11:47 +00:00
Zhenchi
b55905cf66 feat(fuzz): add insert target (#3499)
* fix(common-time): allow building nanos timestamp from parts split from i64::MIN

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat(fuzz): add insert target

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* chore: cleanup cargo.toml and polish comments

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-03-13 10:03:03 +00:00
WU Jingdi
fb4da05f25 fix: adjust fill behavior of range query (#3489) 2024-03-13 09:20:34 +00:00
Zhenchi
904484b525 fix(common-time): allow building nanos timestamp from parts split from i64::MIN (#3493)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-03-13 02:46:00 +00:00
tison
cafb4708ce refactor: validate constraints eagerly (#3472)
* chore: validate constraints eagerly

Signed-off-by: tison <wander4096@gmail.com>

* use timestamp column

Signed-off-by: tison <wander4096@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* lint

Signed-off-by: tison <wander4096@gmail.com>

* compile

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
2024-03-12 13:09:34 +00:00
Yingwen
7c895e2605 perf: more benchmarks for memtables (#3491)
* chore: remove duplicate bench

* refactor: rename bench

* perf: add full scan bench for memtable

* feat: filter bench and add time series to bench group

* chore: comment

* refactor: rename

* style: fix clippy
2024-03-12 12:02:58 +00:00
Lei, HUANG
9afe327bca feat: improve prom write requests decode performance (#3478)
* feat: optimize decode performance

* fix: some cr comments
2024-03-12 12:00:38 +00:00
discord9
58bd065c6b feat(flow): plan def (#3490)
* feat: plan def

* chore: add license

* docs: remove TODO done

* chore: add derive Ord
2024-03-12 10:59:07 +00:00
Yingwen
9aa8f756ab fix: allow passing extra table options (#3484)
* fix: do not check options in parser

* test: fix tests

* test: fix sqlness

* test: add sqlness test

* chore: log options

* chore: must specify compaction type

* feat: validate option key

* feat: add option key validation back
2024-03-12 07:03:52 +00:00
discord9
7639c227ca feat(flow): accumlator for aggr func (#3396)
* feat: Accumlator trait

* feat: add `OrdValue` accum&use enum_dispatch

* test: more accum test

* feat: eval aggr funcs

* chore: refactor test&fmt clippy

* refactor: less verbose

* test: more tests

* refactor: better err handling&use OrdValue for Count

* refactor: ignore null&more tests for error handle

* refactor: OrdValue accum

* chore: extract null check

* refactor: def&use fn signature

* chore: use extra cond with match guard

* chore: per review
2024-03-12 02:09:27 +00:00
tison
1255c1fc9e feat: to_timezone function (#3470)
* feat: to_timezone function

Signed-off-by: tison <wander4096@gmail.com>

* impl Function for ToTimezoneFunction

Signed-off-by: tison <wander4096@gmail.com>

* add test

Signed-off-by: tison <wander4096@gmail.com>

* Add original authors

Co-authored-by: parkma99 <park-ma@hotmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>

* fixup

Signed-off-by: tison <wander4096@gmail.com>

* address comments

Signed-off-by: tison <wander4096@gmail.com>

* add issue link

Signed-off-by: tison <wander4096@gmail.com>

* code refactor

Signed-off-by: tison <wander4096@gmail.com>

* further tidy

Signed-off-by: tison <wander4096@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: parkma99 <park-ma@hotmail.com>
Co-authored-by: Yingwen <realevenyag@gmail.com>
2024-03-12 01:46:19 +00:00
Yingwen
06dcd0f6ed fix: freeze data buffer in shard (#3468)
* feat: call freeze if the active data buffer in a shard is full

* chore: more metrics

* chore: print metrics

* chore: enlarge freeze threshold

* test: test freeze

* test: fix config test
2024-03-11 14:51:06 +00:00
Weny Xu
0a4444a43a feat(fuzz): validate columns (#3485) 2024-03-11 11:34:50 +00:00
Ruihang Xia
b7ac8d6aa8 ci: use another mirror for etcd image (#3486)
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 10:40:19 +00:00
Weny Xu
e767f37241 fix: fix f64 has no sufficient precision during parsing (#3483) 2024-03-11 09:28:40 +00:00
JeremyHi
da098f5568 fix: make max-txn-ops limit valid (#3481) 2024-03-11 09:27:51 +00:00
shuiyisong
aa953dcc34 fix: impl RecordBatchStream method explicitly (#3482)
fix: impl RecordBatchStream method explicitly
2024-03-11 09:07:10 +00:00
crwen
aa125a50f9 refactor: make http api returns non-200 status code (#3473)
* refactor: make http api returns non-200 status code

* recover some code
2024-03-11 03:38:36 +00:00
Ruihang Xia
d8939eb891 feat: clamp function (#3465)
* basic impl

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add unit tests

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* a little type exercise

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 03:26:10 +00:00
shuiyisong
0bb949787c refactor: introduce new Output with OutputMeta (#3466)
* refactor: introduce new output struct

* chore: add helper function

* chore: update comment

* chore: update commit

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>

* chore: rename according to cr

---------

Co-authored-by: Ruihang Xia <waynestxia@gmail.com>
2024-03-11 02:24:09 +00:00
WU Jingdi
8c37c3fc0f feat: support first_value/last_value in range query (#3448)
* feat: support `first_value/last_value` in range query

* chore: add sqlness test on `count`

* chore: add test
2024-03-11 01:30:39 +00:00
gcmutator
21ff3620be chore: remove repetitive words (#3469)
remove repetitive words

Signed-off-by: gcmutator <329964069@qq.com>
2024-03-09 04:18:47 +00:00
Eugene Tolbakov
aeca0d8e8a feat(influxdb): add db query param support for v2 write api (#3445)
* feat(influxdb): add db query param support for v2 write api

* fix(influxdb): update authorize logic to get catalog and schema from query string

* fix(influxdb): address CR suggestions

* fix(influxdb): use the correct import
2024-03-08 08:17:57 +00:00
Weny Xu
a309cd018a fix: fix incorrect COM_STMT_PREPARE reply (#3463)
* fix: fix incorrect `COM_STMT_PREPARE` reply

* chore: use column name instead of index
2024-03-08 07:31:20 +00:00
Yingwen
3ee53360ee perf: Reduce decode overhead during pruning keys in the memtable (#3415)
* feat: reuse value buf

* feat: skip values to decode

* feat: prune shard

chore: fix compiler errors

refactor: shard prune metrics

* fix: panic on DedupReader::try_new

* fix: prune after next

* chore: num parts metrics

* feat: metrics and logs

* chore: data build cost

* chore: more logs

* feat: cache skip result

* chore: todo

* fix: index out of bound

* test: test codec

* fix: invalid offsets

* fix: skip binary

* fix: offset buffer reuse

* chore: comment

* test: test memtable filter

* style: fix clippy

* chore: fix compiler error
2024-03-08 02:54:00 +00:00
JeremyHi
352bd7b6fd feat: max-txn-ops option (#3458)
* feat: max-txn-ops limit

* chore: by comment
2024-03-08 02:34:40 +00:00
Weny Xu
3f3ef2e7af refactor: separate the quote char and value (#3455)
refactor: use ident instead of string
2024-03-07 08:24:09 +00:00
Weny Xu
a218f12bd9 test: add fuzz test for create table (#3441)
* feat: add create table fuzz test

* chore: add ci cfg for fuzz tests

* refactor: remove redundant nightly config

* chore: run fuzz test in debug mode

* chore: use ubuntu-latest

* fix: close connection

* chore: add cache in fuzz test ci

* chore: apply suggestion from CR

* chore: apply suggestion from CR

* chore: refactor the fuzz test action
2024-03-07 06:51:19 +00:00
ZonaHe
c884c56151 feat: update dashboard to v0.4.8 (#3450)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2024-03-07 04:06:07 +00:00
Weny Xu
9ec288cab9 chore: specify binary name (#3449) 2024-03-07 03:56:24 +00:00
LFC
1f1491e429 feat: impl some "set"s to adapt to some client apps (#3443) 2024-03-06 13:15:48 +00:00
Weny Xu
c52bc613e0 chore: add bin opt to build cmd (#3440) 2024-03-06 08:24:55 +00:00
shuiyisong
a9d42f7b87 fix: add support for influxdb basic auth (#3437) 2024-03-06 03:56:25 +00:00
tison
86ce2d8713 build(deps): upgrade opendal to 0.45.1 (#3432)
* build(deps): upgrade opendal to 0.45.1

Signed-off-by: tison <wander4096@gmail.com>

* Update src/object-store/Cargo.toml

Co-authored-by: Weny Xu <wenymedia@gmail.com>

---------

Signed-off-by: tison <wander4096@gmail.com>
Co-authored-by: Weny Xu <wenymedia@gmail.com>
2024-03-06 03:08:59 +00:00
212 changed files with 9385 additions and 1974 deletions

.editorconfig (new file, 10 lines)
View File

@@ -0,0 +1,10 @@
root = true
[*]
end_of_line = lf
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
[{Makefile,**.mk}]
indent_style = tab

View File

@@ -21,3 +21,6 @@ GT_GCS_CREDENTIAL_PATH = GCS credential path
GT_GCS_ENDPOINT = GCS end point
# Settings for kafka wal test
GT_KAFKA_ENDPOINTS = localhost:9092
# Setting for fuzz tests
GT_MYSQL_ADDR = localhost:4002

View File

@@ -70,7 +70,7 @@ runs:
- name: Build greptime binary
shell: pwsh
run: cargo build --profile ${{ inputs.cargo-profile }} --features ${{ inputs.features }} --target ${{ inputs.arch }}
run: cargo build --profile ${{ inputs.cargo-profile }} --features ${{ inputs.features }} --target ${{ inputs.arch }} --bin greptime
- name: Upload artifacts
uses: ./.github/actions/upload-artifacts

.github/actions/fuzz-test/action.yaml (new file, vendored, 13 lines)
View File

@@ -0,0 +1,13 @@
name: Fuzz Test
description: 'Fuzz test given setup and service'
inputs:
target:
description: "The fuzz target to test"
runs:
using: composite
steps:
- name: Run Fuzz Test
shell: bash
run: cargo fuzz run ${{ inputs.target }} --fuzz-dir tests-fuzz -D -s none -- -max_total_time=120
env:
GT_MYSQL_ADDR: 127.0.0.1:4002

View File

@@ -102,7 +102,7 @@ jobs:
shared-key: "build-binaries"
- name: Build greptime binaries
shell: bash
run: cargo build
run: cargo build --bin greptime --bin sqlness-runner
- name: Pack greptime binaries
shell: bash
run: |
@@ -117,6 +117,46 @@ jobs:
artifacts-dir: bins
version: current
fuzztest:
name: Fuzz Test
needs: build
runs-on: ubuntu-latest
strategy:
matrix:
target: [ "fuzz_create_table", "fuzz_alter_table" ]
steps:
- uses: actions/checkout@v4
- uses: arduino/setup-protoc@v3
- uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ env.RUST_TOOLCHAIN }}
- name: Rust Cache
uses: Swatinem/rust-cache@v2
with:
# Shares across multiple jobs
shared-key: "fuzz-test-targets"
- name: Set Rust Fuzz
shell: bash
run: |
sudo apt update && sudo apt install -y libfuzzer-14-dev
cargo install cargo-fuzz
- name: Download pre-built binaries
uses: actions/download-artifact@v4
with:
name: bins
path: .
- name: Unzip binaries
run: tar -xvf ./bins.tar.gz
- name: Run GreptimeDB
run: |
./bins/greptime standalone start&
- name: Fuzz Test
uses: ./.github/actions/fuzz-test
env:
CUSTOM_LIBFUZZER_PATH: /usr/lib/llvm-14/lib/libFuzzer.a
with:
target: ${{ matrix.target }}
sqlness:
name: Sqlness Test
needs: build
@@ -239,6 +279,10 @@ jobs:
with:
# Shares cross multiple jobs
shared-key: "coverage-test"
- name: Docker Cache
uses: ScribeMD/docker-cache@0.3.7
with:
key: docker-${{ runner.os }}-coverage
- name: Install latest nextest release
uses: taiki-e/install-action@nextest
- name: Install cargo-llvm-cov

.gitignore (vendored, 4 changed lines)
View File

@@ -46,3 +46,7 @@ benchmarks/data
*.code-workspace
venv/
# Fuzz tests
tests-fuzz/artifacts/
tests-fuzz/corpus/

Cargo.lock (generated, 316 changed lines)
View File

@@ -29,6 +29,17 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if 1.0.0",
"cipher",
"cpufeatures",
]
[[package]]
name = "ahash"
version = "0.7.7"
@@ -196,7 +207,7 @@ checksum = "8f1f8f5a6f3d50d89e3797d7593a50f96bb2aaa20ca0cc7be1fb673232c91d72"
[[package]]
name = "api"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"common-base",
"common-decimal",
@@ -241,6 +252,15 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "arbitrary"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110"
dependencies = [
"derive_arbitrary",
]
[[package]]
name = "arc-swap"
version = "1.6.0"
@@ -550,7 +570,6 @@ version = "0.3.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a"
dependencies = [
"brotli",
"bzip2",
"flate2",
"futures-core",
@@ -569,6 +588,7 @@ version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5"
dependencies = [
"brotli",
"bzip2",
"flate2",
"futures-core",
@@ -675,7 +695,7 @@ dependencies = [
[[package]]
name = "auth"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -861,7 +881,7 @@ dependencies = [
[[package]]
name = "benchmarks"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arrow",
"chrono",
@@ -992,6 +1012,15 @@ dependencies = [
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "borsh"
version = "1.3.0"
@@ -1219,7 +1248,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "catalog"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arc-swap",
@@ -1266,6 +1295,15 @@ dependencies = [
"tokio",
]
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.0.83"
@@ -1421,6 +1459,16 @@ dependencies = [
"half 1.8.2",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]]
name = "clang-sys"
version = "1.6.1"
@@ -1510,7 +1558,7 @@ checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
[[package]]
name = "client"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arc-swap",
@@ -1546,7 +1594,7 @@ dependencies = [
"session",
"snafu",
"substrait 0.17.1",
"substrait 0.7.0",
"substrait 0.7.1",
"tokio",
"tokio-stream",
"tonic 0.10.2",
@@ -1576,7 +1624,7 @@ dependencies = [
[[package]]
name = "cmd"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"anymap",
"async-trait",
@@ -1629,7 +1677,7 @@ dependencies = [
"session",
"snafu",
"store-api",
"substrait 0.7.0",
"substrait 0.7.1",
"table",
"temp-env",
"tikv-jemallocator",
@@ -1672,7 +1720,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"
[[package]]
name = "common-base"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"anymap",
"bitvec",
@@ -1687,7 +1735,7 @@ dependencies = [
[[package]]
name = "common-catalog"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"chrono",
"common-error",
@@ -1698,7 +1746,7 @@ dependencies = [
[[package]]
name = "common-config"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"common-base",
"humantime-serde",
@@ -1709,7 +1757,7 @@ dependencies = [
[[package]]
name = "common-datasource"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arrow",
"arrow-schema",
@@ -1741,7 +1789,7 @@ dependencies = [
[[package]]
name = "common-decimal"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arrow",
"bigdecimal",
@@ -1755,7 +1803,7 @@ dependencies = [
[[package]]
name = "common-error"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"snafu",
"strum 0.25.0",
@@ -1763,7 +1811,7 @@ dependencies = [
[[package]]
name = "common-function"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arc-swap",
@@ -1798,7 +1846,7 @@ dependencies = [
[[package]]
name = "common-greptimedb-telemetry"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-trait",
"common-error",
@@ -1817,7 +1865,7 @@ dependencies = [
[[package]]
name = "common-grpc"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arrow-flight",
@@ -1847,7 +1895,7 @@ dependencies = [
[[package]]
name = "common-grpc-expr"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -1866,7 +1914,7 @@ dependencies = [
[[package]]
name = "common-macro"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arc-swap",
"common-query",
@@ -1881,7 +1929,7 @@ dependencies = [
[[package]]
name = "common-mem-prof"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"common-error",
"common-macro",
@@ -1894,7 +1942,7 @@ dependencies = [
[[package]]
name = "common-meta"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-recursion",
@@ -1944,11 +1992,11 @@ dependencies = [
[[package]]
name = "common-plugins"
version = "0.7.0"
version = "0.7.1"
[[package]]
name = "common-procedure"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-stream",
"async-trait",
@@ -1972,7 +2020,7 @@ dependencies = [
[[package]]
name = "common-procedure-test"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-trait",
"common-procedure",
@@ -1980,7 +2028,7 @@ dependencies = [
[[package]]
name = "common-query"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -2003,7 +2051,7 @@ dependencies = [
[[package]]
name = "common-recordbatch"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arc-swap",
"common-base",
@@ -2023,7 +2071,7 @@ dependencies = [
[[package]]
name = "common-runtime"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-trait",
"common-error",
@@ -2043,7 +2091,7 @@ dependencies = [
[[package]]
name = "common-telemetry"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"atty",
"backtrace",
@@ -2071,7 +2119,7 @@ dependencies = [
[[package]]
name = "common-test-util"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"client",
"common-query",
@@ -2083,7 +2131,7 @@ dependencies = [
[[package]]
name = "common-time"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arrow",
"chrono",
@@ -2099,14 +2147,14 @@ dependencies = [
[[package]]
name = "common-version"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"build-data",
]
[[package]]
name = "common-wal"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"common-base",
"common-error",
@@ -2754,7 +2802,7 @@ dependencies = [
[[package]]
name = "datanode"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arrow-flight",
@@ -2812,7 +2860,7 @@ dependencies = [
"snafu",
"sql",
"store-api",
"substrait 0.7.0",
"substrait 0.7.1",
"table",
"tokio",
"tokio-stream",
@@ -2826,7 +2874,7 @@ dependencies = [
[[package]]
name = "datatypes"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arrow",
"arrow-array",
@@ -2912,6 +2960,17 @@ dependencies = [
"syn 2.0.43",
]
[[package]]
name = "derive_arbitrary"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "derive_builder"
version = "0.11.2"
@@ -3302,7 +3361,7 @@ dependencies = [
[[package]]
name = "file-engine"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -3403,7 +3462,7 @@ dependencies = [
[[package]]
name = "flow"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"bimap",
@@ -3415,10 +3474,12 @@ dependencies = [
"common-telemetry",
"common-time",
"datatypes",
"enum_dispatch",
"hydroflow",
"itertools 0.10.5",
"num-traits",
"serde",
"serde_json",
"servers",
"session",
"snafu",
@@ -3458,7 +3519,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"
[[package]]
name = "frontend"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arc-swap",
@@ -3522,7 +3583,7 @@ dependencies = [
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"strfmt",
"substrait 0.7.0",
"substrait 0.7.1",
"table",
"tokio",
"toml 0.8.8",
@@ -4291,7 +4352,7 @@ dependencies = [
[[package]]
name = "index"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-trait",
"asynchronous-codec",
@@ -4406,6 +4467,16 @@ dependencies = [
"libc",
]
[[package]]
name = "inout"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5"
dependencies = [
"block-padding",
"generic-array",
]
[[package]]
name = "instant"
version = "0.1.12"
@@ -4455,11 +4526,12 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3"
[[package]]
name = "iri-string"
version = "0.4.1"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f0f7638c1e223529f1bfdc48c8b133b9e0b434094d1d28473161ee48b235f78"
checksum = "21859b667d66a4c1dacd9df0863b3efb65785474255face87f5bca39dd8407c0"
dependencies = [
"nom",
"memchr",
"serde",
]
[[package]]
@@ -4746,9 +4818,20 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.151"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "libfuzzer-sys"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7"
dependencies = [
"arbitrary",
"cc",
"once_cell",
]
[[package]]
name = "libgit2-sys"
@@ -4848,7 +4931,7 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "log-store"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-stream",
"async-trait",
@@ -5137,7 +5220,7 @@ dependencies = [
[[package]]
name = "meta-client"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -5167,7 +5250,7 @@ dependencies = [
[[package]]
name = "meta-srv"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"anymap",
"api",
@@ -5247,7 +5330,7 @@ dependencies = [
[[package]]
name = "metric-engine"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"aquamarine",
@@ -5319,7 +5402,7 @@ dependencies = [
[[package]]
name = "mito2"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"anymap",
"api",
@@ -5933,7 +6016,7 @@ dependencies = [
[[package]]
name = "object-store"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"anyhow",
"async-trait",
@@ -5989,9 +6072,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "opendal"
version = "0.44.2"
version = "0.45.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4af824652d4d2ffabf606d337a071677ae621b05622adf35df9562f69d9b4498"
checksum = "52c17c077f23fa2d2c25d9d22af98baa43b8bbe2ef0de80cf66339aa70401467"
dependencies = [
"anyhow",
"async-trait",
@@ -6007,7 +6090,7 @@ dependencies = [
"md-5",
"once_cell",
"percent-encoding",
"quick-xml 0.30.0",
"quick-xml 0.31.0",
"reqsign",
"reqwest",
"serde",
@@ -6176,7 +6259,7 @@ dependencies = [
[[package]]
name = "operator"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -6223,7 +6306,7 @@ dependencies = [
"sql",
"sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=6a93567ae38d42be5c8d08b13c8ff4dde26502ef)",
"store-api",
"substrait 0.7.0",
"substrait 0.7.1",
"table",
"tokio",
"tonic 0.10.2",
@@ -6454,7 +6537,7 @@ dependencies = [
[[package]]
name = "partition"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"async-trait",
@@ -6500,6 +6583,16 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8835116a5c179084a830efb3adc117ab007512b535bc1a21c991d3b32a6b44dd"
[[package]]
name = "pbkdf2"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
dependencies = [
"digest",
"hmac",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -6540,6 +6633,12 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "permutation"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7"
[[package]]
name = "pest"
version = "2.7.5"
@@ -6724,6 +6823,21 @@ dependencies = [
"spki 0.7.3",
]
[[package]]
name = "pkcs5"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e847e2c91a18bfa887dd028ec33f2fe6f25db77db3619024764914affe8b69a6"
dependencies = [
"aes",
"cbc",
"der 0.7.8",
"pbkdf2",
"scrypt",
"sha2",
"spki 0.7.3",
]
[[package]]
name = "pkcs8"
version = "0.8.0"
@@ -6742,6 +6856,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7"
dependencies = [
"der 0.7.8",
"pkcs5",
"rand_core",
"spki 0.7.3",
]
@@ -6781,7 +6897,7 @@ dependencies = [
[[package]]
name = "plugins"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"auth",
"common-base",
@@ -7048,7 +7164,7 @@ dependencies = [
[[package]]
name = "promql"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"ahash 0.8.6",
"async-recursion",
@@ -7259,7 +7375,7 @@ dependencies = [
[[package]]
name = "puffin"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-trait",
"bitflags 2.4.1",
@@ -7380,7 +7496,7 @@ dependencies = [
[[package]]
name = "query"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"ahash 0.8.6",
"api",
@@ -7441,7 +7557,7 @@ dependencies = [
"stats-cli",
"store-api",
"streaming-stats",
"substrait 0.7.0",
"substrait 0.7.1",
"table",
"tokio",
"tokio-stream",
@@ -7456,16 +7572,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -7748,9 +7854,9 @@ dependencies = [
[[package]]
name = "reqsign"
version = "0.14.6"
version = "0.14.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dce87f66ba6c6acef277a729f989a0eca946cb9ce6a15bcc036bda0f72d4b9fd"
checksum = "43e319d9de9ff4d941abf4ac718897118b0fe04577ea3f8e0f5788971784eef5"
dependencies = [
"anyhow",
"async-trait",
@@ -7775,7 +7881,6 @@ dependencies = [
"serde_json",
"sha1",
"sha2",
"tokio",
]
[[package]]
@@ -7968,6 +8073,7 @@ dependencies = [
"pkcs1 0.7.5",
"pkcs8 0.10.2",
"rand_core",
"sha2",
"signature",
"spki 0.7.3",
"subtle",
@@ -8702,6 +8808,15 @@ dependencies = [
"bytemuck",
]
[[package]]
name = "salsa20"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213"
dependencies = [
"cipher",
]
[[package]]
name = "same-file"
version = "1.0.6"
@@ -8759,7 +8874,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "script"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arc-swap",
@@ -8815,6 +8930,17 @@ dependencies = [
"tokio-test",
]
[[package]]
name = "scrypt"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f"
dependencies = [
"pbkdf2",
"salsa20",
"sha2",
]
[[package]]
name = "sct"
version = "0.7.1"
@@ -9032,7 +9158,7 @@ dependencies = [
[[package]]
name = "servers"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"aide",
"api",
@@ -9074,6 +9200,7 @@ dependencies = [
"derive_builder 0.12.0",
"digest",
"futures",
"hashbrown 0.14.3",
"headers",
"hex",
"hostname",
@@ -9092,6 +9219,7 @@ dependencies = [
"opensrv-mysql",
"opentelemetry-proto 0.3.0",
"parking_lot 0.12.1",
"permutation",
"pgwire",
"pin-project",
"postgres-types",
@@ -9136,7 +9264,7 @@ dependencies = [
[[package]]
name = "session"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arc-swap",
@@ -9406,7 +9534,7 @@ dependencies = [
[[package]]
name = "sql"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"common-base",
@@ -9458,7 +9586,7 @@ dependencies = [
[[package]]
name = "sqlness-runner"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-trait",
"clap 4.4.11",
@@ -9665,7 +9793,7 @@ dependencies = [
[[package]]
name = "store-api"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"aquamarine",
@@ -9805,7 +9933,7 @@ dependencies = [
[[package]]
name = "substrait"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"async-recursion",
"async-trait",
@@ -9978,7 +10106,7 @@ dependencies = [
[[package]]
name = "table"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"anymap",
"async-trait",
@@ -10090,17 +10218,21 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
[[package]]
name = "tests-fuzz"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"arbitrary",
"async-trait",
"common-error",
"common-macro",
"common-query",
"common-runtime",
"common-telemetry",
"common-time",
"datatypes",
"derive_builder 0.12.0",
"dotenv",
"lazy_static",
"libfuzzer-sys",
"partition",
"rand",
"rand_chacha",
@@ -10115,7 +10247,7 @@ dependencies = [
[[package]]
name = "tests-integration"
version = "0.7.0"
version = "0.7.1"
dependencies = [
"api",
"arrow-flight",
@@ -10172,7 +10304,7 @@ dependencies = [
"sql",
"sqlx",
"store-api",
"substrait 0.7.0",
"substrait 0.7.1",
"table",
"tempfile",
"time",
@@ -10731,13 +10863,13 @@ dependencies = [
[[package]]
name = "tower-http"
version = "0.3.5"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f873044bf02dd1e8239e9c1293ea39dad76dc594ec16185d0a1bf31d8dc8d858"
checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140"
dependencies = [
"async-compression 0.3.15",
"base64 0.13.1",
"bitflags 1.3.2",
"async-compression 0.4.5",
"base64 0.21.5",
"bitflags 2.4.1",
"bytes",
"futures-core",
"futures-util",

View File

@@ -62,7 +62,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.7.0"
version = "0.7.1"
edition = "2021"
license = "Apache-2.0"
@@ -134,7 +134,7 @@ reqwest = { version = "0.11", default-features = false, features = [
rskafka = "0.5"
rust_decimal = "1.33"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_json = { version = "1.0", features = ["float_roundtrip"] }
serde_with = "3"
smallvec = { version = "1", features = ["serde"] }
snafu = "0.7"

View File

@@ -3,6 +3,7 @@ CARGO_PROFILE ?=
FEATURES ?=
TARGET_DIR ?=
TARGET ?=
BUILD_BIN ?= greptime
CARGO_BUILD_OPTS := --locked
IMAGE_REGISTRY ?= docker.io
IMAGE_NAMESPACE ?= greptime
@@ -45,6 +46,10 @@ ifneq ($(strip $(TARGET)),)
CARGO_BUILD_OPTS += --target ${TARGET}
endif
ifneq ($(strip $(BUILD_BIN)),)
CARGO_BUILD_OPTS += --bin ${BUILD_BIN}
endif
ifneq ($(strip $(RELEASE)),)
CARGO_BUILD_OPTS += --release
endif

View File

@@ -29,7 +29,7 @@ use client::api::v1::column::Values;
use client::api::v1::{
Column, ColumnDataType, ColumnDef, CreateTableExpr, InsertRequest, InsertRequests, SemanticType,
};
use client::{Client, Database, Output, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use futures_util::TryStreamExt;
use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
@@ -502,9 +502,9 @@ async fn do_query(num_iter: usize, db: &Database, table_name: &str) {
for i in 0..num_iter {
let now = Instant::now();
let res = db.sql(&query).await.unwrap();
match res {
Output::AffectedRows(_) | Output::RecordBatches(_) => (),
Output::Stream(stream, _) => {
match res.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => (),
OutputData::Stream(stream) => {
stream.try_collect::<Vec<_>>().await.unwrap();
}
}

View File

@@ -0,0 +1,50 @@
# TSBS benchmark - v0.7.0
## Environment
### Local
| | |
| ------ | ---------------------------------- |
| CPU | AMD Ryzen 7 7735HS (8 core 3.2GHz) |
| Memory | 32GB |
| Disk | SOLIDIGM SSDPFKNU010TZ |
| OS | Ubuntu 22.04.2 LTS |
### Amazon EC2
| | |
| ------- | -------------- |
| Machine | c5d.2xlarge |
| CPU | 8 core |
| Memory | 16GB |
| Disk | 50GB (GP3) |
| OS | Ubuntu 22.04.1 |
## Write performance
| Environment | Ingest rate (rows/s) |
| ------------------ | --------------------- |
| Local | 3695814.64 |
| EC2 c5d.2xlarge | 2987166.64 |
## Query performance
| Query type | Local (ms) | EC2 c5d.2xlarge (ms) |
| --------------------- | ---------- | ---------------------- |
| cpu-max-all-1 | 30.56 | 54.74 |
| cpu-max-all-8 | 52.69 | 70.50 |
| double-groupby-1 | 664.30 | 1366.63 |
| double-groupby-5 | 1391.26 | 2141.71 |
| double-groupby-all | 2828.94 | 3389.59 |
| groupby-orderby-limit | 718.92 | 1213.90 |
| high-cpu-1 | 29.21 | 52.98 |
| high-cpu-all | 5514.12 | 7194.91 |
| lastpoint | 7571.40 | 9423.41 |
| single-groupby-1-1-1 | 19.09 | 7.77 |
| single-groupby-1-1-12 | 27.28 | 51.64 |
| single-groupby-1-8-1 | 31.85 | 11.64 |
| single-groupby-5-1-1 | 16.14 | 9.67 |
| single-groupby-5-1-12 | 27.21 | 53.62 |
| single-groupby-5-8-1 | 39.62 | 14.96 |

View File

@@ -79,7 +79,7 @@ This RFC proposes to add a new expression node `MergeScan` to merge result from
│ │ │ │
└─Frontend──────┘ └─Remote-Sources──────────────┘
```
This merge operation simply chains all the the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
This merge operation simply chains all the underlying remote data sources and return `RecordBatch`, just like a coalesce op. And each remote sources is a gRPC query to datanode via the substrait logical plan interface. The plan is transformed and divided from the original query that comes to frontend.
## Commutativity of MergeScan

View File

@@ -307,7 +307,7 @@ impl Database {
reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
}
);
Ok(Output::AffectedRows(rows))
Ok(Output::new_with_affected_rows(rows))
}
FlightMessage::Recordbatch(_) | FlightMessage::Metrics(_) => {
IllegalFlightMessagesSnafu {
@@ -340,7 +340,7 @@ impl Database {
output_ordering: None,
metrics: Default::default(),
};
Ok(Output::new_stream(Box::pin(record_batch_stream)))
Ok(Output::new_with_stream(Box::pin(record_batch_stream)))
}
}
}

View File

@@ -26,7 +26,7 @@ use api::v1::greptime_response::Response;
use api::v1::{AffectedRows, GreptimeResponse};
pub use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_error::status_code::StatusCode;
pub use common_query::Output;
pub use common_query::{Output, OutputData, OutputMeta};
pub use common_recordbatch::{RecordBatches, SendableRecordBatchStream};
use snafu::OptionExt;

View File

@@ -62,7 +62,9 @@ pub struct BenchTableMetadataCommand {
impl BenchTableMetadataCommand {
pub async fn build(&self) -> Result<Instance> {
let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr]).await.unwrap();
let etcd_store = EtcdStore::with_endpoints([&self.etcd_addr], 128)
.await
.unwrap();
let table_metadata_manager = Arc::new(TableMetadataManager::new(etcd_store));

View File

@@ -19,8 +19,7 @@ use async_trait::async_trait;
use clap::{Parser, ValueEnum};
use client::api::v1::auth_header::AuthScheme;
use client::api::v1::Basic;
use client::{Client, Database, DEFAULT_SCHEMA_NAME};
use common_query::Output;
use client::{Client, Database, OutputData, DEFAULT_SCHEMA_NAME};
use common_recordbatch::util::collect;
use common_telemetry::{debug, error, info, warn};
use datatypes::scalars::ScalarVector;
@@ -142,7 +141,7 @@ impl Export {
.with_context(|_| RequestDatabaseSnafu {
sql: "show databases".to_string(),
})?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)
@@ -183,7 +182,7 @@ impl Export {
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let Some(record_batch) = collect(stream)
@@ -235,7 +234,7 @@ impl Export {
.sql(&sql)
.await
.with_context(|_| RequestDatabaseSnafu { sql })?;
let Output::Stream(stream, _) = result else {
let OutputData::Stream(stream) = result.data else {
NotDataFromOutputSnafu.fail()?
};
let record_batch = collect(stream)

View File

@@ -19,7 +19,7 @@ use std::time::Instant;
use catalog::kvbackend::{
CachedMetaKvBackend, CachedMetaKvBackendBuilder, KvBackendCatalogManager,
};
use client::{Client, Database, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_base::Plugins;
use common_error::ext::ErrorExt;
use common_query::Output;
@@ -184,15 +184,15 @@ impl Repl {
}
.context(RequestDatabaseSnafu { sql: &sql })?;
let either = match output {
Output::Stream(s, _) => {
let either = match output.data {
OutputData::Stream(s) => {
let x = RecordBatches::try_collect(s)
.await
.context(CollectRecordBatchesSnafu)?;
Either::Left(x)
}
Output::RecordBatches(x) => Either::Left(x),
Output::AffectedRows(rows) => Either::Right(rows),
OutputData::RecordBatches(x) => Either::Left(x),
OutputData::AffectedRows(rows) => Either::Right(rows),
};
let end = Instant::now();
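
The hunks above (the TSBS benchmarker, client/src/database.rs, and the cli export/repl tools) all apply the same mechanical migration from matching on `Output` to matching on `output.data`, introduced by the `Output`/`OutputMeta` refactor in #3466. Below is a toy, self-contained model of the new shape; the field contents are made up purely for illustration and are not the real `common_query` definitions.

```rust
// Toy stand-ins for common_query's Output / OutputData / OutputMeta,
// only to illustrate how the call sites in the hunks above change.
#[derive(Debug)]
enum OutputData {
    AffectedRows(usize),
    RecordBatches(Vec<String>), // stand-in for RecordBatches
    Stream(Vec<String>),        // stand-in for SendableRecordBatchStream
}

#[derive(Debug, Default)]
struct OutputMeta {
    plan: Option<String>, // hypothetical metadata slot, not the real field set
}

struct Output {
    data: OutputData,
    meta: OutputMeta,
}

impl Output {
    fn new_with_affected_rows(rows: usize) -> Self {
        Output {
            data: OutputData::AffectedRows(rows),
            meta: OutputMeta::default(),
        }
    }
}

fn main() {
    let output = Output::new_with_affected_rows(3);
    println!("meta: {:?}", output.meta);
    // Call sites now destructure `output.data` instead of `Output` itself.
    match output.data {
        OutputData::AffectedRows(rows) => println!("affected rows: {rows}"),
        OutputData::RecordBatches(b) | OutputData::Stream(b) => println!("{} batches", b.len()),
    }
}
```
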

View File

@@ -70,7 +70,7 @@ impl UpgradeCommand {
etcd_addr: &self.etcd_addr,
})?;
let tool = MigrateTableMetadata {
etcd_store: EtcdStore::with_etcd_client(client),
etcd_store: EtcdStore::with_etcd_client(client, 128),
dryrun: self.dryrun,
skip_catalog_keys: self.skip_catalog_keys,
skip_table_global_keys: self.skip_table_global_keys,

View File

@@ -117,10 +117,12 @@ struct StartCommand {
/// The working home directory of this metasrv instance.
#[clap(long)]
data_home: Option<String>,
/// If it's not empty, the metasrv will store all data with this key prefix.
#[clap(long, default_value = "")]
store_key_prefix: String,
/// The max operations per txn
#[clap(long)]
max_txn_ops: Option<usize>,
}
impl StartCommand {
@@ -181,6 +183,10 @@ impl StartCommand {
opts.store_key_prefix = self.store_key_prefix.clone()
}
if let Some(max_txn_ops) = self.max_txn_ops {
opts.max_txn_ops = max_txn_ops;
}
// Disable dashboard in metasrv.
opts.http.disable_dashboard = true;

View File

@@ -28,12 +28,15 @@ const REGION: &str = "region";
const ENABLE_VIRTUAL_HOST_STYLE: &str = "enable_virtual_host_style";
pub fn is_supported_in_s3(key: &str) -> bool {
key == ENDPOINT
|| key == ACCESS_KEY_ID
|| key == SECRET_ACCESS_KEY
|| key == SESSION_TOKEN
|| key == REGION
|| key == ENABLE_VIRTUAL_HOST_STYLE
[
ENDPOINT,
ACCESS_KEY_ID,
SECRET_ACCESS_KEY,
SESSION_TOKEN,
REGION,
ENABLE_VIRTUAL_HOST_STYLE,
]
.contains(&key)
}
pub fn build_s3_backend(

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod clamp;
mod modulo;
mod pow;
mod rate;
@@ -19,6 +20,7 @@ mod rate;
use std::fmt;
use std::sync::Arc;
pub use clamp::ClampFunction;
use common_query::error::{GeneralDataFusionSnafu, Result};
use common_query::prelude::Signature;
use datafusion::error::DataFusionError;
@@ -40,7 +42,8 @@ impl MathFunction {
registry.register(Arc::new(ModuloFunction));
registry.register(Arc::new(PowFunction));
registry.register(Arc::new(RateFunction));
registry.register(Arc::new(RangeFunction))
registry.register(Arc::new(RangeFunction));
registry.register(Arc::new(ClampFunction));
}
}

View File

@@ -0,0 +1,403 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt::{self, Display};
use std::sync::Arc;
use common_query::error::{InvalidFuncArgsSnafu, Result};
use common_query::prelude::Signature;
use datafusion::arrow::array::{ArrayIter, PrimitiveArray};
use datafusion::logical_expr::Volatility;
use datatypes::data_type::{ConcreteDataType, DataType};
use datatypes::prelude::VectorRef;
use datatypes::types::LogicalPrimitiveType;
use datatypes::value::TryAsPrimitive;
use datatypes::vectors::PrimitiveVector;
use datatypes::with_match_primitive_type_id;
use snafu::{ensure, OptionExt};
use crate::function::Function;
#[derive(Clone, Debug, Default)]
pub struct ClampFunction;
const CLAMP_NAME: &str = "clamp";
impl Function for ClampFunction {
fn name(&self) -> &str {
CLAMP_NAME
}
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
// Type check is done by `signature`
Ok(input_types[0].clone())
}
fn signature(&self) -> Signature {
// input, min, max
Signature::uniform(3, ConcreteDataType::numerics(), Volatility::Immutable)
}
fn eval(
&self,
_func_ctx: crate::function::FunctionContext,
columns: &[VectorRef],
) -> Result<VectorRef> {
ensure!(
columns.len() == 3,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly 3, have: {}",
columns.len()
),
}
);
ensure!(
columns[0].data_type().is_numeric(),
InvalidFuncArgsSnafu {
err_msg: format!(
"The first arg's type is not numeric, have: {}",
columns[0].data_type()
),
}
);
ensure!(
columns[0].data_type() == columns[1].data_type()
&& columns[1].data_type() == columns[2].data_type(),
InvalidFuncArgsSnafu {
err_msg: format!(
"Arguments don't have identical types: {}, {}, {}",
columns[0].data_type(),
columns[1].data_type(),
columns[2].data_type()
),
}
);
ensure!(
columns[1].len() == 1 && columns[2].len() == 1,
InvalidFuncArgsSnafu {
err_msg: format!(
"The second and third args should be scalar, have: {:?}, {:?}",
columns[1], columns[2]
),
}
);
with_match_primitive_type_id!(columns[0].data_type().logical_type_id(), |$S| {
let input_array = columns[0].to_arrow_array();
let input = input_array
.as_any()
.downcast_ref::<PrimitiveArray<<$S as LogicalPrimitiveType>::ArrowPrimitive>>()
.unwrap();
let min = TryAsPrimitive::<$S>::try_as_primitive(&columns[1].get(0))
.with_context(|| {
InvalidFuncArgsSnafu {
err_msg: "The second arg should not be none",
}
})?;
let max = TryAsPrimitive::<$S>::try_as_primitive(&columns[2].get(0))
.with_context(|| {
InvalidFuncArgsSnafu {
err_msg: "The third arg should not be none",
}
})?;
// ensure min <= max
ensure!(
min <= max,
InvalidFuncArgsSnafu {
err_msg: format!(
"The second arg should be less than or equal to the third arg, have: {:?}, {:?}",
columns[1], columns[2]
),
}
);
clamp_impl::<$S, true, true>(input, min, max)
},{
unreachable!()
})
}
}
impl Display for ClampFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", CLAMP_NAME.to_ascii_uppercase())
}
}
fn clamp_impl<T: LogicalPrimitiveType, const CLAMP_MIN: bool, const CLAMP_MAX: bool>(
input: &PrimitiveArray<T::ArrowPrimitive>,
min: T::Native,
max: T::Native,
) -> Result<VectorRef> {
common_telemetry::info!("[DEBUG] min {min:?}, max {max:?}");
let iter = ArrayIter::new(input);
let result = iter.map(|x| {
x.map(|x| {
if CLAMP_MIN && x < min {
min
} else if CLAMP_MAX && x > max {
max
} else {
x
}
})
});
let result = PrimitiveArray::<T::ArrowPrimitive>::from_iter(result);
Ok(Arc::new(PrimitiveVector::<T>::from(result)))
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::{
ConstantVector, Float64Vector, Int64Vector, StringVector, UInt64Vector,
};
use super::*;
use crate::function::FunctionContext;
#[test]
fn clamp_i64() {
let inputs = [
(
vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
-1,
10,
vec![Some(-1), Some(-1), Some(-1), Some(0), Some(1), Some(2)],
),
(
vec![Some(-3), Some(-2), Some(-1), Some(0), Some(1), Some(2)],
0,
0,
vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
),
(
vec![Some(-3), None, Some(-1), None, None, Some(2)],
-2,
1,
vec![Some(-2), None, Some(-1), None, None, Some(1)],
),
(
vec![None, None, None, None, None],
0,
1,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(Int64Vector::from(in_data)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(Int64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Int64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_u64() {
let inputs = [
(
vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
1,
3,
vec![Some(1), Some(1), Some(2), Some(3), Some(3), Some(3)],
),
(
vec![Some(0), Some(1), Some(2), Some(3), Some(4), Some(5)],
0,
0,
vec![Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)],
),
(
vec![Some(0), None, Some(2), None, None, Some(5)],
1,
3,
vec![Some(1), None, Some(2), None, None, Some(3)],
),
(
vec![None, None, None, None, None],
0,
1,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(UInt64Vector::from(in_data)) as _,
Arc::new(UInt64Vector::from_vec(vec![min])) as _,
Arc::new(UInt64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(UInt64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_f64() {
let inputs = [
(
vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
-1.0,
10.0,
vec![Some(-1.0), Some(-1.0), Some(-1.0), Some(0.0), Some(1.0)],
),
(
vec![Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)],
0.0,
0.0,
vec![Some(0.0), Some(0.0), Some(0.0), Some(0.0)],
),
(
vec![Some(-3.0), None, Some(-1.0), None, None, Some(2.0)],
-2.0,
1.0,
vec![Some(-2.0), None, Some(-1.0), None, None, Some(1.0)],
),
(
vec![None, None, None, None, None],
0.0,
1.0,
vec![None, None, None, None, None],
),
];
let func = ClampFunction;
for (in_data, min, max, expected) in inputs {
let args = [
Arc::new(Float64Vector::from(in_data)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Float64Vector::from(expected));
assert_eq!(expected, result);
}
}
#[test]
fn clamp_const_i32() {
let input = vec![Some(5)];
let min = 2;
let max = 4;
let func = ClampFunction;
let args = [
Arc::new(ConstantVector::new(Arc::new(Int64Vector::from(input)), 1)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(Int64Vector::from_vec(vec![max])) as _,
];
let result = func
.eval(FunctionContext::default(), args.as_slice())
.unwrap();
let expected: VectorRef = Arc::new(Int64Vector::from(vec![Some(4)]));
assert_eq!(expected, result);
}
#[test]
fn clamp_invalid_min_max() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = 10.0;
let max = -1.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_type_not_match() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -1;
let max = 10;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Int64Vector::from_vec(vec![min])) as _,
Arc::new(UInt64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_min_is_not_scalar() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -10.0;
let max = 1.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min, min])) as _,
Arc::new(Float64Vector::from_vec(vec![max])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_no_max() {
let input = vec![Some(-3.0), Some(-2.0), Some(-1.0), Some(0.0), Some(1.0)];
let min = -10.0;
let func = ClampFunction;
let args = [
Arc::new(Float64Vector::from(input)) as _,
Arc::new(Float64Vector::from_vec(vec![min])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
#[test]
fn clamp_on_string() {
let input = vec![Some("foo"), Some("foo"), Some("foo"), Some("foo")];
let func = ClampFunction;
let args = [
Arc::new(StringVector::from(input)) as _,
Arc::new(StringVector::from_vec(vec!["bar"])) as _,
Arc::new(StringVector::from_vec(vec!["baz"])) as _,
];
let result = func.eval(FunctionContext::default(), args.as_slice());
assert!(result.is_err());
}
}
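
For readers skimming the new file above, the element-wise rule that `clamp_impl` applies can be reduced to the following self-contained sketch, with a plain `Vec<Option<i64>>` standing in for the Arrow `PrimitiveArray` (an illustration only, not code from the diff):

```rust
/// Clamp every non-null value into [min, max]; nulls pass through untouched,
/// mirroring the Option handling inside `clamp_impl` above.
fn clamp_values(input: &[Option<i64>], min: i64, max: i64) -> Vec<Option<i64>> {
    // Mirrors the `ensure!(min <= max, ...)` check in `ClampFunction::eval`.
    assert!(min <= max, "min must not exceed max");
    input
        .iter()
        .copied()
        .map(|v| v.map(|x| if x < min { min } else if x > max { max } else { x }))
        .collect()
}

fn main() {
    let input = [Some(-3), None, Some(0), Some(7)];
    // Values are clamped into [-1, 5]; the None stays None.
    assert_eq!(
        clamp_values(&input, -1, 5),
        vec![Some(-1), None, Some(0), Some(5)]
    );
    println!("ok");
}
```
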

View File

@@ -14,9 +14,11 @@
use std::sync::Arc;
mod greatest;
mod to_timezone;
mod to_unixtime;
use greatest::GreatestFunction;
use to_timezone::ToTimezoneFunction;
use to_unixtime::ToUnixtimeFunction;
use crate::function_registry::FunctionRegistry;
@@ -25,6 +27,7 @@ pub(crate) struct TimestampFunction;
impl TimestampFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register(Arc::new(ToTimezoneFunction));
registry.register(Arc::new(ToUnixtimeFunction));
registry.register(Arc::new(GreatestFunction));
}

View File

@@ -0,0 +1,260 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::fmt;
use std::sync::Arc;
use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu};
use common_query::prelude::Signature;
use common_time::{Timestamp, Timezone};
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::VectorRef;
use datatypes::types::TimestampType;
use datatypes::value::Value;
use datatypes::vectors::{
StringVector, TimestampMicrosecondVector, TimestampMillisecondVector,
TimestampNanosecondVector, TimestampSecondVector, Vector,
};
use snafu::{ensure, OptionExt};
use crate::function::{Function, FunctionContext};
use crate::helper;
#[derive(Clone, Debug, Default)]
pub struct ToTimezoneFunction;
const NAME: &str = "to_timezone";
fn convert_to_timezone(arg: &str) -> Option<Timezone> {
Timezone::from_tz_string(arg).ok()
}
fn convert_to_timestamp(arg: &Value) -> Option<Timestamp> {
match arg {
Value::Timestamp(ts) => Some(*ts),
_ => None,
}
}
impl fmt::Display for ToTimezoneFunction {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "TO_TIMEZONE")
}
}
impl Function for ToTimezoneFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, input_types: &[ConcreteDataType]) -> Result<ConcreteDataType> {
// type checked by signature - MUST BE timestamp
Ok(input_types[0].clone())
}
fn signature(&self) -> Signature {
helper::one_of_sigs2(
vec![
ConcreteDataType::timestamp_second_datatype(),
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::timestamp_microsecond_datatype(),
ConcreteDataType::timestamp_nanosecond_datatype(),
],
vec![ConcreteDataType::string_datatype()],
)
}
fn eval(&self, _ctx: FunctionContext, columns: &[VectorRef]) -> Result<VectorRef> {
ensure!(
columns.len() == 2,
InvalidFuncArgsSnafu {
err_msg: format!(
"The length of the args is not correct, expect exactly 2, have: {}",
columns.len()
),
}
);
// TODO: maybe support epoch timestamp? https://github.com/GreptimeTeam/greptimedb/issues/3477
let ts = columns[0].data_type().as_timestamp().with_context(|| {
UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
}
})?;
let array = columns[0].to_arrow_array();
let times = match ts {
TimestampType::Second(_) => {
let vector = TimestampSecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Millisecond(_) => {
let vector = TimestampMillisecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Microsecond(_) => {
let vector = TimestampMicrosecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
TimestampType::Nanosecond(_) => {
let vector = TimestampNanosecondVector::try_from_arrow_array(array).unwrap();
(0..vector.len())
.map(|i| convert_to_timestamp(&vector.get(i)))
.collect::<Vec<_>>()
}
};
let tzs = {
let array = columns[1].to_arrow_array();
let vector = StringVector::try_from_arrow_array(&array)
.ok()
.with_context(|| UnsupportedInputDataTypeSnafu {
function: NAME,
datatypes: columns.iter().map(|c| c.data_type()).collect::<Vec<_>>(),
})?;
(0..vector.len())
.map(|i| convert_to_timezone(&vector.get(i).to_string()))
.collect::<Vec<_>>()
};
let result = times
.iter()
.zip(tzs.iter())
.map(|(time, tz)| match (time, tz) {
(Some(time), _) => Some(time.to_timezone_aware_string(tz.as_ref())),
_ => None,
})
.collect::<Vec<Option<String>>>();
Ok(Arc::new(StringVector::from(result)))
}
}
#[cfg(test)]
mod tests {
use datatypes::scalars::ScalarVector;
use datatypes::timestamp::{
TimestampMicrosecond, TimestampMillisecond, TimestampNanosecond, TimestampSecond,
};
use datatypes::vectors::StringVector;
use super::*;
#[test]
fn test_timestamp_to_timezone() {
let f = ToTimezoneFunction;
assert_eq!("to_timezone", f.name());
let results = vec![
Some("1969-12-31 19:00:01"),
None,
Some("1970-01-01 03:00:01"),
None,
];
let times: Vec<Option<TimestampSecond>> = vec![
Some(TimestampSecond::new(1)),
None,
Some(TimestampSecond::new(1)),
None,
];
let ts_vector: TimestampSecondVector =
TimestampSecondVector::from_owned_iterator(times.into_iter());
let tzs = vec![Some("America/New_York"), None, Some("Europe/Moscow"), None];
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.001"),
None,
Some("1970-01-01 03:00:00.001"),
None,
];
let times: Vec<Option<TimestampMillisecond>> = vec![
Some(TimestampMillisecond::new(1)),
None,
Some(TimestampMillisecond::new(1)),
None,
];
let ts_vector: TimestampMillisecondVector =
TimestampMillisecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.000001"),
None,
Some("1970-01-01 03:00:00.000001"),
None,
];
let times: Vec<Option<TimestampMicrosecond>> = vec![
Some(TimestampMicrosecond::new(1)),
None,
Some(TimestampMicrosecond::new(1)),
None,
];
let ts_vector: TimestampMicrosecondVector =
TimestampMicrosecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
let results = vec![
Some("1969-12-31 19:00:00.000000001"),
None,
Some("1970-01-01 03:00:00.000000001"),
None,
];
let times: Vec<Option<TimestampNanosecond>> = vec![
Some(TimestampNanosecond::new(1)),
None,
Some(TimestampNanosecond::new(1)),
None,
];
let ts_vector: TimestampNanosecondVector =
TimestampNanosecondVector::from_owned_iterator(times.into_iter());
let args: Vec<VectorRef> = vec![
Arc::new(ts_vector),
Arc::new(StringVector::from(tzs.clone())),
];
let vector = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(4, vector.len());
let expect_times: VectorRef = Arc::new(StringVector::from(results));
assert_eq!(expect_times, vector);
}
}
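A minimal usage sketch of the function above (reusing the same test imports; the timezone string here is deliberately unparsable, so `convert_to_timezone` yields `None` and the row is formatted via `to_timezone_aware_string(None)` rather than returning an error):
let f = ToTimezoneFunction;
let times = vec![Some(TimestampSecond::new(0))];
let args: Vec<VectorRef> = vec![
    Arc::new(TimestampSecondVector::from_owned_iterator(times.into_iter())),
    Arc::new(StringVector::from(vec![Some("not/a/timezone")])),
];
// The call succeeds and yields one formatted string instead of an error.
let out = f.eval(FunctionContext::default(), &args).unwrap();
assert_eq!(1, out.len());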

View File

@@ -32,7 +32,7 @@ macro_rules! ok {
};
}
/// Internal util macro to to create an error.
/// Internal util macro to create an error.
macro_rules! error {
($span:expr, $msg: expr) => {
Err(syn::Error::new($span, $msg))

View File

@@ -67,6 +67,14 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to execute {} txn operations via Etcd", max_operations))]
EtcdTxnFailed {
max_operations: usize,
#[snafu(source)]
error: etcd_client::Error,
location: Location,
},
#[snafu(display("Failed to get sequence: {}", err_msg))]
NextSequence { err_msg: String, location: Location },
@@ -400,6 +408,7 @@ impl ErrorExt for Error {
IllegalServerState { .. }
| EtcdTxnOpResponse { .. }
| EtcdFailed { .. }
| EtcdTxnFailed { .. }
| ConnectEtcd { .. } => StatusCode::Internal,
SerdeJson { .. }

View File

@@ -464,7 +464,7 @@ impl TableMetadataManager {
pub fn max_logical_tables_per_batch(&self) -> usize {
// The batch size is max_txn_ops / 3 because the number of txn operations
// is 3 times the number of `tables_data` entries.
self.kv_backend.max_txn_size() / 3
self.kv_backend.max_txn_ops() / 3
}
/// Creates metadata for multiple logical tables and returns an error if different metadata exists.
@@ -860,6 +860,7 @@ mod tests {
use bytes::Bytes;
use common_time::util::current_time_millis;
use futures::TryStreamExt;
use store_api::storage::RegionId;
use table::metadata::{RawTableInfo, TableInfo};
use super::datanode_table::DatanodeTableKey;
@@ -1056,6 +1057,36 @@ mod tests {
);
}
#[tokio::test]
async fn test_create_many_logical_tables_metadata() {
let kv_backend = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(kv_backend);
let mut tables_data = vec![];
for i in 0..128 {
let table_id = i + 1;
let region_number = table_id * 3;
let region_id = RegionId::new(table_id, region_number);
let region_route = new_region_route(region_id.as_u64(), 2);
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo = test_utils::new_test_table_info_with_name(
table_id,
&format!("my_table_{}", table_id),
region_routes.iter().map(|r| r.region.id.region_number()),
)
.into();
let table_route_value = TableRouteValue::physical(region_routes.clone());
tables_data.push((table_info, table_route_value));
}
// creates metadata.
table_metadata_manager
.create_logical_tables_metadata(tables_data)
.await
.unwrap();
}
#[tokio::test]
async fn test_delete_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());

View File

@@ -19,8 +19,9 @@ use datatypes::schema::{ColumnSchema, SchemaBuilder};
use store_api::storage::TableId;
use table::metadata::{TableInfo, TableInfoBuilder, TableMetaBuilder};
pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
pub fn new_test_table_info_with_name<I: IntoIterator<Item = u32>>(
table_id: TableId,
table_name: &str,
region_numbers: I,
) -> TableInfo {
let column_schemas = vec![
@@ -50,8 +51,14 @@ pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
TableInfoBuilder::default()
.table_id(table_id)
.table_version(5)
.name("mytable")
.name(table_name)
.meta(meta)
.build()
.unwrap()
}
pub fn new_test_table_info<I: IntoIterator<Item = u32>>(
table_id: TableId,
region_numbers: I,
) -> TableInfo {
new_test_table_info_with_name(table_id, "mytable", region_numbers)
}

View File

@@ -45,6 +45,10 @@ impl TxnService for ChrootKvBackend {
let txn_res = self.inner.txn(txn).await?;
Ok(self.chroot_txn_response(txn_res))
}
fn max_txn_ops(&self) -> usize {
self.inner.max_txn_ops()
}
}
#[async_trait::async_trait]

View File

@@ -33,12 +33,6 @@ use crate::rpc::store::{
};
use crate::rpc::KeyValue;
// Maximum number of operations permitted in a transaction.
// The etcd default configuration's `--max-txn-ops` is 128.
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
const MAX_TXN_SIZE: usize = 128;
fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
let (key, value) = kv.into_key_value();
KeyValue { key, value }
@@ -46,10 +40,15 @@ fn convert_key_value(kv: etcd_client::KeyValue) -> KeyValue {
pub struct EtcdStore {
client: Client,
// Maximum number of operations permitted in a transaction.
// The etcd default configuration's `--max-txn-ops` is 128.
//
// For more detail, see: https://etcd.io/docs/v3.5/op-guide/configuration/
max_txn_ops: usize,
}
impl EtcdStore {
pub async fn with_endpoints<E, S>(endpoints: S) -> Result<KvBackendRef>
pub async fn with_endpoints<E, S>(endpoints: S, max_txn_ops: usize) -> Result<KvBackendRef>
where
E: AsRef<str>,
S: AsRef<[E]>,
@@ -58,16 +57,19 @@ impl EtcdStore {
.await
.context(error::ConnectEtcdSnafu)?;
Ok(Self::with_etcd_client(client))
Ok(Self::with_etcd_client(client, max_txn_ops))
}
pub fn with_etcd_client(client: Client) -> KvBackendRef {
Arc::new(Self { client })
pub fn with_etcd_client(client: Client, max_txn_ops: usize) -> KvBackendRef {
Arc::new(Self {
client,
max_txn_ops,
})
}
async fn do_multi_txn(&self, txn_ops: Vec<TxnOp>) -> Result<Vec<TxnResponse>> {
let max_txn_size = self.max_txn_size();
if txn_ops.len() < max_txn_size {
let max_txn_ops = self.max_txn_ops();
if txn_ops.len() < max_txn_ops {
// fast path
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&["etcd", "txn"])
@@ -83,7 +85,7 @@ impl EtcdStore {
}
let txns = txn_ops
.chunks(max_txn_size)
.chunks(max_txn_ops)
.map(|part| async move {
let _timer = METRIC_META_TXN_REQUEST
.with_label_values(&["etcd", "txn"])
@@ -311,18 +313,20 @@ impl TxnService for EtcdStore {
.with_label_values(&["etcd", "txn"])
.start_timer();
let max_operations = txn.max_operations();
let etcd_txn: Txn = txn.into();
let txn_res = self
.client
.kv_client()
.txn(etcd_txn)
.await
.context(error::EtcdFailedSnafu)?;
.context(error::EtcdTxnFailedSnafu { max_operations })?;
txn_res.try_into()
}
fn max_txn_size(&self) -> usize {
MAX_TXN_SIZE
fn max_txn_ops(&self) -> usize {
self.max_txn_ops
}
}
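A small sketch of the chunking behaviour in `do_multi_txn` above (the operation count 300 is hypothetical): with `max_txn_ops = 128`, the slow path splits the operations into ceil(300 / 128) = 3 etcd transactions.
let max_txn_ops = 128usize;
let txn_ops: Vec<u32> = (0..300).collect();
// `chunks` mirrors how do_multi_txn batches operations before sending them to etcd.
let sizes: Vec<usize> = txn_ops.chunks(max_txn_ops).map(|c| c.len()).collect();
assert_eq!(sizes, vec![128, 128, 44]);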

View File

@@ -323,6 +323,10 @@ impl<T: ErrorExt + Send + Sync> TxnService for MemoryKvBackend<T> {
responses,
})
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
impl<T: ErrorExt + Send + Sync + 'static> ResettableKvBackend for MemoryKvBackend<T> {

View File

@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::cmp::max;
use common_error::ext::ErrorExt;
use crate::rpc::store::{DeleteRangeResponse, PutResponse, RangeResponse};
@@ -27,8 +29,8 @@ pub trait TxnService: Sync + Send {
}
/// Maximum number of operations permitted in a transaction.
fn max_txn_size(&self) -> usize {
usize::MAX
fn max_txn_ops(&self) -> usize {
unimplemented!("txn is not implemented")
}
}
@@ -192,6 +194,12 @@ impl Txn {
self.req.failure = operations.into();
self
}
#[inline]
pub fn max_operations(&self) -> usize {
let opc = max(self.req.compare.len(), self.req.success.len());
max(opc, self.req.failure.len())
}
}
impl From<Txn> for TxnRequest {

View File

@@ -34,10 +34,14 @@ pub struct SequenceBuilder {
max: u64,
}
fn seq_name(name: impl AsRef<str>) -> String {
format!("{}-{}", SEQ_PREFIX, name.as_ref())
}
impl SequenceBuilder {
pub fn new(name: impl AsRef<str>, generator: KvBackendRef) -> Self {
Self {
name: format!("{}-{}", SEQ_PREFIX, name.as_ref()),
name: seq_name(name),
initial: 0,
step: 1,
generator,
@@ -138,13 +142,14 @@ impl Inner {
pub async fn next_range(&self) -> Result<Range<u64>> {
let key = self.name.as_bytes();
let mut start = self.next;
for _ in 0..self.force_quit {
let expect = if start == self.initial {
vec![]
} else {
u64::to_le_bytes(start).to_vec()
};
let mut expect = if start == self.initial {
vec![]
} else {
u64::to_le_bytes(start).to_vec()
};
for _ in 0..self.force_quit {
let step = self.step.min(self.max - start);
ensure!(
@@ -167,15 +172,24 @@ impl Inner {
if !res.success {
if let Some(kv) = res.prev_kv {
let value = kv.value;
ensure!(
value.len() == std::mem::size_of::<u64>(),
error::UnexpectedSequenceValueSnafu {
err_msg: format!("key={}, unexpected value={:?}", self.name, value)
expect = kv.value.clone();
let v: [u8; 8] = match kv.value.try_into() {
Ok(a) => a,
Err(v) => {
return error::UnexpectedSequenceValueSnafu {
err_msg: format!("Not a valid u64 for '{}': {v:?}", self.name),
}
.fail()
}
);
start = u64::from_le_bytes(value.try_into().unwrap());
};
let v = u64::from_le_bytes(v);
// If the existing value is smaller than the initial, start from the initial.
start = v.max(self.initial);
} else {
expect = vec![];
start = self.initial;
}
continue;
@@ -197,8 +211,12 @@ impl Inner {
#[cfg(test)]
mod tests {
use std::any::Any;
use std::collections::HashSet;
use std::sync::Arc;
use itertools::{Itertools, MinMaxResult};
use tokio::sync::mpsc;
use super::*;
use crate::error::Error;
use crate::kv_backend::memory::MemoryKvBackend;
@@ -209,6 +227,76 @@ mod tests {
DeleteRangeResponse, PutRequest, PutResponse, RangeRequest, RangeResponse,
};
#[tokio::test]
async fn test_sequence_with_existed_value() {
async fn test(exist: u64, expected: Vec<u64>) {
let kv_backend = Arc::new(MemoryKvBackend::default());
let exist = u64::to_le_bytes(exist);
kv_backend
.put(PutRequest::new().with_key(seq_name("s")).with_value(exist))
.await
.unwrap();
let initial = 100;
let seq = SequenceBuilder::new("s", kv_backend)
.initial(initial)
.build();
let mut actual = Vec::with_capacity(expected.len());
for _ in 0..expected.len() {
actual.push(seq.next().await.unwrap());
}
assert_eq!(actual, expected);
}
// put a value not greater than the "initial", the sequence should start from "initial"
test(1, vec![100, 101, 102]).await;
test(100, vec![100, 101, 102]).await;
// put a value greater than the "initial", the sequence should start from the put value
test(200, vec![200, 201, 202]).await;
}
#[tokio::test(flavor = "multi_thread")]
async fn test_sequence_with_contention() {
let seq = Arc::new(
SequenceBuilder::new("s", Arc::new(MemoryKvBackend::default()))
.initial(1024)
.build(),
);
let (tx, mut rx) = mpsc::unbounded_channel();
// Spawn 10 tasks to concurrently get the next sequence. Each task will get 100 sequences.
for _ in 0..10 {
tokio::spawn({
let seq = seq.clone();
let tx = tx.clone();
async move {
for _ in 0..100 {
tx.send(seq.next().await.unwrap()).unwrap()
}
}
});
}
// Test that we get 1000 unique sequences, ranging from 1024 to 2023.
let mut nums = HashSet::new();
let mut c = 0;
while c < 1000
&& let Some(x) = rx.recv().await
{
nums.insert(x);
c += 1;
}
assert_eq!(nums.len(), 1000);
let MinMaxResult::MinMax(min, max) = nums.iter().minmax() else {
unreachable!("nums has more than one element");
};
assert_eq!(*min, 1024);
assert_eq!(*max, 2023);
}
#[tokio::test]
async fn test_sequence() {
let kv_backend = Arc::new(MemoryKvBackend::default());

View File

@@ -152,7 +152,7 @@ impl Runner {
guard.key_guards.push(key_guard);
}
// Execute the procedure. We need to release the lock whenever the the execution
// Execute the procedure. We need to release the lock whenever the execution
// is successful or fail.
self.execute_procedure_in_loop().await;

View File

@@ -30,38 +30,87 @@ pub mod prelude;
mod signature;
use sqlparser_derive::{Visit, VisitMut};
// sql output
pub enum Output {
/// The new `Output` struct, wrapping the output data (previously `Output`) and the output meta
#[derive(Debug)]
pub struct Output {
pub data: OutputData,
pub meta: OutputMeta,
}
/// The original `Output` variants (now `OutputData`),
/// carrying result data to the response/client/user interface
pub enum OutputData {
AffectedRows(usize),
RecordBatches(RecordBatches),
Stream(SendableRecordBatchStream, Option<Arc<dyn PhysicalPlan>>),
Stream(SendableRecordBatchStream),
}
/// OutputMeta stores meta information produced/generated during the execution
#[derive(Debug, Default)]
pub struct OutputMeta {
/// May exist for query output. One can retrieve execution metrics from this plan.
pub plan: Option<Arc<dyn PhysicalPlan>>,
pub cost: usize,
}
impl Output {
// helper function to build original `Output::Stream`
pub fn new_stream(stream: SendableRecordBatchStream) -> Self {
Output::Stream(stream, None)
pub fn new_with_affected_rows(affected_rows: usize) -> Self {
Self {
data: OutputData::AffectedRows(affected_rows),
meta: Default::default(),
}
}
pub fn new_with_record_batches(recordbatches: RecordBatches) -> Self {
Self {
data: OutputData::RecordBatches(recordbatches),
meta: Default::default(),
}
}
pub fn new_with_stream(stream: SendableRecordBatchStream) -> Self {
Self {
data: OutputData::Stream(stream),
meta: Default::default(),
}
}
pub fn new(data: OutputData, meta: OutputMeta) -> Self {
Self { data, meta }
}
}
impl Debug for Output {
impl Debug for OutputData {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
Output::AffectedRows(rows) => write!(f, "Output::AffectedRows({rows})"),
Output::RecordBatches(recordbatches) => {
write!(f, "Output::RecordBatches({recordbatches:?})")
OutputData::AffectedRows(rows) => write!(f, "OutputData::AffectedRows({rows})"),
OutputData::RecordBatches(recordbatches) => {
write!(f, "OutputData::RecordBatches({recordbatches:?})")
}
Output::Stream(_, df) => {
if df.is_some() {
write!(f, "Output::Stream(<stream>, Some<physical_plan>)")
} else {
write!(f, "Output::Stream(<stream>)")
}
OutputData::Stream(_) => {
write!(f, "OutputData::Stream(<stream>)")
}
}
}
}
impl OutputMeta {
pub fn new(plan: Option<Arc<dyn PhysicalPlan>>, cost: usize) -> Self {
Self { plan, cost }
}
pub fn new_with_plan(plan: Arc<dyn PhysicalPlan>) -> Self {
Self {
plan: Some(plan),
cost: 0,
}
}
pub fn new_with_cost(cost: usize) -> Self {
Self { plan: None, cost }
}
}
pub use datafusion::physical_plan::ExecutionPlan as DfPhysicalPlan;
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
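A minimal sketch of how call sites migrate to the split struct (only names defined above are used):
// Build an Output and read the result back through the new `data` field.
let output = Output::new_with_affected_rows(42);
match output.data {
    OutputData::AffectedRows(n) => assert_eq!(n, 42),
    OutputData::RecordBatches(_) | OutputData::Stream(_) => unreachable!(),
}
// Execution metrics now travel separately in `output.meta` (`plan` and `cost`).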

View File

@@ -32,7 +32,7 @@ use snafu::ResultExt;
use crate::error::{self, Result};
use crate::{
DfRecordBatch, DfSendableRecordBatchStream, RecordBatch, RecordBatchStream,
DfRecordBatch, DfSendableRecordBatchStream, OrderOption, RecordBatch, RecordBatchStream,
SendableRecordBatchStream, Stream,
};
@@ -228,6 +228,10 @@ impl RecordBatchStream for RecordBatchStreamAdapter {
Metrics::Unavailable | Metrics::Unresolved(_) => None,
}
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
}
impl Stream for RecordBatchStreamAdapter {
@@ -316,6 +320,14 @@ impl RecordBatchStream for AsyncRecordBatchStreamAdapter {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for AsyncRecordBatchStreamAdapter {
@@ -375,6 +387,14 @@ mod test {
fn schema(&self) -> SchemaRef {
unimplemented!()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for MaybeErrorRecordBatchStream {

View File

@@ -39,13 +39,9 @@ use snafu::{ensure, ResultExt};
pub trait RecordBatchStream: Stream<Item = Result<RecordBatch>> {
fn schema(&self) -> SchemaRef;
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn output_ordering(&self) -> Option<&[OrderOption]>;
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics>;
}
pub type SendableRecordBatchStream = Pin<Box<dyn RecordBatchStream + Send>>;
@@ -74,6 +70,14 @@ impl RecordBatchStream for EmptyRecordBatchStream {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for EmptyRecordBatchStream {
@@ -192,6 +196,14 @@ impl RecordBatchStream for SimpleRecordBatchStream {
fn schema(&self) -> SchemaRef {
self.inner.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for SimpleRecordBatchStream {

View File

@@ -41,7 +41,8 @@ mod tests {
use futures::Stream;
use super::*;
use crate::RecordBatchStream;
use crate::adapter::RecordBatchMetrics;
use crate::{OrderOption, RecordBatchStream};
struct MockRecordBatchStream {
batch: Option<RecordBatch>,
@@ -52,6 +53,14 @@ mod tests {
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for MockRecordBatchStream {

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use client::Database;
use common_query::Output;
use common_query::OutputData;
use common_recordbatch::util;
pub enum ExpectedOutput<'a> {
@@ -23,22 +23,24 @@ pub enum ExpectedOutput<'a> {
pub async fn execute_and_check_output(db: &Database, sql: &str, expected: ExpectedOutput<'_>) {
let output = db.sql(sql).await.unwrap();
let output = output.data;
match (&output, expected) {
(Output::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
(OutputData::AffectedRows(x), ExpectedOutput::AffectedRows(y)) => {
assert_eq!(*x, y, "actual: \n{}", x)
}
(Output::RecordBatches(_), ExpectedOutput::QueryResult(x))
| (Output::Stream(_, _), ExpectedOutput::QueryResult(x)) => {
(OutputData::RecordBatches(_), ExpectedOutput::QueryResult(x))
| (OutputData::Stream(_), ExpectedOutput::QueryResult(x)) => {
check_output_stream(output, x).await
}
_ => panic!(),
}
}
pub async fn check_output_stream(output: Output, expected: &str) {
pub async fn check_output_stream(output: OutputData, expected: &str) {
let recordbatches = match output {
Output::Stream(stream, _) => util::collect_batches(stream).await.unwrap(),
Output::RecordBatches(recordbatches) => recordbatches,
OutputData::Stream(stream) => util::collect_batches(stream).await.unwrap(),
OutputData::RecordBatches(recordbatches) => recordbatches,
_ => unreachable!(),
};
let pretty_print = recordbatches.pretty_print().unwrap();

View File

@@ -36,7 +36,7 @@ use crate::{error, Interval};
/// - for [TimeUnit::Second]: [-262144-01-01 00:00:00, +262143-12-31 23:59:59]
/// - for [TimeUnit::Millisecond]: [-262144-01-01 00:00:00.000, +262143-12-31 23:59:59.999]
/// - for [TimeUnit::Microsecond]: [-262144-01-01 00:00:00.000000, +262143-12-31 23:59:59.999999]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145225, 2262-04-11 23:47:16.854775807]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145224192, 2262-04-11 23:47:16.854775807]
///
/// # Note:
/// For values out of range, you can still store these timestamps, but while performing arithmetic
@@ -187,28 +187,28 @@ impl Timestamp {
Self { unit, value }
}
pub fn new_second(value: i64) -> Self {
pub const fn new_second(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Second,
}
}
pub fn new_millisecond(value: i64) -> Self {
pub const fn new_millisecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Millisecond,
}
}
pub fn new_microsecond(value: i64) -> Self {
pub const fn new_microsecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Microsecond,
}
}
pub fn new_nanosecond(value: i64) -> Self {
pub const fn new_nanosecond(value: i64) -> Self {
Self {
value,
unit: TimeUnit::Nanosecond,
@@ -281,8 +281,26 @@ impl Timestamp {
.and_then(|v| v.checked_add(micros as i64))
.map(Timestamp::new_microsecond)
} else {
// Refer to <https://github.com/chronotope/chrono/issues/1289>
//
// subsec nanos are always non-negative, however the timestamp itself (both in seconds and in nanos) can be
// negative. Now i64::MIN is NOT dividable by 1_000_000_000, so
//
// (sec * 1_000_000_000) + nsec
//
// may underflow (even when in theory we COULD represent the datetime as i64) because we add the non-negative
// nanos AFTER the multiplication. This is fixed by converting the negative case to
//
// ((sec + 1) * 1_000_000_000) + (nsec - 1_000_000_000)
let mut sec = sec;
let mut nsec = nsec as i64;
if sec < 0 && nsec > 0 {
nsec -= 1_000_000_000;
sec += 1;
}
sec.checked_mul(1_000_000_000)
.and_then(|v| v.checked_add(nsec as i64))
.and_then(|v| v.checked_add(nsec))
.map(Timestamp::new_nanosecond)
}
}
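A worked example of the adjustment above, assuming the split of i64::MIN nanoseconds produces sec = -9_223_372_037 and nsec = 145_224_192:
let (sec, nsec): (i64, i64) = (-9_223_372_037, 145_224_192);
// Naive recombination overflows: sec * 1_000_000_000 already falls below i64::MIN.
assert!(sec.checked_mul(1_000_000_000).is_none());
// Shifting one second into the (now negative) nanos makes the same instant representable.
let (sec, nsec) = (sec + 1, nsec - 1_000_000_000);
assert_eq!(
    sec.checked_mul(1_000_000_000).and_then(|v| v.checked_add(nsec)),
    Some(i64::MIN)
);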
@@ -425,6 +443,20 @@ impl Timestamp {
}
}
impl Timestamp {
pub const MIN_SECOND: Self = Self::new_second(-8_334_601_228_800);
pub const MAX_SECOND: Self = Self::new_second(8_210_266_876_799);
pub const MIN_MILLISECOND: Self = Self::new_millisecond(-8_334_601_228_800_000);
pub const MAX_MILLISECOND: Self = Self::new_millisecond(8_210_266_876_799_999);
pub const MIN_MICROSECOND: Self = Self::new_microsecond(-8_334_601_228_800_000_000);
pub const MAX_MICROSECOND: Self = Self::new_microsecond(8_210_266_876_799_999_999);
pub const MIN_NANOSECOND: Self = Self::new_nanosecond(i64::MIN);
pub const MAX_NANOSECOND: Self = Self::new_nanosecond(i64::MAX);
}
/// Converts the naive datetime (which has no specific timezone) to a
/// nanosecond epoch timestamp in UTC.
fn naive_datetime_to_timestamp(
@@ -586,6 +618,7 @@ impl Hash for Timestamp {
mod tests {
use std::collections::hash_map::DefaultHasher;
use chrono_tz::Tz;
use rand::Rng;
use serde_json::Value;
@@ -1297,7 +1330,7 @@ mod tests {
"+262142-12-31 23:59:59Z",
"+262142-12-31 23:59:59.999Z",
"+262142-12-31 23:59:59.999999Z",
"1677-09-21 00:12:43.145225Z",
"1677-09-21 00:12:43.145224192Z",
"2262-04-11 23:47:16.854775807Z",
"+100000-01-01 00:00:01.5Z",
];
@@ -1306,4 +1339,47 @@ mod tests {
Timestamp::from_str_utc(s).unwrap();
}
}
#[test]
fn test_min_nanos_roundtrip() {
let (sec, nsec) = Timestamp::MIN_NANOSECOND.split();
let ts = Timestamp::from_splits(sec, nsec).unwrap();
assert_eq!(Timestamp::MIN_NANOSECOND, ts);
}
#[test]
fn test_timestamp_bound_format() {
assert_eq!(
"1677-09-21 00:12:43.145224192",
Timestamp::MIN_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"2262-04-11 23:47:16.854775807",
Timestamp::MAX_NANOSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59.999999",
Timestamp::MAX_MICROSECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59.999",
Timestamp::MAX_MILLISECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"-262143-01-01 00:00:00",
Timestamp::MIN_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
assert_eq!(
"+262142-12-31 23:59:59",
Timestamp::MAX_SECOND.to_timezone_aware_string(Some(&Timezone::Named(Tz::UTC)))
);
}
}

View File

@@ -73,7 +73,7 @@ tokio-stream = { workspace = true, features = ["net"] }
toml.workspace = true
tonic.workspace = true
tower = { version = "0.4", features = ["full"] }
tower-http = { version = "0.3", features = ["full"] }
tower-http = { version = "0.4", features = ["full"] }
url = "2.3.1"
uuid.workspace = true

View File

@@ -27,7 +27,7 @@ use common_error::ext::BoxedError;
use common_error::status_code::StatusCode;
use common_query::logical_plan::Expr;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::{DfPhysicalPlan, Output};
use common_query::{DfPhysicalPlan, OutputData};
use common_recordbatch::SendableRecordBatchStream;
use common_runtime::Runtime;
use common_telemetry::tracing::{self, info_span};
@@ -651,11 +651,11 @@ impl RegionServerInner {
.await
.context(ExecuteLogicalPlanSnafu)?;
match result {
Output::AffectedRows(_) | Output::RecordBatches(_) => {
match result.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => {
UnsupportedOutputSnafu { expected: "stream" }.fail()
}
Output::Stream(stream, _) => Ok(stream),
OutputData::Stream(stream) => Ok(stream),
}
}

View File

@@ -370,6 +370,36 @@ impl Value {
}
}
pub trait TryAsPrimitive<T: LogicalPrimitiveType> {
fn try_as_primitive(&self) -> Option<T::Native>;
}
macro_rules! impl_try_as_primitive {
($Type: ident, $Variant: ident) => {
impl TryAsPrimitive<crate::types::$Type> for Value {
fn try_as_primitive(
&self,
) -> Option<<crate::types::$Type as crate::types::LogicalPrimitiveType>::Native> {
match self {
Value::$Variant(v) => Some((*v).into()),
_ => None,
}
}
}
};
}
impl_try_as_primitive!(Int8Type, Int8);
impl_try_as_primitive!(Int16Type, Int16);
impl_try_as_primitive!(Int32Type, Int32);
impl_try_as_primitive!(Int64Type, Int64);
impl_try_as_primitive!(UInt8Type, UInt8);
impl_try_as_primitive!(UInt16Type, UInt16);
impl_try_as_primitive!(UInt32Type, UInt32);
impl_try_as_primitive!(UInt64Type, UInt64);
impl_try_as_primitive!(Float32Type, Float32);
impl_try_as_primitive!(Float64Type, Float64);
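A usage sketch of the new trait (the `datatypes::types` / `datatypes::value` paths are assumed re-exports; inside this crate they are `crate::types::*`):
use datatypes::types::Int64Type;
use datatypes::value::{TryAsPrimitive, Value};

// The matching variant yields the native value; any other variant yields None.
let v = Value::Int64(7);
assert_eq!(<Value as TryAsPrimitive<Int64Type>>::try_as_primitive(&v), Some(7i64));
assert_eq!(
    <Value as TryAsPrimitive<Int64Type>>::try_as_primitive(&Value::Boolean(true)),
    None
);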
pub fn to_null_scalar_value(output_type: &ConcreteDataType) -> Result<ScalarValue> {
Ok(match output_type {
ConcreteDataType::Null(_) => ScalarValue::Null,
@@ -2387,4 +2417,12 @@ mod tests {
);
check_value_ref_size_eq(&ValueRef::Decimal128(Decimal128::new(1234, 3, 1)), 32)
}
#[test]
fn test_incorrect_default_value_issue_3479() {
let value = OrderedF64::from(0.047318541668048164);
let serialized = serde_json::to_string(&value).unwrap();
let deserialized: OrderedF64 = serde_json::from_str(&serialized).unwrap();
assert_eq!(value, deserialized);
}
}

View File

@@ -22,8 +22,9 @@ use std::task::{Context, Poll};
use common_datasource::object_store::build_backend;
use common_error::ext::BoxedError;
use common_query::prelude::Expr;
use common_recordbatch::adapter::RecordBatchMetrics;
use common_recordbatch::error::{CastVectorSnafu, ExternalSnafu, Result as RecordBatchResult};
use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use datafusion::logical_expr::utils as df_logical_expr_utils;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
@@ -151,6 +152,14 @@ impl RecordBatchStream for FileToScanRegionStream {
fn schema(&self) -> SchemaRef {
self.scan_schema.clone()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
None
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
None
}
}
impl Stream for FileToScanRegionStream {

View File

@@ -18,6 +18,7 @@ common-query.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
datatypes.workspace = true
enum_dispatch = "0.3"
hydroflow = "0.5.0"
itertools.workspace = true
num-traits = "0.2"
@@ -27,3 +28,6 @@ session.workspace = true
snafu.workspace = true
tokio.workspace = true
tonic.workspace = true
[dev-dependencies]
serde_json = "1.0"

View File

@@ -24,5 +24,6 @@ mod scalar;
pub(crate) use error::{EvalError, InvalidArgumentSnafu, OptimizeSnafu};
pub(crate) use func::{BinaryFunc, UnaryFunc, UnmaterializableFunc, VariadicFunc};
pub(crate) use id::{GlobalId, Id, LocalId};
pub(crate) use linear::{MapFilterProject, MfpPlan, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc};
pub(crate) use scalar::ScalarExpr;

View File

@@ -61,4 +61,7 @@ pub enum EvalError {
#[snafu(display("Unsupported temporal filter: {reason}"))]
UnsupportedTemporalFilter { reason: String, location: Location },
#[snafu(display("Overflowed during evaluation"))]
Overflow { location: Location },
}

View File

@@ -45,7 +45,7 @@ use crate::repr::{self, value_to_internal_ts, Diff, Row};
/// expressions in `self.expressions`, even though this is not something
/// we can directly evaluate. The plan creation methods will defensively
/// ensure that the right thing happens.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct MapFilterProject {
/// A sequence of expressions that should be appended to the row.
///
@@ -415,7 +415,7 @@ impl MapFilterProject {
}
/// A wrapper type which indicates it is safe to simply evaluate all expressions.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct SafeMfpPlan {
pub(crate) mfp: MapFilterProject,
}
@@ -800,7 +800,7 @@ mod test {
.unwrap();
// only retain sum result
let mfp = mfp.project(vec![4]).unwrap();
// accept only if if the sum is greater than 10
// accept only if the sum is greater than 10
let mfp = mfp
.filter(vec![ScalarExpr::Column(0).call_binary(
ScalarExpr::Literal(Value::from(10i32), ConcreteDataType::int32_datatype()),

View File

@@ -21,7 +21,7 @@ mod accum;
mod func;
/// Describes an aggregation expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)]
pub struct AggregateExpr {
/// Names the aggregation function.
pub func: AggregateFunc,

View File

@@ -14,7 +14,10 @@
//! Accumulators for aggregate functions that are accumulable, i.e. sum/count
//!
//! Currently support sum, count, any, all
//! An accumulator is only restored from a row and updated whenever the dataflow needs to process a new batch of rows,
//! so the overhead is acceptable.
//!
//! Currently supports sum, count, any, all and min/max (with the caveat that min/max can't support delete with aggregation).
use std::fmt::Display;
@@ -22,13 +25,506 @@ use common_decimal::Decimal128;
use common_time::{Date, DateTime};
use datatypes::data_type::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, OrderedFloat, Value};
use enum_dispatch::enum_dispatch;
use hydroflow::futures::stream::Concat;
use serde::{Deserialize, Serialize};
use snafu::ensure;
use crate::expr::error::{InternalSnafu, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::error::{InternalSnafu, OverflowSnafu, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::relation::func::GenericFn;
use crate::expr::{AggregateFunc, EvalError};
use crate::repr::Diff;
/// Accumulates values for the various types of accumulable aggregations.
#[enum_dispatch]
pub trait Accumulator: Sized {
fn into_state(self) -> Vec<Value>;
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError>;
fn update_batch<I>(&mut self, aggr_fn: &AggregateFunc, value_diffs: I) -> Result<(), EvalError>
where
I: IntoIterator<Item = (Value, Diff)>,
{
for (v, d) in value_diffs {
self.update(aggr_fn, v, d)?;
}
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError>;
}
/// Bool accumulator, used for `Any` `All` `Max/MinBool`
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct Bool {
/// The number of `true` values observed.
trues: Diff,
/// The number of `false` values observed.
falses: Diff,
}
impl TryFrom<Vec<Value>> for Bool {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "Bool Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
trues: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
falses: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for Bool {
fn into_state(self) -> Vec<Value> {
vec![self.trues.into(), self.falses.into()]
}
/// Null values are ignored
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool
),
InternalSnafu {
reason: format!(
"Bool Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
match value {
Value::Boolean(true) => self.trues += diff,
Value::Boolean(false) => self.falses += diff,
Value::Null => (), // ignore nulls
x => {
return Err(TypeMismatchSnafu {
expected: ConcreteDataType::boolean_datatype(),
actual: x.data_type(),
}
.build());
}
};
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::Any => Ok(Value::from(self.trues > 0)),
AggregateFunc::All => Ok(Value::from(self.falses == 0)),
AggregateFunc::MaxBool => Ok(Value::from(self.trues > 0)),
AggregateFunc::MinBool => Ok(Value::from(self.falses == 0)),
_ => Err(InternalSnafu {
reason: format!(
"Bool Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates simple numeric values for sum over integer.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct SimpleNumber {
/// The accumulation of all non-NULL values observed.
accum: i128,
/// The number of non-NULL values observed.
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for SimpleNumber {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "Number Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
accum: Decimal128::try_from(iter.next().unwrap())
.map_err(err_try_from_val)?
.val(),
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for SimpleNumber {
fn into_state(self) -> Vec<Value> {
vec![
Value::Decimal128(Decimal128::new(self.accum, 38, 0)),
self.non_nulls.into(),
]
}
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64
),
InternalSnafu {
reason: format!(
"SimpleNumber Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
let v = match (aggr_fn, value) {
(AggregateFunc::SumInt16, Value::Int16(x)) => i128::from(x),
(AggregateFunc::SumInt32, Value::Int32(x)) => i128::from(x),
(AggregateFunc::SumInt64, Value::Int64(x)) => i128::from(x),
(AggregateFunc::SumUInt16, Value::UInt16(x)) => i128::from(x),
(AggregateFunc::SumUInt32, Value::UInt32(x)) => i128::from(x),
(AggregateFunc::SumUInt64, Value::UInt64(x)) => i128::from(x),
(_f, Value::Null) => return Ok(()), // ignore null
(f, v) => {
let expected_datatype = f.signature().input;
return Err(TypeMismatchSnafu {
expected: expected_datatype,
actual: v.data_type(),
}
.build())?;
}
};
self.accum += v * i128::from(diff);
self.non_nulls += diff;
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::SumInt16 | AggregateFunc::SumInt32 | AggregateFunc::SumInt64 => {
i64::try_from(self.accum)
.map_err(|_e| OverflowSnafu {}.build())
.map(Value::from)
}
AggregateFunc::SumUInt16 | AggregateFunc::SumUInt32 | AggregateFunc::SumUInt64 => {
u64::try_from(self.accum)
.map_err(|_e| OverflowSnafu {}.build())
.map(Value::from)
}
_ => Err(InternalSnafu {
reason: format!(
"SimpleNumber Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates float values for sum over floating numbers.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct Float {
/// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
/// accum will be set to zero if `non_nulls` is zero.
accum: OrderedF64,
/// Counts +inf
pos_infs: Diff,
/// Counts -inf
neg_infs: Diff,
/// Counts NaNs
nans: Diff,
/// Counts non-NULL values
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for Float {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 5,
InternalSnafu {
reason: "Float Accumulator state should have 5 values",
}
);
let mut iter = state.into_iter();
let mut ret = Self {
accum: OrderedF64::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
pos_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
neg_infs: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
nans: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
};
// This prevents the counter-intuitive behavior of summing over no values
if ret.non_nulls == 0 {
ret.accum = OrderedFloat::from(0.0);
}
Ok(ret)
}
}
impl Accumulator for Float {
fn into_state(self) -> Vec<Value> {
vec![
self.accum.into(),
self.pos_infs.into(),
self.neg_infs.into(),
self.nans.into(),
self.non_nulls.into(),
]
}
/// Sum ignores nulls
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
matches!(
aggr_fn,
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64
),
InternalSnafu {
reason: format!(
"Float Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
let x = match (aggr_fn, value) {
(AggregateFunc::SumFloat32, Value::Float32(x)) => OrderedF64::from(*x as f64),
(AggregateFunc::SumFloat64, Value::Float64(x)) => OrderedF64::from(x),
(_f, Value::Null) => return Ok(()), // ignore null
(f, v) => {
let expected_datatype = f.signature().input;
return Err(TypeMismatchSnafu {
expected: expected_datatype,
actual: v.data_type(),
}
.build())?;
}
};
if x.is_nan() {
self.nans += diff;
} else if x.is_infinite() {
if x.is_sign_positive() {
self.pos_infs += diff;
} else {
self.neg_infs += diff;
}
} else {
self.accum += *(x * OrderedF64::from(diff as f64));
}
self.non_nulls += diff;
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
match aggr_fn {
AggregateFunc::SumFloat32 => Ok(Value::Float32(OrderedF32::from(self.accum.0 as f32))),
AggregateFunc::SumFloat64 => Ok(Value::Float64(self.accum)),
_ => Err(InternalSnafu {
reason: format!(
"Float Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build()),
}
}
}
/// Accumulates a single `Ord`ed `Value`, useful for min/max aggregations.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub struct OrdValue {
val: Option<Value>,
non_nulls: Diff,
}
impl TryFrom<Vec<Value>> for OrdValue {
type Error = EvalError;
fn try_from(state: Vec<Value>) -> Result<Self, Self::Error> {
ensure!(
state.len() == 2,
InternalSnafu {
reason: "OrdValue Accumulator state should have 2 values",
}
);
let mut iter = state.into_iter();
Ok(Self {
val: {
let v = iter.next().unwrap();
if v == Value::Null {
None
} else {
Some(v)
}
},
non_nulls: Diff::try_from(iter.next().unwrap()).map_err(err_try_from_val)?,
})
}
}
impl Accumulator for OrdValue {
fn into_state(self) -> Vec<Value> {
vec![self.val.unwrap_or(Value::Null), self.non_nulls.into()]
}
/// min/max search all non-null values; if all values are null, the result is null.
/// count(col_name) gives the number of non-null values, while count(*) gives the number of rows including nulls.
/// TODO(discord9): add count(*) as an aggr function
fn update(
&mut self,
aggr_fn: &AggregateFunc,
value: Value,
diff: Diff,
) -> Result<(), EvalError> {
ensure!(
aggr_fn.is_max() || aggr_fn.is_min() || matches!(aggr_fn, AggregateFunc::Count),
InternalSnafu {
reason: format!(
"OrdValue Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
);
if diff <= 0 && (aggr_fn.is_max() || aggr_fn.is_min()) {
return Err(InternalSnafu {
reason: "OrdValue Accumulator does not support non-monotonic input for min/max aggregation".to_string(),
}.build());
}
// If aggr_fn is count, the incoming value type doesn't matter for type checking;
// otherwise, the type needs to match the signature, or the value can be null
let check_type_aggr_fn_and_arg_value =
ty_eq_without_precision(value.data_type(), aggr_fn.signature().input)
|| matches!(aggr_fn, AggregateFunc::Count)
|| value.is_null();
let check_type_aggr_fn_and_self_val = self
.val
.as_ref()
.map(|zelf| ty_eq_without_precision(zelf.data_type(), aggr_fn.signature().input))
.unwrap_or(true)
|| matches!(aggr_fn, AggregateFunc::Count);
if !check_type_aggr_fn_and_arg_value {
return Err(TypeMismatchSnafu {
expected: aggr_fn.signature().input,
actual: value.data_type(),
}
.build());
} else if !check_type_aggr_fn_and_self_val {
return Err(TypeMismatchSnafu {
expected: aggr_fn.signature().input,
actual: self
.val
.as_ref()
.map(|v| v.data_type())
.unwrap_or(ConcreteDataType::null_datatype()),
}
.build());
}
let is_null = value.is_null();
if is_null {
return Ok(());
}
if !is_null {
// compile count(*) to count(true) to include null/non-nulls
// And the counts of non-null values are updated here
self.non_nulls += diff;
match aggr_fn.signature().generic_fn {
GenericFn::Max => {
self.val = self
.val
.clone()
.map(|v| v.max(value.clone()))
.or_else(|| Some(value))
}
GenericFn::Min => {
self.val = self
.val
.clone()
.map(|v| v.min(value.clone()))
.or_else(|| Some(value))
}
GenericFn::Count => (),
_ => unreachable!("already checked by ensure!"),
}
};
// min/max ignore nulls
Ok(())
}
fn eval(&self, aggr_fn: &AggregateFunc) -> Result<Value, EvalError> {
if aggr_fn.is_max() || aggr_fn.is_min() {
Ok(self.val.clone().unwrap_or(Value::Null))
} else if matches!(aggr_fn, AggregateFunc::Count) {
Ok(self.non_nulls.into())
} else {
Err(InternalSnafu {
reason: format!(
"OrdValue Accumulator does not support this aggregation function: {:?}",
aggr_fn
),
}
.build())
}
}
}
/// Accumulates values for the various types of accumulable aggregations.
///
/// We assume that there are not more than 2^32 elements for the aggregation.
@@ -38,34 +534,407 @@ use crate::repr::Diff;
/// The float accumulator performs accumulation with tolerance for floating point error.
///
/// TODO(discord9): check for overflowing
#[enum_dispatch(Accumulator)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Accum {
/// Accumulates boolean values.
Bool {
/// The number of `true` values observed.
trues: Diff,
/// The number of `false` values observed.
falses: Diff,
},
Bool(Bool),
/// Accumulates simple numeric values.
SimpleNumber {
/// The accumulation of all non-NULL values observed.
accum: i128,
/// The number of non-NULL values observed.
non_nulls: Diff,
},
SimpleNumber(SimpleNumber),
/// Accumulates float values.
Float {
/// Accumulates non-special float values, i.e. not NaN, +inf, -inf.
/// accum will be set to zero if `non_nulls` is zero.
accum: OrderedF64,
/// Counts +inf
pos_infs: Diff,
/// Counts -inf
neg_infs: Diff,
/// Counts NaNs
nans: Diff,
/// Counts non-NULL values
non_nulls: Diff,
},
Float(Float),
/// Accumulate Values that impl `Ord`
OrdValue(OrdValue),
}
impl Accum {
pub fn new_accum(aggr_fn: &AggregateFunc) -> Result<Self, EvalError> {
Ok(match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Self::from(Bool {
trues: 0,
falses: 0,
}),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Self::from(SimpleNumber {
accum: 0,
non_nulls: 0,
}),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => Self::from(Float {
accum: OrderedF64::from(0.0),
pos_infs: 0,
neg_infs: 0,
nans: 0,
non_nulls: 0,
}),
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Self::from(OrdValue {
val: None,
non_nulls: 0,
})
}
f => {
return Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build());
}
})
}
pub fn try_into_accum(aggr_fn: &AggregateFunc, state: Vec<Value>) -> Result<Self, EvalError> {
match aggr_fn {
AggregateFunc::Any
| AggregateFunc::All
| AggregateFunc::MaxBool
| AggregateFunc::MinBool => Ok(Self::from(Bool::try_from(state)?)),
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64 => Ok(Self::from(SimpleNumber::try_from(state)?)),
AggregateFunc::SumFloat32 | AggregateFunc::SumFloat64 => {
Ok(Self::from(Float::try_from(state)?))
}
f if f.is_max() || f.is_min() || matches!(f, AggregateFunc::Count) => {
Ok(Self::from(OrdValue::try_from(state)?))
}
f => Err(InternalSnafu {
reason: format!(
"Accumulator does not support this aggregation function: {:?}",
f
),
}
.build()),
}
}
}
fn err_try_from_val<T: Display>(reason: T) -> EvalError {
TryFromValueSnafu {
msg: reason.to_string(),
}
.build()
}
/// Compares types while ignoring their precision, including `Timestamp`, `Time`,
/// `Duration` and `Interval`
fn ty_eq_without_precision(left: ConcreteDataType, right: ConcreteDataType) -> bool {
left == right
|| matches!(left, ConcreteDataType::Timestamp(..))
&& matches!(right, ConcreteDataType::Timestamp(..))
|| matches!(left, ConcreteDataType::Time(..)) && matches!(right, ConcreteDataType::Time(..))
|| matches!(left, ConcreteDataType::Duration(..))
&& matches!(right, ConcreteDataType::Duration(..))
|| matches!(left, ConcreteDataType::Interval(..))
&& matches!(right, ConcreteDataType::Interval(..))
}
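For example (a sketch of the intended semantics), two timestamp types that differ only in precision compare equal here, while unrelated types do not:
assert!(ty_eq_without_precision(
    ConcreteDataType::timestamp_second_datatype(),
    ConcreteDataType::timestamp_millisecond_datatype(),
));
assert!(!ty_eq_without_precision(
    ConcreteDataType::int64_datatype(),
    ConcreteDataType::float64_datatype(),
));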
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_accum() {
let testcases = vec![
(
AggregateFunc::SumInt32,
vec![(Value::Int32(1), 1), (Value::Null, 1)],
(
Value::Int64(1),
vec![Value::Decimal128(Decimal128::new(1, 38, 0)), 1i64.into()],
),
),
(
AggregateFunc::SumFloat32,
vec![(Value::Float32(OrderedF32::from(1.0)), 1), (Value::Null, 1)],
(
Value::Float32(OrderedF32::from(1.0)),
vec![
Value::Float64(OrderedF64::from(1.0)),
0i64.into(),
0i64.into(),
0i64.into(),
1i64.into(),
],
),
),
(
AggregateFunc::MaxInt32,
vec![(Value::Int32(1), 1), (Value::Int32(2), 1), (Value::Null, 1)],
(Value::Int32(2), vec![Value::Int32(2), 2i64.into()]),
),
(
AggregateFunc::MinInt32,
vec![(Value::Int32(2), 1), (Value::Int32(1), 1), (Value::Null, 1)],
(Value::Int32(1), vec![Value::Int32(1), 2i64.into()]),
),
(
AggregateFunc::MaxFloat32,
vec![
(Value::Float32(OrderedF32::from(1.0)), 1),
(Value::Float32(OrderedF32::from(2.0)), 1),
(Value::Null, 1),
],
(
Value::Float32(OrderedF32::from(2.0)),
vec![Value::Float32(OrderedF32::from(2.0)), 2i64.into()],
),
),
(
AggregateFunc::MaxDateTime,
vec![
(Value::DateTime(DateTime::from(0)), 1),
(Value::DateTime(DateTime::from(1)), 1),
(Value::Null, 1),
],
(
Value::DateTime(DateTime::from(1)),
vec![Value::DateTime(DateTime::from(1)), 2i64.into()],
),
),
(
AggregateFunc::Count,
vec![
(Value::Int32(1), 1),
(Value::Int32(2), 1),
(Value::Null, 1),
(Value::Null, 1),
],
(2i64.into(), vec![Value::Null, 2i64.into()]),
),
(
AggregateFunc::Any,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(true),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::All,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(false),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::MaxBool,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(true),
vec![Value::from(1i64), Value::from(2i64)],
),
),
(
AggregateFunc::MinBool,
vec![
(Value::Boolean(false), 1),
(Value::Boolean(false), 1),
(Value::Boolean(true), 1),
(Value::Null, 1),
],
(
Value::Boolean(false),
vec![Value::from(1i64), Value::from(2i64)],
),
),
];
for (aggr_fn, input, (eval_res, state)) in testcases {
let create_and_insert = || -> Result<Accum, EvalError> {
let mut acc = Accum::new_accum(&aggr_fn)?;
acc.update_batch(&aggr_fn, input.clone())?;
let row = acc.into_state();
let acc = Accum::try_into_accum(&aggr_fn, row)?;
Ok(acc)
};
let acc = match create_and_insert() {
Ok(acc) => acc,
Err(err) => panic!(
"Failed to create accum for {:?} with input {:?} with error: {:?}",
aggr_fn, input, err
),
};
if acc.eval(&aggr_fn).unwrap() != eval_res {
panic!(
"Failed to eval accum for {:?} with input {:?}, expect {:?}, got {:?}",
aggr_fn,
input,
eval_res,
acc.eval(&aggr_fn).unwrap()
);
}
let actual_state = acc.into_state();
if actual_state != state {
panic!(
"Failed to cast into state from accum for {:?} with input {:?}, expect state {:?}, got state {:?}",
aggr_fn,
input,
state,
actual_state
);
}
}
}
#[test]
fn test_fail_path_accum() {
{
let bool_accum = Bool::try_from(vec![Value::Null]);
assert!(matches!(bool_accum, Err(EvalError::Internal { .. })));
}
{
let mut bool_accum = Bool::try_from(vec![1i64.into(), 1i64.into()]).unwrap();
// serde
let bool_accum_serde = serde_json::to_string(&bool_accum).unwrap();
let bool_accum_de = serde_json::from_str::<Bool>(&bool_accum_serde).unwrap();
assert_eq!(bool_accum, bool_accum_de);
assert!(matches!(
bool_accum.update(&AggregateFunc::MaxDate, 1.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
bool_accum.update(&AggregateFunc::Any, 1.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
bool_accum.eval(&AggregateFunc::MaxDate),
Err(EvalError::Internal { .. })
));
}
{
let ret = SimpleNumber::try_from(vec![Value::Null]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum =
SimpleNumber::try_from(vec![Decimal128::new(0, 38, 0).into(), 0i64.into()])
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::SumInt64, 0i32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
accum.eval(&AggregateFunc::All),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::SumInt64, 1i64.into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumInt64, i64::MAX.into(), 1)
.unwrap();
assert!(matches!(
accum.eval(&AggregateFunc::SumInt64),
Err(EvalError::Overflow { .. })
));
}
{
let ret = Float::try_from(vec![2f64.into(), 0i64.into(), 0i64.into(), 0i64.into()]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum = Float::try_from(vec![
2f64.into(),
0i64.into(),
0i64.into(),
0i64.into(),
1i64.into(),
])
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, 2f64.into(), -1)
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::SumFloat64, 0.0f32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
// no record, no accum
assert_eq!(
accum.eval(&AggregateFunc::SumFloat64).unwrap(),
0.0f64.into()
);
assert!(matches!(
accum.eval(&AggregateFunc::All),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::SumFloat64, f64::INFINITY.into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, (-f64::INFINITY).into(), 1)
.unwrap();
accum
.update(&AggregateFunc::SumFloat64, f64::NAN.into(), 1)
.unwrap();
}
{
let ret = OrdValue::try_from(vec![Value::Null]);
assert!(matches!(ret, Err(EvalError::Internal { .. })));
let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
assert!(matches!(
accum.update(&AggregateFunc::All, 0.into(), 1),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::MaxInt16, 1i16.into(), 1)
.unwrap();
assert!(matches!(
accum.update(&AggregateFunc::MaxInt16, 0i32.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
assert!(matches!(
accum.update(&AggregateFunc::MaxInt16, 0i16.into(), -1),
Err(EvalError::Internal { .. })
));
accum
.update(&AggregateFunc::MaxInt16, Value::Null, 1)
.unwrap();
}
// insert uint64 into max_int64 should fail
{
let mut accum = OrdValue::try_from(vec![Value::Null, 0i64.into()]).unwrap();
assert!(matches!(
accum.update(&AggregateFunc::MaxInt64, 0u64.into(), 1),
Err(EvalError::TypeMismatch { .. })
));
}
}
}

View File

@@ -12,15 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::type_name;
use common_time::{Date, DateTime};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use serde::{Deserialize, Serialize};
use crate::expr::error::{EvalError, TryFromValueSnafu, TypeMismatchSnafu};
use crate::expr::relation::accum::Accum;
use crate::expr::relation::accum::{Accum, Accumulator};
use crate::repr::Diff;
/// Aggregate functions that can be applied to a group of rows.
@@ -83,3 +81,280 @@ pub enum AggregateFunc {
Any,
All,
}
impl AggregateFunc {
pub fn is_max(&self) -> bool {
self.signature().generic_fn == GenericFn::Max
}
pub fn is_min(&self) -> bool {
self.signature().generic_fn == GenericFn::Min
}
pub fn is_sum(&self) -> bool {
self.signature().generic_fn == GenericFn::Sum
}
/// Evaluates the (value, diff) pairs against an accumulator.
///
/// Expects `self` to be an accumulable aggregate function, i.e. sum/count
///
/// TODO(discord9): deal with overflow&better accumulator
pub fn eval_diff_accumulable<I>(
&self,
accum: Vec<Value>,
value_diffs: I,
) -> Result<(Value, Vec<Value>), EvalError>
where
I: IntoIterator<Item = (Value, Diff)>,
{
let mut accum = if accum.is_empty() {
Accum::new_accum(self)?
} else {
Accum::try_into_accum(self, accum)?
};
accum.update_batch(self, value_diffs)?;
let res = accum.eval(self)?;
Ok((res, accum.into_state()))
}
}
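A minimal sketch of driving the accumulable path above (types as defined in this module; the literal values are illustrative):
// The first batch starts from an empty state.
let (res, state) = AggregateFunc::SumInt64
    .eval_diff_accumulable(vec![], vec![(Value::Int64(1), 1), (Value::Int64(2), 1)])
    .unwrap();
assert_eq!(res, Value::Int64(3));
// The returned state can be persisted and fed back in for the next batch.
let (res, _state) = AggregateFunc::SumInt64
    .eval_diff_accumulable(state, vec![(Value::Int64(4), 1)])
    .unwrap();
assert_eq!(res, Value::Int64(7));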
pub struct Signature {
pub input: ConcreteDataType,
pub output: ConcreteDataType,
pub generic_fn: GenericFn,
}
#[derive(Debug, PartialEq, Eq)]
pub enum GenericFn {
Max,
Min,
Sum,
Count,
Any,
All,
}
impl AggregateFunc {
/// All concrete datatypes with precision are returned as the largest possible variant.
/// As an exception, count has a signature of `null -> i64`, but it's actually `anytype -> i64`
pub fn signature(&self) -> Signature {
match self {
AggregateFunc::MaxInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxBool => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxString => Signature {
input: ConcreteDataType::string_datatype(),
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDate => Signature {
input: ConcreteDataType::date_datatype(),
output: ConcreteDataType::date_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDateTime => Signature {
input: ConcreteDataType::datetime_datatype(),
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxTimestamp => Signature {
input: ConcreteDataType::timestamp_second_datatype(),
output: ConcreteDataType::timestamp_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxTime => Signature {
input: ConcreteDataType::time_second_datatype(),
output: ConcreteDataType::time_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxDuration => Signature {
input: ConcreteDataType::duration_second_datatype(),
output: ConcreteDataType::duration_second_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MaxInterval => Signature {
input: ConcreteDataType::interval_year_month_datatype(),
output: ConcreteDataType::interval_year_month_datatype(),
generic_fn: GenericFn::Max,
},
AggregateFunc::MinInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinBool => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinString => Signature {
input: ConcreteDataType::string_datatype(),
output: ConcreteDataType::string_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDate => Signature {
input: ConcreteDataType::date_datatype(),
output: ConcreteDataType::date_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDateTime => Signature {
input: ConcreteDataType::datetime_datatype(),
output: ConcreteDataType::datetime_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinTimestamp => Signature {
input: ConcreteDataType::timestamp_second_datatype(),
output: ConcreteDataType::timestamp_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinTime => Signature {
input: ConcreteDataType::time_second_datatype(),
output: ConcreteDataType::time_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinDuration => Signature {
input: ConcreteDataType::duration_second_datatype(),
output: ConcreteDataType::duration_second_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::MinInterval => Signature {
input: ConcreteDataType::interval_year_month_datatype(),
output: ConcreteDataType::interval_year_month_datatype(),
generic_fn: GenericFn::Min,
},
AggregateFunc::SumInt16 => Signature {
input: ConcreteDataType::int16_datatype(),
output: ConcreteDataType::int16_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumInt32 => Signature {
input: ConcreteDataType::int32_datatype(),
output: ConcreteDataType::int32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumInt64 => Signature {
input: ConcreteDataType::int64_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt16 => Signature {
input: ConcreteDataType::uint16_datatype(),
output: ConcreteDataType::uint16_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt32 => Signature {
input: ConcreteDataType::uint32_datatype(),
output: ConcreteDataType::uint32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumUInt64 => Signature {
input: ConcreteDataType::uint64_datatype(),
output: ConcreteDataType::uint64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumFloat32 => Signature {
input: ConcreteDataType::float32_datatype(),
output: ConcreteDataType::float32_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::SumFloat64 => Signature {
input: ConcreteDataType::float64_datatype(),
output: ConcreteDataType::float64_datatype(),
generic_fn: GenericFn::Sum,
},
AggregateFunc::Count => Signature {
input: ConcreteDataType::null_datatype(),
output: ConcreteDataType::int64_datatype(),
generic_fn: GenericFn::Count,
},
AggregateFunc::Any => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::Any,
},
AggregateFunc::All => Signature {
input: ConcreteDataType::boolean_datatype(),
output: ConcreteDataType::boolean_datatype(),
generic_fn: GenericFn::All,
},
}
}
}
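
A quick check of the `count` exception noted in the doc comment above (a sketch assuming the usual in-crate test context):

```rust
// `count` is declared as `null -> i64`, although it accepts any input type.
let sig = AggregateFunc::Count.signature();
assert_eq!(sig.generic_fn, GenericFn::Count);
assert_eq!(sig.input, ConcreteDataType::null_datatype());
assert_eq!(sig.output, ConcreteDataType::int64_datatype());
```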

View File

@@ -17,4 +17,5 @@
// allow unused for now because it should be used later
mod adapter;
mod expr;
mod plan;
mod repr;

src/flow/src/plan.rs (new file, 98 lines)
View File

@@ -0,0 +1,98 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! This module contains the basic definitions for a dataflow plan
//! that can be translated to a hydro dataflow.
mod join;
mod reduce;
use serde::{Deserialize, Serialize};
pub(crate) use self::reduce::{AccumulablePlan, KeyValPlan, ReducePlan};
use crate::expr::{
AggregateExpr, EvalError, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
};
use crate::plan::join::JoinPlan;
use crate::repr::{DiffRow, RelationType};
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct TypedPlan {
/// output type of the relation
pub typ: RelationType,
pub plan: Plan,
}
/// TODO(discord9): support `TableFunc` by defining a FlatMap that maps 1 row to n rows
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum Plan {
/// A constant collection of rows.
Constant { rows: Vec<DiffRow> },
/// Get CDC data from a source, be it an external reference to an existing source or an internal
/// reference to a `Let` identifier
Get { id: Id },
/// Create a temporary collection from the given `value`, and make this binding available only
/// within the scope of `body`
Let {
id: LocalId,
value: Box<Plan>,
body: Box<Plan>,
},
/// Map, Filter, and Project operators.
Mfp {
/// The input collection.
input: Box<Plan>,
/// Linear operator to apply to each record.
mfp: MapFilterProject,
},
/// Reduce operator, aggregation by key assembled from KeyValPlan
Reduce {
/// The input collection.
input: Box<Plan>,
/// A plan for changing input records into key, value pairs.
key_val_plan: KeyValPlan,
/// A plan for performing the reduce.
///
/// The implementation of reduction has several different strategies based
/// on the properties of the reduction, and the input itself.
reduce_plan: ReducePlan,
},
/// A multiway relational equijoin, with fused map, filter, and projection.
///
/// This stage performs a multiway join among `inputs`, using the equality
/// constraints expressed in `plan`. The plan also describes the implementation
/// strategy we will use, and any pushed down per-record work.
Join {
/// An ordered list of inputs that will be joined.
inputs: Vec<Plan>,
/// Detailed information about the implementation of the join.
///
/// This includes information about the implementation strategy, but also
/// any map, filter, project work that we might follow the join with, but
/// potentially pushed down into the implementation of the join.
plan: JoinPlan,
},
/// Adds the contents of the input collections.
///
/// Importantly, this is *multiset* union, so the multiplicities of records will
/// add. This is in contrast to *set* union, where the multiplicities would be
/// capped at one. A set union can be formed with `Union` followed by `Reduce`
/// implementing the "distinct" operator.
Union {
/// The input collections
inputs: Vec<Plan>,
/// Whether to consolidate the output, e.g., cancel negated records.
consolidate_output: bool,
},
}
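
A minimal sketch of how these variants nest (illustrative only; real plans are produced by the planner and `rows` would carry actual `DiffRow`s):

```rust
// Two empty constant collections unioned as a multiset; `consolidate_output`
// asks the operator to cancel negated records in the result.
let left = Plan::Constant { rows: Vec::new() };
let right = Plan::Constant { rows: Vec::new() };
let unioned = Plan::Union {
    inputs: vec![left, right],
    consolidate_output: true,
};
```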

src/flow/src/plan/join.rs (new file, 78 lines)
View File

@@ -0,0 +1,78 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;
use crate::plan::SafeMfpPlan;
/// TODO(discord9): consider impl more join strategies
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub enum JoinPlan {
Linear(LinearJoinPlan),
}
/// Determines whether a given row should stay in the output, and applies a map filter project before outputting the row
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct JoinFilter {
/// Each element in the outer vector checks whether every expr within it evaluates to the same value;
/// if not, the row is filtered out. Useful for equi-joins (joins based on equality of some columns)
pub ready_equivalences: Vec<Vec<ScalarExpr>>,
/// Apply a map filter project before outputting the row
pub before: SafeMfpPlan,
}
/// A plan for the execution of a linear join.
///
/// A linear join is a sequence of stages, each of which introduces
/// a new collection. Each stage is represented by a [LinearStagePlan].
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearJoinPlan {
/// The source relation from which we start the join.
pub source_relation: usize,
/// The arrangement to use for the source relation, if any
pub source_key: Option<Vec<ScalarExpr>>,
/// An initial closure to apply before any stages.
///
/// Values of `None` indicate the identity closure.
pub initial_closure: Option<JoinFilter>,
/// A *sequence* of stages to apply one after the other.
pub stage_plans: Vec<LinearStagePlan>,
/// A concluding filter to apply after the last stage.
///
/// Values of `None` indicate the identity closure.
pub final_closure: Option<JoinFilter>,
}
/// A plan for the execution of one stage of a linear join.
///
/// Each stage is a binary join between the current accumulated
/// join results, and a new collection. The former is referred to
/// as the "stream" and the latter the "lookup".
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd)]
pub struct LinearStagePlan {
/// The index of the relation into which we will look up.
pub lookup_relation: usize,
/// The key expressions to use for the stream relation.
pub stream_key: Vec<ScalarExpr>,
/// Columns to retain from the stream relation.
/// These columns are those that are not redundant with `stream_key`,
/// and cannot be read out of the key component of an arrangement.
pub stream_thinning: Vec<usize>,
/// The key expressions to use for the lookup relation.
pub lookup_key: Vec<ScalarExpr>,
/// The closure to apply to the concatenation of the key columns,
/// the stream value columns, and the lookup value columns.
pub closure: JoinFilter,
}
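
For illustration, the simplest possible linear join plan under the field conventions above: start from input 0, no stages, and identity closures (`None`). This is only a sketch; real plans populate `stage_plans` with one `LinearStagePlan` per lookup relation.

```rust
// Identity-shaped linear join over a single source relation.
let plan = JoinPlan::Linear(LinearJoinPlan {
    source_relation: 0,    // start the join from the first input
    source_key: None,      // no arrangement key for the source
    initial_closure: None, // `None` closures are the identity
    stage_plans: Vec::new(),
    final_closure: None,
});
```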

View File

@@ -0,0 +1,50 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use serde::{Deserialize, Serialize};
use crate::expr::{AggregateExpr, Id, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr};
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct KeyValPlan {
pub key_plan: SafeMfpPlan,
pub val_plan: SafeMfpPlan,
}
/// TODO(discord9): define & implement hierarchical aggregates (for min/max with support for deletion),
/// basic aggregates (for other aggregate functions), and mixed aggregates
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub enum ReducePlan {
/// Plan for not computing any aggregations, just determining the set of
/// distinct keys.
Distinct,
/// Plan for computing only accumulable aggregations.
/// Including simple functions like `sum`, `count`, and `min/max` (without deletion)
Accumulable(AccumulablePlan),
}
/// Accumulable plan for the execution of a reduction.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Deserialize, Serialize)]
pub struct AccumulablePlan {
/// All of the aggregations we were asked to compute, stored
/// in order.
pub full_aggrs: Vec<AggregateExpr>,
/// All of the non-distinct accumulable aggregates.
/// Each element represents:
/// (index of aggr output, index of value among inputs, aggr expr)
/// These will all be rendered together in one dataflow fragment.
pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
/// Same as above but for all of the `DISTINCT` accumulable aggregations.
pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
}
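
A sketch contrasting the two strategies (aggregate lists left empty here, since constructing an `AggregateExpr` is outside the scope of this excerpt):

```rust
// Key-only reduction vs. an (empty) accumulable reduction.
let distinct = ReducePlan::Distinct;
let accumulable = ReducePlan::Accumulable(AccumulablePlan {
    full_aggrs: Vec::new(),
    simple_aggrs: Vec::new(),   // (output index, input index, aggr expr) triples
    distinct_aggrs: Vec::new(), // same shape, for DISTINCT aggregates
});
```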

View File

@@ -33,7 +33,10 @@ use snafu::ResultExt;
use crate::expr::error::{CastValueSnafu, EvalError};
/// System-wide Record count difference type.
/// System-wide Record count difference type. Useful for capturing data changes
///
/// i.e. +1 means insert one record, -1 means remove,
/// and +/-n means insert/remove multiple duplicate records.
pub type Diff = i64;
/// System-wide default timestamp type
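
A small illustration of the `Diff` convention described above, assuming the usual consolidation rule that diffs for the same record are summed:

```rust
// +1 inserts a copy, -1 removes one, +/-n act on n duplicate copies.
let insert: Diff = 1;
let remove: Diff = -1;
let batch: Diff = 3;
assert_eq!(insert + remove, 0); // insert then remove cancels out
assert_eq!(batch + remove, 2);  // three inserts minus one removal leaves two copies
```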

View File

@@ -28,6 +28,7 @@ use api::v1::meta::Role;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use catalog::CatalogManagerRef;
use client::OutputData;
use common_base::Plugins;
use common_config::KvBackendConfig;
use common_error::ext::BoxedError;
@@ -401,13 +402,13 @@ impl SqlQueryHandler for Instance {
/// Attaches a timer to the output and observes it once the output is exhausted.
pub fn attach_timer(output: Output, timer: HistogramTimer) -> Output {
match output {
Output::AffectedRows(_) | Output::RecordBatches(_) => output,
Output::Stream(stream, plan) => {
match output.data {
OutputData::AffectedRows(_) | OutputData::RecordBatches(_) => output,
OutputData::Stream(stream) => {
let stream = OnDone::new(stream, move || {
timer.observe_duration();
});
Output::Stream(Box::pin(stream), plan)
Output::new(OutputData::Stream(Box::pin(stream)), output.meta)
}
}
}

View File

@@ -113,7 +113,7 @@ impl GrpcQueryHandler for Instance {
.statement_executor
.create_table_inner(&mut expr, None, &ctx)
.await?;
Output::AffectedRows(0)
Output::new_with_affected_rows(0)
}
DdlExpr::Alter(expr) => self.statement_executor.alter_table_inner(expr).await?,
DdlExpr::CreateDatabase(expr) => {

View File

@@ -47,8 +47,8 @@ impl OpentsdbProtocolHandler for Instance {
.map_err(BoxedError::new)
.context(servers::error::ExecuteGrpcQuerySnafu)?;
Ok(match output {
common_query::Output::AffectedRows(rows) => rows,
Ok(match output.data {
common_query::OutputData::AffectedRows(rows) => rows,
_ => unreachable!(),
})
}

View File

@@ -19,6 +19,7 @@ use api::prom_store::remote::{Query, QueryResult, ReadRequest, ReadResponse, Wri
use api::v1::RowInsertRequests;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use client::OutputData;
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
@@ -77,7 +78,7 @@ fn negotiate_response_type(accepted_response_types: &[i32]) -> ServerResult<Resp
}
async fn to_query_result(table_name: &str, output: Output) -> ServerResult<QueryResult> {
let Output::Stream(stream, _) = output else {
let OutputData::Stream(stream) = output.data else {
unreachable!()
};
let recordbatches = RecordBatches::try_collect(stream)

View File

@@ -152,6 +152,10 @@ impl TxnService for RaftEngineBackend {
responses,
})
}
fn max_txn_ops(&self) -> usize {
usize::MAX
}
}
#[async_trait::async_trait]

View File

@@ -24,7 +24,9 @@ fn main() {
#[tokio::main]
async fn run() {
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"]).await.unwrap();
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"], 128)
.await
.unwrap();
// put
let put_req = PutRequest {

View File

@@ -193,7 +193,8 @@ pub async fn metasrv_builder(
(None, false) => {
let etcd_client = create_etcd_client(opts).await?;
let kv_backend = {
let etcd_backend = EtcdStore::with_etcd_client(etcd_client.clone());
let etcd_backend =
EtcdStore::with_etcd_client(etcd_client.clone(), opts.max_txn_ops);
if !opts.store_key_prefix.is_empty() {
Arc::new(ChrootKvBackend::new(
opts.store_key_prefix.clone().into_bytes(),

View File

@@ -79,6 +79,17 @@ pub struct MetaSrvOptions {
pub wal: MetaSrvWalConfig,
pub export_metrics: ExportMetricsOption,
pub store_key_prefix: String,
/// The maximum number of operations allowed in a single txn.
///
/// This value is usually limited by the store backing the `KvBackend`.
/// For example, when using etcd, it should be less than or equal to
/// etcd's `--max-txn-ops` option.
///
/// TODO(jeremy): Currently, this option only affects the etcd store, but it may
/// also affect other stores in the future. In other words, each store needs to
/// limit the number of operations in a txn because an infinitely large txn could
/// potentially block other operations.
pub max_txn_ops: usize,
}
impl MetaSrvOptions {
@@ -112,6 +123,7 @@ impl Default for MetaSrvOptions {
wal: MetaSrvWalConfig::default(),
export_metrics: ExportMetricsOption::default(),
store_key_prefix: String::new(),
max_txn_ops: 128,
}
}
}
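
A sketch of how `max_txn_ops` reaches the etcd-backed store, mirroring the call sites changed above (the endpoint address matches the repo's example and is only illustrative; this must run in an async context):

```rust
// Keep `max_txn_ops` at or below etcd's own `--max-txn-ops` limit.
let opts = MetaSrvOptions {
    max_txn_ops: 128,
    ..Default::default()
};
let kv_backend = EtcdStore::with_endpoints(["127.0.0.1:2380"], opts.max_txn_ops)
    .await
    .unwrap();
```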

View File

@@ -42,7 +42,7 @@ pub async fn mock_with_memstore() -> MockInfo {
}
pub async fn mock_with_etcdstore(addr: &str) -> MockInfo {
let kv_backend = EtcdStore::with_endpoints([addr]).await.unwrap();
let kv_backend = EtcdStore::with_endpoints([addr], 128).await.unwrap();
mock(Default::default(), kv_backend, None, None).await
}

View File

@@ -380,6 +380,10 @@ impl TxnService for LeaderCachedKvBackend {
Ok(res)
}
fn max_txn_ops(&self) -> usize {
self.store.max_txn_ops()
}
}
impl ResettableKvBackend for LeaderCachedKvBackend {

View File

@@ -79,5 +79,6 @@ rand.workspace = true
toml.workspace = true
[[bench]]
name = "bench_merge_tree"
name = "memtable_bench"
harness = false
required-features = ["test"]

View File

@@ -7,3 +7,9 @@ The Alfa Romeo [MiTo](https://en.wikipedia.org/wiki/Alfa_Romeo_MiTo) is a front-
> "You can't be a true petrolhead until you've owned an Alfa Romeo."
> <div align="right">-- by Jeremy Clarkson</div>
## Benchmarks
Run benchmarks in this crate:
```bash
cargo bench -p mito2 -F test
```

View File

@@ -0,0 +1,352 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use criterion::{criterion_group, criterion_main, Criterion};
use datafusion_common::Column;
use datafusion_expr::{lit, Expr};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{KeyValues, Memtable};
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
use rand::rngs::ThreadRng;
use rand::seq::SliceRandom;
use rand::Rng;
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::RegionId;
use table::predicate::Predicate;
/// Writes rows.
fn write_rows(c: &mut Criterion) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let timestamps = (0..100).collect::<Vec<_>>();
// Note that this benchmark only generates one time series.
let mut group = c.benchmark_group("write");
group.bench_function("merge_tree", |b| {
let memtable =
MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
}
/// Scans all rows.
fn full_scan(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("full_scan");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
b.iter(|| {
let iter = memtable.iter(None, None).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
b.iter(|| {
let iter = memtable.iter(None, None).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
}
/// Filters 1 host.
fn filter_1_host(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let config = MergeTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("filter_1_host");
group.sample_size(10);
group.bench_function("merge_tree", |b| {
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &config);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
let predicate = generator.random_host_filter();
b.iter(|| {
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None);
for kvs in generator.iter() {
memtable.write(&kvs).unwrap();
}
let predicate = generator.random_host_filter();
b.iter(|| {
let iter = memtable.iter(None, Some(predicate.clone())).unwrap();
for batch in iter {
let _batch = batch.unwrap();
}
});
});
}
struct Host {
hostname: String,
region: String,
datacenter: String,
rack: String,
os: String,
arch: String,
team: String,
service: String,
service_version: String,
service_environment: String,
}
impl Host {
fn random_with_id(id: usize) -> Host {
let mut rng = rand::thread_rng();
let region = format!("ap-southeast-{}", rng.gen_range(0..10));
let datacenter = format!(
"{}{}",
region,
['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap()
);
Host {
hostname: format!("host_{id}"),
region,
datacenter,
rack: rng.gen_range(0..100).to_string(),
os: "Ubuntu16.04LTS".to_string(),
arch: "x86".to_string(),
team: "CHI".to_string(),
service: rng.gen_range(0..100).to_string(),
service_version: rng.gen_range(0..10).to_string(),
service_environment: "test".to_string(),
}
}
fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
let tags = [
api::v1::Value {
value_data: Some(ValueData::StringValue(self.hostname.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.region.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.datacenter.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.rack.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.os.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.arch.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.team.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service_version.clone())),
},
api::v1::Value {
value_data: Some(ValueData::StringValue(self.service_environment.clone())),
},
];
for tag in tags {
values.push(tag);
}
}
}
struct CpuDataGenerator {
metadata: RegionMetadataRef,
column_schemas: Vec<api::v1::ColumnSchema>,
hosts: Vec<Host>,
start_sec: i64,
end_sec: i64,
}
impl CpuDataGenerator {
fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
let column_schemas = region_metadata_to_row_schema(&metadata);
Self {
metadata,
column_schemas,
hosts: Self::generate_hosts(num_hosts),
start_sec,
end_sec,
}
}
fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
// One data point per 10 seconds.
(self.start_sec..self.end_sec)
.step_by(10)
.enumerate()
.map(|(seq, ts)| self.build_key_values(seq, ts))
}
fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
let rows = self
.hosts
.iter()
.map(|host| {
let mut rng = rand::thread_rng();
let mut values = Vec::with_capacity(21);
values.push(api::v1::Value {
value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
});
host.fill_values(&mut values);
for _ in 0..10 {
values.push(api::v1::Value {
value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
});
}
Row { values }
})
.collect();
let mutation = api::v1::Mutation {
op_type: api::v1::OpType::Put as i32,
sequence: seq as u64,
rows: Some(Rows {
schema: self.column_schemas.clone(),
rows,
}),
};
KeyValues::new(&self.metadata, mutation).unwrap()
}
fn random_host_filter(&self) -> Predicate {
let host = self.random_hostname();
let expr = Expr::Column(Column::from_name("hostname")).eq(lit(host));
Predicate::new(vec![expr.into()])
}
fn random_hostname(&self) -> String {
let mut rng = rand::thread_rng();
self.hosts.choose(&mut rng).unwrap().hostname.clone()
}
fn random_f64(rng: &mut ThreadRng) -> f64 {
let base: u32 = rng.gen_range(30..95);
base as f64
}
fn generate_hosts(num_hosts: usize) -> Vec<Host> {
(0..num_hosts).map(Host::random_with_id).collect()
}
}
/// Creates metadata for a TSBS cpu-like table.
fn cpu_metadata() -> RegionMetadata {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
"ts",
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
semantic_type: SemanticType::Timestamp,
column_id: 0,
});
let mut column_id = 1;
let tags = [
"hostname",
"region",
"datacenter",
"rack",
"os",
"arch",
"team",
"service",
"service_version",
"service_environment",
];
for tag in tags {
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(tag, ConcreteDataType::string_datatype(), true),
semantic_type: SemanticType::Tag,
column_id,
});
column_id += 1;
}
let fields = [
"usage_user",
"usage_system",
"usage_idle",
"usage_nice",
"usage_iowait",
"usage_irq",
"usage_softirq",
"usage_steal",
"usage_guest",
"usage_guest_nice",
];
for field in fields {
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(field, ConcreteDataType::float64_datatype(), true),
semantic_type: SemanticType::Field,
column_id,
});
column_id += 1;
}
builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
builder.build().unwrap()
}
criterion_group!(benches, write_rows, full_scan, filter_1_host);
criterion_main!(benches);

View File

@@ -1,36 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use criterion::{criterion_group, criterion_main, Criterion};
use mito2::memtable::merge_tree::{MergeTreeConfig, MergeTreeMemtable};
use mito2::memtable::Memtable;
use mito2::test_util::memtable_util;
fn bench_merge_tree_memtable(c: &mut Criterion) {
let metadata = memtable_util::metadata_with_primary_key(vec![1, 0], true);
let timestamps = (0..100).collect::<Vec<_>>();
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
let _ = c.bench_function("MergeTreeMemtable", |b| {
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &timestamps, 1);
b.iter(|| {
memtable.write(&kvs).unwrap();
});
});
}
criterion_group!(benches, bench_merge_tree_memtable);
criterion_main!(benches);

View File

@@ -158,7 +158,7 @@ impl CacheManager {
}
}
/// Gets the the write cache.
/// Gets the write cache.
pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
self.write_cache.as_ref()
}

View File

@@ -26,6 +26,7 @@ use table::predicate::Predicate;
use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
use crate::memtable::key_values::KeyValue;
pub use crate::memtable::key_values::KeyValues;
use crate::memtable::merge_tree::MergeTreeConfig;
use crate::metrics::WRITE_BUFFER_BYTES;
@@ -33,6 +34,7 @@ use crate::read::Batch;
pub mod key_values;
pub mod merge_tree;
pub mod time_partition;
pub mod time_series;
pub(crate) mod version;
@@ -82,9 +84,12 @@ pub trait Memtable: Send + Sync + fmt::Debug {
/// Returns the id of this memtable.
fn id(&self) -> MemtableId;
/// Write key values into the memtable.
/// Writes key values into the memtable.
fn write(&self, kvs: &KeyValues) -> Result<()>;
/// Writes one key value pair into the memtable.
fn write_one(&self, key_value: KeyValue) -> Result<()>;
/// Scans the memtable.
/// `projection` selects columns to read, `None` means reading all columns.
/// `filters` are the predicates to be pushed down to memtable.
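
A usage sketch of the trait surface above, mirroring the benchmark and test code elsewhere in this PR. It assumes `metadata` describes a schema with a `hostname` tag (like the benchmark's TSBS-style schema) and `kvs` holds a batch of rows for it; the `"host_0"` literal is only an example value.

```rust
// Build a merge-tree memtable, write a batch, then scan it twice:
// once fully and once with a predicate pushed down on the `hostname` tag.
let memtable = MergeTreeMemtable::new(1, metadata.clone(), None, &MergeTreeConfig::default());
memtable.write(&kvs).unwrap();

// Full scan: no projection, no filters.
for batch in memtable.iter(None, None).unwrap() {
    let _batch = batch.unwrap();
}

// Filtered scan: only series whose `hostname` tag equals "host_0".
let expr = Expr::Column(Column::from_name("hostname")).eq(lit("host_0"));
for batch in memtable
    .iter(None, Some(Predicate::new(vec![expr.into()])))
    .unwrap()
{
    let _batch = batch.unwrap();
}
```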

View File

@@ -71,7 +71,7 @@ impl KeyValues {
/// Primary key columns have the same order as region's primary key. Field
/// columns are ordered by their position in the region schema (The same order
/// as users defined while creating the region).
#[derive(Debug)]
#[derive(Debug, Clone, Copy)]
pub struct KeyValue<'a> {
row: &'a Row,
schema: &'a Vec<ColumnSchema>,

View File

@@ -36,6 +36,7 @@ use table::predicate::Predicate;
use crate::error::Result;
use crate::flush::WriteBufferManagerRef;
use crate::memtable::key_values::KeyValue;
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::tree::MergeTree;
use crate::memtable::{
@@ -85,7 +86,7 @@ impl Default for MergeTreeConfig {
Self {
index_max_keys_per_shard: 8192,
data_freeze_threshold: 32768,
data_freeze_threshold: 131072,
dedup: true,
fork_dictionary_bytes,
}
@@ -127,6 +128,17 @@ impl Memtable for MergeTreeMemtable {
res
}
fn write_one(&self, key_value: KeyValue) -> Result<()> {
let mut metrics = WriteMetrics::default();
let mut pk_buffer = Vec::new();
// Ensures the memtable always updates stats.
let res = self.tree.write_one(key_value, &mut pk_buffer, &mut metrics);
self.update_stats(&metrics);
res
}
fn iter(
&self,
projection: Option<&[ColumnId]>,
@@ -290,14 +302,14 @@ impl MemtableBuilder for MergeTreeMemtableBuilder {
#[cfg(test)]
mod tests {
use std::collections::BTreeSet;
use common_time::Timestamp;
use datafusion_common::{Column, ScalarValue};
use datafusion_expr::{BinaryExpr, Expr, Operator};
use datatypes::scalars::ScalarVector;
use datatypes::vectors::{Int64Vector, TimestampMillisecondVector};
use datatypes::vectors::Int64Vector;
use super::*;
use crate::test_util::memtable_util;
use crate::test_util::memtable_util::{self, collect_iter_timestamps};
#[test]
fn test_memtable_sorted_input() {
@@ -320,23 +332,10 @@ mod tests {
let expected_ts = kvs
.iter()
.map(|kv| kv.timestamp().as_timestamp().unwrap().unwrap().value())
.collect::<BTreeSet<_>>();
.collect::<Vec<_>>();
let iter = memtable.iter(None, None).unwrap();
let read = iter
.flat_map(|batch| {
batch
.unwrap()
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>()
.into_iter()
})
.map(|v| v.unwrap().0.value())
.collect::<BTreeSet<_>>();
let read = collect_iter_timestamps(iter);
assert_eq!(expected_ts, read);
let stats = memtable.stats();
@@ -384,20 +383,7 @@ mod tests {
memtable.write(&kvs).unwrap();
let iter = memtable.iter(None, None).unwrap();
let read = iter
.flat_map(|batch| {
batch
.unwrap()
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>()
.into_iter()
})
.map(|v| v.unwrap().0.value())
.collect::<Vec<_>>();
let read = collect_iter_timestamps(iter);
assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], read);
let iter = memtable.iter(None, None).unwrap();
@@ -512,20 +498,45 @@ mod tests {
let expect = data.into_iter().map(|x| x.2).collect::<Vec<_>>();
let iter = memtable.iter(None, None).unwrap();
let read = iter
.flat_map(|batch| {
batch
.unwrap()
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>()
.into_iter()
})
.map(|v| v.unwrap().0.value())
.collect::<Vec<_>>();
let read = collect_iter_timestamps(iter);
assert_eq!(expect, read);
}
#[test]
fn test_memtable_filter() {
let metadata = memtable_util::metadata_with_primary_key(vec![0, 1], false);
// Try to build a memtable via the builder.
let memtable = MergeTreeMemtableBuilder::new(
MergeTreeConfig {
index_max_keys_per_shard: 40,
..Default::default()
},
None,
)
.build(1, &metadata);
for i in 0..100 {
let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
let kvs =
memtable_util::build_key_values(&metadata, "hello".to_string(), i, &timestamps, 1);
memtable.write(&kvs).unwrap();
}
for i in 0..100 {
let timestamps: Vec<_> = (0..10).map(|v| i as i64 * 1000 + v).collect();
let expr = Expr::BinaryExpr(BinaryExpr {
left: Box::new(Expr::Column(Column {
relation: None,
name: "k1".to_string(),
})),
op: Operator::Eq,
right: Box::new(Expr::Literal(ScalarValue::UInt32(Some(i)))),
});
let iter = memtable
.iter(None, Some(Predicate::new(vec![expr.into()])))
.unwrap();
let read = collect_iter_timestamps(iter);
assert_eq!(timestamps, read);
}
}
}

View File

@@ -957,6 +957,18 @@ impl DataParts {
self.active.write_row(pk_index, kv)
}
/// Returns the number of rows in the active buffer.
pub fn num_active_rows(&self) -> usize {
self.active.num_rows()
}
/// Freezes active buffer and creates a new active buffer.
pub fn freeze(&mut self) -> Result<()> {
let part = self.active.freeze(None, false)?;
self.frozen.push(part);
Ok(())
}
/// Reads data from all parts including active and frozen parts.
/// The returned iterator yields a record batch of one primary key at a time.
/// The order of yielding primary keys is determined by provided weights.
@@ -976,6 +988,11 @@ impl DataParts {
pub(crate) fn is_empty(&self) -> bool {
self.active.is_empty() && self.frozen.iter().all(|part| part.is_empty())
}
#[cfg(test)]
pub(crate) fn frozen_len(&self) -> usize {
self.frozen.len()
}
}
pub struct DataPartsReaderBuilder {
@@ -994,9 +1011,11 @@ impl DataPartsReaderBuilder {
for p in self.parts {
nodes.push(DataNode::new(DataSource::Part(p)));
}
let num_parts = nodes.len();
let merger = Merger::try_new(nodes)?;
Ok(DataPartsReader {
merger,
num_parts,
elapsed: Default::default(),
})
}
@@ -1005,6 +1024,7 @@ impl DataPartsReaderBuilder {
/// Reader for all parts inside a `DataParts`.
pub struct DataPartsReader {
merger: Merger<DataNode>,
num_parts: usize,
elapsed: Duration,
}
@@ -1032,6 +1052,10 @@ impl DataPartsReader {
pub(crate) fn is_valid(&self) -> bool {
self.merger.is_valid()
}
pub(crate) fn num_parts(&self) -> usize {
self.num_parts
}
}
#[cfg(test)]

View File

@@ -45,7 +45,7 @@ impl<T: DataBatchSource> DataBatchSource for DedupReader<T> {
}
fn next(&mut self) -> Result<()> {
loop {
while self.inner.is_valid() {
match &mut self.prev_batch_last_row {
None => {
// First shot, fill prev_batch_last_row and current_batch_range with first batch.

View File

@@ -78,7 +78,7 @@ impl Partition {
// Finds key in shards, now we ensure one key only exists in one shard.
if let Some(pk_id) = inner.find_key_in_shards(primary_key) {
inner.write_to_shard(pk_id, &key_value);
inner.write_to_shard(pk_id, &key_value)?;
inner.num_rows += 1;
return Ok(());
}
@@ -106,7 +106,7 @@ impl Partition {
}
/// Writes to the partition without a primary key.
pub fn write_no_key(&self, key_value: KeyValue) {
pub fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
let mut inner = self.inner.write().unwrap();
// If no primary key, always write to the first shard.
debug_assert!(!inner.shards.is_empty());
@@ -117,12 +117,24 @@ impl Partition {
shard_id: 0,
pk_index: 0,
};
inner.shards[0].write_with_pk_id(pk_id, &key_value);
inner.shards[0].write_with_pk_id(pk_id, &key_value)?;
inner.num_rows += 1;
Ok(())
}
/// Scans data in the partition.
pub fn read(&self, mut context: ReadPartitionContext) -> Result<PartitionReader> {
let start = Instant::now();
let key_filter = if context.need_prune_key {
Some(PrimaryKeyFilter::new(
context.metadata.clone(),
context.filters.clone(),
context.row_codec.clone(),
))
} else {
None
};
let (builder_source, shard_reader_builders) = {
let inner = self.inner.read().unwrap();
let mut shard_source = Vec::with_capacity(inner.shards.len() + 1);
@@ -141,14 +153,21 @@ impl Partition {
(builder_reader, shard_source)
};
context.metrics.num_shards += shard_reader_builders.len();
let mut nodes = shard_reader_builders
.into_iter()
.map(|builder| Ok(ShardNode::new(ShardSource::Shard(builder.build()?))))
.map(|builder| {
Ok(ShardNode::new(ShardSource::Shard(
builder.build(key_filter.clone())?,
)))
})
.collect::<Result<Vec<_>>>()?;
if let Some(builder) = builder_source {
context.metrics.num_builder += 1;
// Move the initialization of ShardBuilderReader out of read lock.
let shard_builder_reader = builder.build(Some(&context.pk_weights))?;
let shard_builder_reader =
builder.build(Some(&context.pk_weights), key_filter.clone())?;
nodes.push(ShardNode::new(ShardSource::Builder(shard_builder_reader)));
}
@@ -156,8 +175,10 @@ impl Partition {
let merger = ShardMerger::try_new(nodes)?;
if self.dedup {
let source = DedupReader::try_new(merger)?;
context.metrics.build_partition_reader += start.elapsed();
PartitionReader::new(context, Box::new(source))
} else {
context.metrics.build_partition_reader += start.elapsed();
PartitionReader::new(context, Box::new(merger))
}
}
@@ -266,11 +287,11 @@ pub(crate) struct PartitionStats {
#[derive(Default)]
struct PartitionReaderMetrics {
prune_pk: Duration,
build_partition_reader: Duration,
read_source: Duration,
data_batch_to_batch: Duration,
keys_before_pruning: usize,
keys_after_pruning: usize,
num_builder: usize,
num_shards: usize,
}
/// Reader to scan rows in a partition.
@@ -279,18 +300,11 @@ struct PartitionReaderMetrics {
pub struct PartitionReader {
context: ReadPartitionContext,
source: BoxedDataBatchSource,
last_yield_pk_id: Option<PkId>,
}
impl PartitionReader {
fn new(context: ReadPartitionContext, source: BoxedDataBatchSource) -> Result<Self> {
let mut reader = Self {
context,
source,
last_yield_pk_id: None,
};
// Find next valid batch.
reader.prune_batch_by_key()?;
let reader = Self { context, source };
Ok(reader)
}
@@ -305,8 +319,7 @@ impl PartitionReader {
/// # Panics
/// Panics if the reader is invalid.
pub fn next(&mut self) -> Result<()> {
self.advance_source()?;
self.prune_batch_by_key()
self.advance_source()
}
/// Converts current data batch into a [Batch].
@@ -336,106 +349,77 @@ impl PartitionReader {
self.context.metrics.read_source += read_source.elapsed();
Ok(())
}
fn prune_batch_by_key(&mut self) -> Result<()> {
if self.context.metadata.primary_key.is_empty() || !self.context.need_prune_key {
// Nothing to prune.
return Ok(());
}
while self.source.is_valid() {
let pk_id = self.source.current_pk_id();
if let Some(yield_pk_id) = self.last_yield_pk_id {
if pk_id == yield_pk_id {
// If this batch has the same key as last returned batch.
// We can return it without evaluating filters.
break;
}
}
let key = self.source.current_key().unwrap();
self.context.metrics.keys_before_pruning += 1;
// Prune batch by primary key.
if prune_primary_key(
&self.context.metadata,
&self.context.filters,
&self.context.row_codec,
key,
&mut self.context.metrics,
) {
// We need this key.
self.last_yield_pk_id = Some(pk_id);
self.context.metrics.keys_after_pruning += 1;
break;
}
self.advance_source()?;
}
Ok(())
}
}
fn prune_primary_key(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
codec: &McmpRowCodec,
pk: &[u8],
metrics: &mut PartitionReaderMetrics,
) -> bool {
let start = Instant::now();
let res = prune_primary_key_inner(metadata, filters, codec, pk);
metrics.prune_pk += start.elapsed();
res
#[derive(Clone)]
pub(crate) struct PrimaryKeyFilter {
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: Arc<McmpRowCodec>,
offsets_buf: Vec<usize>,
}
// TODO(yingwen): Improve performance of key pruning. Now we need to find index and
// then decode and convert each value.
/// Returns true if the `pk` is still needed.
fn prune_primary_key_inner(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
codec: &McmpRowCodec,
pk: &[u8],
) -> bool {
if filters.is_empty() {
return true;
impl PrimaryKeyFilter {
pub(crate) fn new(
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: Arc<McmpRowCodec>,
) -> Self {
Self {
metadata,
filters,
codec,
offsets_buf: Vec::new(),
}
}
// no primary key, we simply return true.
if metadata.primary_key.is_empty() {
return true;
}
let pk_values = match codec.decode(pk) {
Ok(values) => values,
Err(e) => {
common_telemetry::error!(e; "Failed to decode primary key");
pub(crate) fn prune_primary_key(&mut self, pk: &[u8]) -> bool {
if self.filters.is_empty() {
return true;
}
};
// evaluate filters against primary key values
let mut result = true;
for filter in filters {
if Partition::is_partition_column(filter.column_name()) {
continue;
// no primary key, we simply return true.
if self.metadata.primary_key.is_empty() {
return true;
}
let Some(column) = metadata.column_by_name(filter.column_name()) else {
continue;
};
// ignore filters that are not referencing primary key columns
if column.semantic_type != SemanticType::Tag {
continue;
// evaluate filters against primary key values
let mut result = true;
self.offsets_buf.clear();
for filter in &*self.filters {
if Partition::is_partition_column(filter.column_name()) {
continue;
}
let Some(column) = self.metadata.column_by_name(filter.column_name()) else {
continue;
};
// ignore filters that are not referencing primary key columns
if column.semantic_type != SemanticType::Tag {
continue;
}
// index of the column in primary keys.
// Safety: A tag column is always in primary key.
let index = self.metadata.primary_key_index(column.column_id).unwrap();
let value = match self.codec.decode_value_at(pk, index, &mut self.offsets_buf) {
Ok(v) => v,
Err(e) => {
common_telemetry::error!(e; "Failed to decode primary key");
return true;
}
};
// TODO(yingwen): `evaluate_scalar()` creates temporary arrays to compare scalars. We
// can compare the bytes directly without allocation and matching types as we use
// comparable encoding.
// Safety: arrow schema and datatypes are constructed from the same source.
let scalar_value = value
.try_to_scalar_value(&column.column_schema.data_type)
.unwrap();
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
}
// index of the column in primary keys.
// Safety: A tag column is always in primary key.
let index = metadata.primary_key_index(column.column_id).unwrap();
// Safety: arrow schema and datatypes are constructed from the same source.
let scalar_value = pk_values[index]
.try_to_scalar_value(&column.column_schema.data_type)
.unwrap();
result &= filter.evaluate_scalar(&scalar_value).unwrap_or(true);
result
}
result
}
/// Structs to reuse across readers to avoid allocating for each reader.
@@ -443,7 +427,7 @@ pub(crate) struct ReadPartitionContext {
metadata: RegionMetadataRef,
row_codec: Arc<McmpRowCodec>,
projection: HashSet<ColumnId>,
filters: Vec<SimpleFilterEvaluator>,
filters: Arc<Vec<SimpleFilterEvaluator>>,
/// Buffer to store pk weights.
pk_weights: Vec<u16>,
need_prune_key: bool,
@@ -452,10 +436,6 @@ pub(crate) struct ReadPartitionContext {
impl Drop for ReadPartitionContext {
fn drop(&mut self) {
let partition_prune_pk = self.metrics.prune_pk.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_prune_pk"])
.observe(partition_prune_pk);
let partition_read_source = self.metrics.read_source.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["partition_read_source"])
@@ -465,16 +445,19 @@ impl Drop for ReadPartitionContext {
.with_label_values(&["partition_data_batch_to_batch"])
.observe(partition_data_batch_to_batch);
if self.metrics.keys_before_pruning != 0 {
common_telemetry::debug!(
"TreeIter pruning, before: {}, after: {}, partition_read_source: {}s, partition_prune_pk: {}s, partition_data_batch_to_batch: {}s",
self.metrics.keys_before_pruning,
self.metrics.keys_after_pruning,
partition_read_source,
partition_prune_pk,
partition_data_batch_to_batch,
);
}
common_telemetry::debug!(
"TreeIter partitions metrics, \
num_builder: {}, \
num_shards: {}, \
build_partition_reader: {}s, \
partition_read_source: {}s, \
partition_data_batch_to_batch: {}s",
self.metrics.num_builder,
self.metrics.num_shards,
self.metrics.build_partition_reader.as_secs_f64(),
partition_read_source,
partition_data_batch_to_batch,
);
}
}
@@ -490,7 +473,7 @@ impl ReadPartitionContext {
metadata,
row_codec,
projection,
filters,
filters: Arc::new(filters),
pk_weights: Vec::new(),
need_prune_key,
metrics: Default::default(),
@@ -578,7 +561,16 @@ impl Inner {
fn new(metadata: RegionMetadataRef, config: &MergeTreeConfig) -> Self {
let (shards, current_shard_id) = if metadata.primary_key.is_empty() {
let data_parts = DataParts::new(metadata.clone(), DATA_INIT_CAP, config.dedup);
(vec![Shard::new(0, None, data_parts, config.dedup)], 1)
(
vec![Shard::new(
0,
None,
data_parts,
config.dedup,
config.data_freeze_threshold,
)],
1,
)
} else {
(Vec::new(), 0)
};
@@ -598,18 +590,22 @@ impl Inner {
self.pk_to_pk_id.get(primary_key).copied()
}
fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) {
fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
if pk_id.shard_id == self.shard_builder.current_shard_id() {
self.shard_builder.write_with_pk_id(pk_id, key_value);
return;
}
for shard in &mut self.shards {
if shard.shard_id == pk_id.shard_id {
shard.write_with_pk_id(pk_id, key_value);
self.num_rows += 1;
return;
}
return Ok(());
}
// Safety: We find the shard by shard id.
let shard = self
.shards
.iter_mut()
.find(|shard| shard.shard_id == pk_id.shard_id)
.unwrap();
shard.write_with_pk_id(pk_id, key_value)?;
self.num_rows += 1;
Ok(())
}
fn freeze_active_shard(&mut self) -> Result<()> {

View File

@@ -15,6 +15,7 @@
//! Shard in a partition.
use std::cmp::Ordering;
use std::time::{Duration, Instant};
use store_api::metadata::RegionMetadataRef;
@@ -25,8 +26,10 @@ use crate::memtable::merge_tree::data::{
};
use crate::memtable::merge_tree::dict::KeyDictRef;
use crate::memtable::merge_tree::merger::{Merger, Node};
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard_builder::ShardBuilderReader;
use crate::memtable::merge_tree::{PkId, ShardId};
use crate::memtable::merge_tree::{PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
/// Shard stores data related to the same key dictionary.
pub struct Shard {
@@ -36,6 +39,8 @@ pub struct Shard {
/// Data in the shard.
data_parts: DataParts,
dedup: bool,
/// Number of rows in the active buffer that triggers freezing it into a data part.
data_freeze_threshold: usize,
}
impl Shard {
@@ -45,20 +50,29 @@ impl Shard {
key_dict: Option<KeyDictRef>,
data_parts: DataParts,
dedup: bool,
data_freeze_threshold: usize,
) -> Shard {
Shard {
shard_id,
key_dict,
data_parts,
dedup,
data_freeze_threshold,
}
}
/// Writes a key value into the shard.
pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) {
///
/// It freezes the active buffer if it is full.
pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
debug_assert_eq!(self.shard_id, pk_id.shard_id);
if self.data_parts.num_active_rows() >= self.data_freeze_threshold {
self.data_parts.freeze()?;
}
self.data_parts.write_row(pk_id.pk_index, key_value);
Ok(())
}
/// Scans the shard.
@@ -80,6 +94,7 @@ impl Shard {
key_dict: self.key_dict.clone(),
data_parts: DataParts::new(metadata, DATA_INIT_CAP, self.dedup),
dedup: self.dedup,
data_freeze_threshold: self.data_freeze_threshold,
}
}
@@ -131,18 +146,15 @@ pub struct ShardReaderBuilder {
}
impl ShardReaderBuilder {
pub(crate) fn build(self) -> Result<ShardReader> {
pub(crate) fn build(self, key_filter: Option<PrimaryKeyFilter>) -> Result<ShardReader> {
let ShardReaderBuilder {
shard_id,
key_dict,
inner,
} = self;
let now = Instant::now();
let parts_reader = inner.build()?;
Ok(ShardReader {
shard_id,
key_dict,
parts_reader,
})
ShardReader::new(shard_id, key_dict, parts_reader, key_filter, now.elapsed())
}
}
@@ -151,15 +163,46 @@ pub struct ShardReader {
shard_id: ShardId,
key_dict: Option<KeyDictRef>,
parts_reader: DataPartsReader,
key_filter: Option<PrimaryKeyFilter>,
last_yield_pk_index: Option<PkIndex>,
keys_before_pruning: usize,
keys_after_pruning: usize,
prune_pk_cost: Duration,
data_build_cost: Duration,
}
impl ShardReader {
fn new(
shard_id: ShardId,
key_dict: Option<KeyDictRef>,
parts_reader: DataPartsReader,
key_filter: Option<PrimaryKeyFilter>,
data_build_cost: Duration,
) -> Result<Self> {
let has_pk = key_dict.is_some();
let mut reader = Self {
shard_id,
key_dict,
parts_reader,
key_filter: if has_pk { key_filter } else { None },
last_yield_pk_index: None,
keys_before_pruning: 0,
keys_after_pruning: 0,
prune_pk_cost: Duration::default(),
data_build_cost,
};
reader.prune_batch_by_key()?;
Ok(reader)
}
fn is_valid(&self) -> bool {
self.parts_reader.is_valid()
}
fn next(&mut self) -> Result<()> {
self.parts_reader.next()
self.parts_reader.next()?;
self.prune_batch_by_key()
}
fn current_key(&self) -> Option<&[u8]> {
@@ -180,6 +223,54 @@ impl ShardReader {
fn current_data_batch(&self) -> DataBatch {
self.parts_reader.current_data_batch()
}
fn prune_batch_by_key(&mut self) -> Result<()> {
let Some(key_filter) = &mut self.key_filter else {
return Ok(());
};
while self.parts_reader.is_valid() {
let pk_index = self.parts_reader.current_data_batch().pk_index();
if let Some(yield_pk_index) = self.last_yield_pk_index {
if pk_index == yield_pk_index {
break;
}
}
self.keys_before_pruning += 1;
// Safety: `key_filter` is some so the shard has primary keys.
let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index);
let now = Instant::now();
if key_filter.prune_primary_key(key) {
self.prune_pk_cost += now.elapsed();
self.last_yield_pk_index = Some(pk_index);
self.keys_after_pruning += 1;
break;
}
self.prune_pk_cost += now.elapsed();
self.parts_reader.next()?;
}
Ok(())
}
}
impl Drop for ShardReader {
fn drop(&mut self) {
let shard_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_prune_pk"])
.observe(shard_prune_pk);
if self.keys_before_pruning > 0 {
common_telemetry::debug!(
"ShardReader metrics, data parts: {}, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
self.parts_reader.num_parts(),
self.keys_before_pruning,
self.keys_after_pruning,
shard_prune_pk,
self.data_build_cost.as_secs_f64(),
);
}
}
}
/// A merger that merges batches from multiple shards.
@@ -388,6 +479,7 @@ mod tests {
shard_id: ShardId,
metadata: RegionMetadataRef,
input: &[(KeyValues, PkIndex)],
data_freeze_threshold: usize,
) -> Shard {
let mut dict_builder = KeyDictBuilder::new(1024);
let mut metrics = WriteMetrics::default();
@@ -402,27 +494,17 @@ mod tests {
let dict = dict_builder.finish(&mut BTreeMap::new()).unwrap();
let data_parts = DataParts::new(metadata, DATA_INIT_CAP, true);
Shard::new(shard_id, Some(Arc::new(dict)), data_parts, true)
Shard::new(
shard_id,
Some(Arc::new(dict)),
data_parts,
true,
data_freeze_threshold,
)
}
#[test]
fn test_write_read_shard() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let mut shard = new_shard_with_dict(8, metadata, &input);
assert!(shard.is_empty());
for (key_values, pk_index) in &input {
for kv in key_values.iter() {
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *pk_index,
};
shard.write_with_pk_id(pk_id, &kv);
}
}
assert!(!shard.is_empty());
let mut reader = shard.read().unwrap().build().unwrap();
fn collect_timestamps(shard: &Shard) -> Vec<i64> {
let mut reader = shard.read().unwrap().build(None).unwrap();
let mut timestamps = Vec::new();
while reader.is_valid() {
let rb = reader.current_data_batch().slice_record_batch();
@@ -432,6 +514,64 @@ mod tests {
reader.next().unwrap();
}
timestamps
}
#[test]
fn test_write_read_shard() {
let metadata = metadata_for_test();
let input = input_with_key(&metadata);
let mut shard = new_shard_with_dict(8, metadata, &input, 100);
assert!(shard.is_empty());
for (key_values, pk_index) in &input {
for kv in key_values.iter() {
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *pk_index,
};
shard.write_with_pk_id(pk_id, &kv).unwrap();
}
}
assert!(!shard.is_empty());
let timestamps = collect_timestamps(&shard);
assert_eq!(vec![0, 1, 10, 11, 20, 21], timestamps);
}
#[test]
fn test_shard_freeze() {
let metadata = metadata_for_test();
let kvs = build_key_values_with_ts_seq_values(
&metadata,
"shard".to_string(),
0,
[0].into_iter(),
[Some(0.0)].into_iter(),
0,
);
let mut shard = new_shard_with_dict(8, metadata.clone(), &[(kvs, 0)], 50);
let expected: Vec<_> = (0..200).collect();
for i in &expected {
let kvs = build_key_values_with_ts_seq_values(
&metadata,
"shard".to_string(),
0,
[*i].into_iter(),
[Some(0.0)].into_iter(),
*i as u64,
);
let pk_id = PkId {
shard_id: shard.shard_id,
pk_index: *i as PkIndex,
};
for kv in kvs.iter() {
shard.write_with_pk_id(pk_id, &kv).unwrap();
}
}
assert!(!shard.is_empty());
assert_eq!(3, shard.data_parts.frozen_len());
let timestamps = collect_timestamps(&shard);
assert_eq!(expected, timestamps);
}
}

View File

@@ -16,6 +16,7 @@
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
use std::time::{Duration, Instant};
use store_api::metadata::RegionMetadataRef;
@@ -26,8 +27,9 @@ use crate::memtable::merge_tree::data::{
};
use crate::memtable::merge_tree::dict::{DictBuilderReader, KeyDictBuilder};
use crate::memtable::merge_tree::metrics::WriteMetrics;
use crate::memtable::merge_tree::partition::PrimaryKeyFilter;
use crate::memtable::merge_tree::shard::Shard;
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, ShardId};
use crate::memtable::merge_tree::{MergeTreeConfig, PkId, PkIndex, ShardId};
use crate::metrics::MERGE_TREE_READ_STAGE_ELAPSED;
/// Builder to write keys and data to a shard that the key dictionary
@@ -136,7 +138,13 @@ impl ShardBuilder {
let shard_id = self.current_shard_id;
self.current_shard_id += 1;
Ok(Some(Shard::new(shard_id, key_dict, data_parts, self.dedup)))
Ok(Some(Shard::new(
shard_id,
key_dict,
data_parts,
self.dedup,
self.data_freeze_threshold,
)))
}
/// Scans the shard builder.
@@ -176,13 +184,20 @@ pub(crate) struct ShardBuilderReaderBuilder {
}
impl ShardBuilderReaderBuilder {
pub(crate) fn build(self, pk_weights: Option<&[u16]>) -> Result<ShardBuilderReader> {
pub(crate) fn build(
self,
pk_weights: Option<&[u16]>,
key_filter: Option<PrimaryKeyFilter>,
) -> Result<ShardBuilderReader> {
let now = Instant::now();
let data_reader = self.data_reader.build(pk_weights)?;
Ok(ShardBuilderReader {
shard_id: self.shard_id,
dict_reader: self.dict_reader,
ShardBuilderReader::new(
self.shard_id,
self.dict_reader,
data_reader,
})
key_filter,
now.elapsed(),
)
}
}
@@ -191,15 +206,45 @@ pub struct ShardBuilderReader {
shard_id: ShardId,
dict_reader: DictBuilderReader,
data_reader: DataBufferReader,
key_filter: Option<PrimaryKeyFilter>,
last_yield_pk_index: Option<PkIndex>,
keys_before_pruning: usize,
keys_after_pruning: usize,
prune_pk_cost: Duration,
data_build_cost: Duration,
}
impl ShardBuilderReader {
fn new(
shard_id: ShardId,
dict_reader: DictBuilderReader,
data_reader: DataBufferReader,
key_filter: Option<PrimaryKeyFilter>,
data_build_cost: Duration,
) -> Result<Self> {
let mut reader = ShardBuilderReader {
shard_id,
dict_reader,
data_reader,
key_filter,
last_yield_pk_index: None,
keys_before_pruning: 0,
keys_after_pruning: 0,
prune_pk_cost: Duration::default(),
data_build_cost,
};
reader.prune_batch_by_key()?;
Ok(reader)
}
pub fn is_valid(&self) -> bool {
self.data_reader.is_valid()
}
pub fn next(&mut self) -> Result<()> {
self.data_reader.next()
self.data_reader.next()?;
self.prune_batch_by_key()
}
pub fn current_key(&self) -> Option<&[u8]> {
@@ -218,6 +263,52 @@ impl ShardBuilderReader {
pub fn current_data_batch(&self) -> DataBatch {
self.data_reader.current_data_batch()
}
fn prune_batch_by_key(&mut self) -> Result<()> {
let Some(key_filter) = &mut self.key_filter else {
return Ok(());
};
while self.data_reader.is_valid() {
let pk_index = self.data_reader.current_data_batch().pk_index();
if let Some(yield_pk_index) = self.last_yield_pk_index {
if pk_index == yield_pk_index {
break;
}
}
self.keys_before_pruning += 1;
let key = self.dict_reader.key_by_pk_index(pk_index);
let now = Instant::now();
if key_filter.prune_primary_key(key) {
self.prune_pk_cost += now.elapsed();
self.last_yield_pk_index = Some(pk_index);
self.keys_after_pruning += 1;
break;
}
self.prune_pk_cost += now.elapsed();
self.data_reader.next()?;
}
Ok(())
}
}
impl Drop for ShardBuilderReader {
fn drop(&mut self) {
let shard_builder_prune_pk = self.prune_pk_cost.as_secs_f64();
MERGE_TREE_READ_STAGE_ELAPSED
.with_label_values(&["shard_builder_prune_pk"])
.observe(shard_builder_prune_pk);
if self.keys_before_pruning > 0 {
common_telemetry::debug!(
"ShardBuilderReader metrics, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
self.keys_before_pruning,
self.keys_after_pruning,
shard_builder_prune_pk,
self.data_build_cost.as_secs_f64(),
);
}
}
}
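`ShardReader` and `ShardBuilderReader` share the same pruning strategy: skip ahead until the current batch's primary key passes the filter, and remember the last accepted `PkIndex` so consecutive batches of the same key are not re-checked. A dependency-free sketch of that loop, with a hypothetical `keep` predicate standing in for `PrimaryKeyFilter::prune_primary_key`:

/// Advances `batches` until the current batch's key passes `keep`, skipping
/// re-evaluation for consecutive batches that share a primary key index.
/// A simplified model of the prune loop above, not the real reader code.
fn prune_until_match<I>(
    batches: &mut std::iter::Peekable<I>,
    last_kept: &mut Option<u16>,
    keep: impl Fn(u16) -> bool,
) where
    I: Iterator<Item = (u16, Vec<i64>)>, // (pk_index, rows in the batch)
{
    while let Some(&(pk_index, _)) = batches.peek() {
        // Batches of the last accepted key are yielded without re-checking.
        if *last_kept == Some(pk_index) {
            return;
        }
        if keep(pk_index) {
            *last_kept = Some(pk_index);
            return;
        }
        // The whole batch belongs to a pruned key; skip it.
        batches.next();
    }
}

In the real readers, `keep` is the region's primary key filter evaluated against the decoded key bytes, and the time spent in each call is added to `prune_pk_cost`.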
#[cfg(test)]
@@ -306,7 +397,7 @@ mod tests {
let mut reader = shard_builder
.read(&mut pk_weights)
.unwrap()
.build(Some(&pk_weights))
.build(Some(&pk_weights), None)
.unwrap();
let mut timestamps = Vec::new();
while reader.is_valid() {

View File

@@ -124,7 +124,7 @@ impl MergeTree {
if !has_pk {
// No primary key.
self.write_no_key(kv);
self.write_no_key(kv)?;
continue;
}
@@ -148,6 +148,54 @@ impl MergeTree {
Ok(())
}
/// Write one key value pair into the tree.
///
/// # Panics
/// Panics if the tree is immutable (frozen).
pub fn write_one(
&self,
kv: KeyValue,
pk_buffer: &mut Vec<u8>,
metrics: &mut WriteMetrics,
) -> Result<()> {
let has_pk = !self.metadata.primary_key.is_empty();
ensure!(
kv.num_primary_keys() == self.row_codec.num_fields(),
PrimaryKeyLengthMismatchSnafu {
expect: self.row_codec.num_fields(),
actual: kv.num_primary_keys(),
}
);
// Safety: timestamp of kv must be both present and a valid timestamp value.
let ts = kv.timestamp().as_timestamp().unwrap().unwrap().value();
metrics.min_ts = metrics.min_ts.min(ts);
metrics.max_ts = metrics.max_ts.max(ts);
metrics.value_bytes += kv.fields().map(|v| v.data_size()).sum::<usize>();
if !has_pk {
// No primary key.
return self.write_no_key(kv);
}
// Encode primary key.
pk_buffer.clear();
if self.is_partitioned {
// Use sparse encoder for metric engine.
self.sparse_encoder
.encode_to_vec(kv.primary_keys(), pk_buffer)?;
} else {
self.row_codec.encode_to_vec(kv.primary_keys(), pk_buffer)?;
}
// Write the row with the encoded primary key.
self.write_with_key(pk_buffer, kv, metrics)?;
metrics.value_bytes += std::mem::size_of::<Timestamp>() + std::mem::size_of::<OpType>();
Ok(())
}
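A hypothetical caller of `write_one` (not code from this change): it reuses one key buffer across the rows of a batch, which is why the method takes `pk_buffer` by mutable reference instead of allocating per row. It assumes the surrounding module's `MergeTree`, `KeyValues`, `WriteMetrics`, and `Result` types.

// Illustrative helper only; `write_rows` does not exist in this change.
fn write_rows(tree: &MergeTree, kvs: &KeyValues, metrics: &mut WriteMetrics) -> Result<()> {
    // One buffer reused for every encoded primary key in the batch.
    let mut pk_buffer = Vec::new();
    for kv in kvs.iter() {
        // `write_one` clears the buffer before encoding the next key.
        tree.write_one(kv, &mut pk_buffer, metrics)?;
    }
    Ok(())
}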
/// Scans the tree.
pub fn read(
&self,
@@ -299,7 +347,7 @@ impl MergeTree {
)
}
fn write_no_key(&self, key_value: KeyValue) {
fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
let partition_key = Partition::get_partition_key(&key_value, self.is_partitioned);
let partition = self.get_or_create_partition(partition_key);

View File

@@ -0,0 +1,551 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Partitions memtables by time.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::Duration;
use common_telemetry::debug;
use common_time::timestamp::TimeUnit;
use common_time::timestamp_millis::BucketAligned;
use common_time::Timestamp;
use smallvec::{smallvec, SmallVec};
use snafu::OptionExt;
use store_api::metadata::RegionMetadataRef;
use crate::error::{InvalidRequestSnafu, Result};
use crate::memtable::key_values::KeyValue;
use crate::memtable::version::SmallMemtableVec;
use crate::memtable::{KeyValues, MemtableBuilderRef, MemtableId, MemtableRef};
/// A partition holds rows with timestamps between `[min, max)`.
#[derive(Debug, Clone)]
pub struct TimePartition {
/// Memtable of the partition.
memtable: MemtableRef,
/// Time range of the partition. `None` means there is no time range. The time
/// range is `None` if and only if the [TimePartitions::part_duration] is `None`.
time_range: Option<PartTimeRange>,
}
impl TimePartition {
/// Returns whether the `ts` belongs to the partition.
fn contains_timestamp(&self, ts: Timestamp) -> bool {
let Some(range) = self.time_range else {
return true;
};
range.contains_timestamp(ts)
}
/// Write rows to the part.
fn write(&self, kvs: &KeyValues) -> Result<()> {
self.memtable.write(kvs)
}
}
type PartitionVec = SmallVec<[TimePartition; 2]>;
/// Partitions.
#[derive(Debug)]
pub struct TimePartitions {
/// Mutable data of partitions.
inner: Mutex<PartitionsInner>,
/// Duration of a partition.
///
/// `None` means there is only one partition and the [TimePartition::time_range] is
/// also `None`.
part_duration: Option<Duration>,
/// Metadata of the region.
metadata: RegionMetadataRef,
/// Builder of memtables.
builder: MemtableBuilderRef,
}
pub type TimePartitionsRef = Arc<TimePartitions>;
impl TimePartitions {
/// Returns a new empty partition list with optional duration.
pub fn new(
metadata: RegionMetadataRef,
builder: MemtableBuilderRef,
next_memtable_id: MemtableId,
part_duration: Option<Duration>,
) -> Self {
let mut inner = PartitionsInner::new(next_memtable_id);
if part_duration.is_none() {
// If `part_duration` is None, then we create a partition with `None` time
// range so we will write all rows to that partition.
let memtable = builder.build(inner.alloc_memtable_id(), &metadata);
debug!(
"Creates a time partition for all timestamps, region: {}, memtable_id: {}",
metadata.region_id,
memtable.id(),
);
let part = TimePartition {
memtable,
time_range: None,
};
inner.parts.push(part);
}
Self {
inner: Mutex::new(inner),
part_duration,
metadata,
builder,
}
}
/// Write key values to memtables.
///
/// It creates new partitions if necessary.
pub fn write(&self, kvs: &KeyValues) -> Result<()> {
// Get all parts.
let parts = self.list_partitions();
// Checks whether all rows belong to a single part. Checks in reverse order as we usually
// write to the latest part.
for part in parts.iter().rev() {
let mut all_in_partition = true;
for kv in kvs.iter() {
// Safety: We checked the schema in the write request.
let ts = kv.timestamp().as_timestamp().unwrap().unwrap();
if !part.contains_timestamp(ts) {
all_in_partition = false;
break;
}
}
if !all_in_partition {
continue;
}
// We can write all rows to this part.
return part.write(kvs);
}
// Slow path: We have to split kvs by partitions.
self.write_multi_parts(kvs, &parts)
}
/// Append memtables in partitions to `memtables`.
pub fn list_memtables(&self, memtables: &mut Vec<MemtableRef>) {
let inner = self.inner.lock().unwrap();
memtables.extend(inner.parts.iter().map(|part| part.memtable.clone()));
}
/// Returns the number of partitions.
pub fn num_partitions(&self) -> usize {
let inner = self.inner.lock().unwrap();
inner.parts.len()
}
/// Returns true if all memtables are empty.
pub fn is_empty(&self) -> bool {
let inner = self.inner.lock().unwrap();
inner.parts.iter().all(|part| part.memtable.is_empty())
}
/// Freezes all memtables.
pub fn freeze(&self) -> Result<()> {
let inner = self.inner.lock().unwrap();
for part in &*inner.parts {
part.memtable.freeze()?;
}
Ok(())
}
/// Forks latest partition.
pub fn fork(&self, metadata: &RegionMetadataRef) -> Self {
let mut inner = self.inner.lock().unwrap();
let latest_part = inner
.parts
.iter()
.max_by_key(|part| part.time_range.map(|range| range.min_timestamp))
.cloned();
let Some(old_part) = latest_part else {
return Self::new(
metadata.clone(),
self.builder.clone(),
inner.next_memtable_id,
self.part_duration,
);
};
let memtable = old_part.memtable.fork(inner.alloc_memtable_id(), metadata);
let new_part = TimePartition {
memtable,
time_range: old_part.time_range,
};
Self {
inner: Mutex::new(PartitionsInner::with_partition(
new_part,
inner.next_memtable_id,
)),
part_duration: self.part_duration,
metadata: metadata.clone(),
builder: self.builder.clone(),
}
}
/// Returns partition duration.
pub(crate) fn part_duration(&self) -> Option<Duration> {
self.part_duration
}
/// Returns memory usage.
pub(crate) fn memory_usage(&self) -> usize {
let inner = self.inner.lock().unwrap();
inner
.parts
.iter()
.map(|part| part.memtable.stats().estimated_bytes)
.sum()
}
/// Append memtables in partitions to small vec.
pub(crate) fn list_memtables_to_small_vec(&self, memtables: &mut SmallMemtableVec) {
let inner = self.inner.lock().unwrap();
memtables.extend(inner.parts.iter().map(|part| part.memtable.clone()));
}
/// Returns the next memtable id.
pub(crate) fn next_memtable_id(&self) -> MemtableId {
let inner = self.inner.lock().unwrap();
inner.next_memtable_id
}
/// Returns all partitions.
fn list_partitions(&self) -> PartitionVec {
let inner = self.inner.lock().unwrap();
inner.parts.clone()
}
/// Write to multiple partitions.
fn write_multi_parts(&self, kvs: &KeyValues, parts: &PartitionVec) -> Result<()> {
// If part duration is `None` then there is always one partition and all rows
// will be put in that partition before invoking this method.
debug_assert!(self.part_duration.is_some());
let mut parts_to_write = HashMap::new();
let mut missing_parts = HashMap::new();
for kv in kvs.iter() {
let mut part_found = false;
// Safety: We used the timestamp before.
let ts = kv.timestamp().as_timestamp().unwrap().unwrap();
for part in parts {
if part.contains_timestamp(ts) {
// Safety: Since the part duration is `Some`, all time ranges should be `Some`.
parts_to_write
.entry(part.time_range.unwrap().min_timestamp)
.or_insert_with(|| PartitionToWrite {
partition: part.clone(),
key_values: Vec::new(),
})
.key_values
.push(kv);
part_found = true;
break;
}
}
if !part_found {
// We need to write it to a new part.
// Safety: `new()` ensures the duration is always `Some` if we get to this method.
let part_duration = self.part_duration.unwrap();
let part_start =
partition_start_timestamp(ts, part_duration).with_context(|| {
InvalidRequestSnafu {
region_id: self.metadata.region_id,
reason: format!(
"timestamp {ts:?} and bucket {part_duration:?} are out of range"
),
}
})?;
missing_parts
.entry(part_start)
.or_insert_with(Vec::new)
.push(kv);
}
}
// Writes rows to existing parts.
for part_to_write in parts_to_write.into_values() {
for kv in part_to_write.key_values {
part_to_write.partition.memtable.write_one(kv)?;
}
}
let part_duration = self.part_duration.unwrap();
// Creates new parts and writes to them. Acquires the lock to avoid other writers creating
// the same partition.
let mut inner = self.inner.lock().unwrap();
for (part_start, key_values) in missing_parts {
let part_pos = match inner
.parts
.iter()
.position(|part| part.time_range.unwrap().min_timestamp == part_start)
{
Some(pos) => pos,
None => {
let range = PartTimeRange::from_start_duration(part_start, part_duration)
.with_context(|| InvalidRequestSnafu {
region_id: self.metadata.region_id,
reason: format!(
"Partition time range for {part_start:?} is out of bound, bucket size: {part_duration:?}",
),
})?;
let memtable = self
.builder
.build(inner.alloc_memtable_id(), &self.metadata);
debug!(
"Create time partition {:?} for region {}, duration: {:?}, memtable_id: {}, parts_total: {}",
range,
self.metadata.region_id,
part_duration,
memtable.id(),
inner.parts.len() + 1
);
let pos = inner.parts.len();
inner.parts.push(TimePartition {
memtable,
time_range: Some(range),
});
pos
}
};
let memtable = &inner.parts[part_pos].memtable;
for kv in key_values {
memtable.write_one(kv)?;
}
}
Ok(())
}
}
/// Computes the start timestamp of the partition for `ts`.
///
/// It always uses the bucket size in seconds, which should fit all timestamp resolutions.
fn partition_start_timestamp(ts: Timestamp, bucket: Duration) -> Option<Timestamp> {
// Safety: We convert it to seconds so it never returns `None`.
let ts_sec = ts.convert_to(TimeUnit::Second).unwrap();
let bucket_sec: i64 = bucket.as_secs().try_into().ok()?;
let start_sec = ts_sec.align_by_bucket(bucket_sec)?;
start_sec.convert_to(ts.unit())
}
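For example, with a 5 s bucket a 7000 ms timestamp converts to 7 s, aligns down to 5 s, and converts back to a 5000 ms partition start, so the row lands in the `[5000, 10000)` partition. A small test-style sketch of that arithmetic (it could live in the tests module below; the function names are the ones defined above):

#[test]
fn partition_start_examples() {
    let bucket = Duration::from_secs(5);
    // 7000 ms aligns down to the 5 s boundary => partition start 5000 ms.
    assert_eq!(
        Some(Timestamp::new_millisecond(5000)),
        partition_start_timestamp(Timestamp::new_millisecond(7000), bucket)
    );
    // Everything in [0 ms, 5000 ms) keeps the 0 ms start.
    assert_eq!(
        Some(Timestamp::new_millisecond(0)),
        partition_start_timestamp(Timestamp::new_millisecond(2000), bucket)
    );
}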
#[derive(Debug)]
struct PartitionsInner {
/// All partitions.
parts: PartitionVec,
/// Next memtable id.
next_memtable_id: MemtableId,
}
impl PartitionsInner {
fn new(next_memtable_id: MemtableId) -> Self {
Self {
parts: Default::default(),
next_memtable_id,
}
}
fn with_partition(part: TimePartition, next_memtable_id: MemtableId) -> Self {
Self {
parts: smallvec![part],
next_memtable_id,
}
}
fn alloc_memtable_id(&mut self) -> MemtableId {
let id = self.next_memtable_id;
self.next_memtable_id += 1;
id
}
}
/// Time range of a partition.
#[derive(Debug, Clone, Copy)]
struct PartTimeRange {
/// Inclusive min timestamp of rows in the partition.
min_timestamp: Timestamp,
/// Exclusive max timestamp of rows in the partition.
max_timestamp: Timestamp,
}
impl PartTimeRange {
fn from_start_duration(start: Timestamp, duration: Duration) -> Option<Self> {
let start_sec = start.convert_to(TimeUnit::Second)?;
let end_sec = start_sec.add_duration(duration).ok()?;
let min_timestamp = start_sec.convert_to(start.unit())?;
let max_timestamp = end_sec.convert_to(start.unit())?;
Some(Self {
min_timestamp,
max_timestamp,
})
}
/// Returns whether the `ts` belongs to the partition.
fn contains_timestamp(&self, ts: Timestamp) -> bool {
self.min_timestamp <= ts && ts < self.max_timestamp
}
}
struct PartitionToWrite<'a> {
partition: TimePartition,
key_values: Vec<KeyValue<'a>>,
}
#[cfg(test)]
mod tests {
use super::*;
use crate::memtable::merge_tree::MergeTreeMemtableBuilder;
use crate::test_util::memtable_util::{self, collect_iter_timestamps};
#[test]
fn test_no_duration() {
let metadata = memtable_util::metadata_for_test();
let builder = Arc::new(MergeTreeMemtableBuilder::default());
let partitions = TimePartitions::new(metadata.clone(), builder, 0, None);
assert_eq!(1, partitions.num_partitions());
assert!(partitions.is_empty());
let kvs = memtable_util::build_key_values(
&metadata,
"hello".to_string(),
0,
&[1000, 3000, 7000, 5000, 6000],
0, // sequence 0, 1, 2, 3, 4
);
partitions.write(&kvs).unwrap();
assert_eq!(1, partitions.num_partitions());
assert!(!partitions.is_empty());
let mut memtables = Vec::new();
partitions.list_memtables(&mut memtables);
let iter = memtables[0].iter(None, None).unwrap();
let timestamps = collect_iter_timestamps(iter);
assert_eq!(&[1000, 3000, 5000, 6000, 7000], &timestamps[..]);
}
#[test]
fn test_write_single_part() {
let metadata = memtable_util::metadata_for_test();
let builder = Arc::new(MergeTreeMemtableBuilder::default());
let partitions =
TimePartitions::new(metadata.clone(), builder, 0, Some(Duration::from_secs(10)));
assert_eq!(0, partitions.num_partitions());
let kvs = memtable_util::build_key_values(
&metadata,
"hello".to_string(),
0,
&[5000, 2000, 0],
0, // sequence 0, 1, 2
);
// It should create a new partition.
partitions.write(&kvs).unwrap();
assert_eq!(1, partitions.num_partitions());
assert!(!partitions.is_empty());
let kvs = memtable_util::build_key_values(
&metadata,
"hello".to_string(),
0,
&[3000, 7000, 4000],
3, // sequence 3, 4, 5
);
// Still writes to the same partition.
partitions.write(&kvs).unwrap();
assert_eq!(1, partitions.num_partitions());
let mut memtables = Vec::new();
partitions.list_memtables(&mut memtables);
let iter = memtables[0].iter(None, None).unwrap();
let timestamps = collect_iter_timestamps(iter);
assert_eq!(&[0, 2000, 3000, 4000, 5000, 7000], &timestamps[..]);
let parts = partitions.list_partitions();
assert_eq!(
Timestamp::new_millisecond(0),
parts[0].time_range.unwrap().min_timestamp
);
assert_eq!(
Timestamp::new_millisecond(10000),
parts[0].time_range.unwrap().max_timestamp
);
}
#[test]
fn test_write_multi_parts() {
let metadata = memtable_util::metadata_for_test();
let builder = Arc::new(MergeTreeMemtableBuilder::default());
let partitions =
TimePartitions::new(metadata.clone(), builder, 0, Some(Duration::from_secs(5)));
assert_eq!(0, partitions.num_partitions());
let kvs = memtable_util::build_key_values(
&metadata,
"hello".to_string(),
0,
&[2000, 0],
0, // sequence 0, 1
);
// It should create a new partition.
partitions.write(&kvs).unwrap();
assert_eq!(1, partitions.num_partitions());
assert!(!partitions.is_empty());
let kvs = memtable_util::build_key_values(
&metadata,
"hello".to_string(),
0,
&[3000, 7000, 4000, 5000],
2, // sequence 2, 3, 4, 5
);
// Writes 2 rows to the old partition and 2 rows to a new partition.
partitions.write(&kvs).unwrap();
assert_eq!(2, partitions.num_partitions());
let parts = partitions.list_partitions();
let iter = parts[0].memtable.iter(None, None).unwrap();
let timestamps = collect_iter_timestamps(iter);
assert_eq!(
Timestamp::new_millisecond(0),
parts[0].time_range.unwrap().min_timestamp
);
assert_eq!(
Timestamp::new_millisecond(5000),
parts[0].time_range.unwrap().max_timestamp
);
assert_eq!(&[0, 2000, 3000, 4000], &timestamps[..]);
let iter = parts[1].memtable.iter(None, None).unwrap();
let timestamps = collect_iter_timestamps(iter);
assert_eq!(&[5000, 7000], &timestamps[..]);
assert_eq!(
Timestamp::new_millisecond(5000),
parts[1].time_range.unwrap().min_timestamp
);
assert_eq!(
Timestamp::new_millisecond(10000),
parts[1].time_range.unwrap().max_timestamp
);
}
}

View File

@@ -38,6 +38,7 @@ use table::predicate::Predicate;
use crate::error::{ComputeArrowSnafu, ConvertVectorSnafu, PrimaryKeyLengthMismatchSnafu, Result};
use crate::flush::WriteBufferManagerRef;
use crate::memtable::key_values::KeyValue;
use crate::memtable::{
AllocTracker, BoxedBatchIterator, KeyValues, Memtable, MemtableBuilder, MemtableId,
MemtableRef, MemtableStats,
@@ -110,49 +111,75 @@ impl TimeSeriesMemtable {
}
/// Updates memtable stats.
fn update_stats(&self, request_size: usize, min: i64, max: i64) {
self.alloc_tracker.on_allocation(request_size);
fn update_stats(&self, stats: LocalStats) {
self.alloc_tracker.on_allocation(stats.allocated);
loop {
let current_min = self.min_timestamp.load(Ordering::Relaxed);
if min >= current_min {
if stats.min_ts >= current_min {
break;
}
let Err(updated) = self.min_timestamp.compare_exchange(
current_min,
min,
stats.min_ts,
Ordering::Relaxed,
Ordering::Relaxed,
) else {
break;
};
if updated == min {
if updated == stats.min_ts {
break;
}
}
loop {
let current_max = self.max_timestamp.load(Ordering::Relaxed);
if max <= current_max {
if stats.max_ts <= current_max {
break;
}
let Err(updated) = self.max_timestamp.compare_exchange(
current_max,
max,
stats.max_ts,
Ordering::Relaxed,
Ordering::Relaxed,
) else {
break;
};
if updated == max {
if updated == stats.max_ts {
break;
}
}
}
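The two loops above are the usual compare-exchange pattern for a lock-free running min/max: read the current value, bail out if the candidate would not improve it, otherwise try to install the candidate and retry if another writer won the race. A standalone sketch of the max half (the min half only flips the comparison); on current Rust, `AtomicI64::fetch_max` would achieve the same effect in one call.

use std::sync::atomic::{AtomicI64, Ordering};

/// Raises `cell` to at least `candidate` without taking a lock.
/// A sketch of the compare-exchange loop above, not its exact control flow.
fn update_max(cell: &AtomicI64, candidate: i64) {
    loop {
        let current = cell.load(Ordering::Relaxed);
        if candidate <= current {
            return;
        }
        // Another writer may have raced us; loop and compare against the new value.
        if cell
            .compare_exchange(current, candidate, Ordering::Relaxed, Ordering::Relaxed)
            .is_ok()
        {
            return;
        }
    }
}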
fn write_key_value(&self, kv: KeyValue, stats: &mut LocalStats) -> Result<()> {
ensure!(
kv.num_primary_keys() == self.row_codec.num_fields(),
PrimaryKeyLengthMismatchSnafu {
expect: self.row_codec.num_fields(),
actual: kv.num_primary_keys()
}
);
let primary_key_encoded = self.row_codec.encode(kv.primary_keys())?;
let fields = kv.fields().collect::<Vec<_>>();
stats.allocated += fields.iter().map(|v| v.data_size()).sum::<usize>();
let (series, series_allocated) = self.series_set.get_or_add_series(primary_key_encoded);
stats.allocated += series_allocated;
// safety: timestamp of kv must be both present and a valid timestamp value.
let ts = kv.timestamp().as_timestamp().unwrap().unwrap().value();
stats.min_ts = stats.min_ts.min(ts);
stats.max_ts = stats.max_ts.max(ts);
let mut guard = series.write().unwrap();
guard.push(kv.timestamp(), kv.sequence(), kv.op_type(), fields);
Ok(())
}
}
impl Debug for TimeSeriesMemtable {
@@ -167,43 +194,30 @@ impl Memtable for TimeSeriesMemtable {
}
fn write(&self, kvs: &KeyValues) -> Result<()> {
let mut allocated = 0;
let mut min_ts = i64::MAX;
let mut max_ts = i64::MIN;
let mut local_stats = LocalStats::default();
for kv in kvs.iter() {
ensure!(
kv.num_primary_keys() == self.row_codec.num_fields(),
PrimaryKeyLengthMismatchSnafu {
expect: self.row_codec.num_fields(),
actual: kv.num_primary_keys()
}
);
let primary_key_encoded = self.row_codec.encode(kv.primary_keys())?;
let fields = kv.fields().collect::<Vec<_>>();
allocated += fields.iter().map(|v| v.data_size()).sum::<usize>();
let (series, series_allocated) = self.series_set.get_or_add_series(primary_key_encoded);
allocated += series_allocated;
// safety: timestamp of kv must be both present and a valid timestamp value.
let ts = kv.timestamp().as_timestamp().unwrap().unwrap().value();
min_ts = min_ts.min(ts);
max_ts = max_ts.max(ts);
let mut guard = series.write().unwrap();
guard.push(kv.timestamp(), kv.sequence(), kv.op_type(), fields);
self.write_key_value(kv, &mut local_stats)?;
}
allocated += kvs.num_rows() * std::mem::size_of::<Timestamp>();
allocated += kvs.num_rows() * std::mem::size_of::<OpType>();
local_stats.allocated += kvs.num_rows() * std::mem::size_of::<Timestamp>();
local_stats.allocated += kvs.num_rows() * std::mem::size_of::<OpType>();
// TODO(hl): this may be inaccurate since the for-iteration may return early.
// We may lift the primary key length check out of Memtable::write
// so that we can ensure writing to memtable will succeed.
self.update_stats(allocated, min_ts, max_ts);
self.update_stats(local_stats);
Ok(())
}
fn write_one(&self, key_value: KeyValue) -> Result<()> {
let mut local_stats = LocalStats::default();
let res = self.write_key_value(key_value, &mut local_stats);
local_stats.allocated += std::mem::size_of::<Timestamp>() + std::mem::size_of::<OpType>();
self.update_stats(local_stats);
res
}
fn iter(
&self,
projection: Option<&[ColumnId]>,
@@ -267,6 +281,22 @@ impl Memtable for TimeSeriesMemtable {
}
}
struct LocalStats {
allocated: usize,
min_ts: i64,
max_ts: i64,
}
impl Default for LocalStats {
fn default() -> Self {
LocalStats {
allocated: 0,
min_ts: i64::MAX,
max_ts: i64::MIN,
}
}
}
type SeriesRwLockMap = RwLock<BTreeMap<Vec<u8>, Arc<RwLock<Series>>>>;
struct SeriesSet {

View File

@@ -20,26 +20,29 @@ use smallvec::SmallVec;
use store_api::metadata::RegionMetadataRef;
use crate::error::Result;
use crate::memtable::time_partition::TimePartitionsRef;
use crate::memtable::{MemtableId, MemtableRef};
pub(crate) type SmallMemtableVec = SmallVec<[MemtableRef; 2]>;
/// A version of current memtables in a region.
#[derive(Debug, Clone)]
pub(crate) struct MemtableVersion {
/// Mutable memtable.
pub(crate) mutable: MemtableRef,
pub(crate) mutable: TimePartitionsRef,
/// Immutable memtables.
///
/// We only allow one flush job per region, but if a flush job fails, we
/// might need to store more than one immutable memtable the next time we
/// flush the region.
immutables: SmallVec<[MemtableRef; 2]>,
immutables: SmallMemtableVec,
}
pub(crate) type MemtableVersionRef = Arc<MemtableVersion>;
impl MemtableVersion {
/// Returns a new [MemtableVersion] with specific mutable memtable.
pub(crate) fn new(mutable: MemtableRef) -> MemtableVersion {
pub(crate) fn new(mutable: TimePartitionsRef) -> MemtableVersion {
MemtableVersion {
mutable,
immutables: SmallVec::new(),
@@ -53,8 +56,8 @@ impl MemtableVersion {
/// Lists mutable and immutable memtables.
pub(crate) fn list_memtables(&self) -> Vec<MemtableRef> {
let mut mems = Vec::with_capacity(self.immutables.len() + 1);
mems.push(self.mutable.clone());
let mut mems = Vec::with_capacity(self.immutables.len() + self.mutable.num_partitions());
self.mutable.list_memtables(&mut mems);
mems.extend_from_slice(&self.immutables);
mems
}
@@ -76,15 +79,13 @@ impl MemtableVersion {
// soft limit.
self.mutable.freeze()?;
// Fork the memtable.
let mutable = self.mutable.fork(self.next_memtable_id(), metadata);
let mutable = Arc::new(self.mutable.fork(metadata));
// Pushes the mutable memtable to immutable list.
let immutables = self
.immutables
.iter()
.cloned()
.chain([self.mutable.clone()])
.collect();
let mut immutables =
SmallVec::with_capacity(self.immutables.len() + self.mutable.num_partitions());
self.mutable.list_memtables_to_small_vec(&mut immutables);
immutables.extend(self.immutables.iter().cloned());
Ok(Some(MemtableVersion {
mutable,
immutables,
@@ -103,7 +104,7 @@ impl MemtableVersion {
/// Returns the memory usage of the mutable memtable.
pub(crate) fn mutable_usage(&self) -> usize {
self.mutable.stats().estimated_bytes
self.mutable.memory_usage()
}
/// Returns the memory usage of the immutable memtables.
@@ -121,9 +122,4 @@ impl MemtableVersion {
pub(crate) fn is_empty(&self) -> bool {
self.mutable.is_empty() && self.immutables.is_empty()
}
/// Returns the next memtable id.
pub(crate) fn next_memtable_id(&self) -> MemtableId {
self.mutable.id() + 1
}
}

View File

@@ -37,6 +37,7 @@ use crate::error::{
};
use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
use crate::manifest::storage::manifest_compress_type;
use crate::memtable::time_partition::TimePartitions;
use crate::memtable::MemtableBuilderRef;
use crate::region::options::RegionOptions;
use crate::region::version::{VersionBuilder, VersionControl, VersionControlRef};
@@ -169,7 +170,15 @@ impl RegionOpener {
RegionManifestManager::new(metadata.clone(), region_manifest_options).await?;
// Initial memtable id is 0.
let mutable = self.memtable_builder.build(0, &metadata);
let part_duration = options.compaction.time_window();
let mutable = Arc::new(TimePartitions::new(
metadata.clone(),
self.memtable_builder,
0,
part_duration,
));
debug!("Create region {} with options: {:?}", region_id, options);
let version = VersionBuilder::new(metadata, mutable)
.options(options)
@@ -249,6 +258,9 @@ impl RegionOpener {
let region_id = self.region_id;
let object_store = self.object_store(&region_options.storage)?.clone();
debug!("Open region {} with options: {:?}", region_id, self.options);
let access_layer = Arc::new(AccessLayer::new(
self.region_dir.clone(),
object_store,
@@ -260,7 +272,13 @@ impl RegionOpener {
self.cache_manager.clone(),
));
// Initial memtable id is 0.
let mutable = self.memtable_builder.build(0, &metadata);
let part_duration = region_options.compaction.time_window();
let mutable = Arc::new(TimePartitions::new(
metadata.clone(),
self.memtable_builder.clone(),
0,
part_duration,
));
let version = VersionBuilder::new(metadata, mutable)
.add_files(file_purger.clone(), manifest.files.values().cloned())
.flushed_entry_id(manifest.flushed_entry_id)

View File

@@ -13,6 +13,8 @@
// limitations under the License.
//! Options for a region.
//!
//! If we add options in this mod, we also need to modify [store_api::mito_engine_options].
use std::collections::HashMap;
use std::time::Duration;
@@ -92,6 +94,14 @@ pub enum CompactionOptions {
Twcs(TwcsOptions),
}
impl CompactionOptions {
pub(crate) fn time_window(&self) -> Option<Duration> {
match self {
CompactionOptions::Twcs(opts) => opts.time_window,
}
}
}
impl Default for CompactionOptions {
fn default() -> Self {
Self::Twcs(TwcsOptions::default())
@@ -358,6 +368,7 @@ mod tests {
("compaction.type", "twcs"),
("storage", "S3"),
("index.inverted_index.ignore_column_ids", "1,2,3"),
("index.inverted_index.segment_row_count", "512"),
(
WAL_OPTIONS_KEY,
&serde_json::to_string(&wal_options).unwrap(),
@@ -376,7 +387,7 @@ mod tests {
index_options: IndexOptions {
inverted_index: InvertedIndexOptions {
ignore_column_ids: vec![1, 2, 3],
segment_row_count: 1024,
segment_row_count: 512,
},
},
};

View File

@@ -31,8 +31,9 @@ use store_api::storage::SequenceNumber;
use crate::error::Result;
use crate::manifest::action::RegionEdit;
use crate::memtable::time_partition::{TimePartitions, TimePartitionsRef};
use crate::memtable::version::{MemtableVersion, MemtableVersionRef};
use crate::memtable::{MemtableBuilderRef, MemtableId, MemtableRef};
use crate::memtable::{MemtableBuilderRef, MemtableId};
use crate::region::options::RegionOptions;
use crate::sst::file::FileMeta;
use crate::sst::file_purger::FilePurgerRef;
@@ -122,8 +123,14 @@ impl VersionControl {
/// Mark all opened files as deleted and set the delete marker in [VersionControlData]
pub(crate) fn mark_dropped(&self, memtable_builder: &MemtableBuilderRef) {
let version = self.current().version;
let new_mutable =
memtable_builder.build(version.memtables.next_memtable_id(), &version.metadata);
let part_duration = version.memtables.mutable.part_duration();
let next_memtable_id = version.memtables.mutable.next_memtable_id();
let new_mutable = Arc::new(TimePartitions::new(
version.metadata.clone(),
memtable_builder.clone(),
next_memtable_id,
part_duration,
));
let mut data = self.data.write().unwrap();
data.is_dropped = true;
@@ -140,7 +147,14 @@ impl VersionControl {
/// new schema. Memtables of the version must be empty.
pub(crate) fn alter_schema(&self, metadata: RegionMetadataRef, builder: &MemtableBuilderRef) {
let version = self.current().version;
let new_mutable = builder.build(version.memtables.next_memtable_id(), &metadata);
let part_duration = version.memtables.mutable.part_duration();
let next_memtable_id = version.memtables.mutable.next_memtable_id();
let new_mutable = Arc::new(TimePartitions::new(
metadata.clone(),
builder.clone(),
next_memtable_id,
part_duration,
));
debug_assert!(version.memtables.mutable.is_empty());
debug_assert!(version.memtables.immutables().is_empty());
let new_version = Arc::new(
@@ -163,8 +177,14 @@ impl VersionControl {
) {
let version = self.current().version;
let new_mutable =
memtable_builder.build(version.memtables.next_memtable_id(), &version.metadata);
let part_duration = version.memtables.mutable.part_duration();
let next_memtable_id = version.memtables.mutable.next_memtable_id();
let new_mutable = Arc::new(TimePartitions::new(
version.metadata.clone(),
memtable_builder.clone(),
next_memtable_id,
part_duration,
));
let new_version = Arc::new(
VersionBuilder::new(version.metadata.clone(), new_mutable)
.flushed_entry_id(truncated_entry_id)
@@ -242,7 +262,7 @@ pub(crate) struct VersionBuilder {
impl VersionBuilder {
/// Returns a new builder.
pub(crate) fn new(metadata: RegionMetadataRef, mutable: MemtableRef) -> Self {
pub(crate) fn new(metadata: RegionMetadataRef, mutable: TimePartitionsRef) -> Self {
VersionBuilder {
metadata,
memtables: Arc::new(MemtableVersion::new(mutable)),

View File

@@ -215,6 +215,61 @@ impl SortField {
Decimal128, Decimal128
)
}
/// Skips deserializing this field and returns its serialized length in bytes.
fn skip_deserialize(
&self,
bytes: &[u8],
deserializer: &mut Deserializer<&[u8]>,
) -> Result<usize> {
let pos = deserializer.position();
if bytes[pos] == 0 {
deserializer.advance(1);
return Ok(1);
}
let to_skip = match &self.data_type {
ConcreteDataType::Boolean(_) => 2,
ConcreteDataType::Int8(_) | ConcreteDataType::UInt8(_) => 2,
ConcreteDataType::Int16(_) | ConcreteDataType::UInt16(_) => 3,
ConcreteDataType::Int32(_) | ConcreteDataType::UInt32(_) => 5,
ConcreteDataType::Int64(_) | ConcreteDataType::UInt64(_) => 9,
ConcreteDataType::Float32(_) => 5,
ConcreteDataType::Float64(_) => 9,
ConcreteDataType::Binary(_) => {
// Currently the encoder encodes binary as a list of bytes, so we can't use
// `skip_bytes`.
let pos_before = deserializer.position();
let mut current = pos_before + 1;
while bytes[current] == 1 {
current += 2;
}
let to_skip = current - pos_before + 1;
deserializer.advance(to_skip);
return Ok(to_skip);
}
ConcreteDataType::String(_) => {
let pos_before = deserializer.position();
deserializer.advance(1);
deserializer
.skip_bytes()
.context(error::DeserializeFieldSnafu)?;
return Ok(deserializer.position() - pos_before);
}
ConcreteDataType::Date(_) => 5,
ConcreteDataType::DateTime(_) => 9,
ConcreteDataType::Timestamp(_) => 9, // We treat timestamp as Option<i64>
ConcreteDataType::Time(_) => 10, // i64 and 1 byte time unit
ConcreteDataType::Duration(_) => 10,
ConcreteDataType::Interval(_) => 18,
ConcreteDataType::Decimal128(_) => 19,
ConcreteDataType::Null(_)
| ConcreteDataType::List(_)
| ConcreteDataType::Dictionary(_) => 0,
};
deserializer.advance(to_skip);
Ok(to_skip)
}
}
/// A memory-comparable row [Value] encoder/decoder.
@@ -236,6 +291,52 @@ impl McmpRowCodec {
pub fn estimated_size(&self) -> usize {
self.fields.iter().map(|f| f.estimated_size()).sum()
}
/// Decodes the value at `pos` in `bytes`.
///
/// The i-th element in the offsets buffer is how many bytes to skip before reading the value at index `i`.
pub fn decode_value_at(
&self,
bytes: &[u8],
pos: usize,
offsets_buf: &mut Vec<usize>,
) -> Result<Value> {
let mut deserializer = Deserializer::new(bytes);
if pos < offsets_buf.len() {
// We computed the offset before.
let to_skip = offsets_buf[pos];
deserializer.advance(to_skip);
return self.fields[pos].deserialize(&mut deserializer);
}
if offsets_buf.is_empty() {
let mut offset = 0;
// Skip values before `pos`.
for i in 0..pos {
// Offset to skip before reading value i.
offsets_buf.push(offset);
let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
offset += skip;
}
// Offset to skip before reading this value.
offsets_buf.push(offset);
} else {
// Offsets are not enough.
let value_start = offsets_buf.len() - 1;
// Advances to decode value at `value_start`.
let mut offset = offsets_buf[value_start];
deserializer.advance(offset);
for i in value_start..pos {
// Skip value i.
let skip = self.fields[i].skip_deserialize(bytes, &mut deserializer)?;
// Offset for the value at i + 1.
offset += skip;
offsets_buf.push(offset);
}
}
self.fields[pos].deserialize(&mut deserializer)
}
}
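A test-style sketch of how the offsets cache behaves for a two-field key, assuming the `McmpRowCodec::new`/`SortField::new` constructors and the `Value::as_value_ref` helper used elsewhere in this crate. A non-null `Int64` occupies 9 bytes (1 null-flag byte plus 8 data bytes), so after the first point read the cache holds `[0, 9]`; it would belong in the tests module below.

#[test]
fn decode_value_at_reuses_offsets() {
    let codec = McmpRowCodec::new(vec![
        SortField::new(ConcreteDataType::int64_datatype()),
        SortField::new(ConcreteDataType::string_datatype()),
    ]);
    let row = [Value::Int64(42), Value::String("tag".into())];
    let bytes = codec.encode(row.iter().map(|v| v.as_value_ref())).unwrap();

    let mut offsets = Vec::new();
    // The first point read walks the prefix once and caches offsets = [0, 9].
    assert_eq!(row[1], codec.decode_value_at(&bytes, 1, &mut offsets).unwrap());
    assert_eq!(vec![0usize, 9], offsets);
    // Later reads jump straight to the cached offset without re-parsing field 0.
    assert_eq!(row[0], codec.decode_value_at(&bytes, 0, &mut offsets).unwrap());
}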
impl RowCodec for McmpRowCodec {
@@ -274,7 +375,7 @@ impl RowCodec for McmpRowCodec {
#[cfg(test)]
mod tests {
use common_base::bytes::StringBytes;
use common_time::Timestamp;
use common_time::{DateTime, Timestamp};
use datatypes::value::Value;
use super::*;
@@ -292,6 +393,18 @@ mod tests {
let result = encoder.encode(value_ref.iter().cloned()).unwrap();
let decoded = encoder.decode(&result).unwrap();
assert_eq!(decoded, row);
let mut decoded = Vec::new();
let mut offsets = Vec::new();
// Iter two times to test offsets buffer.
for _ in 0..2 {
decoded.clear();
for i in 0..data_types.len() {
let value = encoder.decode_value_at(&result, i, &mut offsets).unwrap();
decoded.push(value);
}
assert_eq!(data_types.len(), offsets.len(), "offsets: {:?}", offsets);
assert_eq!(decoded, row);
}
}
#[test]
@@ -416,5 +529,53 @@ mod tests {
],
vec![Value::Null, Value::Int64(43), Value::Boolean(true)],
);
// All types.
check_encode_and_decode(
&[
ConcreteDataType::boolean_datatype(),
ConcreteDataType::int8_datatype(),
ConcreteDataType::uint8_datatype(),
ConcreteDataType::int16_datatype(),
ConcreteDataType::uint16_datatype(),
ConcreteDataType::int32_datatype(),
ConcreteDataType::uint32_datatype(),
ConcreteDataType::int64_datatype(),
ConcreteDataType::uint64_datatype(),
ConcreteDataType::float32_datatype(),
ConcreteDataType::float64_datatype(),
ConcreteDataType::binary_datatype(),
ConcreteDataType::string_datatype(),
ConcreteDataType::date_datatype(),
ConcreteDataType::datetime_datatype(),
ConcreteDataType::timestamp_millisecond_datatype(),
ConcreteDataType::time_millisecond_datatype(),
ConcreteDataType::duration_millisecond_datatype(),
ConcreteDataType::interval_month_day_nano_datatype(),
ConcreteDataType::decimal128_default_datatype(),
],
vec![
Value::Boolean(true),
Value::Int8(8),
Value::UInt8(8),
Value::Int16(16),
Value::UInt16(16),
Value::Int32(32),
Value::UInt32(32),
Value::Int64(64),
Value::UInt64(64),
Value::Float32(1.0.into()),
Value::Float64(1.0.into()),
Value::Binary(b"hello"[..].into()),
Value::String("world".into()),
Value::Date(Date::new(10)),
Value::DateTime(DateTime::new(11)),
Value::Timestamp(Timestamp::new_millisecond(12)),
Value::Time(Time::new_millisecond(13)),
Value::Duration(Duration::new_millisecond(14)),
Value::Interval(Interval::from_month_day_nano(1, 1, 15)),
Value::Decimal128(Decimal128::from(16)),
],
);
}
}

View File

@@ -21,7 +21,9 @@ use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use datatypes::arrow::array::UInt64Array;
use datatypes::data_type::ConcreteDataType;
use datatypes::scalars::ScalarVector;
use datatypes::schema::ColumnSchema;
use datatypes::vectors::TimestampMillisecondVector;
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder, RegionMetadataRef};
use store_api::storage::{ColumnId, RegionId, SequenceNumber};
use table::predicate::Predicate;
@@ -58,6 +60,10 @@ impl Memtable for EmptyMemtable {
Ok(())
}
fn write_one(&self, _key_value: KeyValue) -> Result<()> {
Ok(())
}
fn iter(
&self,
_projection: Option<&[ColumnId]>,
@@ -219,25 +225,14 @@ pub(crate) fn extract_data_batch(batch: &DataBatch) -> (u16, Vec<(i64, u64)>) {
/// Builds key values with timestamps (ms) and sequences for test.
pub(crate) fn build_key_values_with_ts_seq_values(
schema: &RegionMetadataRef,
metadata: &RegionMetadataRef,
k0: String,
k1: u32,
timestamps: impl Iterator<Item = i64>,
values: impl Iterator<Item = Option<f64>>,
sequence: SequenceNumber,
) -> KeyValues {
let column_schema = schema
.column_metadatas
.iter()
.map(|c| api::v1::ColumnSchema {
column_name: c.column_schema.name.clone(),
datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
.unwrap()
.datatype() as i32,
semantic_type: c.semantic_type as i32,
..Default::default()
})
.collect();
let column_schema = region_metadata_to_row_schema(metadata);
let rows = timestamps
.zip(values)
@@ -269,7 +264,23 @@ pub(crate) fn build_key_values_with_ts_seq_values(
rows,
}),
};
KeyValues::new(schema.as_ref(), mutation).unwrap()
KeyValues::new(metadata.as_ref(), mutation).unwrap()
}
/// Converts the region metadata to column schemas for a row.
pub fn region_metadata_to_row_schema(metadata: &RegionMetadataRef) -> Vec<api::v1::ColumnSchema> {
metadata
.column_metadatas
.iter()
.map(|c| api::v1::ColumnSchema {
column_name: c.column_schema.name.clone(),
datatype: ColumnDataTypeWrapper::try_from(c.column_schema.data_type.clone())
.unwrap()
.datatype() as i32,
semantic_type: c.semantic_type as i32,
..Default::default()
})
.collect()
}
/// Encode keys.
@@ -298,3 +309,20 @@ pub(crate) fn encode_key_by_kv(key_value: &KeyValue) -> Vec<u8> {
]);
row_codec.encode(key_value.primary_keys()).unwrap()
}
/// Collects timestamps from the batch iter.
pub(crate) fn collect_iter_timestamps(iter: BoxedBatchIterator) -> Vec<i64> {
iter.flat_map(|batch| {
batch
.unwrap()
.timestamps()
.as_any()
.downcast_ref::<TimestampMillisecondVector>()
.unwrap()
.iter_data()
.collect::<Vec<_>>()
.into_iter()
})
.map(|v| v.unwrap().0.value())
.collect()
}

View File

@@ -25,7 +25,7 @@ use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}
use store_api::storage::RegionId;
use crate::manifest::action::RegionEdit;
use crate::memtable::MemtableBuilder;
use crate::memtable::time_partition::TimePartitions;
use crate::region::version::{Version, VersionBuilder, VersionControl};
use crate::sst::file::{FileId, FileMeta};
use crate::sst::file_purger::FilePurgerRef;
@@ -101,7 +101,12 @@ impl VersionControlBuilder {
pub(crate) fn build_version(&self) -> Version {
let metadata = Arc::new(self.metadata.clone());
let mutable = self.memtable_builder.build(0, &metadata);
let mutable = Arc::new(TimePartitions::new(
metadata.clone(),
self.memtable_builder.clone(),
0,
None,
));
VersionBuilder::new(metadata, mutable)
.add_files(self.file_purger.clone(), self.files.values().cloned())
.build()

View File

@@ -18,7 +18,7 @@ futures.workspace = true
lazy_static.workspace = true
md5 = "0.7"
moka = { workspace = true, features = ["future"] }
opendal = { version = "0.44", features = [
opendal = { version = "0.45", features = [
"layers-tracing",
] }
prometheus.workspace = true

View File

@@ -91,7 +91,8 @@ impl Deleter {
.await?;
let affected_rows = self.do_request(deletes, &ctx).await?;
Ok(Output::AffectedRows(affected_rows as _))
Ok(Output::new_with_affected_rows(affected_rows))
}
pub async fn handle_table_delete(

View File

@@ -111,7 +111,7 @@ impl Inserter {
.await?;
let affected_rows = self.do_request(inserts, &ctx).await?;
Ok(Output::AffectedRows(affected_rows as _))
Ok(Output::new_with_affected_rows(affected_rows))
}
/// Handle row inserts request with metric engine.
@@ -149,7 +149,7 @@ impl Inserter {
.await?;
let affected_rows = self.do_request(inserts, &ctx).await?;
Ok(Output::AffectedRows(affected_rows as _))
Ok(Output::new_with_affected_rows(affected_rows))
}
pub async fn handle_table_insert(
@@ -185,7 +185,7 @@ impl Inserter {
.await?;
let affected_rows = self.do_request(inserts, ctx).await?;
Ok(Output::AffectedRows(affected_rows as _))
Ok(Output::new_with_affected_rows(affected_rows))
}
}
@@ -468,8 +468,6 @@ impl Inserter {
&req.table_name,
);
info!("Logical table `{table_ref}` does not exist, try creating table");
let request_schema = req.rows.as_ref().unwrap().schema.as_slice();
let mut create_table_expr = build_create_table_expr(&table_ref, request_schema)?;

View File

@@ -40,12 +40,13 @@ use query::plan::LogicalPlan;
use query::QueryEngineRef;
use session::context::QueryContextRef;
use session::table_name::table_idents_to_full_name;
use snafu::{OptionExt, ResultExt};
use snafu::{ensure, OptionExt, ResultExt};
use sql::statements::copy::{CopyDatabase, CopyDatabaseArgument, CopyTable, CopyTableArgument};
use sql::statements::set_variables::SetVariables;
use sql::statements::statement::Statement;
use sql::statements::OptionMap;
use sql::util::format_raw_object_name;
use sqlparser::ast::{Expr, ObjectName, Value};
use sqlparser::ast::{Expr, Ident, ObjectName, Value};
use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest};
use table::table_reference::TableReference;
use table::TableRef;
@@ -122,11 +123,11 @@ impl StatementExecutor {
CopyDirection::Export => self
.copy_table_to(req, query_ctx)
.await
.map(Output::AffectedRows),
.map(Output::new_with_affected_rows),
CopyDirection::Import => self
.copy_table_from(req, query_ctx)
.await
.map(Output::AffectedRows),
.map(Output::new_with_affected_rows),
}
}
@@ -151,15 +152,15 @@ impl StatementExecutor {
Statement::CreateTable(stmt) => {
let _ = self.create_table(stmt, query_ctx).await?;
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
}
Statement::CreateTableLike(stmt) => {
let _ = self.create_table_like(stmt, query_ctx).await?;
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
}
Statement::CreateExternalTable(stmt) => {
let _ = self.create_external_table(stmt, query_ctx).await?;
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
}
Statement::Alter(alter_table) => self.alter_table(alter_table, query_ctx).await,
Statement::DropTable(stmt) => {
@@ -207,6 +208,22 @@ impl StatementExecutor {
let var_name = set_var.variable.to_string().to_uppercase();
match var_name.as_str() {
"TIMEZONE" | "TIME_ZONE" => set_timezone(set_var.value, query_ctx)?,
// Some PostgreSQL client apps may submit a "SET bytea_output" stmt upon connection.
// However, we currently lack support for it (tracked in https://github.com/GreptimeTeam/greptimedb/issues/3438),
// so we just ignore it here instead of returning an error and breaking the connection.
// Since the "bytea_output" only determines the output format of binary values,
// it won't cause much trouble if we do so.
// TODO(#3438): Remove this temporary workaround after the feature is implemented.
"BYTEA_OUTPUT" => (),
// Same as "bytea_output", we just ignore it here.
// Not harmful since it only relates to how date is viewed in client app's output.
// The tracked issue is https://github.com/GreptimeTeam/greptimedb/issues/3442.
// TODO(#3442): Remove this temporary workaround after the feature is implemented.
"DATESTYLE" => (),
"CLIENT_ENCODING" => validate_client_encoding(set_var)?,
_ => {
return NotSupportedSnafu {
feat: format!("Unsupported set variable {}", var_name),
@@ -214,7 +231,7 @@ impl StatementExecutor {
.fail()
}
}
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
}
Statement::ShowVariables(show_variable) => self.show_variable(show_variable, query_ctx),
}
@@ -257,6 +274,39 @@ impl StatementExecutor {
}
}
fn validate_client_encoding(set: SetVariables) -> Result<()> {
let Some((encoding, [])) = set.value.split_first() else {
return InvalidSqlSnafu {
err_msg: "must provide one and only one client encoding value",
}
.fail();
};
let encoding = match encoding {
Expr::Value(Value::SingleQuotedString(x))
| Expr::Identifier(Ident {
value: x,
quote_style: _,
}) => x.to_uppercase(),
_ => {
return InvalidSqlSnafu {
err_msg: format!("client encoding must be a string, actual: {:?}", encoding),
}
.fail();
}
};
// For the sake of simplicity, we only support "UTF8" ("UNICODE" is the alias for it,
// see https://www.postgresql.org/docs/current/multibyte.html#MULTIBYTE-CHARSET-SUPPORTED).
// "UTF8" is universal and sufficient for almost all cases.
// GreptimeDB itself is always using "UTF8" as the internal encoding.
ensure!(
encoding == "UTF8" || encoding == "UNICODE",
NotSupportedSnafu {
feat: format!("client encoding of '{}'", encoding)
}
);
Ok(())
}
fn set_timezone(exprs: Vec<Expr>, ctx: QueryContextRef) -> Result<()> {
let tz_expr = exprs.first().context(NotSupportedSnafu {
feat: "No timezone find in set variable statement",

View File

@@ -15,10 +15,10 @@
use std::path::Path;
use std::str::FromStr;
use client::Output;
use common_datasource::file_format::Format;
use common_datasource::lister::{Lister, Source};
use common_datasource::object_store::build_backend;
use common_query::Output;
use common_telemetry::{debug, error, info, tracing};
use object_store::Entry;
use regex::Regex;
@@ -96,7 +96,7 @@ impl StatementExecutor {
.await?;
exported_rows += exported;
}
Ok(Output::AffectedRows(exported_rows))
Ok(Output::new_with_affected_rows(exported_rows))
}
/// Imports data to database from a given location and returns total rows imported.
@@ -169,7 +169,7 @@ impl StatementExecutor {
}
}
}
Ok(Output::AffectedRows(rows_inserted))
Ok(Output::new_with_affected_rows(rows_inserted))
}
}

View File

@@ -14,6 +14,7 @@
use std::sync::Arc;
use client::OutputData;
use common_base::readable_size::ReadableSize;
use common_datasource::file_format::csv::stream_to_csv;
use common_datasource::file_format::json::stream_to_json;
@@ -21,7 +22,6 @@ use common_datasource::file_format::parquet::stream_to_parquet;
use common_datasource::file_format::Format;
use common_datasource::object_store::{build_backend, parse_url};
use common_datasource::util::find_dir_and_filename;
use common_query::Output;
use common_recordbatch::adapter::DfRecordBatchStreamAdapter;
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::{debug, tracing};
@@ -134,9 +134,9 @@ impl StatementExecutor {
.execute(LogicalPlan::DfPlan(plan), query_ctx)
.await
.context(ExecLogicalPlanSnafu)?;
let stream = match output {
Output::Stream(stream, _) => stream,
Output::RecordBatches(record_batches) => record_batches.as_stream(),
let stream = match output.data {
OutputData::Stream(stream) => stream,
OutputData::RecordBatches(record_batches) => record_batches.as_stream(),
_ => unreachable!(),
};

View File

@@ -338,10 +338,10 @@ impl StatementExecutor {
.await
.context(error::InvalidateTableCacheSnafu)?;
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
} else if drop_if_exists {
// DROP TABLE IF EXISTS meets table not found - ignored
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
} else {
Err(TableNotFoundSnafu {
table_name: table_name.to_string(),
@@ -367,7 +367,7 @@ impl StatementExecutor {
let table_id = table.table_info().table_id();
self.truncate_table_procedure(&table_name, table_id).await?;
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
}
fn verify_alter(
@@ -471,7 +471,7 @@ impl StatementExecutor {
.await
.context(error::InvalidateTableCacheSnafu)?;
Ok(Output::AffectedRows(0))
Ok(Output::new_with_affected_rows(0))
}
async fn create_table_procedure(
@@ -580,7 +580,7 @@ impl StatementExecutor {
if exists {
return if create_if_not_exists {
Ok(Output::AffectedRows(1))
Ok(Output::new_with_affected_rows(1))
} else {
error::SchemaExistsSnafu { name: database }.fail()
};
@@ -592,7 +592,7 @@ impl StatementExecutor {
.await
.context(TableMetadataManagerSnafu)?;
Ok(Output::AffectedRows(1))
Ok(Output::new_with_affected_rows(1))
}
}

View File

@@ -429,7 +429,7 @@ mod test {
ts_range,
value_range,
timestamps,
// that two `2.0` is because `duration_to_start` are shrunk to to
// that two `2.0` is because `duration_to_start` are shrunk to
// `duration_to_zero`, and causes `duration_to_zero` less than
// `extrapolation_threshold`.
vec![2.0, 1.5, 1.5, 1.5, 2.0, 1.5, 1.5, 1.5],

View File

@@ -28,7 +28,7 @@ use common_function::function::FunctionRef;
use common_function::scalars::aggregate::AggregateFunctionMetaRef;
use common_query::physical_plan::{DfPhysicalPlanAdapter, PhysicalPlan, PhysicalPlanAdapter};
use common_query::prelude::ScalarUdf;
use common_query::Output;
use common_query::{Output, OutputData, OutputMeta};
use common_recordbatch::adapter::RecordBatchStreamAdapter;
use common_recordbatch::{EmptyRecordBatchStream, SendableRecordBatchStream};
use common_telemetry::tracing;
@@ -90,9 +90,9 @@ impl DatafusionQueryEngine {
optimized_physical_plan
};
Ok(Output::Stream(
self.execute_stream(&ctx, &physical_plan)?,
Some(physical_plan),
Ok(Output::new(
OutputData::Stream(self.execute_stream(&ctx, &physical_plan)?),
OutputMeta::new_with_plan(physical_plan),
))
}
@@ -121,9 +121,9 @@ impl DatafusionQueryEngine {
let output = self
.exec_query_plan(LogicalPlan::DfPlan((*dml.input).clone()), query_ctx.clone())
.await?;
let mut stream = match output {
Output::RecordBatches(batches) => batches.as_stream(),
Output::Stream(stream, _) => stream,
let mut stream = match output.data {
OutputData::RecordBatches(batches) => batches.as_stream(),
OutputData::Stream(stream) => stream,
_ => unreachable!(),
};
@@ -148,7 +148,7 @@ impl DatafusionQueryEngine {
};
affected_rows += rows;
}
Ok(Output::AffectedRows(affected_rows))
Ok(Output::new_with_affected_rows(affected_rows))
}
#[tracing::instrument(skip_all)]
@@ -471,7 +471,6 @@ mod tests {
use catalog::RegisterTableRequest;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, NUMBERS_TABLE_ID};
use common_query::Output;
use common_recordbatch::util;
use datafusion::prelude::{col, lit};
use datatypes::prelude::ConcreteDataType;
@@ -534,8 +533,8 @@ mod tests {
let output = engine.execute(plan, QueryContext::arc()).await.unwrap();
match output {
Output::Stream(recordbatch, _) => {
match output.data {
OutputData::Stream(recordbatch) => {
let numbers = util::collect(recordbatch).await.unwrap();
assert_eq!(1, numbers.len());
assert_eq!(numbers[0].num_columns(), 1);

View File

@@ -15,7 +15,8 @@
use std::pin::Pin;
use std::task::{Context, Poll};
use common_recordbatch::{RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use common_recordbatch::adapter::RecordBatchMetrics;
use common_recordbatch::{OrderOption, RecordBatch, RecordBatchStream, SendableRecordBatchStream};
use datatypes::schema::SchemaRef;
use futures::Stream;
use futures_util::ready;
@@ -78,6 +79,14 @@ impl<F: FnOnce() + Unpin> RecordBatchStream for OnDone<F> {
fn schema(&self) -> SchemaRef {
self.stream.schema()
}
fn output_ordering(&self) -> Option<&[OrderOption]> {
self.stream.output_ordering()
}
fn metrics(&self) -> Option<RecordBatchMetrics> {
self.stream.metrics()
}
}
impl<F: FnOnce() + Unpin> Stream for OnDone<F> {

View File

@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::any::Any;
use std::cmp::Ordering;
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, HashMap};
use std::fmt::Display;
use std::pin::Pin;
use std::sync::Arc;
@@ -21,8 +23,8 @@ use std::task::{Context, Poll};
use std::time::Duration;
use ahash::RandomState;
use arrow::compute::{self, cast_with_options, CastOptions};
use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit};
use arrow::compute::{self, cast_with_options, CastOptions, SortColumn};
use arrow_schema::{DataType, Field, Schema, SchemaRef, SortOptions, TimeUnit};
use common_query::DfPhysicalPlan;
use common_recordbatch::DfSendableRecordBatchStream;
use datafusion::common::{Result as DataFusionResult, Statistics};
@@ -35,10 +37,14 @@ use datafusion::physical_plan::{
SendableRecordBatchStream,
};
use datafusion::physical_planner::create_physical_sort_expr;
use datafusion_common::utils::get_arrayref_at_indices;
use datafusion_common::utils::{get_arrayref_at_indices, get_row_at_idx};
use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue};
use datafusion_expr::utils::exprlist_to_fields;
use datafusion_expr::{Accumulator, Expr, ExprSchemable, LogicalPlan, UserDefinedLogicalNodeCore};
use datafusion_expr::utils::{exprlist_to_fields, COUNT_STAR_EXPANSION};
use datafusion_expr::{
lit, Accumulator, AggregateFunction, Expr, ExprSchemable, LogicalPlan,
UserDefinedLogicalNodeCore,
};
use datafusion_physical_expr::aggregate::utils::down_cast_any_ref;
use datafusion_physical_expr::expressions::create_aggregate_expr as create_aggr_expr;
use datafusion_physical_expr::hash_utils::create_hashes;
use datafusion_physical_expr::{
@@ -58,6 +64,140 @@ use crate::error::{DataFusionSnafu, RangeQuerySnafu, Result};
type Millisecond = <TimestampMillisecondType as ArrowPrimitiveType>::Native;
/// Implementation of the `first_value`/`last_value`
/// aggregate functions adapted to range queries
#[derive(Debug)]
struct RangeFirstListValue {
/// The expression to evaluate.
expr: Arc<dyn PhysicalExpr>,
order_bys: Vec<PhysicalSortExpr>,
}
impl RangeFirstListValue {
pub fn new_aggregate_expr(
expr: Arc<dyn PhysicalExpr>,
order_bys: Vec<PhysicalSortExpr>,
) -> Arc<dyn AggregateExpr> {
Arc::new(Self { expr, order_bys })
}
}
impl PartialEq<dyn Any> for RangeFirstListValue {
fn eq(&self, other: &dyn Any) -> bool {
down_cast_any_ref(other)
.downcast_ref::<Self>()
.map(|x| self.expr.eq(&x.expr) && self.order_bys.iter().eq(x.order_bys.iter()))
.unwrap_or(false)
}
}
impl AggregateExpr for RangeFirstListValue {
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn create_accumulator(&self) -> DataFusionResult<Box<dyn Accumulator>> {
Ok(Box::new(RangeFirstListValueAcc::new(
self.order_bys.iter().map(|order| order.options).collect(),
)))
}
fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
let mut exprs: Vec<_> = self
.order_bys
.iter()
.map(|order| order.expr.clone())
.collect();
exprs.push(self.expr.clone());
exprs
}
fn field(&self) -> DataFusionResult<Field> {
unreachable!("AggregateExpr::field will not be used in range query")
}
fn state_fields(&self) -> DataFusionResult<Vec<Field>> {
unreachable!("AggregateExpr::state_fields will not be used in range query")
}
}
#[derive(Debug)]
pub struct RangeFirstListValueAcc {
pub sort_options: Vec<SortOptions>,
pub sort_columns: Vec<ScalarValue>,
pub data: Option<ScalarValue>,
}
impl RangeFirstListValueAcc {
pub fn new(sort_options: Vec<SortOptions>) -> Self {
Self {
sort_options,
sort_columns: vec![],
data: None,
}
}
}
impl Accumulator for RangeFirstListValueAcc {
fn update_batch(&mut self, values: &[ArrayRef]) -> DataFusionResult<()> {
let columns: Vec<_> = values
.iter()
.zip(self.sort_options.iter())
.map(|(v, s)| SortColumn {
values: v.clone(),
options: Some(*s),
})
.collect();
// Find the top-1 row in O(n) by sorting with limit 1
let idx = compute::lexsort_to_indices(&columns, Some(1))?.value(0);
let vs = get_row_at_idx(values, idx as usize)?;
let need_update = self.data.is_none()
|| vs
.iter()
.zip(self.sort_columns.iter())
.zip(self.sort_options.iter())
.find_map(|((new_value, old_value), sort_option)| {
if new_value.is_null() && old_value.is_null() {
None
} else if sort_option.nulls_first
&& (new_value.is_null() || old_value.is_null())
{
Some(new_value.is_null())
} else {
new_value.partial_cmp(old_value).map(|x| {
(x == Ordering::Greater && sort_option.descending)
|| (x == Ordering::Less && !sort_option.descending)
})
}
})
.unwrap_or(false);
if need_update {
self.sort_columns = vs;
self.data = Some(ScalarValue::try_from_array(
&values[self.sort_options.len()],
idx as usize,
)?);
}
Ok(())
}
fn evaluate(&self) -> DataFusionResult<ScalarValue> {
Ok(self.data.clone().unwrap_or(ScalarValue::Null))
}
fn size(&self) -> usize {
std::mem::size_of_val(self)
}
fn state(&self) -> DataFusionResult<Vec<ScalarValue>> {
unreachable!("Accumulator::state will not be used in range query")
}
fn merge_batch(&mut self, _states: &[ArrayRef]) -> DataFusionResult<()> {
unreachable!("Accumulator::merge_batch will not be used in range query")
}
}
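
The accumulator above relies on `lexsort_to_indices` with `limit = Some(1)` to pick the best row without a full sort. A minimal standalone sketch of that trick, assuming the `arrow` crate (the column values are illustrative):

use std::sync::Arc;

use arrow::array::{ArrayRef, Float64Array, Int64Array};
use arrow::compute::{lexsort_to_indices, SortColumn};
use arrow_schema::SortOptions;

fn main() {
    // The ORDER BY key (e.g. timestamps) and the values we want the "first" of.
    let ts: ArrayRef = Arc::new(Float64Array::from(vec![3.0, 1.0, 2.0]));
    let vals = Int64Array::from(vec![30, 10, 20]);
    let columns = vec![SortColumn {
        values: ts,
        options: Some(SortOptions { descending: false, nulls_first: false }),
    }];
    // With `limit = Some(1)` only the single best row is materialized,
    // so the cost is O(n) instead of a full O(n log n) sort.
    let idx = lexsort_to_indices(&columns, Some(1)).unwrap().value(0);
    assert_eq!(vals.value(idx as usize), 10); // value at the smallest key
}
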
#[derive(PartialEq, Eq, Debug, Hash, Clone)]
pub enum Fill {
Null,
@@ -78,14 +218,15 @@ impl Display for Fill {
}
impl Fill {
pub fn try_from_str(value: &str, datatype: &DataType) -> DfResult<Self> {
pub fn try_from_str(value: &str, datatype: &DataType) -> DfResult<Option<Self>> {
let s = value.to_uppercase();
match s.as_str() {
"NULL" | "" => Ok(Self::Null),
"PREV" => Ok(Self::Prev),
"" => Ok(None),
"NULL" => Ok(Some(Self::Null)),
"PREV" => Ok(Some(Self::Prev)),
"LINEAR" => {
if datatype.is_numeric() {
Ok(Self::Linear)
Ok(Some(Self::Linear))
} else {
Err(DataFusionError::Plan(format!(
"Use FILL LINEAR on Non-numeric DataType {}",
@@ -100,13 +241,17 @@ impl Fill {
s, err
))
})
.map(Fill::Const),
.map(|x| Some(Fill::Const(x))),
}
}
/// The input `data` contains the data of a complete time series.
/// If the filling strategy is `PREV` or `LINEAR`, the caller must ensure that the incoming `ts` & `data` are in ascending time order.
pub fn apply_fill_strategy(&self, ts: &[i64], data: &mut [ScalarValue]) -> DfResult<()> {
// No calculation is needed for `Fill::Null`
if matches!(self, Fill::Null) {
return Ok(());
}
let len = data.len();
if *self == Fill::Linear {
return Self::fill_linear(ts, data);
@@ -114,7 +259,6 @@ impl Fill {
for i in 0..len {
if data[i].is_null() {
match self {
Fill::Null => continue,
Fill::Prev => {
if i != 0 {
data[i] = data[i - 1].clone()
@@ -122,7 +266,8 @@ impl Fill {
}
// Linear interpolation is relatively complicated;
// `Self::fill_linear` handles the `Fill::Linear` case.
Fill::Linear => unreachable!(),
// No calculation is needed for `Fill::Null`
Fill::Linear | Fill::Null => unreachable!(),
Fill::Const(v) => data[i] = v.clone(),
}
}
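
The `PREV` branch above is essentially a forward carry of the last seen value. A self-contained sketch of that idea on a plain vector, rather than the `ScalarValue`-based slice used in the diff:

// Simplified PREV fill: carry the previous value forward into gaps.
// Assumes the slice is already in ascending time order; `None` marks a gap.
fn fill_prev(data: &mut [Option<f64>]) {
    for i in 1..data.len() {
        if data[i].is_none() {
            data[i] = data[i - 1];
        }
    }
}

fn main() {
    let mut series = vec![Some(1.0), None, None, Some(4.0), None];
    fill_prev(&mut series);
    assert_eq!(
        series,
        vec![Some(1.0), Some(1.0), Some(1.0), Some(4.0), Some(4.0)]
    );
}
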
@@ -219,12 +364,12 @@ fn linear_interpolation(
#[derive(Eq, Clone, Debug)]
pub struct RangeFn {
/// with a format like `max(a) RANGE 300s FILL NULL`
/// with a format like `max(a) RANGE 300s [FILL NULL]`
pub name: String,
pub data_type: DataType,
pub expr: Expr,
pub range: Duration,
pub fill: Fill,
pub fill: Option<Fill>,
/// If the `Fill` strategy is `Linear` and the output is an integer,
/// the interpolation may produce a floating-point number,
/// so for `FILL LINEAR` the entire column is implicitly converted to a Float type
@@ -271,6 +416,7 @@ pub struct RangeSelect {
pub align: Duration,
pub align_to: i64,
pub time_index: String,
pub time_expr: Expr,
pub by: Vec<Expr>,
pub schema: DFSchemaRef,
pub by_schema: DFSchemaRef,
@@ -324,7 +470,7 @@ impl RangeSelect {
name,
data_type.clone(),
// Only when the data is filled with the `Const` option can it be guaranteed non-null
!matches!(fill, Fill::Const(..)),
!matches!(fill, Some(Fill::Const(..))),
))
},
)
@@ -382,6 +528,7 @@ impl RangeSelect {
align,
align_to,
time_index: time_index_name,
time_expr: time_index,
schema,
by_schema,
by,
@@ -440,6 +587,7 @@ impl UserDefinedLogicalNodeCore for RangeSelect {
range_expr: self.range_expr.clone(),
input: Arc::new(inputs[0].clone()),
time_index: self.time_index.clone(),
time_expr: self.time_expr.clone(),
schema: self.schema.clone(),
by: self.by.clone(),
by_schema: self.by_schema.clone(),
@@ -452,6 +600,7 @@ impl UserDefinedLogicalNodeCore for RangeSelect {
impl RangeSelect {
fn create_physical_expr_list(
&self,
is_count_aggr: bool,
exprs: &[Expr],
df_schema: &Arc<DFSchema>,
schema: &Schema,
@@ -459,7 +608,20 @@ impl RangeSelect {
) -> DfResult<Vec<Arc<dyn PhysicalExpr>>> {
exprs
.iter()
.map(|by| create_physical_expr(by, df_schema, schema, session_state.execution_props()))
.map(|e| match e {
// `count(*)` is rewritten by `CountWildcardRule` into `count(1)` when the logical plan is optimized.
// That rewrite happens after the range plan rewrite,
// and by then the aggregate plan has already been replaced by a custom range plan,
// so `CountWildcardRule` is never applied.
// We therefore apply the rewrite manually when creating the physical plan.
Expr::Wildcard if is_count_aggr => create_physical_expr(
&lit(COUNT_STAR_EXPANSION),
df_schema,
schema,
session_state.execution_props(),
),
_ => create_physical_expr(e, df_schema, schema, session_state.execution_props()),
})
.collect::<DfResult<Vec<_>>>()
}
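
To make the wildcard rewrite above concrete: inside a COUNT aggregate, `*` may be replaced by any non-null constant, because COUNT only counts rows, not values. A toy sketch with a hypothetical expression enum (not DataFusion's `Expr`):

// Hypothetical mini expression type, just to illustrate the wildcard rewrite.
#[derive(Debug, Clone, PartialEq)]
enum MiniExpr {
    Wildcard,
    Literal(i64),
    Column(String),
}

// Inside a COUNT aggregate, `*` becomes a non-null constant; everything else is untouched.
fn rewrite_count_arg(e: MiniExpr, is_count_aggr: bool) -> MiniExpr {
    match e {
        MiniExpr::Wildcard if is_count_aggr => MiniExpr::Literal(1),
        other => other,
    }
}

fn main() {
    assert_eq!(rewrite_count_arg(MiniExpr::Wildcard, true), MiniExpr::Literal(1));
    assert_eq!(
        rewrite_count_arg(MiniExpr::Column("host".into()), true),
        MiniExpr::Column("host".into())
    );
}
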
@@ -488,6 +650,72 @@ impl RangeSelect {
.iter()
.map(|range_fn| {
let expr = match &range_fn.expr {
Expr::AggregateFunction(aggr)
if aggr.fun == AggregateFunction::FirstValue
|| aggr.fun == AggregateFunction::LastValue =>
{
// Because we only need the first_value/last_value,
// sorting the entire batch (O(n log n)) is unnecessary:
// we sort the batch with limit 1, which degenerates into
// a top-1 search with O(n) complexity.
// We need to reverse the sort order for last_value so that limit 1 still picks the correct row.
let order_by = if let Some(exprs) = &aggr.order_by {
exprs
.iter()
.map(|x| {
create_physical_sort_expr(
x,
input_dfschema,
&input_schema,
session_state.execution_props(),
)
.map(|expr| {
// reverse the last_value sort
if aggr.fun == AggregateFunction::LastValue {
PhysicalSortExpr {
expr: expr.expr,
options: SortOptions {
descending: !expr.options.descending,
nulls_first: !expr.options.nulls_first,
},
}
} else {
expr
}
})
})
.collect::<DfResult<Vec<_>>>()?
} else {
// If the user does not specify an ORDER BY, the time index is used as the default ordering.
let time_index = create_physical_expr(
&self.time_expr,
input_dfschema,
&input_schema,
session_state.execution_props(),
)?;
vec![PhysicalSortExpr {
expr: time_index,
options: SortOptions {
descending: aggr.fun == AggregateFunction::LastValue,
nulls_first: false,
},
}]
};
let arg = self.create_physical_expr_list(
false,
&aggr.args,
input_dfschema,
&input_schema,
session_state,
)?;
// first_value/last_value takes exactly one parameter,
// which DataFusion has already validated in the logical plan stage,
// so we can safely assume there is only one element here.
Ok(RangeFirstListValue::new_aggregate_expr(
arg[0].clone(),
order_by,
))
}
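
The reversal above relies on a simple duality: the last value under an ordering is the first value under the reversed ordering, so flipping both `descending` and `nulls_first` lets the same limit-1 search serve `last_value`. A minimal sketch, assuming `arrow_schema`'s `SortOptions` as imported in the diff:

use arrow_schema::SortOptions;

// Reverse an ordering so that "last under the original order"
// becomes "first under the reversed order".
fn reverse(opts: SortOptions) -> SortOptions {
    SortOptions {
        descending: !opts.descending,
        nulls_first: !opts.nulls_first,
    }
}

fn main() {
    let asc = SortOptions { descending: false, nulls_first: false };
    let rev = reverse(asc);
    assert!(rev.descending && rev.nulls_first);
}
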
Expr::AggregateFunction(aggr) => {
let order_by = if let Some(exprs) = &aggr.order_by {
exprs
@@ -508,6 +736,7 @@ impl RangeSelect {
&aggr.fun,
false,
&self.create_physical_expr_list(
aggr.fun == AggregateFunction::Count,
&aggr.args,
input_dfschema,
&input_schema,
@@ -523,6 +752,7 @@ impl RangeSelect {
let expr = create_aggr_udf_expr(
&aggr_udf.fun,
&self.create_physical_expr_list(
false,
&aggr_udf.args,
input_dfschema,
&input_schema,
@@ -564,6 +794,7 @@ impl RangeSelect {
align: self.align.as_millis() as Millisecond,
align_to: self.align_to,
by: self.create_physical_expr_list(
false,
&self.by,
input_dfschema,
&input_schema,
@@ -584,10 +815,26 @@ struct RangeFnExec {
pub expr: Arc<dyn AggregateExpr>,
pub args: Vec<Arc<dyn PhysicalExpr>>,
pub range: Millisecond,
pub fill: Fill,
pub fill: Option<Fill>,
pub need_cast: Option<DataType>,
}
impl Display for RangeFnExec {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if let Some(fill) = &self.fill {
write!(
f,
"{} RANGE {}s FILL {}",
self.expr.name(),
self.range / 1000,
fill
)
} else {
write!(f, "{} RANGE {}s", self.expr.name(), self.range / 1000)
}
}
}
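
The formatter above emits `<expr> RANGE <n>s FILL <fill>` when a fill strategy is present and omits the `FILL` clause otherwise. A tiny standalone sketch of the same formatting, with illustrative names only:

fn describe(name: &str, range_s: i64, fill: Option<&str>) -> String {
    match fill {
        Some(f) => format!("{name} RANGE {range_s}s FILL {f}"),
        None => format!("{name} RANGE {range_s}s"),
    }
}

fn main() {
    assert_eq!(describe("MIN(value)", 5, Some("NULL")), "MIN(value) RANGE 5s FILL NULL");
    assert_eq!(describe("MIN(value)", 5, None), "MIN(value) RANGE 5s");
}
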
#[derive(Debug)]
pub struct RangeSelectExec {
input: Arc<dyn ExecutionPlan>,
@@ -608,18 +855,8 @@ impl DisplayAs for RangeSelectExec {
match t {
DisplayFormatType::Default | DisplayFormatType::Verbose => {
write!(f, "RangeSelectExec: ")?;
let range_expr_strs: Vec<String> = self
.range_exec
.iter()
.map(|e| {
format!(
"{} RANGE {}s FILL {}",
e.expr.name(),
e.range / 1000,
e.fill
)
})
.collect();
let range_expr_strs: Vec<String> =
self.range_exec.iter().map(RangeFnExec::to_string).collect();
let by: Vec<String> = self.by.iter().map(|e| e.to_string()).collect();
write!(
f,
@@ -713,7 +950,7 @@ impl ExecutionPlan for RangeSelectExec {
by: self.by.clone(),
series_map: HashMap::new(),
exec_state: ExecutionState::ReadingInput,
output_num_rows: 0,
num_not_null_rows: 0,
row_converter,
modify_map: HashMap::new(),
metric: baseline_metric,
@@ -753,8 +990,8 @@ struct RangeSelectStream {
/// value: `[row_ids]`
/// It records the rows that need to be aggregated into each time slot while the input data is being processed
modify_map: HashMap<(u64, Millisecond), Vec<u32>>,
/// The number of rows of the final output
output_num_rows: usize,
/// The number of not-null rows in the final output
num_not_null_rows: usize,
metric: BaselineMetrics,
schema_project: Option<Vec<usize>>,
schema_before_project: SchemaRef,
@@ -766,7 +1003,7 @@ struct SeriesState {
row: OwnedRow,
/// key: align_ts
/// value: a vector, each element is a range_fn follow the order of `range_exec`
align_ts_accumulator: HashMap<Millisecond, Vec<Box<dyn Accumulator>>>,
align_ts_accumulator: BTreeMap<Millisecond, Vec<Box<dyn Accumulator>>>,
}
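
The switch from `HashMap` to `BTreeMap` above is what makes ordered iteration over `align_ts` and the `first_key_value`/`last_key_value` lookups used later possible. A small sketch of the properties being relied on:

use std::collections::BTreeMap;

fn main() {
    let mut slots: BTreeMap<i64, &str> = BTreeMap::new();
    slots.insert(15_000, "later slot");
    slots.insert(0, "earliest slot");
    // Keys come back sorted, and the boundary slots are direct lookups.
    assert_eq!(*slots.first_key_value().unwrap().0, 0);
    assert_eq!(*slots.last_key_value().unwrap().0, 15_000);
    assert_eq!(slots.keys().copied().collect::<Vec<_>>(), vec![0, 15_000]);
}
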
/// Use `align_to` as time origin.
@@ -882,7 +1119,7 @@ impl RangeSelectStream {
let accumulators_map =
self.series_map.entry(*hash).or_insert_with(|| SeriesState {
row: by_rows.row(*row as usize).owned(),
align_ts_accumulator: HashMap::new(),
align_ts_accumulator: BTreeMap::new(),
});
match accumulators_map.align_ts_accumulator.entry(*ts) {
Entry::Occupied(mut e) => {
@@ -890,7 +1127,7 @@ impl RangeSelectStream {
accumulators[i].update_batch(&sliced_arrays)
}
Entry::Vacant(e) => {
self.output_num_rows += 1;
self.num_not_null_rows += 1;
let mut accumulators = self
.range_exec
.iter()
@@ -915,29 +1152,47 @@ impl RangeSelectStream {
// 1 for time index column
let mut columns: Vec<Arc<dyn Array>> =
Vec::with_capacity(1 + self.range_exec.len() + self.by.len());
let mut ts_builder = TimestampMillisecondBuilder::with_capacity(self.output_num_rows);
let mut all_scalar = vec![Vec::with_capacity(self.output_num_rows); self.range_exec.len()];
let mut by_rows = Vec::with_capacity(self.output_num_rows);
let mut ts_builder = TimestampMillisecondBuilder::with_capacity(self.num_not_null_rows);
let mut all_scalar =
vec![Vec::with_capacity(self.num_not_null_rows); self.range_exec.len()];
let mut by_rows = Vec::with_capacity(self.num_not_null_rows);
let mut start_index = 0;
// RangePlan is calculated on a row basis. If a column uses the PREV or LINEAR filling strategy,
// we must arrange the data in the entire data row to determine the NULL filling value.
let need_sort_output = self
// If any range expr needs a fill, we must fill both the missing align_ts slots and the null values.
let need_fill_output = self.range_exec.iter().any(|range| range.fill.is_some());
// The padding value for each accumulator
let padding_values = self
.range_exec
.iter()
.any(|range| range.fill == Fill::Linear || range.fill == Fill::Prev);
.map(|e| e.expr.create_accumulator()?.evaluate())
.collect::<DfResult<Vec<_>>>()?;
for SeriesState {
row,
align_ts_accumulator,
} in self.series_map.values()
{
// collect data on time series
let mut align_ts = align_ts_accumulator.keys().copied().collect::<Vec<_>>();
if need_sort_output {
align_ts.sort();
// skip empty time series
if align_ts_accumulator.is_empty() {
continue;
}
// find the first and last align_ts
let begin_ts = *align_ts_accumulator.first_key_value().unwrap().0;
let end_ts = *align_ts_accumulator.last_key_value().unwrap().0;
let align_ts = if need_fill_output {
// We need to fill the empty align_ts slots that have no data in them
(begin_ts..=end_ts).step_by(self.align as usize).collect()
} else {
align_ts_accumulator.keys().copied().collect::<Vec<_>>()
};
for ts in &align_ts {
for (i, accumulator) in align_ts_accumulator.get(ts).unwrap().iter().enumerate() {
all_scalar[i].push(accumulator.evaluate()?);
if let Some(slot) = align_ts_accumulator.get(ts) {
for (column, acc) in all_scalar.iter_mut().zip(slot.iter()) {
column.push(acc.evaluate()?);
}
} else {
// fill the padding value into the empty time slot
for (column, padding) in all_scalar.iter_mut().zip(padding_values.iter()) {
column.push(padding.clone())
}
}
}
ts_builder.append_slice(&align_ts);
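
When a fill strategy is present, the loop above materializes every align slot between the first and last observed `align_ts` and pushes a padding value for slots with no accumulator. A small sketch of that axis construction; the numbers mirror the 5s test data below:

fn main() {
    let align: i64 = 5_000;
    // align_ts keys actually present for one series (gaps at 5s and 10s).
    let observed = [0_i64, 15_000];
    let (begin, end) = (observed[0], observed[observed.len() - 1]);
    let axis: Vec<i64> = (begin..=end).step_by(align as usize).collect();
    assert_eq!(axis, vec![0, 5_000, 10_000, 15_000]);
}
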
@@ -950,14 +1205,16 @@ impl RangeSelectStream {
) in self.range_exec.iter().enumerate()
{
let time_series_data =
&mut all_scalar[i][start_index..start_index + align_ts_accumulator.len()];
&mut all_scalar[i][start_index..start_index + align_ts.len()];
if let Some(data_type) = need_cast {
cast_scalar_values(time_series_data, data_type)?;
}
fill.apply_fill_strategy(&align_ts, time_series_data)?;
if let Some(fill) = fill {
fill.apply_fill_strategy(&align_ts, time_series_data)?;
}
}
by_rows.resize(by_rows.len() + align_ts_accumulator.len(), row.row());
start_index += align_ts_accumulator.len();
by_rows.resize(by_rows.len() + align_ts.len(), row.row());
start_index += align_ts.len();
}
for column_scalar in all_scalar {
columns.push(ScalarValue::iter_to_array(column_scalar)?);
@@ -1078,7 +1335,7 @@ mod test {
const TIME_INDEX_COLUMN: &str = "timestamp";
fn prepare_test_data(is_float: bool) -> MemoryExec {
fn prepare_test_data(is_float: bool, is_gap: bool) -> MemoryExec {
let schema = Arc::new(Schema::new(vec![
Field::new(TIME_INDEX_COLUMN, TimestampMillisecondType::DATA_TYPE, true),
Field::new(
@@ -1092,16 +1349,23 @@ mod test {
),
Field::new("host", DataType::Utf8, true),
]));
let timestamp_column: Arc<dyn Array> = Arc::new(TimestampMillisecondArray::from(vec![
0, 5_000, 10_000, 15_000, 20_000, // host 1 every 5s
0, 5_000, 10_000, 15_000, 20_000, // host 2 every 5s
])) as _;
let mut host = vec!["host1"; 5];
host.extend(vec!["host2"; 5]);
let value_column: Arc<dyn Array> = if is_float {
Arc::new(nullable_array!(Float64;
0.0, null, 1.0, null, 2.0, // data for host 1
3.0, null, 4.0, null, 5.0 // data for host 2
let timestamp_column: Arc<dyn Array> = if !is_gap {
Arc::new(TimestampMillisecondArray::from(vec![
0, 5_000, 10_000, 15_000, 20_000, // host 1 every 5s
0, 5_000, 10_000, 15_000, 20_000, // host 2 every 5s
])) as _
} else {
Arc::new(TimestampMillisecondArray::from(vec![
0, 15_000, // host 1 every 5s, missing data at 5_000 and 10_000
0, 15_000, // host 2 every 5s, missing data at 5_000 and 10_000
])) as _
};
let mut host = vec!["host1"; timestamp_column.len() / 2];
host.extend(vec!["host2"; timestamp_column.len() / 2]);
let mut value_column: Arc<dyn Array> = if is_gap {
Arc::new(nullable_array!(Int64;
0, 6, // data for host 1
6, 12 // data for host 2
)) as _
} else {
Arc::new(nullable_array!(Int64;
@@ -1109,6 +1373,11 @@ mod test {
3, null, 4, null, 5 // data for host 2
)) as _
};
if is_float {
value_column =
cast_with_options(&value_column, &DataType::Float64, &CastOptions::default())
.unwrap();
}
let host_column: Arc<dyn Array> = Arc::new(StringArray::from(host)) as _;
let data = RecordBatch::try_new(
schema.clone(),
@@ -1123,8 +1392,9 @@ mod test {
range1: Millisecond,
range2: Millisecond,
align: Millisecond,
fill: Fill,
fill: Option<Fill>,
is_float: bool,
is_gap: bool,
expected: String,
) {
let data_type = if is_float {
@@ -1132,13 +1402,13 @@ mod test {
} else {
DataType::Int64
};
let (need_cast, schema_data_type) = if !is_float && fill == Fill::Linear {
let (need_cast, schema_data_type) = if !is_float && matches!(fill, Some(Fill::Linear)) {
// data_type = DataType::Float64;
(Some(DataType::Float64), DataType::Float64)
} else {
(None, data_type.clone())
};
let memory_exec = Arc::new(prepare_test_data(is_float));
let memory_exec = Arc::new(prepare_test_data(is_float, is_gap));
let schema = Arc::new(Schema::new(vec![
Field::new("MIN(value)", schema_data_type.clone(), true),
Field::new("MAX(value)", schema_data_type, true),
@@ -1223,7 +1493,16 @@ mod test {
\n| 3.0 | 3.0 | 1970-01-01T00:00:00 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 10_000, 1_000_000, Fill::Null, true, expected).await;
do_range_select_test(
10_000,
10_000,
1_000_000,
Some(Fill::Null),
true,
false,
expected,
)
.await;
}
#[tokio::test]
@@ -1246,7 +1525,16 @@ mod test {
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Null, true, expected).await;
do_range_select_test(
10_000,
5_000,
5_000,
Some(Fill::Null),
true,
false,
expected,
)
.await;
}
#[tokio::test]
@@ -1269,7 +1557,16 @@ mod test {
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Prev, true, expected).await;
do_range_select_test(
10_000,
5_000,
5_000,
Some(Fill::Prev),
true,
false,
expected,
)
.await;
}
#[tokio::test]
@@ -1292,7 +1589,16 @@ mod test {
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Linear, true, expected).await;
do_range_select_test(
10_000,
5_000,
5_000,
Some(Fill::Linear),
true,
false,
expected,
)
.await;
}
#[tokio::test]
@@ -1315,7 +1621,16 @@ mod test {
\n| 5.0 | 5.0 | 1970-01-01T00:00:20 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(10_000, 5_000, 5_000, Fill::Linear, false, expected).await;
do_range_select_test(
10_000,
5_000,
5_000,
Some(Fill::Linear),
false,
false,
expected,
)
.await;
}
#[tokio::test]
@@ -1342,7 +1657,101 @@ mod test {
10_000,
5_000,
5_000,
Fill::Const(ScalarValue::Float64(Some(6.6))),
Some(Fill::Const(ScalarValue::Float64(Some(6.6)))),
true,
false,
expected,
)
.await;
}
#[tokio::test]
async fn range_fill_gap() {
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(5_000, 5_000, 5_000, None, true, true, expected).await;
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| | | 1970-01-01T00:00:05 | host1 |\
\n| | | 1970-01-01T00:00:10 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
\n| | | 1970-01-01T00:00:05 | host2 |\
\n| | | 1970-01-01T00:00:10 | host2 |\
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(5_000, 5_000, 5_000, Some(Fill::Null), true, true, expected).await;
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 0.0 | 0.0 | 1970-01-01T00:00:05 | host1 |\
\n| 0.0 | 0.0 | 1970-01-01T00:00:10 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:05 | host2 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:10 | host2 |\
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(5_000, 5_000, 5_000, Some(Fill::Prev), true, true, expected).await;
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 2.0 | 2.0 | 1970-01-01T00:00:05 | host1 |\
\n| 4.0 | 4.0 | 1970-01-01T00:00:10 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
\n| 8.0 | 8.0 | 1970-01-01T00:00:05 | host2 |\
\n| 10.0 | 10.0 | 1970-01-01T00:00:10 | host2 |\
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(
5_000,
5_000,
5_000,
Some(Fill::Linear),
true,
true,
expected,
)
.await;
let expected = String::from(
"+------------+------------+---------------------+-------+\
\n| MIN(value) | MAX(value) | timestamp | host |\
\n+------------+------------+---------------------+-------+\
\n| 0.0 | 0.0 | 1970-01-01T00:00:00 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:05 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:10 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:15 | host1 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:00 | host2 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:05 | host2 |\
\n| 6.0 | 6.0 | 1970-01-01T00:00:10 | host2 |\
\n| 12.0 | 12.0 | 1970-01-01T00:00:15 | host2 |\
\n+------------+------------+---------------------+-------+",
);
do_range_select_test(
5_000,
5_000,
5_000,
Some(Fill::Const(ScalarValue::Float64(Some(6.0)))),
true,
true,
expected,
)
@@ -1351,7 +1760,8 @@ mod test {
#[test]
fn fill_test() {
assert!(Fill::try_from_str("Linear", &DataType::UInt8).unwrap() == Fill::Linear);
assert!(Fill::try_from_str("", &DataType::UInt8).unwrap().is_none());
assert!(Fill::try_from_str("Linear", &DataType::UInt8).unwrap() == Some(Fill::Linear));
assert_eq!(
Fill::try_from_str("Linear", &DataType::Boolean)
.unwrap_err()
@@ -1372,7 +1782,7 @@ mod test {
);
assert!(
Fill::try_from_str("8", &DataType::UInt8).unwrap()
== Fill::Const(ScalarValue::UInt8(Some(8)))
== Some(Fill::Const(ScalarValue::UInt8(Some(8))))
);
let mut test1 = vec![
ScalarValue::UInt8(Some(8)),
@@ -1447,4 +1857,44 @@ mod test {
Fill::Linear.apply_fill_strategy(&ts, &mut test1).unwrap();
assert_eq!(test, test1);
}
#[test]
fn test_fist_last_accumulator() {
let mut acc = RangeFirstListValueAcc::new(vec![
SortOptions {
descending: true,
nulls_first: false,
},
SortOptions {
descending: false,
nulls_first: true,
},
]);
let batch1: Vec<Arc<dyn Array>> = vec![
Arc::new(nullable_array!(Float64;
0.0, null, 0.0, null, 1.0
)),
Arc::new(nullable_array!(Float64;
5.0, null, 4.0, null, 3.0
)),
Arc::new(nullable_array!(Int64;
1, 2, 3, 4, 5
)),
];
let batch2: Vec<Arc<dyn Array>> = vec![
Arc::new(nullable_array!(Float64;
3.0, 3.0, 3.0, 3.0, 3.0
)),
Arc::new(nullable_array!(Float64;
null,3.0, 3.0, 3.0, 3.0
)),
Arc::new(nullable_array!(Int64;
6, 7, 8, 9, 10
)),
];
acc.update_batch(&batch1).unwrap();
assert_eq!(acc.evaluate().unwrap(), ScalarValue::Int64(Some(5)));
acc.update_batch(&batch2).unwrap();
assert_eq!(acc.evaluate().unwrap(), ScalarValue::Int64(Some(6)));
}
}
