Compare commits

79 Commits

Author SHA1 Message Date
Discord9
f995204060 test: more reduce tests 2023-09-06 16:38:51 +08:00
Discord9
93561291e4 support more binary functions 2023-09-06 16:38:51 +08:00
Discord9
9f59d68391 eval func 2023-09-06 16:37:49 +08:00
Discord9
51083b12bd reduce_bucketed 2023-09-06 16:37:49 +08:00
Discord9
c80165c377 test: simple render 2023-09-06 16:37:49 +08:00
Discord9
76d8709774 sink&source 2023-09-06 16:37:49 +08:00
Discord9
2cf7d6d569 feat: build_accumulable 2023-09-06 16:37:49 +08:00
Discord9
045c8079e6 feat: flow util func 2023-09-06 16:37:49 +08:00
Discord9
54f2f6495f mfp & reduce partially 2023-09-06 16:37:49 +08:00
Discord9
2798d266f5 feat: render plan partially written 2023-09-06 16:37:49 +08:00
Discord9
824d03a642 working on reduce 2023-09-06 16:36:41 +08:00
Discord9
47f41371d0 Arrangement&types 2023-09-06 16:36:41 +08:00
Discord9
d702b6e5c4 use newer DD 2023-09-06 16:36:41 +08:00
Discord9
13c02f3f92 basic skeleton 2023-09-06 16:36:41 +08:00
Discord9
b52eb2313e renamed as greptime-flow 2023-09-06 16:36:41 +08:00
Discord9
d422bc8401 basic demo 2023-09-06 16:36:41 +08:00
Zou Wei
b8c50d00aa feat: sqlness test for interval type (#2265)
* feat: add integration-test for interval type.

* chore: add two cases.

* chore: cr

* chore: Field to Column
2023-09-04 14:30:48 +08:00
Ruihang Xia
a12ee5cab8 fix: qualify inputs on handling join in promql (#2297)
* add qualifier to join inputs

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add one more case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update test results

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-09-01 11:51:34 +08:00
ZonaHe
a0d15b489a feat: update dashboard to v0.3.2 (#2295)
Co-authored-by: ZonaHex <ZonaHex@users.noreply.github.com>
2023-08-31 22:05:00 +08:00
shuiyisong
baa372520d fix: json compatibility to null (#2287)
* fix: existing null value for schema name value

* chore: fix null check

* fix: change catalognamevalue and schemanamevalue to option

* fix: fix null case
2023-08-31 14:21:10 +08:00
shuiyisong
5df4d44761 feat: schema level opts (#2283)
* chore: update proto

* chore: add try from for schema name value

* chore: merge schema opts to table opts while creating table

* chore: use table ttl opts first

* chore: add unit test

* chore: update proto version
2023-08-30 08:11:08 +00:00
Weny Xu
8e9f2ffce4 fix: skip procedure if target route is not found (#2277)
* fix: skip procedure if target route is not found

* chore: apply suggestions from CR
2023-08-30 06:59:50 +00:00
Weny Xu
1101e7bb18 fix: deregister table after keeper closes table (#2278)
* fix: deregister table after keeper closes table

* chore: apply suggestions from CR
2023-08-30 03:43:04 +00:00
zyy17
5fbc941023 ci: upload the latest artifacts to 'latest/' directory of S3 bucket in scheduled and formal release (#2276)
Signed-off-by: zyy17 <zyylsxm@gmail.com>
2023-08-29 09:00:45 +00:00
Bamboo1
68600a2cf9 feat(mito2): add file purger and cooperate with scheduler to purge sst files (#2251)
* feat: add file purger and use scheduler

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: print some information about handling error message

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve conversion

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: resolve conversation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve conflicting files

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

---------

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>
2023-08-29 07:55:03 +00:00
Yingwen
805f254d15 feat(mito): Flush framework for mito2 (#2262)
* feat: write buffer manager

* feat: skeleton

* feat: add flush logic to write path

* feat: add methods to memtable trait

* feat: freeze memtable

* feat: define flush task

* feat: schedule_flush wip

* feat: adding pending requests/tasks

* feat: separate ddl request and background request

* feat: Remove RegionTask and RequestBody

* feat: handle flush related requests

* feat: make tests pass

* style: fix clippy

* docs: update comment

* refactor: rename background requests

* feat: replace Option<RegionWriteCtx> with an enum MaybeStalling
2023-08-29 07:13:15 +00:00
Zhenchi
2a6c830ca7 refactor(table): remove Table impl for system (#2270)
* refactor(table): remove Table impl for system

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: format & import

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-29 03:43:43 +00:00
Weny Xu
22dea02485 fix: use RegionId region number instead (#2273) 2023-08-29 02:52:24 +00:00
LFC
ef75e8f7c3 feat: create distributed Mito2 table (#2246)
* feat: create distributed Mito2 table

* rebase develop
2023-08-28 12:07:52 +00:00
Weny Xu
71fc3c42d9 fix: open region does not register catalog/schema (#2271)
* fix: open region does not register catalog/schema

* fix: fix ci
2023-08-28 12:06:10 +00:00
JeremyHi
c02ac36ce8 feat: avoid confusion in desc table (#2272)
feat: Field to Column to avoid confusion in DESC TABLE
2023-08-28 11:50:33 +00:00
Lei, HUANG
c112b9a763 feat(mito2): WAL replay (#2264)
* feat: replay memtable when opening table

* test: region replay

* refactor: save logstore in TestEnv

* fix: some cr comments

* chore: rebase develop

* chore: update last entry id during replay
2023-08-28 11:45:23 +00:00
Weny Xu
96fd17aa0a fix: fix typos (#2268) 2023-08-28 09:26:00 +00:00
Ruihang Xia
6b8cf0bbf0 feat: impl region engine for mito (#2269)
* update proto

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* convert request

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update proto

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* import result convertor

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename symbols

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-28 09:24:12 +00:00
Yingwen
e2522dff21 feat(mito): Skeleton for scanning a region (#2230)
* feat: define stream builder

* feat: scan region wip

* feat: create SeqScan in ScanRegion

* feat: scanner

* feat: engine handles scan request

* feat: map projection index to column id

* feat: Impl record batch stream

* refactor: change BatchConverter to ProjectionMapper

* feat: add column_ids to mapper

* feat: implement SeqScan::build()

* chore: fix typo

* docs: add mermaid for ScanRegion

* style: fix clippy

* test: fix record batch test

* fix: update sequence and entry id

* test: test query

* feat: address CR comment

* chore: address CR comments

* chore: Update src/mito2/src/read/scan_region.rs

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>

---------

Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
2023-08-28 06:59:31 +00:00
LFC
d8f851bef2 fix: keep region failover state not changed upon failure (#2261) 2023-08-28 04:40:47 +00:00
JeremyHi
63b22b2403 feat: prometheus row inserter (#2263)
* feat: prometheus row inserter

* chore: add unit test

* refactor: to row_insert_requests

* chore: typo

* chore: alloc row by TableData

* chore: by review comment
2023-08-28 03:22:23 +00:00
Weny Xu
c56f5e39cd refactor: set default metasrv procedure retry times to 12 (#2242) 2023-08-26 07:41:15 +00:00
Weny Xu
7ff200c0fa fix: align region numbers to real regions (#2257) 2023-08-25 11:48:58 +00:00
dennis zhuang
5160838d04 chore: change version to 0.4.0-nightly (#2258)
* chore: change version to 0.4.0-nightly

* fix: test
2023-08-25 09:44:39 +00:00
shuiyisong
f16f58266e refactor: query_ctx from http middleware (#2253)
* chore: change userinfo to query_ctx in http handler

* chore: minor change

* chore: move prometheus http to http mod

* chore: fix unit test

* chore: add back schema check

* chore: minor change

* chore: remove clone
2023-08-25 09:36:33 +00:00
Ruihang Xia
8d446ed741 fix: quote ident on rendered SQL (#2248)
* fix: quote ident on rendered SQL

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* read quote style from query context

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* update sqlness result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-25 07:25:21 +00:00
JeremyHi
de1daec680 feat: upgrade desc table output (#2256) 2023-08-25 06:52:22 +00:00
Zhenchi
9d87c8b6de refactor(table): cleanup dist table (#2255)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-25 06:37:39 +00:00
Lei, HUANG
6bf260a05c chore: write to mito2 (#2250)
* chore: write to mito2

* fix: clippy

* feat: bridge memtable

* chore: rebase develop
2023-08-25 06:18:42 +00:00
WU Jingdi
15912afd96 fix: the inconsistent order of input/output in range select (#2229)
* fix: the inconsistent order of input/output in range select

* chore: apply CR
2023-08-25 04:12:59 +00:00
Lei, HUANG
dbe0e95f2f feat(mito2): concat and projection (#2243)
* refactor: use arrow::compute::concat instead of push values to vector builders

* feat: support projection

* refactor: remove sequence

* refactor: concatenate

* fix: series must not be empty

* refactor: projection
2023-08-25 03:25:27 +00:00
Ruihang Xia
20b7f907b2 fix: promql planner should clear its states on each selector (#2247)
* reset planner status on selector

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add empty line

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* sort result

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* mask fields to keep ordering

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-25 03:07:44 +00:00
Weny Xu
b13d932e4e fix: fix RegionAliveKeeper does not find the table after restarting (#2249) 2023-08-25 03:05:17 +00:00
Bamboo1
48348aa364 fix: fix test_scheduler_continuous_stop in scheduler (#2252)
* fix: fix test_scheduler_continuous_stop in scheduler

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add document annotation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

---------

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>
2023-08-25 02:59:48 +00:00
Zhenchi
9ce73e7ca1 refactor(frontend): TableScan instead of scan_to_stream for COPY TO (#2244)
* refactor(frontend): TableScan instead of `scan_to_stream` for `COPY TO`

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: format

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-24 12:46:54 +00:00
Ruihang Xia
b633a16667 feat: apply rewriter to subquery exprs (#2245)
* apply rewriter to subquery exprs

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* workaround for datafusion's check

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* fix typo

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* change time index type

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-24 11:48:04 +00:00
Zhenchi
0a6ab2a287 refactor(script): not to call scan_to_stream on table (#2241)
* refactor(script): not to call `scan_to_stream` on table

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: build plan via LogicalPlanBuilder

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-24 08:10:07 +00:00
JeremyHi
7746e5b172 feat: dist row inserter (#2231)
* feat: frontend row inserter

* feat: row splitter

chore: row splitter's unit test

* feat: RowDistInserter

* feat: make influxdb line protocol use the row-based protocol

* Update src/partition/src/row_splitter.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

* Update src/frontend/src/instance/distributed/row_inserter.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

* chore: by review comment

* Update src/frontend/src/instance/distributed/row_inserter.rs

Co-authored-by: LFC <bayinamine@gmail.com>

* chore: by comment

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
Co-authored-by: LFC <bayinamine@gmail.com>
2023-08-24 06:58:05 +00:00
Weny Xu
a7e0e2330e fix: invalidate cache after altering (#2239) 2023-08-24 03:56:17 +00:00
Lei, HUANG
19d2d77b41 fix: parse large timestamp (#2185)
* feat: support parsing large timestamp values

* chore: update sqlness tests

* fix: tests

* fix: allow larger window
2023-08-24 03:52:15 +00:00
Yingwen
4ee1034012 feat(mito): merge reader for mito2 (#2210)
* feat: Implement slice and first/last timestamp for Batch

* feat(mito): implements sort/concat for Batch

* chore: fix typo

* chore: remove comments

* feat: sort and dedup

* test: test batch operations

* chore: cast enum to test op type

* test: test filter related api

* style: fix clippy

* feat: implement Node and CompareFirst

* feat: merge reader wip

* feat: merge wip

* feat: use batch's operation to sort and dedup

* feat: implement BatchReader for MergeReader

* feat: simplify codes

* test: test merge reader

* refactor: use test util to create batch

* refactor: remove unused imports

* feat: update comment

* chore: remove metadata() from Source

* chore: update comment

* feat: source supports batch iterator

* chore: update comment
2023-08-24 03:37:51 +00:00
Ruihang Xia
e5ba3d1708 feat: rewrite the dist analyzer (#2238)
* it works!

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* clean up

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add documents

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* remove unstable timestamp from sqlness test

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* rename rewriter struct

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-24 03:29:08 +00:00
dennis zhuang
8b1f4eb958 feat: types sqlness tests (#2073)
* feat: timestamp types sqlness tests

* feat: adds timestamp tests

* test: add string tests

* test: comment a case in timestamp

* test: add float type tests

* chore: adds TODO

* feat: set TZ=UTC for sqlness test
2023-08-24 03:26:19 +00:00
discord9
eca7e87129 chore: try from value (#2236)
* chore: try from value

* chore: add TryFromValueError variant
2023-08-24 02:44:13 +00:00
Weny Xu
beb92ba1d2 refactor: use table id instead of table ident (#2233) 2023-08-23 13:28:08 +00:00
Lei, HUANG
fdb5ad23bf refactor: use Batch::sort_and_dedup instead of Values::sort_in_place (#2235) 2023-08-23 08:56:49 +00:00
Ruihang Xia
d581688fd2 fix: dist planner has wrong behavior in table with multiple partitions (#2237)
* fix: dist planner has wrong behavior in table with multiple partitions

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* Update tests/cases/distributed/explain/multi_partitions.sql

Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Co-authored-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-23 08:32:20 +00:00
Bamboo1
4dbc32f532 refactor: remove associate type in scheduler to simplify it #2153 (#2194)
* feature: add a simple scheduler using flume

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: only use a single sender rather than cloning many senders

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: use select to avoid loop

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: add parameters to the new function to configure the flume capacity and the number of receivers

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* test: add countdownlatch test concurrency

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* test: replace countdownlatch with a barrier to test concurrency, and wait for all tasks to finish in stop

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add some document annotation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add license header

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add Cargo.lock

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: Cargo.toml format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: delete println in test

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: add error handle

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: fix error handle and add test scheduler stop

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: spelling mistake

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: wait for all tasks to finish

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: add todo which need wrap Future returned by send_async

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* test: remove unnecessary sleep in test

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve some conflicts

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* fix: resolve conversation

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* chore: code format

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

* feat: modify the function of schedule to synchronize and drop sender after stopping scheduler

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>

---------

Signed-off-by: ZhuZiyi <zyzhu2001@gmail.com>
2023-08-23 06:28:00 +00:00
Zhenchi
af95e46512 refactor(table): eliminate calls to DistTable.delete (#2225)
* refactor(table): eliminate calls to DistTable.delete

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: format

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: clippy

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-23 02:33:48 +00:00
Weny Xu
d81ddd8879 chore: fix clippy (#2232) 2023-08-23 02:24:29 +00:00
Ning Sun
88247e4284 fix!: resolve residual issues with removing prometheus port (#2227)
* fix: resolve residual issues when removing prometheus port

* fix: remove prometheus from sample config as well
2023-08-23 01:49:11 +00:00
Ruihang Xia
18250c4803 feat: implement Flight and gRPC services for RegionServer (#2226)
* extract FlightCraft trait

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* split service handler in GrpcServer

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* left grpc server implement

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* start region server if configured

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2023-08-22 13:30:09 +00:00
dennis zhuang
18fa0e01ed feat: remove checkpoint_on_startup (#2228)
feat: update flushed manifest version when it is larger
2023-08-22 13:09:34 +00:00
Yingwen
cc3e198975 feat(mito): Implement operations like concat and sort for Batch (#2203)
* feat: Implement slice and first/last timestamp for Batch

* feat(mito): implements sort/concat for Batch

* chore: fix typo

* chore: remove comments

* feat: sort and dedup

* test: test batch operations

* chore: cast enum to test op type

* test: test filter related api

* style: fix clippy

* docs: comment for slice

* chore: address CR comment

Don't return Option in get_timestamp()/get_sequence()
2023-08-22 12:03:02 +00:00
Yingwen
cd3755c615 feat(mito): Support handling RegionWriteRequest (#2218)
* feat: convert region request to worker write request

* chore: remove unused codes

* test: fix tests compiler errors

* chore: remove create/close/open request from worker requests

* chore: add comment

* chore: fix typo
2023-08-22 11:16:00 +00:00
Lei, HUANG
be1e13c713 feat(mito2): time series memtable (#2208)
* feat: time series memtable

* feat: add some test

* fix: some clippy warnings

* chore: some rustdoc

* refactor: test

* fix: remove useless functions

* feat: add config for TimeSeriesMemtable

* chore: some optimize

* refactor: remove bucketing

* refactor: avoid cloning RegionMetadataRef across all Series; make initial_builder_capacity a const; sort batch only by timestamp and sequence
2023-08-22 08:40:46 +00:00
Zhenchi
cb3561f3b3 refactor(table): eliminate calls to DistTable.insert (#2219)
Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2023-08-22 06:15:02 +00:00
Niwaka
b3b43fe1c3 fix: table options can't be found in distributed mode (#2209)
* fix: table options can't be found in distributed mode

* refactor: use iterator for regions_numbers

* chore: remove TODO
2023-08-22 03:53:56 +00:00
WU Jingdi
b411769de6 feat: Implement a basic range select query (#2138)
* feat: Implement a basic range select query

* chore: support any timestamp type & CR fix
2023-08-22 03:07:14 +00:00
niebayes
e5f4ca2dab feat: streaming do_get (#2171)
* feat: rewrite do_get for streaming get flight data

* feat: rewrite do_get call stack but leave the async stream adapter not modified yet

* feat: rewrite the async stream adapter to accept greptime record batch stream

* fix: resolve some PR comments

* feat: rewrite tests to adapt to the streaming do_get

* feat: add unit tests for streaming do_get

* feat: rewrite timer metric of merge scan

* remove unhelpful unit tests for streaming do_get

* add a new metric timer for merge scan and fix some test errors

* rewrite mysql writer to write query results in a streaming manner

* fix: fix fmt errors

* fix: rewrite sqlness runner to take into account the streaming do_get

* fix: fix toml format errors

* fix: resolve some PR comments

* fix: resolve some PR comments

* fix: refactor do_get to increase readability

* fix: refactor mysql try_write_one to increase readability
2023-08-22 02:54:05 +00:00
Weny Xu
5b7b2cf77d fix: fix ddl client can not update leader addr (#2205)
* fix: fix ddl client can not update leader addr

* chore: apply suggestions from CR

* feat: add message to context

* fix: only retry if unavailable or deadline exceeded

* chore: apply suggestions from CR
2023-08-21 13:57:29 +00:00
shuiyisong
9352649f22 chore: add table region key to delete in upgrade tool (#2214) 2023-08-21 08:16:10 +00:00
shuiyisong
c5f507c20e fix: add user_info extension to prom_store handler (#2212)
chore: add user_info extension to prom_store auth
2023-08-21 04:55:34 +00:00
314 changed files with 21044 additions and 3121 deletions

View File

@@ -32,6 +32,10 @@ inputs:
description: Upload to S3
required: false
default: 'true'
upload-latest-artifacts:
description: Upload the latest artifacts to S3
required: false
default: 'true'
working-dir:
description: Working directory to build the artifacts
required: false
@@ -59,4 +63,5 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}

View File

@@ -33,6 +33,10 @@ inputs:
description: Upload to S3
required: false
default: 'true'
upload-latest-artifacts:
description: Upload the latest artifacts to S3
required: false
default: 'true'
working-dir:
description: Working directory to build the artifacts
required: false
@@ -69,6 +73,7 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}
- name: Build greptime without pyo3
@@ -85,6 +90,7 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}
- name: Clean up the target directory # Clean up the target directory for the centos7 base image, or it will still use the objects of last build.
@@ -106,4 +112,5 @@ runs:
aws-secret-access-key: ${{ inputs.aws-secret-access-key }}
aws-region: ${{ inputs.aws-region }}
upload-to-s3: ${{ inputs.upload-to-s3 }}
upload-latest-artifacts: ${{ inputs.upload-latest-artifacts }}
working-dir: ${{ inputs.working-dir }}

View File

@@ -26,6 +26,18 @@ inputs:
description: Upload to S3
required: false
default: 'true'
upload-latest-artifacts:
description: Upload the latest artifacts to S3
required: false
default: 'true'
upload-max-retry-times:
description: Max retry times for uploading artifacts to S3
required: false
default: "20"
upload-retry-timeout:
description: Timeout for uploading artifacts to S3
required: false
default: "10" # minutes
working-dir:
description: Working directory to upload the artifacts
required: false
@@ -74,8 +86,8 @@ runs:
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key }}
AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
with:
max_attempts: 20
timeout_minutes: 5
max_attempts: ${{ inputs.upload-max-retry-times }}
timeout_minutes: ${{ inputs.upload-retry-timeout }}
# The bucket layout will be:
# releases/greptimedb
# ├── v0.1.0
@@ -92,3 +104,22 @@ runs:
aws s3 cp \
${{ inputs.artifacts-dir }}.sha256sum \
s3://${{ inputs.release-to-s3-bucket }}/releases/greptimedb/${{ inputs.version }}/${{ inputs.artifacts-dir }}.sha256sum
- name: Upload latest artifacts to S3
if: ${{ inputs.upload-to-s3 == 'true' && inputs.upload-latest-artifacts == 'true' }} # We'll also upload the latest artifacts to S3 in the scheduled and formal release.
uses: nick-invision/retry@v2
env:
AWS_ACCESS_KEY_ID: ${{ inputs.aws-access-key-id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.aws-secret-access-key }}
AWS_DEFAULT_REGION: ${{ inputs.aws-region }}
with:
max_attempts: ${{ inputs.upload-max-retry-times }}
timeout_minutes: ${{ inputs.upload-retry-timeout }}
command: |
cd ${{ inputs.working-dir }} && \
aws s3 cp \
${{ inputs.artifacts-dir }}.tar.gz \
s3://${{ inputs.release-to-s3-bucket }}/releases/greptimedb/latest/${{ inputs.artifacts-dir }}.tar.gz && \
aws s3 cp \
${{ inputs.artifacts-dir }}.sha256sum \
s3://${{ inputs.release-to-s3-bucket }}/releases/greptimedb/latest/${{ inputs.artifacts-dir }}.sha256sum
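
Putting this together with the bucket-layout comment earlier in this action, the layout after the change would look roughly as follows. This is a sketch inferred from the aws s3 cp destinations shown above, not copied from the repository; the artifact file names simply mirror the artifacts-dir input.

releases/greptimedb
├── v0.1.0
│   ├── <artifacts-dir>.tar.gz
│   └── <artifacts-dir>.sha256sum
└── latest
    ├── <artifacts-dir>.tar.gz
    └── <artifacts-dir>.sha256sum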

View File

@@ -151,6 +151,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
upload-latest-artifacts: false
build-linux-arm64-artifacts:
name: Build linux-arm64 artifacts
@@ -174,6 +175,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_CN_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_CN_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_RELEASE_BUCKET_REGION }}
upload-latest-artifacts: false
release-images-to-dockerhub:
name: Build and push images to DockerHub

Cargo.lock (generated, 372 changes)
File diff suppressed because it is too large

View File

@@ -46,6 +46,7 @@ members = [
"src/sql",
"src/storage",
"src/store-api",
"src/flow",
"src/table",
"src/table-procedure",
"tests-integration",
@@ -54,7 +55,7 @@ members = [
resolver = "2"
[workspace.package]
version = "0.3.2"
version = "0.4.0-nightly"
edition = "2021"
license = "Apache-2.0"
@@ -67,17 +68,18 @@ arrow-schema = { version = "43.0", features = ["serde"] }
async-stream = "0.3"
async-trait = "0.1"
chrono = { version = "0.4", features = ["serde"] }
datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "2ceb7f927c40787773fdc466d6a4b79f3a6c0001" }
datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "c0b0fca548e99d020c76e1a1cd7132aab26000e1" }
derive_builder = "0.12"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c30a2607be4044502094b25c408171a666a8ff6d" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "4a277f27caa035a801d5b9c020a0449777736614" }
humantime-serde = "1.1"
itertools = "0.10"
lazy_static = "1.4"
once_cell = "1.18"
@@ -90,9 +92,10 @@ regex = "1.8"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
snafu = { version = "0.7", features = ["backtraces"] }
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "c3814f08afa19786b13d72b1731a1e8b3cac4ab9", features = [
sqlparser = { git = "https://github.com/GreptimeTeam/sqlparser-rs.git", rev = "296a4f6c73b129d6f565a42a2e5e53c6bc2b9da4", features = [
"visitor",
] }
strum = { version = "0.25", features = ["derive"] }
tempfile = "3"
tokio = { version = "1.28", features = ["full"] }
tokio-util = { version = "0.7", features = ["io-util", "compat"] }

View File

@@ -57,8 +57,6 @@ max_purge_tasks = 32
checkpoint_margin = 10
# Region manifest logs and checkpoints gc execution duration
gc_duration = '10m'
# Whether to try creating a manifest checkpoint on region opening
checkpoint_on_startup = false
# Storage flush options
[storage.flush]

View File

@@ -53,10 +53,6 @@ enable = true
[prom_store_options]
enable = true
# Prometheus protocol options, see `standalone.example.toml`.
[prometheus_options]
addr = "127.0.0.1:4004"
# Metasrv client options, see `datanode.example.toml`.
[meta_client_options]
metasrv_addrs = ["127.0.0.1:3002"]

View File

@@ -26,7 +26,7 @@ enable_telemetry = true
# Procedure storage options.
[procedure]
# Procedure max retry time.
max_retry_times = 3
max_retry_times = 12
# Initial retry delay of procedures, increases exponentially
retry_delay = "500ms"

View File

@@ -76,11 +76,6 @@ enable = true
# Whether to enable Prometheus remote write and read in HTTP API, true by default.
enable = true
# Prometheus protocol options
[prometheus_options]
# Prometheus API server address, "127.0.0.1:4004" by default.
addr = "127.0.0.1:4004"
# WAL options.
[wal]
# WAL data directory
@@ -121,8 +116,6 @@ max_purge_tasks = 32
checkpoint_margin = 10
# Region manifest logs and checkpoints gc execution duration
gc_duration = '10m'
# Whether to try creating a manifest checkpoint on region opening
checkpoint_on_startup = false
# Storage flush options
[storage.flush]

View File

@@ -55,6 +55,10 @@ impl ColumnDataTypeWrapper {
Ok(Self(datatype))
}
pub fn new(datatype: ColumnDataType) -> Self {
Self(datatype)
}
pub fn datatype(&self) -> ColumnDataType {
self.0
}

View File

@@ -70,11 +70,9 @@ impl InformationSchemaProvider {
pub fn table(&self, name: &str) -> Option<TableRef> {
self.information_table(name).map(|table| {
let schema = table.schema();
let table_info = Self::table_info(self.catalog_name.clone(), &table);
let table_type = table.table_type();
let filter_pushdown = FilterPushDownType::Unsupported;
let thin_table = ThinTable::new(schema, table_info, table_type, filter_pushdown);
let thin_table = ThinTable::new(table_info, filter_pushdown);
let data_source = Arc::new(InformationTableDataSource::new(table));
Arc::new(ThinTableAdapter::new(thin_table, data_source)) as _

View File

@@ -136,7 +136,7 @@ impl LocalCatalogManager {
schema: INFORMATION_SCHEMA_NAME.to_string(),
table_name: SYSTEM_CATALOG_TABLE_NAME.to_string(),
table_id: SYSTEM_CATALOG_TABLE_ID,
table: self.system.information_schema.system.clone(),
table: self.system.information_schema.system.as_table_ref(),
};
self.catalogs.register_table(register_table_req).await?;

View File

@@ -97,26 +97,7 @@ impl CatalogManager for MemoryCatalogManager {
}
async fn deregister_table(&self, request: DeregisterTableRequest) -> Result<()> {
let mut catalogs = self.catalogs.write().unwrap();
let schema = catalogs
.get_mut(&request.catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: &request.catalog,
})?
.get_mut(&request.schema)
.with_context(|| SchemaNotFoundSnafu {
catalog: &request.catalog,
schema: &request.schema,
})?;
let result = schema.remove(&request.table_name);
if result.is_some() {
decrement_gauge!(
crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT,
1.0,
&[crate::metrics::db_label(&request.catalog, &request.schema)],
);
}
Ok(())
self.deregister_table_sync(request)
}
async fn register_schema(&self, request: RegisterSchemaRequest) -> Result<bool> {
@@ -157,15 +138,7 @@ impl CatalogManager for MemoryCatalogManager {
}
async fn schema_exist(&self, catalog: &str, schema: &str) -> Result<bool> {
Ok(self
.catalogs
.read()
.unwrap()
.get(catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: catalog,
})?
.contains_key(schema))
self.schema_exist_sync(catalog, schema)
}
async fn table(
@@ -187,7 +160,7 @@ impl CatalogManager for MemoryCatalogManager {
}
async fn catalog_exist(&self, catalog: &str) -> Result<bool> {
Ok(self.catalogs.read().unwrap().get(catalog).is_some())
self.catalog_exist_sync(catalog)
}
async fn table_exist(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
@@ -245,7 +218,7 @@ impl CatalogManager for MemoryCatalogManager {
}
impl MemoryCatalogManager {
/// Create a manager with some default setups
/// Creates a manager with some default setups
/// (e.g. default catalog/schema and information schema)
pub fn with_default_setup() -> Arc<Self> {
let manager = Arc::new(Self {
@@ -267,19 +240,23 @@ impl MemoryCatalogManager {
manager
}
/// Registers a catalog and return the catalog already exist
pub fn register_catalog_if_absent(&self, name: String) -> bool {
let mut catalogs = self.catalogs.write().unwrap();
let entry = catalogs.entry(name);
match entry {
Entry::Occupied(_) => true,
Entry::Vacant(v) => {
let _ = v.insert(HashMap::new());
false
}
}
fn schema_exist_sync(&self, catalog: &str, schema: &str) -> Result<bool> {
Ok(self
.catalogs
.read()
.unwrap()
.get(catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: catalog,
})?
.contains_key(schema))
}
fn catalog_exist_sync(&self, catalog: &str) -> Result<bool> {
Ok(self.catalogs.read().unwrap().get(catalog).is_some())
}
/// Registers a catalog if it does not exist and returns false if the schema exists.
pub fn register_catalog_sync(self: &Arc<Self>, name: String) -> Result<bool> {
let mut catalogs = self.catalogs.write().unwrap();
@@ -294,6 +271,32 @@ impl MemoryCatalogManager {
}
}
pub fn deregister_table_sync(&self, request: DeregisterTableRequest) -> Result<()> {
let mut catalogs = self.catalogs.write().unwrap();
let schema = catalogs
.get_mut(&request.catalog)
.with_context(|| CatalogNotFoundSnafu {
catalog_name: &request.catalog,
})?
.get_mut(&request.schema)
.with_context(|| SchemaNotFoundSnafu {
catalog: &request.catalog,
schema: &request.schema,
})?;
let result = schema.remove(&request.table_name);
if result.is_some() {
decrement_gauge!(
crate::metrics::METRIC_CATALOG_MANAGER_TABLE_COUNT,
1.0,
&[crate::metrics::db_label(&request.catalog, &request.schema)],
);
}
Ok(())
}
/// Registers a schema if it does not exist.
/// It returns an error if the catalog does not exist,
/// and returns false if the schema exists.
pub fn register_schema_sync(&self, request: RegisterSchemaRequest) -> Result<bool> {
let mut catalogs = self.catalogs.write().unwrap();
let catalog = catalogs
@@ -312,6 +315,7 @@ impl MemoryCatalogManager {
}
}
/// Registers a schema and returns an error if the catalog or schema does not exist.
pub fn register_table_sync(&self, request: RegisterTableRequest) -> Result<bool> {
let mut catalogs = self.catalogs.write().unwrap();
let schema = catalogs
@@ -353,9 +357,25 @@ impl MemoryCatalogManager {
#[cfg(any(test, feature = "testing"))]
pub fn new_with_table(table: TableRef) -> Arc<Self> {
let manager = Self::with_default_setup();
let catalog = &table.table_info().catalog_name;
let schema = &table.table_info().schema_name;
if !manager.catalog_exist_sync(catalog).unwrap() {
manager.register_catalog_sync(catalog.to_string()).unwrap();
}
if !manager.schema_exist_sync(catalog, schema).unwrap() {
manager
.register_schema_sync(RegisterSchemaRequest {
catalog: catalog.to_string(),
schema: schema.to_string(),
})
.unwrap();
}
let request = RegisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
catalog: catalog.to_string(),
schema: schema.to_string(),
table_name: table.table_info().name.clone(),
table_id: table.table_info().ident.table_id,
table,
@@ -524,10 +544,14 @@ mod tests {
}
#[test]
pub fn test_register_if_absent() {
pub fn test_register_catalog_sync() {
let list = MemoryCatalogManager::with_default_setup();
assert!(!list.register_catalog_if_absent("test_catalog".to_string(),));
assert!(list.register_catalog_if_absent("test_catalog".to_string()));
assert!(list
.register_catalog_sync("test_catalog".to_string())
.unwrap());
assert!(!list
.register_catalog_sync("test_catalog".to_string())
.unwrap());
}
#[tokio::test]

View File

@@ -85,6 +85,7 @@ impl RemoteCatalogManager {
let engine_manager = self.engine_manager.clone();
let memory_catalog_manager = self.memory_catalog_manager.clone();
let table_metadata_manager = self.table_metadata_manager.clone();
let region_alive_keepers = self.region_alive_keepers.clone();
common_runtime::spawn_bg(async move {
let table_id = datanode_table_value.table_id;
if let Err(e) = open_and_register_table(
@@ -92,6 +93,7 @@ impl RemoteCatalogManager {
datanode_table_value,
memory_catalog_manager,
table_metadata_manager,
region_alive_keepers,
)
.await
{
@@ -116,6 +118,7 @@ async fn open_and_register_table(
datanode_table_value: DatanodeTableValue,
memory_catalog_manager: Arc<MemoryCatalogManager>,
table_metadata_manager: TableMetadataManagerRef,
region_alive_keepers: Arc<RegionAliveKeepers>,
) -> Result<()> {
let context = EngineContext {};
@@ -192,7 +195,8 @@ async fn open_and_register_table(
table_id,
table,
};
let registered = memory_catalog_manager.register_table_sync(request)?;
let registered =
register_table(&memory_catalog_manager, &region_alive_keepers, request).await?;
ensure!(
registered,
TableExistsSnafu {
@@ -203,6 +207,32 @@ async fn open_and_register_table(
Ok(())
}
async fn register_table(
memory_catalog_manager: &Arc<MemoryCatalogManager>,
region_alive_keepers: &Arc<RegionAliveKeepers>,
request: RegisterTableRequest,
) -> Result<bool> {
let table = request.table.clone();
let registered = memory_catalog_manager.register_table_sync(request)?;
if registered {
let table_info = table.table_info();
let table_ident = TableIdent {
catalog: table_info.catalog_name.clone(),
schema: table_info.schema_name.clone(),
table: table_info.name.clone(),
table_id: table_info.table_id(),
engine: table_info.meta.engine.clone(),
};
region_alive_keepers
.register_table(table_ident, table, memory_catalog_manager.clone())
.await?;
}
Ok(registered)
}
#[async_trait]
impl CatalogManager for RemoteCatalogManager {
async fn start(&self) -> Result<()> {
@@ -221,25 +251,12 @@ impl CatalogManager for RemoteCatalogManager {
}
async fn register_table(&self, request: RegisterTableRequest) -> Result<bool> {
let table = request.table.clone();
let registered = self.memory_catalog_manager.register_table_sync(request)?;
if registered {
let table_info = table.table_info();
let table_ident = TableIdent {
catalog: table_info.catalog_name.clone(),
schema: table_info.schema_name.clone(),
table: table_info.name.clone(),
table_id: table_info.table_id(),
engine: table_info.meta.engine.clone(),
};
self.region_alive_keepers
.register_table(table_ident, table)
.await?;
}
Ok(registered)
register_table(
&self.memory_catalog_manager,
&self.region_alive_keepers,
request,
)
.await
}
async fn deregister_table(&self, request: DeregisterTableRequest) -> Result<()> {

View File

@@ -29,6 +29,7 @@ use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionNumber;
use table::engine::manager::TableEngineManagerRef;
use table::engine::{CloseTableResult, EngineContext, TableEngineRef};
use table::metadata::TableId;
use table::requests::CloseTableRequest;
use table::TableRef;
use tokio::sync::{mpsc, oneshot, Mutex};
@@ -36,11 +37,13 @@ use tokio::task::JoinHandle;
use tokio::time::{Duration, Instant};
use crate::error::{Result, TableEngineNotFoundSnafu};
use crate::local::MemoryCatalogManager;
use crate::DeregisterTableRequest;
/// [RegionAliveKeepers] manages all [RegionAliveKeeper] in a scope of tables.
pub struct RegionAliveKeepers {
table_engine_manager: TableEngineManagerRef,
keepers: Arc<Mutex<HashMap<TableIdent, Arc<RegionAliveKeeper>>>>,
keepers: Arc<Mutex<HashMap<TableId, Arc<RegionAliveKeeper>>>>,
heartbeat_interval_millis: u64,
started: AtomicBool,
@@ -65,12 +68,18 @@ impl RegionAliveKeepers {
}
}
pub async fn find_keeper(&self, table_ident: &TableIdent) -> Option<Arc<RegionAliveKeeper>> {
self.keepers.lock().await.get(table_ident).cloned()
pub async fn find_keeper(&self, table_id: TableId) -> Option<Arc<RegionAliveKeeper>> {
self.keepers.lock().await.get(&table_id).cloned()
}
pub async fn register_table(&self, table_ident: TableIdent, table: TableRef) -> Result<()> {
let keeper = self.find_keeper(&table_ident).await;
pub async fn register_table(
&self,
table_ident: TableIdent,
table: TableRef,
catalog_manager: Arc<MemoryCatalogManager>,
) -> Result<()> {
let table_id = table_ident.table_id;
let keeper = self.find_keeper(table_id).await;
if keeper.is_some() {
return Ok(());
}
@@ -84,6 +93,7 @@ impl RegionAliveKeepers {
let keeper = Arc::new(RegionAliveKeeper::new(
table_engine,
catalog_manager,
table_ident.clone(),
self.heartbeat_interval_millis,
));
@@ -92,7 +102,7 @@ impl RegionAliveKeepers {
}
let mut keepers = self.keepers.lock().await;
let _ = keepers.insert(table_ident.clone(), keeper.clone());
let _ = keepers.insert(table_id, keeper.clone());
if self.started.load(Ordering::Relaxed) {
keeper.start().await;
@@ -108,15 +118,16 @@ impl RegionAliveKeepers {
&self,
table_ident: &TableIdent,
) -> Option<Arc<RegionAliveKeeper>> {
self.keepers.lock().await.remove(table_ident).map(|x| {
let table_id = table_ident.table_id;
self.keepers.lock().await.remove(&table_id).map(|x| {
info!("Deregister RegionAliveKeeper for table {table_ident}");
x
})
}
pub async fn register_region(&self, region_ident: &RegionIdent) {
let table_ident = &region_ident.table_ident;
let Some(keeper) = self.find_keeper(table_ident).await else {
let table_id = region_ident.table_ident.table_id;
let Some(keeper) = self.find_keeper(table_id).await else {
// Alive keeper could be affected by lagging msg, just warn and ignore.
warn!("Alive keeper for region {region_ident} is not found!");
return;
@@ -125,8 +136,8 @@ impl RegionAliveKeepers {
}
pub async fn deregister_region(&self, region_ident: &RegionIdent) {
let table_ident = &region_ident.table_ident;
let Some(keeper) = self.find_keeper(table_ident).await else {
let table_id = region_ident.table_ident.table_id;
let Some(keeper) = self.find_keeper(table_id).await else {
// Alive keeper could be affected by lagging msg, just warn and ignore.
warn!("Alive keeper for region {region_ident} is not found!");
return;
@@ -178,7 +189,8 @@ impl HeartbeatResponseHandler for RegionAliveKeepers {
}
};
let Some(keeper) = self.keepers.lock().await.get(&table_ident).cloned() else {
let table_id = table_ident.table_id;
let Some(keeper) = self.keepers.lock().await.get(&table_id).cloned() else {
// Alive keeper could be affected by lagging msg, just warn and ignore.
warn!("Alive keeper for table {table_ident} is not found!");
continue;
@@ -199,6 +211,7 @@ impl HeartbeatResponseHandler for RegionAliveKeepers {
/// Datanode, it will "extend" the region's "lease", with a deadline for [RegionAliveKeeper] to
/// countdown.
pub struct RegionAliveKeeper {
catalog_manager: Arc<MemoryCatalogManager>,
table_engine: TableEngineRef,
table_ident: TableIdent,
countdown_task_handles: Arc<Mutex<HashMap<RegionNumber, Arc<CountdownTaskHandle>>>>,
@@ -209,10 +222,12 @@ pub struct RegionAliveKeeper {
impl RegionAliveKeeper {
fn new(
table_engine: TableEngineRef,
catalog_manager: Arc<MemoryCatalogManager>,
table_ident: TableIdent,
heartbeat_interval_millis: u64,
) -> Self {
Self {
catalog_manager,
table_engine,
table_ident,
countdown_task_handles: Arc::new(Mutex::new(HashMap::new())),
@@ -240,11 +255,29 @@ impl RegionAliveKeeper {
let _ = x.lock().await.remove(&region);
} // Else the countdown task handles map could be dropped because the keeper is dropped.
};
let catalog_manager = self.catalog_manager.clone();
let ident = self.table_ident.clone();
let handle = Arc::new(CountdownTaskHandle::new(
self.table_engine.clone(),
self.table_ident.clone(),
region,
|| on_task_finished,
move |result: Option<CloseTableResult>| {
if matches!(result, Some(CloseTableResult::Released(_))) {
let result = catalog_manager.deregister_table_sync(DeregisterTableRequest {
catalog: ident.catalog.to_string(),
schema: ident.schema.to_string(),
table_name: ident.table.to_string(),
});
info!(
"Deregister table: {} after countdown task finished, result: {result:?}",
ident.table_id
);
} else {
debug!("Countdown task returns: {result:?}");
}
on_task_finished
},
));
let mut handles = self.countdown_task_handles.lock().await;
@@ -343,7 +376,7 @@ impl CountdownTaskHandle {
table_engine: TableEngineRef,
table_ident: TableIdent,
region: RegionNumber,
on_task_finished: impl FnOnce() -> Fut + Send + 'static,
on_task_finished: impl FnOnce(Option<CloseTableResult>) -> Fut + Send + 'static,
) -> Self
where
Fut: Future<Output = ()> + Send,
@@ -357,8 +390,8 @@ impl CountdownTaskHandle {
rx,
};
let handler = common_runtime::spawn_bg(async move {
countdown_task.run().await;
on_task_finished().await;
let result = countdown_task.run().await;
on_task_finished(result).await;
});
Self {
@@ -410,7 +443,8 @@ struct CountdownTask {
}
impl CountdownTask {
async fn run(&mut self) {
// returns true if
async fn run(&mut self) -> Option<CloseTableResult> {
// 30 years. See `Instant::far_future`.
let far_future = Instant::now() + Duration::from_secs(86400 * 365 * 30);
@@ -464,10 +498,11 @@ impl CountdownTask {
"Region {region} of table {table_ident} is closed, result: {result:?}. \
RegionAliveKeeper out.",
);
break;
return Some(result);
}
}
}
None
}
async fn close_region(&self) -> CloseTableResult {
@@ -543,11 +578,16 @@ mod test {
table_options: TableOptions::default(),
engine: "MockTableEngine".to_string(),
}));
let catalog_manager = MemoryCatalogManager::new_with_table(table.clone());
keepers
.register_table(table_ident.clone(), table)
.register_table(table_ident.clone(), table, catalog_manager)
.await
.unwrap();
assert!(keepers.keepers.lock().await.contains_key(&table_ident));
assert!(keepers
.keepers
.lock()
.await
.contains_key(&table_ident.table_id));
(table_ident, keepers)
}
@@ -602,7 +642,7 @@ mod test {
.keepers
.lock()
.await
.get(&table_ident)
.get(&table_ident.table_id)
.cloned()
.unwrap();
@@ -649,7 +689,7 @@ mod test {
})
.await;
let mut regions = keepers
.find_keeper(&table_ident)
.find_keeper(table_ident.table_id)
.await
.unwrap()
.countdown_task_handles
@@ -676,7 +716,8 @@ mod test {
table_id: 1024,
engine: "mito".to_string(),
};
let keeper = RegionAliveKeeper::new(table_engine, table_ident, 1000);
let catalog_manager = MemoryCatalogManager::with_default_setup();
let keeper = RegionAliveKeeper::new(table_engine, catalog_manager, table_ident, 1000);
let region = 1;
assert!(keeper.find_handle(&region).await.is_none());
@@ -719,7 +760,7 @@ mod test {
table_engine.clone(),
table_ident.clone(),
1,
|| async move { finished_clone.store(true, Ordering::Relaxed) },
|_| async move { finished_clone.store(true, Ordering::Relaxed) },
);
let tx = handle.tx.clone();
@@ -741,7 +782,7 @@ mod test {
let finished = Arc::new(AtomicBool::new(false));
let finished_clone = finished.clone();
let handle = CountdownTaskHandle::new(table_engine, table_ident, 1, || async move {
let handle = CountdownTaskHandle::new(table_engine, table_ident, 1, |_| async move {
finished_clone.store(true, Ordering::Relaxed)
});
handle.tx.send(CountdownCommand::Start(100)).await.unwrap();
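
As a side note on the keying change in this file: the keepers map is now indexed by the numeric table id rather than the full TableIdent. A minimal, self-contained sketch of that pattern, using toy types rather than GreptimeDB's actual definitions, looks like this:

use std::collections::HashMap;

// Toy types standing in for GreptimeDB's; only the shape of the change matters:
// keepers are keyed by the small, copyable TableId instead of the whole TableIdent.
type TableId = u32;

#[derive(Clone)]
struct TableIdent {
    catalog: String,
    schema: String,
    table: String,
    table_id: TableId,
}

struct Keeper {
    table_ident: TableIdent,
}

fn main() {
    let ident = TableIdent {
        catalog: "greptime".to_string(),
        schema: "public".to_string(),
        table: "metrics".to_string(),
        table_id: 1024,
    };

    let mut keepers: HashMap<TableId, Keeper> = HashMap::new();
    keepers.insert(ident.table_id, Keeper { table_ident: ident.clone() });

    // Lookups (e.g. from a RegionIdent) only need the numeric id, no String clones.
    let keeper = keepers.get(&1024).expect("registered above");
    println!(
        "keeper for {}.{}.{} (table id {})",
        keeper.table_ident.catalog,
        keeper.table_ident.schema,
        keeper.table_ident.table,
        keeper.table_ident.table_id
    );
}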

View File

@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use std::sync::Arc;
@@ -21,24 +20,23 @@ use common_catalog::consts::{
SYSTEM_CATALOG_NAME, SYSTEM_CATALOG_TABLE_ID, SYSTEM_CATALOG_TABLE_NAME,
};
use common_recordbatch::SendableRecordBatchStream;
use common_telemetry::debug;
use common_telemetry::{debug, warn};
use common_time::util;
use datatypes::prelude::{ConcreteDataType, ScalarVector, VectorRef};
use datatypes::schema::{ColumnSchema, RawSchema, SchemaRef};
use datatypes::schema::{ColumnSchema, RawSchema};
use datatypes::vectors::{BinaryVector, TimestampMillisecondVector, UInt8Vector};
use serde::{Deserialize, Serialize};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use table::engine::{EngineContext, TableEngineRef};
use table::metadata::{TableId, TableInfoRef, TableType};
use table::requests::{
CreateTableRequest, DeleteRequest, InsertRequest, OpenTableRequest, TableOptions,
};
use table::{Result as TableResult, Table, TableRef};
use table::metadata::TableId;
use table::requests::{CreateTableRequest, InsertRequest, OpenTableRequest, TableOptions};
use table::TableRef;
use crate::error::{
self, CreateSystemCatalogSnafu, EmptyValueSnafu, Error, InvalidEntryTypeSnafu, InvalidKeySnafu,
OpenSystemCatalogSnafu, Result, ValueDeserializeSnafu,
self, CreateSystemCatalogSnafu, DeregisterTableSnafu, EmptyValueSnafu, Error,
InsertCatalogRecordSnafu, InvalidEntryTypeSnafu, InvalidKeySnafu, OpenSystemCatalogSnafu,
Result, ValueDeserializeSnafu,
};
use crate::DeregisterTableRequest;
@@ -48,42 +46,6 @@ pub const VALUE_INDEX: usize = 3;
pub struct SystemCatalogTable(TableRef);
#[async_trait::async_trait]
impl Table for SystemCatalogTable {
fn as_any(&self) -> &dyn Any {
self
}
fn schema(&self) -> SchemaRef {
self.0.schema()
}
async fn scan_to_stream(&self, request: ScanRequest) -> TableResult<SendableRecordBatchStream> {
self.0.scan_to_stream(request).await
}
/// Insert values into table.
async fn insert(&self, request: InsertRequest) -> TableResult<usize> {
self.0.insert(request).await
}
fn table_info(&self) -> TableInfoRef {
self.0.table_info()
}
fn table_type(&self) -> TableType {
self.0.table_type()
}
async fn delete(&self, request: DeleteRequest) -> TableResult<usize> {
self.0.delete(request).await
}
fn statistics(&self) -> Option<table::stats::TableStatistics> {
self.0.statistics()
}
}
impl SystemCatalogTable {
pub async fn new(engine: TableEngineRef) -> Result<Self> {
let request = OpenTableRequest {
@@ -126,6 +88,54 @@ impl SystemCatalogTable {
}
}
pub async fn register_table(
&self,
catalog: String,
schema: String,
table_name: String,
table_id: TableId,
engine: String,
) -> Result<usize> {
let insert_request =
build_table_insert_request(catalog, schema, table_name, table_id, engine);
self.0
.insert(insert_request)
.await
.context(InsertCatalogRecordSnafu)
}
pub(crate) async fn deregister_table(
&self,
request: &DeregisterTableRequest,
table_id: TableId,
) -> Result<()> {
let deletion_request = build_table_deletion_request(request, table_id);
self.0
.insert(deletion_request)
.await
.map(|x| {
if x != 1 {
let table = common_catalog::format_full_table_name(
&request.catalog,
&request.schema,
&request.table_name
);
warn!("Failed to delete table record from information_schema, unexpected returned result: {x}, table: {table}");
}
})
.with_context(|_| DeregisterTableSnafu {
request: request.clone(),
})
}
pub async fn register_schema(&self, catalog: String, schema: String) -> Result<usize> {
let insert_request = build_schema_insert_request(catalog, schema);
self.0
.insert(insert_request)
.await
.context(InsertCatalogRecordSnafu)
}
/// Create a stream of all entries inside system catalog table
pub async fn records(&self) -> Result<SendableRecordBatchStream> {
let full_projection = None;
@@ -137,11 +147,16 @@ impl SystemCatalogTable {
limit: None,
};
let stream = self
.0
.scan_to_stream(scan_req)
.await
.context(error::SystemCatalogTableScanSnafu)?;
Ok(stream)
}
pub fn as_table_ref(&self) -> TableRef {
self.0.clone()
}
}
/// Build system catalog table schema.
@@ -541,14 +556,14 @@ mod tests {
async fn test_system_table_type() {
let (_dir, table_engine) = prepare_table_engine().await;
let system_table = SystemCatalogTable::new(table_engine).await.unwrap();
assert_eq!(Base, system_table.table_type());
assert_eq!(Base, system_table.as_table_ref().table_type());
}
#[tokio::test]
async fn test_system_table_info() {
let (_dir, table_engine) = prepare_table_engine().await;
let system_table = SystemCatalogTable::new(table_engine).await.unwrap();
let info = system_table.table_info();
let info = system_table.as_table_ref().table_info();
assert_eq!(TableType::Base, info.table_type);
assert_eq!(SYSTEM_CATALOG_TABLE_NAME, info.name);
assert_eq!(SYSTEM_CATALOG_TABLE_ID, info.ident.table_id);
@@ -561,14 +576,16 @@ mod tests {
let (_, table_engine) = prepare_table_engine().await;
let catalog_table = SystemCatalogTable::new(table_engine).await.unwrap();
let table_insertion = build_table_insert_request(
DEFAULT_CATALOG_NAME.to_string(),
DEFAULT_SCHEMA_NAME.to_string(),
"my_table".to_string(),
1,
MITO_ENGINE.to_string(),
);
let result = catalog_table.insert(table_insertion).await.unwrap();
let result = catalog_table
.register_table(
DEFAULT_CATALOG_NAME.to_string(),
DEFAULT_SCHEMA_NAME.to_string(),
"my_table".to_string(),
1,
MITO_ENGINE.to_string(),
)
.await
.unwrap();
assert_eq!(result, 1);
let records = catalog_table.records().await.unwrap();
@@ -598,16 +615,17 @@ mod tests {
});
assert_eq!(entry, expected);
let table_deletion = build_table_deletion_request(
&DeregisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "my_table".to_string(),
},
1,
);
let result = catalog_table.insert(table_deletion).await.unwrap();
assert_eq!(result, 1);
catalog_table
.deregister_table(
&DeregisterTableRequest {
catalog: DEFAULT_CATALOG_NAME.to_string(),
schema: DEFAULT_SCHEMA_NAME.to_string(),
table_name: "my_table".to_string(),
},
1,
)
.await
.unwrap();
let records = catalog_table.records().await.unwrap();
let batches = RecordBatches::try_collect(records).await.unwrap().take();
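
The broader pattern in this diff is replacing the blanket Table impl on SystemCatalogTable with purpose-built methods plus an explicit as_table_ref() accessor. A toy sketch of that newtype-with-accessor shape, using invented types rather than GreptimeDB's, might look like:

use std::sync::Arc;

// Stand-in trait and table; not the real table crate.
trait Table {
    fn name(&self) -> &str;
}

struct MockTable;

impl Table for MockTable {
    fn name(&self) -> &str {
        "system_catalog"
    }
}

type TableRef = Arc<dyn Table + Send + Sync>;

// Newtype keeps the inner TableRef private instead of implementing Table itself.
struct SystemCatalogTable(TableRef);

impl SystemCatalogTable {
    // Purpose-built operation instead of a pass-through trait impl.
    fn register_schema(&self, catalog: &str, schema: &str) -> usize {
        println!("would insert catalog entry {catalog}.{schema} via {}", self.0.name());
        1
    }

    // Explicit escape hatch for callers that really need the underlying table.
    fn as_table_ref(&self) -> TableRef {
        self.0.clone()
    }
}

fn main() {
    let table = SystemCatalogTable(Arc::new(MockTable));
    assert_eq!(table.register_schema("greptime", "public"), 1);
    assert_eq!(table.as_table_ref().name(), "system_catalog");
}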

View File

@@ -16,16 +16,9 @@
use std::sync::Arc;
use common_telemetry::logging;
use snafu::ResultExt;
use table::metadata::TableId;
use table::Table;
use crate::error::{self, InsertCatalogRecordSnafu, Result as CatalogResult};
use crate::system::{
build_schema_insert_request, build_table_deletion_request, build_table_insert_request,
SystemCatalogTable,
};
use crate::system::SystemCatalogTable;
use crate::DeregisterTableRequest;
pub struct InformationSchema {
@@ -54,36 +47,21 @@ impl SystemCatalog {
table_id: TableId,
engine: String,
) -> crate::error::Result<usize> {
let request = build_table_insert_request(catalog, schema, table_name, table_id, engine);
self.information_schema
.system
.insert(request)
.register_table(catalog, schema, table_name, table_id, engine)
.await
.context(InsertCatalogRecordSnafu)
}
pub(crate) async fn deregister_table(
&self,
request: &DeregisterTableRequest,
table_id: TableId,
) -> CatalogResult<()> {
) -> crate::error::Result<()> {
self.information_schema
.system
.insert(build_table_deletion_request(request, table_id))
.deregister_table(request, table_id)
.await
.map(|x| {
if x != 1 {
let table = common_catalog::format_full_table_name(
&request.catalog,
&request.schema,
&request.table_name
);
logging::warn!("Failed to delete table record from information_schema, unexpected returned result: {x}, table: {table}");
}
})
.with_context(|_| error::DeregisterTableSnafu {
request: request.clone(),
})
}
pub async fn register_schema(
@@ -91,11 +69,9 @@ impl SystemCatalog {
catalog: String,
schema: String,
) -> crate::error::Result<usize> {
let request = build_schema_insert_request(catalog, schema);
self.information_schema
.system
.insert(request)
.register_schema(catalog, schema)
.await
.context(InsertCatalogRecordSnafu)
}
}
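The deregister path above now maps the table-level failure into `DeregisterTableSnafu` with the whole request attached, instead of logging a warning on an unexpected row count. A small stand-alone sketch of that snafu 0.7 `with_context` pattern, with an illustrative error and `std::io::Error` as the source (not the crate's real error types):

use snafu::{ResultExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Failed to deregister table {table}"))]
    DeregisterTable {
        table: String,
        source: std::io::Error,
    },
}

fn delete_row(table: &str) -> Result<(), std::io::Error> {
    Err(std::io::Error::new(
        std::io::ErrorKind::NotFound,
        format!("{table} not found"),
    ))
}

fn deregister_table(table: &str) -> Result<(), Error> {
    // `with_context` builds the error context lazily, only on the failure path.
    delete_row(table).with_context(|_| DeregisterTableSnafu { table })
}

fn main() {
    let err = deregister_table("my_table").unwrap_err();
    println!("{err}");
}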

View File

@@ -396,7 +396,7 @@ mod tests {
assert!(catalog_manager.register_table(request).await.unwrap());
let keeper = region_alive_keepers
.find_keeper(&table_before)
.find_keeper(table_before.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();
@@ -435,7 +435,7 @@ mod tests {
assert!(catalog_manager.register_table(request).await.unwrap());
let keeper = region_alive_keepers
.find_keeper(&table_after)
.find_keeper(table_after.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();
@@ -443,7 +443,7 @@ mod tests {
assert!(deadline <= Instant::now() + Duration::from_secs(20));
let keeper = region_alive_keepers
.find_keeper(&table_before)
.find_keeper(table_before.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();

View File

@@ -22,6 +22,7 @@ common-telemetry = { workspace = true }
common-time = { workspace = true }
datafusion.workspace = true
datatypes = { workspace = true }
derive_builder.workspace = true
enum_dispatch = "0.3"
futures-util.workspace = true
moka = { version = "0.9", features = ["future"] }

View File

@@ -17,6 +17,7 @@ use std::sync::Arc;
use api::v1::greptime_database_client::GreptimeDatabaseClient;
use api::v1::health_check_client::HealthCheckClient;
use api::v1::prometheus_gateway_client::PrometheusGatewayClient;
use api::v1::region::region_client::RegionClient as PbRegionClient;
use api::v1::HealthCheckRequest;
use arrow_flight::flight_service_client::FlightServiceClient;
use common_grpc::channel_manager::ChannelManager;
@@ -82,11 +83,6 @@ impl Client {
Default::default()
}
pub fn with_manager(channel_manager: ChannelManager) -> Self {
let inner = Arc::new(Inner::with_manager(channel_manager));
Self { inner }
}
pub fn with_urls<U, A>(urls: A) -> Self
where
U: AsRef<str>,
@@ -157,6 +153,11 @@ impl Client {
})
}
pub(crate) fn raw_region_client(&self) -> Result<PbRegionClient<Channel>> {
let (_, channel) = self.find_channel()?;
Ok(PbRegionClient::new(channel))
}
pub fn make_prometheus_gateway_client(&self) -> Result<PrometheusGatewayClient<Channel>> {
let (_, channel) = self.find_channel()?;
Ok(PrometheusGatewayClient::new(channel))

View File

@@ -19,18 +19,21 @@ use api::v1::query_request::Query;
use api::v1::{
AlterExpr, AuthHeader, CompactTableExpr, CreateTableExpr, DdlRequest, DeleteRequests,
DropTableExpr, FlushTableExpr, GreptimeRequest, InsertRequests, PromRangeQuery, QueryRequest,
RequestHeader, TruncateTableExpr,
RequestHeader, RowInsertRequests, TruncateTableExpr,
};
use arrow_flight::{FlightData, Ticket};
use arrow_flight::Ticket;
use async_stream::stream;
use common_error::ext::{BoxedError, ErrorExt};
use common_grpc::flight::{flight_messages_to_recordbatches, FlightDecoder, FlightMessage};
use common_grpc::flight::{FlightDecoder, FlightMessage};
use common_query::Output;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::RecordBatchStreamAdaptor;
use common_telemetry::{logging, timer};
use futures_util::{TryFutureExt, TryStreamExt};
use futures_util::StreamExt;
use prost::Message;
use snafu::{ensure, ResultExt};
use crate::error::{ConvertFlightDataSnafu, IllegalFlightMessagesSnafu, ServerSnafu};
use crate::error::{ConvertFlightDataSnafu, Error, IllegalFlightMessagesSnafu, ServerSnafu};
use crate::{error, from_grpc_response, metrics, Client, Result, StreamInserter};
#[derive(Clone, Debug, Default)]
@@ -112,6 +115,11 @@ impl Database {
self.handle(Request::Inserts(requests)).await
}
pub async fn row_insert(&self, requests: RowInsertRequests) -> Result<u32> {
let _timer = timer!(metrics::METRIC_GRPC_INSERT);
self.handle(Request::RowInserts(requests)).await
}
pub fn streaming_inserter(&self) -> Result<StreamInserter> {
self.streaming_inserter_with_channel_size(65536)
}
@@ -283,55 +291,81 @@ impl Database {
let mut client = self.client.make_flight_client()?;
let flight_data: Vec<FlightData> = client
.mut_inner()
.do_get(request)
.and_then(|response| response.into_inner().try_collect())
.await
.map_err(|e| {
let tonic_code = e.code();
let e: error::Error = e.into();
let code = e.status_code();
let msg = e.to_string();
ServerSnafu { code, msg }
.fail::<()>()
.map_err(BoxedError::new)
.context(error::FlightGetSnafu {
tonic_code,
addr: client.addr(),
})
.map_err(|error| {
logging::error!(
"Failed to do Flight get, addr: {}, code: {}, source: {}",
client.addr(),
tonic_code,
error
);
error
})
.unwrap_err()
})?;
let decoder = &mut FlightDecoder::default();
let flight_messages = flight_data
.into_iter()
.map(|x| decoder.try_decode(x).context(ConvertFlightDataSnafu))
.collect::<Result<Vec<_>>>()?;
let output = if let Some(FlightMessage::AffectedRows(rows)) = flight_messages.get(0) {
ensure!(
flight_messages.len() == 1,
IllegalFlightMessagesSnafu {
reason: "Expect 'AffectedRows' Flight messages to be one and only!"
}
let response = client.mut_inner().do_get(request).await.map_err(|e| {
let tonic_code = e.code();
let e: error::Error = e.into();
let code = e.status_code();
let msg = e.to_string();
let error = Error::FlightGet {
tonic_code,
addr: client.addr().to_string(),
source: BoxedError::new(ServerSnafu { code, msg }.build()),
};
logging::error!(
"Failed to do Flight get, addr: {}, code: {}, source: {}",
client.addr(),
tonic_code,
error
);
Output::AffectedRows(*rows)
} else {
let recordbatches = flight_messages_to_recordbatches(flight_messages)
.context(ConvertFlightDataSnafu)?;
Output::RecordBatches(recordbatches)
error
})?;
let flight_data_stream = response.into_inner();
let mut decoder = FlightDecoder::default();
let mut flight_message_stream = flight_data_stream.map(move |flight_data| {
flight_data
.map_err(Error::from)
.and_then(|data| decoder.try_decode(data).context(ConvertFlightDataSnafu))
});
let Some(first_flight_message) = flight_message_stream.next().await else {
return IllegalFlightMessagesSnafu {
reason: "Expect the response not to be empty",
}
.fail();
};
Ok(output)
let first_flight_message = first_flight_message?;
match first_flight_message {
FlightMessage::AffectedRows(rows) => {
ensure!(
flight_message_stream.next().await.is_none(),
IllegalFlightMessagesSnafu {
reason: "Expect 'AffectedRows' Flight messages to be the one and the only!"
}
);
Ok(Output::AffectedRows(rows))
}
FlightMessage::Recordbatch(_) => IllegalFlightMessagesSnafu {
reason: "The first flight message cannot be a RecordBatch message",
}
.fail(),
FlightMessage::Schema(schema) => {
let stream = Box::pin(stream!({
while let Some(flight_message) = flight_message_stream.next().await {
let flight_message = flight_message
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
let FlightMessage::Recordbatch(record_batch) = flight_message else {
yield IllegalFlightMessagesSnafu {reason: "A Schema message must be succeeded exclusively by a set of RecordBatch messages"}
.fail()
.map_err(BoxedError::new)
.context(ExternalSnafu);
break;
};
yield Ok(record_batch);
}
}));
let record_batch_stream = RecordBatchStreamAdaptor {
schema,
stream,
output_ordering: None,
};
Ok(Output::Stream(Box::pin(record_batch_stream)))
}
}
}
}
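The rewritten `do_get` above no longer collects every `FlightData` into a `Vec`; it decodes lazily, inspects the first message, and either returns the affected-row count or turns the remaining messages into a record-batch stream with `async_stream::stream!`. A compact sketch of that "peek the first item, stream the rest" shape, using a toy `Message` enum instead of the Flight protocol (assumes `futures-util`, `async-stream` and `tokio` as dependencies):

use async_stream::stream;
use futures_util::{stream::BoxStream, StreamExt};

#[derive(Debug)]
enum Message {
    AffectedRows(usize),
    Schema(&'static str),
    Batch(i64),
}

enum Output {
    AffectedRows(usize),
    Stream(BoxStream<'static, Result<i64, String>>),
}

async fn adapt(mut input: BoxStream<'static, Message>) -> Result<Output, String> {
    let Some(first) = input.next().await else {
        return Err("empty response".to_string());
    };
    match first {
        Message::AffectedRows(rows) => Ok(Output::AffectedRows(rows)),
        Message::Schema(_schema) => {
            // Everything after the schema must be data batches.
            let batches: BoxStream<'static, Result<i64, String>> = Box::pin(stream! {
                while let Some(message) = input.next().await {
                    match message {
                        Message::Batch(batch) => yield Ok(batch),
                        other => {
                            yield Err(format!("unexpected message: {other:?}"));
                            break;
                        }
                    }
                }
            });
            Ok(Output::Stream(batches))
        }
        Message::Batch(_) => Err("first message cannot be a batch".to_string()),
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let input: BoxStream<'static, Message> = Box::pin(futures_util::stream::iter(vec![
        Message::Schema("ts, value"),
        Message::Batch(1),
        Message::Batch(2),
    ]));
    match adapt(input).await? {
        Output::AffectedRows(rows) => println!("affected rows: {rows}"),
        Output::Stream(mut batches) => {
            while let Some(batch) = batches.next().await {
                println!("batch: {}", batch?);
            }
        }
    }
    Ok(())
}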

View File

@@ -18,6 +18,7 @@ mod database;
pub mod error;
pub mod load_balance;
mod metrics;
pub mod region;
mod stream_insert;
pub use api;

View File

@@ -25,3 +25,4 @@ pub const METRIC_GRPC_FLUSH_TABLE: &str = "grpc.flush_table";
pub const METRIC_GRPC_COMPACT_TABLE: &str = "grpc.compact_table";
pub const METRIC_GRPC_TRUNCATE_TABLE: &str = "grpc.truncate_table";
pub const METRIC_GRPC_DO_GET: &str = "grpc.do_get";
pub(crate) const METRIC_REGION_REQUEST_GRPC: &str = "grpc.region_request";

src/client/src/region.rs (new file, 146 lines)
View File

@@ -0,0 +1,146 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::region::{region_request, RegionRequest, RegionRequestHeader, RegionResponse};
use api::v1::ResponseHeader;
use common_error::status_code::StatusCode;
use common_telemetry::timer;
use snafu::OptionExt;
use crate::error::{IllegalDatabaseResponseSnafu, Result, ServerSnafu};
use crate::{metrics, Client};
type AffectedRows = u64;
#[derive(Debug)]
pub struct RegionRequester {
trace_id: Option<u64>,
span_id: Option<u64>,
client: Client,
}
impl RegionRequester {
pub fn new(client: Client) -> Self {
// TODO(LFC): Pass in trace_id and span_id from some context when we have it.
Self {
trace_id: None,
span_id: None,
client,
}
}
pub async fn handle(self, request: region_request::Body) -> Result<AffectedRows> {
let request_type = request.as_ref().to_string();
let request = RegionRequest {
header: Some(RegionRequestHeader {
trace_id: self.trace_id,
span_id: self.span_id,
}),
body: Some(request),
};
let _timer = timer!(
metrics::METRIC_REGION_REQUEST_GRPC,
&[("request_type", request_type)]
);
let mut client = self.client.raw_region_client()?;
let RegionResponse {
header,
affected_rows,
} = client.handle(request).await?.into_inner();
check_response_header(header)?;
Ok(affected_rows)
}
}
fn check_response_header(header: Option<ResponseHeader>) -> Result<()> {
let status = header
.and_then(|header| header.status)
.context(IllegalDatabaseResponseSnafu {
err_msg: "either response header or status is missing",
})?;
if StatusCode::is_success(status.status_code) {
Ok(())
} else {
let code =
StatusCode::from_u32(status.status_code).context(IllegalDatabaseResponseSnafu {
err_msg: format!("unknown server status: {:?}", status),
})?;
ServerSnafu {
code,
msg: status.err_msg,
}
.fail()
}
}
#[cfg(test)]
mod test {
use api::v1::Status as PbStatus;
use super::*;
use crate::Error::{IllegalDatabaseResponse, Server};
#[test]
fn test_check_response_header() {
let result = check_response_header(None);
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader { status: None }));
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader {
status: Some(PbStatus {
status_code: StatusCode::Success as u32,
err_msg: "".to_string(),
}),
}));
assert!(result.is_ok());
let result = check_response_header(Some(ResponseHeader {
status: Some(PbStatus {
status_code: u32::MAX,
err_msg: "".to_string(),
}),
}));
assert!(matches!(
result.unwrap_err(),
IllegalDatabaseResponse { .. }
));
let result = check_response_header(Some(ResponseHeader {
status: Some(PbStatus {
status_code: StatusCode::Internal as u32,
err_msg: "blabla".to_string(),
}),
}));
let Server { code, msg } = result.unwrap_err() else {
unreachable!()
};
assert_eq!(code, StatusCode::Internal);
assert_eq!(msg, "blabla");
}
}

View File

@@ -16,6 +16,7 @@ use api::v1::greptime_database_client::GreptimeDatabaseClient;
use api::v1::greptime_request::Request;
use api::v1::{
AuthHeader, GreptimeRequest, GreptimeResponse, InsertRequest, InsertRequests, RequestHeader,
RowInsertRequest, RowInsertRequests,
};
use tokio::sync::mpsc;
use tokio::task::JoinHandle;
@@ -84,6 +85,18 @@ impl StreamInserter {
})
}
pub async fn row_insert(&self, requests: Vec<RowInsertRequest>) -> Result<()> {
let inserts = RowInsertRequests { inserts: requests };
let request = self.to_rpc_request(Request::RowInserts(inserts));
self.sender.send(request).await.map_err(|e| {
error::ClientStreamingSnafu {
err_msg: e.to_string(),
}
.build()
})
}
pub async fn finish(self) -> Result<u32> {
drop(self.sender);
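`row_insert` above follows the same shape as the existing insert path: wrap the requests, push them into the bounded `mpsc` channel, and surface a send failure as a client error. A stripped-down sketch of that channel-backed inserter with a toy request type (tokio only, not the real client types):

use tokio::sync::mpsc;

#[derive(Debug)]
struct RowInsertRequest {
    rows: u32,
}

struct StreamInserter {
    sender: mpsc::Sender<Vec<RowInsertRequest>>,
}

impl StreamInserter {
    async fn row_insert(&self, requests: Vec<RowInsertRequest>) -> Result<(), String> {
        // A dropped/closed receiver surfaces as an error the caller can act on,
        // mirroring the ClientStreamingSnafu mapping above.
        self.sender
            .send(requests)
            .await
            .map_err(|e| format!("client streaming error: {e}"))
    }
}

#[tokio::main]
async fn main() {
    let (sender, mut receiver) = mpsc::channel(65536);
    let inserter = StreamInserter { sender };

    // Consumer side: drains the channel and sums up affected rows.
    let consumer = tokio::spawn(async move {
        let mut affected: u32 = 0;
        while let Some(batch) = receiver.recv().await {
            affected += batch.iter().map(|r| r.rows).sum::<u32>();
        }
        affected
    });

    inserter.row_insert(vec![RowInsertRequest { rows: 3 }]).await.unwrap();
    inserter.row_insert(vec![RowInsertRequest { rows: 4 }]).await.unwrap();

    drop(inserter); // like `finish()`: dropping the sender lets the consumer complete
    assert_eq!(consumer.await.unwrap(), 7);
}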

View File

@@ -133,17 +133,18 @@ impl MigrateTableMetadata {
);
while let Some((key, value)) = stream.try_next().await.context(error::IterStreamSnafu)? {
self.migrate_table_route_key(value).await?;
let table_id = self.migrate_table_route_key(value).await?;
keys.push(key);
keys.push(TableRegionKey::new(table_id).as_raw_key())
}
info!("Total migrated TableRouteKeys: {}", keys.len());
info!("Total migrated TableRouteKeys: {}", keys.len() / 2);
self.delete_migrated_keys(keys).await;
Ok(())
}
async fn migrate_table_route_key(&self, value: TableRouteValue) -> Result<()> {
async fn migrate_table_route_key(&self, value: TableRouteValue) -> Result<u32> {
let table_route = TableRoute::try_from_raw(
&value.peers,
value.table_route.expect("expected table_route"),
@@ -152,7 +153,8 @@ impl MigrateTableMetadata {
let new_table_value = NextTableRouteValue::new(table_route.region_routes);
let new_key = NextTableRouteKey::new(table_route.table.id as u32);
let table_id = table_route.table.id as u32;
let new_key = NextTableRouteKey::new(table_id);
info!("Creating '{new_key}'");
if self.dryrun {
@@ -168,7 +170,7 @@ impl MigrateTableMetadata {
.unwrap();
}
Ok(())
Ok(table_id)
}
async fn migrate_schema_keys(&self) -> Result<()> {
@@ -203,7 +205,7 @@ impl MigrateTableMetadata {
async fn migrate_schema_key(&self, key: &v1SchemaKey) -> Result<()> {
let new_key = SchemaNameKey::new(&key.catalog_name, &key.schema_name);
let schema_name_value = SchemaNameValue;
let schema_name_value = SchemaNameValue::default();
info!("Creating '{new_key}'");
@@ -310,7 +312,7 @@ impl MigrateTableMetadata {
async fn delete_migrated_keys(&self, keys: Vec<Vec<u8>>) {
for keys in keys.chunks(PAGE_SIZE) {
info!("Deleting {} TableGlobalKeys", keys.len());
info!("Deleting {} keys", keys.len());
let req = BatchDeleteRequest {
keys: keys.to_vec(),
prev_kv: false,

View File

@@ -229,7 +229,6 @@ mod tests {
[storage.manifest]
checkpoint_margin = 9
gc_duration = '7s'
checkpoint_on_startup = true
compress = true
[logging]
@@ -289,7 +288,6 @@ mod tests {
RegionManifestConfig {
checkpoint_margin: Some(9),
gc_duration: Some(Duration::from_secs(7)),
checkpoint_on_startup: true,
compress: true
},
options.storage.manifest,
@@ -383,9 +381,6 @@ mod tests {
max_files_in_level0 = 7
max_purge_tasks = 32
[storage.manifest]
checkpoint_on_startup = true
[logging]
level = "debug"
dir = "/tmp/greptimedb/test/logs"

View File

@@ -20,7 +20,7 @@ use common_base::Plugins;
use common_telemetry::logging;
use frontend::frontend::FrontendOptions;
use frontend::instance::{FrontendInstance, Instance as FeInstance};
use frontend::service_config::{InfluxdbOptions, PrometheusOptions};
use frontend::service_config::InfluxdbOptions;
use meta_client::MetaClientOptions;
use servers::tls::{TlsMode, TlsOption};
use servers::Mode;
@@ -99,8 +99,6 @@ pub struct StartCommand {
#[clap(long)]
mysql_addr: Option<String>,
#[clap(long)]
prom_addr: Option<String>,
#[clap(long)]
postgres_addr: Option<String>,
#[clap(long)]
opentsdb_addr: Option<String>,
@@ -171,10 +169,6 @@ impl StartCommand {
}
}
if let Some(addr) = &self.prom_addr {
opts.prometheus_options = Some(PrometheusOptions { addr: addr.clone() });
}
if let Some(addr) = &self.postgres_addr {
if let Some(postgres_opts) = &mut opts.postgres_options {
postgres_opts.addr = addr.clone();
@@ -248,7 +242,6 @@ mod tests {
fn test_try_from_start_command() {
let command = StartCommand {
http_addr: Some("127.0.0.1:1234".to_string()),
prom_addr: Some("127.0.0.1:4444".to_string()),
mysql_addr: Some("127.0.0.1:5678".to_string()),
postgres_addr: Some("127.0.0.1:5432".to_string()),
opentsdb_addr: Some("127.0.0.1:4321".to_string()),
@@ -276,10 +269,6 @@ mod tests {
opts.opentsdb_options.as_ref().unwrap().addr,
"127.0.0.1:4321"
);
assert_eq!(
opts.prometheus_options.as_ref().unwrap().addr,
"127.0.0.1:4444"
);
let default_opts = FrontendOptions::default();
assert_eq!(

View File

@@ -201,17 +201,6 @@ mod tests {
.join(ENV_VAR_SEP),
Some("42s"),
),
(
// storage.manifest.checkpoint_on_startup = true
[
env_prefix.to_string(),
"storage".to_uppercase(),
"manifest".to_uppercase(),
"checkpoint_on_startup".to_uppercase(),
]
.join(ENV_VAR_SEP),
Some("true"),
),
(
// wal.dir = /other/wal/dir
[
@@ -253,7 +242,6 @@ mod tests {
opts.storage.manifest.gc_duration,
Some(Duration::from_secs(42))
);
assert!(opts.storage.manifest.checkpoint_on_startup);
assert_eq!(
opts.meta_client_options.unwrap().metasrv_addrs,
vec![

View File

@@ -24,7 +24,6 @@ use frontend::frontend::FrontendOptions;
use frontend::instance::{FrontendInstance, Instance as FeInstance};
use frontend::service_config::{
GrpcOptions, InfluxdbOptions, MysqlOptions, OpentsdbOptions, PostgresOptions, PromStoreOptions,
PrometheusOptions,
};
use serde::{Deserialize, Serialize};
use servers::http::HttpOptions;
@@ -91,7 +90,6 @@ pub struct StandaloneOptions {
pub opentsdb_options: Option<OpentsdbOptions>,
pub influxdb_options: Option<InfluxdbOptions>,
pub prom_store_options: Option<PromStoreOptions>,
pub prometheus_options: Option<PrometheusOptions>,
pub wal: WalConfig,
pub storage: StorageConfig,
pub procedure: ProcedureConfig,
@@ -111,7 +109,6 @@ impl Default for StandaloneOptions {
opentsdb_options: Some(OpentsdbOptions::default()),
influxdb_options: Some(InfluxdbOptions::default()),
prom_store_options: Some(PromStoreOptions::default()),
prometheus_options: Some(PrometheusOptions::default()),
wal: WalConfig::default(),
storage: StorageConfig::default(),
procedure: ProcedureConfig::default(),
@@ -131,7 +128,6 @@ impl StandaloneOptions {
opentsdb_options: self.opentsdb_options,
influxdb_options: self.influxdb_options,
prom_store_options: self.prom_store_options,
prometheus_options: self.prometheus_options,
meta_client_options: None,
logging: self.logging,
..Default::default()
@@ -193,8 +189,6 @@ struct StartCommand {
#[clap(long)]
mysql_addr: Option<String>,
#[clap(long)]
prom_addr: Option<String>,
#[clap(long)]
postgres_addr: Option<String>,
#[clap(long)]
opentsdb_addr: Option<String>,
@@ -271,10 +265,6 @@ impl StartCommand {
}
}
if let Some(addr) = &self.prom_addr {
opts.prometheus_options = Some(PrometheusOptions { addr: addr.clone() })
}
if let Some(addr) = &self.postgres_addr {
if let Some(postgres_opts) = &mut opts.postgres_options {
postgres_opts.addr = addr.clone();
@@ -408,7 +398,6 @@ mod tests {
[storage.manifest]
checkpoint_margin = 9
gc_duration = '7s'
checkpoint_on_startup = true
[http_options]
addr = "127.0.0.1:4000"

View File

@@ -35,8 +35,14 @@ pub const INFORMATION_SCHEMA_TABLES_TABLE_ID: u32 = 3;
pub const INFORMATION_SCHEMA_COLUMNS_TABLE_ID: u32 = 4;
pub const MITO_ENGINE: &str = "mito";
pub const MITO2_ENGINE: &str = "mito2";
pub fn default_engine() -> &'static str {
MITO_ENGINE
}
pub const IMMUTABLE_FILE_ENGINE: &str = "file";
pub const SEMANTIC_TYPE_PRIMARY_KEY: &str = "PRIMARY KEY";
pub const SEMANTIC_TYPE_PRIMARY_KEY: &str = "TAG";
pub const SEMANTIC_TYPE_FIELD: &str = "FIELD";
pub const SEMANTIC_TYPE_TIME_INDEX: &str = "TIME INDEX";
pub const SEMANTIC_TYPE_TIME_INDEX: &str = "TIMESTAMP";

View File

@@ -27,7 +27,7 @@ orc-rust = "0.2"
paste = "1.0"
regex = "1.7"
snafu.workspace = true
strum = { version = "0.21", features = ["derive"] }
strum.workspace = true
tokio-util.workspace = true
tokio.workspace = true
url = "2.3"

View File

@@ -6,4 +6,4 @@ license.workspace = true
[dependencies]
snafu = { version = "0.7", features = ["backtraces"] }
strum = { version = "0.24", features = ["std", "derive"] }
strum.workspace = true

View File

@@ -14,6 +14,7 @@ common-error = { workspace = true }
common-recordbatch = { workspace = true }
common-runtime = { workspace = true }
common-telemetry = { workspace = true }
common-time = { workspace = true }
dashmap = "5.4"
datafusion.workspace = true
datatypes = { workspace = true }

View File

@@ -75,6 +75,9 @@ pub enum Error {
location: Location,
source: datatypes::error::Error,
},
#[snafu(display("Not supported: {}", feat))]
NotSupported { feat: String },
}
impl ErrorExt for Error {
@@ -83,7 +86,8 @@ impl ErrorExt for Error {
Error::InvalidTlsConfig { .. }
| Error::InvalidConfigFilePath { .. }
| Error::TypeMismatch { .. }
| Error::InvalidFlightData { .. } => StatusCode::InvalidArguments,
| Error::InvalidFlightData { .. }
| Error::NotSupported { .. } => StatusCode::InvalidArguments,
Error::CreateChannel { .. }
| Error::Conversion { .. }

View File

@@ -18,9 +18,11 @@ use std::fmt::Display;
use api::helper::values_with_capacity;
use api::v1::{Column, ColumnDataType, SemanticType};
use common_base::BitVec;
use common_time::timestamp::TimeUnit;
use snafu::ensure;
use crate::error::{Result, TypeMismatchSnafu};
use crate::Error;
type ColumnName = String;
@@ -259,6 +261,24 @@ impl Display for Precision {
}
}
impl TryFrom<Precision> for TimeUnit {
type Error = Error;
fn try_from(precision: Precision) -> std::result::Result<Self, Self::Error> {
Ok(match precision {
Precision::Second => TimeUnit::Second,
Precision::Millisecond => TimeUnit::Millisecond,
Precision::Microsecond => TimeUnit::Microsecond,
Precision::Nanosecond => TimeUnit::Nanosecond,
_ => {
return Err(Error::NotSupported {
feat: format!("convert {precision} into TimeUnit"),
})
}
})
}
}
#[cfg(test)]
mod tests {
use api::v1::{ColumnDataType, SemanticType};
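The conversion above maps each supported `Precision` onto a `TimeUnit` and turns every other variant into a `NotSupported` error. A self-contained sketch of that fallible-conversion pattern with toy enums (the extra `Minute` variant is invented here purely to exercise the error branch):

#[derive(Debug, Clone, Copy)]
enum Precision {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
    Minute, // hypothetical variant with no TimeUnit counterpart
}

#[derive(Debug, Clone, Copy, PartialEq)]
enum TimeUnit {
    Second,
    Millisecond,
    Microsecond,
    Nanosecond,
}

#[derive(Debug)]
struct NotSupported {
    feat: String,
}

impl TryFrom<Precision> for TimeUnit {
    type Error = NotSupported;

    fn try_from(precision: Precision) -> Result<Self, Self::Error> {
        Ok(match precision {
            Precision::Second => TimeUnit::Second,
            Precision::Millisecond => TimeUnit::Millisecond,
            Precision::Microsecond => TimeUnit::Microsecond,
            Precision::Nanosecond => TimeUnit::Nanosecond,
            other => {
                return Err(NotSupported {
                    feat: format!("convert {other:?} into TimeUnit"),
                })
            }
        })
    }
}

fn main() {
    assert_eq!(TimeUnit::try_from(Precision::Millisecond).unwrap(), TimeUnit::Millisecond);
    println!("{}", TimeUnit::try_from(Precision::Minute).unwrap_err().feat);
}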

View File

@@ -15,6 +15,7 @@ common-telemetry = { workspace = true }
common-time = { workspace = true }
etcd-client.workspace = true
futures.workspace = true
humantime-serde.workspace = true
lazy_static.workspace = true
prost.workspace = true
regex.workspace = true

View File

@@ -54,6 +54,13 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to parse value {} into key {}", value, key))]
ParseOption {
key: String,
value: String,
location: Location,
},
#[snafu(display("Corrupted table route data, err: {}", err_msg))]
RouteInfoCorrupted { err_msg: String, location: Location },
@@ -151,6 +158,7 @@ impl ErrorExt for Error {
IllegalServerState { .. } | EtcdTxnOpResponse { .. } => StatusCode::Internal,
SerdeJson { .. }
| ParseOption { .. }
| RouteInfoCorrupted { .. }
| InvalidProtoMsg { .. }
| InvalidTableMetadata { .. }

View File

@@ -215,9 +215,14 @@ impl TableMetadataManager {
/// The caller MUST ensure it has the exclusive access to `TableNameKey`.
pub async fn create_table_metadata(
&self,
table_info: RawTableInfo,
mut table_info: RawTableInfo,
region_routes: Vec<RegionRoute>,
) -> Result<()> {
let region_numbers = region_routes
.iter()
.map(|region| region.region.id.region_number())
.collect::<Vec<_>>();
table_info.meta.region_numbers = region_numbers;
let table_id = table_info.ident.table_id;
// Creates table name.
@@ -489,15 +494,35 @@ macro_rules! impl_table_meta_value {
}
}
#[macro_export]
macro_rules! impl_optional_meta_value {
($($val_ty: ty), *) => {
$(
impl $val_ty {
pub fn try_from_raw_value(raw_value: &[u8]) -> Result<Option<Self>> {
serde_json::from_slice(raw_value).context(SerdeJsonSnafu)
}
pub fn try_as_raw_value(&self) -> Result<Vec<u8>> {
serde_json::to_vec(self).context(SerdeJsonSnafu)
}
}
)*
}
}
impl_table_meta_value! {
CatalogNameValue,
SchemaNameValue,
TableNameValue,
TableInfoValue,
DatanodeTableValue,
TableRouteValue
}
impl_optional_meta_value! {
CatalogNameValue,
SchemaNameValue
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
@@ -524,7 +549,7 @@ mod tests {
assert_eq!(removed, to_removed_key(key));
}
fn new_test_table_info() -> TableInfo {
fn new_test_table_info(region_numbers: impl Iterator<Item = u32>) -> TableInfo {
let column_schemas = vec![
ColumnSchema::new("col1", ConcreteDataType::int32_datatype(), true),
ColumnSchema::new(
@@ -546,6 +571,7 @@ mod tests {
.primary_key_indices(vec![0])
.engine("engine")
.next_column_id(3)
.region_numbers(region_numbers.collect::<Vec<_>>())
.build()
.unwrap();
TableInfoBuilder::default()
@@ -578,9 +604,10 @@ mod tests {
async fn test_create_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
// creates metadata.
table_metadata_manager
.create_table_metadata(table_info.clone(), region_routes.clone())
@@ -612,11 +639,12 @@ mod tests {
async fn test_delete_table_metadata() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let table_id = table_info.ident.table_id;
let region_route = new_test_region_route();
let datanode_id = 2;
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
let datanode_id = 2;
let table_route_value = TableRouteValue::new(region_routes.clone());
// creates metadata.
@@ -682,10 +710,11 @@ mod tests {
async fn test_rename_table() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let table_id = table_info.ident.table_id;
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
// creates metadata.
table_metadata_manager
.create_table_metadata(table_info.clone(), region_routes.clone())
@@ -746,10 +775,11 @@ mod tests {
async fn test_update_table_info() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let table_id = table_info.ident.table_id;
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
// creates metadata.
table_metadata_manager
.create_table_metadata(table_info.clone(), region_routes.clone())
@@ -811,9 +841,10 @@ mod tests {
async fn test_update_table_route() {
let mem_kv = Arc::new(MemoryKvBackend::default());
let table_metadata_manager = TableMetadataManager::new(mem_kv);
let table_info: RawTableInfo = new_test_table_info().into();
let region_route = new_test_region_route();
let region_routes = vec![region_route.clone()];
let table_info: RawTableInfo =
new_test_table_info(region_routes.iter().map(|r| r.region.id.region_number())).into();
let table_id = table_info.ident.table_id;
let current_table_route_value = TableRouteValue::new(region_routes.clone());
// creates metadata.
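The `impl_optional_meta_value!` macro above differs from `impl_table_meta_value!` only in that deserialization goes through `Option<Self>`, so a stored JSON `null` (left behind by older versions) decodes to `None` instead of failing. A self-contained sketch of that macro shape using plain `serde_json` errors and an illustrative value type (assumes `serde` with the derive feature and `serde_json`):

use serde::{Deserialize, Serialize};

macro_rules! impl_optional_meta_value {
    ($($val_ty: ty),*) => {
        $(
            impl $val_ty {
                fn try_from_raw_value(raw: &[u8]) -> serde_json::Result<Option<Self>> {
                    // Deserializing into Option<Self> maps a JSON `null` to Ok(None).
                    serde_json::from_slice(raw)
                }

                fn try_as_raw_value(&self) -> serde_json::Result<Vec<u8>> {
                    serde_json::to_vec(self)
                }
            }
        )*
    };
}

#[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
struct SchemaNameValue {
    ttl_secs: Option<u64>,
}

impl_optional_meta_value!(SchemaNameValue);

fn main() -> serde_json::Result<()> {
    let value = SchemaNameValue { ttl_secs: Some(10) };
    let raw = value.try_as_raw_value()?;
    assert_eq!(SchemaNameValue::try_from_raw_value(&raw)?, Some(value));
    // A legacy `null` row is tolerated rather than treated as corruption.
    assert_eq!(SchemaNameValue::try_from_raw_value(b"null")?, None);
    Ok(())
}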

View File

@@ -12,22 +12,27 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::Display;
use std::sync::Arc;
use std::time::Duration;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use futures::stream::BoxStream;
use futures::StreamExt;
use humantime_serde::re::humantime;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use crate::error::{self, Error, InvalidTableMetadataSnafu, Result};
use crate::error::{self, Error, InvalidTableMetadataSnafu, ParseOptionSnafu, Result};
use crate::key::{TableMetaKey, SCHEMA_NAME_KEY_PATTERN, SCHEMA_NAME_KEY_PREFIX};
use crate::kv_backend::KvBackendRef;
use crate::range_stream::{PaginationStream, DEFAULT_PAGE_SIZE};
use crate::rpc::store::{PutRequest, RangeRequest};
use crate::rpc::KeyValue;
const OPT_KEY_TTL: &str = "ttl";
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SchemaNameKey<'a> {
pub catalog: &'a str,
@@ -43,8 +48,33 @@ impl<'a> Default for SchemaNameKey<'a> {
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct SchemaNameValue;
#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
pub struct SchemaNameValue {
#[serde(default)]
#[serde(with = "humantime_serde")]
pub ttl: Option<Duration>,
}
impl TryFrom<&HashMap<String, String>> for SchemaNameValue {
type Error = Error;
fn try_from(value: &HashMap<String, String>) -> std::result::Result<Self, Self::Error> {
let ttl = value
.get(OPT_KEY_TTL)
.map(|ttl_str| {
ttl_str.parse::<humantime::Duration>().map_err(|_| {
ParseOptionSnafu {
key: OPT_KEY_TTL,
value: ttl_str.clone(),
}
.build()
})
})
.transpose()?
.map(|ttl| ttl.into());
Ok(Self { ttl })
}
}
impl<'a> SchemaNameKey<'a> {
pub fn new(catalog: &'a str, schema: &'a str) -> Self {
@@ -108,11 +138,15 @@ impl SchemaManager {
}
/// Creates `SchemaNameKey`.
pub async fn create(&self, schema: SchemaNameKey<'_>) -> Result<()> {
pub async fn create(
&self,
schema: SchemaNameKey<'_>,
value: Option<SchemaNameValue>,
) -> Result<()> {
let raw_key = schema.as_raw_key();
let req = PutRequest::new()
.with_key(raw_key)
.with_value(SchemaNameValue.try_as_raw_value()?);
.with_value(value.unwrap_or_default().try_as_raw_value()?);
self.kv_backend.put(req).await?;
@@ -125,6 +159,14 @@ impl SchemaManager {
Ok(self.kv_backend.get(&raw_key).await?.is_some())
}
pub async fn get(&self, schema: SchemaNameKey<'_>) -> Result<Option<SchemaNameValue>> {
let raw_key = schema.as_raw_key();
let value = self.kv_backend.get(&raw_key).await?;
value
.and_then(|v| SchemaNameValue::try_from_raw_value(v.value.as_ref()).transpose())
.transpose()
}
/// Returns a schema stream, it lists all schemas belong to the target `catalog`.
pub async fn schema_names(&self, catalog: &str) -> BoxStream<'static, Result<String>> {
let start_key = SchemaNameKey::range_start_key(catalog);
@@ -143,25 +185,39 @@ impl SchemaManager {
#[cfg(test)]
mod tests {
use super::*;
use crate::kv_backend::memory::MemoryKvBackend;
#[test]
fn test_serialization() {
let key = SchemaNameKey::new("my-catalog", "my-schema");
assert_eq!(key.to_string(), "__schema_name/my-catalog/my-schema");
let parsed: SchemaNameKey<'_> = "__schema_name/my-catalog/my-schema".try_into().unwrap();
assert_eq!(key, parsed);
let value = SchemaNameValue {
ttl: Some(Duration::from_secs(10)),
};
let mut opts: HashMap<String, String> = HashMap::new();
opts.insert("ttl".to_string(), "10s".to_string());
let from_value = SchemaNameValue::try_from(&opts).unwrap();
assert_eq!(value, from_value);
let parsed = SchemaNameValue::try_from_raw_value("{\"ttl\":\"10s\"}".as_bytes()).unwrap();
assert_eq!(Some(value), parsed);
let none = SchemaNameValue::try_from_raw_value("null".as_bytes()).unwrap();
assert!(none.is_none());
let err_empty = SchemaNameValue::try_from_raw_value("".as_bytes());
assert!(err_empty.is_err());
}
#[tokio::test]
async fn test_key_exist() {
let manager = SchemaManager::new(Arc::new(MemoryKvBackend::default()));
let schema_key = SchemaNameKey::new("my-catalog", "my-schema");
manager.create(schema_key).await.unwrap();
manager.create(schema_key, None).await.unwrap();
assert!(manager.exist(schema_key).await.unwrap());
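The schema value above now carries an optional `ttl`, serialized through `humantime_serde` so it round-trips as a human-readable string like "10s", and built from the raw option map by parsing with `humantime`. A minimal sketch of both halves with a string error type (assumes the `serde`, `serde_json`, `humantime` and `humantime-serde` crates; not the real key/value plumbing):

use std::collections::HashMap;
use std::time::Duration;

use serde::{Deserialize, Serialize};

#[derive(Debug, Default, Clone, PartialEq, Serialize, Deserialize)]
struct SchemaNameValue {
    #[serde(default, with = "humantime_serde")]
    ttl: Option<Duration>,
}

fn from_options(opts: &HashMap<String, String>) -> Result<SchemaNameValue, String> {
    let ttl = opts
        .get("ttl")
        .map(|raw| {
            raw.parse::<humantime::Duration>()
                .map_err(|e| format!("failed to parse ttl {raw:?}: {e}"))
        })
        .transpose()?
        .map(Into::into); // humantime::Duration -> std::time::Duration
    Ok(SchemaNameValue { ttl })
}

fn main() -> Result<(), String> {
    let mut opts = HashMap::new();
    opts.insert("ttl".to_string(), "10s".to_string());
    let value = from_options(&opts)?;
    assert_eq!(value.ttl, Some(Duration::from_secs(10)));

    // The TTL serializes as a humantime string, matching the {"ttl":"10s"} fixture above.
    let json = serde_json::to_string(&value).map_err(|e| e.to_string())?;
    assert_eq!(json, r#"{"ttl":"10s"}"#);
    Ok(())
}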

View File

@@ -12,7 +12,7 @@ common-error = { workspace = true }
common-runtime = { workspace = true }
common-telemetry = { workspace = true }
futures.workspace = true
humantime-serde = "1.1"
humantime-serde.workspace = true
object-store = { workspace = true }
serde.workspace = true
serde_json = "1.0"

View File

@@ -34,13 +34,8 @@ use crate::{
SendableRecordBatchStream, Stream,
};
type FutureStream = Pin<
Box<
dyn std::future::Future<
Output = std::result::Result<DfSendableRecordBatchStream, DataFusionError>,
> + Send,
>,
>;
type FutureStream =
Pin<Box<dyn std::future::Future<Output = Result<SendableRecordBatchStream>> + Send>>;
/// ParquetRecordBatchStream -> DataFusion RecordBatchStream
pub struct ParquetRecordBatchStreamAdapter<T> {
@@ -223,7 +218,7 @@ impl Stream for RecordBatchStreamAdapter {
enum AsyncRecordBatchStreamAdapterState {
Uninit(FutureStream),
Ready(DfSendableRecordBatchStream),
Ready(SendableRecordBatchStream),
Failed,
}
@@ -261,17 +256,12 @@ impl Stream for AsyncRecordBatchStreamAdapter {
}
Err(e) => {
self.state = AsyncRecordBatchStreamAdapterState::Failed;
return Poll::Ready(Some(
Err(e).context(error::InitRecordbatchStreamSnafu),
));
return Poll::Ready(Some(Err(e)));
}
};
}
AsyncRecordBatchStreamAdapterState::Ready(stream) => {
return Poll::Ready(ready!(Pin::new(stream).poll_next(cx)).map(|x| {
let df_record_batch = x.context(error::PollStreamSnafu)?;
RecordBatch::try_from_df_record_batch(self.schema(), df_record_batch)
}))
return Poll::Ready(ready!(Pin::new(stream).poll_next(cx)))
}
AsyncRecordBatchStreamAdapterState::Failed => return Poll::Ready(None),
}
@@ -296,6 +286,7 @@ mod test {
use snafu::IntoError;
use super::*;
use crate::error::Error;
use crate::RecordBatches;
#[tokio::test]
@@ -330,12 +321,7 @@ mod test {
) -> FutureStream {
Box::pin(async move {
maybe_recordbatches
.map(|items| {
Box::pin(DfRecordBatchStreamAdapter::new(Box::pin(
MaybeErrorRecordBatchStream { items },
))) as _
})
.map_err(|e| DataFusionError::External(Box::new(e)))
.map(|items| Box::pin(MaybeErrorRecordBatchStream { items }) as _)
})
}
@@ -369,20 +355,24 @@ mod test {
.into_error(BoxedError::new(MockError::new(StatusCode::Unknown)))),
]));
let adapter = AsyncRecordBatchStreamAdapter::new(schema.clone(), poll_err_stream);
let result = RecordBatches::try_collect(Box::pin(adapter)).await;
assert_eq!(
result.unwrap_err().to_string(),
"Failed to poll stream, source: External error: External error, source: Unknown"
let err = RecordBatches::try_collect(Box::pin(adapter))
.await
.unwrap_err();
assert!(
matches!(err, Error::External { .. }),
"unexpected err {err}"
);
let failed_to_init_stream =
new_future_stream(Err(error::ExternalSnafu
.into_error(BoxedError::new(MockError::new(StatusCode::Internal)))));
let adapter = AsyncRecordBatchStreamAdapter::new(schema.clone(), failed_to_init_stream);
let result = RecordBatches::try_collect(Box::pin(adapter)).await;
assert_eq!(
result.unwrap_err().to_string(),
"Failed to init Recordbatch stream, source: External error: External error, source: Internal"
let err = RecordBatches::try_collect(Box::pin(adapter))
.await
.unwrap_err();
assert!(
matches!(err, Error::External { .. }),
"unexpected err {err}"
);
}
}
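The adapter's `FutureStream` now resolves directly to this crate's own stream type, so initialization errors and polling errors surface through the same error type. Underneath is a small state machine: poll the future until the inner stream is ready, then forward every `poll_next` to it. A generic, self-contained sketch of that pattern (futures-util and tokio only, toy item/error types):

use std::future::Future;
use std::pin::Pin;
use std::task::{ready, Context, Poll};

use futures_util::{Stream, StreamExt};

type BoxedStream<T, E> = Pin<Box<dyn Stream<Item = Result<T, E>> + Send>>;
type FutureStream<T, E> = Pin<Box<dyn Future<Output = Result<BoxedStream<T, E>, E>> + Send>>;

enum State<T, E> {
    Uninit(FutureStream<T, E>),
    Ready(BoxedStream<T, E>),
    Failed,
}

struct AsyncStreamAdapter<T, E> {
    state: State<T, E>,
}

impl<T, E> Stream for AsyncStreamAdapter<T, E> {
    type Item = Result<T, E>;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        loop {
            match &mut self.state {
                // First poll(s): drive the future that produces the inner stream.
                State::Uninit(future) => match ready!(future.as_mut().poll(cx)) {
                    Ok(stream) => self.state = State::Ready(stream),
                    Err(e) => {
                        self.state = State::Failed;
                        return Poll::Ready(Some(Err(e)));
                    }
                },
                // Afterwards: forward every poll to the inner stream.
                State::Ready(stream) => return stream.poll_next_unpin(cx),
                State::Failed => return Poll::Ready(None),
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let future: FutureStream<i32, String> = Box::pin(async {
        // Pretend some async initialization (e.g. opening a scanner) happens here.
        let stream: BoxedStream<i32, String> =
            Box::pin(futures_util::stream::iter(vec![Ok(1), Ok(2)]));
        Ok::<_, String>(stream)
    });
    let adapter = AsyncStreamAdapter { state: State::Uninit(future) };
    let items: Vec<_> = adapter.collect().await;
    assert_eq!(items, vec![Ok(1), Ok(2)]);
}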

View File

@@ -37,7 +37,7 @@ pub enum Error {
source: datatypes::error::Error,
},
#[snafu(display("External error, source: {}", source))]
#[snafu(display("External error, location: {}, source: {}", location, source))]
External {
location: Location,
source: BoxedError,

View File

@@ -202,13 +202,26 @@ impl Stream for SimpleRecordBatchStream {
}
/// Adapt a [Stream] of [RecordBatch] to a [RecordBatchStream].
pub struct RecordBatchStreamAdaptor {
pub struct RecordBatchStreamAdaptor<S> {
pub schema: SchemaRef,
pub stream: Pin<Box<dyn Stream<Item = Result<RecordBatch>> + Send>>,
pub stream: S,
pub output_ordering: Option<Vec<OrderOption>>,
}
impl RecordBatchStream for RecordBatchStreamAdaptor {
impl<S> RecordBatchStreamAdaptor<S> {
/// Creates a RecordBatchStreamAdaptor without output ordering requirement.
pub fn new(schema: SchemaRef, stream: S) -> RecordBatchStreamAdaptor<S> {
RecordBatchStreamAdaptor {
schema,
stream,
output_ordering: None,
}
}
}
impl<S: Stream<Item = Result<RecordBatch>> + Unpin> RecordBatchStream
for RecordBatchStreamAdaptor<S>
{
fn schema(&self) -> SchemaRef {
self.schema.clone()
}
@@ -218,7 +231,7 @@ impl RecordBatchStream for RecordBatchStreamAdaptor {
}
}
impl Stream for RecordBatchStreamAdaptor {
impl<S: Stream<Item = Result<RecordBatch>> + Unpin> Stream for RecordBatchStreamAdaptor<S> {
type Item = Result<RecordBatch>;
fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
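Making the adaptor generic over `S` lets callers hand in any concrete stream, including the `stream!`-generated one in the client change earlier, without an extra box. The delegation itself is the usual newtype-forwarding pattern; a minimal stand-alone version (futures-util and tokio only, schema plumbing omitted):

use std::pin::Pin;
use std::task::{Context, Poll};

use futures_util::{Stream, StreamExt};

struct Adaptor<S> {
    label: &'static str,
    stream: S,
}

impl<S: Stream + Unpin> Stream for Adaptor<S> {
    type Item = S::Item;

    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // `S: Unpin` lets us reach the field through Pin without projection.
        self.stream.poll_next_unpin(cx)
    }
}

#[tokio::main]
async fn main() {
    let mut adapted = Adaptor {
        label: "demo",
        stream: futures_util::stream::iter(1..=3),
    };
    let label = adapted.label;
    while let Some(x) = adapted.next().await {
        println!("{label}: {x}");
    }
}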

View File

@@ -29,6 +29,16 @@ use crate::error::{ArithmeticOverflowSnafu, Error, ParseTimestampSnafu, Timestam
use crate::timezone::TimeZone;
use crate::util::{div_ceil, format_utc_datetime, local_datetime_to_utc};
/// Timestamp represents the number of units (seconds/milliseconds/microseconds/nanoseconds) elapsed
/// since UNIX epoch. The valid value range of [Timestamp] depends on its unit (all in UTC time zone):
/// - for [TimeUnit::Second]: [-262144-01-01 00:00:00, +262143-12-31 23:59:59]
/// - for [TimeUnit::Millisecond]: [-262144-01-01 00:00:00.000, +262143-12-31 23:59:59.999]
/// - for [TimeUnit::Microsecond]: [-262144-01-01 00:00:00.000000, +262143-12-31 23:59:59.999999]
/// - for [TimeUnit::Nanosecond]: [1677-09-21 00:12:43.145225, 2262-04-11 23:47:16.854775807]
///
/// # Note:
/// Values out of range can still be stored, but arithmetic or formatting operations on them
/// may return an error or overflow.

#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
pub struct Timestamp {
value: i64,
@@ -169,6 +179,28 @@ impl Timestamp {
(sec_div, nsec)
}
/// Creates a new Timestamp instance from seconds and nanoseconds parts.
/// Returns None if overflow.
fn from_splits(sec: i64, nsec: u32) -> Option<Self> {
if nsec == 0 {
Some(Timestamp::new_second(sec))
} else if nsec % 1_000_000 == 0 {
let millis = nsec / 1_000_000;
sec.checked_mul(1000)
.and_then(|v| v.checked_add(millis as i64))
.map(Timestamp::new_millisecond)
} else if nsec % 1000 == 0 {
let micros = nsec / 1000;
sec.checked_mul(1_000_000)
.and_then(|v| v.checked_add(micros as i64))
.map(Timestamp::new_microsecond)
} else {
sec.checked_mul(1_000_000_000)
.and_then(|v| v.checked_add(nsec as i64))
.map(Timestamp::new_nanosecond)
}
}
/// Format timestamp to ISO8601 string. If the timestamp exceeds what chrono timestamp can
/// represent, this function simply print the timestamp unit and value in plain string.
pub fn to_iso8601_string(&self) -> String {
@@ -205,6 +237,12 @@ impl Timestamp {
let (sec, nsec) = self.split();
NaiveDateTime::from_timestamp_opt(sec, nsec)
}
pub fn from_chrono_datetime(ndt: NaiveDateTime) -> Option<Self> {
let sec = ndt.timestamp();
let nsec = ndt.timestamp_subsec_nanos();
Timestamp::from_splits(sec, nsec)
}
}
impl FromStr for Timestamp {
@@ -225,13 +263,16 @@ impl FromStr for Timestamp {
// RFC3339 timestamp (with a T)
let s = s.trim();
if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
return Timestamp::from_chrono_datetime(ts.naive_utc())
.context(ParseTimestampSnafu { raw: s });
}
if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
return Timestamp::from_chrono_datetime(ts.naive_utc())
.context(ParseTimestampSnafu { raw: s });
}
if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
return Ok(Timestamp::new(ts.timestamp_nanos(), TimeUnit::Nanosecond));
return Timestamp::from_chrono_datetime(ts.naive_utc())
.context(ParseTimestampSnafu { raw: s });
}
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
@@ -264,7 +305,7 @@ fn naive_datetime_to_timestamp(
match local_datetime_to_utc(&datetime) {
LocalResult::None => ParseTimestampSnafu { raw: s }.fail(),
LocalResult::Single(utc) | LocalResult::Ambiguous(utc, _) => {
Ok(Timestamp::new(utc.timestamp_nanos(), TimeUnit::Nanosecond))
Timestamp::from_chrono_datetime(utc).context(ParseTimestampSnafu { raw: s })
}
}
}
@@ -608,11 +649,7 @@ mod tests {
// but expected timestamp is in UTC timezone
fn check_from_str(s: &str, expect: &str) {
let ts = Timestamp::from_str(s).unwrap();
let time = NaiveDateTime::from_timestamp_opt(
ts.value / 1_000_000_000,
(ts.value % 1_000_000_000) as u32,
)
.unwrap();
let time = ts.to_chrono_datetime().unwrap();
assert_eq!(expect, time.to_string());
}
@@ -1049,4 +1086,70 @@ mod tests {
TimeUnit::from(ArrowTimeUnit::Nanosecond)
);
}
fn check_conversion(ts: Timestamp, valid: bool) {
let Some(t2) = ts.to_chrono_datetime() else {
if valid {
panic!("Cannot convert {:?} to Chrono NaiveDateTime", ts);
}
return;
};
let Some(t3) = Timestamp::from_chrono_datetime(t2) else {
if valid {
panic!("Cannot convert Chrono NaiveDateTime {:?} to Timestamp", t2);
}
return;
};
assert_eq!(t3, ts);
}
#[test]
fn test_from_naive_date_time() {
let min_sec = Timestamp::new_second(-8334632851200);
let max_sec = Timestamp::new_second(8210298412799);
check_conversion(min_sec, true);
check_conversion(Timestamp::new_second(min_sec.value - 1), false);
check_conversion(max_sec, true);
check_conversion(Timestamp::new_second(max_sec.value + 1), false);
let min_millis = Timestamp::new_millisecond(-8334632851200000);
let max_millis = Timestamp::new_millisecond(8210298412799999);
check_conversion(min_millis, true);
check_conversion(Timestamp::new_millisecond(min_millis.value - 1), false);
check_conversion(max_millis, true);
check_conversion(Timestamp::new_millisecond(max_millis.value + 1), false);
let min_micros = Timestamp::new_microsecond(-8334632851200000000);
let max_micros = Timestamp::new_microsecond(8210298412799999999);
check_conversion(min_micros, true);
check_conversion(Timestamp::new_microsecond(min_micros.value - 1), false);
check_conversion(max_micros, true);
check_conversion(Timestamp::new_microsecond(max_micros.value + 1), false);
let min_nanos = Timestamp::new_nanosecond(-9223372036854775000);
let max_nanos = Timestamp::new_nanosecond(i64::MAX);
check_conversion(min_nanos, true);
check_conversion(Timestamp::new_nanosecond(min_nanos.value - 1), false);
check_conversion(max_nanos, true);
}
#[test]
fn test_parse_timestamp_range() {
let valid_strings = vec![
"-262144-01-01 00:00:00Z",
"+262143-12-31 23:59:59Z",
"-262144-01-01 00:00:00Z",
"+262143-12-31 23:59:59.999Z",
"-262144-01-01 00:00:00Z",
"+262143-12-31 23:59:59.999999Z",
"1677-09-21 00:12:43.145225Z",
"2262-04-11 23:47:16.854775807Z",
"+100000-01-01 00:00:01.5Z",
];
for s in valid_strings {
Timestamp::from_str(s).unwrap();
}
}
}
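`from_splits` above picks the coarsest unit that represents the instant exactly and returns `None` when the value would overflow `i64`. A stand-alone sketch of that selection logic with a toy `Timestamp` enum (std only):

#[derive(Debug, PartialEq)]
enum Timestamp {
    Second(i64),
    Millisecond(i64),
    Microsecond(i64),
    Nanosecond(i64),
}

/// Builds a timestamp from whole seconds plus a nanosecond remainder,
/// choosing the coarsest unit that loses no precision. Returns None on i64 overflow.
fn from_splits(sec: i64, nsec: u32) -> Option<Timestamp> {
    if nsec == 0 {
        Some(Timestamp::Second(sec))
    } else if nsec % 1_000_000 == 0 {
        let millis = (nsec / 1_000_000) as i64;
        sec.checked_mul(1_000)?.checked_add(millis).map(Timestamp::Millisecond)
    } else if nsec % 1_000 == 0 {
        let micros = (nsec / 1_000) as i64;
        sec.checked_mul(1_000_000)?.checked_add(micros).map(Timestamp::Microsecond)
    } else {
        sec.checked_mul(1_000_000_000)?.checked_add(nsec as i64).map(Timestamp::Nanosecond)
    }
}

fn main() {
    assert_eq!(from_splits(10, 0), Some(Timestamp::Second(10)));
    assert_eq!(from_splits(10, 500_000_000), Some(Timestamp::Millisecond(10_500)));
    assert_eq!(from_splits(10, 2_000), Some(Timestamp::Microsecond(10_000_002)));
    assert_eq!(from_splits(10, 7), Some(Timestamp::Nanosecond(10_000_000_007)));
    assert_eq!(from_splits(i64::MAX, 1), None); // would overflow nanoseconds
}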

View File

@@ -9,6 +9,7 @@ testing = ["meta-srv/mock"]
[dependencies]
api = { workspace = true }
arrow-flight.workspace = true
async-compat = "0.2"
async-stream.workspace = true
async-trait.workspace = true
@@ -39,7 +40,7 @@ datatypes = { workspace = true }
file-table-engine = { workspace = true }
futures = "0.3"
futures-util.workspace = true
humantime-serde = "1.1"
humantime-serde.workspace = true
hyper = { version = "0.14", features = ["full"] }
key-lock = "0.1"
log-store = { workspace = true }

View File

@@ -256,8 +256,6 @@ pub struct RegionManifestConfig {
/// Region manifest logs and checkpoints gc task execution duration.
#[serde(with = "humantime_serde")]
pub gc_duration: Option<Duration>,
/// Whether to try creating a manifest checkpoint on region opening
pub checkpoint_on_startup: bool,
/// Whether to compress manifest and checkpoint file by gzip
pub compress: bool,
}
@@ -267,7 +265,6 @@ impl Default for RegionManifestConfig {
Self {
checkpoint_margin: Some(10u16),
gc_duration: Some(Duration::from_secs(600)),
checkpoint_on_startup: false,
compress: false,
}
}
@@ -341,7 +338,6 @@ impl From<&DatanodeOptions> for StorageEngineConfig {
fn from(value: &DatanodeOptions) -> Self {
Self {
compress_manifest: value.storage.manifest.compress,
manifest_checkpoint_on_startup: value.storage.manifest.checkpoint_on_startup,
manifest_checkpoint_margin: value.storage.manifest.checkpoint_margin,
manifest_gc_duration: value.storage.manifest.gc_duration,
max_files_in_l0: value.storage.compaction.max_files_in_level0,

View File

@@ -556,6 +556,16 @@ pub enum Error {
location: Location,
source: BoxedError,
},
#[snafu(display(
"Failed to build region requests, location:{}, source: {}",
location,
source
))]
BuildRegionRequests {
location: Location,
source: store_api::metadata::MetadataError,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -569,6 +579,7 @@ impl ErrorExt for Error {
| ExecuteStatement { source, .. }
| ExecuteLogicalPlan { source, .. } => source.status_code(),
BuildRegionRequests { source, .. } => source.status_code(),
HandleHeartbeatResponse { source, .. } => source.status_code(),
DecodeLogicalPlan { source, .. } => source.status_code(),

View File

@@ -15,9 +15,9 @@
use std::sync::Arc;
use async_trait::async_trait;
use catalog::error::Error as CatalogError;
use catalog::error::{Error as CatalogError, Result as CatalogResult};
use catalog::remote::region_alive_keeper::RegionAliveKeepers;
use catalog::{CatalogManagerRef, RegisterTableRequest};
use catalog::{CatalogManagerRef, RegisterSchemaRequest, RegisterTableRequest};
use common_catalog::format_full_table_name;
use common_meta::error::Result as MetaResult;
use common_meta::heartbeat::handler::{
@@ -30,6 +30,7 @@ use store_api::storage::RegionNumber;
use table::engine::manager::TableEngineManagerRef;
use table::engine::EngineContext;
use table::requests::OpenTableRequest;
use table::Table;
use crate::error::{self, Result};
@@ -157,6 +158,45 @@ impl OpenRegionHandler {
Ok(false)
}
async fn register_table(
&self,
request: &OpenTableRequest,
table: Arc<dyn Table>,
) -> CatalogResult<bool> {
if !self
.catalog_manager
.catalog_exist(&request.catalog_name)
.await?
{
self.catalog_manager
.clone()
.register_catalog(request.catalog_name.to_string())
.await?;
}
if !self
.catalog_manager
.schema_exist(&request.catalog_name, &request.schema_name)
.await?
{
self.catalog_manager
.register_schema(RegisterSchemaRequest {
catalog: request.catalog_name.to_string(),
schema: request.schema_name.to_string(),
})
.await?;
}
let request = RegisterTableRequest {
catalog: request.catalog_name.to_string(),
schema: request.schema_name.to_string(),
table_name: request.table_name.to_string(),
table_id: request.table_id,
table,
};
self.catalog_manager.register_table(request).await
}
async fn open_region_inner(&self, engine: String, request: OpenTableRequest) -> Result<bool> {
let OpenTableRequest {
catalog_name,
@@ -187,14 +227,8 @@ impl OpenRegionHandler {
table_name: format_full_table_name(catalog_name, schema_name, table_name),
})?
{
let request = RegisterTableRequest {
catalog: request.catalog_name.clone(),
schema: request.schema_name.clone(),
table_name: request.table_name.clone(),
table_id: request.table_id,
table,
};
let result = self.catalog_manager.register_table(request).await;
let result = self.register_table(&request, table).await;
match result {
Ok(_) | Err(CatalogError::TableExists { .. }) => Ok(true),
e => e.with_context(|_| error::RegisterTableSnafu {

View File

@@ -365,6 +365,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -418,6 +419,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -485,6 +487,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -589,6 +592,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();
@@ -661,6 +665,7 @@ mod test {
expr: Some(DdlExpr::CreateDatabase(CreateDatabaseExpr {
database_name: "my_database".to_string(),
create_if_not_exists: true,
options: Default::default(),
})),
});
let output = instance.do_query(query, QueryContext::arc()).await.unwrap();

View File

@@ -141,7 +141,8 @@ impl Instance {
let table_ref = TableReference::full(&catalog, &schema, &table);
let table = self.sql_handler.get_table(&table_ref).await?;
query::sql::show_create_table(table, None).context(ExecuteStatementSnafu)
query::sql::show_create_table(table, None, query_ctx.clone())
.context(ExecuteStatementSnafu)
}
Statement::TruncateTable(truncate_table) => {
let (catalog_name, schema_name, table_name) =

View File

@@ -14,15 +14,20 @@
use std::any::Any;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::sync::{Arc, Mutex, RwLock};
use api::v1::region::QueryRequest;
use api::v1::region::{region_request, QueryRequest, RegionResponse};
use api::v1::{ResponseHeader, Status};
use arrow_flight::{FlightData, Ticket};
use async_trait::async_trait;
use bytes::Bytes;
use common_error::ext::BoxedError;
use common_error::status_code::StatusCode;
use common_query::logical_plan::Expr;
use common_query::physical_plan::DfPhysicalPlanAdapter;
use common_query::{DfPhysicalPlan, Output};
use common_recordbatch::SendableRecordBatchStream;
use common_runtime::Runtime;
use common_telemetry::info;
use dashmap::DashMap;
use datafusion::catalog::schema::SchemaProvider;
@@ -33,7 +38,12 @@ use datafusion::execution::context::SessionState;
use datafusion_common::DataFusionError;
use datafusion_expr::{Expr as DfExpr, TableType};
use datatypes::arrow::datatypes::SchemaRef;
use futures_util::future::try_join_all;
use prost::Message;
use query::QueryEngineRef;
use servers::error::{self as servers_error, ExecuteGrpcRequestSnafu, Result as ServerResult};
use servers::grpc::flight::{FlightCraft, FlightRecordBatchStream, TonicStream};
use servers::grpc::region_server::RegionServerHandler;
use session::context::QueryContext;
use snafu::{OptionExt, ResultExt};
use store_api::metadata::RegionMetadataRef;
@@ -42,31 +52,129 @@ use store_api::region_request::RegionRequest;
use store_api::storage::{RegionId, ScanRequest};
use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
use table::table::scan::StreamScanAdapter;
use tonic::{Request, Response, Result as TonicResult};
use crate::error::{
DecodeLogicalPlanSnafu, ExecuteLogicalPlanSnafu, GetRegionMetadataSnafu,
HandleRegionRequestSnafu, RegionEngineNotFoundSnafu, RegionNotFoundSnafu, Result,
UnsupportedOutputSnafu,
BuildRegionRequestsSnafu, DecodeLogicalPlanSnafu, ExecuteLogicalPlanSnafu,
GetRegionMetadataSnafu, HandleRegionRequestSnafu, RegionEngineNotFoundSnafu,
RegionNotFoundSnafu, Result, UnsupportedOutputSnafu,
};
#[derive(Clone)]
pub struct RegionServer {
engines: HashMap<String, RegionEngineRef>,
region_map: DashMap<RegionId, RegionEngineRef>,
query_engine: QueryEngineRef,
inner: Arc<RegionServerInner>,
}
impl RegionServer {
pub fn new(query_engine: QueryEngineRef) -> Self {
pub fn new(query_engine: QueryEngineRef, runtime: Arc<Runtime>) -> Self {
Self {
engines: HashMap::new(),
region_map: DashMap::new(),
query_engine,
inner: Arc::new(RegionServerInner::new(query_engine, runtime)),
}
}
pub fn register_engine(&mut self, engine: RegionEngineRef) {
self.inner.register_engine(engine);
}
pub async fn handle_request(
&self,
region_id: RegionId,
request: RegionRequest,
) -> Result<Output> {
self.inner.handle_request(region_id, request).await
}
pub async fn handle_read(&self, request: QueryRequest) -> Result<SendableRecordBatchStream> {
self.inner.handle_read(request).await
}
}
#[async_trait]
impl RegionServerHandler for RegionServer {
async fn handle(&self, request: region_request::Body) -> ServerResult<RegionResponse> {
let requests = RegionRequest::try_from_request_body(request)
.context(BuildRegionRequestsSnafu)
.map_err(BoxedError::new)
.context(ExecuteGrpcRequestSnafu)?;
let join_tasks = requests.into_iter().map(|(region_id, req)| {
let self_to_move = self.clone();
self.inner
.runtime
.spawn(async move { self_to_move.handle_request(region_id, req).await })
});
let results = try_join_all(join_tasks)
.await
.context(servers_error::JoinTaskSnafu)?;
// merge results by simply sum up affected rows.
// only insert/delete will have multiple results.
let mut affected_rows = 0;
for result in results {
match result
.map_err(BoxedError::new)
.context(servers_error::ExecuteGrpcRequestSnafu)?
{
Output::AffectedRows(rows) => affected_rows += rows,
Output::Stream(_) | Output::RecordBatches(_) => {
// TODO: change the output type to contain only `affected_rows`
unreachable!()
}
}
}
Ok(RegionResponse {
header: Some(ResponseHeader {
status: Some(Status {
status_code: StatusCode::Success as _,
..Default::default()
}),
}),
affected_rows: affected_rows as _,
})
}
}
#[async_trait]
impl FlightCraft for RegionServer {
async fn do_get(
&self,
request: Request<Ticket>,
) -> TonicResult<Response<TonicStream<FlightData>>> {
let ticket = request.into_inner().ticket;
let request = QueryRequest::decode(ticket.as_ref())
.context(servers_error::InvalidFlightTicketSnafu)?;
let result = self.handle_read(request).await?;
let stream = Box::pin(FlightRecordBatchStream::new(result));
Ok(Response::new(stream))
}
}
struct RegionServerInner {
engines: RwLock<HashMap<String, RegionEngineRef>>,
region_map: DashMap<RegionId, RegionEngineRef>,
query_engine: QueryEngineRef,
runtime: Arc<Runtime>,
}
impl RegionServerInner {
pub fn new(query_engine: QueryEngineRef, runtime: Arc<Runtime>) -> Self {
Self {
engines: RwLock::new(HashMap::new()),
region_map: DashMap::new(),
query_engine,
runtime,
}
}
pub fn register_engine(&self, engine: RegionEngineRef) {
let engine_name = engine.name();
self.engines.insert(engine_name.to_string(), engine);
self.engines
.write()
.unwrap()
.insert(engine_name.to_string(), engine);
}
pub async fn handle_request(
@@ -80,7 +188,7 @@ impl RegionServer {
RegionRequest::Create(create) => RegionChange::Register(create.engine.clone()),
RegionRequest::Open(open) => RegionChange::Register(open.engine.clone()),
RegionRequest::Close(_) | RegionRequest::Drop(_) => RegionChange::Deregisters,
RegionRequest::Write(_)
RegionRequest::Put(_)
| RegionRequest::Delete(_)
| RegionRequest::Alter(_)
| RegionRequest::Flush(_)
@@ -90,6 +198,8 @@ impl RegionServer {
let engine = match &region_change {
RegionChange::Register(engine_type) => self
.engines
.read()
.unwrap()
.get(engine_type)
.with_context(|| RegionEngineNotFoundSnafu { name: engine_type })?
.clone(),
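The gRPC handler above splits one region request body into per-region requests, runs them concurrently on the server's runtime, and folds the results into a single affected-row count via `try_join_all`. A reduced sketch of that fan-out/merge step with toy request and output types (tokio and futures-util only):

use futures_util::future::try_join_all;

#[derive(Debug)]
enum Output {
    AffectedRows(usize),
}

async fn handle_one(_region_id: u64, rows: usize) -> Result<Output, String> {
    // Stand-in for handling a single region request.
    Ok(Output::AffectedRows(rows))
}

async fn handle_body(requests: Vec<(u64, usize)>) -> Result<usize, String> {
    // One task per region; a failed spawn aborts the whole call.
    let tasks = requests
        .into_iter()
        .map(|(region_id, rows)| tokio::spawn(handle_one(region_id, rows)));
    let results = try_join_all(tasks).await.map_err(|e| format!("join error: {e}"))?;

    // Merge results by summing affected rows, as in the handler above.
    let mut affected_rows = 0;
    for result in results {
        match result? {
            Output::AffectedRows(rows) => affected_rows += rows,
        }
    }
    Ok(affected_rows)
}

#[tokio::main]
async fn main() {
    let total = handle_body(vec![(1, 2), (2, 3), (3, 5)]).await.unwrap();
    assert_eq!(total, 10);
}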

View File

@@ -31,6 +31,7 @@ use crate::error::{
WaitForGrpcServingSnafu,
};
use crate::instance::InstanceRef;
use crate::region_server::RegionServer;
pub mod grpc;
@@ -42,6 +43,9 @@ pub struct Services {
impl Services {
pub async fn try_new(instance: InstanceRef, opts: &DatanodeOptions) -> Result<Self> {
// TODO(ruihang): remove database service once region server is ready.
let enable_region_server = option_env!("ENABLE_REGION_SERVER").is_some();
let grpc_runtime = Arc::new(
RuntimeBuilder::default()
.worker_threads(opts.rpc_runtime_size)
@@ -50,10 +54,24 @@ impl Services {
.context(RuntimeResourceSnafu)?,
);
let region_server = RegionServer::new(instance.query_engine(), grpc_runtime.clone());
let flight_handler = if enable_region_server {
Some(Arc::new(region_server.clone()) as _)
} else {
None
};
let region_server_handler = if enable_region_server {
Some(Arc::new(region_server.clone()) as _)
} else {
None
};
Ok(Self {
grpc_server: GrpcServer::new(
ServerGrpcQueryHandlerAdaptor::arc(instance),
None,
flight_handler,
region_server_handler,
None,
grpc_runtime,
),

View File

@@ -20,6 +20,7 @@ use api::v1::greptime_request::Request as GrpcRequest;
use api::v1::meta::HeartbeatResponse;
use api::v1::query_request::Query;
use api::v1::QueryRequest;
use catalog::local::MemoryCatalogManager;
use catalog::remote::region_alive_keeper::RegionAliveKeepers;
use catalog::CatalogManagerRef;
use common_meta::heartbeat::handler::{
@@ -160,8 +161,10 @@ async fn test_open_region_handler() {
let table_ident = &region_ident.table_ident;
let table = prepare_table(instance.inner()).await;
let dummy_catalog_manager = MemoryCatalogManager::with_default_setup();
region_alive_keepers
.register_table(table_ident.clone(), table)
.register_table(table_ident.clone(), table, dummy_catalog_manager)
.await
.unwrap();
@@ -173,14 +176,17 @@ async fn test_open_region_handler() {
InstructionReply::OpenRegion(SimpleReply { result: true, .. })
);
let keeper = region_alive_keepers.find_keeper(table_ident).await.unwrap();
let keeper = region_alive_keepers
.find_keeper(table_ident.table_id)
.await
.unwrap();
let deadline = keeper.deadline(0).await.unwrap();
assert!(deadline <= Instant::now() + Duration::from_secs(20));
// Opens a non-exist table
let non_exist_table_ident = TableIdent {
catalog: "greptime".to_string(),
schema: "public".to_string(),
catalog: "foo".to_string(),
schema: "non-exist".to_string(),
table: "non-exist".to_string(),
table_id: 2024,
engine: "mito".to_string(),
@@ -203,7 +209,7 @@ async fn test_open_region_handler() {
);
assert!(region_alive_keepers
.find_keeper(non_exist_table_ident.table_id)
.await
.is_none());
@@ -222,7 +228,7 @@ async fn test_open_region_handler() {
assert_test_table_not_found(instance.inner()).await;
assert!(region_alive_keepers
.find_keeper(table_ident.table_id)
.await
.is_none());

View File

@@ -115,6 +115,9 @@ pub enum Error {
#[snafu(display("Column {} already exists", column))]
DuplicateColumn { column: String, location: Location },
#[snafu(display("Failed to unpack value to given type: {}", reason))]
TryFromValue { reason: String, location: Location },
}
impl ErrorExt for Error {

View File

@@ -32,7 +32,7 @@ use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use crate::error;
use crate::error::{Error, Result, TryFromValueSnafu};
use crate::prelude::*;
use crate::type_id::LogicalTypeId;
use crate::types::{IntervalType, ListType};
@@ -441,6 +441,62 @@ impl Ord for Value {
}
}
macro_rules! impl_try_from_value {
($Variant: ident, $Type: ident) => {
impl TryFrom<Value> for $Type {
type Error = Error;
#[inline]
fn try_from(from: Value) -> std::result::Result<Self, Self::Error> {
match from {
Value::$Variant(v) => Ok(v.into()),
_ => TryFromValueSnafu {
reason: format!("{:?} is not a {}", from, stringify!($Type)),
}
.fail(),
}
}
}
impl TryFrom<Value> for Option<$Type> {
type Error = Error;
#[inline]
fn try_from(from: Value) -> std::result::Result<Self, Self::Error> {
match from {
Value::$Variant(v) => Ok(Some(v.into())),
Value::Null => Ok(None),
_ => TryFromValueSnafu {
reason: format!("{:?} is not a {}", from, stringify!($Type)),
}
.fail(),
}
}
}
};
}
impl_try_from_value!(Boolean, bool);
impl_try_from_value!(UInt8, u8);
impl_try_from_value!(UInt16, u16);
impl_try_from_value!(UInt32, u32);
impl_try_from_value!(UInt64, u64);
impl_try_from_value!(Int8, i8);
impl_try_from_value!(Int16, i16);
impl_try_from_value!(Int32, i32);
impl_try_from_value!(Int64, i64);
impl_try_from_value!(Float32, f32);
impl_try_from_value!(Float64, f64);
impl_try_from_value!(Float32, OrderedF32);
impl_try_from_value!(Float64, OrderedF64);
impl_try_from_value!(String, StringBytes);
impl_try_from_value!(Binary, Bytes);
impl_try_from_value!(Date, Date);
impl_try_from_value!(Time, Time);
impl_try_from_value!(DateTime, DateTime);
impl_try_from_value!(Timestamp, Timestamp);
impl_try_from_value!(Interval, Interval);
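// Illustrative sketch (not part of this change): how the conversions generated by
// `impl_try_from_value!` above are expected to behave. The `Option<T>` impl maps
// `Value::Null` to `None`, while a mismatched variant fails with `TryFromValue`.
fn _try_from_value_examples() {
    // A matching variant converts directly.
    let forty_two = i32::try_from(Value::Int32(42)).unwrap();
    assert_eq!(forty_two, 42);
    // `Value::Null` becomes `None` for the `Option` impls.
    let none = Option::<i64>::try_from(Value::Null).unwrap();
    assert!(none.is_none());
    // A mismatched variant produces a `TryFromValue` error.
    assert!(u8::try_from(Value::Boolean(true)).is_err());
}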
macro_rules! impl_value_from {
($Variant: ident, $Type: ident) => {
impl From<$Type> for Value {
@@ -471,6 +527,8 @@ impl_value_from!(Int32, i32);
impl_value_from!(Int64, i64);
impl_value_from!(Float32, f32);
impl_value_from!(Float64, f64);
impl_value_from!(Float32, OrderedF32);
impl_value_from!(Float64, OrderedF64);
impl_value_from!(String, StringBytes);
impl_value_from!(Binary, Bytes);
impl_value_from!(Date, Date);

View File

@@ -39,7 +39,8 @@ impl BooleanVector {
&self.array
}
/// Get the inner boolean array.
pub fn as_boolean_array(&self) -> &BooleanArray {
&self.array
}

View File

@@ -230,7 +230,8 @@ impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
}
}
/// Get the inner arrow array.
pub fn as_arrow(&self) -> &PrimitiveArray<T::ArrowPrimitive> {
&self.array
}
@@ -245,7 +246,11 @@ impl<T: LogicalPrimitiveType> PrimitiveVector<T> {
}
// To distinguish with `Vector::slice()`.
/// Slice the batch, returning a new batch.
///
/// # Panics
/// This function panics if `offset + length > self.len()`.
pub fn get_slice(&self, offset: usize, length: usize) -> Self {
let data = self.array.to_data().slice(offset, length);
Self::from_array_data(data)
}
@@ -295,8 +300,7 @@ impl<T: LogicalPrimitiveType> Vector for PrimitiveVector<T> {
}
fn slice(&self, offset: usize, length: usize) -> VectorRef {
Arc::new(self.get_slice(offset, length))
}
fn get(&self, index: usize) -> Value {

src/flow/Cargo.toml Normal file
View File

@@ -0,0 +1,25 @@
[package]
name = "flow"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# use versions from crates.io for now to prevent version skew
# disable default features (they include `abomonation`, which we don't need for IPC)
# timely = {version = "0.12.0", default-features = false, features = ["bincode"]}
# differential-dataflow = "0.12.0"
# timely = "0.12.0"
# differential-dataflow = "0.12.0"
# TODO(discord9): fork later for fixed version git dependency
timely = { git = "https://github.com/TimelyDataflow/timely-dataflow", default-features = false, features = [
"bincode",
] }
differential-dataflow = { git = "https://github.com/TimelyDataflow/differential-dataflow" } #, rev = "99fa67db" }
datafusion-expr.workspace = true
datafusion-substrait.workspace = true
serde = { version = "1.0", features = ["derive"] }
datatypes = { path = "../datatypes" }
common-telemetry = { path = "../common/telemetry" }

View File

@@ -0,0 +1,3 @@
//! For getting data from sources and sending results to sinks,
//! communicating with other parts of the database,
//! and commanding the storage and computation layers.

View File

@@ -0,0 +1,22 @@
use std::collections::BTreeMap;
use crate::expr::GlobalId;
/// Worker-local state that is maintained across dataflows.
///
/// This state is restricted to the COMPUTE state, the deterministic, idempotent work
/// done between data ingress and egress.
pub struct ComputeState {
/// State kept for each installed compute collection.
///
/// Each collection has exactly one frontier.
/// How the frontier is communicated depends on the collection type:
/// * Frontiers of indexes are equal to the frontier of their corresponding traces in the
/// `TraceManager`.
/// * Persist sinks store their current frontier in `CollectionState::sink_write_frontier`.
/// * Subscribes report their frontiers through the `subscribe_response_buffer`.
pub collections: BTreeMap<GlobalId, CollectionState>,
}
/// State maintained for a compute collection.
pub struct CollectionState {}

View File

@@ -0,0 +1,743 @@
use std::collections::BTreeMap;
use differential_dataflow::lattice::Lattice;
use differential_dataflow::operators::arrange::Arranged;
use differential_dataflow::trace::wrappers::enter::TraceEnter;
use differential_dataflow::trace::wrappers::frontier::TraceFrontier;
use differential_dataflow::trace::{BatchReader, Cursor, TraceReader};
use differential_dataflow::{Collection, Data};
use timely::communication::message::RefOrMut;
use timely::dataflow::operators::generic::OutputHandle;
use timely::dataflow::operators::Capability;
use timely::dataflow::scopes::Child;
use timely::dataflow::{Scope, ScopeParent};
use timely::progress::timestamp::Refines;
use timely::progress::{Antichain, Timestamp};
use super::plan::Plan;
use super::types::DataflowDescription;
use crate::compute::render::RenderTimestamp;
use crate::compute::typedefs::{TraceErrHandle, TraceRowHandle};
use crate::expr::{GlobalId, Id, MapFilterProject, ScalarExpr};
use crate::repr;
use crate::repr::{Diff, Row};
use crate::storage::errors::DataflowError;
// Local type definition to avoid the horror in signatures.
pub(crate) type KeyArrangement<S, K, V> =
Arranged<S, TraceRowHandle<K, V, <S as ScopeParent>::Timestamp, Diff>>;
pub(crate) type Arrangement<S, V> = KeyArrangement<S, V, V>;
pub(crate) type ErrArrangement<S> =
Arranged<S, TraceErrHandle<DataflowError, <S as ScopeParent>::Timestamp, Diff>>;
pub(crate) type ArrangementImport<S, V, T> = Arranged<
S,
TraceEnter<TraceFrontier<TraceRowHandle<V, V, T, Diff>>, <S as ScopeParent>::Timestamp>,
>;
pub(crate) type ErrArrangementImport<S, T> = Arranged<
S,
TraceEnter<
TraceFrontier<TraceErrHandle<DataflowError, T, Diff>>,
<S as ScopeParent>::Timestamp,
>,
>;
/// Describes flavor of arrangement: local or imported trace.
#[derive(Clone)]
pub enum ArrangementFlavor<S: Scope, V: Data, T = repr::Timestamp>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// A dataflow-local arrangement.
Local(Arrangement<S, V>, ErrArrangement<S>),
/// An imported trace from outside the dataflow.
///
/// The `GlobalId` identifier exists so that exports of this same trace
/// can refer back to and depend on the original instance.
Trace(
GlobalId,
ArrangementImport<S, V, T>,
ErrArrangementImport<S, T>,
),
}
impl<S: Scope, T> ArrangementFlavor<S, Row, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Presents `self` as a stream of updates.
///
/// This method presents the contents as they are, without further computation.
/// If you have logic that could be applied to each record, consider using the
/// `flat_map` method, which allows this and can reduce the work done.
pub fn as_collection(&self) -> (Collection<S, Row, Diff>, Collection<S, DataflowError, Diff>) {
match &self {
ArrangementFlavor::Local(oks, errs) => (
oks.as_collection(move |k: &Row, v: &Row| {
// type annotated because rust-analyzer can't infer the type of these complex closures;
// see https://github.com/rust-lang/rust-analyzer/issues/6338
let mut k = k.clone();
k.extend(v.clone().into_iter());
k
}),
errs.as_collection(|k, &()| k.clone()),
),
ArrangementFlavor::Trace(_, oks, errs) => (
oks.as_collection(move |k, v| {
let mut k = k.clone();
k.extend(v.clone().into_iter());
k
}),
errs.as_collection(|k, &()| k.clone()),
),
}
}
/// Constructs and applies logic to elements of `self` and returns the results.
///
/// `constructor` takes a permutation and produces the logic to apply on elements. The logic
/// conceptually receives `(&Row, &Row)` pairs in the form of a slice. Only after borrowing
/// the elements and applying the permutation will the datums be in the expected order.
///
/// If `key` is set, this is a promise that `logic` will produce no results on
/// records for which the key does not evaluate to the value. This is used to
/// leap directly to exactly those records.
pub fn flat_map<I, C, L>(
&self,
key: Option<Row>,
constructor: C,
) -> (
timely::dataflow::Stream<S, I::Item>,
Collection<S, DataflowError, Diff>,
)
where
I: IntoIterator,
I::Item: Data,
C: FnOnce() -> L,
L: for<'a, 'b> FnMut(&'a [&'b RefOrMut<'b, Row>], &'a S::Timestamp, &'a Diff) -> I
+ 'static,
{
// Set a number of tuples after which the operator should yield.
// This allows us to remain responsive even when enumerating a substantial
// arrangement, as well as provides time to accumulate our produced output.
let refuel = 1000000;
match &self {
ArrangementFlavor::Local(oks, errs) => {
let mut logic = constructor();
let oks = CollectionBundle::<S, Row, T>::flat_map_core(
oks,
key,
move |k, v, t, d| logic(&[&k, &v], t, d),
refuel,
);
let errs = errs.as_collection(|k, &()| k.clone());
(oks, errs)
}
ArrangementFlavor::Trace(_, oks, errs) => {
let mut logic = constructor();
let oks = CollectionBundle::<S, Row, T>::flat_map_core(
oks,
key,
move |k, v, t, d| logic(&[&k, &v], t, d),
refuel,
);
let errs = errs.as_collection(|k, &()| k.clone());
(oks, errs)
}
}
}
}
impl<S: Scope, V: Data, T> ArrangementFlavor<S, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
pub fn scope(&self) -> S {
match self {
ArrangementFlavor::Local(oks, _errs) => oks.stream.scope(),
ArrangementFlavor::Trace(_gid, oks, _errs) => oks.stream.scope(),
}
}
/// Brings the arrangement flavor into a region.
pub fn enter_region<'a>(
&self,
region: &Child<'a, S, S::Timestamp>,
) -> ArrangementFlavor<Child<'a, S, S::Timestamp>, V, T> {
match self {
ArrangementFlavor::Local(oks, errs) => {
ArrangementFlavor::Local(oks.enter_region(region), errs.enter_region(region))
}
ArrangementFlavor::Trace(gid, oks, errs) => {
ArrangementFlavor::Trace(*gid, oks.enter_region(region), errs.enter_region(region))
}
}
}
}
impl<'a, S: Scope, V: Data, T> ArrangementFlavor<Child<'a, S, S::Timestamp>, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Extracts the arrangement flavor from a region.
pub fn leave_region(&self) -> ArrangementFlavor<S, V, T> {
match self {
ArrangementFlavor::Local(oks, errs) => {
ArrangementFlavor::Local(oks.leave_region(), errs.leave_region())
}
ArrangementFlavor::Trace(gid, oks, errs) => {
ArrangementFlavor::Trace(*gid, oks.leave_region(), errs.leave_region())
}
}
}
}
pub struct Context<S, V: Data, T = repr::Timestamp>
where
T: Timestamp + Lattice,
S: Scope,
S::Timestamp: Lattice + Refines<T>,
{
/// The scope within which all managed collections exist.
///
/// It is an error to add any collections not contained in this scope.
pub(crate) scope: S,
/// The debug name of the dataflow associated with this context.
pub debug_name: String,
/// The Timely ID of the dataflow associated with this context.
pub dataflow_id: usize,
/// Frontier before which updates should not be emitted.
///
/// We *must* apply it to sinks, to ensure correct outputs.
/// We *should* apply it to sources and imported traces, because it improves performance.
pub since_frontier: Antichain<T>,
/// Frontier after which updates should not be emitted.
/// Used to limit the amount of work done when appropriate.
pub until_frontier: Antichain<T>,
/// Bindings of identifiers to collections.
pub bindings: BTreeMap<Id, CollectionBundle<S, V, T>>,
}
impl<S: Scope, V: Data> Context<S, V>
where
S::Timestamp: Lattice + Refines<repr::Timestamp>,
{
/// TODO(discord9): DataflowDesc & Plan & etc.
/// Creates a new empty Context from given dataflow
pub fn for_dataflow_in<Plan>(dataflow: &DataflowDescription<Plan, ()>, scope: S) -> Self {
let dataflow_id = scope.addr()[0];
let since_frontier = dataflow
.as_of
.clone()
.unwrap_or_else(|| Antichain::from_elem(Timestamp::minimum()));
// TODO(discord9): get since_frontier and until_frontier from dataflow_desc
Self {
scope,
debug_name: dataflow.debug_name.clone(),
dataflow_id,
since_frontier,
until_frontier: dataflow.until.clone(),
bindings: BTreeMap::new(),
}
}
}
impl<S: Scope, V: Data, T: Lattice> Context<S, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Insert a collection bundle by an identifier.
///
/// This is expected to be used to install external collections (sources, indexes, other views),
/// as well as for `Let` bindings of local collections.
pub fn insert_id(
&mut self,
id: Id,
collection: CollectionBundle<S, V, T>,
) -> Option<CollectionBundle<S, V, T>> {
self.bindings.insert(id, collection)
}
/// Remove a collection bundle by an identifier.
///
/// The primary use of this method is uninstalling `Let` bindings.
pub fn remove_id(&mut self, id: Id) -> Option<CollectionBundle<S, V, T>> {
self.bindings.remove(&id)
}
/// Melds a collection bundle to whatever exists.
#[allow(clippy::map_entry)]
pub fn update_id(&mut self, id: Id, collection: CollectionBundle<S, V, T>) {
if !self.bindings.contains_key(&id) {
self.bindings.insert(id, collection);
} else {
let binding = self
.bindings
.get_mut(&id)
.expect("Binding verified to exist");
if collection.collection.is_some() {
binding.collection = collection.collection;
}
for (key, flavor) in collection.arranged.into_iter() {
binding.arranged.insert(key, flavor);
}
}
}
/// Look up a collection bundle by an identifier.
pub fn lookup_id(&self, id: Id) -> Option<CollectionBundle<S, V, T>> {
self.bindings.get(&id).cloned()
}
}
type ResultCollection<S, V> = (Collection<S, V, Diff>, Collection<S, DataflowError, Diff>);
/// A bundle of the various ways a collection can be represented.
///
/// This type maintains the invariant that it does contain at least one valid
/// source of data, either a collection or at least one arrangement.
#[derive(Clone)]
pub struct CollectionBundle<S, V, T = repr::Timestamp>
where
T: Timestamp + Lattice,
S: Scope,
S::Timestamp: Lattice + Refines<T>,
V: Data,
{
pub(crate) collection: Option<ResultCollection<S, V>>,
/// TODO(discord9): impl: 1. ScalarExpr(Could be from substrait), 2. Arrangement
pub(crate) arranged: BTreeMap<Vec<ScalarExpr>, ArrangementFlavor<S, V, T>>,
}
impl<S: Scope, V: Data, T: Lattice> CollectionBundle<S, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Construct a new collection bundle from update streams.
pub fn from_collections(
oks: Collection<S, V, Diff>,
errs: Collection<S, DataflowError, Diff>,
) -> Self {
Self {
collection: Some((oks, errs)),
arranged: BTreeMap::default(),
}
}
/// Inserts arrangements by the expressions on which they are keyed.
pub fn from_expressions(
exprs: Vec<ScalarExpr>,
arrangements: ArrangementFlavor<S, V, T>,
) -> Self {
let mut arranged = BTreeMap::new();
arranged.insert(exprs, arrangements);
Self {
collection: None,
arranged,
}
}
/// Inserts arrangements by the columns on which they are keyed.
pub fn from_columns<I: IntoIterator<Item = usize>>(
columns: I,
arrangements: ArrangementFlavor<S, V, T>,
) -> Self {
let mut keys = Vec::new();
for column in columns {
keys.push(ScalarExpr::Column(column));
}
Self::from_expressions(keys, arrangements)
}
/// The scope containing the collection bundle.
pub fn scope(&self) -> S {
if let Some((oks, _errs)) = &self.collection {
oks.inner.scope()
} else {
self.arranged
.values()
.next()
.expect("Must contain a valid collection")
.scope()
}
}
/// Brings the collection bundle into a region.
pub fn enter_region<'a>(
&self,
region: &Child<'a, S, S::Timestamp>,
) -> CollectionBundle<Child<'a, S, S::Timestamp>, V, T> {
CollectionBundle {
collection: self
.collection
.as_ref()
.map(|(oks, errs)| (oks.enter_region(region), errs.enter_region(region))),
arranged: self
.arranged
.iter()
.map(|(key, bundle)| (key.clone(), bundle.enter_region(region)))
.collect(),
}
}
}
impl<S, T> CollectionBundle<S, repr::Row, T>
where
T: timely::progress::Timestamp + Lattice,
S: Scope,
S::Timestamp: Refines<T> + Lattice + RenderTimestamp,
{
/// Presents `self` as a stream of updates, having been subjected to `mfp`.
///
/// This operator is able to apply the logic of `mfp` early, which can substantially
/// reduce the amount of data produced when `mfp` is non-trivial.
///
/// The `key_val` argument, when present, indicates that a specific arrangement should
/// be used, and if, in addition, the `val` component is present,
/// that we can seek to the supplied row.
pub fn as_collection_core(
&self,
mut mfp: MapFilterProject,
key_val: Option<(Vec<ScalarExpr>, Option<Row>)>,
until: Antichain<repr::Timestamp>,
) -> (Collection<S, Row, Diff>, Collection<S, DataflowError, Diff>) {
mfp.optimize();
let mfp_plan = mfp.into_plan().unwrap();
// If the MFP is trivial, we can just call `as_collection`.
// In the case that we weren't going to apply the `key_val` optimization,
// this path results in a slightly smaller and faster
// dataflow graph.
let has_key_val = matches!(&key_val, Some((_key, Some(_val))));
if mfp_plan.is_identity() && !has_key_val {
let key = key_val.map(|(k, _v)| k);
return self.as_specific_collection(key.as_deref());
}
let (stream, errors) = self.flat_map(key_val, || {
let until = std::rc::Rc::new(until);
// this logic gets executed every time a new row arrives
move |row_parts, time, diff| {
let until = std::rc::Rc::clone(&until);
let row_iters = row_parts
.iter()
.flat_map(|row| (**row).to_owned().into_iter());
let mut datums_local = Vec::new();
datums_local.extend(row_iters);
let time = time.clone();
let event_time: repr::Timestamp = *time.clone().event_time();
mfp_plan
.evaluate::<DataflowError, _>(
&mut datums_local,
event_time,
*diff,
move |time| !until.less_equal(time),
)
.map(move |x| match x {
Ok((row, event_time, diff)) => {
// Copy the whole time, and re-populate event time.
let mut time: S::Timestamp = time.clone();
*time.event_time() = event_time;
Ok((row, time, diff))
}
Err((e, event_time, diff)) => {
// Copy the whole time, and re-populate event time.
let mut time: S::Timestamp = time.clone();
*time.event_time() = event_time;
Err((e, time, diff))
}
})
}
});
use timely::dataflow::operators::ok_err::OkErr;
let (oks, errs) = stream.ok_err(|x| x);
use differential_dataflow::AsCollection;
let oks = oks.as_collection();
let errs = errs.as_collection();
(oks, errors.concat(&errs))
}
}
impl<'a, S: Scope, V: Data, T> CollectionBundle<Child<'a, S, S::Timestamp>, V, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Extracts the collection bundle from a region.
pub fn leave_region(&self) -> CollectionBundle<S, V, T> {
CollectionBundle {
collection: self
.collection
.as_ref()
.map(|(oks, errs)| (oks.leave_region(), errs.leave_region())),
arranged: self
.arranged
.iter()
.map(|(key, bundle)| (key.clone(), bundle.leave_region()))
.collect(),
}
}
}
impl<S: Scope, T: Lattice> CollectionBundle<S, Row, T>
where
T: Timestamp + Lattice,
S::Timestamp: Lattice + Refines<T>,
{
/// Asserts that the arrangement for a specific key
/// (or the raw collection for no key) exists,
/// and returns the corresponding collection.
///
/// This returns the collection as-is, without
/// doing any unthinning transformation.
/// Therefore, it should be used when the appropriate transformation
/// was planned as part of a following MFP.
pub fn as_specific_collection(
&self,
key: Option<&[ScalarExpr]>,
) -> (Collection<S, Row, Diff>, Collection<S, DataflowError, Diff>) {
// Any operator that uses this method was told to use a particular
// collection during LIR planning, where we should have made
// sure that that collection exists.
//
// If it doesn't, we panic.
match key {
None => self
.collection
.clone()
.expect("The unarranged collection doesn't exist."),
Some(key) => self
.arranged
.get(key)
.unwrap_or_else(|| panic!("The collection arranged by {:?} doesn't exist.", key))
.as_collection(),
}
}
/// Constructs and applies logic to elements of a collection and returns the results.
///
/// `constructor` takes a permutation and produces the logic to apply on elements. The logic
/// conceptually receives `(&Row, &Row)` pairs in the form of a slice. Only after borrowing
/// the elements and applying the permutation will the datums be in the expected order.
///
/// If `key_val` is set, this is a promise that `logic` will produce no results on
/// records for which the key does not evaluate to the value. This is used when we
/// have an arrangement by that key to leap directly to exactly those records.
/// It is important that `logic` still guard against data that does not satisfy
/// this constraint, as this method does not statically know that it will have
/// that arrangement.
pub fn flat_map<I, C, L>(
&self,
key_val: Option<(Vec<ScalarExpr>, Option<Row>)>,
constructor: C,
) -> (
timely::dataflow::Stream<S, I::Item>,
Collection<S, DataflowError, Diff>,
)
where
I: IntoIterator,
I::Item: Data,
C: FnOnce() -> L,
L: for<'a, 'b> FnMut(&'a [&'b RefOrMut<'b, Row>], &'a S::Timestamp, &'a Diff) -> I
+ 'static,
{
// If `key_val` is set, we should use the corresponding arrangement.
// If there isn't one, that implies an error in the contract between
// key-production and available arrangements.
if let Some((key, val)) = key_val {
let flavor = self
.arrangement(&key)
.expect("Should have ensured during planning that this arrangement exists.");
flavor.flat_map(val, constructor)
} else {
use timely::dataflow::operators::Map;
let (oks, errs) = self
.collection
.clone()
.expect("Invariant violated: CollectionBundle contains no collection.");
let mut logic = constructor();
(
oks.inner
.flat_map(move |(mut v, t, d)| logic(&[&RefOrMut::Mut(&mut v)], &t, &d)),
errs,
)
}
}
/// Factored out common logic for using literal keys in general traces.
///
/// This logic is sufficiently interesting that we want to write it only
/// once, and thereby avoid any skew in the two uses of the logic.
///
/// The function presents the contents of the trace as `(key, value, time, delta)` tuples,
/// where key and value are rows.
fn flat_map_core<Tr, I, L>(
trace: &Arranged<S, Tr>,
key: Option<Row>,
mut logic: L,
refuel: usize,
) -> timely::dataflow::Stream<S, I::Item>
where
Tr: TraceReader<Key = Row, Val = Row, Time = S::Timestamp, R = repr::Diff>
+ Clone
+ 'static,
I: IntoIterator,
I::Item: Data,
L: for<'a, 'b> FnMut(
RefOrMut<'b, Row>,
RefOrMut<'b, Row>,
&'a S::Timestamp,
&'a repr::Diff,
) -> I
+ 'static,
{
let mode = if key.is_some() { "index" } else { "scan" };
let name = format!("ArrangementFlatMap({})", mode);
use timely::dataflow::channels::pact::Pipeline;
use timely::dataflow::operators::Operator;
trace.stream.unary(Pipeline, &name, move |_, info| {
// Acquire an activator to reschedule the operator when it has unfinished work.
use timely::scheduling::Activator;
let activations = trace.stream.scope().activations();
let activator = Activator::new(&info.address[..], activations);
// Maintain a list of work to do, cursor to navigate and process.
let mut todo = std::collections::VecDeque::new();
move |input, output| {
// First, dequeue all batches.
input.for_each(|time, data| {
let capability = time.retain();
for batch in data.iter() {
// enqueue a capability, cursor, and batch.
todo.push_back(PendingWork::new(
capability.clone(),
batch.cursor(),
batch.clone(),
));
}
});
// Second, make progress on `todo`.
let mut fuel = refuel;
while !todo.is_empty() && fuel > 0 {
todo.front_mut()
.unwrap()
.do_work(&key, &mut logic, &mut fuel, output);
if fuel > 0 {
todo.pop_front();
}
}
// If we have not finished all work, re-activate the operator.
if !todo.is_empty() {
activator.activate();
}
}
})
}
/// Look up an arrangement by the expressions that form the key.
///
/// The result may be `None` if no such arrangement exists, or it may be one of many
/// "arrangement flavors" that represent the types of arranged data we might have.
pub fn arrangement(&self, key: &[ScalarExpr]) -> Option<ArrangementFlavor<S, Row, T>> {
self.arranged.get(key).cloned()
}
}
struct PendingWork<C>
where
C: Cursor,
C::Time: Timestamp,
{
capability: Capability<C::Time>,
cursor: C,
batch: C::Storage,
}
/// Handle specialized to `Vec`-based container.
type PendingOutputHandle<'a, C, I> = OutputHandle<
'a,
<C as Cursor>::Time,
<I as IntoIterator>::Item,
timely::dataflow::channels::pushers::Tee<<C as Cursor>::Time, <I as IntoIterator>::Item>,
>;
impl<C: Cursor> PendingWork<C>
where
C::Key: PartialEq,
C::Time: Timestamp,
{
/// Create a new bundle of pending work, from the capability, cursor, and backing storage.
fn new(capability: Capability<C::Time>, cursor: C, batch: C::Storage) -> Self {
Self {
capability,
cursor,
batch,
}
}
/// Perform roughly `fuel` work through the cursor, applying `logic` and sending results to `output`.
fn do_work<I, L>(
&mut self,
key: &Option<C::Key>,
logic: &mut L,
fuel: &mut usize,
output: &mut PendingOutputHandle<'_, C, I>,
) where
I: IntoIterator,
I::Item: Data,
L: for<'a, 'b> FnMut(
RefOrMut<'b, C::Key>,
RefOrMut<'b, C::Val>,
&'a C::Time,
&'a C::R,
) -> I
+ 'static,
{
// Attempt to make progress on this batch.
let mut work: usize = 0;
let mut session = output.session(&self.capability);
if let Some(key) = key {
if self.cursor.get_key(&self.batch) != Some(key) {
self.cursor.seek_key(&self.batch, key);
}
if self.cursor.get_key(&self.batch) == Some(key) {
while let Some(val) = self.cursor.get_val(&self.batch) {
self.cursor.map_times(&self.batch, |time, diff| {
for datum in logic(RefOrMut::Ref(key), RefOrMut::Ref(val), time, diff) {
session.give(datum);
work += 1;
}
});
self.cursor.step_val(&self.batch);
if work >= *fuel {
*fuel = 0;
return;
}
}
}
} else {
while let Some(key) = self.cursor.get_key(&self.batch) {
while let Some(val) = self.cursor.get_val(&self.batch) {
self.cursor.map_times(&self.batch, |time, diff| {
for datum in logic(RefOrMut::Ref(key), RefOrMut::Ref(val), time, diff) {
session.give(datum);
work += 1;
}
});
self.cursor.step_val(&self.batch);
if work >= *fuel {
*fuel = 0;
return;
}
}
self.cursor.step_key(&self.batch);
}
}
*fuel -= work;
}
}

View File

@@ -0,0 +1,15 @@
//! For generating a dataflow from a logical plan and computing that dataflow.
mod compute_state;
mod context;
mod plan;
mod render;
mod typedefs;
mod types;
pub use context::Context;
// TODO(discord9): make a simplified version of source/sink
// sink: simply get rows out of sinked collection/err collection and put it somewhere
// (R, T, D) row of course with since/until frontier to limit
// source: simply insert stuff into it

View File

@@ -0,0 +1,10 @@
use serde::{Deserialize, Serialize};
/// A delta query is implemented by a set of paths, one for each input.
///
/// Each delta query path responds to its input changes by repeated lookups
/// in arrangements for other join inputs. These lookups require specific
/// instructions about which expressions to use as keys. Along the way,
/// various closures are applied to filter and project as early as possible.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct DeltaJoinPlan {}

View File

@@ -0,0 +1,9 @@
use serde::{Deserialize, Serialize};
/// TODO(discord9): impl Join
/// A plan for the execution of a linear join.
///
/// A linear join is a sequence of stages, each of which introduces
/// a new collection. Each stage is represented by a [LinearStagePlan].
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct LinearJoinPlan {}

View File

@@ -0,0 +1,15 @@
use serde::{Deserialize, Serialize};
mod delta_join;
mod linear_join;
pub use delta_join::DeltaJoinPlan;
pub use linear_join::LinearJoinPlan;
/// TODO(discord9): impl Join
/// A complete enumeration of possible join plans to render.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum JoinPlan {
/// A join implemented by a linear join.
Linear(LinearJoinPlan),
/// A join implemented by a delta join.
Delta(DeltaJoinPlan),
}

View File

@@ -0,0 +1,222 @@
mod join;
mod reduce;
use std::collections::BTreeMap;
use join::JoinPlan;
pub(crate) use reduce::{
convert_indexes_to_skips, AccumulablePlan, BucketedPlan, KeyValPlan, ReducePlan,
};
use serde::{Deserialize, Serialize};
use crate::expr::{Id, LocalId, MapFilterProject, ScalarExpr, TableFunc};
use crate::repr::{self, Diff, Row};
use crate::storage::errors::EvalError;
/// The forms in which an operator's output is available;
/// it can be considered the plan-time equivalent of
/// `render::context::CollectionBundle`.
///
/// These forms are either "raw", representing an unarranged collection,
/// or "arranged", representing one that has been arranged by some key.
///
/// The raw collection, if it exists, may be consumed directly.
///
/// The arranged collections are slightly more complicated:
/// Each key here is attached to a description of how the corresponding
/// arrangement is permuted to remove value columns
/// that are redundant with key columns. Thus, the first element in each
/// tuple of `arranged` is the arrangement key; the second is the map of
/// logical output columns to columns in the key or value of the deduplicated
/// representation, and the third is a "thinning expression",
/// or list of columns to include in the value
/// when arranging.
///
/// For example, assume a 5-column collection is to be arranged by the key
/// `[Column(2), Column(0) + Column(3), Column(1)]`.
/// Then `Column(1)` and `Column(2)` in the value are redundant with the key, and
/// only columns 0, 3, and 4 need to be stored separately.
/// The thinning expression will then be `[0, 3, 4]`.
///
/// The permutation represents how to recover the
/// original values (logically `[Column(0), Column(1), Column(2), Column(3), Column(4)]`)
/// from the key and value of the arrangement, logically
/// `[Column(2), Column(0) + Column(3), Column(1), Column(0), Column(3), Column(4)]`.
/// Thus, the permutation in this case should be `{0: 3, 1: 2, 2: 0, 3: 4, 4: 5}`.
///
/// Note that this description, while true at the time of writing, is merely illustrative;
/// users of this struct should not rely on the exact strategy used for generating
/// the permutations. As long as clients apply the thinning expression
/// when creating arrangements, and permute by the hashmap when reading them,
/// the contract of the function where they are generated (`expr::permutation_for_arrangement`)
/// ensures that the correct values will be read.
#[derive(Default, Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub struct AvailableCollections {
/// Whether the collection exists in unarranged form.
pub raw: bool,
/// The set of arrangements of the collection, along with a
/// column permutation mapping
pub arranged: Vec<KeyWithColumnPermutation>,
}
pub type KeyWithColumnPermutation = (Vec<ScalarExpr>, BTreeMap<usize, usize>, Vec<usize>);
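// Illustrative sketch (not part of this change): the 5-column example from the comment
// above expressed as a `KeyWithColumnPermutation`. `BinaryFunc::AddInt64` is assumed to
// exist for the `Column(0) + Column(3)` key expression; the permutation and thinning
// values are taken verbatim from the doc comment.
fn _example_key_with_permutation() -> KeyWithColumnPermutation {
    let key = vec![
        ScalarExpr::Column(2),
        ScalarExpr::CallBinary {
            func: crate::expr::BinaryFunc::AddInt64,
            expr1: Box::new(ScalarExpr::Column(0)),
            expr2: Box::new(ScalarExpr::Column(3)),
        },
        ScalarExpr::Column(1),
    ];
    // How to recover logical columns 0..5 from the concatenated (key, thinned value).
    let permutation: BTreeMap<usize, usize> =
        [(0, 3), (1, 2), (2, 0), (3, 4), (4, 5)].into_iter().collect();
    // Only columns 0, 3 and 4 need to be stored in the value.
    let thinning = vec![0, 3, 4];
    (key, permutation, thinning)
}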
impl AvailableCollections {
/// Represent a collection that has no arrangements.
pub fn new_raw() -> Self {
Self {
raw: true,
arranged: vec![],
}
}
/// Represent a collection that is arranged in the
/// specified ways.
pub fn new_arranged(arranged: Vec<KeyWithColumnPermutation>) -> Self {
assert!(
!arranged.is_empty(),
"Invariant violated: at least one collection must exist"
);
Self {
raw: false,
arranged,
}
}
}
/// Rendering Plan
///
/// TODO(discord9): see if we ever need to support recursive plans
#[derive(Debug, Clone)]
pub enum Plan<T = repr::Timestamp> {
/// A collection containing a pre-determined collection.
Constant {
rows: Result<Vec<(Row, T, Diff)>, EvalError>,
},
/// A reference to a bound collection.
///
/// This is commonly either an external reference to an existing source or
/// maintained arrangement, or an internal reference to a `Let` identifier.
Get {
id: Id,
keys: AvailableCollections,
plan: GetPlan,
},
/// Binds `value` to `id`, and then results in `body` with that binding.
///
/// This stage has the effect of sharing `value` across multiple possible
/// uses in `body`, and is the only mechanism we have for sharing collection
/// information across parts of a dataflow.
///
/// The binding is not available outside of `body`.
Let {
/// The local identifier to be used, available to `body` as `Id::Local(id)`.
id: LocalId,
/// The collection that should be bound to `id`.
value: Box<Plan<T>>,
/// The collection that results, which is allowed to contain `Get` stages
/// that reference `Id::Local(id)`.
body: Box<Plan<T>>,
},
/// Map, Filter, and Project operators.
///
/// This stage contains work that we would ideally like to fuse to other plan
/// stages, but for practical reasons cannot. For example: reduce, threshold,
/// and topk stages are not able to absorb this operator.
Mfp {
/// The input collection.
input: Box<Plan<T>>,
/// Linear operator to apply to each record.
mfp: MapFilterProject,
/// Whether the input is from an arrangement, and if so,
/// whether we can seek to a specific value therein
input_key_val: Option<(Vec<ScalarExpr>, Option<Row>)>,
},
/// A variable number of output records for each input record.
///
/// This stage is a bit of a catch-all for logic that does not easily fit in
/// map stages. This includes table valued functions, but also functions of
/// multiple arguments, and functions that modify the sign of updates.
///
/// This stage allows a `MapFilterProject` operator to be fused to its output,
/// and this can be very important as otherwise the output of `func` is just
/// appended to the input record, for as many outputs as it has. This has the
/// unpleasant default behavior of repeating potentially large records that
/// are being unpacked, producing quadratic output in those cases. Instead,
/// in these cases use a `mfp` member that projects away these large fields.
FlatMap {
/// The input collection.
input: Box<Plan<T>>,
/// The variable-record emitting function.
func: TableFunc,
/// Expressions that for each row prepare the arguments to `func`.
exprs: Vec<ScalarExpr>,
/// Linear operator to apply to each record produced by `func`.
mfp: MapFilterProject,
/// The particular arrangement of the input we expect to use,
/// if any
input_key: Option<Vec<ScalarExpr>>,
},
/// A multiway relational equijoin, with fused map, filter, and projection.
///
/// This stage performs a multiway join among `inputs`, using the equality
/// constraints expressed in `plan`. The plan also describes the implementation
/// strategy we will use, and any pushed down per-record work.
Join {
/// An ordered list of inputs that will be joined.
inputs: Vec<Plan<T>>,
/// Detailed information about the implementation of the join.
///
/// This includes information about the implementation strategy, but also
/// any map, filter, project work that we might follow the join with, but
/// potentially pushed down into the implementation of the join.
plan: JoinPlan,
},
/// Aggregation by key.
Reduce {
/// The input collection.
input: Box<Plan<T>>,
/// A plan for changing input records into key, value pairs.
key_val_plan: KeyValPlan,
/// A plan for performing the reduce.
///
/// The implementation of reduction has several different strategies based
/// on the properties of the reduction, and the input itself. Please check
/// out the documentation for this type for more detail.
plan: ReducePlan,
/// The particular arrangement of the input we expect to use,
/// if any
input_key: Option<Vec<ScalarExpr>>,
},
}
/// TODO(discord9): impl GetPlan
#[derive(Debug, Clone)]
pub enum GetPlan {
/// Simply pass input arrangements on to the next stage.
PassArrangements,
/// Using the supplied key, optionally seek the row, and apply the MFP.
Arrangement(Vec<ScalarExpr>, Option<Row>, MapFilterProject),
/// Scan the input collection (unarranged) and apply the MFP.
Collection(MapFilterProject),
}
/// Returns bucket sizes, descending, suitable for hierarchical decomposition of an operator, based
/// on the expected number of rows that will have the same group key.
fn bucketing_of_expected_group_size(expected_group_size: Option<u64>) -> Vec<u64> {
let mut buckets = vec![];
let mut current = 16;
// Plan for 4B records in the expected case if the user didn't specify a group size.
let limit = expected_group_size.unwrap_or(4_000_000_000);
// Distribute buckets in powers of 16, so that we can strike a balance between how many inputs
// each layer gets from the preceding layer, while also limiting the number of layers.
while current < limit {
buckets.push(current);
current = current.saturating_mul(16);
}
buckets.reverse();
buckets
}
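// A quick worked example of the bucketing above (sketch, not part of this change): with
// the default 4B expected group size the loop pushes 16^1 through 16^7 and then reverses,
// so the buckets come out in descending order.
#[test]
fn _bucketing_of_expected_group_size_examples() {
    assert_eq!(
        bucketing_of_expected_group_size(None),
        vec![
            268_435_456, // 16^7
            16_777_216,  // 16^6
            1_048_576,   // 16^5
            65_536,      // 16^4
            4_096,       // 16^3
            256,         // 16^2
            16,          // 16^1
        ]
    );
    // A small expected group size only needs a couple of layers.
    assert_eq!(bucketing_of_expected_group_size(Some(1_000)), vec![256, 16]);
}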

View File

@@ -0,0 +1,233 @@
use serde::{Deserialize, Serialize};
use crate::expr::{AggregateExpr, AggregateFunc, MapFilterProject, SafeMfpPlan};
/// This enum represents the three potential types of aggregations.
#[derive(Copy, Clone, Debug, Deserialize, Eq, Hash, Ord, PartialEq, PartialOrd, Serialize)]
pub enum ReductionType {
/// Accumulable functions can be subtracted from (are invertible), and associative.
/// We can compute these results by moving some data to the diff field under arbitrary
/// changes to inputs. Examples include sum or count.
Accumulable,
/// Hierarchical functions are associative, which means we can split up the work of
/// computing them across subsets. Note that hierarchical reductions should also
/// reduce the data in some way, as otherwise rendering them hierarchically is not
/// worth it. Examples include min or max.
Hierarchical,
/// Basic, for lack of a better word, are functions that are neither accumulable
/// nor hierarchical. Examples include jsonb_agg.
Basic,
}
/// Plan for extracting keys and values in preparation for a reduction.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct KeyValPlan {
/// Extracts the columns used as the key.
pub key_plan: SafeMfpPlan,
/// Extracts the columns used to feed the aggregations.
pub val_plan: SafeMfpPlan,
}
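// Illustrative sketch (not part of this change), mirroring how the render tests below
// build their plans: a `KeyValPlan` that groups a 3-column input by its third column
// and feeds the first two columns to the aggregations.
fn _example_key_val_plan() -> KeyValPlan {
    KeyValPlan {
        // The group key is column 2 of the 3-column input row.
        key_plan: SafeMfpPlan {
            mfp: MapFilterProject::new(3).project([2]),
        },
        // The values fed to the aggregations are columns 0 and 1.
        val_plan: SafeMfpPlan {
            mfp: MapFilterProject::new(3).project([0, 1]),
        },
    }
}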
/// Transforms a vector containing indexes of needed columns into one containing
/// the "skips" an iterator over a Row would need to perform to see those values.
///
/// This function requires that all of the elements in `indexes` are strictly
/// increasing.
///
/// # Examples
///
/// ```
/// assert_eq!(convert_indexes_to_skips(vec![3, 6, 10, 15]), [3, 2, 3, 4])
/// ```
pub fn convert_indexes_to_skips(mut indexes: Vec<usize>) -> Vec<usize> {
for i in 1..indexes.len() {
assert!(
indexes[i - 1] < indexes[i],
"convert_indexes_to_skips needs indexes to be strictly increasing. Received: {:?}",
indexes,
);
}
for i in (1..indexes.len()).rev() {
indexes[i] -= indexes[i - 1];
indexes[i] -= 1;
}
indexes
}
/// A `ReducePlan` provides a concise description for how we will
/// execute a given reduce expression.
///
/// The provided reduce expression can have no
/// aggregations, in which case it's just a `Distinct`, and otherwise
/// it's composed of a combination of accumulable, hierarchical and
/// basic aggregations.
///
/// We want to try to centralize as much decision making about the
/// shape / general computation of the rendered dataflow graph
/// in this plan, and then make actually rendering the graph
/// be as simple (and compiler verifiable) as possible.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum ReducePlan {
/// Plan for not computing any aggregations, just determining the set of
/// distinct keys.
Distinct,
/// Plan for computing only accumulable aggregations.
Accumulable(AccumulablePlan),
/// Plan for computing only hierarchical aggregations.
Hierarchical(HierarchicalPlan),
/// Plan for computing only basic aggregations.
Basic(BasicPlan),
/// Plan for computing a mix of different kinds of aggregations.
/// We need to do extra work here to reassemble results back in the
/// requested order.
Collation(CollationPlan),
}
/// Plan for computing a set of accumulable aggregations.
///
/// We fuse all of the accumulable aggregations together
/// and compute them with one dataflow fragment. We need to
/// be careful to separate out the aggregations that
/// apply only to the distinct set of values. We need
/// to apply a distinct operator to those before we
/// combine them with everything else.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct AccumulablePlan {
/// All of the aggregations we were asked to compute, stored
/// in order.
pub full_aggrs: Vec<AggregateExpr>,
/// All of the non-distinct accumulable aggregates.
/// Each element represents:
/// (index of the aggregation among accumulable aggregations,
/// index of the datum among inputs, aggregation expr)
/// These will all be rendered together in one dataflow fragment.
pub simple_aggrs: Vec<(usize, usize, AggregateExpr)>,
/// Same as above but for all of the `DISTINCT` accumulable aggregations.
pub distinct_aggrs: Vec<(usize, usize, AggregateExpr)>,
}
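// Illustrative sketch (not part of this change): an `AccumulablePlan` for something like
// `SELECT SUM(col0), COUNT(col1)` with no DISTINCT aggregations. Assumes `AggregateExpr`
// implements `Clone`; the field values are purely for illustration.
fn _example_accumulable_plan() -> AccumulablePlan {
    let sum_col0 = AggregateExpr {
        func: AggregateFunc::SumInt64,
        expr: crate::expr::ScalarExpr::Column(0),
        distinct: false,
    };
    let count_col1 = AggregateExpr {
        func: AggregateFunc::Count,
        expr: crate::expr::ScalarExpr::Column(1),
        distinct: false,
    };
    AccumulablePlan {
        full_aggrs: vec![sum_col0.clone(), count_col1.clone()],
        // (index among accumulable aggregations, index of the input datum, expression)
        simple_aggrs: vec![(0, 0, sum_col0), (1, 1, count_col1)],
        // No DISTINCT accumulable aggregations in this example.
        distinct_aggrs: vec![],
    }
}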
// TODO(discord9): others
/// Plan for computing a set of hierarchical aggregations.
///
/// In the append-only setting we can render them in-place
/// with monotonic plans, but otherwise, we need to render
/// them with a reduction tree that splits the inputs into
/// small, and then progressively larger, buckets
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum HierarchicalPlan {
/// Plan hierarchical aggregations under monotonic inputs.
Monotonic(MonotonicPlan),
/// Plan for hierarchical aggregations under non-monotonic inputs.
Bucketed(BucketedPlan),
}
/// Plan for computing a set of hierarchical aggregations with a
/// monotonic input.
///
/// Here, the aggregations will be rendered in place. We don't
/// need to worry about retractions because the inputs are
/// append only, so we can change our computation to
/// only retain the "best" value in the diff field, instead
/// of holding onto all values.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct MonotonicPlan {
/// All of the aggregations we were asked to compute.
pub aggr_funcs: Vec<AggregateFunc>,
/// Set of "skips" or calls to `nth()` an iterator needs to do over
/// the input to extract the relevant datums.
pub skips: Vec<usize>,
/// True if the input is logically but not physically monotonic,
/// and the operator must first consolidate the inputs to remove
/// potential negations.
pub must_consolidate: bool,
}
/// Plan for computing a set of hierarchical aggregations
/// with non-monotonic inputs.
///
/// To perform hierarchical aggregations with stable runtimes
/// under updates we'll subdivide the group key into buckets, compute
/// the reduction in each of those subdivided buckets and then combine
/// the results into a coarser bucket (one that represents a larger
/// fraction of the original input) and redo the reduction in another
/// layer. Effectively, we'll construct a min / max heap out of a series
/// of reduce operators (each one is a separate layer).
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct BucketedPlan {
/// All of the aggregations we were asked to compute.
pub aggr_funcs: Vec<AggregateFunc>,
/// Set of "skips" or calls to `nth()` an iterator needs to do over
/// the input to extract the relevant datums.
pub skips: Vec<usize>,
/// The number of buckets in each layer of the reduction tree. Should
/// be decreasing, and ideally, a power of two so that we can easily
/// distribute values to buckets with `value.hashed() % buckets[layer]`.
pub buckets: Vec<u64>,
}
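// Illustrative sketch (not part of this change): a `BucketedPlan` for a MIN aggregation
// whose relevant datum sits at column 2, with buckets sized for roughly one million rows
// per group (descending powers of 16 below that limit).
fn _example_bucketed_plan() -> BucketedPlan {
    BucketedPlan {
        aggr_funcs: vec![AggregateFunc::MinFloat64],
        // Skip two datums of each input row to reach the aggregated column.
        skips: vec![2],
        buckets: vec![65_536, 4_096, 256, 16],
    }
}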
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum BasicPlan {}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct CollationPlan {}
/// Determines whether a function can be accumulated in an update's "difference" field,
/// and whether it can be subjected to recursive (hierarchical) aggregation.
///
/// Accumulable aggregations will be packed into differential dataflow's "difference" field,
/// which can be accumulated in-place using the addition operation on the type. Aggregations
/// that indicate they are accumulable will still need to provide an action that takes their
/// data and introduces it as a difference, and the post-processing when the accumulated value
/// is presented as data.
///
/// Hierarchical aggregations will be subjected to repeated aggregation on initially small but
/// increasingly large subsets of each key. This has the intended property that no invocation
/// is on a significantly large set of values (and so, no incremental update needs to reform
/// significant input data). Hierarchical aggregates can be rendered more efficiently if the
/// input stream is append-only as then we only need to retain the "currently winning" value.
/// Every hierarchical aggregate needs to supply a corresponding ReductionMonoid implementation.
fn reduction_type(func: &AggregateFunc) -> ReductionType {
match func {
AggregateFunc::SumInt16
| AggregateFunc::SumInt32
| AggregateFunc::SumInt64
| AggregateFunc::SumUInt16
| AggregateFunc::SumUInt32
| AggregateFunc::SumUInt64
| AggregateFunc::SumFloat32
| AggregateFunc::SumFloat64
| AggregateFunc::Count
| AggregateFunc::Any
| AggregateFunc::All => ReductionType::Accumulable,
AggregateFunc::MaxInt16
| AggregateFunc::MaxInt32
| AggregateFunc::MaxInt64
| AggregateFunc::MaxUInt16
| AggregateFunc::MaxUInt32
| AggregateFunc::MaxUInt64
| AggregateFunc::MaxFloat32
| AggregateFunc::MaxFloat64
| AggregateFunc::MaxBool
| AggregateFunc::MaxString
| AggregateFunc::MaxDate
| AggregateFunc::MaxTimestamp
| AggregateFunc::MaxTimestampTz
| AggregateFunc::MinInt16
| AggregateFunc::MinInt32
| AggregateFunc::MinInt64
| AggregateFunc::MinUInt16
| AggregateFunc::MinUInt32
| AggregateFunc::MinUInt64
| AggregateFunc::MinFloat32
| AggregateFunc::MinFloat64
| AggregateFunc::MinBool
| AggregateFunc::MinString
| AggregateFunc::MinDate
| AggregateFunc::MinTimestamp
| AggregateFunc::MinTimestampTz => ReductionType::Hierarchical,
_ => ReductionType::Basic,
}
}
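// Small usage sketch (not part of this change) of the classification above, relying on
// the `PartialEq`/`Debug` derives on `ReductionType`.
#[test]
fn _reduction_type_examples() {
    // Sums and counts can be accumulated directly in the diff field.
    assert_eq!(
        reduction_type(&AggregateFunc::SumFloat64),
        ReductionType::Accumulable
    );
    // Min/max are reduced hierarchically over progressively larger buckets.
    assert_eq!(
        reduction_type(&AggregateFunc::MaxInt64),
        ReductionType::Hierarchical
    );
}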

View File

@@ -0,0 +1,60 @@
use std::hash::Hash;
use differential_dataflow::ExchangeData;
use crate::repr::Row;
/// Used to make possibly-validating code generic: think of this as a kind of `MaybeResult`,
/// specialized for use in compute. Validation code will only run when the error constructor is
/// Some.
pub(super) trait MaybeValidatingRow<T, E>: ExchangeData + Hash {
fn ok(t: T) -> Self;
fn into_error() -> Option<fn(E) -> Self>;
}
impl<E> MaybeValidatingRow<Row, E> for Row {
fn ok(t: Row) -> Self {
t
}
fn into_error() -> Option<fn(E) -> Self> {
None
}
}
impl<E> MaybeValidatingRow<(), E> for () {
fn ok(t: ()) -> Self {
t
}
fn into_error() -> Option<fn(E) -> Self> {
None
}
}
impl<E, R> MaybeValidatingRow<Vec<R>, E> for Vec<R>
where
R: ExchangeData + Hash,
{
fn ok(t: Vec<R>) -> Self {
t
}
fn into_error() -> Option<fn(E) -> Self> {
None
}
}
impl<T, E> MaybeValidatingRow<T, E> for Result<T, E>
where
T: ExchangeData + Hash,
E: ExchangeData + Hash,
{
fn ok(row: T) -> Self {
Ok(row)
}
fn into_error() -> Option<fn(E) -> Self> {
Some(Err)
}
}
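// Hedged sketch (not from this change) of how rendering code could use the trait: always
// emit the row, but only materialize an error when the output type can actually carry one
// (`into_error()` is `None` for plain rows, so validation is skipped there).
fn _validate_or_pass<R, E>(row: Row, error: Option<E>) -> Option<R>
where
    R: MaybeValidatingRow<Row, E>,
{
    match error {
        None => Some(R::ok(row)),
        Some(err) => R::into_error().map(|wrap| wrap(err)),
    }
}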

View File

@@ -0,0 +1,626 @@
//! For building the flow graph from a `Plan`.
//! This is basically the last step before actually running the flow graph.
use differential_dataflow::lattice::Lattice;
use differential_dataflow::AsCollection;
use timely::communication::Allocate;
use timely::dataflow::operators::capture::Extract;
use timely::dataflow::operators::{Capture, ToStream};
use timely::dataflow::Scope;
use timely::progress::timestamp::Refines;
use timely::progress::Timestamp;
use timely::worker::Worker as TimelyWorker;
use super::types::DataflowDescription;
use crate::compute::compute_state::ComputeState;
use crate::compute::context::CollectionBundle;
use crate::compute::plan::Plan;
use crate::compute::types::BuildDesc;
use crate::compute::Context;
use crate::expr::Id;
use crate::repr::{self, Row};
use crate::storage::errors::DataflowError;
mod error;
mod reduce;
/// Assemble the "compute" side of a dataflow, i.e. all but the sources.
///
/// This method imports sources from provided assets, and then builds the remaining
/// dataflow using "compute-local" assets like shared arrangements, and producing
/// both arrangements and sinks.
pub fn build_compute_dataflow<A: Allocate>(
timely_worker: &mut TimelyWorker<A>,
compute_state: &mut ComputeState,
dataflow: DataflowDescription<Plan, ()>,
) {
todo!()
}
pub trait RenderTimestamp: Timestamp + Lattice + Refines<repr::Timestamp> {
/// The system timestamp component of the timestamp.
///
/// This is useful for manipulating the system time, as when delaying
/// updates for subsequent cancellation, as with monotonic reduction.
fn system_time(&mut self) -> &mut repr::Timestamp;
/// Effects a system delay in terms of the timestamp summary.
fn system_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary;
/// The event timestamp component of the timestamp.
fn event_time(&mut self) -> &mut repr::Timestamp;
/// Effects an event delay in terms of the timestamp summary.
fn event_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary;
/// Steps the timestamp back so that logical compaction to the output will
/// not conflate `self` with any historical times.
fn step_back(&self) -> Self;
}
impl RenderTimestamp for repr::Timestamp {
fn system_time(&mut self) -> &mut repr::Timestamp {
self
}
fn system_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary {
delay
}
fn event_time(&mut self) -> &mut repr::Timestamp {
self
}
fn event_delay(delay: repr::Timestamp) -> <Self as Timestamp>::Summary {
delay
}
fn step_back(&self) -> Self {
self.saturating_sub(1)
}
}
// This implementation block allows child timestamps to vary from parent timestamps.
impl<G> Context<G, Row>
where
G: Scope,
G::Timestamp: RenderTimestamp,
{
/// render plan and insert into context with given GlobalId
pub(crate) fn build_object(&mut self, object: BuildDesc<Plan>) {
// First, transform the relation expression into a render plan.
let bundle = self.render_plan(object.plan);
self.insert_id(Id::Global(object.id), bundle);
}
}
impl<S> Context<S, Row>
where
S: Scope,
S::Timestamp: RenderTimestamp,
{
/// Renders a plan to a differential dataflow, producing the collection of results.
///
/// The return type reflects the uncertainty about the data representation, perhaps
/// as a stream of data, perhaps as an arrangement, perhaps as a stream of batches.
pub fn render_plan(&mut self, plan: Plan) -> CollectionBundle<S, Row> {
match plan {
Plan::Constant { rows } => {
let (rows, errs) = match rows {
Ok(rows) => (rows, Vec::new()),
Err(err) => (Vec::new(), vec![err]),
};
let since_frontier = self.since_frontier.clone();
let until = self.until_frontier.clone();
let ok_collection = rows
.into_iter()
.filter_map(move |(row, mut time, diff)| {
time.advance_by(since_frontier.borrow());
if !until.less_equal(&time) {
Some((
row,
<S::Timestamp as Refines<repr::Timestamp>>::to_inner(time),
diff,
))
} else {
None
}
})
.to_stream(&mut self.scope)
.as_collection();
let mut error_time: repr::Timestamp = Timestamp::minimum();
error_time.advance_by(self.since_frontier.borrow());
let err_collection = errs
.into_iter()
.map(move |e| {
(
DataflowError::from(e),
<S::Timestamp as Refines<repr::Timestamp>>::to_inner(error_time),
1,
)
})
.to_stream(&mut self.scope)
.as_collection();
CollectionBundle::from_collections(ok_collection, err_collection)
}
Plan::Get { id, keys, plan } => {
// Recover the collection from `self` and then apply `mfp` to it.
// If `mfp` happens to be trivial, we can just return the collection.
let mut collection = self
.lookup_id(id)
.unwrap_or_else(|| panic!("Get({:?}) not found at render time", id));
match plan {
crate::compute::plan::GetPlan::PassArrangements => {
// Assert that each of `keys` are present in `collection`.
if !keys
.arranged
.iter()
.all(|(key, _, _)| collection.arranged.contains_key(key))
{
let not_included: Vec<_> = keys
.arranged
.iter()
.filter(|(key, _, _)| !collection.arranged.contains_key(key))
.map(|(key, _, _)| key)
.collect();
panic!(
"Keys {:?} are not included in the collection's keys: {:?}",
not_included,
collection.arranged.keys().cloned().collect::<Vec<_>>()
);
}
assert!(keys.raw <= collection.collection.is_some());
// Retain only those keys we want to import.
collection.arranged.retain(|key, _val| {
keys.arranged.iter().any(|(key2, _, _)| key2 == key)
});
collection
}
crate::compute::plan::GetPlan::Arrangement(key, row, mfp) => {
let (oks, errs) = collection.as_collection_core(
mfp,
Some((key, row)),
self.until_frontier.clone(),
);
CollectionBundle::from_collections(oks, errs)
}
crate::compute::plan::GetPlan::Collection(mfp) => {
let (oks, errs) =
collection.as_collection_core(mfp, None, self.until_frontier.clone());
CollectionBundle::from_collections(oks, errs)
}
}
}
Plan::Let { id, value, body } => {
// Render `value` and bind it to `id`. Complain if this shadows an id.
let value = self.render_plan(*value);
let prebound = self.insert_id(Id::Local(id), value);
assert!(prebound.is_none());
let body = self.render_plan(*body);
self.remove_id(Id::Local(id));
body
}
Plan::Mfp {
input,
mfp,
input_key_val,
} => {
let input = self.render_plan(*input);
// If `mfp` is non-trivial, we should apply it and produce a collection.
if mfp.is_identity() {
input
} else {
let (oks, errs) =
input.as_collection_core(mfp, input_key_val, self.until_frontier.clone());
CollectionBundle::from_collections(oks, errs)
}
}
Plan::Reduce {
input,
key_val_plan,
plan,
input_key,
} => {
let input = self.render_plan(*input);
self.render_reduce(input, key_val_plan, plan, input_key)
}
_ => todo!("To be implemented"),
}
}
}
#[cfg(test)]
mod test {
use std::any::Any;
use std::collections::{BTreeMap, BTreeSet};
use std::rc::Rc;
use datatypes::prelude::ConcreteDataType;
use datatypes::value::Value;
use differential_dataflow::input::{Input, InputSession};
use differential_dataflow::Collection;
use timely::dataflow::scopes::Child;
use timely::dataflow::Stream;
use timely::Config;
use super::*;
use crate::compute::plan::{
AccumulablePlan, AvailableCollections, GetPlan, KeyValPlan, ReducePlan,
};
use crate::expr::{
AggregateExpr, BinaryFunc, GlobalId, LocalId, MapFilterProject, SafeMfpPlan, ScalarExpr,
UnaryFunc,
};
use crate::repr::Diff;
type OkStream<G> = Stream<G, (Row, repr::Timestamp, Diff)>;
type ErrStream<G> = Stream<G, (DataflowError, repr::Timestamp, Diff)>;
type OkCollection<G> = Collection<G, Row, Diff>;
type ErrCollection<G> = Collection<G, DataflowError, Diff>;
/// used as a token to prevent certain resources from being dropped
type AnyToken = Rc<dyn Any>;
struct MockSourceToken {
handle: InputSession<repr::Timestamp, Row, Diff>,
err_handle: InputSession<repr::Timestamp, DataflowError, Diff>,
}
fn mock_input_session(input: &mut InputSession<repr::Timestamp, Row, Diff>, cnt: i64) {
// TODO: mock a cpu usage monotonic input with timestamp
// cpu, mem, ts
// f32, f32, DateTime
let schema = [
ConcreteDataType::float32_datatype(),
ConcreteDataType::float32_datatype(),
ConcreteDataType::datetime_datatype(),
];
let arrs = (0..cnt).map(|i| (i as f32 / cnt as f32, i as f32 / cnt as f32, i));
// TODO: need a better mechanism to turn `ts` into a proper dataflow timestamp here
for (cpu, mem, ts) in arrs {
input.update(
Row::pack(vec![cpu.into(), mem.into(), Value::DateTime(ts.into())]),
1,
);
input.advance_to(ts as u64)
}
input.flush();
}
// A helper that builds the given dataflow and runs it on mocked input to check it executes.
fn exec_dataflow(
input_id: Vec<Id>,
dataflow: DataflowDescription<Plan>,
sink_ids: Vec<GlobalId>,
output_keys: Vec<Option<Vec<ScalarExpr>>>,
input_mock_length: i64,
) {
timely::execute(Config::thread(), move |worker| {
println!("worker: {:?}", worker.index());
let mut input = InputSession::<repr::Timestamp, Row, Diff>::new();
worker.dataflow_named(
"ProofOfConcept",
|scope: &mut Child<'_, _, repr::Timestamp>| {
let mut test_ctx =
Context::<_, Row, _>::for_dataflow_in(&dataflow, scope.clone());
let ok_collection = input.to_collection(scope);
let (err_handle, err_collection) = scope.new_collection();
let input_collection =
CollectionBundle::<_, _, repr::Timestamp>::from_collections(
ok_collection,
err_collection,
);
// TODO: generate `import_sources` from `dataflow.source_imports`
let import_sources: Vec<_> = input_id
.clone()
.into_iter()
.zip(vec![input_collection])
.collect();
// import sources
for (id, collection) in import_sources {
test_ctx.insert_id(id, collection);
}
for build_desc in &dataflow.objects_to_build {
test_ctx.build_object(build_desc.clone());
}
dbg!(test_ctx.bindings.keys());
// TODO: export sinks
for (sink, output_key) in sink_ids.iter().zip(output_keys.iter()) {
let sink = *sink;
println!("Inspecting sink {:?}", sink.clone());
let inspect = test_ctx.lookup_id(Id::Global(sink)).unwrap();
dbg!(inspect.collection.is_some());
dbg!(inspect.arranged.keys());
let inspect = inspect.as_specific_collection(output_key.as_deref());
inspect
.0
.inspect(move |x| println!("inspect {:?} {:?}", sink.clone(), x));
}
},
);
mock_input_session(&mut input, input_mock_length);
})
.expect("Computation terminated abnormally");
}
#[test]
fn test_simple_poc_reduce_group_by() {
// 1. build dataflow with input collection connected
// 2. give input
// type annotations are needed to prevent rust-analyzer from giving up on type deduction
// the dataflow description is given directly here for simplicity;
// later it will be built from dataflow information sent by other nodes
// the key is the third column (the bucketed ts)
let place_holder =
ScalarExpr::Literal(Ok(Value::Boolean(true)), ConcreteDataType::int64_datatype());
let count_col = |i: usize| AggregateExpr {
func: crate::expr::AggregateFunc::Count,
expr: ScalarExpr::Column(i),
distinct: false,
};
let sum_col = |i: usize| AggregateExpr {
func: crate::expr::AggregateFunc::SumFloat32,
expr: ScalarExpr::Column(i),
distinct: false,
};
// roughly equivalent to `SELECT ts/300 AS minute, AVG(cpu), AVG(mem) FROM input GROUP BY minute;`
// (the literal divisor below is 5 rather than 300, to keep the mock input small)
// cpu, mem, ts
// --map--> cpu, mem, ts/300
// --reduce--> ts/300, AVG(cpu), AVG(mem)
let cast_datetime = ScalarExpr::CallUnary {
func: UnaryFunc::CastDatetimeToInt64,
expr: Box::new(ScalarExpr::Column(2)),
};
let ts_div_5 = ScalarExpr::CallBinary {
func: BinaryFunc::DivInt64,
expr1: Box::new(cast_datetime),
expr2: Box::new(ScalarExpr::Literal(
Ok(Value::Int64(5.into())),
ConcreteDataType::int64_datatype(),
)),
};
let cast_int64_to_float32 = |i: usize| ScalarExpr::CallUnary {
func: UnaryFunc::CastInt64ToFloat32,
expr: Box::new(ScalarExpr::Column(i)),
};
let reduce_group_by_window = vec![
// cpu, mem, ts
// --reduce--> ts/300, SUM(cpu), SUM(mem), COUNT(cpu), COUNT(mem)
// -- map --> ts/300, AVG(cpu), AVG(mem)
BuildDesc {
id: GlobalId::User(0),
plan: Plan::Reduce {
input: Box::new(Plan::Get {
id: Id::Global(GlobalId::System(0)),
keys: AvailableCollections::new_raw(),
plan: GetPlan::Collection(
MapFilterProject::new(3).map([ts_div_5]).project([0, 1, 3]),
),
}),
key_val_plan: KeyValPlan {
key_plan: SafeMfpPlan {
mfp: MapFilterProject::new(3).project([2]),
},
val_plan: SafeMfpPlan {
mfp: MapFilterProject::new(3).project([0, 1]),
},
},
// --reduce--> ts/300(key), SUM(cpu), SUM(mem), COUNT(cpu), COUNT(mem)
plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![sum_col(0), sum_col(1), count_col(0), count_col(1)],
simple_aggrs: vec![
(0, 0, sum_col(0)),
(1, 1, sum_col(1)),
(2, 0, count_col(0)),
(3, 1, count_col(1)),
],
distinct_aggrs: vec![],
}),
input_key: None,
},
},
// 0 1 2 3 4
// ts/300(key), SUM(cpu), SUM(mem), COUNT(cpu), COUNT(mem),
// -- map --> ts/300(key), AVG(cpu), AVG(mem)
BuildDesc {
id: GlobalId::User(1),
plan: Plan::Get {
id: Id::Global(GlobalId::User(0)),
// not used since plan is GetPlan::Arrangement
keys: AvailableCollections::new_raw(),
plan: GetPlan::Arrangement(
vec![ScalarExpr::Column(0)],
None,
MapFilterProject::new(5)
.map([
ScalarExpr::CallBinary {
func: BinaryFunc::DivFloat32,
expr1: Box::new(ScalarExpr::Column(1)),
expr2: Box::new(cast_int64_to_float32(3)),
},
ScalarExpr::CallBinary {
func: BinaryFunc::DivFloat32,
expr1: Box::new(ScalarExpr::Column(2)),
expr2: Box::new(cast_int64_to_float32(4)),
},
])
.project([0, 5, 6]),
),
},
},
];
let input_id = vec![Id::Global(GlobalId::System(0))];
let dataflow = {
let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
dataflow.objects_to_build = reduce_group_by_window;
dataflow
};
let sink_ids = [GlobalId::User(0), GlobalId::User(1)];
exec_dataflow(
input_id.clone(),
dataflow.clone(),
sink_ids.to_vec(),
vec![Some(vec![ScalarExpr::Column(0)]), None],
10,
);
}
#[test]
fn test_simple_poc_reduce_count() {
// 1. build dataflow with input collection connected
// 2. give input
// type annotations are needed to prevent rust-analyzer from giving up on type deduction
// the dataflow description is given directly here for simplicity;
// later it will be built from dataflow information sent by other nodes
// the key is a constant placeholder, so every row falls into a single group
let place_holder =
ScalarExpr::Literal(Ok(Value::Boolean(true)), ConcreteDataType::int64_datatype());
let key_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3)
.map([place_holder.clone()])
.project([3]),
};
let val_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3).project([0, 1, 2]),
};
let count = AggregateExpr {
func: crate::expr::AggregateFunc::Count,
expr: place_holder,
distinct: false,
};
// equivalent to `SELECT COUNT(*) FROM input;`
let reduce_count_plan = vec![
// count(true)
BuildDesc {
id: GlobalId::User(0),
plan: Plan::Reduce {
input: Box::new(Plan::Get {
id: Id::Global(GlobalId::System(0)),
keys: AvailableCollections::new_raw(),
plan: GetPlan::Collection(MapFilterProject::new(3)),
}),
key_val_plan: KeyValPlan { key_plan, val_plan },
plan: ReducePlan::Accumulable(AccumulablePlan {
full_aggrs: vec![count.clone()],
simple_aggrs: vec![(0, 0, count)],
distinct_aggrs: vec![],
}),
input_key: None,
},
},
// project out just the count (column 1 of the arrangement: key, count)
BuildDesc {
id: GlobalId::User(1),
plan: Plan::Get {
id: Id::Global(GlobalId::User(0)),
// not used since plan is GetPlan::Arrangement
keys: AvailableCollections::new_raw(),
plan: GetPlan::Arrangement(
vec![ScalarExpr::Column(0)],
None,
MapFilterProject::new(2).project([1]),
),
},
},
];
let input_id = vec![Id::Global(GlobalId::System(0))];
let dataflow = {
let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
dataflow.objects_to_build = reduce_count_plan;
dataflow
};
let sink_ids = [GlobalId::User(1)];
exec_dataflow(
input_id.clone(),
dataflow.clone(),
sink_ids.to_vec(),
vec![None],
10,
);
}
#[test]
fn test_simple_poc_reduce_distinct() {
// 1. build dataflow with input collection connected
// 2. give input
// type annotations are needed to prevent rust-analyzer from giving up on type deduction
// the dataflow description is given directly here for simplicity;
// later it will be built from dataflow information sent by other nodes
// windowing needs `date_trunc`, which is still WIP
// the key is the third column (ts)
let key_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3).project([2]),
};
let val_plan = SafeMfpPlan {
mfp: MapFilterProject::new(3).project([0, 1]),
};
// roughly equivalent to `SELECT DISTINCT ts FROM input;` (ReducePlan::Distinct keeps one row per key)
let reduce_plan = vec![BuildDesc {
id: GlobalId::User(0),
plan: Plan::Reduce {
input: Box::new(Plan::Get {
id: Id::Global(GlobalId::System(0)),
keys: AvailableCollections::new_raw(),
plan: GetPlan::Collection(MapFilterProject::new(3)),
}),
key_val_plan: KeyValPlan { key_plan, val_plan },
plan: ReducePlan::Distinct,
input_key: None,
},
}];
let input_id = vec![Id::Global(GlobalId::System(0))];
let dataflow = {
let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
dataflow.objects_to_build = reduce_plan;
dataflow
};
let sink_ids = [GlobalId::User(0)];
exec_dataflow(
input_id.clone(),
dataflow.clone(),
sink_ids.to_vec(),
vec![Some(vec![ScalarExpr::Column(0)])],
10,
);
}
#[test]
#[allow(clippy::print_stdout)]
fn test_constant_plan_render() {
let build_descs = vec![BuildDesc {
id: GlobalId::User(0),
plan: Plan::Constant {
rows: Ok(vec![(Row::default(), 0, 1)]),
},
}];
let dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
timely::execute_from_args(std::iter::empty::<String>(), move |worker| {
println!("worker: {:?}", worker.index());
let mut input = InputSession::<repr::Timestamp, Row, Diff>::new();
worker.dataflow(|scope: &mut Child<'_, _, repr::Timestamp>| {
let mut test_ctx = Context::<_, Row, _>::for_dataflow_in(&dataflow, scope.clone());
for build_desc in &build_descs {
test_ctx.build_object(build_desc.clone());
}
let input_collection = input.to_collection(scope);
let err_collection = InputSession::new().to_collection(scope);
let input_collection =
CollectionBundle::from_collections(input_collection, err_collection);
// insert collection
test_ctx.insert_id(Id::Local(LocalId(0)), input_collection);
let inspect = test_ctx
.lookup_id(Id::Global(GlobalId::User(0)))
.unwrap()
.as_specific_collection(None);
inspect.0.inspect(|x| println!("inspect {:?}", x));
});
// input.insert(Row::default());
input.update(Row::default(), 1);
input.advance_to(1);
})
.expect("Computation terminated abnormally");
}
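    // Illustrative sketch, not part of the original change set: a map/filter-only
    // plan driven through the same `exec_dataflow` helper as the reduce tests above,
    // to show how `Plan::Mfp` composes with `Plan::Get`. It only uses constructors
    // already exercised in this module, and assumes `Plan::Mfp` renders like the
    // collection-based `Plan::Get` cases above.
    #[test]
    fn test_simple_poc_mfp_only() {
        // roughly equivalent to `SELECT cpu, mem FROM input WHERE cpu >= mem;`
        let mfp_plan = vec![BuildDesc {
            id: GlobalId::User(0),
            plan: Plan::Mfp {
                input: Box::new(Plan::Get {
                    id: Id::Global(GlobalId::System(0)),
                    keys: AvailableCollections::new_raw(),
                    plan: GetPlan::Collection(MapFilterProject::new(3)),
                }),
                mfp: MapFilterProject::new(3)
                    .filter([ScalarExpr::CallBinary {
                        func: BinaryFunc::Gte,
                        expr1: Box::new(ScalarExpr::Column(0)),
                        expr2: Box::new(ScalarExpr::Column(1)),
                    }])
                    .project([0, 1]),
                input_key_val: None,
            },
        }];
        let input_id = vec![Id::Global(GlobalId::System(0))];
        let dataflow = {
            let mut dataflow = DataflowDescription::<Plan, ()>::new("test".to_string());
            dataflow.objects_to_build = mfp_plan;
            dataflow
        };
        let sink_ids = [GlobalId::User(0)];
        exec_dataflow(
            input_id.clone(),
            dataflow.clone(),
            sink_ids.to_vec(),
            vec![None],
            10,
        );
    }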
}

File diff suppressed because it is too large


@@ -0,0 +1,20 @@
use differential_dataflow::operators::arrange::TraceAgent;
use differential_dataflow::trace::implementations::ord::{OrdKeySpine, OrdValSpine};
use crate::repr::{Diff, Row, Timestamp};
use crate::storage::errors::DataflowError;
// TODO(discord9): consider using ColValSpine for columnation storage
/// T: Time, R: Diff, O: Offset
pub type RowSpine<K, V, T, R, O = usize> = OrdValSpine<K, V, T, R, O>;
/// T: Time, R: Diff, O: Offset
pub type RowKeySpine<K, T, R, O = usize> = OrdKeySpine<K, T, R, O>;
/// T: Time, R: Diff, O: Offset
pub type ErrSpine<K, T, R, O = usize> = OrdKeySpine<K, T, R, O>;
/// T: Time, R: Diff, O: Offset
pub type ErrValSpine<K, T, R, O = usize> = OrdValSpine<K, DataflowError, T, R, O>;
pub type TraceRowHandle<K, V, T, R> = TraceAgent<RowSpine<K, V, T, R>>;
pub type TraceErrHandle<K, T, R> = TraceAgent<ErrSpine<K, T, R>>;
pub type KeysValsHandle = TraceRowHandle<Row, Row, Timestamp, Diff>;
pub type ErrsHandle = TraceErrHandle<DataflowError, Timestamp, Diff>;


@@ -0,0 +1,75 @@
use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use timely::progress::Antichain;
use crate::compute::plan::Plan;
use crate::compute::types::sinks::ComputeSinkDesc;
use crate::compute::types::sources::SourceInstanceDesc;
use crate::expr::{GlobalId, ScalarExpr};
use crate::repr::{self, RelationType};
/// A description of a dataflow to construct and results to surface.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct DataflowDescription<P, S: 'static = (), T = repr::Timestamp> {
/// Source instantiations made available to the dataflow, paired with monotonicity information.
pub source_imports: BTreeMap<GlobalId, (SourceInstanceDesc<S>, bool)>,
/// Indexes made available to the dataflow.
/// (id of new index, description of index, `RelationType` of base source/view, monotonic)
pub index_imports: BTreeMap<GlobalId, (IndexDesc, RelationType, bool)>,
/// Views and indexes to be built and stored in the local context.
/// Objects must be built in the specific order, as there may be
/// dependencies of later objects on prior identifiers.
pub objects_to_build: Vec<BuildDesc<P>>,
/// Indexes to be made available to be shared with other dataflows
/// (id of new index, description of index, `RelationType` of base source/view)
pub index_exports: BTreeMap<GlobalId, (IndexDesc, RelationType)>,
/// sinks to be created
/// (id of new sink, description of sink)
pub sink_exports: BTreeMap<GlobalId, ComputeSinkDesc<S, T>>,
/// An optional frontier to which inputs should be advanced.
///
/// If this is set, it should override the default setting determined by
/// the upper bound of `since` frontiers contributing to the dataflow.
/// It is an error for this to be set to a frontier not beyond that default.
pub as_of: Option<Antichain<T>>,
/// Frontier beyond which the dataflow should not execute.
/// Specifically, updates at times greater or equal to this frontier are suppressed.
/// This is often set to `as_of + 1` to enable "batch" computations.
pub until: Antichain<T>,
/// Human readable name
pub debug_name: String,
}
impl<P, T> DataflowDescription<P, (), T> {
/// Creates a new dataflow description with a human-readable name.
pub fn new(name: String) -> Self {
Self {
source_imports: Default::default(),
index_imports: Default::default(),
objects_to_build: Vec::new(),
index_exports: Default::default(),
sink_exports: Default::default(),
as_of: Default::default(),
until: Antichain::new(),
debug_name: name,
}
}
}
/// An association of a global identifier to an expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct BuildDesc<P = Plan> {
pub id: GlobalId,
pub plan: P,
}
/// An index storing processed updates so they can be queried
/// or reused in other computations
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, Hash)]
pub struct IndexDesc {
/// Identity of the collection the index is on.
pub on_id: GlobalId,
/// Expressions to be arranged, in order of decreasing primacy.
pub key: Vec<ScalarExpr>,
}
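// Illustrative sketch, not part of the original change set: how a
// `DataflowDescription` is assembled from `BuildDesc` entries, mirroring the
// compute tests. `Plan::Constant` and `Row::default()` are used exactly as in
// those tests.
#[cfg(test)]
mod example {
    use super::*;
    use crate::repr::Row;

    #[test]
    fn build_minimal_dataflow_description() {
        let mut dataflow = DataflowDescription::<Plan, ()>::new("example".to_string());
        dataflow.objects_to_build.push(BuildDesc {
            id: GlobalId::User(0),
            plan: Plan::Constant {
                rows: Ok(vec![(Row::default(), 0, 1)]),
            },
        });
        assert_eq!(dataflow.objects_to_build.len(), 1);
        assert_eq!(dataflow.debug_name, "example");
        assert!(dataflow.as_of.is_none());
    }
}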


@@ -0,0 +1,8 @@
use serde::{Deserialize, Serialize};
use crate::expr::GlobalId;
mod dataflow;
mod sinks;
mod sources;
pub(crate) use dataflow::{BuildDesc, DataflowDescription, IndexDesc};


@@ -0,0 +1,28 @@
use serde::{Deserialize, Serialize};
use timely::progress::Antichain;
use crate::expr::GlobalId;
use crate::repr::{self, RelationDesc};
/// A sink for updates to a relational collection.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct ComputeSinkDesc<S: 'static = (), T = repr::Timestamp> {
pub from: GlobalId,
pub from_desc: RelationDesc,
pub connection: ComputeSinkConnection<S>,
pub with_snapshot: bool,
pub up_to: Antichain<T>,
}
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum ComputeSinkConnection<S: 'static = ()> {
// TODO(discord9): consider if ever needed
Subscribe,
Persist(PersistSinkConnection<S>),
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct PersistSinkConnection<S> {
pub value_desc: RelationDesc,
pub storage_metadata: S,
}
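// Illustrative sketch, not part of the original change set: the simplest
// possible sink description, using the `Subscribe` connection, an empty
// relation description, and the default `()` storage metadata.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn build_subscribe_sink_desc() {
        let desc: ComputeSinkDesc = ComputeSinkDesc {
            from: GlobalId::User(0),
            from_desc: RelationDesc::empty(),
            connection: ComputeSinkConnection::Subscribe,
            with_snapshot: false,
            up_to: Antichain::new(),
        };
        assert!(matches!(desc.connection, ComputeSinkConnection::Subscribe));
    }
}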


@@ -0,0 +1,26 @@
use serde::{Deserialize, Serialize};
use crate::expr::MapFilterProject;
use crate::repr::RelationType;
/// A description of an instantiation of a source.
///
/// This includes a description of the source, but additionally any
/// context-dependent options like the ability to apply filtering and
/// projection to the records as they emerge.
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct SourceInstanceDesc<M> {
/// Arguments for this instantiation of the source.
pub arguments: SourceInstanceArguments,
/// Additional metadata used by the storage client of a compute instance to read it.
pub storage_metadata: M,
/// The relation type of this source
pub typ: RelationType,
}
/// Per-source construction arguments.
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct SourceInstanceArguments {
/// Linear operators to be applied record-by-record.
pub operators: Option<MapFilterProject>,
}
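// Illustrative sketch, not part of the original change set: a trivial source
// instantiation with no pushed-down operators, unit storage metadata and an
// empty relation type.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn build_source_instance_desc() {
        let desc: SourceInstanceDesc<()> = SourceInstanceDesc {
            arguments: SourceInstanceArguments { operators: None },
            storage_metadata: (),
            typ: RelationType::empty(),
        };
        assert!(desc.arguments.operators.is_none());
        assert_eq!(desc.typ.arity(), 0);
    }
}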

src/flow/src/expr/func.rs

@@ -0,0 +1,224 @@
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use super::ScalarExpr;
// TODO(discord9): more functions & eval support
use crate::{repr::Row, storage::errors::EvalError};
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum UnaryFunc {
Not,
IsNull,
IsTrue,
IsFalse,
CastDatetimeToInt64,
CastInt64ToFloat32,
}
impl UnaryFunc {
pub fn eval(&self, values: &[Value], expr: &ScalarExpr) -> Result<Value, EvalError> {
let arg = expr.eval(values)?;
match self {
Self::CastDatetimeToInt64 => {
let datetime = if let Value::DateTime(datetime) = arg {
Ok(datetime.val())
} else {
Err(EvalError::TypeMismatch(format!(
"cannot cast {:?} to datetime",
arg
)))
}?;
Ok(Value::from(datetime))
}
Self::CastInt64ToFloat32 => {
let int64 = if let Value::Int64(int64) = arg {
Ok(int64)
} else {
Err(EvalError::TypeMismatch(format!(
"cannot cast {:?} to int64",
arg
)))
}?;
Ok(Value::from(int64 as f32))
}
_ => todo!(),
}
}
}
/// TODO: support more binary functions for more types
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum BinaryFunc {
Eq,
NotEq,
Lt,
Lte,
Gt,
Gte,
AddInt16,
AddInt32,
AddInt64,
AddUInt16,
AddUInt32,
AddUInt64,
AddFloat32,
AddFloat64,
SubInt16,
SubInt32,
SubInt64,
SubUInt16,
SubUInt32,
SubUInt64,
SubFloat32,
SubFloat64,
MulInt16,
MulInt32,
MulInt64,
MulUInt16,
MulUInt32,
MulUInt64,
MulFloat32,
MulFloat64,
DivInt16,
DivInt32,
DivInt64,
DivUInt16,
DivUInt32,
DivUInt64,
DivFloat32,
DivFloat64,
ModInt16,
ModInt32,
ModInt64,
ModUInt16,
ModUInt32,
ModUInt64,
}
impl BinaryFunc {
pub fn eval(
&self,
values: &[Value],
expr1: &ScalarExpr,
expr2: &ScalarExpr,
) -> Result<Value, EvalError> {
let left = expr1.eval(values)?;
let right = expr2.eval(values)?;
match self {
Self::Eq => Ok(Value::from(left == right)),
Self::NotEq => Ok(Value::from(left != right)),
Self::Lt => Ok(Value::from(left < right)),
Self::Lte => Ok(Value::from(left <= right)),
Self::Gt => Ok(Value::from(left > right)),
Self::Gte => Ok(Value::from(left >= right)),
Self::AddInt16 => Ok(add::<i16>(left, right)?),
Self::AddInt32 => Ok(add::<i32>(left, right)?),
Self::AddInt64 => Ok(add::<i64>(left, right)?),
Self::AddUInt16 => Ok(add::<u16>(left, right)?),
Self::AddUInt32 => Ok(add::<u32>(left, right)?),
Self::AddUInt64 => Ok(add::<u64>(left, right)?),
Self::AddFloat32 => Ok(add::<f32>(left, right)?),
Self::AddFloat64 => Ok(add::<f64>(left, right)?),
Self::SubInt16 => Ok(sub::<i16>(left, right)?),
Self::SubInt32 => Ok(sub::<i32>(left, right)?),
Self::SubInt64 => Ok(sub::<i64>(left, right)?),
Self::SubUInt16 => Ok(sub::<u16>(left, right)?),
Self::SubUInt32 => Ok(sub::<u32>(left, right)?),
Self::SubUInt64 => Ok(sub::<u64>(left, right)?),
Self::SubFloat32 => Ok(sub::<f32>(left, right)?),
Self::SubFloat64 => Ok(sub::<f64>(left, right)?),
Self::MulInt16 => Ok(mul::<i16>(left, right)?),
Self::MulInt32 => Ok(mul::<i32>(left, right)?),
Self::MulInt64 => Ok(mul::<i64>(left, right)?),
Self::MulUInt16 => Ok(mul::<u16>(left, right)?),
Self::MulUInt32 => Ok(mul::<u32>(left, right)?),
Self::MulUInt64 => Ok(mul::<u64>(left, right)?),
Self::MulFloat32 => Ok(mul::<f32>(left, right)?),
Self::MulFloat64 => Ok(mul::<f64>(left, right)?),
Self::DivInt16 => Ok(div::<i16>(left, right)?),
Self::DivInt32 => Ok(div::<i32>(left, right)?),
Self::DivInt64 => Ok(div::<i64>(left, right)?),
Self::DivUInt16 => Ok(div::<u16>(left, right)?),
Self::DivUInt32 => Ok(div::<u32>(left, right)?),
Self::DivUInt64 => Ok(div::<u64>(left, right)?),
Self::DivFloat32 => Ok(div::<f32>(left, right)?),
Self::DivFloat64 => Ok(div::<f64>(left, right)?),
Self::ModInt16 => Ok(rem::<i16>(left, right)?),
Self::ModInt32 => Ok(rem::<i32>(left, right)?),
Self::ModInt64 => Ok(rem::<i64>(left, right)?),
Self::ModUInt16 => Ok(rem::<u16>(left, right)?),
Self::ModUInt32 => Ok(rem::<u32>(left, right)?),
Self::ModUInt64 => Ok(rem::<u64>(left, right)?),
_ => todo!(),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize, Hash)]
pub enum VariadicFunc {}
impl VariadicFunc {
pub fn eval(&self, values: &[Value], exprs: &[ScalarExpr]) -> Result<Value, EvalError> {
todo!()
}
}
fn add<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Add<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left + right))
}
fn sub<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Sub<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left - right))
}
fn mul<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Mul<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left * right))
}
fn div<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Div<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left / right))
}
fn rem<T>(left: Value, right: Value) -> Result<Value, EvalError>
where
T: TryFrom<Value> + std::ops::Rem<Output = T>,
<T as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<T>,
{
let left = T::try_from(left).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
let right = T::try_from(right).map_err(|e| EvalError::TypeMismatch(format!("{:?}", e)))?;
Ok(Value::from(left % right))
}
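// Illustrative sketch, not part of the original change set: exercises the
// generic arithmetic helpers above through `BinaryFunc::eval`. It only relies
// on the `Value: From<i64>` / `From<bool>` conversions the match arms above
// already require.
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn binary_func_arithmetic() {
        let values = vec![Value::from(10i64), Value::from(4i64)];
        let lhs = ScalarExpr::Column(0);
        let rhs = ScalarExpr::Column(1);
        assert_eq!(
            BinaryFunc::AddInt64.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(14i64)
        );
        assert_eq!(
            BinaryFunc::DivInt64.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(2i64)
        );
        assert_eq!(
            BinaryFunc::ModInt64.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(2i64)
        );
        assert_eq!(
            BinaryFunc::Gt.eval(&values, &lhs, &rhs).unwrap(),
            Value::from(true)
        );
    }
}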

src/flow/src/expr/id.rs

@@ -0,0 +1,24 @@
use serde::{Deserialize, Serialize};
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
pub enum GlobalId {
/// System namespace.
System(u64),
/// User namespace.
User(u64),
/// Transient namespace.
Transient(u64),
/// Dummy id for query being explained
Explain,
}
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct LocalId(pub(crate) u64);
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Id {
/// An identifier that refers to a local component of a dataflow.
Local(LocalId),
/// An identifier that refers to a global dataflow.
Global(GlobalId),
}

src/flow/src/expr/linear.rs

@@ -0,0 +1,381 @@
use std::collections::{BTreeMap, BTreeSet};
use datatypes::value::Value;
use serde::{Deserialize, Serialize};
use crate::expr::{Id, LocalId, ScalarExpr};
use crate::repr::{self, Diff, Row};
use crate::storage::errors::EvalError;
/// A compound operator that can be applied row-by-row.
///
/// This operator integrates the map, filter, and project operators.
/// It applies a sequences of map expressions, which are allowed to
/// refer to previous expressions, interleaved with predicates which
/// must be satisfied for an output to be produced. If all predicates
/// evaluate to `Datum::True` the data at the identified columns are
/// collected and produced as output in a packed `Row`.
///
/// This operator is a "builder" and its contents may contain expressions
/// that are not yet executable. For example, it may contain temporal
/// expressions in `self.expressions`, even though this is not something
/// we can directly evaluate. The plan creation methods will defensively
/// ensure that the right thing happens.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct MapFilterProject {
/// A sequence of expressions that should be appended to the row.
///
/// Many of these expressions may not be produced in the output,
/// and may only be present as common subexpressions.
pub expressions: Vec<ScalarExpr>,
/// Expressions that must evaluate to `Value::Boolean(true)` for the output
/// row to be produced.
///
/// Each entry is prepended with a column identifier indicating
/// the column *before* which the predicate should first be applied.
/// Most commonly this would be one plus the largest column identifier
/// in the predicate's support, but it could be larger to implement
/// guarded evaluation of predicates.
///
/// This list should be sorted by the first field.
pub predicates: Vec<(usize, ScalarExpr)>,
/// A sequence of column identifiers whose data form the output row.
pub projection: Vec<usize>,
/// The expected number of input columns.
///
/// This is needed to ensure correct identification of newly formed
/// columns in the output.
pub input_arity: usize,
}
impl MapFilterProject {
/// Create a no-op operator for an input of a supplied arity.
pub fn new(input_arity: usize) -> Self {
Self {
expressions: Vec::new(),
predicates: Vec::new(),
projection: (0..input_arity).collect(),
input_arity,
}
}
/// Given two mfps, return an mfp that applies one
/// followed by the other.
/// Note that the arguments are in the opposite order
/// from how function composition is usually written in mathematics.
pub fn compose(before: Self, after: Self) -> Self {
let (m, f, p) = after.into_map_filter_project();
before.map(m).filter(f).project(p)
}
/// True if the operator describes the identity transformation.
pub fn is_identity(&self) -> bool {
self.expressions.is_empty()
&& self.predicates.is_empty()
&& self.projection.len() == self.input_arity
&& self.projection.iter().enumerate().all(|(i, p)| i == *p)
}
/// Retain only the indicated columns in the presented order.
pub fn project<I>(mut self, columns: I) -> Self
where
I: IntoIterator<Item = usize> + std::fmt::Debug,
{
self.projection = columns.into_iter().map(|c| self.projection[c]).collect();
self
}
/// Retain only rows satisfying these predicates.
///
/// This method introduces predicates as eagerly as they can be evaluated,
/// which may not be desired for predicates that may cause exceptions.
/// If fine manipulation is required, the predicates can be added manually.
pub fn filter<I>(mut self, predicates: I) -> Self
where
I: IntoIterator<Item = ScalarExpr>,
{
for mut predicate in predicates {
// Correct column references.
predicate.permute(&self.projection[..]);
// Validate column references.
assert!(predicate
.support()
.into_iter()
.all(|c| c < self.input_arity + self.expressions.len()));
// Insert predicate as eagerly as it can be evaluated:
// just after the largest column in its support is formed.
let max_support = predicate
.support()
.into_iter()
.max()
.map(|c| c + 1)
.unwrap_or(0);
self.predicates.push((max_support, predicate))
}
// Stable sort predicates by position at which they take effect.
// We put literal errors at the end as a stop-gap to avoid erroring
// before we are able to evaluate any predicates that might prevent it.
self.predicates
.sort_by_key(|(position, predicate)| (predicate.is_literal_err(), *position));
self
}
/// Append the result of evaluating expressions to each row.
pub fn map<I>(mut self, expressions: I) -> Self
where
I: IntoIterator<Item = ScalarExpr>,
{
for mut expression in expressions {
// Correct column references.
expression.permute(&self.projection[..]);
// Validate column references.
assert!(expression
.support()
.into_iter()
.all(|c| c < self.input_arity + self.expressions.len()));
// Introduce expression and produce as output.
self.expressions.push(expression);
self.projection
.push(self.input_arity + self.expressions.len() - 1);
}
self
}
/// Like [`MapFilterProject::as_map_filter_project`], but consumes `self` rather than cloning.
pub fn into_map_filter_project(self) -> (Vec<ScalarExpr>, Vec<ScalarExpr>, Vec<usize>) {
let predicates = self
.predicates
.into_iter()
.map(|(_pos, predicate)| predicate)
.collect();
(self.expressions, predicates, self.projection)
}
/// As the arguments to `Map`, `Filter`, and `Project` operators.
///
/// In principle, this operator can be implemented as a sequence of
/// more elemental operators, likely less efficiently.
pub fn as_map_filter_project(&self) -> (Vec<ScalarExpr>, Vec<ScalarExpr>, Vec<usize>) {
self.clone().into_map_filter_project()
}
}
impl MapFilterProject {
pub fn optimize(&mut self) {
// TODO(discord9): optimize later
}
/// Convert the `MapFilterProject` into a staged evaluation plan.
///
/// The main behavior is to extract temporal predicates, which cannot be evaluated
/// using the standard machinery.
pub fn into_plan(self) -> Result<MfpPlan, String> {
MfpPlan::create_from(self)
}
/// Lists input columns whose values are used in outputs.
///
/// It is entirely appropriate to determine the demand of an instance
/// and then both apply a projection to the subject of the instance and
/// `self.permute` this instance.
pub fn demand(&self) -> BTreeSet<usize> {
let mut demanded = BTreeSet::new();
for (_index, pred) in self.predicates.iter() {
demanded.extend(pred.support());
}
demanded.extend(self.projection.iter().cloned());
for index in (0..self.expressions.len()).rev() {
if demanded.contains(&(self.input_arity + index)) {
demanded.extend(self.expressions[index].support());
}
}
demanded.retain(|col| col < &self.input_arity);
demanded
}
/// Update input column references, due to an input projection or permutation.
///
/// The `shuffle` argument remaps expected column identifiers to new locations,
/// with the expectation that `shuffle` describes all input columns, and so the
/// intermediate results will be able to start at position `shuffle.len()`.
///
/// The supplied `shuffle` may not list columns that are not "demanded" by the
/// instance, and so we should ensure that `self` is optimized to not reference
/// columns that are not demanded.
pub fn permute(&mut self, mut shuffle: BTreeMap<usize, usize>, new_input_arity: usize) {
let (mut map, mut filter, mut project) = self.as_map_filter_project();
for index in 0..map.len() {
// Intermediate columns are just shifted.
shuffle.insert(self.input_arity + index, new_input_arity + index);
}
for expr in map.iter_mut() {
expr.permute_map(&shuffle);
}
for pred in filter.iter_mut() {
pred.permute_map(&shuffle);
}
for proj in project.iter_mut() {
assert!(shuffle[proj] < new_input_arity + map.len());
*proj = shuffle[proj];
}
*self = Self::new(new_input_arity)
.map(map)
.filter(filter)
.project(project)
}
}
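// Illustrative sketch, not part of the original change set: the builder-style
// `map`/`filter`/`project` API above, plus the `demand` and `is_identity`
// helpers, on a three-column input.
#[cfg(test)]
mod mfp_builder_example {
    use super::*;
    use crate::expr::BinaryFunc;

    #[test]
    fn build_map_filter_project() {
        // input: (a, b, c); output: (a + b) for rows where c > a
        let mfp = MapFilterProject::new(3)
            .map([ScalarExpr::CallBinary {
                func: BinaryFunc::AddInt64,
                expr1: Box::new(ScalarExpr::Column(0)),
                expr2: Box::new(ScalarExpr::Column(1)),
            }])
            .filter([ScalarExpr::CallBinary {
                func: BinaryFunc::Gt,
                expr1: Box::new(ScalarExpr::Column(2)),
                expr2: Box::new(ScalarExpr::Column(0)),
            }])
            .project([3]);
        assert!(!mfp.is_identity());
        // all three input columns are demanded: 0 and 1 by the map, 2 by the filter
        assert_eq!(mfp.demand(), [0, 1, 2].into_iter().collect());
        assert!(MapFilterProject::new(3).is_identity());
    }
}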
/// A wrapper type which indicates it is safe to simply evaluate all expressions.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct SafeMfpPlan {
pub(crate) mfp: MapFilterProject,
}
impl SafeMfpPlan {
pub fn permute(&mut self, map: BTreeMap<usize, usize>, new_arity: usize) {
self.mfp.permute(map, new_arity);
}
/// Evaluates the linear operator on a supplied list of datums.
///
/// The arguments are the initial values associated with the row,
/// and a row buffer `row_buf` that is reused to assemble the output row.
///
/// An `Ok` result will either be `None` if any predicate did not
/// evaluate to `Value::Boolean(true)`, or the values of the columns listed
/// by `self.projection` if all predicates passed. If an error
/// occurs in the evaluation it is returned as an `Err` variant.
/// As the evaluation exits early with failed predicates, it may
/// miss some errors that would occur later in evaluation.
///
/// The `row` is not cleared first, but emptied if the function
/// returns `Ok(Some(row))`.
#[inline(always)]
pub fn evaluate_into(
&self,
values: &mut Vec<Value>,
row_buf: &mut Row,
) -> Result<Option<Row>, EvalError> {
let passed_predicates = self.evaluate_inner(values)?;
if !passed_predicates {
Ok(None)
} else {
row_buf.clear();
row_buf.extend(self.mfp.projection.iter().map(|c| values[*c].clone()));
Ok(Some(row_buf.clone()))
}
}
/// A version of `evaluate` which produces an iterator over `Value`
/// as output.
///
/// This version can be useful when one wants to capture the resulting
/// values without packing and then unpacking a row.
#[inline(always)]
pub fn evaluate_iter<'a>(
&'a self,
datums: &'a mut Vec<Value>,
) -> Result<Option<impl Iterator<Item = Value> + 'a>, EvalError> {
let passed_predicates = self.evaluate_inner(datums)?;
if !passed_predicates {
Ok(None)
} else {
Ok(Some(
self.mfp.projection.iter().map(move |i| datums[*i].clone()),
))
}
}
/// Populates `datums` with `self.expressions` and tests `self.predicates`.
///
/// This does not apply `self.projection`, which is up to the calling method.
pub fn evaluate_inner(&self, values: &mut Vec<Value>) -> Result<bool, EvalError> {
let mut expression = 0;
for (support, predicate) in self.mfp.predicates.iter() {
while self.mfp.input_arity + expression < *support {
values.push(self.mfp.expressions[expression].eval(&values[..])?);
expression += 1;
}
if predicate.eval(&values[..])? != Value::Boolean(true) {
return Ok(false);
}
}
while expression < self.mfp.expressions.len() {
values.push(self.mfp.expressions[expression].eval(&values[..])?);
expression += 1;
}
Ok(true)
}
}
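// Illustrative sketch, not part of the original change set: drives
// `SafeMfpPlan::evaluate_into` row by row, the way the rendered operators use
// it. Only constructors defined in this crate are used.
#[cfg(test)]
mod safe_mfp_example {
    use super::*;
    use crate::expr::BinaryFunc;

    #[test]
    fn filter_and_project_one_row() {
        // keep rows where col0 > col1, and output (col1, col0)
        let mfp = MapFilterProject::new(2)
            .filter([ScalarExpr::CallBinary {
                func: BinaryFunc::Gt,
                expr1: Box::new(ScalarExpr::Column(0)),
                expr2: Box::new(ScalarExpr::Column(1)),
            }])
            .project([1, 0]);
        let plan = SafeMfpPlan { mfp };
        let mut row_buf = Row::default();

        // predicate passes: the projected row is produced
        let mut values = vec![Value::from(3i64), Value::from(1i64)];
        let out = plan.evaluate_into(&mut values, &mut row_buf).unwrap();
        assert_eq!(
            out,
            Some(Row::pack(vec![Value::from(1i64), Value::from(3i64)]))
        );

        // predicate fails: no row is produced
        let mut values = vec![Value::from(1i64), Value::from(3i64)];
        let out = plan.evaluate_into(&mut values, &mut row_buf).unwrap();
        assert_eq!(out, None);
    }
}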
impl std::ops::Deref for SafeMfpPlan {
type Target = MapFilterProject;
fn deref(&self) -> &Self::Target {
&self.mfp
}
}
/// Predicates partitioned into temporal and non-temporal.
///
/// Temporal predicates require some recognition to determine their
/// structure, and it is best to do that once and re-use the results.
///
/// There are restrictions on the temporal predicates we currently support.
/// They must directly constrain `MzNow` from below or above,
/// by expressions that do not themselves contain `MzNow`.
/// Conjunctions of such constraints are also ok.
#[derive(Clone, Debug, PartialEq)]
pub struct MfpPlan {
/// Normal predicates to evaluate on `&[Value]` and expect `Value::Boolean(true)`.
pub(crate) mfp: SafeMfpPlan,
/// TODO(discord9): impl temporal filter later
/// Expressions that when evaluated lower-bound `MzNow`.
pub(crate) lower_bounds: Vec<ScalarExpr>,
/// Expressions that when evaluated upper-bound `MzNow`.
pub(crate) upper_bounds: Vec<ScalarExpr>,
}
impl MfpPlan {
pub fn create_from(mut mfp: MapFilterProject) -> Result<Self, String> {
Ok(Self {
mfp: SafeMfpPlan { mfp },
lower_bounds: Vec::new(),
upper_bounds: Vec::new(),
})
}
pub fn evaluate<E: From<EvalError>, V: Fn(&repr::Timestamp) -> bool>(
&self,
values: &mut Vec<Value>,
time: repr::Timestamp,
diff: Diff,
valid_time: V,
) -> impl Iterator<Item = Result<(Row, repr::Timestamp, Diff), (E, repr::Timestamp, Diff)>>
{
match self.mfp.evaluate_inner(values) {
Err(e) => {
return Some(Err((e.into(), time, diff)))
.into_iter()
.chain(None.into_iter());
}
Ok(true) => {}
Ok(false) => {
return None.into_iter().chain(None.into_iter());
}
}
// TODO(discord9): Temporal filter
let ret = Row::pack(self.mfp.mfp.projection.iter().map(|c| values[*c].clone()));
Some(Ok((ret, time, diff)))
.into_iter()
.chain(None.into_iter())
}
/// Indicates if the planned `MapFilterProject` emits exactly its inputs as outputs.
pub fn is_identity(&self) -> bool {
self.mfp.mfp.is_identity() && self.lower_bounds.is_empty() && self.upper_bounds.is_empty()
}
}

src/flow/src/expr/mod.rs

@@ -0,0 +1,207 @@
//! Expressions for declaring the dataflow description, which is the last step before building the dataflow
mod func;
mod id;
mod linear;
mod relation;
use std::collections::{BTreeMap, BTreeSet};
use datatypes::prelude::ConcreteDataType;
use datatypes::value::Value;
pub use id::{GlobalId, Id, LocalId};
pub use linear::{MapFilterProject, SafeMfpPlan};
pub(crate) use relation::{AggregateExpr, AggregateFunc, TableFunc};
use serde::{Deserialize, Serialize};
pub(crate) use crate::expr::func::{BinaryFunc, UnaryFunc, VariadicFunc};
use crate::storage::errors::EvalError;
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum ScalarExpr {
/// A column of the input row
Column(usize),
/// A literal value.
Literal(Result<Value, EvalError>, ConcreteDataType),
CallUnary {
func: UnaryFunc,
expr: Box<ScalarExpr>,
},
CallBinary {
func: BinaryFunc,
expr1: Box<ScalarExpr>,
expr2: Box<ScalarExpr>,
},
CallVariadic {
func: VariadicFunc,
exprs: Vec<ScalarExpr>,
},
/// Conditionally evaluated expressions.
///
/// It is important that `then` and `els` are only evaluated if
/// `cond` is true or false, respectively. This is the only way
/// users can guard execution (other logical operators do not
/// short-circuit) and we need to preserve that.
If {
cond: Box<ScalarExpr>,
then: Box<ScalarExpr>,
els: Box<ScalarExpr>,
},
}
impl ScalarExpr {
pub fn eval(&self, values: &[Value]) -> Result<Value, EvalError> {
match self {
ScalarExpr::Column(index) => Ok(values[*index].clone()),
ScalarExpr::Literal(row_res, _ty) => row_res.clone(),
ScalarExpr::CallUnary { func, expr } => func.eval(values, expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => func.eval(values, expr1, expr2),
ScalarExpr::CallVariadic { func, exprs } => func.eval(values, exprs),
ScalarExpr::If { cond, then, els } => match cond.eval(values) {
Ok(Value::Boolean(true)) => then.eval(values),
Ok(Value::Boolean(false)) => els.eval(values),
_ => Err(EvalError::InvalidArgument(
"if condition must be boolean".to_string(),
)),
},
}
}
/// Rewrites column indices with their value in `permutation`.
///
/// This method is applicable even when `permutation` is not a
/// strict permutation, and it only needs to have entries for
/// each column referenced in `self`.
pub fn permute(&mut self, permutation: &[usize]) {
#[allow(deprecated)]
self.visit_mut_post_nolimit(&mut |e| {
if let ScalarExpr::Column(old_i) = e {
*old_i = permutation[*old_i];
}
});
}
/// Rewrites column indices with their value in `permutation`.
///
/// This method is applicable even when `permutation` is not a
/// strict permutation, and it only needs to have entries for
/// each column referenced in `self`.
pub fn permute_map(&mut self, permutation: &BTreeMap<usize, usize>) {
#[allow(deprecated)]
self.visit_mut_post_nolimit(&mut |e| {
if let ScalarExpr::Column(old_i) = e {
*old_i = permutation[old_i];
}
});
}
pub fn support(&self) -> BTreeSet<usize> {
let mut support = BTreeSet::new();
#[allow(deprecated)]
self.visit_post_nolimit(&mut |e| {
if let ScalarExpr::Column(i) = e {
support.insert(*i);
}
});
support
}
pub fn as_literal(&self) -> Option<Result<Value, &EvalError>> {
if let ScalarExpr::Literal(lit, _column_type) = self {
Some(lit.as_ref().map(|row| row.clone()))
} else {
None
}
}
pub fn is_literal(&self) -> bool {
matches!(self, ScalarExpr::Literal(_, _))
}
pub fn is_literal_true(&self) -> bool {
Some(Ok(Value::Boolean(true))) == self.as_literal()
}
pub fn is_literal_false(&self) -> bool {
Some(Ok(Value::Boolean(false))) == self.as_literal()
}
pub fn is_literal_null(&self) -> bool {
Some(Ok(Value::Null)) == self.as_literal()
}
pub fn is_literal_ok(&self) -> bool {
matches!(self, ScalarExpr::Literal(Ok(_), _typ))
}
pub fn is_literal_err(&self) -> bool {
matches!(self, ScalarExpr::Literal(Err(_), _typ))
}
}
impl ScalarExpr {
/// Visit in post-order without a recursion depth limit; may overflow the stack on deep expressions.
fn visit_post_nolimit<F>(&self, f: &mut F)
where
F: FnMut(&Self),
{
self.visit_children(|e| e.visit_post_nolimit(f));
f(self);
}
fn visit_children<F>(&self, mut f: F)
where
F: FnMut(&Self),
{
match self {
ScalarExpr::Column(_) | ScalarExpr::Literal(_, _) => (),
ScalarExpr::CallUnary { func, expr } => f(expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => {
f(expr1);
f(expr2);
}
ScalarExpr::CallVariadic { func, exprs } => {
for expr in exprs {
f(expr);
}
}
ScalarExpr::If { cond, then, els } => {
f(cond);
f(then);
f(els);
}
}
}
fn visit_mut_post_nolimit<F>(&mut self, f: &mut F)
where
F: FnMut(&mut Self),
{
self.visit_mut_children(|e: &mut Self| e.visit_mut_post_nolimit(f));
f(self);
}
fn visit_mut_children<F>(&mut self, mut f: F)
where
F: FnMut(&mut Self),
{
match self {
ScalarExpr::Column(_) | ScalarExpr::Literal(_, _) => (),
ScalarExpr::CallUnary { func, expr } => f(expr),
ScalarExpr::CallBinary { func, expr1, expr2 } => {
f(expr1);
f(expr2);
}
ScalarExpr::CallVariadic { func, exprs } => {
for expr in exprs {
f(expr);
}
}
ScalarExpr::If { cond, then, els } => {
f(cond);
f(then);
f(els);
}
}
}
}
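// Illustrative sketch, not part of the original change set: direct use of
// `ScalarExpr::eval` over a value slice, covering column references, the
// short-circuiting `If` form described above, and `permute`.
#[cfg(test)]
mod scalar_expr_example {
    use super::*;

    #[test]
    fn eval_if_expression_and_permute() {
        let values = vec![
            Value::Boolean(true),
            Value::from(1i64),
            Value::from(2i64),
        ];
        let expr = ScalarExpr::If {
            cond: Box::new(ScalarExpr::Column(0)),
            then: Box::new(ScalarExpr::Column(1)),
            els: Box::new(ScalarExpr::Column(2)),
        };
        // `cond` is true, so only `then` is evaluated
        assert_eq!(expr.eval(&values).unwrap(), Value::from(1i64));

        // column references are rewritten through the permutation slice
        let mut col = ScalarExpr::Column(0);
        col.permute(&[2, 0, 1]);
        assert_eq!(col, ScalarExpr::Column(2));
    }
}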


@@ -0,0 +1,206 @@
use datatypes::prelude::ConcreteDataType;
use datatypes::value::{OrderedF32, OrderedF64, Value};
use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub enum AggregateFunc {
MaxInt16,
MaxInt32,
MaxInt64,
MaxUInt16,
MaxUInt32,
MaxUInt64,
MaxFloat32,
MaxFloat64,
MaxBool,
MaxString,
MaxDate,
MaxTimestamp,
MaxTimestampTz,
MinInt16,
MinInt32,
MinInt64,
MinUInt16,
MinUInt32,
MinUInt64,
MinFloat32,
MinFloat64,
MinBool,
MinString,
MinDate,
MinTimestamp,
MinTimestampTz,
SumInt16,
SumInt32,
SumInt64,
SumUInt16,
SumUInt32,
SumUInt64,
SumFloat32,
SumFloat64,
Count,
Any,
All,
}
impl AggregateFunc {
pub fn eval<I>(&self, values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
// TODO: impl more functions like min/max/sum for Timestamp etc.
match self {
AggregateFunc::MaxInt16 => max_value::<I, i16>(values),
AggregateFunc::MaxInt32 => max_value::<I, i32>(values),
AggregateFunc::MaxInt64 => max_value::<I, i64>(values),
AggregateFunc::MaxUInt16 => max_value::<I, u16>(values),
AggregateFunc::MaxUInt32 => max_value::<I, u32>(values),
AggregateFunc::MaxUInt64 => max_value::<I, u64>(values),
AggregateFunc::MaxFloat32 => max_value::<I, OrderedF32>(values),
AggregateFunc::MaxFloat64 => max_value::<I, OrderedF64>(values),
AggregateFunc::MaxBool => max_value::<I, bool>(values),
AggregateFunc::MaxString => max_string(values),
AggregateFunc::MinInt16 => min_value::<I, i16>(values),
AggregateFunc::MinInt32 => min_value::<I, i32>(values),
AggregateFunc::MinInt64 => min_value::<I, i64>(values),
AggregateFunc::MinUInt16 => min_value::<I, u16>(values),
AggregateFunc::MinUInt32 => min_value::<I, u32>(values),
AggregateFunc::MinUInt64 => min_value::<I, u64>(values),
AggregateFunc::MinFloat32 => min_value::<I, OrderedF32>(values),
AggregateFunc::MinFloat64 => min_value::<I, OrderedF64>(values),
AggregateFunc::MinBool => min_value::<I, bool>(values),
AggregateFunc::MinString => min_string(values),
AggregateFunc::SumInt16 => sum_value::<I, i16, i64>(values),
AggregateFunc::SumInt32 => sum_value::<I, i32, i64>(values),
AggregateFunc::SumInt64 => sum_value::<I, i64, i64>(values),
AggregateFunc::SumUInt16 => sum_value::<I, u16, u64>(values),
AggregateFunc::SumUInt32 => sum_value::<I, u32, u64>(values),
AggregateFunc::SumUInt64 => sum_value::<I, u64, u64>(values),
AggregateFunc::SumFloat32 => sum_value::<I, f32, f32>(values),
AggregateFunc::SumFloat64 => sum_value::<I, f64, f64>(values),
AggregateFunc::Count => count(values),
AggregateFunc::All => all(values),
AggregateFunc::Any => any(values),
_ => todo!(),
}
}
}
fn max_string<I>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
match values.into_iter().filter(|d| !d.is_null()).max_by(|a, b| {
let a = a.as_value_ref();
let a = a.as_string().expect("unexpected type").unwrap();
let b = b.as_value_ref();
let b = b.as_string().expect("unexpected type").unwrap();
a.cmp(b)
}) {
Some(v) => v,
None => Value::Null,
}
}
fn max_value<I, TypedValue>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
TypedValue: TryFrom<Value> + Ord,
<TypedValue as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<Option<TypedValue>>,
{
let x: Option<TypedValue> = values
.into_iter()
.filter(|v| !v.is_null())
.map(|v| TypedValue::try_from(v).expect("unexpected type"))
.max();
x.into()
}
fn min_string<I>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
match values.into_iter().filter(|d| !d.is_null()).min_by(|a, b| {
let a = a.as_value_ref();
let a = a.as_string().expect("unexpected type").unwrap();
let b = b.as_value_ref();
let b = b.as_string().expect("unexpected type").unwrap();
a.cmp(b)
}) {
Some(v) => v,
None => Value::Null,
}
}
fn min_value<I, TypedValue>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
TypedValue: TryFrom<Value> + Ord,
<TypedValue as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<Option<TypedValue>>,
{
let x: Option<TypedValue> = values
.into_iter()
.filter(|v| !v.is_null())
.map(|v| TypedValue::try_from(v).expect("unexpected type"))
.min();
x.into()
}
fn sum_value<I, ValueType, ResultType>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
ValueType: TryFrom<Value>,
<ValueType as TryFrom<Value>>::Error: std::fmt::Debug,
Value: From<Option<ValueType>>,
ResultType: From<ValueType> + std::iter::Sum + Into<Value>,
{
// If no row qualifies, then the result of COUNT is 0 (zero), and the result of any other aggregate function is the null value.
let mut values = values.into_iter().filter(|v| !v.is_null()).peekable();
if values.peek().is_none() {
Value::Null
} else {
let x = values
.map(|v| ResultType::from(ValueType::try_from(v).expect("unexpected type")))
.sum::<ResultType>();
x.into()
}
}
fn count<I>(values: I) -> Value
where
I: IntoIterator<Item = Value>,
{
let x = values.into_iter().filter(|v| !v.is_null()).count() as i64;
Value::from(x)
}
fn any<I>(datums: I) -> Value
where
I: IntoIterator<Item = Value>,
{
datums
.into_iter()
.fold(Value::Boolean(false), |state, next| match (state, next) {
(Value::Boolean(true), _) | (_, Value::Boolean(true)) => Value::Boolean(true),
(Value::Null, _) | (_, Value::Null) => Value::Null,
_ => Value::Boolean(false),
})
}
fn all<I>(datums: I) -> Value
where
I: IntoIterator<Item = Value>,
{
datums
.into_iter()
.fold(Value::Boolean(true), |state, next| match (state, next) {
(Value::Boolean(false), _) | (_, Value::Boolean(false)) => Value::Boolean(false),
(Value::Null, _) | (_, Value::Null) => Value::Null,
_ => Value::Boolean(true),
})
}
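// Illustrative sketch, not part of the original change set: exercises a few of
// the aggregate evaluators above over plain `Value` iterators. It assumes the
// `Value: From<i64>` / `From<Option<i64>>` conversions already required by the
// helpers in this file (i.e. `Value::from(Some(v))` equals `Value::from(v)` and
// `None` maps to `Value::Null`).
#[cfg(test)]
mod example {
    use super::*;

    #[test]
    fn eval_simple_aggregates() {
        let values = vec![Value::from(1i64), Value::Null, Value::from(3i64)];
        // NULLs are ignored by SUM/MAX, and COUNT counts only non-null values
        assert_eq!(
            AggregateFunc::SumInt64.eval(values.clone()),
            Value::from(4i64)
        );
        assert_eq!(
            AggregateFunc::MaxInt64.eval(values.clone()),
            Value::from(3i64)
        );
        assert_eq!(AggregateFunc::Count.eval(values), Value::from(2i64));
        // an all-NULL input yields NULL for MIN
        assert_eq!(AggregateFunc::MinInt64.eval(vec![Value::Null]), Value::Null);
    }
}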


@@ -0,0 +1,22 @@
pub(crate) use func::AggregateFunc;
use serde::{Deserialize, Serialize};
use crate::expr::ScalarExpr;
mod func;
/// A function that might emit multiple output records for one input row
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub enum TableFunc {}
/// Describes an aggregation expression.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct AggregateExpr {
/// Names the aggregation function.
pub func: AggregateFunc,
/// An expression which extracts from each row the input to `func`.
pub expr: ScalarExpr,
/// Should the aggregation be applied only to distinct results in each group.
#[serde(default)]
pub distinct: bool,
}

src/flow/src/lib.rs

@@ -0,0 +1,9 @@
#![allow(unused)]
#![allow(clippy::mutable_key_type)]
mod adapter;
mod compute;
mod expr;
mod repr;
mod storage;
mod util;

src/flow/src/repr/mod.rs

@@ -0,0 +1,62 @@
//! Basically a wrapper around the `datatypes` crate
//! for basic data representation
use std::borrow::Borrow;
use std::slice::SliceIndex;
use datatypes::value::Value;
pub(crate) use relation::{RelationDesc, RelationType};
use serde::{Deserialize, Serialize};
/// System-wide Record count difference type.
pub type Diff = i64;
mod relation;
mod timestamp;
/// A row is a vector of values.
///
/// TODO(discord9): use a more efficient representation
/// i.e. something more compact, like raw u8 bytes of \[tag0, value0, tag1, value1, ...\]
#[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Default, Serialize, Deserialize)]
pub struct Row {
inner: Vec<Value>,
}
impl Row {
pub fn get(&self, idx: usize) -> Option<&Value> {
self.inner.get(idx)
}
pub fn clear(&mut self) {
self.inner.clear();
}
pub fn packer(&mut self) -> &mut Vec<Value> {
self.inner.clear();
&mut self.inner
}
pub fn pack<I>(iter: I) -> Row
where
I: IntoIterator<Item = Value>,
{
Self {
inner: iter.into_iter().collect(),
}
}
pub fn unpack(&self) -> Vec<Value> {
self.inner.clone()
}
pub fn extend<I>(&mut self, iter: I)
where
I: IntoIterator<Item = Value>,
{
self.inner.extend(iter);
}
pub fn into_iter(self) -> impl Iterator<Item = Value> {
self.inner.into_iter()
}
pub fn iter(&self) -> impl Iterator<Item = &Value> {
self.inner.iter()
}
}
/// System-wide default timestamp type
pub type Timestamp = u64;
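// Illustrative sketch, not part of the original change set: basic `Row`
// construction and unpacking as used throughout the compute layer.
#[cfg(test)]
mod row_example {
    use super::*;

    #[test]
    fn pack_and_unpack() {
        let row = Row::pack(vec![Value::from(1i64), Value::from(2i64)]);
        assert_eq!(row.get(0), Some(&Value::from(1i64)));
        assert_eq!(row.unpack(), vec![Value::from(1i64), Value::from(2i64)]);
        assert_eq!(row.iter().count(), 2);

        // `packer` clears the row and hands back the inner buffer for reuse
        let mut row = row;
        row.packer().push(Value::from(3i64));
        assert_eq!(row.unpack(), vec![Value::from(3i64)]);
    }
}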


@@ -0,0 +1,342 @@
use datatypes::prelude::ConcreteDataType;
use serde::{Deserialize, Serialize};
/// The type of a relation.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub struct RelationType {
/// The type for each column, in order.
pub column_types: Vec<ColumnType>,
/// Sets of indices that are "keys" for the collection.
///
/// Each element in this list is a set of column indices, each with the
/// property that the collection contains at most one record with each
/// distinct set of values for each column. Alternately, for a specific set
/// of values assigned to these columns there is at most one record.
///
/// A collection can contain multiple sets of keys, although it is common to
/// have either zero or one sets of key indices.
#[serde(default)]
pub keys: Vec<Vec<usize>>,
}
impl RelationType {
/// Constructs a `RelationType` representing the relation with no columns and
/// no keys.
pub fn empty() -> Self {
RelationType::new(vec![])
}
/// Constructs a new `RelationType` from specified column types.
///
/// The `RelationType` will have no keys.
pub fn new(column_types: Vec<ColumnType>) -> Self {
RelationType {
column_types,
keys: Vec::new(),
}
}
/// Adds a new key for the relation.
pub fn with_key(mut self, mut indices: Vec<usize>) -> Self {
indices.sort_unstable();
if !self.keys.contains(&indices) {
self.keys.push(indices);
}
self
}
pub fn with_keys(mut self, keys: Vec<Vec<usize>>) -> Self {
for key in keys {
self = self.with_key(key)
}
self
}
/// Computes the number of columns in the relation.
pub fn arity(&self) -> usize {
self.column_types.len()
}
/// Gets the index of the columns used when creating a default index.
pub fn default_key(&self) -> Vec<usize> {
if let Some(key) = self.keys.first() {
if key.is_empty() {
(0..self.column_types.len()).collect()
} else {
key.clone()
}
} else {
(0..self.column_types.len()).collect()
}
}
/// True if any collection described by `self` could safely be described by `other`.
///
/// In practice this means checking that the scalar types match exactly, and that the
/// nullability of `self` is at least as strict as `other`, and that all keys of `other`
/// contain some key of `self` (as a set of key columns is less strict than any subset).
pub fn subtypes(&self, other: &RelationType) -> bool {
let all_keys = other.keys.iter().all(|key1| {
self.keys
.iter()
.any(|key2| key1.iter().all(|k| key2.contains(k)))
});
if !all_keys {
return false;
}
if self.column_types.len() != other.column_types.len() {
return false;
}
for (col1, col2) in self.column_types.iter().zip(other.column_types.iter()) {
if col1.nullable && !col2.nullable {
return false;
}
if col1.scalar_type != col2.scalar_type {
return false;
}
}
true
}
}
/// The type of a `Value`
///
/// [`ColumnType`] bundles information about the scalar type of a datum (e.g.,
/// Int32 or String) with its nullability.
///
/// To construct a column type, either initialize the struct directly, or
/// use the [`ScalarType::nullable`] method.
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub struct ColumnType {
/// The underlying scalar type (e.g., Int32 or String) of this column.
pub scalar_type: ConcreteDataType,
/// Whether this datum can be null.
#[serde(default = "return_true")]
pub nullable: bool,
}
/// This method exists solely for the purpose of making ColumnType nullable by
/// default in unit tests. The default value of a bool is false, and the only
/// way to make an object take on any other value by default is to pass it a
/// function that returns the desired default value. See
/// <https://github.com/serde-rs/serde/issues/1030>
#[inline(always)]
fn return_true() -> bool {
true
}
/// A description of the shape of a relation.
///
/// It bundles a [`RelationType`] with the name of each column in the relation.
/// Individual column names are optional.
///
/// # Examples
///
/// A `RelationDesc` is typically constructed via its builder API:
///
/// ```
/// use mz_repr::{ColumnType, RelationDesc, ScalarType};
///
/// let desc = RelationDesc::empty()
/// .with_column("id", ScalarType::Int64.nullable(false))
/// .with_column("price", ScalarType::Float64.nullable(true));
/// ```
///
/// In more complicated cases, like when constructing a `RelationDesc` in
/// response to user input, it may be more convenient to construct a relation
/// type first, and imbue it with column names to form a `RelationDesc` later:
///
/// ```
/// use mz_repr::RelationDesc;
///
/// # fn plan_query(_: &str) -> mz_repr::RelationType { mz_repr::RelationType::new(vec![]) }
/// let relation_type = plan_query("SELECT * FROM table");
/// let names = (0..relation_type.arity()).map(|i| match i {
/// 0 => "first",
/// 1 => "second",
/// _ => "unknown",
/// });
/// let desc = RelationDesc::new(relation_type, names);
/// ```
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, Hash)]
pub struct RelationDesc {
typ: RelationType,
names: Vec<ColumnName>,
}
impl RelationDesc {
/// Constructs a new `RelationDesc` that represents the empty relation
/// with no columns and no keys.
pub fn empty() -> Self {
RelationDesc {
typ: RelationType::empty(),
names: vec![],
}
}
/// Constructs a new `RelationDesc` from a `RelationType` and an iterator
/// over column names.
///
/// # Panics
///
/// Panics if the arity of the `RelationType` is not equal to the number of
/// items in `names`.
pub fn new<I, N>(typ: RelationType, names: I) -> Self
where
I: IntoIterator<Item = N>,
N: Into<ColumnName>,
{
let names: Vec<_> = names.into_iter().map(|name| name.into()).collect();
assert_eq!(typ.column_types.len(), names.len());
RelationDesc { typ, names }
}
pub fn from_names_and_types<I, T, N>(iter: I) -> Self
where
I: IntoIterator<Item = (N, T)>,
T: Into<ColumnType>,
N: Into<ColumnName>,
{
let (names, types): (Vec<_>, Vec<_>) = iter.into_iter().unzip();
let types = types.into_iter().map(Into::into).collect();
let typ = RelationType::new(types);
Self::new(typ, names)
}
/// Concatenates a `RelationDesc` onto the end of this `RelationDesc`.
pub fn concat(mut self, other: Self) -> Self {
let self_len = self.typ.column_types.len();
self.names.extend(other.names);
self.typ.column_types.extend(other.typ.column_types);
for k in other.typ.keys {
let k = k.into_iter().map(|idx| idx + self_len).collect();
self = self.with_key(k);
}
self
}
/// Appends a column with the specified name and type.
pub fn with_column<N>(mut self, name: N, column_type: ColumnType) -> Self
where
N: Into<ColumnName>,
{
self.typ.column_types.push(column_type);
self.names.push(name.into());
self
}
/// Adds a new key for the relation.
pub fn with_key(mut self, indices: Vec<usize>) -> Self {
self.typ = self.typ.with_key(indices);
self
}
/// Drops all existing keys.
pub fn without_keys(mut self) -> Self {
self.typ.keys.clear();
self
}
/// Builds a new relation description with the column names replaced with
/// new names.
///
/// # Panics
///
/// Panics if the arity of the relation type does not match the number of
/// items in `names`.
pub fn with_names<I, N>(self, names: I) -> Self
where
I: IntoIterator<Item = N>,
N: Into<ColumnName>,
{
Self::new(self.typ, names)
}
/// Computes the number of columns in the relation.
pub fn arity(&self) -> usize {
self.typ.arity()
}
/// Returns the relation type underlying this relation description.
pub fn typ(&self) -> &RelationType {
&self.typ
}
/// Returns an iterator over the columns in this relation.
pub fn iter(&self) -> impl Iterator<Item = (&ColumnName, &ColumnType)> {
self.iter_names().zip(self.iter_types())
}
/// Returns an iterator over the types of the columns in this relation.
pub fn iter_types(&self) -> impl Iterator<Item = &ColumnType> {
self.typ.column_types.iter()
}
/// Returns an iterator over the names of the columns in this relation.
pub fn iter_names(&self) -> impl Iterator<Item = &ColumnName> {
self.names.iter()
}
/// Finds a column by name.
///
/// Returns the index and type of the column named `name`. If no column with
/// the specified name exists, returns `None`. If multiple columns have the
/// specified name, the leftmost column is returned.
pub fn get_by_name(&self, name: &ColumnName) -> Option<(usize, &ColumnType)> {
self.iter_names()
.position(|n| n == name)
.map(|i| (i, &self.typ.column_types[i]))
}
/// Gets the name of the `i`th column.
///
/// # Panics
///
/// Panics if `i` is not a valid column index.
pub fn get_name(&self, i: usize) -> &ColumnName {
&self.names[i]
}
/// Mutably gets the name of the `i`th column.
///
/// # Panics
///
/// Panics if `i` is not a valid column index.
pub fn get_name_mut(&mut self, i: usize) -> &mut ColumnName {
&mut self.names[i]
}
/// Gets the name of the `i`th column if that column name is unambiguous.
///
/// If at least one other column has the same name as the `i`th column,
/// returns `None`. If the `i`th column has no name, returns `None`.
///
/// # Panics
///
/// Panics if `i` is not a valid column index.
pub fn get_unambiguous_name(&self, i: usize) -> Option<&ColumnName> {
let name = &self.names[i];
if self.iter_names().filter(|n| *n == name).count() == 1 {
Some(name)
} else {
None
}
}
}
/// The name of a column in a [`RelationDesc`].
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize, Hash)]
pub struct ColumnName(pub(crate) String);
impl ColumnName {
/// Returns this column name as a `str`.
pub fn as_str(&self) -> &str {
&self.0
}
/// Returns a mutable reference to the string underlying this column name.
pub fn as_mut_str(&mut self) -> &mut String {
&mut self.0
}
}
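A minimal sketch of how the read-side API above composes, assuming it lives in the same module as `RelationDesc`; the `describe_column` helper is hypothetical and only for illustration.
fn describe_column(desc: &RelationDesc, name: &ColumnName) -> Option<String> {
    // Leftmost match wins, per `get_by_name`.
    let (idx, _ty) = desc.get_by_name(name)?;
    // `None` here means another column shares the same name.
    let unambiguous = desc.get_unambiguous_name(idx).is_some();
    Some(format!(
        "column {:?} at index {idx} of {} (unambiguous: {unambiguous})",
        name.as_str(),
        desc.arity(),
    ))
}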


@@ -0,0 +1 @@


@@ -0,0 +1,28 @@
use serde::{Deserialize, Serialize};
// TODO(discord9): more error types
#[derive(Ord, PartialOrd, Clone, Debug, Eq, Deserialize, Serialize, PartialEq, Hash)]
pub enum DataflowError {
EvalError(Box<EvalError>),
}
impl From<EvalError> for DataflowError {
fn from(e: EvalError) -> Self {
DataflowError::EvalError(Box::new(e))
}
}
#[derive(Ord, PartialOrd, Clone, Debug, Eq, Deserialize, Serialize, PartialEq, Hash)]
pub enum EvalError {
DivisionByZero,
TypeMismatch(String),
InvalidArgument(String),
Internal(String),
}
// Checks that `DataflowError` satisfies `ExchangeData`, so it can be exchanged
// between timely workers.
#[test]
fn tell_goal() {
use differential_dataflow::ExchangeData;
fn a<T: ExchangeData>(_: T) {}
a(DataflowError::from(EvalError::DivisionByZero));
}
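A minimal sketch (an assumed helper, not part of the diff) of how the `From<EvalError>` impl above lets evaluation code surface failures as `DataflowError` via `.into()`:
fn checked_div(a: i64, b: i64) -> Result<i64, DataflowError> {
    if b == 0 {
        // Converted into `DataflowError::EvalError(Box::new(..))` by the From impl.
        return Err(EvalError::DivisionByZero.into());
    }
    Ok(a / b)
}
#[test]
fn division_by_zero_becomes_dataflow_error() {
    assert!(matches!(
        checked_div(1, 0),
        Err(DataflowError::EvalError(e)) if *e == EvalError::DivisionByZero
    ));
}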


@@ -0,0 +1,4 @@
//! TODO: Storage layer: wrap gRPC write requests to provide a definite collection for stream processing, send read requests when random access is needed,
//! and store the results of stream processing.
pub(crate) mod errors;


src/flow/src/util/buffer.rs (new file, 150 lines)

@@ -0,0 +1,150 @@
use differential_dataflow::consolidation::consolidate_updates;
use differential_dataflow::difference::Semigroup;
use differential_dataflow::Data;
use timely::communication::Push;
use timely::dataflow::channels::Bundle;
use timely::dataflow::operators::generic::OutputHandle;
use timely::dataflow::operators::{Capability, InputCapability};
use timely::progress::Timestamp;
/// A buffer that consolidates updates
///
/// The buffer wraps an [OutputHandle] and consolidates the elements pushed to it. It is
/// backed by a capacity-limited buffer, so compaction is only effective while the number of
/// unique (data, time) pairs stays below half of the buffer's capacity.
///
/// A capability is retained whenever the current time changes, so the buffer can flush on drop
/// or when the time changes again.
///
/// The buffer is filled with updates until it reaches its capacity. At this point, the updates are
/// consolidated to free up space. This process repeats until consolidation recovers less than
/// half of the buffer's capacity, at which point the buffer is shipped.
///
/// The buffer retains a capability to send data on flush. It will flush all data once dropped, if
/// time changes, or if the buffer capacity is reached.
pub struct ConsolidateBuffer<'a, 'b, T, D: Data, R: Semigroup, P>
where
P: Push<Bundle<T, (D, T, R)>> + 'a,
T: Data + Timestamp + 'a,
D: 'a,
{
// a buffer for records, to send at self.cap
// Invariant: Buffer only contains data if cap is Some.
buffer: Vec<(D, T, R)>,
output_handle: &'b mut OutputHandle<'a, T, (D, T, R), P>,
cap: Option<Capability<T>>,
port: usize,
previous_len: usize,
}
impl<'a, 'b, T, D: Data, R: Semigroup, P> ConsolidateBuffer<'a, 'b, T, D, R, P>
where
T: Data + Timestamp + 'a,
P: Push<Bundle<T, (D, T, R)>> + 'a,
{
/// Creates a new [ConsolidateBuffer] wrapping the provided output handle.
///
/// * `output_handle`: The output to send data to.
/// * `port`: The output port to retain capabilities for.
pub fn new(output_handle: &'b mut OutputHandle<'a, T, (D, T, R), P>, port: usize) -> Self {
Self {
output_handle,
port,
cap: None,
buffer: Vec::with_capacity(::timely::container::buffer::default_capacity::<(D, T, R)>()),
previous_len: 0,
}
}
#[inline]
/// Provides an iterator of elements to the buffer
pub fn give_iterator<I: Iterator<Item = (D, T, R)>>(
&mut self,
cap: &InputCapability<T>,
iter: I,
) {
for item in iter {
self.give(cap, item);
}
}
/// Give an element to the buffer
pub fn give(&mut self, cap: &InputCapability<T>, data: (D, T, R)) {
// Retain a cap for the current time, which will be used on flush.
if self.cap.as_ref().map_or(true, |t| t.time() != cap.time()) {
// Flush on capability change
self.flush();
// Retain capability for the specified output port.
self.cap = Some(cap.delayed_for_output(cap.time(), self.port));
}
self.give_internal(data);
}
/// Give an element to the buffer, using a pre-fabricated capability. Note that the capability
/// must be valid for the associated output.
pub fn give_at(&mut self, cap: &Capability<T>, data: (D, T, R)) {
// Retain a cap for the current time, which will be used on flush.
if self.cap.as_ref().map_or(true, |t| t.time() != cap.time()) {
// Flush on capability change
self.flush();
// Retain capability.
self.cap = Some(cap.clone());
}
self.give_internal(data);
}
/// Give an element and possibly flush the buffer. Note that this needs to have access
/// to a capability, which the public functions ensure.
fn give_internal(&mut self, data: (D, T, R)) {
self.buffer.push(data);
// Limit, if possible, the lifetime of the allocations for data
// and consolidate smaller buffers if we're in the lucky case
// of a small domain for D
if self.buffer.len() >= 2 * self.previous_len {
// Consolidate while the consolidation frees at least half the buffer
consolidate_updates(&mut self.buffer);
if self.buffer.len() > self.buffer.capacity() / 2 {
self.flush();
} else {
self.previous_len = self.buffer.len();
}
// At this point, it is an invariant across give calls that self.previous_len
// will be in the interval [0, self.buffer.capacity() / 2]. So, we will enter
// this if-statement block again when self.buffer.len() == self.buffer.capacity()
// or earlier. If consolidation is not effective to keep self.buffer.len()
// below half capacity, then flushing when more than half-full will
// maintain the invariant.
}
}
/// Flush the internal buffer to the underlying session
pub fn flush(&mut self) {
if let Some(cap) = &self.cap {
self.output_handle.session(cap).give_vec(&mut self.buffer);
// Ensure that the capacity is at least equal to the default in case
// it was reduced by give_vec. Note that we cannot rely here on give_vec
// returning us a buffer with zero capacity.
if self.buffer.capacity() < ::timely::container::buffer::default_capacity::<(D, T, R)>()
{
let to_reserve = ::timely::container::buffer::default_capacity::<(D, T, R)>()
- self.buffer.capacity();
self.buffer.reserve_exact(to_reserve);
}
self.previous_len = 0;
}
}
}
impl<'a, 'b, T, D: Data, R: Semigroup, P> Drop for ConsolidateBuffer<'a, 'b, T, D, R, P>
where
P: Push<Bundle<T, (D, T, R)>> + 'a,
T: Data + Timestamp + 'a,
D: 'a,
{
fn drop(&mut self) {
self.flush();
}
}
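For intuition about what the consolidation step does to the buffered `(data, time, diff)` triples before they are shipped, a small self-contained check; plain `&str`/`u64`/`i64` stand in for `D`/`T`/`R` here:
#[test]
fn consolidation_merges_and_cancels_updates() {
    use differential_dataflow::consolidation::consolidate_updates;

    let mut updates = vec![("a", 0u64, 1i64), ("a", 0, 1), ("b", 0, 1), ("b", 0, -1)];
    consolidate_updates(&mut updates);
    // Equal (data, time) pairs have their diffs summed; entries that cancel to zero are dropped.
    assert_eq!(updates, vec![("a", 0, 2)]);
}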

src/flow/src/util/mod.rs (new file, 7 lines)

@@ -0,0 +1,7 @@
//! Utilities, including extensions to differential dataflow for error handling, etc.
mod buffer;
mod operator;
mod reduce;
pub use operator::CollectionExt;
pub use reduce::ReduceExt;


@@ -0,0 +1,257 @@
use differential_dataflow::difference::{Multiply, Semigroup};
use differential_dataflow::lattice::Lattice;
use differential_dataflow::operators::arrange::Arrange;
use differential_dataflow::trace::{Batch, Trace, TraceReader};
use differential_dataflow::{AsCollection, Collection};
use timely::dataflow::channels::pact::{Exchange, ParallelizationContract, Pipeline};
use timely::dataflow::channels::pushers::Tee;
use timely::dataflow::operators::generic::builder_rc::OperatorBuilder as OperatorBuilderRc;
use timely::dataflow::operators::generic::operator::{self, Operator};
use timely::dataflow::operators::generic::{InputHandle, OperatorInfo, OutputHandle};
use timely::dataflow::operators::Capability;
use timely::dataflow::{Scope, Stream};
use timely::{Data, ExchangeData};
use crate::util::buffer::ConsolidateBuffer;
pub trait StreamExt<G, D1>
where
D1: Data,
G: Scope,
{
/// Like `timely::dataflow::operators::generic::operator::Operator::unary`,
/// but the logic function can handle failures.
///
/// Creates a new dataflow operator that partitions its input stream by a
/// parallelization strategy `pact` and repeatedly invokes `logic`, the
/// function returned by the function passed as `constructor`. The `logic`
/// function can read from the input stream and write to either of two output
/// streams, where the first output stream represents successful
/// computations and the second output stream represents failed
/// computations.
fn unary_fallible<D2, E, B, P>(
&self,
pact: P,
name: &str,
constructor: B,
) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
B: FnOnce(
Capability<G::Timestamp>,
OperatorInfo,
) -> Box<
dyn FnMut(
&mut InputHandle<G::Timestamp, D1, P::Puller>,
&mut OutputHandle<G::Timestamp, D2, Tee<G::Timestamp, D2>>,
&mut OutputHandle<G::Timestamp, E, Tee<G::Timestamp, E>>,
) + 'static,
>,
P: ParallelizationContract<G::Timestamp, D1>;
/// Like [`timely::dataflow::operators::map::Map::flat_map`], but `logic`
/// is allowed to fail. The first returned stream will contain the
/// successful applications of `logic`, while the second returned stream
/// will contain the failed applications.
fn flat_map_fallible<D2, E, I, L>(&self, name: &str, logic: L) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static;
}
/// Extension methods for differential [`Collection`]s.
pub trait CollectionExt<G, D1, R>
where
G: Scope,
R: Semigroup,
{
/// Creates a new empty collection in `scope`.
fn empty(scope: &G) -> Collection<G, D1, R>;
/// Like [`Collection::map`], but `logic` is allowed to fail. The first
/// returned collection will contain successful applications of `logic`,
/// while the second returned collection will contain the failed
/// applications.
fn map_fallible<D2, E, L>(
&self,
name: &str,
mut logic: L,
) -> (Collection<G, D2, R>, Collection<G, E, R>)
where
D2: Data,
E: Data,
L: FnMut(D1) -> Result<D2, E> + 'static,
{
self.flat_map_fallible(name, move |record| Some(logic(record)))
}
/// Like [`Collection::flat_map`], but `logic` is allowed to fail. The first
/// returned collection will contain the successful applications of `logic`,
/// while the second returned collection will contain the failed
/// applications.
fn flat_map_fallible<D2, E, I, L>(
&self,
name: &str,
logic: L,
) -> (Collection<G, D2, R>, Collection<G, E, R>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static;
/// Replaces each record with another, with a new difference type.
///
/// This method is most commonly used to take records containing aggregatable data (e.g. numbers to be summed)
/// and move the data into the difference component. This will allow differential dataflow to update in-place.
fn explode_one<D2, R2, L>(&self, logic: L) -> Collection<G, D2, <R2 as Multiply<R>>::Output>
where
D2: differential_dataflow::Data,
R2: Semigroup + Multiply<R>,
<R2 as Multiply<R>>::Output: Data + Semigroup,
L: FnMut(D1) -> (D2, R2) + 'static,
G::Timestamp: Lattice;
}
impl<G, D1> StreamExt<G, D1> for Stream<G, D1>
where
D1: Data,
G: Scope,
{
fn unary_fallible<D2, E, B, P>(
&self,
pact: P,
name: &str,
constructor: B,
) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
B: FnOnce(
Capability<G::Timestamp>,
OperatorInfo,
) -> Box<
dyn FnMut(
&mut InputHandle<G::Timestamp, D1, P::Puller>,
&mut OutputHandle<G::Timestamp, D2, Tee<G::Timestamp, D2>>,
&mut OutputHandle<G::Timestamp, E, Tee<G::Timestamp, E>>,
) + 'static,
>,
P: ParallelizationContract<G::Timestamp, D1>,
{
let mut builder = OperatorBuilderRc::new(name.into(), self.scope());
builder.set_notify(false);
let operator_info = builder.operator_info();
let mut input = builder.new_input(self, pact);
let (mut ok_output, ok_stream) = builder.new_output();
let (mut err_output, err_stream) = builder.new_output();
builder.build(move |mut capabilities| {
// `capabilities` should be a single-element vector.
let capability = capabilities.pop().unwrap();
let mut logic = constructor(capability, operator_info);
move |_frontiers| {
let mut ok_output_handle = ok_output.activate();
let mut err_output_handle = err_output.activate();
logic(&mut input, &mut ok_output_handle, &mut err_output_handle);
}
});
(ok_stream, err_stream)
}
#[allow(clippy::redundant_closure)]
fn flat_map_fallible<D2, E, I, L>(
&self,
name: &str,
mut logic: L,
) -> (Stream<G, D2>, Stream<G, E>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static,
{
let mut storage = Vec::new();
self.unary_fallible(Pipeline, name, move |_, _| {
Box::new(move |input, ok_output, err_output| {
input.for_each(|time, data| {
let mut ok_session = ok_output.session(&time);
let mut err_session = err_output.session(&time);
data.swap(&mut storage);
for r in storage.drain(..).flat_map(|d1| logic(d1)) {
match r {
Ok(d2) => ok_session.give(d2),
Err(e) => err_session.give(e),
}
}
})
})
})
}
}
impl<G, D1, R> CollectionExt<G, D1, R> for Collection<G, D1, R>
where
G: Scope,
G::Timestamp: Data,
D1: Data,
R: Semigroup,
{
fn empty(scope: &G) -> Collection<G, D1, R> {
operator::empty(scope).as_collection()
}
fn flat_map_fallible<D2, E, I, L>(
&self,
name: &str,
mut logic: L,
) -> (Collection<G, D2, R>, Collection<G, E, R>)
where
D2: Data,
E: Data,
I: IntoIterator<Item = Result<D2, E>>,
L: FnMut(D1) -> I + 'static,
{
let (ok_stream, err_stream) = self.inner.flat_map_fallible(name, move |(d1, t, r)| {
logic(d1).into_iter().map(move |res| match res {
Ok(d2) => Ok((d2, t.clone(), r.clone())),
Err(e) => Err((e, t.clone(), r.clone())),
})
});
(ok_stream.as_collection(), err_stream.as_collection())
}
fn explode_one<D2, R2, L>(&self, mut logic: L) -> Collection<G, D2, <R2 as Multiply<R>>::Output>
where
D2: differential_dataflow::Data,
R2: Semigroup + Multiply<R>,
<R2 as Multiply<R>>::Output: Data + Semigroup,
L: FnMut(D1) -> (D2, R2) + 'static,
G::Timestamp: Lattice,
{
self.inner
.unary(Pipeline, "ExplodeOne", move |_, _| {
let mut buffer = Vec::new();
move |input, output| {
let mut out = ConsolidateBuffer::new(output, 0);
input.for_each(|time, data| {
data.swap(&mut buffer);
out.give_iterator(
&time,
buffer.drain(..).map(|(x, t, d)| {
let (x, d2) = logic(x);
(x, t, d2.multiply(&d))
}),
);
});
}
})
.as_collection()
}
}
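A hedged usage sketch of `map_fallible`: split a collection of strings into parsed integers and parse-error messages. It assumes the `CollectionExt` trait above is in scope via the `util` module re-export; the `parse_ints` function itself is hypothetical.
use differential_dataflow::Collection;
use timely::dataflow::Scope;

use crate::util::CollectionExt;

fn parse_ints<G: Scope>(
    input: &Collection<G, String, isize>,
) -> (Collection<G, i64, isize>, Collection<G, String, isize>) {
    // Successful parses flow into the first collection, error strings into the second.
    input.map_fallible("ParseInts", |s| {
        s.parse::<i64>()
            .map_err(|err| format!("failed to parse {s:?}: {err}"))
    })
}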


@@ -0,0 +1,68 @@
use differential_dataflow::difference::{Abelian, Semigroup};
use differential_dataflow::lattice::Lattice;
use differential_dataflow::operators::arrange::{Arranged, TraceAgent};
use differential_dataflow::operators::reduce::ReduceCore;
use differential_dataflow::trace::{Batch, Trace, TraceReader};
use differential_dataflow::Data;
use timely::dataflow::Scope;
/// Extension trait for `ReduceCore`, currently providing a reduction based
/// on an operator-pair approach.
pub trait ReduceExt<G: Scope, K: Data, V: Data, R: Semigroup>
where
G::Timestamp: Lattice + Ord,
{
/// This method produces a reduction pair based on the same input arrangement. Each reduction
/// in the pair operates with its own logic and the two output arrangements from the reductions
/// are produced as a result. The method is useful for reductions that need to present different
/// output views on the same input data. An example is producing an error-free reduction output
/// along with a separate error output indicating when the error-free output is valid.
fn reduce_pair<L1, T1, L2, T2>(
&self,
name1: &str,
name2: &str,
logic1: L1,
logic2: L2,
) -> (Arranged<G, TraceAgent<T1>>, Arranged<G, TraceAgent<T2>>)
where
T1: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T1::Val: Data,
T1::R: Abelian,
T1::Batch: Batch,
L1: FnMut(&K, &[(&V, R)], &mut Vec<(T1::Val, T1::R)>) + 'static,
T2: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T2::Val: Data,
T2::R: Abelian,
T2::Batch: Batch,
L2: FnMut(&K, &[(&V, R)], &mut Vec<(T2::Val, T2::R)>) + 'static;
}
impl<G: Scope, K: Data, V: Data, Tr, R: Semigroup> ReduceExt<G, K, V, R> for Arranged<G, Tr>
where
G::Timestamp: Lattice + Ord,
Tr: TraceReader<Key = K, Val = V, Time = G::Timestamp, R = R> + Clone + 'static,
{
fn reduce_pair<L1, T1, L2, T2>(
&self,
name1: &str,
name2: &str,
logic1: L1,
logic2: L2,
) -> (Arranged<G, TraceAgent<T1>>, Arranged<G, TraceAgent<T2>>)
where
T1: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T1::Val: Data,
T1::R: Abelian,
T1::Batch: Batch,
L1: FnMut(&K, &[(&V, R)], &mut Vec<(T1::Val, T1::R)>) + 'static,
T2: Trace + TraceReader<Key = K, Time = G::Timestamp> + 'static,
T2::Val: Data,
T2::R: Abelian,
T2::Batch: Batch,
L2: FnMut(&K, &[(&V, R)], &mut Vec<(T2::Val, T2::R)>) + 'static,
{
let arranged1 = self.reduce_abelian::<L1, T1>(name1, logic1);
let arranged2 = self.reduce_abelian::<L2, T2>(name2, logic2);
(arranged1, arranged2)
}
}


@@ -39,7 +39,7 @@ datatypes = { workspace = true }
file-table-engine = { workspace = true }
futures = "0.3"
futures-util.workspace = true
humantime-serde = "1.1"
humantime-serde.workspace = true
itertools.workspace = true
meta-client = { workspace = true }
# Although it is not used, please do not delete it.


@@ -38,7 +38,6 @@ use common_meta::key::table_info::TableInfoKey;
use common_meta::key::table_name::TableNameKey;
use common_meta::key::{TableMetaKey, TableMetadataManagerRef};
use common_meta::kv_backend::KvBackendRef;
use common_meta::table_name::TableName;
use common_telemetry::debug;
use futures_util::TryStreamExt;
use partition::manager::PartitionRuleManagerRef;
@@ -417,12 +416,7 @@ impl CatalogManager for FrontendCatalogManager {
.try_into()
.context(catalog_err::InvalidTableInfoInCatalogSnafu)?,
);
let table = Arc::new(DistTable::new(
TableName::new(catalog, schema, table_name),
table_info,
Arc::new(self.clone()),
));
Ok(Some(table))
Ok(Some(DistTable::table(table_info)))
}
fn as_any(&self) -> &dyn Any {


@@ -513,7 +513,7 @@ pub enum Error {
},
#[snafu(display("Failed to read record batch, source: {}", source))]
ReadRecordBatch {
ReadDfRecordBatch {
source: datafusion::error::DataFusionError,
location: Location,
},
@@ -600,6 +600,18 @@ pub enum Error {
#[snafu(display("Empty data: {}", msg))]
EmptyData { msg: String, location: Location },
#[snafu(display("Failed to read record batch, source: {}", source))]
ReadRecordBatch {
source: common_recordbatch::error::Error,
location: Location,
},
#[snafu(display("Failed to build column vectors, source: {}", source))]
BuildColumnVectors {
source: common_recordbatch::error::Error,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -678,7 +690,7 @@ impl ErrorExt for Error {
Error::JoinTask { .. }
| Error::BuildParquetRecordBatchStream { .. }
| Error::ReadRecordBatch { .. }
| Error::ReadDfRecordBatch { .. }
| Error::BuildFileStream { .. }
| Error::WriteStreamToFile { .. }
| Error::Unexpected { .. } => StatusCode::Unexpected,
@@ -731,6 +743,10 @@ impl ErrorExt for Error {
Error::WriteParquet { source, .. } => source.status_code(),
Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments,
Error::ReadRecordBatch { source, .. } | Error::BuildColumnVectors { source, .. } => {
source.status_code()
}
}
}


@@ -21,7 +21,7 @@ use servers::Mode;
use crate::service_config::{
DatanodeOptions, GrpcOptions, InfluxdbOptions, MysqlOptions, OpentsdbOptions, OtlpOptions,
PostgresOptions, PromStoreOptions, PrometheusOptions,
PostgresOptions, PromStoreOptions,
};
#[derive(Clone, Debug, Serialize, Deserialize)]
@@ -37,7 +37,6 @@ pub struct FrontendOptions {
pub opentsdb_options: Option<OpentsdbOptions>,
pub influxdb_options: Option<InfluxdbOptions>,
pub prom_store_options: Option<PromStoreOptions>,
pub prometheus_options: Option<PrometheusOptions>,
pub otlp_options: Option<OtlpOptions>,
pub meta_client_options: Option<MetaClientOptions>,
pub logging: LoggingOptions,
@@ -57,7 +56,6 @@ impl Default for FrontendOptions {
opentsdb_options: Some(OpentsdbOptions::default()),
influxdb_options: Some(InfluxdbOptions::default()),
prom_store_options: Some(PromStoreOptions::default()),
prometheus_options: Some(PrometheusOptions::default()),
otlp_options: Some(OtlpOptions::default()),
meta_client_options: None,
logging: LoggingOptions::default(),


@@ -38,7 +38,7 @@ use catalog::remote::CachedMetaKvBackend;
use catalog::CatalogManagerRef;
use client::client_manager::DatanodeClients;
use common_base::Plugins;
use common_catalog::consts::MITO_ENGINE;
use common_catalog::consts::default_engine;
use common_error::ext::BoxedError;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler;
@@ -64,7 +64,7 @@ use servers::error::{AuthSnafu, ExecuteQuerySnafu, ParsePromQLSnafu};
use servers::interceptor::{
PromQueryInterceptor, PromQueryInterceptorRef, SqlQueryInterceptor, SqlQueryInterceptorRef,
};
use servers::prometheus::PrometheusHandler;
use servers::prometheus_handler::PrometheusHandler;
use servers::query_handler::grpc::{GrpcQueryHandler, GrpcQueryHandlerRef};
use servers::query_handler::sql::SqlQueryHandler;
use servers::query_handler::{
@@ -213,7 +213,6 @@ impl Instance {
let create_expr_factory = CreateExprFactory;
let row_inserter = Arc::new(RowInserter::new(
MITO_ENGINE.to_string(),
catalog_manager.clone(),
create_expr_factory,
dist_instance.clone(),
@@ -286,7 +285,6 @@ impl Instance {
let grpc_query_handler = StandaloneGrpcQueryHandler::arc(dn_instance.clone());
let row_inserter = Arc::new(RowInserter::new(
MITO_ENGINE.to_string(),
catalog_manager.clone(),
create_expr_factory,
grpc_query_handler.clone(),
@@ -366,7 +364,7 @@ impl Instance {
catalog_name, schema_name, table_name,
);
let _ = self
.create_table_by_columns(ctx, table_name, columns, MITO_ENGINE)
.create_table_by_columns(ctx, table_name, columns, default_engine())
.await?;
info!(
"Successfully created table on insertion: {}.{}.{}",


@@ -14,6 +14,7 @@
pub mod deleter;
pub(crate) mod inserter;
pub(crate) mod row_inserter;
use std::collections::HashMap;
use std::sync::Arc;
@@ -23,7 +24,7 @@ use api::v1::ddl_request::Expr as DdlExpr;
use api::v1::greptime_request::Request;
use api::v1::{
column_def, AlterExpr, CompactTableExpr, CreateDatabaseExpr, CreateTableExpr, DeleteRequests,
FlushTableExpr, InsertRequests, TruncateTableExpr,
FlushTableExpr, InsertRequests, RowInsertRequests, TruncateTableExpr,
};
use async_trait::async_trait;
use catalog::{CatalogManager, DeregisterTableRequest, RegisterTableRequest};
@@ -33,7 +34,7 @@ use client::Database;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_catalog::format_full_table_name;
use common_error::ext::BoxedError;
use common_meta::key::schema_name::SchemaNameKey;
use common_meta::key::schema_name::{SchemaNameKey, SchemaNameValue};
use common_meta::peer::Peer;
use common_meta::rpc::ddl::{DdlTask, SubmitDdlTaskRequest, SubmitDdlTaskResponse};
use common_meta::rpc::router::{Partition, Partition as MetaPartition, RouteRequest};
@@ -57,6 +58,7 @@ use sql::statements::create::{PartitionEntry, Partitions};
use sql::statements::statement::Statement;
use sql::statements::{self, sql_value_to_value};
use store_api::storage::RegionNumber;
use table::error::TableOperationSnafu;
use table::metadata::{RawTableInfo, RawTableMeta, TableId, TableIdent, TableInfo, TableType};
use table::requests::{AlterTableRequest, TableOptions};
use table::TableRef;
@@ -66,11 +68,12 @@ use crate::error::{
self, AlterExprToRequestSnafu, CatalogSnafu, ColumnDataTypeSnafu, ColumnNotFoundSnafu,
DeserializePartitionSnafu, InvokeDatanodeSnafu, NotSupportedSnafu, ParseSqlSnafu,
RequestDatanodeSnafu, RequestMetaSnafu, Result, SchemaExistsSnafu, TableAlreadyExistSnafu,
TableNotFoundSnafu, TableSnafu, UnrecognizedTableOptionSnafu,
TableMetadataManagerSnafu, TableNotFoundSnafu, TableSnafu, UnrecognizedTableOptionSnafu,
};
use crate::expr_factory;
use crate::instance::distributed::deleter::DistDeleter;
use crate::instance::distributed::inserter::DistInserter;
use crate::instance::distributed::row_inserter::RowDistInserter;
use crate::table::DistTable;
const MAX_VALUE: &str = "MAXVALUE";
@@ -101,6 +104,18 @@ impl DistInstance {
partitions: Option<Partitions>,
) -> Result<TableRef> {
let _timer = common_telemetry::timer!(crate::metrics::DIST_CREATE_TABLE);
// 1. get schema info
let schema_value = self
.catalog_manager
.table_metadata_manager_ref()
.schema_manager()
.get(SchemaNameKey::new(
&create_table.catalog_name,
&create_table.schema_name,
))
.await
.context(TableMetadataManagerSnafu)?;
let table_name = TableName::new(
&create_table.catalog_name,
&create_table.schema_name,
@@ -109,7 +124,7 @@ impl DistInstance {
let (partitions, partition_cols) = parse_partitions(create_table, partitions)?;
let mut table_info = create_table_info(create_table, partition_cols)?;
let mut table_info = create_table_info(create_table, partition_cols, schema_value)?;
let resp = self
.create_table_procedure(create_table, partitions, table_info.clone())
@@ -121,15 +136,12 @@ impl DistInstance {
info!("Successfully created distributed table '{table_name}' with table id {table_id}");
table_info.ident.table_id = table_id;
let table_info = Arc::new(table_info.try_into().context(error::CreateTableInfoSnafu)?);
create_table.table_id = Some(api::v1::TableId { id: table_id });
let table = Arc::new(DistTable::new(
table_name.clone(),
table_info,
self.catalog_manager.clone(),
));
let table = DistTable::table(table_info);
let request = RegisterTableRequest {
catalog: table_name.catalog_name.clone(),
@@ -148,10 +160,7 @@ impl DistInstance {
}
);
// Since the table information created on meta does not go through KvBackend, so we
// manually invalidate the cache here.
//
// TODO(fys): when the meta invalidation cache mechanism is established, remove it.
// Invalidates local cache ASAP.
self.catalog_manager
.invalidate_table(
&table_name.catalog_name,
@@ -191,10 +200,7 @@ impl DistInstance {
.await
.context(CatalogSnafu)?;
// Since the table information dropped on meta does not go through KvBackend, so we
// manually invalidate the cache here.
//
// TODO(fys): when the meta invalidation cache mechanism is established, remove it.
// Invalidates local cache ASAP.
self.catalog_manager()
.invalidate_table(
&table_name.catalog_name,
@@ -346,6 +352,7 @@ impl DistInstance {
let expr = CreateDatabaseExpr {
database_name: stmt.name.to_string(),
create_if_not_exists: stmt.if_not_exists,
options: Default::default(),
};
self.handle_create_database(expr, query_ctx).await
}
@@ -372,26 +379,24 @@ impl DistInstance {
self.drop_table(table_name).await
}
Statement::Insert(insert) => {
let (catalog, schema, table) =
let (catalog, schema, _) =
table_idents_to_full_name(insert.table_name(), query_ctx.clone())
.map_err(BoxedError::new)
.context(error::ExternalSnafu)?;
let table = self
.catalog_manager
.table(&catalog, &schema, &table)
.await
.context(CatalogSnafu)?
.context(TableNotFoundSnafu { table_name: table })?;
let insert_request =
SqlHandler::insert_to_request(self.catalog_manager.clone(), &insert, query_ctx)
.await
.context(InvokeDatanodeSnafu)?;
Ok(Output::AffectedRows(
table.insert(insert_request).await.context(TableSnafu)?,
))
let inserter = DistInserter::new(catalog, schema, self.catalog_manager.clone());
let affected_rows = inserter
.insert(vec![insert_request])
.await
.map_err(BoxedError::new)
.context(TableOperationSnafu)
.context(TableSnafu)?;
Ok(Output::AffectedRows(affected_rows as usize))
}
Statement::ShowCreateTable(show) => {
let (catalog, schema, table) =
@@ -407,7 +412,8 @@ impl DistInstance {
.context(TableNotFoundSnafu { table_name: &table })?;
let table_name = TableName::new(catalog, schema, table);
self.show_create_table(table_name, table_ref).await
self.show_create_table(table_name, table_ref, query_ctx.clone())
.await
}
Statement::TruncateTable(stmt) => {
let (catalog, schema, table) =
@@ -424,7 +430,12 @@ impl DistInstance {
}
}
async fn show_create_table(&self, table_name: TableName, table: TableRef) -> Result<Output> {
async fn show_create_table(
&self,
table_name: TableName,
table: TableRef,
query_ctx: QueryContextRef,
) -> Result<Output> {
let partitions = self
.catalog_manager
.partition_manager()
@@ -436,7 +447,8 @@ impl DistInstance {
let partitions = create_partitions_stmt(partitions)?;
query::sql::show_create_table(table, partitions).context(error::ExecuteStatementSnafu)
query::sql::show_create_table(table, partitions, query_ctx)
.context(error::ExecuteStatementSnafu)
}
/// Handles distributed database creation
@@ -478,10 +490,12 @@ impl DistInstance {
}
);
let schema_value =
SchemaNameValue::try_from(&expr.options).context(error::TableMetadataManagerSnafu)?;
self.catalog_manager
.table_metadata_manager_ref()
.schema_manager()
.create(schema)
.create(schema, Some(schema_value))
.await
.context(error::TableMetadataManagerSnafu)?;
@@ -556,6 +570,11 @@ impl DistInstance {
.await
.context(error::RequestMetaSnafu)?;
// Invalidates local cache ASAP.
self.catalog_manager()
.invalidate_table(catalog_name, schema_name, table_name, table_id)
.await;
Ok(Output::AffectedRows(0))
}
@@ -625,6 +644,20 @@ impl DistInstance {
Ok(Output::AffectedRows(affected_rows as usize))
}
async fn handle_row_dist_insert(
&self,
requests: RowInsertRequests,
ctx: QueryContextRef,
) -> Result<Output> {
let inserter = RowDistInserter::new(
ctx.current_catalog().to_owned(),
ctx.current_schema().to_owned(),
self.catalog_manager.clone(),
);
let affected_rows = inserter.insert(requests).await?;
Ok(Output::AffectedRows(affected_rows as usize))
}
async fn handle_dist_delete(
&self,
request: DeleteRequests,
@@ -665,8 +698,9 @@ impl GrpcQueryHandler for DistInstance {
async fn do_query(&self, request: Request, ctx: QueryContextRef) -> Result<Output> {
match request {
Request::Inserts(requests) => self.handle_dist_insert(requests, ctx).await,
Request::RowInserts(_) | Request::RowDeletes(_) => NotSupportedSnafu {
feat: "row inserts/deletes",
Request::RowInserts(requests) => self.handle_row_dist_insert(requests, ctx).await,
Request::RowDeletes(_) => NotSupportedSnafu {
feat: "row deletes",
}
.fail(),
Request::Deletes(requests) => self.handle_dist_delete(requests, ctx).await,
@@ -726,7 +760,7 @@ fn create_partitions_stmt(partitions: Vec<PartitionInfo>) -> Result<Option<Parti
.into_iter()
.map(|info| {
// Generated the partition name from id
let name = &format!("r{}", info.id.as_u64());
let name = &format!("r{}", info.id.region_number());
let bounds = info.partition.partition_bounds();
let value_list = bounds
.iter()
@@ -753,6 +787,7 @@ fn create_partitions_stmt(partitions: Vec<PartitionInfo>) -> Result<Option<Parti
fn create_table_info(
create_table: &CreateTableExpr,
partition_columns: Vec<String>,
schema_opts: Option<SchemaNameValue>,
) -> Result<RawTableInfo> {
let mut column_schemas = Vec::with_capacity(create_table.column_defs.len());
let mut column_name_to_index_map = HashMap::new();
@@ -799,6 +834,10 @@ fn create_table_info(
})
.collect::<Result<Vec<_>>>()?;
let table_options = TableOptions::try_from(&create_table.table_options)
.context(UnrecognizedTableOptionSnafu)?;
let table_options = merge_options(table_options, schema_opts);
let meta = RawTableMeta {
schema: raw_schema,
primary_key_indices,
@@ -807,8 +846,7 @@ fn create_table_info(
next_column_id: column_schemas.len() as u32,
region_numbers: vec![],
engine_options: HashMap::new(),
options: TableOptions::try_from(&create_table.table_options)
.context(UnrecognizedTableOptionSnafu)?,
options: table_options,
created_on: DateTime::default(),
partition_key_indices,
};
@@ -835,6 +873,14 @@ fn create_table_info(
Ok(table_info)
}
fn merge_options(
mut table_opts: TableOptions,
schema_opts: Option<SchemaNameValue>,
) -> TableOptions {
table_opts.ttl = table_opts.ttl.or(schema_opts.and_then(|s| s.ttl));
table_opts
}
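A hypothetical stand-in (plain `Option`s instead of `TableOptions`/`SchemaNameValue`) illustrating the precedence `merge_options` implements: a TTL set on the table wins, otherwise the schema-level TTL is inherited.
fn merged_ttl(table_ttl: Option<u64>, schema_ttl: Option<u64>) -> Option<u64> {
    // Mirrors `table_opts.ttl.or(schema_opts.and_then(|s| s.ttl))` above.
    table_ttl.or(schema_ttl)
}
#[test]
fn ttl_precedence() {
    assert_eq!(merged_ttl(Some(60), Some(3600)), Some(60)); // table-level TTL wins
    assert_eq!(merged_ttl(None, Some(3600)), Some(3600)); // falls back to the schema TTL
}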
fn parse_partitions(
create_table: &CreateTableExpr,
partitions: Option<Partitions>,

Some files were not shown because too many files have changed in this diff.